diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -3,50017 +3,100017 @@ "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, - "global_step": 2500, + "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "advantages": -5.960464655174746e-08, - "completion_length": 394.5, + "advantages": -8.514949456639442e-08, + "completion_length": 786.0, "delta_ref_entropy_loss": 0.0, "delta_ref_ppl": 0.0, - "entropy_loss": -0.8828125, - "epoch": 0.0004, - "grad_norm": 45.07482651093602, + "entropy_loss": -1.3671875, + "epoch": 0.0002, + "grad_norm": 73.6867015899596, "k1_kl": 0.0, "k3_kl": 0.0, "kimi_kl": 0.0, - "learning_rate": 4.998e-07, + "learning_rate": 4.999e-07, "loss": 0.0, - "ppl": 0.5244140625, - "reward": 0.4580689072608948, - "reward_std": 0.44782301783561707, - "rewards/perpo_ocr_edit_distance_reward": 0.45806896686553955, + "ppl": 0.7734375, + "reward": 0.35824713110923767, + "reward_std": 0.43857067823410034, + "rewards/perpo_ocr_edit_distance_reward": 0.35824716091156006, "step": 1, "temperature": 0.9 }, { - "advantages": -4.257474728319721e-08, - "completion_length": 573.5, - "delta_ref_entropy_loss": 0.00555419921875, - "delta_ref_ppl": 0.0006561279296875, - "entropy_loss": -1.01953125, - "epoch": 0.0008, - "grad_norm": 40.90053909422321, - "k1_kl": -6.103515625e-05, - "k3_kl": 0.0005397796630859375, - "kimi_kl": 0.001003265380859375, - "learning_rate": 4.996e-07, + "advantages": -8.514949456639442e-08, + "completion_length": 3.0, + "delta_ref_entropy_loss": -0.00341796875, + "delta_ref_ppl": 0.005157470703125, + "entropy_loss": -0.25, + "epoch": 0.0004, + "grad_norm": 79.33792396182126, + "k1_kl": -0.006011962890625, + "k3_kl": 0.0004177093505859375, + "kimi_kl": 0.00063323974609375, + "learning_rate": 4.998e-07, "loss": 0.0, - "ppl": 0.7890625, - "reward": 0.2737664580345154, - "reward_std": 0.4621729552745819, - "rewards/perpo_ocr_edit_distance_reward": 0.27376650273799896, + "ppl": 0.134765625, + "reward": 0.5022302865982056, + "reward_std": 0.2689339518547058, + "rewards/perpo_ocr_edit_distance_reward": 0.5022302865982056, "step": 2, "temperature": 0.9 }, { - "advantages": -3.235680807733843e-07, - "completion_length": 19.5, - "delta_ref_entropy_loss": -0.009765625, - "delta_ref_ppl": 0.005279541015625, - "entropy_loss": -1.080078125, - "epoch": 0.0012, - "grad_norm": 47.52199789143958, - "k1_kl": -0.003875732421875, - "k3_kl": 0.0002237558364868164, - "kimi_kl": 0.000598907470703125, - "learning_rate": 4.994e-07, + "advantages": -3.4059798537100505e-08, + "completion_length": 1197.0, + "delta_ref_entropy_loss": 0.0032806396484375, + "delta_ref_ppl": -0.00061798095703125, + "entropy_loss": -0.88671875, + "epoch": 0.0006, + "grad_norm": 21.246752729422262, + "k1_kl": 0.0003204345703125, + "k3_kl": 0.000370025634765625, + "kimi_kl": 0.000545501708984375, + "learning_rate": 4.997e-07, "loss": 0.0, - "ppl": 0.939453125, - "reward": 0.3049444444477558, - "reward_std": 0.2677665203809738, - "rewards/perpo_ocr_edit_distance_reward": 0.30494444631040096, + "ppl": 0.6484375, + "reward": 0.41619351506233215, + "reward_std": 0.5137242674827576, + "rewards/perpo_ocr_edit_distance_reward": 0.41619354486465454, "step": 3, "temperature": 0.9 }, { - "advantages": -8.514950078364336e-09, - "completion_length": 612.0, - "delta_ref_entropy_loss": 0.0054779052734375, - "delta_ref_ppl": -0.00330352783203125, - "entropy_loss": -0.912109375, - "epoch": 0.0016, - "grad_norm": 8.328454270326722, - "k1_kl": 0.00373077392578125, - "k3_kl": 0.00011229515075683594, - "kimi_kl": 0.00061798095703125, - "learning_rate": 4.991999999999999e-07, + "advantages": -5.10896995820076e-08, + "completion_length": 143.0, + "delta_ref_entropy_loss": -0.0016021728515625, + "delta_ref_ppl": 0.0018157958984375, + "entropy_loss": -0.6875, + "epoch": 0.0008, + "grad_norm": 11.70263679154072, + "k1_kl": -0.0022430419921875, + "k3_kl": 0.000164031982421875, + "kimi_kl": 0.0003871917724609375, + "learning_rate": 4.996e-07, "loss": 0.0, - "ppl": 0.783203125, - "reward": 0.28331276029348373, - "reward_std": 0.3902655690908432, - "rewards/perpo_ocr_edit_distance_reward": 0.28331274539232254, + "ppl": 0.55859375, + "reward": 0.41596347093582153, + "reward_std": 0.3662765622138977, + "rewards/perpo_ocr_edit_distance_reward": 0.4159635007381439, "step": 4, "temperature": 0.9 }, { - "advantages": 1.021793991640152e-07, - "completion_length": 679.0, - "delta_ref_entropy_loss": 8.678436279296875e-05, - "delta_ref_ppl": -8.702278137207031e-05, - "entropy_loss": -0.1671142578125, - "epoch": 0.002, - "grad_norm": 2.1752597450559334, - "k1_kl": 0.0001049041748046875, - "k3_kl": -1.430511474609375e-06, - "kimi_kl": 7.802248001098633e-05, - "learning_rate": 4.99e-07, - "loss": -0.0, - "ppl": 0.0924072265625, - "reward": 0.7595192193984985, - "reward_std": 0.049383968755137175, - "rewards/perpo_ocr_edit_distance_reward": 0.7595192193984985, + "advantages": -6.811959565311554e-07, + "completion_length": 18.0, + "delta_ref_entropy_loss": 0.006683349609375, + "delta_ref_ppl": -0.0078125, + "entropy_loss": -1.28125, + "epoch": 0.001, + "grad_norm": 8.371218834377887, + "k1_kl": 0.007293701171875, + "k3_kl": 0.00020503997802734375, + "kimi_kl": 0.0010223388671875, + "learning_rate": 4.994999999999999e-07, + "loss": 0.0, + "ppl": 1.1796875, + "reward": 0.02945508249104023, + "reward_std": 0.0018368469318374991, + "rewards/perpo_ocr_edit_distance_reward": 0.02945508249104023, "step": 5, "temperature": 0.9 }, { - "advantages": -5.108969780565076e-08, - "completion_length": 198.5, - "delta_ref_entropy_loss": -0.00131988525390625, - "delta_ref_ppl": 0.00270843505859375, - "entropy_loss": -0.833984375, - "epoch": 0.0024, - "grad_norm": 7.471759332083084, - "k1_kl": -0.00249481201171875, - "k3_kl": 0.00013685226440429688, - "kimi_kl": 0.00055694580078125, - "learning_rate": 4.988e-07, - "loss": 0.0, - "ppl": 0.73828125, - "reward": 0.3573243385180831, - "reward_std": 0.21224421309307218, - "rewards/perpo_ocr_edit_distance_reward": 0.3573243683204055, + "advantages": -2.55448497910038e-08, + "completion_length": 186.0, + "delta_ref_entropy_loss": -0.00153350830078125, + "delta_ref_ppl": 0.00015735626220703125, + "entropy_loss": -0.3984375, + "epoch": 0.0012, + "grad_norm": 18.48561682491124, + "k1_kl": 0.00084686279296875, + "k3_kl": -3.3855438232421875e-05, + "kimi_kl": 0.00023651123046875, + "learning_rate": 4.994e-07, + "loss": -0.0, + "ppl": 0.32421875, + "reward": 0.6995318531990051, + "reward_std": 0.47380104660987854, + "rewards/perpo_ocr_edit_distance_reward": 0.6995318531990051, "step": 6, "temperature": 0.9 }, { - "advantages": -1.8136842401261788e-06, - "completion_length": 333.5, - "delta_ref_entropy_loss": -0.00156402587890625, - "delta_ref_ppl": 0.0052928924560546875, - "entropy_loss": -0.51708984375, - "epoch": 0.0028, - "grad_norm": 11.478710257524789, - "k1_kl": -0.0041141510009765625, - "k3_kl": 0.00032448768615722656, - "kimi_kl": 0.0005509853363037109, - "learning_rate": 4.986e-07, + "advantages": 1.7029899268550253e-08, + "completion_length": 459.0, + "delta_ref_entropy_loss": 0.0018157958984375, + "delta_ref_ppl": 0.0020904541015625, + "entropy_loss": -1.1171875, + "epoch": 0.0014, + "grad_norm": 9.249246328020208, + "k1_kl": -0.001617431640625, + "k3_kl": 0.0001468658447265625, + "kimi_kl": 0.000579833984375, + "learning_rate": 4.993e-07, "loss": 0.0, - "ppl": 0.436767578125, - "reward": 0.6301882266998291, - "reward_std": 0.23972284607589245, - "rewards/perpo_ocr_edit_distance_reward": 0.6301882714033127, + "ppl": 0.98046875, + "reward": 0.14086049795150757, + "reward_std": 0.2898608148097992, + "rewards/perpo_ocr_edit_distance_reward": 0.14086049795150757, "step": 7, "temperature": 0.9 }, { - "advantages": -1.447541464472124e-07, - "completion_length": 20.0, - "delta_ref_entropy_loss": -0.002227783203125, - "delta_ref_ppl": -0.0004119873046875, - "entropy_loss": -0.8212890625, - "epoch": 0.0032, - "grad_norm": 11.771573682816689, - "k1_kl": -6.103515625e-05, - "k3_kl": 0.0001678466796875, - "kimi_kl": 0.0005865097045898438, - "learning_rate": 4.984e-07, + "advantages": -3.4059798537100505e-08, + "completion_length": 19.0, + "delta_ref_entropy_loss": -0.0018157958984375, + "delta_ref_ppl": 0.005035400390625, + "entropy_loss": -1.1171875, + "epoch": 0.0016, + "grad_norm": 7.1051112161380905, + "k1_kl": -0.0033111572265625, + "k3_kl": 0.0004825592041015625, + "kimi_kl": 0.00091552734375, + "learning_rate": 4.991999999999999e-07, "loss": 0.0, - "ppl": 0.681640625, - "reward": 0.235648263245821, - "reward_std": 0.14975842623971403, - "rewards/perpo_ocr_edit_distance_reward": 0.23564827907830477, + "ppl": 1.0078125, + "reward": 0.12264399230480194, + "reward_std": 0.3005122244358063, + "rewards/perpo_ocr_edit_distance_reward": 0.12264400720596313, "step": 8, "temperature": 0.9 }, { - "advantages": -1.498631149843277e-06, - "completion_length": 128.0, - "delta_ref_entropy_loss": 0.00020885467529296875, - "delta_ref_ppl": -0.002129673957824707, - "entropy_loss": -0.21875, - "epoch": 0.0036, - "grad_norm": 14.298941193372519, - "k1_kl": 0.0016360282897949219, - "k3_kl": 0.00022232532501220703, - "kimi_kl": 0.0004432201385498047, - "learning_rate": 4.982e-07, - "loss": 0.0, - "ppl": 0.1641845703125, - "reward": 0.7217872142791748, - "reward_std": 0.2377248255070299, - "rewards/perpo_ocr_edit_distance_reward": 0.7217872142791748, + "advantages": -6.811959707420101e-08, + "completion_length": 713.0, + "delta_ref_entropy_loss": 0.00090789794921875, + "delta_ref_ppl": -0.0001392364501953125, + "entropy_loss": -0.30859375, + "epoch": 0.0018, + "grad_norm": 1.1281086773233657, + "k1_kl": 0.0002727508544921875, + "k3_kl": -4.291534423828125e-06, + "kimi_kl": 0.00011587142944335938, + "learning_rate": 4.991e-07, + "loss": -0.0, + "ppl": 0.1748046875, + "reward": 0.44716671109199524, + "reward_std": 0.122065968811512, + "rewards/perpo_ocr_edit_distance_reward": 0.4471667408943176, "step": 9, "temperature": 0.9 }, { - "advantages": 5.108969780565076e-08, - "completion_length": 734.0, - "delta_ref_entropy_loss": -0.00010466575622558594, - "delta_ref_ppl": 0.010607242584228516, - "entropy_loss": -0.501953125, - "epoch": 0.004, - "grad_norm": 48.906809881883405, - "k1_kl": -0.010375022888183594, - "k3_kl": 0.001199483871459961, - "kimi_kl": 0.0010652542114257812, - "learning_rate": 4.979999999999999e-07, + "advantages": -3.4059798537100505e-08, + "completion_length": 19.0, + "delta_ref_entropy_loss": 0.000644683837890625, + "delta_ref_ppl": 0.002685546875, + "entropy_loss": -0.64453125, + "epoch": 0.002, + "grad_norm": 10.941308421778603, + "k1_kl": -0.0025634765625, + "k3_kl": 0.000244140625, + "kimi_kl": 0.000514984130859375, + "learning_rate": 4.99e-07, "loss": 0.0, - "ppl": 0.302978515625, - "reward": 0.7373160719871521, - "reward_std": 0.36148276925086975, - "rewards/perpo_ocr_edit_distance_reward": 0.7373160719871521, + "ppl": 0.56640625, + "reward": 0.43058738112449646, + "reward_std": 0.5185995697975159, + "rewards/perpo_ocr_edit_distance_reward": 0.43058741092681885, "step": 10, "temperature": 0.9 }, { - "advantages": -4.427773774295929e-07, - "completion_length": 120.0, - "delta_ref_entropy_loss": -0.005527496337890625, - "delta_ref_ppl": 0.0019009113311767578, - "entropy_loss": -0.6795654296875, - "epoch": 0.0044, - "grad_norm": 4.8102657637983, - "k1_kl": -0.0013403892517089844, - "k3_kl": 6.496906280517578e-05, - "kimi_kl": 0.0005006790161132812, - "learning_rate": 4.978e-07, - "loss": 0.0, - "ppl": 0.57672119140625, - "reward": 0.345679335296154, - "reward_std": 0.1872396978433244, - "rewards/perpo_ocr_edit_distance_reward": 0.3456793655641377, + "advantages": -7.66345493730114e-08, + "completion_length": 19.0, + "delta_ref_entropy_loss": -0.0189208984375, + "delta_ref_ppl": 0.00445556640625, + "entropy_loss": -1.34375, + "epoch": 0.0022, + "grad_norm": 13.536536504029833, + "k1_kl": -0.0037994384765625, + "k3_kl": 0.0013885498046875, + "kimi_kl": 0.001556396484375, + "learning_rate": 4.989e-07, + "loss": 0.0001, + "ppl": 1.1875, + "reward": 0.025482624769210815, + "reward_std": 0.011557242833077908, + "rewards/perpo_ocr_edit_distance_reward": 0.025482626631855965, "step": 11, "temperature": 0.9 }, { "advantages": -6.811959707420101e-08, - "completion_length": 540.0, - "delta_ref_entropy_loss": -0.002288818359375, - "delta_ref_ppl": 0.000453948974609375, - "entropy_loss": -0.5205078125, - "epoch": 0.0048, - "grad_norm": 10.796385696185071, - "k1_kl": -0.0010304450988769531, - "k3_kl": 0.00173187255859375, - "kimi_kl": 0.00211334228515625, - "learning_rate": 4.976e-07, - "loss": 0.0001, - "ppl": 0.40283203125, - "reward": 0.4935082644224167, - "reward_std": 0.4423958957195282, - "rewards/perpo_ocr_edit_distance_reward": 0.49350830912590027, + "completion_length": 503.0, + "delta_ref_entropy_loss": -0.00118255615234375, + "delta_ref_ppl": 0.0034332275390625, + "entropy_loss": -0.2734375, + "epoch": 0.0024, + "grad_norm": 11.073614456678095, + "k1_kl": -0.00311279296875, + "k3_kl": 0.000629425048828125, + "kimi_kl": 0.0006866455078125, + "learning_rate": 4.988e-07, + "loss": 0.0, + "ppl": 0.23828125, + "reward": 0.7171199321746826, + "reward_std": 0.42681261897087097, + "rewards/perpo_ocr_edit_distance_reward": 0.7171199917793274, "step": 12, "temperature": 0.9 }, { - "advantages": -4.257474994773247e-08, - "completion_length": 603.5, - "delta_ref_entropy_loss": -0.03902435302734375, - "delta_ref_ppl": 0.006084442138671875, - "entropy_loss": -0.8271484375, - "epoch": 0.0052, - "grad_norm": 38.195126396884945, - "k1_kl": -0.0063018798828125, - "k3_kl": 0.000545501708984375, - "kimi_kl": 0.0009670257568359375, - "learning_rate": 4.974e-07, + "advantages": -3.308058012407855e-06, + "completion_length": 595.0, + "delta_ref_entropy_loss": -0.00020885467529296875, + "delta_ref_ppl": -0.000591278076171875, + "entropy_loss": -0.10595703125, + "epoch": 0.0026, + "grad_norm": 1.375416762538948, + "k1_kl": 0.000583648681640625, + "k3_kl": 0.0001220703125, + "kimi_kl": 0.000225067138671875, + "learning_rate": 4.987e-07, "loss": 0.0, - "ppl": 0.6650390625, - "reward": 0.4700995236635208, - "reward_std": 0.41072244942188263, - "rewards/perpo_ocr_edit_distance_reward": 0.4700995534658432, + "ppl": 0.052978515625, + "reward": 0.8763835430145264, + "reward_std": 0.01792137697339058, + "rewards/perpo_ocr_edit_distance_reward": 0.8763836622238159, "step": 13, "temperature": 0.9 }, { - "advantages": -5.10896995820076e-08, - "completion_length": 400.0, - "delta_ref_entropy_loss": -0.014007568359375, - "delta_ref_ppl": 0.00321197509765625, - "entropy_loss": -0.7421875, - "epoch": 0.0056, - "grad_norm": 9.647161646309574, - "k1_kl": -0.00341033935546875, - "k3_kl": 0.0006613731384277344, - "kimi_kl": 0.0009107589721679688, - "learning_rate": 4.972e-07, + "advantages": -1.021793991640152e-07, + "completion_length": 1551.0, + "delta_ref_entropy_loss": 0.0009765625, + "delta_ref_ppl": 0.00139617919921875, + "entropy_loss": -0.30078125, + "epoch": 0.0028, + "grad_norm": 6.911988210844186, + "k1_kl": -0.00189971923828125, + "k3_kl": 0.000461578369140625, + "kimi_kl": 0.000644683837890625, + "learning_rate": 4.986e-07, "loss": 0.0, - "ppl": 0.6513671875, - "reward": 0.36655576527118683, - "reward_std": 0.49204881489276886, - "rewards/perpo_ocr_edit_distance_reward": 0.3665557950735092, + "ppl": 0.232421875, + "reward": 0.4652237594127655, + "reward_std": 0.39568600058555603, + "rewards/perpo_ocr_edit_distance_reward": 0.4652238190174103, "step": 14, "temperature": 0.9 }, { "advantages": -8.514949456639442e-08, - "completion_length": 759.0, - "delta_ref_entropy_loss": -0.019391775131225586, - "delta_ref_ppl": 0.033918142318725586, - "entropy_loss": -0.9169921875, - "epoch": 0.006, - "grad_norm": 41.65076134055909, - "k1_kl": -0.033521175384521484, - "k3_kl": 0.007122039794921875, - "kimi_kl": 0.006618499755859375, - "learning_rate": 4.97e-07, - "loss": 0.0003, - "ppl": 0.73095703125, - "reward": 0.572129100561142, - "reward_std": 0.33293792605400085, - "rewards/perpo_ocr_edit_distance_reward": 0.5721291303634644, + "completion_length": 871.0, + "delta_ref_entropy_loss": -0.004119873046875, + "delta_ref_ppl": 0.00897216796875, + "entropy_loss": -0.34375, + "epoch": 0.003, + "grad_norm": 12.418558100438542, + "k1_kl": -0.00823974609375, + "k3_kl": 0.0021209716796875, + "kimi_kl": 0.00193023681640625, + "learning_rate": 4.985e-07, + "loss": 0.0001, + "ppl": 0.2578125, + "reward": 0.4275246262550354, + "reward_std": 0.2705928087234497, + "rewards/perpo_ocr_edit_distance_reward": 0.4275246560573578, "step": 15, "temperature": 0.9 }, { - "advantages": -8.514948746096707e-09, - "completion_length": 225.5, - "delta_ref_entropy_loss": -0.0146026611328125, - "delta_ref_ppl": 0.005954742431640625, - "entropy_loss": -0.4755859375, - "epoch": 0.0064, - "grad_norm": 11.96486798637264, - "k1_kl": -0.00695037841796875, - "k3_kl": 0.0015430450439453125, - "kimi_kl": 0.0015697479248046875, - "learning_rate": 4.968e-07, + "advantages": -3.4059798537100505e-08, + "completion_length": 82.0, + "delta_ref_entropy_loss": -0.029052734375, + "delta_ref_ppl": 0.000278472900390625, + "entropy_loss": -1.140625, + "epoch": 0.0032, + "grad_norm": 8.402688262597577, + "k1_kl": -0.00020885467529296875, + "k3_kl": 0.0030059814453125, + "kimi_kl": 0.003997802734375, + "learning_rate": 4.984e-07, "loss": 0.0001, - "ppl": 0.3759765625, - "reward": 0.569684311747551, - "reward_std": 0.30852682888507843, - "rewards/perpo_ocr_edit_distance_reward": 0.5696843266487122, + "ppl": 1.03125, + "reward": 0.05840837210416794, + "reward_std": 0.0991189107298851, + "rewards/perpo_ocr_edit_distance_reward": 0.05840837210416794, "step": 16, "temperature": 0.9 }, { - "advantages": -2.4693353850580024e-07, - "completion_length": 1338.0, - "delta_ref_entropy_loss": -0.005950927734375, - "delta_ref_ppl": 0.00262451171875, - "entropy_loss": -0.31494140625, - "epoch": 0.0068, - "grad_norm": 8.571645852199396, - "k1_kl": -0.002597808837890625, - "k3_kl": 0.0010807514190673828, - "kimi_kl": 0.0011553764343261719, - "learning_rate": 4.965999999999999e-07, + "advantages": -2.976826363010332e-05, + "completion_length": 202.0, + "delta_ref_entropy_loss": -0.002227783203125, + "delta_ref_ppl": 0.0003490447998046875, + "entropy_loss": -0.0751953125, + "epoch": 0.0034, + "grad_norm": 2.199939467974588, + "k1_kl": -0.0003528594970703125, + "k3_kl": 0.0003662109375, + "kimi_kl": 0.000400543212890625, + "learning_rate": 4.983e-07, "loss": 0.0, - "ppl": 0.21923828125, - "reward": 0.6450619101524353, - "reward_std": 0.3241228461265564, - "rewards/perpo_ocr_edit_distance_reward": 0.6450619995594025, + "ppl": 0.04248046875, + "reward": 0.7174254059791565, + "reward_std": 0.0024748030118644238, + "rewards/perpo_ocr_edit_distance_reward": 0.717425525188446, "step": 17, "temperature": 0.9 }, { - "advantages": -8.940696893944278e-08, - "completion_length": 514.0, - "delta_ref_entropy_loss": 0.0005035400390625, - "delta_ref_ppl": 0.0428466796875, - "entropy_loss": -0.57421875, - "epoch": 0.0072, - "grad_norm": 46.42793853894654, - "k1_kl": -0.0416259765625, - "k3_kl": 0.011993408203125, - "kimi_kl": 0.010467529296875, - "learning_rate": 4.964e-07, - "loss": 0.0005, - "ppl": 0.443359375, - "reward": 0.5460610091686249, - "reward_std": 0.435526043176651, - "rewards/perpo_ocr_edit_distance_reward": 0.5460610389709473, + "advantages": -2.2138868871479644e-07, + "completion_length": 19.0, + "delta_ref_entropy_loss": -0.0048828125, + "delta_ref_ppl": 0.00421142578125, + "entropy_loss": -0.2353515625, + "epoch": 0.0036, + "grad_norm": 21.502592399922875, + "k1_kl": -0.003936767578125, + "k3_kl": 0.003875732421875, + "kimi_kl": 0.003387451171875, + "learning_rate": 4.982e-07, + "loss": 0.0002, + "ppl": 0.173828125, + "reward": 0.8233418464660645, + "reward_std": 0.3492796719074249, + "rewards/perpo_ocr_edit_distance_reward": 0.8233419060707092, "step": 18, "temperature": 0.9 }, { - "advantages": 8.514949634275126e-09, - "completion_length": 212.0, - "delta_ref_entropy_loss": -0.03277587890625, - "delta_ref_ppl": 0.012582778930664062, - "entropy_loss": -0.8671875, - "epoch": 0.0076, - "grad_norm": 11.971045033502993, - "k1_kl": -0.011835098266601562, - "k3_kl": 0.005672454833984375, - "kimi_kl": 0.005558013916015625, - "learning_rate": 4.961999999999999e-07, - "loss": 0.0002, - "ppl": 0.74609375, - "reward": 0.2343488335609436, - "reward_std": 0.3543483465909958, - "rewards/perpo_ocr_edit_distance_reward": 0.2343488559126854, + "advantages": -3.4059798537100505e-08, + "completion_length": 1.0, + "delta_ref_entropy_loss": -0.03564453125, + "delta_ref_ppl": 0.036376953125, + "entropy_loss": -0.9453125, + "epoch": 0.0038, + "grad_norm": 96.12062492386086, + "k1_kl": -0.036865234375, + "k3_kl": 0.01025390625, + "kimi_kl": 0.01007080078125, + "learning_rate": 4.980999999999999e-07, + "loss": 0.0004, + "ppl": 0.53515625, + "reward": 0.5345980525016785, + "reward_std": 0.3716789484024048, + "rewards/perpo_ocr_edit_distance_reward": 0.5345980525016785, "step": 19, "temperature": 0.9 }, { - "advantages": -9.877341149433505e-07, - "completion_length": 257.5, - "delta_ref_entropy_loss": -0.023977279663085938, - "delta_ref_ppl": 0.005955219268798828, - "entropy_loss": -0.49365234375, - "epoch": 0.008, - "grad_norm": 7.361437609609679, - "k1_kl": -0.005144596099853516, - "k3_kl": 0.007451295852661133, - "kimi_kl": 0.007527828216552734, - "learning_rate": 4.96e-07, + "advantages": -1.3623919414840202e-07, + "completion_length": 1456.0, + "delta_ref_entropy_loss": -0.00128936767578125, + "delta_ref_ppl": 0.006591796875, + "entropy_loss": -0.279296875, + "epoch": 0.004, + "grad_norm": 34.651203686211225, + "k1_kl": -0.00732421875, + "k3_kl": 0.0081787109375, + "kimi_kl": 0.00555419921875, + "learning_rate": 4.979999999999999e-07, "loss": 0.0003, - "ppl": 0.3897705078125, - "reward": 0.5134416669607162, - "reward_std": 0.18999972473829985, - "rewards/perpo_ocr_edit_distance_reward": 0.5134416967630386, + "ppl": 0.2294921875, + "reward": 0.7969490885734558, + "reward_std": 0.350516676902771, + "rewards/perpo_ocr_edit_distance_reward": 0.7969491481781006, "step": 20, "temperature": 0.9 }, { - "advantages": -6.386212270115266e-08, - "completion_length": 381.0, - "delta_ref_entropy_loss": -0.0194091796875, - "delta_ref_ppl": 0.01116943359375, - "entropy_loss": -0.33984375, - "epoch": 0.0084, - "grad_norm": 14.903410140826015, - "k1_kl": -0.0115966796875, - "k3_kl": 0.0085601806640625, - "kimi_kl": 0.0064544677734375, - "learning_rate": 4.958e-07, - "loss": 0.0003, - "ppl": 0.26904296875, - "reward": 0.7773301303386688, - "reward_std": 0.4240600913763046, - "rewards/perpo_ocr_edit_distance_reward": 0.7773301601409912, + "advantages": -2.384185791015625e-07, + "completion_length": 20.0, + "delta_ref_entropy_loss": -0.0712890625, + "delta_ref_ppl": 0.0267333984375, + "entropy_loss": -1.359375, + "epoch": 0.0042, + "grad_norm": 13.982727505690054, + "k1_kl": -0.0234375, + "k3_kl": 0.00372314453125, + "kimi_kl": 0.00372314453125, + "learning_rate": 4.979e-07, + "loss": 0.0001, + "ppl": 1.15625, + "reward": 0.005967761855572462, + "reward_std": 0.00046039826702326536, + "rewards/perpo_ocr_edit_distance_reward": 0.005967761855572462, "step": 21, "temperature": 0.9 }, { - "advantages": -3.107956647596666e-07, - "completion_length": 184.5, - "delta_ref_entropy_loss": 0.0005922317504882812, - "delta_ref_ppl": 0.00103759765625, - "entropy_loss": -0.23681640625, - "epoch": 0.0088, - "grad_norm": 6.2498241322871175, - "k1_kl": -0.0011749267578125, - "k3_kl": 0.001987457275390625, - "kimi_kl": 0.001911163330078125, - "learning_rate": 4.956e-07, + "advantages": -6.811959707420101e-08, + "completion_length": 219.0, + "delta_ref_entropy_loss": 0.0, + "delta_ref_ppl": -0.0042724609375, + "entropy_loss": -0.484375, + "epoch": 0.0044, + "grad_norm": 25.230754137253623, + "k1_kl": 0.00439453125, + "k3_kl": 0.001251220703125, + "kimi_kl": 0.00140380859375, + "learning_rate": 4.978e-07, "loss": 0.0001, - "ppl": 0.14794921875, - "reward": 0.4415174275636673, - "reward_std": 0.24500763416290283, - "rewards/perpo_ocr_edit_distance_reward": 0.4415174573659897, + "ppl": 0.3125, + "reward": 0.5264456272125244, + "reward_std": 0.4076725244522095, + "rewards/perpo_ocr_edit_distance_reward": 0.5264456868171692, "step": 22, "temperature": 0.9 }, { - "advantages": -1.4986311072107128e-06, - "completion_length": 1186.0, - "delta_ref_entropy_loss": -0.005062103271484375, - "delta_ref_ppl": 0.0041713714599609375, - "entropy_loss": -0.16796875, - "epoch": 0.0092, - "grad_norm": 6.950303313447262, - "k1_kl": -0.004627227783203125, - "k3_kl": 0.006091117858886719, - "kimi_kl": 0.0041522979736328125, - "learning_rate": 4.954e-07, - "loss": 0.0002, - "ppl": 0.12744140625, - "reward": 0.8976284563541412, - "reward_std": 0.20129271037876606, - "rewards/perpo_ocr_edit_distance_reward": 0.8976285457611084, + "advantages": -1.7029899268550253e-08, + "completion_length": 639.0, + "delta_ref_entropy_loss": -0.0009765625, + "delta_ref_ppl": -0.000591278076171875, + "entropy_loss": -0.09765625, + "epoch": 0.0046, + "grad_norm": 1.4574083437930572, + "k1_kl": 0.0006561279296875, + "k3_kl": 0.000354766845703125, + "kimi_kl": 0.000522613525390625, + "learning_rate": 4.976999999999999e-07, + "loss": 0.0, + "ppl": 0.048828125, + "reward": 0.9120628237724304, + "reward_std": 0.09342994540929794, + "rewards/perpo_ocr_edit_distance_reward": 0.9120628237724304, "step": 23, "temperature": 0.9 }, { - "advantages": -8.195639100705421e-08, - "completion_length": 365.0, - "delta_ref_entropy_loss": -0.004367828369140625, - "delta_ref_ppl": -0.00052642822265625, - "entropy_loss": -0.11376953125, - "epoch": 0.0096, - "grad_norm": 4.506075168799693, - "k1_kl": 0.000579833984375, - "k3_kl": 0.0009260177612304688, - "kimi_kl": 0.0010633468627929688, - "learning_rate": 4.951999999999999e-07, - "loss": 0.0, - "ppl": 0.0557861328125, - "reward": 0.6555562615394592, - "reward_std": 0.33712907135486603, - "rewards/perpo_ocr_edit_distance_reward": 0.655556321144104, + "advantages": -2.2138868871479644e-07, + "completion_length": 1072.0, + "delta_ref_entropy_loss": -0.07568359375, + "delta_ref_ppl": 0.023681640625, + "entropy_loss": -0.890625, + "epoch": 0.0048, + "grad_norm": 183.32597335593522, + "k1_kl": -0.0238037109375, + "k3_kl": 0.003509521484375, + "kimi_kl": 0.0035247802734375, + "learning_rate": 4.976e-07, + "loss": 0.0001, + "ppl": 0.65234375, + "reward": 0.6949305534362793, + "reward_std": 0.318003386259079, + "rewards/perpo_ocr_edit_distance_reward": 0.6949306130409241, "step": 24, "temperature": 0.9 }, { - "advantages": -1.0643686820799303e-07, - "completion_length": 1395.5, - "delta_ref_entropy_loss": 0.000698089599609375, - "delta_ref_ppl": -0.005680084228515625, - "entropy_loss": -0.3447265625, - "epoch": 0.01, - "grad_norm": 36.7726562643036, - "k1_kl": 0.005401611328125, - "k3_kl": 0.005669593811035156, - "kimi_kl": 0.0069484710693359375, - "learning_rate": 4.95e-07, + "advantages": -1.7029899268550253e-08, + "completion_length": 1243.0, + "delta_ref_entropy_loss": -0.0020294189453125, + "delta_ref_ppl": -0.0033416748046875, + "entropy_loss": -0.263671875, + "epoch": 0.005, + "grad_norm": 28.23069767736638, + "k1_kl": 0.003265380859375, + "k3_kl": 0.0050048828125, + "kimi_kl": 0.005035400390625, + "learning_rate": 4.975e-07, "loss": 0.0002, - "ppl": 0.2275390625, - "reward": 0.6477884352207184, - "reward_std": 0.302707776427269, - "rewards/perpo_ocr_edit_distance_reward": 0.6477884948253632, + "ppl": 0.134765625, + "reward": 0.7186028361320496, + "reward_std": 0.33224740624427795, + "rewards/perpo_ocr_edit_distance_reward": 0.7186028957366943, "step": 25, "temperature": 0.9 }, { - "advantages": 3.4059797648922086e-08, - "completion_length": 405.0, - "delta_ref_entropy_loss": -0.0151519775390625, - "delta_ref_ppl": 0.00372314453125, - "entropy_loss": -0.250244140625, - "epoch": 0.0104, - "grad_norm": 6.196855121517024, - "k1_kl": -0.00405120849609375, - "k3_kl": 0.01132965087890625, - "kimi_kl": 0.00904083251953125, - "learning_rate": 4.948e-07, - "loss": 0.0005, - "ppl": 0.177978515625, - "reward": 0.5746809095144272, - "reward_std": 0.41203801333904266, - "rewards/perpo_ocr_edit_distance_reward": 0.5746809244155884, + "advantages": -3.661428422674362e-07, + "completion_length": 577.0, + "delta_ref_entropy_loss": 0.00066375732421875, + "delta_ref_ppl": -0.0014801025390625, + "entropy_loss": -0.10888671875, + "epoch": 0.0052, + "grad_norm": 3.021917844669352, + "k1_kl": 0.00159454345703125, + "k3_kl": 0.00058746337890625, + "kimi_kl": 0.000766754150390625, + "learning_rate": 4.974e-07, + "loss": 0.0, + "ppl": 0.05859375, + "reward": 0.894582986831665, + "reward_std": 0.11495194584131241, + "rewards/perpo_ocr_edit_distance_reward": 0.8945830464363098, "step": 26, "temperature": 0.9 }, { - "advantages": -7.067407992167318e-07, - "completion_length": 461.5, - "delta_ref_entropy_loss": -0.009024381637573242, - "delta_ref_ppl": 0.0036449432373046875, - "entropy_loss": -0.3154296875, - "epoch": 0.0108, - "grad_norm": 6.350000112621851, - "k1_kl": -0.0038661956787109375, - "k3_kl": 0.025068283081054688, - "kimi_kl": 0.014822006225585938, - "learning_rate": 4.946e-07, - "loss": 0.001, - "ppl": 0.22265625, - "reward": 0.6071482449769974, - "reward_std": 0.11826272681355476, - "rewards/perpo_ocr_edit_distance_reward": 0.6071483194828033, + "advantages": 1.7029899268550253e-08, + "completion_length": 781.0, + "delta_ref_entropy_loss": -0.038818359375, + "delta_ref_ppl": 0.0625, + "entropy_loss": -0.328125, + "epoch": 0.0054, + "grad_norm": 36.225830861880326, + "k1_kl": -0.06298828125, + "k3_kl": 0.02978515625, + "kimi_kl": 0.0235595703125, + "learning_rate": 4.973e-07, + "loss": 0.0012, + "ppl": 0.201171875, + "reward": 0.5297855734825134, + "reward_std": 0.4711359739303589, + "rewards/perpo_ocr_edit_distance_reward": 0.5297855734825134, "step": 27, "temperature": 0.9 }, { - "advantages": -7.237707233542778e-08, - "completion_length": 1277.5, - "delta_ref_entropy_loss": -0.131683349609375, - "delta_ref_ppl": 0.0089874267578125, - "entropy_loss": -0.44287109375, - "epoch": 0.0112, - "grad_norm": 59.29825316871837, - "k1_kl": -0.0086517333984375, - "k3_kl": 0.01564788818359375, - "kimi_kl": 0.013072967529296875, - "learning_rate": 4.944e-07, - "loss": 0.0006, - "ppl": 0.30419921875, - "reward": 0.44214316457509995, - "reward_std": 0.30047257244586945, - "rewards/perpo_ocr_edit_distance_reward": 0.44214320182800293, + "advantages": 2.55448497910038e-08, + "completion_length": 60.0, + "delta_ref_entropy_loss": -0.039306640625, + "delta_ref_ppl": 0.0247802734375, + "entropy_loss": -0.9921875, + "epoch": 0.0056, + "grad_norm": 14.82478188886586, + "k1_kl": -0.02099609375, + "k3_kl": 0.06494140625, + "kimi_kl": 0.036865234375, + "learning_rate": 4.972e-07, + "loss": 0.0026, + "ppl": 0.6953125, + "reward": 0.4065273702144623, + "reward_std": 0.44532540440559387, + "rewards/perpo_ocr_edit_distance_reward": 0.40652740001678467, "step": 28, "temperature": 0.9 }, { - "advantages": -5.449567908044628e-07, - "completion_length": 607.0, - "delta_ref_entropy_loss": 0.00055694580078125, - "delta_ref_ppl": -0.0011425018310546875, - "entropy_loss": -0.197021484375, - "epoch": 0.0116, - "grad_norm": 2.373939129527344, - "k1_kl": 0.0014286041259765625, - "k3_kl": 0.0005970001220703125, - "kimi_kl": 0.0007219314575195312, - "learning_rate": 4.942e-07, + "advantages": -2.741813887041644e-06, + "completion_length": 825.0, + "delta_ref_entropy_loss": -0.00066375732421875, + "delta_ref_ppl": -0.00019168853759765625, + "entropy_loss": -0.04638671875, + "epoch": 0.0058, + "grad_norm": 0.71686784586146, + "k1_kl": 0.00019168853759765625, + "k3_kl": 0.00012063980102539062, + "kimi_kl": 0.0001583099365234375, + "learning_rate": 4.971e-07, "loss": 0.0, - "ppl": 0.11328125, - "reward": 0.901388555765152, - "reward_std": 0.12919071968644857, - "rewards/perpo_ocr_edit_distance_reward": 0.901388555765152, + "ppl": 0.01806640625, + "reward": 0.9811695218086243, + "reward_std": 0.01241773460060358, + "rewards/perpo_ocr_edit_distance_reward": 0.981169581413269, "step": 29, "temperature": 0.9 }, { - "advantages": -1.7072473568191526e-06, - "completion_length": 251.5, - "delta_ref_entropy_loss": -0.017910003662109375, - "delta_ref_ppl": 0.010087966918945312, - "entropy_loss": -0.26416015625, - "epoch": 0.012, - "grad_norm": 8.249826425800137, - "k1_kl": -0.009368896484375, - "k3_kl": 0.014716625213623047, - "kimi_kl": 0.011332035064697266, - "learning_rate": 4.94e-07, - "loss": 0.0006, - "ppl": 0.2340087890625, - "reward": 0.802297055721283, - "reward_std": 0.24577475851401687, - "rewards/perpo_ocr_edit_distance_reward": 0.802297055721283, + "advantages": -6.811959707420101e-08, + "completion_length": 2048.0, + "delta_ref_entropy_loss": -0.0191650390625, + "delta_ref_ppl": 0.0257568359375, + "entropy_loss": -0.9375, + "epoch": 0.006, + "grad_norm": 17.35637982282526, + "k1_kl": -0.0257568359375, + "k3_kl": 0.02099609375, + "kimi_kl": 0.016357421875, + "learning_rate": 4.97e-07, + "loss": 0.0008, + "ppl": 0.5859375, + "reward": 0.25435078144073486, + "reward_std": 0.26854953169822693, + "rewards/perpo_ocr_edit_distance_reward": 0.25435081124305725, "step": 30, "temperature": 0.9 }, { - "advantages": -6.386212021425308e-07, - "completion_length": 833.0, - "delta_ref_entropy_loss": 0.0005578994750976562, - "delta_ref_ppl": -0.001697540283203125, - "entropy_loss": -0.1201171875, - "epoch": 0.0124, - "grad_norm": 1.023794030483368, - "k1_kl": 0.001697540283203125, - "k3_kl": 0.0009632110595703125, - "kimi_kl": 0.0012969970703125, - "learning_rate": 4.938e-07, - "loss": 0.0, - "ppl": 0.0621337890625, - "reward": 0.7501908540725708, - "reward_std": 0.14883649721741676, - "rewards/perpo_ocr_edit_distance_reward": 0.7501909136772156, + "advantages": -1.021793991640152e-07, + "completion_length": 425.0, + "delta_ref_entropy_loss": -0.000244140625, + "delta_ref_ppl": -0.0068359375, + "entropy_loss": -0.22265625, + "epoch": 0.0062, + "grad_norm": 12.55085154313392, + "k1_kl": 0.006591796875, + "k3_kl": 0.00180816650390625, + "kimi_kl": 0.002532958984375, + "learning_rate": 4.969e-07, + "loss": 0.0001, + "ppl": 0.12158203125, + "reward": 0.855750322341919, + "reward_std": 0.34296461939811707, + "rewards/perpo_ocr_edit_distance_reward": 0.8557503819465637, "step": 31, "temperature": 0.9 }, { - "advantages": -1.1069434435739822e-07, - "completion_length": 586.0, - "delta_ref_entropy_loss": -0.057373046875, - "delta_ref_ppl": -0.01770782470703125, - "entropy_loss": -0.458984375, - "epoch": 0.0128, - "grad_norm": 11.848568761632743, - "k1_kl": 0.01776123046875, - "k3_kl": 0.01702117919921875, - "kimi_kl": 0.0247039794921875, - "learning_rate": 4.935999999999999e-07, - "loss": 0.0007, - "ppl": 0.29443359375, - "reward": 0.4163528382778168, - "reward_std": 0.25631335377693176, - "rewards/perpo_ocr_edit_distance_reward": 0.41635285317897797, + "advantages": -1.3623919414840202e-07, + "completion_length": 64.0, + "delta_ref_entropy_loss": -0.031494140625, + "delta_ref_ppl": 0.006195068359375, + "entropy_loss": -0.39453125, + "epoch": 0.0064, + "grad_norm": 11.723837452805517, + "k1_kl": -0.0059814453125, + "k3_kl": 0.041015625, + "kimi_kl": 0.0206298828125, + "learning_rate": 4.968e-07, + "loss": 0.0016, + "ppl": 0.2890625, + "reward": 0.45818522572517395, + "reward_std": 0.27222874760627747, + "rewards/perpo_ocr_edit_distance_reward": 0.4581852853298187, "step": 32, "temperature": 0.9 }, { - "advantages": -1.319817215517105e-07, - "completion_length": 1239.5, - "delta_ref_entropy_loss": 0.00054168701171875, - "delta_ref_ppl": -0.0014095306396484375, - "entropy_loss": -0.27294921875, - "epoch": 0.0132, - "grad_norm": 3.3666051861741106, - "k1_kl": 0.0011425018310546875, - "k3_kl": 0.0006422996520996094, - "kimi_kl": 0.000797271728515625, - "learning_rate": 4.934e-07, + "advantages": -4.427773774295929e-07, + "completion_length": 963.0, + "delta_ref_entropy_loss": 0.0020294189453125, + "delta_ref_ppl": -0.00150299072265625, + "entropy_loss": -0.1748046875, + "epoch": 0.0066, + "grad_norm": 1.0357500447776093, + "k1_kl": 0.00139617919921875, + "k3_kl": 0.0003147125244140625, + "kimi_kl": 0.00049591064453125, + "learning_rate": 4.966999999999999e-07, "loss": 0.0, - "ppl": 0.149169921875, - "reward": 0.4846806973218918, - "reward_std": 0.21668342500925064, - "rewards/perpo_ocr_edit_distance_reward": 0.48468074202537537, + "ppl": 0.10009765625, + "reward": 0.5934605598449707, + "reward_std": 0.17061449587345123, + "rewards/perpo_ocr_edit_distance_reward": 0.5934606790542603, "step": 33, "temperature": 0.9 }, { - "advantages": -1.9328935643159184e-06, - "completion_length": 309.0, - "delta_ref_entropy_loss": -0.0028533935546875, - "delta_ref_ppl": -0.002716064453125, - "entropy_loss": -0.0638427734375, - "epoch": 0.0136, - "grad_norm": 1.6382488231978785, - "k1_kl": 0.0027008056640625, - "k3_kl": 0.00220489501953125, - "kimi_kl": 0.00302886962890625, - "learning_rate": 4.932e-07, - "loss": 0.0001, - "ppl": 0.02825927734375, - "reward": 0.6583679467439651, - "reward_std": 0.20517196971923113, - "rewards/perpo_ocr_edit_distance_reward": 0.6583680361509323, + "advantages": -3.4059798537100505e-08, + "completion_length": 610.0, + "delta_ref_entropy_loss": -0.0115966796875, + "delta_ref_ppl": -0.016845703125, + "entropy_loss": -0.455078125, + "epoch": 0.0068, + "grad_norm": 14.396145747636389, + "k1_kl": 0.015869140625, + "k3_kl": 0.006378173828125, + "kimi_kl": 0.00927734375, + "learning_rate": 4.965999999999999e-07, + "loss": 0.0003, + "ppl": 0.25, + "reward": 0.5330965518951416, + "reward_std": 0.40035659074783325, + "rewards/perpo_ocr_edit_distance_reward": 0.5330965518951416, "step": 34, "temperature": 0.9 }, { - "advantages": -2.55448497910038e-08, - "completion_length": 334.5, - "delta_ref_entropy_loss": -0.01288604736328125, - "delta_ref_ppl": 0.01123809814453125, - "entropy_loss": -0.263671875, - "epoch": 0.014, - "grad_norm": 9.136482364797185, - "k1_kl": -0.0114593505859375, - "k3_kl": 0.03639411926269531, - "kimi_kl": 0.019788742065429688, - "learning_rate": 4.93e-07, - "loss": 0.0015, - "ppl": 0.19189453125, - "reward": 0.8896432816982269, - "reward_std": 0.1935592908412218, - "rewards/perpo_ocr_edit_distance_reward": 0.8896432518959045, + "advantages": 0.0, + "completion_length": 1093.0, + "delta_ref_entropy_loss": 0.00616455078125, + "delta_ref_ppl": 0.055908203125, + "entropy_loss": -0.93359375, + "epoch": 0.007, + "grad_norm": 118.72114612647103, + "k1_kl": -0.057373046875, + "k3_kl": 0.0166015625, + "kimi_kl": 0.01507568359375, + "learning_rate": 4.964999999999999e-07, + "loss": 0.0007, + "ppl": 0.609375, + "reward": 0.6425904631614685, + "reward_std": 0.42398491501808167, + "rewards/perpo_ocr_edit_distance_reward": 0.6425905227661133, "step": 35, "temperature": 0.9 }, { - "advantages": -5.10896995820076e-08, - "completion_length": 638.0, - "delta_ref_entropy_loss": -0.002895355224609375, - "delta_ref_ppl": -0.001171112060546875, - "entropy_loss": -0.23095703125, - "epoch": 0.0144, - "grad_norm": 4.882960777377076, - "k1_kl": 0.0012226104736328125, - "k3_kl": 0.00145721435546875, - "kimi_kl": 0.00200653076171875, - "learning_rate": 4.928e-07, - "loss": 0.0001, - "ppl": 0.13055419921875, - "reward": 0.8734707534313202, - "reward_std": 0.19395603286102414, - "rewards/perpo_ocr_edit_distance_reward": 0.873470813035965, + "advantages": 3.193106223875475e-09, + "completion_length": 749.0, + "delta_ref_entropy_loss": -0.004638671875, + "delta_ref_ppl": -0.005279541015625, + "entropy_loss": -0.2412109375, + "epoch": 0.0072, + "grad_norm": 26.061132121641638, + "k1_kl": 0.004791259765625, + "k3_kl": 0.005645751953125, + "kimi_kl": 0.0062255859375, + "learning_rate": 4.964e-07, + "loss": 0.0002, + "ppl": 0.1796875, + "reward": 0.7327725887298584, + "reward_std": 0.33731359243392944, + "rewards/perpo_ocr_edit_distance_reward": 0.7327725887298584, "step": 36, "temperature": 0.9 }, { - "advantages": -1.2772424540230531e-07, - "completion_length": 1288.5, - "delta_ref_entropy_loss": 0.002197265625, - "delta_ref_ppl": 0.0028743743896484375, - "entropy_loss": -0.199951171875, - "epoch": 0.0148, - "grad_norm": 3.0870502054626527, - "k1_kl": -0.00283050537109375, - "k3_kl": 0.0023565292358398438, - "kimi_kl": 0.0020818710327148438, - "learning_rate": 4.926e-07, - "loss": 0.0001, - "ppl": 0.105712890625, - "reward": 0.6827974915504456, - "reward_std": 0.27184218913316727, - "rewards/perpo_ocr_edit_distance_reward": 0.6827975511550903, + "advantages": -1.7029899268550253e-08, + "completion_length": 375.0, + "delta_ref_entropy_loss": -0.0023651123046875, + "delta_ref_ppl": -0.019287109375, + "entropy_loss": -0.53515625, + "epoch": 0.0074, + "grad_norm": 10.214778559857658, + "k1_kl": 0.0189208984375, + "k3_kl": 0.01019287109375, + "kimi_kl": 0.018310546875, + "learning_rate": 4.963e-07, + "loss": 0.0004, + "ppl": 0.31640625, + "reward": 0.5909614562988281, + "reward_std": 0.2595179080963135, + "rewards/perpo_ocr_edit_distance_reward": 0.5909614562988281, "step": 37, "temperature": 0.9 }, { - "advantages": -5.513429925940727e-07, - "completion_length": 312.0, - "delta_ref_entropy_loss": 0.00501251220703125, - "delta_ref_ppl": -0.00266265869140625, - "entropy_loss": -0.1763916015625, - "epoch": 0.0152, - "grad_norm": 3.3439380164030714, - "k1_kl": 0.002716064453125, - "k3_kl": 0.0020198822021484375, - "kimi_kl": 0.0023479461669921875, - "learning_rate": 4.923999999999999e-07, - "loss": 0.0001, - "ppl": 0.09881591796875, - "reward": 0.7919158339500427, - "reward_std": 0.15615028887987137, - "rewards/perpo_ocr_edit_distance_reward": 0.7919159233570099, + "advantages": -5.10896995820076e-08, + "completion_length": 274.0, + "delta_ref_entropy_loss": -0.0010833740234375, + "delta_ref_ppl": -0.0072021484375, + "entropy_loss": -0.11474609375, + "epoch": 0.0076, + "grad_norm": 3.715028119348908, + "k1_kl": 0.0072021484375, + "k3_kl": 0.006072998046875, + "kimi_kl": 0.00909423828125, + "learning_rate": 4.961999999999999e-07, + "loss": 0.0002, + "ppl": 0.05908203125, + "reward": 0.6364915370941162, + "reward_std": 0.1834774613380432, + "rewards/perpo_ocr_edit_distance_reward": 0.6364915370941162, "step": 38, "temperature": 0.9 }, { - "advantages": -1.6178404171895977e-07, - "completion_length": 924.5, - "delta_ref_entropy_loss": 0.000518798828125, - "delta_ref_ppl": -0.0029449462890625, - "entropy_loss": -0.19580078125, - "epoch": 0.0156, - "grad_norm": 14.309207597233723, - "k1_kl": 0.00296783447265625, - "k3_kl": 0.0019683837890625, - "kimi_kl": 0.002864837646484375, - "learning_rate": 4.922e-07, - "loss": 0.0001, - "ppl": 0.115966796875, - "reward": 0.8567204177379608, - "reward_std": 0.2289358153939247, - "rewards/perpo_ocr_edit_distance_reward": 0.8567204773426056, + "advantages": -3.4059798537100505e-08, + "completion_length": 208.0, + "delta_ref_entropy_loss": -0.0034942626953125, + "delta_ref_ppl": -0.0086669921875, + "entropy_loss": -0.2255859375, + "epoch": 0.0078, + "grad_norm": 2.9289631265270937, + "k1_kl": 0.0086669921875, + "k3_kl": 0.005462646484375, + "kimi_kl": 0.01043701171875, + "learning_rate": 4.961e-07, + "loss": 0.0002, + "ppl": 0.1181640625, + "reward": 0.7322698831558228, + "reward_std": 0.26770949363708496, + "rewards/perpo_ocr_edit_distance_reward": 0.7322698831558228, "step": 39, "temperature": 0.9 }, { - "advantages": -5.513429965020578e-06, - "completion_length": 1034.0, - "delta_ref_entropy_loss": -0.001010894775390625, - "delta_ref_ppl": -0.0008449554443359375, - "entropy_loss": -0.16552734375, - "epoch": 0.016, - "grad_norm": 1.0354123216063829, - "k1_kl": 0.0008792877197265625, - "k3_kl": 0.0009021759033203125, - "kimi_kl": 0.00107574462890625, - "learning_rate": 4.92e-07, + "advantages": 2.7247838829680404e-07, + "completion_length": 456.0, + "delta_ref_entropy_loss": -6.961822509765625e-05, + "delta_ref_ppl": -0.001220703125, + "entropy_loss": -0.11279296875, + "epoch": 0.008, + "grad_norm": 1.5474287194242315, + "k1_kl": 0.0011138916015625, + "k3_kl": 0.000308990478515625, + "kimi_kl": 0.000415802001953125, + "learning_rate": 4.96e-07, "loss": 0.0, - "ppl": 0.0860595703125, - "reward": 0.780910313129425, - "reward_std": 0.10938960174098611, - "rewards/perpo_ocr_edit_distance_reward": 0.7809103429317474, + "ppl": 0.061767578125, + "reward": 0.7656471133232117, + "reward_std": 0.03278651088476181, + "rewards/perpo_ocr_edit_distance_reward": 0.7656471133232117, "step": 40, "temperature": 0.9 }, { - "advantages": -1.4901162082026076e-07, - "completion_length": 450.0, - "delta_ref_entropy_loss": 0.00264739990234375, - "delta_ref_ppl": -0.01718902587890625, - "entropy_loss": -0.29345703125, - "epoch": 0.0164, - "grad_norm": 3.5328764596701334, - "k1_kl": 0.017303466796875, - "k3_kl": 0.011180877685546875, - "kimi_kl": 0.01818084716796875, - "learning_rate": 4.918e-07, - "loss": 0.0004, - "ppl": 0.157958984375, - "reward": 0.7212093770503998, - "reward_std": 0.2458159700036049, - "rewards/perpo_ocr_edit_distance_reward": 0.7212094366550446, + "advantages": -2.895082786835701e-07, + "completion_length": 175.0, + "delta_ref_entropy_loss": -0.00164031982421875, + "delta_ref_ppl": -0.0031890869140625, + "entropy_loss": -0.055908203125, + "epoch": 0.0082, + "grad_norm": 3.654977357166755, + "k1_kl": 0.0032501220703125, + "k3_kl": 0.0019378662109375, + "kimi_kl": 0.0027923583984375, + "learning_rate": 4.959e-07, + "loss": 0.0001, + "ppl": 0.0269775390625, + "reward": 0.9355349540710449, + "reward_std": 0.12724746763706207, + "rewards/perpo_ocr_edit_distance_reward": 0.9355349540710449, "step": 41, "temperature": 0.9 }, { - "advantages": 4.274504590284778e-06, - "completion_length": 875.0, - "delta_ref_entropy_loss": 0.0020751953125, - "delta_ref_ppl": -0.0014324188232421875, - "entropy_loss": -0.057861328125, - "epoch": 0.0168, - "grad_norm": 0.6110669656145786, - "k1_kl": 0.00150299072265625, - "k3_kl": 0.0005092620849609375, - "kimi_kl": 0.0006313323974609375, - "learning_rate": 4.916e-07, - "loss": 0.0, - "ppl": 0.027862548828125, - "reward": 0.9445763230323792, - "reward_std": 0.013880635233363137, - "rewards/perpo_ocr_edit_distance_reward": 0.9445763826370239, + "advantages": 1.0464873412274756e-05, + "completion_length": 441.0, + "delta_ref_entropy_loss": -0.003265380859375, + "delta_ref_ppl": -0.004638671875, + "entropy_loss": -0.037109375, + "epoch": 0.0084, + "grad_norm": 0.4732533682962285, + "k1_kl": 0.004638671875, + "k3_kl": 0.0031585693359375, + "kimi_kl": 0.006378173828125, + "learning_rate": 4.958e-07, + "loss": 0.0001, + "ppl": 0.012939453125, + "reward": 0.9839690923690796, + "reward_std": 0.0007136325584724545, + "rewards/perpo_ocr_edit_distance_reward": 0.9839690327644348, "step": 42, "temperature": 0.9 }, { - "advantages": -1.4475413934178505e-07, - "completion_length": 589.0, - "delta_ref_entropy_loss": 0.001186370849609375, - "delta_ref_ppl": -0.00585174560546875, - "entropy_loss": -0.193359375, - "epoch": 0.0172, - "grad_norm": 1.58430752802689, - "k1_kl": 0.00585174560546875, - "k3_kl": 0.003841400146484375, - "kimi_kl": 0.00586700439453125, - "learning_rate": 4.914e-07, - "loss": 0.0002, - "ppl": 0.105224609375, - "reward": 0.6727547645568848, - "reward_std": 0.23246826231479645, - "rewards/perpo_ocr_edit_distance_reward": 0.6727548241615295, + "advantages": -6.811959565311554e-07, + "completion_length": 300.0, + "delta_ref_entropy_loss": 0.005859375, + "delta_ref_ppl": -0.0037384033203125, + "entropy_loss": -0.13671875, + "epoch": 0.0086, + "grad_norm": 2.5036665919137153, + "k1_kl": 0.00384521484375, + "k3_kl": 0.002105712890625, + "kimi_kl": 0.00238037109375, + "learning_rate": 4.957e-07, + "loss": 0.0001, + "ppl": 0.06494140625, + "reward": 0.4568319022655487, + "reward_std": 0.08604375272989273, + "rewards/perpo_ocr_edit_distance_reward": 0.4568319618701935, "step": 43, "temperature": 0.9 }, { - "advantages": -2.0435879122260303e-07, - "completion_length": 512.0, - "delta_ref_entropy_loss": -0.0030364990234375, - "delta_ref_ppl": -0.0067596435546875, - "entropy_loss": -0.1962890625, - "epoch": 0.0176, - "grad_norm": 1.9583347649056009, - "k1_kl": 0.006805419921875, - "k3_kl": 0.004535675048828125, - "kimi_kl": 0.01056671142578125, - "learning_rate": 4.912e-07, - "loss": 0.0002, - "ppl": 0.1162109375, - "reward": 0.7387199103832245, - "reward_std": 0.2879932373762131, - "rewards/perpo_ocr_edit_distance_reward": 0.7387199699878693, + "advantages": 3.4059798537100505e-08, + "completion_length": 574.0, + "delta_ref_entropy_loss": -0.002166748046875, + "delta_ref_ppl": -0.00194549560546875, + "entropy_loss": -0.27734375, + "epoch": 0.0088, + "grad_norm": 7.677478772549626, + "k1_kl": 0.0016632080078125, + "k3_kl": 0.001922607421875, + "kimi_kl": 0.00194549560546875, + "learning_rate": 4.956e-07, + "loss": 0.0001, + "ppl": 0.1630859375, + "reward": 0.6083736419677734, + "reward_std": 0.2800128161907196, + "rewards/perpo_ocr_edit_distance_reward": 0.6083736419677734, "step": 44, "temperature": 0.9 }, { - "advantages": -1.720445584396657e-05, - "completion_length": 452.5, - "delta_ref_entropy_loss": 0.0029144287109375, - "delta_ref_ppl": -0.00225830078125, - "entropy_loss": -0.052490234375, - "epoch": 0.018, - "grad_norm": 0.7883200112428175, - "k1_kl": 0.002246856689453125, - "k3_kl": 0.0008373260498046875, - "kimi_kl": 0.00104522705078125, - "learning_rate": 4.909999999999999e-07, - "loss": 0.0001, - "ppl": 0.02276611328125, - "reward": 0.9877221882343292, - "reward_std": 0.004720638855360448, - "rewards/perpo_ocr_edit_distance_reward": 0.987722247838974, + "advantages": 1.532690987460228e-07, + "completion_length": 37.0, + "delta_ref_entropy_loss": -0.0098876953125, + "delta_ref_ppl": -0.000820159912109375, + "entropy_loss": -0.0869140625, + "epoch": 0.009, + "grad_norm": 5.758925560898339, + "k1_kl": 0.0008087158203125, + "k3_kl": 0.0047607421875, + "kimi_kl": 0.00531005859375, + "learning_rate": 4.955e-07, + "loss": 0.0002, + "ppl": 0.038330078125, + "reward": 0.8440518379211426, + "reward_std": 0.3213229477405548, + "rewards/perpo_ocr_edit_distance_reward": 0.8440517783164978, "step": 45, "temperature": 0.9 }, { - "advantages": -9.28555226522576e-05, - "completion_length": 1377.5, - "delta_ref_entropy_loss": 0.00168609619140625, - "delta_ref_ppl": -0.0009860992431640625, - "entropy_loss": -0.10235595703125, - "epoch": 0.0184, - "grad_norm": 0.6167509104909638, - "k1_kl": 0.0010585784912109375, - "k3_kl": 0.0003566741943359375, - "kimi_kl": 0.000514984130859375, - "learning_rate": 4.908e-07, - "loss": 0.0001, - "ppl": 0.0518798828125, - "reward": 0.7388620525598526, - "reward_std": 0.08199398677243153, - "rewards/perpo_ocr_edit_distance_reward": 0.7388621121644974, + "advantages": -1.2261527899681823e-06, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.00139617919921875, + "delta_ref_ppl": -0.0006103515625, + "entropy_loss": -0.06689453125, + "epoch": 0.0092, + "grad_norm": 1.0978315869401338, + "k1_kl": 0.000579833984375, + "k3_kl": 0.000457763671875, + "kimi_kl": 0.000591278076171875, + "learning_rate": 4.954e-07, + "loss": 0.0, + "ppl": 0.036865234375, + "reward": 0.9418818354606628, + "reward_std": 0.04837515950202942, + "rewards/perpo_ocr_edit_distance_reward": 0.9418818950653076, "step": 46, "temperature": 0.9 }, { - "advantages": -2.0861627314516795e-07, - "completion_length": 406.0, - "delta_ref_entropy_loss": -0.01477813720703125, - "delta_ref_ppl": -0.0040283203125, - "entropy_loss": -0.4765625, - "epoch": 0.0188, - "grad_norm": 5.213151182379237, - "k1_kl": 0.00365447998046875, - "k3_kl": 0.00452423095703125, - "kimi_kl": 0.00641632080078125, - "learning_rate": 4.905999999999999e-07, - "loss": 0.0002, - "ppl": 0.3017578125, - "reward": 0.7630270719528198, - "reward_std": 0.2968149483203888, - "rewards/perpo_ocr_edit_distance_reward": 0.7630271315574646, + "advantages": -1.021793991640152e-07, + "completion_length": 572.0, + "delta_ref_entropy_loss": -0.00139617919921875, + "delta_ref_ppl": -0.0003147125244140625, + "entropy_loss": -0.33984375, + "epoch": 0.0094, + "grad_norm": 4.160244700109368, + "k1_kl": 0.000751495361328125, + "k3_kl": 0.007568359375, + "kimi_kl": 0.00408935546875, + "learning_rate": 4.953e-07, + "loss": 0.0003, + "ppl": 0.2041015625, + "reward": 0.49115055799484253, + "reward_std": 0.3005208373069763, + "rewards/perpo_ocr_edit_distance_reward": 0.4911506175994873, "step": 47, "temperature": 0.9 }, { - "advantages": -8.51494981191081e-08, - "completion_length": 872.5, - "delta_ref_entropy_loss": 0.003936767578125, - "delta_ref_ppl": -0.00543975830078125, - "entropy_loss": -0.144287109375, - "epoch": 0.0192, - "grad_norm": 3.5317876707952074, - "k1_kl": 0.0054779052734375, - "k3_kl": 0.0033111572265625, - "kimi_kl": 0.005001068115234375, - "learning_rate": 4.904e-07, - "loss": 0.0001, - "ppl": 0.0780029296875, - "reward": 0.797523558139801, - "reward_std": 0.22056421637535095, - "rewards/perpo_ocr_edit_distance_reward": 0.7975236177444458, + "advantages": -1.6859600009411224e-06, + "completion_length": 830.0, + "delta_ref_entropy_loss": 0.0020294189453125, + "delta_ref_ppl": -0.0014801025390625, + "entropy_loss": -0.06982421875, + "epoch": 0.0096, + "grad_norm": 1.0154837496766076, + "k1_kl": 0.0015411376953125, + "k3_kl": 0.00101470947265625, + "kimi_kl": 0.00125885009765625, + "learning_rate": 4.951999999999999e-07, + "loss": 0.0, + "ppl": 0.032958984375, + "reward": 0.9525266289710999, + "reward_std": 0.04560462757945061, + "rewards/perpo_ocr_edit_distance_reward": 0.9525266885757446, "step": 48, "temperature": 0.9 }, { - "advantages": -6.811959885055785e-08, - "completion_length": 578.5, - "delta_ref_entropy_loss": -0.006578922271728516, - "delta_ref_ppl": 0.01790618896484375, - "entropy_loss": -0.87353515625, - "epoch": 0.0196, - "grad_norm": 29.16199000263415, - "k1_kl": -0.01888275146484375, - "k3_kl": 0.022571563720703125, - "kimi_kl": 0.01567840576171875, - "learning_rate": 4.902e-07, - "loss": 0.0009, - "ppl": 0.6771240234375, - "reward": 0.44000351428985596, - "reward_std": 0.2129536122083664, - "rewards/perpo_ocr_edit_distance_reward": 0.44000357389450073, + "advantages": -2.2564616131148796e-07, + "completion_length": 1095.0, + "delta_ref_entropy_loss": 0.00125885009765625, + "delta_ref_ppl": -0.00177764892578125, + "entropy_loss": -0.1669921875, + "epoch": 0.0098, + "grad_norm": 1.4647741573491118, + "k1_kl": 0.00186920166015625, + "k3_kl": 0.000789642333984375, + "kimi_kl": 0.000972747802734375, + "learning_rate": 4.950999999999999e-07, + "loss": 0.0, + "ppl": 0.087890625, + "reward": 0.7605360150337219, + "reward_std": 0.07680665701627731, + "rewards/perpo_ocr_edit_distance_reward": 0.7605360150337219, "step": 49, "temperature": 0.9 }, { - "advantages": -2.2649766009408268e-06, - "completion_length": 507.5, - "delta_ref_entropy_loss": -0.000835418701171875, - "delta_ref_ppl": -0.005584716796875, - "entropy_loss": -0.09521484375, - "epoch": 0.02, - "grad_norm": 1.0420679327760156, - "k1_kl": 0.0055694580078125, - "k3_kl": 0.0044708251953125, - "kimi_kl": 0.005096435546875, - "learning_rate": 4.9e-07, - "loss": 0.0002, - "ppl": 0.050537109375, - "reward": 0.9364235401153564, - "reward_std": 0.06070804502815008, - "rewards/perpo_ocr_edit_distance_reward": 0.9364235699176788, + "advantages": -6.811959707420101e-08, + "completion_length": 2048.0, + "delta_ref_entropy_loss": -0.08935546875, + "delta_ref_ppl": -0.162109375, + "entropy_loss": -1.0546875, + "epoch": 0.01, + "grad_norm": 246.6426353444632, + "k1_kl": 0.1640625, + "k3_kl": 0.1650390625, + "kimi_kl": 0.322265625, + "learning_rate": 4.95e-07, + "loss": 0.0066, + "ppl": 0.53515625, + "reward": 0.28370338678359985, + "reward_std": 0.28045013546943665, + "rewards/perpo_ocr_edit_distance_reward": 0.28370341658592224, "step": 50, "temperature": 0.9 }, { - "advantages": -2.1061728538995794e-05, - "completion_length": 1233.5, - "delta_ref_entropy_loss": -0.000171661376953125, - "delta_ref_ppl": -0.0012454986572265625, - "entropy_loss": -0.17254638671875, - "epoch": 0.0204, - "grad_norm": 1.8213637078815244, - "k1_kl": 0.0009927749633789062, - "k3_kl": 0.001186370849609375, - "kimi_kl": 0.001346588134765625, - "learning_rate": 4.898e-07, - "loss": 0.0001, - "ppl": 0.091552734375, - "reward": 0.7217186838388443, - "reward_std": 0.0747278499766253, - "rewards/perpo_ocr_edit_distance_reward": 0.7217187285423279, + "advantages": -2.895082786835701e-07, + "completion_length": 630.0, + "delta_ref_entropy_loss": 0.002166748046875, + "delta_ref_ppl": -0.00543212890625, + "entropy_loss": -0.185546875, + "epoch": 0.0102, + "grad_norm": 3.194803915602411, + "k1_kl": 0.00537109375, + "k3_kl": 0.00823974609375, + "kimi_kl": 0.00665283203125, + "learning_rate": 4.949e-07, + "loss": 0.0003, + "ppl": 0.10693359375, + "reward": 0.4085654616355896, + "reward_std": 0.23334167897701263, + "rewards/perpo_ocr_edit_distance_reward": 0.40856555104255676, "step": 51, "temperature": 0.9 }, { - "advantages": -2.809933477010418e-07, - "completion_length": 543.0, - "delta_ref_entropy_loss": 0.0010433197021484375, - "delta_ref_ppl": -0.00388336181640625, - "entropy_loss": -0.150390625, - "epoch": 0.0208, - "grad_norm": 1.1618188037520902, - "k1_kl": 0.003875732421875, - "k3_kl": 0.00313568115234375, - "kimi_kl": 0.00411224365234375, - "learning_rate": 4.895999999999999e-07, - "loss": 0.0001, - "ppl": 0.083740234375, - "reward": 0.9169642627239227, - "reward_std": 0.09139490313827991, - "rewards/perpo_ocr_edit_distance_reward": 0.9169643223285675, + "advantages": -4.938671054333099e-07, + "completion_length": 496.0, + "delta_ref_entropy_loss": 0.00080108642578125, + "delta_ref_ppl": -0.00112152099609375, + "entropy_loss": -0.04296875, + "epoch": 0.0104, + "grad_norm": 0.8820244918394498, + "k1_kl": 0.00112152099609375, + "k3_kl": 0.000518798828125, + "kimi_kl": 0.000621795654296875, + "learning_rate": 4.948e-07, + "loss": 0.0, + "ppl": 0.0194091796875, + "reward": 0.9698258638381958, + "reward_std": 0.016944918781518936, + "rewards/perpo_ocr_edit_distance_reward": 0.9698259234428406, "step": 52, "temperature": 0.9 }, { - "advantages": -4.896096061912658e-08, - "completion_length": 106.0, - "delta_ref_entropy_loss": -0.015380859375, - "delta_ref_ppl": 0.0014190673828125, - "entropy_loss": -0.57666015625, - "epoch": 0.0212, - "grad_norm": 6.372194414803301, - "k1_kl": -0.002044677734375, - "k3_kl": 0.01312255859375, - "kimi_kl": 0.009979248046875, - "learning_rate": 4.894e-07, - "loss": 0.0005, - "ppl": 0.3775634765625, - "reward": 0.4019659161567688, - "reward_std": 0.1703728549182415, - "rewards/perpo_ocr_edit_distance_reward": 0.4019659161567688, + "advantages": 6.811959707420101e-08, + "completion_length": 904.0, + "delta_ref_entropy_loss": -0.003997802734375, + "delta_ref_ppl": -0.003143310546875, + "entropy_loss": -0.36328125, + "epoch": 0.0106, + "grad_norm": 12.177158898314568, + "k1_kl": 0.00396728515625, + "k3_kl": 0.0026397705078125, + "kimi_kl": 0.0031890869140625, + "learning_rate": 4.947e-07, + "loss": 0.0001, + "ppl": 0.2314453125, + "reward": 0.7898591160774231, + "reward_std": 0.329611599445343, + "rewards/perpo_ocr_edit_distance_reward": 0.7898591160774231, "step": 53, "temperature": 0.9 }, { - "advantages": -1.7029898913278885e-07, - "completion_length": 515.5, - "delta_ref_entropy_loss": -0.001068115234375, - "delta_ref_ppl": -0.0091705322265625, - "entropy_loss": -0.223876953125, - "epoch": 0.0216, - "grad_norm": 1.438183295102604, - "k1_kl": 0.00897216796875, - "k3_kl": 0.007598876953125, - "kimi_kl": 0.01708984375, - "learning_rate": 4.892e-07, - "loss": 0.0003, - "ppl": 0.1318359375, - "reward": 0.5907460749149323, - "reward_std": 0.29701510071754456, - "rewards/perpo_ocr_edit_distance_reward": 0.590746134519577, + "advantages": -1.021793991640152e-07, + "completion_length": 105.0, + "delta_ref_entropy_loss": -0.01141357421875, + "delta_ref_ppl": -0.004913330078125, + "entropy_loss": -0.462890625, + "epoch": 0.0108, + "grad_norm": 6.685360670779224, + "k1_kl": 0.00433349609375, + "k3_kl": 0.0029296875, + "kimi_kl": 0.004241943359375, + "learning_rate": 4.946e-07, + "loss": 0.0001, + "ppl": 0.27734375, + "reward": 0.46778321266174316, + "reward_std": 0.1947130262851715, + "rewards/perpo_ocr_edit_distance_reward": 0.46778321266174316, "step": 54, "temperature": 0.9 }, { - "advantages": 2.639634288925663e-07, - "completion_length": 313.5, - "delta_ref_entropy_loss": -0.0123748779296875, - "delta_ref_ppl": -0.00403594970703125, - "entropy_loss": -0.297607421875, - "epoch": 0.022, - "grad_norm": 1.728666753441499, - "k1_kl": 0.00379180908203125, - "k3_kl": 0.004039764404296875, - "kimi_kl": 0.004360198974609375, - "learning_rate": 4.89e-07, - "loss": 0.0002, - "ppl": 0.1778564453125, - "reward": 0.6526312679052353, - "reward_std": 0.18151800334453583, - "rewards/perpo_ocr_edit_distance_reward": 0.6526312977075577, + "advantages": -2.384185791015625e-07, + "completion_length": 2048.0, + "delta_ref_entropy_loss": -0.00020885467529296875, + "delta_ref_ppl": -0.0010986328125, + "entropy_loss": -0.0771484375, + "epoch": 0.011, + "grad_norm": 0.6197921584038099, + "k1_kl": 0.00104522705078125, + "k3_kl": 0.00032806396484375, + "kimi_kl": 0.0004863739013671875, + "learning_rate": 4.945e-07, + "loss": 0.0, + "ppl": 0.040771484375, + "reward": 0.06475960463285446, + "reward_std": 0.011339632794260979, + "rewards/perpo_ocr_edit_distance_reward": 0.06475960463285446, "step": 55, "temperature": 0.9 }, { - "advantages": 1.6178404393940582e-07, - "completion_length": 1722.0, - "delta_ref_entropy_loss": -0.0028553009033203125, - "delta_ref_ppl": -0.0020503997802734375, - "entropy_loss": -0.2021484375, - "epoch": 0.0224, - "grad_norm": 1.6749014664410677, - "k1_kl": 0.001827239990234375, - "k3_kl": 0.00330352783203125, - "kimi_kl": 0.00634765625, - "learning_rate": 4.888e-07, + "advantages": -2.6924270059680566e-05, + "completion_length": 546.0, + "delta_ref_entropy_loss": 0.0009765625, + "delta_ref_ppl": -0.0026702880859375, + "entropy_loss": -0.0306396484375, + "epoch": 0.0112, + "grad_norm": 0.5495303896371548, + "k1_kl": 0.002685546875, + "k3_kl": 0.0011749267578125, + "kimi_kl": 0.00153350830078125, + "learning_rate": 4.944e-07, "loss": 0.0001, - "ppl": 0.13134765625, - "reward": 0.540643960237503, - "reward_std": 0.16220474988222122, - "rewards/perpo_ocr_edit_distance_reward": 0.5406439751386642, + "ppl": 0.01397705078125, + "reward": 0.9832294583320618, + "reward_std": 0.0017987641040235758, + "rewards/perpo_ocr_edit_distance_reward": 0.9832296371459961, "step": 56, "temperature": 0.9 }, { - "advantages": -5.875315398640168e-07, - "completion_length": 996.0, - "delta_ref_entropy_loss": 0.00240325927734375, - "delta_ref_ppl": -0.00260162353515625, - "entropy_loss": -0.14306640625, - "epoch": 0.0228, - "grad_norm": 1.806701344959443, - "k1_kl": 0.002628326416015625, - "k3_kl": 0.001373291015625, - "kimi_kl": 0.0021038055419921875, - "learning_rate": 4.886e-07, - "loss": 0.0001, - "ppl": 0.0748291015625, - "reward": 0.8192134499549866, - "reward_std": 0.17550147511065006, - "rewards/perpo_ocr_edit_distance_reward": 0.8192135095596313, + "advantages": -4.938671054333099e-07, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.002716064453125, + "delta_ref_ppl": -0.000591278076171875, + "entropy_loss": -0.1396484375, + "epoch": 0.0114, + "grad_norm": 0.7719565310499005, + "k1_kl": 0.0006256103515625, + "k3_kl": 0.000522613525390625, + "kimi_kl": 0.00061798095703125, + "learning_rate": 4.943e-07, + "loss": 0.0, + "ppl": 0.08056640625, + "reward": 0.8058194518089294, + "reward_std": 0.22449247539043427, + "rewards/perpo_ocr_edit_distance_reward": 0.805819571018219, "step": 57, "temperature": 0.9 }, { - "advantages": -1.66041520088811e-07, - "completion_length": 1546.5, - "delta_ref_entropy_loss": -0.00080108642578125, - "delta_ref_ppl": -0.003002166748046875, - "entropy_loss": -0.1611328125, - "epoch": 0.0232, - "grad_norm": 1.5618154256550016, - "k1_kl": 0.003082275390625, - "k3_kl": 0.0052032470703125, - "kimi_kl": 0.007602691650390625, - "learning_rate": 4.884e-07, - "loss": 0.0002, - "ppl": 0.086669921875, - "reward": 0.4880093038082123, - "reward_std": 0.15291155874729156, - "rewards/perpo_ocr_edit_distance_reward": 0.48800933361053467, + "advantages": -1.4777695469092578e-05, + "completion_length": 623.0, + "delta_ref_entropy_loss": 0.00146484375, + "delta_ref_ppl": -0.003570556640625, + "entropy_loss": -0.06591796875, + "epoch": 0.0116, + "grad_norm": 1.3672155932762455, + "k1_kl": 0.003509521484375, + "k3_kl": 0.0018310546875, + "kimi_kl": 0.0023956298828125, + "learning_rate": 4.942e-07, + "loss": 0.0001, + "ppl": 0.031494140625, + "reward": 0.9705727100372314, + "reward_std": 0.004510798957198858, + "rewards/perpo_ocr_edit_distance_reward": 0.9705727696418762, "step": 58, "temperature": 0.9 }, { - "advantages": -2.0521029000519775e-06, - "completion_length": 240.5, - "delta_ref_entropy_loss": -0.01435089111328125, - "delta_ref_ppl": -0.04510498046875, - "entropy_loss": -0.11083984375, - "epoch": 0.0236, - "grad_norm": 3.3745336376241064, - "k1_kl": 0.045135498046875, - "k3_kl": 0.0350341796875, - "kimi_kl": 0.132781982421875, - "learning_rate": 4.882e-07, - "loss": 0.0014, - "ppl": 0.053955078125, - "reward": 0.974062442779541, - "reward_std": 0.045032053254544735, - "rewards/perpo_ocr_edit_distance_reward": 0.9740625619888306, + "advantages": -2.1713121896027587e-06, + "completion_length": 248.0, + "delta_ref_entropy_loss": 0.00390625, + "delta_ref_ppl": -0.0024261474609375, + "entropy_loss": -0.06396484375, + "epoch": 0.0118, + "grad_norm": 1.6306667564663828, + "k1_kl": 0.0023956298828125, + "k3_kl": 0.0002593994140625, + "kimi_kl": 0.0003643035888671875, + "learning_rate": 4.941e-07, + "loss": 0.0, + "ppl": 0.031005859375, + "reward": 0.9286463260650635, + "reward_std": 0.035305753350257874, + "rewards/perpo_ocr_edit_distance_reward": 0.9286463856697083, "step": 59, "temperature": 0.9 }, { - "advantages": 8.514949723092968e-08, - "completion_length": 554.5, - "delta_ref_entropy_loss": 0.0023040771484375, - "delta_ref_ppl": -0.0057525634765625, - "entropy_loss": -0.16015625, - "epoch": 0.024, - "grad_norm": 1.2650748827028686, - "k1_kl": 0.00555419921875, - "k3_kl": 0.004119873046875, - "kimi_kl": 0.0085296630859375, - "learning_rate": 4.879999999999999e-07, - "loss": 0.0002, - "ppl": 0.0859375, - "reward": 0.6619415581226349, - "reward_std": 0.19069994986057281, - "rewards/perpo_ocr_edit_distance_reward": 0.6619415581226349, + "advantages": -1.021793991640152e-07, + "completion_length": 246.0, + "delta_ref_entropy_loss": -0.0198974609375, + "delta_ref_ppl": 0.007720947265625, + "entropy_loss": -0.271484375, + "epoch": 0.012, + "grad_norm": 12.85420148749799, + "k1_kl": -0.0076904296875, + "k3_kl": 0.0216064453125, + "kimi_kl": 0.021728515625, + "learning_rate": 4.94e-07, + "loss": 0.0009, + "ppl": 0.232421875, + "reward": 0.8394764065742493, + "reward_std": 0.36705583333969116, + "rewards/perpo_ocr_edit_distance_reward": 0.8394764065742493, "step": 60, "temperature": 0.9 }, { - "advantages": -9.792191946189632e-08, - "completion_length": 798.5, - "delta_ref_entropy_loss": 0.0003509521484375, - "delta_ref_ppl": -0.0035943984985351562, - "entropy_loss": -0.14208984375, - "epoch": 0.0244, - "grad_norm": 1.096747886396917, - "k1_kl": 0.0035924911499023438, - "k3_kl": 0.0020904541015625, - "kimi_kl": 0.0048847198486328125, - "learning_rate": 4.878e-07, + "advantages": -1.532690987460228e-07, + "completion_length": 39.0, + "delta_ref_entropy_loss": -0.002471923828125, + "delta_ref_ppl": -0.00408935546875, + "entropy_loss": -0.1044921875, + "epoch": 0.0122, + "grad_norm": 8.533674428356633, + "k1_kl": 0.004119873046875, + "k3_kl": 0.0015106201171875, + "kimi_kl": 0.002197265625, + "learning_rate": 4.938999999999999e-07, "loss": 0.0001, - "ppl": 0.0709228515625, - "reward": 0.5900775492191315, - "reward_std": 0.2226782888174057, - "rewards/perpo_ocr_edit_distance_reward": 0.590077593922615, + "ppl": 0.05322265625, + "reward": 0.7062292098999023, + "reward_std": 0.28505656123161316, + "rewards/perpo_ocr_edit_distance_reward": 0.7062292695045471, "step": 61, "temperature": 0.9 }, { - "advantages": -2.1713121167721283e-07, - "completion_length": 1216.5, - "delta_ref_entropy_loss": 0.00396728515625, - "delta_ref_ppl": -0.0033636093139648438, - "entropy_loss": -0.137451171875, - "epoch": 0.0248, - "grad_norm": 0.8993497851045971, - "k1_kl": 0.0034265518188476562, - "k3_kl": 0.001644134521484375, - "kimi_kl": 0.0022125244140625, - "learning_rate": 4.876e-07, - "loss": 0.0001, - "ppl": 0.07440185546875, - "reward": 0.6916776746511459, - "reward_std": 0.0319757298566401, - "rewards/perpo_ocr_edit_distance_reward": 0.6916777193546295, + "advantages": 1.532690987460228e-07, + "completion_length": 1087.0, + "delta_ref_entropy_loss": 0.002716064453125, + "delta_ref_ppl": -0.00128936767578125, + "entropy_loss": -0.1279296875, + "epoch": 0.0124, + "grad_norm": 0.6163957691754441, + "k1_kl": 0.00146484375, + "k3_kl": 0.0005645751953125, + "kimi_kl": 0.000743865966796875, + "learning_rate": 4.938e-07, + "loss": 0.0, + "ppl": 0.06591796875, + "reward": 0.8215454816818237, + "reward_std": 0.10714931786060333, + "rewards/perpo_ocr_edit_distance_reward": 0.8215454816818237, "step": 62, "temperature": 0.9 }, { - "advantages": -6.601214877832717e-06, - "completion_length": 875.0, - "delta_ref_entropy_loss": 0.004163265228271484, - "delta_ref_ppl": -0.004180908203125, - "entropy_loss": -0.08258056640625, - "epoch": 0.0252, - "grad_norm": 0.9476448822743326, - "k1_kl": 0.00417327880859375, - "k3_kl": 0.002140045166015625, - "kimi_kl": 0.003078460693359375, - "learning_rate": 4.874e-07, - "loss": 0.0001, - "ppl": 0.039093017578125, - "reward": 0.804414302110672, - "reward_std": 0.09632643824443221, - "rewards/perpo_ocr_edit_distance_reward": 0.8044143319129944, + "advantages": -1.021793991640152e-07, + "completion_length": 20.0, + "delta_ref_entropy_loss": -0.0250244140625, + "delta_ref_ppl": 0.01531982421875, + "entropy_loss": -0.40625, + "epoch": 0.0126, + "grad_norm": 19.005412815905576, + "k1_kl": -0.01611328125, + "k3_kl": 0.0274658203125, + "kimi_kl": 0.0181884765625, + "learning_rate": 4.937e-07, + "loss": 0.0011, + "ppl": 0.310546875, + "reward": 0.3167072534561157, + "reward_std": 0.19309107959270477, + "rewards/perpo_ocr_edit_distance_reward": 0.3167072832584381, "step": 63, "temperature": 0.9 }, { - "advantages": -2.1287374352141342e-07, - "completion_length": 1005.0, - "delta_ref_entropy_loss": -0.0011181831359863281, - "delta_ref_ppl": -0.0029144287109375, - "entropy_loss": -0.1806640625, - "epoch": 0.0256, - "grad_norm": 3.387142029459888, - "k1_kl": 0.0030059814453125, - "k3_kl": 0.00457000732421875, - "kimi_kl": 0.0073394775390625, - "learning_rate": 4.872e-07, - "loss": 0.0002, - "ppl": 0.1005859375, - "reward": 0.7873450219631195, - "reward_std": 0.3454162925481796, - "rewards/perpo_ocr_edit_distance_reward": 0.7873450815677643, + "advantages": 8.514949456639442e-08, + "completion_length": 474.0, + "delta_ref_entropy_loss": 0.04736328125, + "delta_ref_ppl": 0.33984375, + "entropy_loss": -0.7890625, + "epoch": 0.0128, + "grad_norm": 110.12705786925247, + "k1_kl": -0.337890625, + "k3_kl": 0.87109375, + "kimi_kl": 0.369140625, + "learning_rate": 4.935999999999999e-07, + "loss": 0.0348, + "ppl": 0.78125, + "reward": 0.49023646116256714, + "reward_std": 0.33306899666786194, + "rewards/perpo_ocr_edit_distance_reward": 0.49023646116256714, "step": 64, "temperature": 0.9 }, { - "advantages": -3.2782554626464844e-07, - "completion_length": 61.5, - "delta_ref_entropy_loss": -0.2027435302734375, - "delta_ref_ppl": -0.19403076171875, - "entropy_loss": -0.69384765625, - "epoch": 0.026, - "grad_norm": 13.17921492516012, - "k1_kl": 0.194091796875, - "k3_kl": 0.2672119140625, - "kimi_kl": 0.68829345703125, - "learning_rate": 4.87e-07, - "loss": 0.0107, - "ppl": 0.474365234375, - "reward": 0.46365958265960217, - "reward_std": 0.07175436615943909, - "rewards/perpo_ocr_edit_distance_reward": 0.46365961618721485, + "advantages": 0.0, + "completion_length": 1939.0, + "delta_ref_entropy_loss": -0.0001392364501953125, + "delta_ref_ppl": -0.000698089599609375, + "entropy_loss": -0.259765625, + "epoch": 0.013, + "grad_norm": 1.4151140150434103, + "k1_kl": 0.000911712646484375, + "k3_kl": 0.000667572021484375, + "kimi_kl": 0.00090789794921875, + "learning_rate": 4.935e-07, + "loss": 0.0, + "ppl": 0.1435546875, + "reward": 0.47103214263916016, + "reward_std": 0.18228906393051147, + "rewards/perpo_ocr_edit_distance_reward": 0.47103217244148254, "step": 65, "temperature": 0.9 }, { - "advantages": -8.038112490282856e-06, - "completion_length": 287.5, - "delta_ref_entropy_loss": 0.003917217254638672, - "delta_ref_ppl": -0.014434814453125, - "entropy_loss": -0.097900390625, - "epoch": 0.0264, - "grad_norm": 0.7114229187120062, - "k1_kl": 0.014404296875, - "k3_kl": 0.010345458984375, - "kimi_kl": 0.022796630859375, - "learning_rate": 4.867999999999999e-07, - "loss": 0.0004, - "ppl": 0.04901123046875, - "reward": 0.9094946682453156, - "reward_std": 0.15223213320132345, - "rewards/perpo_ocr_edit_distance_reward": 0.9094946980476379, + "advantages": -1.8732889373040962e-07, + "completion_length": 463.0, + "delta_ref_entropy_loss": 0.0030364990234375, + "delta_ref_ppl": -0.00286865234375, + "entropy_loss": -0.15234375, + "epoch": 0.0132, + "grad_norm": 2.2469443243639255, + "k1_kl": 0.002899169921875, + "k3_kl": 0.0013885498046875, + "kimi_kl": 0.002471923828125, + "learning_rate": 4.934e-07, + "loss": 0.0001, + "ppl": 0.07861328125, + "reward": 0.6393460631370544, + "reward_std": 0.2480633705854416, + "rewards/perpo_ocr_edit_distance_reward": 0.6393461227416992, "step": 66, "temperature": 0.9 }, { - "advantages": -2.9802323808780784e-07, - "completion_length": 896.5, - "delta_ref_entropy_loss": -0.0075531005859375, - "delta_ref_ppl": -0.0058135986328125, - "entropy_loss": -0.2633056640625, - "epoch": 0.0268, - "grad_norm": 2.3766113135958578, - "k1_kl": 0.00603485107421875, - "k3_kl": 0.00376129150390625, - "kimi_kl": 0.00653076171875, - "learning_rate": 4.865999999999999e-07, - "loss": 0.0002, - "ppl": 0.1693115234375, - "reward": 0.7650726735591888, - "reward_std": 0.25404931604862213, - "rewards/perpo_ocr_edit_distance_reward": 0.7650727331638336, + "advantages": -2.043587983280304e-07, + "completion_length": 706.0, + "delta_ref_entropy_loss": -0.00080108642578125, + "delta_ref_ppl": -0.004302978515625, + "entropy_loss": -0.296875, + "epoch": 0.0134, + "grad_norm": 5.141145110568225, + "k1_kl": 0.0038909912109375, + "k3_kl": 0.0020751953125, + "kimi_kl": 0.002960205078125, + "learning_rate": 4.933e-07, + "loss": 0.0001, + "ppl": 0.1787109375, + "reward": 0.8520399332046509, + "reward_std": 0.2252533733844757, + "rewards/perpo_ocr_edit_distance_reward": 0.8520400524139404, "step": 67, "temperature": 0.9 }, { - "advantages": -5.534717217869911e-07, - "completion_length": 585.0, - "delta_ref_entropy_loss": -0.0025482177734375, - "delta_ref_ppl": -0.014129638671875, - "entropy_loss": -0.1015625, - "epoch": 0.0272, - "grad_norm": 2.6200421213153544, - "k1_kl": 0.014068603515625, - "k3_kl": 0.0110321044921875, - "kimi_kl": 0.031158447265625, - "learning_rate": 4.864e-07, - "loss": 0.0004, - "ppl": 0.05438232421875, - "reward": 0.8905254900455475, - "reward_std": 0.18684664368629456, - "rewards/perpo_ocr_edit_distance_reward": 0.890525609254837, + "advantages": 2.55448497910038e-08, + "completion_length": 1728.0, + "delta_ref_entropy_loss": 0.0013580322265625, + "delta_ref_ppl": 0.0026702880859375, + "entropy_loss": -0.051513671875, + "epoch": 0.0136, + "grad_norm": 4.168400425888968, + "k1_kl": -0.002655029296875, + "k3_kl": 0.00323486328125, + "kimi_kl": 0.003387451171875, + "learning_rate": 4.932e-07, + "loss": 0.0001, + "ppl": 0.025634765625, + "reward": 0.6922738552093506, + "reward_std": 0.3976011872291565, + "rewards/perpo_ocr_edit_distance_reward": 0.6922738552093506, "step": 68, "temperature": 0.9 }, { - "advantages": -3.980738796371952e-07, - "completion_length": 1093.0, - "delta_ref_entropy_loss": -0.008405208587646484, - "delta_ref_ppl": -0.01714324951171875, - "entropy_loss": -0.12890625, - "epoch": 0.0276, - "grad_norm": 1.6674152213150921, - "k1_kl": 0.016796112060546875, - "k3_kl": 0.01361846923828125, - "kimi_kl": 0.04323577880859375, - "learning_rate": 4.862e-07, - "loss": 0.0005, - "ppl": 0.063232421875, - "reward": 0.5990640819072723, - "reward_std": 0.1266265269368887, - "rewards/perpo_ocr_edit_distance_reward": 0.5990641564130783, + "advantages": -3.2527107123314636e-06, + "completion_length": 400.0, + "delta_ref_entropy_loss": 0.0096435546875, + "delta_ref_ppl": -0.004058837890625, + "entropy_loss": -0.2392578125, + "epoch": 0.0138, + "grad_norm": 1.9480455205626064, + "k1_kl": 0.0038604736328125, + "k3_kl": 0.00136566162109375, + "kimi_kl": 0.0017547607421875, + "learning_rate": 4.931e-07, + "loss": 0.0001, + "ppl": 0.12890625, + "reward": 0.838302731513977, + "reward_std": 0.013008185662329197, + "rewards/perpo_ocr_edit_distance_reward": 0.8383027911186218, "step": 69, "temperature": 0.9 }, { - "advantages": -3.065381761757635e-07, - "completion_length": 638.0, - "delta_ref_entropy_loss": -0.0005397796630859375, - "delta_ref_ppl": -0.006805419921875, - "entropy_loss": -0.090576171875, - "epoch": 0.028, - "grad_norm": 1.1441467040341236, - "k1_kl": 0.0067901611328125, - "k3_kl": 0.00368499755859375, - "kimi_kl": 0.0057220458984375, - "learning_rate": 4.86e-07, - "loss": 0.0001, - "ppl": 0.041015625, - "reward": 0.8239465355873108, - "reward_std": 0.11985132098197937, - "rewards/perpo_ocr_edit_distance_reward": 0.8239465355873108, + "advantages": -2.188342023146106e-06, + "completion_length": 273.0, + "delta_ref_entropy_loss": -0.00567626953125, + "delta_ref_ppl": -0.00762939453125, + "entropy_loss": -0.07861328125, + "epoch": 0.014, + "grad_norm": 1.7573622113521215, + "k1_kl": 0.007598876953125, + "k3_kl": 0.00579833984375, + "kimi_kl": 0.01007080078125, + "learning_rate": 4.93e-07, + "loss": 0.0002, + "ppl": 0.04296875, + "reward": 0.9454789161682129, + "reward_std": 0.02340276911854744, + "rewards/perpo_ocr_edit_distance_reward": 0.9454789757728577, "step": 70, "temperature": 0.9 }, { - "advantages": -1.6178404838029792e-07, - "completion_length": 782.0, - "delta_ref_entropy_loss": -0.00128936767578125, - "delta_ref_ppl": -0.00577545166015625, - "entropy_loss": -0.111572265625, - "epoch": 0.0284, - "grad_norm": 0.9500221752028296, - "k1_kl": 0.0058441162109375, - "k3_kl": 0.0038299560546875, - "kimi_kl": 0.006317138671875, - "learning_rate": 4.858e-07, - "loss": 0.0002, - "ppl": 0.0621337890625, - "reward": 0.9160280227661133, - "reward_std": 0.09229934774339199, - "rewards/perpo_ocr_edit_distance_reward": 0.9160280525684357, + "advantages": -5.143029738974292e-06, + "completion_length": 739.0, + "delta_ref_entropy_loss": 0.0037994384765625, + "delta_ref_ppl": -0.003875732421875, + "entropy_loss": -0.046142578125, + "epoch": 0.0142, + "grad_norm": 0.8177371807956628, + "k1_kl": 0.0038604736328125, + "k3_kl": 0.00194549560546875, + "kimi_kl": 0.002960205078125, + "learning_rate": 4.929e-07, + "loss": 0.0001, + "ppl": 0.019287109375, + "reward": 0.9700127840042114, + "reward_std": 0.004863662179559469, + "rewards/perpo_ocr_edit_distance_reward": 0.9700128436088562, "step": 71, "temperature": 0.9 }, { - "advantages": -2.205371970376291e-06, - "completion_length": 939.5, - "delta_ref_entropy_loss": 0.00200653076171875, - "delta_ref_ppl": -0.004852294921875, - "entropy_loss": -0.0654296875, - "epoch": 0.0288, - "grad_norm": 0.735976295999271, - "k1_kl": 0.00492095947265625, - "k3_kl": 0.0032958984375, - "kimi_kl": 0.00800323486328125, - "learning_rate": 4.856e-07, + "advantages": -5.211149073147681e-06, + "completion_length": 556.0, + "delta_ref_entropy_loss": 0.0030670166015625, + "delta_ref_ppl": -0.00384521484375, + "entropy_loss": -0.0400390625, + "epoch": 0.0144, + "grad_norm": 0.9521996560390549, + "k1_kl": 0.0038299560546875, + "k3_kl": 0.0016326904296875, + "kimi_kl": 0.0024566650390625, + "learning_rate": 4.928e-07, "loss": 0.0001, - "ppl": 0.0308837890625, - "reward": 0.9315571486949921, - "reward_std": 0.03142890427261591, - "rewards/perpo_ocr_edit_distance_reward": 0.9315572381019592, + "ppl": 0.01556396484375, + "reward": 0.9739677309989929, + "reward_std": 0.011298753321170807, + "rewards/perpo_ocr_edit_distance_reward": 0.9739677309989929, "step": 72, "temperature": 0.9 }, { - "advantages": -3.065381761757635e-07, - "completion_length": 1093.5, - "delta_ref_entropy_loss": 0.01886749267578125, - "delta_ref_ppl": -0.011249542236328125, - "entropy_loss": -0.40234375, - "epoch": 0.0292, - "grad_norm": 5.0728925857018545, - "k1_kl": 0.011539459228515625, - "k3_kl": 0.0049800872802734375, - "kimi_kl": 0.0062103271484375, - "learning_rate": 4.853999999999999e-07, - "loss": 0.0002, - "ppl": 0.22216796875, - "reward": 0.5733010619878769, - "reward_std": 0.16285553574562073, - "rewards/perpo_ocr_edit_distance_reward": 0.573301151394844, + "advantages": -3.491129234589607e-07, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.0020904541015625, + "delta_ref_ppl": -0.0018157958984375, + "entropy_loss": -0.1611328125, + "epoch": 0.0146, + "grad_norm": 3.841254210344971, + "k1_kl": 0.00185394287109375, + "k3_kl": 0.0009918212890625, + "kimi_kl": 0.00102996826171875, + "learning_rate": 4.927e-07, + "loss": 0.0, + "ppl": 0.0869140625, + "reward": 0.6414281725883484, + "reward_std": 0.1551266759634018, + "rewards/perpo_ocr_edit_distance_reward": 0.6414282917976379, "step": 73, "temperature": 0.9 }, { - "advantages": -1.128230859848145e-07, - "completion_length": 1232.0, - "delta_ref_entropy_loss": -0.0008020401000976562, - "delta_ref_ppl": -0.006596088409423828, - "entropy_loss": -0.170166015625, - "epoch": 0.0296, - "grad_norm": 1.1706513888389647, - "k1_kl": 0.006601393222808838, - "k3_kl": 0.00499725341796875, - "kimi_kl": 0.0133819580078125, - "learning_rate": 4.852e-07, - "loss": 0.0002, - "ppl": 0.08984375, - "reward": 0.5691360384225845, - "reward_std": 0.13326044753193855, - "rewards/perpo_ocr_edit_distance_reward": 0.5691360384225845, + "advantages": -1.0200909855484497e-05, + "completion_length": 524.0, + "delta_ref_entropy_loss": 0.0103759765625, + "delta_ref_ppl": -0.0030059814453125, + "entropy_loss": -0.0791015625, + "epoch": 0.0148, + "grad_norm": 1.0333457401429904, + "k1_kl": 0.0030364990234375, + "k3_kl": 0.000583648681640625, + "kimi_kl": 0.0007171630859375, + "learning_rate": 4.926e-07, + "loss": 0.0, + "ppl": 0.03515625, + "reward": 0.9762570858001709, + "reward_std": 0.004072097130119801, + "rewards/perpo_ocr_edit_distance_reward": 0.9762571454048157, "step": 74, "temperature": 0.9 }, { - "advantages": -2.1287373908052132e-07, - "completion_length": 1672.0, - "delta_ref_entropy_loss": 0.001781463623046875, - "delta_ref_ppl": -0.0005035400390625, - "entropy_loss": -0.146240234375, - "epoch": 0.03, - "grad_norm": 7.534518513526407, - "k1_kl": 0.0005245208740234375, - "k3_kl": 0.00562286376953125, - "kimi_kl": 0.004497528076171875, - "learning_rate": 4.85e-07, - "loss": 0.0002, - "ppl": 0.0948486328125, - "reward": 0.6167257130146027, - "reward_std": 0.08216862939298153, - "rewards/perpo_ocr_edit_distance_reward": 0.6167257726192474, + "advantages": -4.6491626562783495e-05, + "completion_length": 352.0, + "delta_ref_entropy_loss": 0.0169677734375, + "delta_ref_ppl": -0.00555419921875, + "entropy_loss": -0.058837890625, + "epoch": 0.015, + "grad_norm": 0.6693348939099002, + "k1_kl": 0.005615234375, + "k3_kl": 0.0014801025390625, + "kimi_kl": 0.0019989013671875, + "learning_rate": 4.924999999999999e-07, + "loss": 0.0001, + "ppl": 0.0269775390625, + "reward": 0.9714440703392029, + "reward_std": 0.0013655450893566012, + "rewards/perpo_ocr_edit_distance_reward": 0.9714441895484924, "step": 75, "temperature": 0.9 }, { - "advantages": -1.2772424184959164e-07, - "completion_length": 729.0, - "delta_ref_entropy_loss": -7.62939453125e-05, - "delta_ref_ppl": -0.007389068603515625, - "entropy_loss": -0.1923828125, - "epoch": 0.0304, - "grad_norm": 2.5370445384348836, - "k1_kl": 0.007366180419921875, - "k3_kl": 0.0044193267822265625, - "kimi_kl": 0.00768280029296875, - "learning_rate": 4.848e-07, + "advantages": -2.2138868871479644e-07, + "completion_length": 277.0, + "delta_ref_entropy_loss": 0.004730224609375, + "delta_ref_ppl": -0.00994873046875, + "entropy_loss": -0.1630859375, + "epoch": 0.0152, + "grad_norm": 2.7502158476598733, + "k1_kl": 0.0098876953125, + "k3_kl": 0.004425048828125, + "kimi_kl": 0.00616455078125, + "learning_rate": 4.923999999999999e-07, "loss": 0.0002, - "ppl": 0.104248046875, - "reward": 0.5902044773101807, - "reward_std": 0.2522243782877922, - "rewards/perpo_ocr_edit_distance_reward": 0.590204507112503, + "ppl": 0.07763671875, + "reward": 0.828118622303009, + "reward_std": 0.07228939980268478, + "rewards/perpo_ocr_edit_distance_reward": 0.828118622303009, "step": 76, "temperature": 0.9 }, { - "advantages": -4.883323526883032e-06, - "completion_length": 869.5, - "delta_ref_entropy_loss": -0.0010623931884765625, - "delta_ref_ppl": -0.00669097900390625, - "entropy_loss": -0.12255859375, - "epoch": 0.0308, - "grad_norm": 1.2609922811751855, - "k1_kl": 0.00670623779296875, - "k3_kl": 0.0053863525390625, - "kimi_kl": 0.014781951904296875, - "learning_rate": 4.846e-07, - "loss": 0.0002, - "ppl": 0.06396484375, - "reward": 0.8476332724094391, - "reward_std": 0.15917142760008574, - "rewards/perpo_ocr_edit_distance_reward": 0.8476333022117615, + "advantages": -1.532690987460228e-07, + "completion_length": 815.0, + "delta_ref_entropy_loss": 0.00970458984375, + "delta_ref_ppl": -0.005340576171875, + "entropy_loss": -0.080078125, + "epoch": 0.0154, + "grad_norm": 1.934587819378331, + "k1_kl": 0.005401611328125, + "k3_kl": 0.002838134765625, + "kimi_kl": 0.0033111572265625, + "learning_rate": 4.923e-07, + "loss": 0.0001, + "ppl": 0.042236328125, + "reward": 0.8606435656547546, + "reward_std": 0.2315320074558258, + "rewards/perpo_ocr_edit_distance_reward": 0.8606436252593994, "step": 77, "temperature": 0.9 }, { - "advantages": 4.6193600212518504e-05, - "completion_length": 610.0, - "delta_ref_entropy_loss": 0.0057830810546875, - "delta_ref_ppl": -0.0087738037109375, - "entropy_loss": -0.0791015625, - "epoch": 0.0312, - "grad_norm": 0.7736597034036082, - "k1_kl": 0.008636474609375, - "k3_kl": 0.00566864013671875, - "kimi_kl": 0.0082550048828125, - "learning_rate": 4.844e-07, - "loss": 0.0002, - "ppl": 0.04302978515625, - "reward": 0.8744402527809143, - "reward_std": 0.10688251489773393, - "rewards/perpo_ocr_edit_distance_reward": 0.8744402825832367, + "advantages": 2.55448497910038e-07, + "completion_length": 780.0, + "delta_ref_entropy_loss": 0.000835418701171875, + "delta_ref_ppl": -0.00341796875, + "entropy_loss": -0.09375, + "epoch": 0.0156, + "grad_norm": 2.3859442405390543, + "k1_kl": 0.0034027099609375, + "k3_kl": 0.00189971923828125, + "kimi_kl": 0.0027618408203125, + "learning_rate": 4.922e-07, + "loss": 0.0001, + "ppl": 0.045166015625, + "reward": 0.901025652885437, + "reward_std": 0.06700100004673004, + "rewards/perpo_ocr_edit_distance_reward": 0.901025652885437, "step": 78, "temperature": 0.9 }, { - "advantages": -1.8818038256540603e-06, - "completion_length": 643.0, - "delta_ref_entropy_loss": 0.003520965576171875, - "delta_ref_ppl": -0.00560760498046875, - "entropy_loss": -0.05169677734375, - "epoch": 0.0316, - "grad_norm": 0.7088527985466516, - "k1_kl": 0.0055694580078125, - "k3_kl": 0.0034284591674804688, - "kimi_kl": 0.0079193115234375, - "learning_rate": 4.842e-07, - "loss": 0.0001, - "ppl": 0.02484130859375, - "reward": 0.963617205619812, - "reward_std": 0.024129606317728758, - "rewards/perpo_ocr_edit_distance_reward": 0.9636172354221344, + "advantages": -1.9116061594104394e-05, + "completion_length": 387.0, + "delta_ref_entropy_loss": 0.0034942626953125, + "delta_ref_ppl": -0.0014495849609375, + "entropy_loss": -0.07666015625, + "epoch": 0.0158, + "grad_norm": 1.316208022606933, + "k1_kl": 0.0014495849609375, + "k3_kl": 0.00927734375, + "kimi_kl": 0.00408935546875, + "learning_rate": 4.920999999999999e-07, + "loss": 0.0004, + "ppl": 0.041748046875, + "reward": 0.974463164806366, + "reward_std": 0.0034661341924220324, + "rewards/perpo_ocr_edit_distance_reward": 0.9744632244110107, "step": 79, "temperature": 0.9 }, { - "advantages": -1.4555242522362732e-07, - "completion_length": 688.5, - "delta_ref_entropy_loss": -0.0072784423828125, - "delta_ref_ppl": -0.002811431884765625, - "entropy_loss": -0.2802734375, - "epoch": 0.032, - "grad_norm": 1.3311112306314885, - "k1_kl": 0.002696990966796875, - "k3_kl": 0.003509521484375, - "kimi_kl": 0.007312774658203125, - "learning_rate": 4.839999999999999e-07, - "loss": 0.0001, - "ppl": 0.16015625, - "reward": 0.5737494826316833, - "reward_std": 0.18714871257543564, - "rewards/perpo_ocr_edit_distance_reward": 0.5737495273351669, + "advantages": 3.4059798537100505e-08, + "completion_length": 870.0, + "delta_ref_entropy_loss": 0.0025787353515625, + "delta_ref_ppl": -0.00139617919921875, + "entropy_loss": -0.1552734375, + "epoch": 0.016, + "grad_norm": 0.829642476002948, + "k1_kl": 0.0014801025390625, + "k3_kl": 0.000659942626953125, + "kimi_kl": 0.000774383544921875, + "learning_rate": 4.92e-07, + "loss": 0.0, + "ppl": 0.0869140625, + "reward": 0.7802350521087646, + "reward_std": 0.1689373105764389, + "rewards/perpo_ocr_edit_distance_reward": 0.7802350521087646, "step": 80, "temperature": 0.9 }, { - "advantages": -1.9754683577843934e-05, - "completion_length": 599.0, - "delta_ref_entropy_loss": 0.003509521484375, - "delta_ref_ppl": -0.0034942626953125, - "entropy_loss": -0.05889892578125, - "epoch": 0.0324, - "grad_norm": 0.6228530349395303, - "k1_kl": 0.00351715087890625, - "k3_kl": 0.00260162353515625, - "kimi_kl": 0.004058837890625, - "learning_rate": 4.838e-07, + "advantages": 8.514949456639442e-08, + "completion_length": 1324.0, + "delta_ref_entropy_loss": 0.00579833984375, + "delta_ref_ppl": -0.002227783203125, + "entropy_loss": -0.21484375, + "epoch": 0.0162, + "grad_norm": 4.338512924141219, + "k1_kl": 0.0023651123046875, + "k3_kl": 0.0026092529296875, + "kimi_kl": 0.0033721923828125, + "learning_rate": 4.919e-07, "loss": 0.0001, - "ppl": 0.02862548828125, - "reward": 0.9553861618041992, - "reward_std": 0.08413149777334183, - "rewards/perpo_ocr_edit_distance_reward": 0.955386221408844, + "ppl": 0.12890625, + "reward": 0.5167526006698608, + "reward_std": 0.1872996985912323, + "rewards/perpo_ocr_edit_distance_reward": 0.5167526006698608, "step": 81, "temperature": 0.9 }, { - "advantages": -6.173338391590732e-07, - "completion_length": 998.5, - "delta_ref_entropy_loss": 0.004006385803222656, - "delta_ref_ppl": -0.004608154296875, - "entropy_loss": -0.127685546875, - "epoch": 0.0328, - "grad_norm": 0.9085079059415203, - "k1_kl": 0.0044708251953125, - "k3_kl": 0.002208709716796875, - "kimi_kl": 0.004482269287109375, - "learning_rate": 4.835999999999999e-07, - "loss": 0.0001, - "ppl": 0.06396484375, - "reward": 0.8632633090019226, - "reward_std": 0.16038989927619696, - "rewards/perpo_ocr_edit_distance_reward": 0.863263338804245, + "advantages": -5.211149073147681e-06, + "completion_length": 41.0, + "delta_ref_entropy_loss": 0.0308837890625, + "delta_ref_ppl": -0.0595703125, + "entropy_loss": -0.177734375, + "epoch": 0.0164, + "grad_norm": 7.8752323196980205, + "k1_kl": 0.0595703125, + "k3_kl": 0.0216064453125, + "kimi_kl": 0.028076171875, + "learning_rate": 4.918e-07, + "loss": 0.0009, + "ppl": 0.0771484375, + "reward": 0.9293830990791321, + "reward_std": 0.006442578509449959, + "rewards/perpo_ocr_edit_distance_reward": 0.9293831586837769, "step": 82, "temperature": 0.9 }, { - "advantages": -1.1065177204727661e-05, - "completion_length": 651.0, - "delta_ref_entropy_loss": 0.004534244537353516, - "delta_ref_ppl": -0.00408935546875, - "entropy_loss": -0.1168212890625, - "epoch": 0.0332, - "grad_norm": 1.565633184091053, - "k1_kl": 0.00417327880859375, - "k3_kl": 0.002899169921875, - "kimi_kl": 0.00397491455078125, - "learning_rate": 4.834e-07, - "loss": 0.0001, - "ppl": 0.067169189453125, - "reward": 0.9123117327690125, - "reward_std": 0.046307064534630626, - "rewards/perpo_ocr_edit_distance_reward": 0.9123117923736572, + "advantages": -7.833753556951706e-07, + "completion_length": 1184.0, + "delta_ref_entropy_loss": 0.002899169921875, + "delta_ref_ppl": -0.0004711151123046875, + "entropy_loss": -0.0869140625, + "epoch": 0.0166, + "grad_norm": 1.8407474317828854, + "k1_kl": 0.000507354736328125, + "k3_kl": 0.00040435791015625, + "kimi_kl": 0.0004825592041015625, + "learning_rate": 4.917e-07, + "loss": 0.0, + "ppl": 0.043212890625, + "reward": 0.8894678950309753, + "reward_std": 0.032524459064006805, + "rewards/perpo_ocr_edit_distance_reward": 0.8894679546356201, "step": 83, "temperature": 0.9 }, { - "advantages": -2.6715655110365333e-07, - "completion_length": 873.0, - "delta_ref_entropy_loss": 0.0050220489501953125, - "delta_ref_ppl": -0.00394439697265625, - "entropy_loss": -0.114501953125, - "epoch": 0.0336, - "grad_norm": 0.5609164960416907, - "k1_kl": 0.00390625, - "k3_kl": 0.0020751953125, - "kimi_kl": 0.00302886962890625, - "learning_rate": 4.832e-07, + "advantages": -1.0183879567193799e-05, + "completion_length": 568.0, + "delta_ref_entropy_loss": 0.0081787109375, + "delta_ref_ppl": -0.0030364990234375, + "entropy_loss": -0.02587890625, + "epoch": 0.0168, + "grad_norm": 0.4299136192941758, + "k1_kl": 0.003021240234375, + "k3_kl": 0.001434326171875, + "kimi_kl": 0.00150299072265625, + "learning_rate": 4.916e-07, "loss": 0.0001, - "ppl": 0.057861328125, - "reward": 0.6149347126483917, - "reward_std": 0.16005536168813705, - "rewards/perpo_ocr_edit_distance_reward": 0.6149347424507141, + "ppl": 0.01043701171875, + "reward": 0.9799801111221313, + "reward_std": 0.0007371642277576029, + "rewards/perpo_ocr_edit_distance_reward": 0.9799802303314209, "step": 84, "temperature": 0.9 }, { - "advantages": -2.639634519852052e-07, - "completion_length": 672.0, - "delta_ref_entropy_loss": -0.003726959228515625, - "delta_ref_ppl": -0.01090240478515625, - "entropy_loss": -0.12548828125, - "epoch": 0.034, - "grad_norm": 5.404288929925485, - "k1_kl": 0.01091766357421875, - "k3_kl": 0.00731658935546875, - "kimi_kl": 0.01641845703125, - "learning_rate": 4.83e-07, + "advantages": 2.55448497910038e-08, + "completion_length": 542.0, + "delta_ref_entropy_loss": 0.00433349609375, + "delta_ref_ppl": -0.01171875, + "entropy_loss": -0.2021484375, + "epoch": 0.017, + "grad_norm": 1.9527331065280196, + "k1_kl": 0.0115966796875, + "k3_kl": 0.006622314453125, + "kimi_kl": 0.0098876953125, + "learning_rate": 4.915e-07, "loss": 0.0003, - "ppl": 0.06689453125, - "reward": 0.7505800724029541, - "reward_std": 0.07858062908053398, - "rewards/perpo_ocr_edit_distance_reward": 0.7505801022052765, + "ppl": 0.09716796875, + "reward": 0.6789276599884033, + "reward_std": 0.18748392164707184, + "rewards/perpo_ocr_edit_distance_reward": 0.6789277195930481, "step": 85, "temperature": 0.9 }, { - "advantages": -6.215913259666195e-07, - "completion_length": 498.0, - "delta_ref_entropy_loss": -0.012054443359375, - "delta_ref_ppl": -0.030670166015625, - "entropy_loss": -0.212890625, - "epoch": 0.0344, - "grad_norm": 12.59997875956194, - "k1_kl": 0.0302276611328125, - "k3_kl": 0.01849365234375, - "kimi_kl": 0.037261962890625, - "learning_rate": 4.828e-07, - "loss": 0.0007, - "ppl": 0.116943359375, - "reward": 0.8889926970005035, - "reward_std": 0.21336662955582142, - "rewards/perpo_ocr_edit_distance_reward": 0.8889927268028259, + "advantages": -1.1920928955078125e-07, + "completion_length": 316.0, + "delta_ref_entropy_loss": 0.005523681640625, + "delta_ref_ppl": -0.004364013671875, + "entropy_loss": -0.1103515625, + "epoch": 0.0172, + "grad_norm": 0.9546135188346376, + "k1_kl": 0.004364013671875, + "k3_kl": 0.005462646484375, + "kimi_kl": 0.0047607421875, + "learning_rate": 4.914e-07, + "loss": 0.0002, + "ppl": 0.058837890625, + "reward": 0.8673722743988037, + "reward_std": 0.0933666005730629, + "rewards/perpo_ocr_edit_distance_reward": 0.8673723340034485, "step": 86, "temperature": 0.9 }, { - "advantages": -4.483120846998645e-06, - "completion_length": 355.5, - "delta_ref_entropy_loss": 0.010955810546875, - "delta_ref_ppl": -0.0152587890625, - "entropy_loss": -0.08544921875, - "epoch": 0.0348, - "grad_norm": 1.5389993300109177, - "k1_kl": 0.015167236328125, - "k3_kl": 0.008392333984375, - "kimi_kl": 0.01666259765625, - "learning_rate": 4.825999999999999e-07, - "loss": 0.0003, - "ppl": 0.04058837890625, - "reward": 0.9875915944576263, - "reward_std": 0.0081717933062464, - "rewards/perpo_ocr_edit_distance_reward": 0.9875916540622711, + "advantages": -1.9754684217332397e-06, + "completion_length": 655.0, + "delta_ref_entropy_loss": 0.005584716796875, + "delta_ref_ppl": -0.005767822265625, + "entropy_loss": -0.059814453125, + "epoch": 0.0174, + "grad_norm": 0.6968361725849185, + "k1_kl": 0.00579833984375, + "k3_kl": 0.0029754638671875, + "kimi_kl": 0.003936767578125, + "learning_rate": 4.913e-07, + "loss": 0.0001, + "ppl": 0.0223388671875, + "reward": 0.8680532574653625, + "reward_std": 0.03422749415040016, + "rewards/perpo_ocr_edit_distance_reward": 0.8680533170700073, "step": 87, "temperature": 0.9 }, { - "advantages": -3.6954880897610565e-06, - "completion_length": 1055.5, - "delta_ref_entropy_loss": 0.00399017333984375, - "delta_ref_ppl": -0.003143310546875, - "entropy_loss": -0.049072265625, - "epoch": 0.0352, - "grad_norm": 0.4330041648321083, - "k1_kl": 0.00308990478515625, - "k3_kl": 0.00133514404296875, - "kimi_kl": 0.00173187255859375, - "learning_rate": 4.823999999999999e-07, - "loss": 0.0001, - "ppl": 0.0230712890625, - "reward": 0.9908566772937775, - "reward_std": 0.007616460206918418, - "rewards/perpo_ocr_edit_distance_reward": 0.9908567667007446, + "advantages": -2.2990363959252136e-06, + "completion_length": 285.0, + "delta_ref_entropy_loss": -0.0037994384765625, + "delta_ref_ppl": -0.0079345703125, + "entropy_loss": -0.10498046875, + "epoch": 0.0176, + "grad_norm": 1.987409026890148, + "k1_kl": 0.0079345703125, + "k3_kl": 0.00811767578125, + "kimi_kl": 0.01025390625, + "learning_rate": 4.912e-07, + "loss": 0.0003, + "ppl": 0.0615234375, + "reward": 0.933467447757721, + "reward_std": 0.03340853378176689, + "rewards/perpo_ocr_edit_distance_reward": 0.9334675669670105, "step": 88, "temperature": 0.9 }, { - "advantages": -2.1542822707942832e-06, - "completion_length": 70.0, - "delta_ref_entropy_loss": -0.016998291015625, - "delta_ref_ppl": -0.031036376953125, - "entropy_loss": -0.36474609375, - "epoch": 0.0356, - "grad_norm": 3.5656367821026946, - "k1_kl": 0.0308837890625, - "k3_kl": 0.026397705078125, - "kimi_kl": 0.0762481689453125, - "learning_rate": 4.822e-07, - "loss": 0.0011, - "ppl": 0.206298828125, - "reward": 0.7127820998430252, - "reward_std": 0.03040774166584015, - "rewards/perpo_ocr_edit_distance_reward": 0.7127821445465088, + "advantages": -6.100961400079541e-05, + "completion_length": 350.0, + "delta_ref_entropy_loss": 0.005950927734375, + "delta_ref_ppl": -0.003143310546875, + "entropy_loss": -0.032958984375, + "epoch": 0.0178, + "grad_norm": 0.950989059863637, + "k1_kl": 0.003173828125, + "k3_kl": 0.00107574462890625, + "kimi_kl": 0.00152587890625, + "learning_rate": 4.910999999999999e-07, + "loss": 0.0001, + "ppl": 0.0120849609375, + "reward": 0.9808385968208313, + "reward_std": 0.00045819784281775355, + "rewards/perpo_ocr_edit_distance_reward": 0.9808385968208313, "step": 89, "temperature": 0.9 }, { - "advantages": -5.381448057839577e-06, - "completion_length": 366.5, - "delta_ref_entropy_loss": 0.0152740478515625, - "delta_ref_ppl": -0.017486572265625, - "entropy_loss": -0.11553955078125, - "epoch": 0.036, - "grad_norm": 2.658237206035839, - "k1_kl": 0.017547607421875, - "k3_kl": 0.01084136962890625, - "kimi_kl": 0.02207183837890625, - "learning_rate": 4.82e-07, - "loss": 0.0004, - "ppl": 0.059661865234375, - "reward": 0.9594976007938385, - "reward_std": 0.0651484439149499, - "rewards/perpo_ocr_edit_distance_reward": 0.9594976007938385, + "advantages": -2.571514869487146e-06, + "completion_length": 556.0, + "delta_ref_entropy_loss": 0.01165771484375, + "delta_ref_ppl": -0.004486083984375, + "entropy_loss": -0.053466796875, + "epoch": 0.018, + "grad_norm": 0.6852600602505621, + "k1_kl": 0.0045166015625, + "k3_kl": 0.002105712890625, + "kimi_kl": 0.0027923583984375, + "learning_rate": 4.909999999999999e-07, + "loss": 0.0001, + "ppl": 0.02490234375, + "reward": 0.9658229351043701, + "reward_std": 0.003199584549292922, + "rewards/perpo_ocr_edit_distance_reward": 0.9658229947090149, "step": 90, "temperature": 0.9 }, { - "advantages": -7.067408347438686e-07, - "completion_length": 545.5, - "delta_ref_entropy_loss": -0.00920867919921875, - "delta_ref_ppl": -0.0061187744140625, - "entropy_loss": -0.13671875, - "epoch": 0.0364, - "grad_norm": 3.1034633207798548, - "k1_kl": 0.00634765625, - "k3_kl": 0.0047454833984375, - "kimi_kl": 0.0079803466796875, - "learning_rate": 4.818e-07, - "loss": 0.0002, - "ppl": 0.077056884765625, - "reward": 0.9211909174919128, - "reward_std": 0.13659725012257695, - "rewards/perpo_ocr_edit_distance_reward": 0.9211909770965576, + "advantages": -4.240444832248613e-05, + "completion_length": 708.0, + "delta_ref_entropy_loss": 0.00750732421875, + "delta_ref_ppl": -0.003570556640625, + "entropy_loss": -0.026611328125, + "epoch": 0.0182, + "grad_norm": 0.5986869417241741, + "k1_kl": 0.003570556640625, + "k3_kl": 0.00156402587890625, + "kimi_kl": 0.0018768310546875, + "learning_rate": 4.908999999999999e-07, + "loss": 0.0001, + "ppl": 0.01202392578125, + "reward": 0.9846330285072327, + "reward_std": 0.0005025553982704878, + "rewards/perpo_ocr_edit_distance_reward": 0.9846330881118774, "step": 91, "temperature": 0.9 }, { - "advantages": -1.4475413934178505e-07, - "completion_length": 1151.0, - "delta_ref_entropy_loss": 0.001071929931640625, - "delta_ref_ppl": -0.00698089599609375, - "entropy_loss": -0.099853515625, - "epoch": 0.0368, - "grad_norm": 1.2204778612463916, - "k1_kl": 0.00701141357421875, - "k3_kl": 0.005664825439453125, - "kimi_kl": 0.007999420166015625, - "learning_rate": 4.816e-07, - "loss": 0.0002, - "ppl": 0.055908203125, - "reward": 0.7939517199993134, - "reward_std": 0.18934982270002365, - "rewards/perpo_ocr_edit_distance_reward": 0.7939517498016357, + "advantages": -5.10896995820076e-08, + "completion_length": 937.0, + "delta_ref_entropy_loss": 0.002227783203125, + "delta_ref_ppl": -0.001708984375, + "entropy_loss": -0.1650390625, + "epoch": 0.0184, + "grad_norm": 1.0449871526019352, + "k1_kl": 0.0016937255859375, + "k3_kl": 0.000885009765625, + "kimi_kl": 0.001190185546875, + "learning_rate": 4.908e-07, + "loss": 0.0, + "ppl": 0.08642578125, + "reward": 0.6249969005584717, + "reward_std": 0.2304093837738037, + "rewards/perpo_ocr_edit_distance_reward": 0.6249969005584717, "step": 92, "temperature": 0.9 }, { - "advantages": -2.0052705522743963e-06, - "completion_length": 1135.0, - "delta_ref_entropy_loss": 0.0063323974609375, - "delta_ref_ppl": -0.00482177734375, - "entropy_loss": -0.07666015625, - "epoch": 0.0372, - "grad_norm": 2.296253657992682, - "k1_kl": 0.0047473907470703125, - "k3_kl": 0.006439208984375, - "kimi_kl": 0.00689697265625, - "learning_rate": 4.814e-07, + "advantages": -1.3623919414840202e-07, + "completion_length": 423.0, + "delta_ref_entropy_loss": 0.004058837890625, + "delta_ref_ppl": -0.005950927734375, + "entropy_loss": -0.0294189453125, + "epoch": 0.0186, + "grad_norm": 1.9124063894455436, + "k1_kl": 0.005950927734375, + "k3_kl": 0.0068359375, + "kimi_kl": 0.00811767578125, + "learning_rate": 4.907e-07, "loss": 0.0003, - "ppl": 0.042938232421875, - "reward": 0.8781758248806, - "reward_std": 0.09153628582134843, - "rewards/perpo_ocr_edit_distance_reward": 0.8781758844852448, + "ppl": 0.0101318359375, + "reward": 0.8736616373062134, + "reward_std": 0.27170416712760925, + "rewards/perpo_ocr_edit_distance_reward": 0.8736616969108582, "step": 93, "temperature": 0.9 }, { - "advantages": -5.247337924174644e-06, - "completion_length": 652.5, - "delta_ref_entropy_loss": 0.0050048828125, - "delta_ref_ppl": -0.0035247802734375, - "entropy_loss": -0.037811279296875, - "epoch": 0.0376, - "grad_norm": 0.4957580931824109, - "k1_kl": 0.00354766845703125, - "k3_kl": 0.00174713134765625, - "kimi_kl": 0.002166748046875, - "learning_rate": 4.812e-07, - "loss": 0.0001, - "ppl": 0.0168914794921875, - "reward": 0.9375596344470978, - "reward_std": 0.02434730064123869, - "rewards/perpo_ocr_edit_distance_reward": 0.9375597238540649, + "advantages": -1.826456718845293e-05, + "completion_length": 733.0, + "delta_ref_entropy_loss": 0.0084228515625, + "delta_ref_ppl": -0.007110595703125, + "entropy_loss": -0.099609375, + "epoch": 0.0188, + "grad_norm": 1.3885994096071619, + "k1_kl": 0.007080078125, + "k3_kl": 0.003326416015625, + "kimi_kl": 0.005889892578125, + "learning_rate": 4.905999999999999e-07, + "loss": 0.0002, + "ppl": 0.0400390625, + "reward": 0.9712436199188232, + "reward_std": 0.0022298635449260473, + "rewards/perpo_ocr_edit_distance_reward": 0.971243679523468, "step": 94, "temperature": 0.9 }, { - "advantages": -3.405979924764324e-07, - "completion_length": 810.5, - "delta_ref_entropy_loss": 0.0022335052490234375, - "delta_ref_ppl": -0.0041351318359375, - "entropy_loss": -0.049560546875, - "epoch": 0.038, - "grad_norm": 0.7619266837039771, - "k1_kl": 0.004119873046875, - "k3_kl": 0.0025634765625, - "kimi_kl": 0.00429534912109375, - "learning_rate": 4.809999999999999e-07, - "loss": 0.0001, - "ppl": 0.0223388671875, - "reward": 0.913960188627243, - "reward_std": 0.1960262879729271, - "rewards/perpo_ocr_edit_distance_reward": 0.9139602482318878, + "advantages": -9.196145356327179e-07, + "completion_length": 339.0, + "delta_ref_entropy_loss": 0.005523681640625, + "delta_ref_ppl": -0.00872802734375, + "entropy_loss": -0.08349609375, + "epoch": 0.019, + "grad_norm": 1.473369021678186, + "k1_kl": 0.0086669921875, + "k3_kl": 0.003936767578125, + "kimi_kl": 0.0067138671875, + "learning_rate": 4.905e-07, + "loss": 0.0002, + "ppl": 0.0380859375, + "reward": 0.9523850679397583, + "reward_std": 0.018395420163869858, + "rewards/perpo_ocr_edit_distance_reward": 0.9523850679397583, "step": 95, "temperature": 0.9 }, { - "advantages": -1.4560563386112335e-06, - "completion_length": 605.5, - "delta_ref_entropy_loss": 0.0077667236328125, - "delta_ref_ppl": -0.00848388671875, - "entropy_loss": -0.05389404296875, - "epoch": 0.0384, - "grad_norm": 0.6629801659294189, - "k1_kl": 0.008514404296875, - "k3_kl": 0.004730224609375, - "kimi_kl": 0.009185791015625, - "learning_rate": 4.808e-07, - "loss": 0.0002, - "ppl": 0.02325439453125, - "reward": 0.9739895761013031, - "reward_std": 0.027897781692445278, - "rewards/perpo_ocr_edit_distance_reward": 0.9739896357059479, + "advantages": -3.4059798537100505e-08, + "completion_length": 1263.0, + "delta_ref_entropy_loss": 0.003936767578125, + "delta_ref_ppl": -0.004547119140625, + "entropy_loss": -0.068359375, + "epoch": 0.0192, + "grad_norm": 1.2119255174111903, + "k1_kl": 0.004547119140625, + "k3_kl": 0.003021240234375, + "kimi_kl": 0.00506591796875, + "learning_rate": 4.904e-07, + "loss": 0.0001, + "ppl": 0.0341796875, + "reward": 0.7740174531936646, + "reward_std": 0.2737109363079071, + "rewards/perpo_ocr_edit_distance_reward": 0.7740175127983093, "step": 96, "temperature": 0.9 }, { - "advantages": -2.2309167349021664e-06, - "completion_length": 1200.5, - "delta_ref_entropy_loss": 0.006622314453125, - "delta_ref_ppl": -0.005687713623046875, - "entropy_loss": -0.0762939453125, - "epoch": 0.0388, - "grad_norm": 0.916617167059513, - "k1_kl": 0.0057525634765625, - "k3_kl": 0.002658843994140625, - "kimi_kl": 0.003307342529296875, - "learning_rate": 4.806e-07, - "loss": 0.0001, - "ppl": 0.04107666015625, - "reward": 0.8937389254570007, - "reward_std": 0.07584550883620977, - "rewards/perpo_ocr_edit_distance_reward": 0.8937389552593231, + "advantages": -6.471361757576233e-07, + "completion_length": 738.0, + "delta_ref_entropy_loss": 0.0009765625, + "delta_ref_ppl": -0.007171630859375, + "entropy_loss": -0.1240234375, + "epoch": 0.0194, + "grad_norm": 1.350495463950338, + "k1_kl": 0.00726318359375, + "k3_kl": 0.004241943359375, + "kimi_kl": 0.007781982421875, + "learning_rate": 4.903e-07, + "loss": 0.0002, + "ppl": 0.0654296875, + "reward": 0.5809200406074524, + "reward_std": 0.07222352176904678, + "rewards/perpo_ocr_edit_distance_reward": 0.5809201002120972, "step": 97, "temperature": 0.9 }, { - "advantages": -1.4296600710395069e-05, - "completion_length": 964.5, - "delta_ref_entropy_loss": 0.00676727294921875, - "delta_ref_ppl": -0.003520965576171875, - "entropy_loss": -0.1240234375, - "epoch": 0.0392, - "grad_norm": 2.869048611113047, - "k1_kl": 0.003543853759765625, - "k3_kl": 0.0046234130859375, - "kimi_kl": 0.0039215087890625, - "learning_rate": 4.804e-07, - "loss": 0.0002, - "ppl": 0.06982421875, - "reward": 0.7450563311576843, - "reward_std": 0.10983211919665337, - "rewards/perpo_ocr_edit_distance_reward": 0.7450563907623291, + "advantages": -4.257474817137563e-09, + "completion_length": 1512.0, + "delta_ref_entropy_loss": 0.0025177001953125, + "delta_ref_ppl": 0.0050048828125, + "entropy_loss": -1.4375, + "epoch": 0.0196, + "grad_norm": 21.847364534734876, + "k1_kl": -0.0029449462890625, + "k3_kl": 0.031005859375, + "kimi_kl": 0.024169921875, + "learning_rate": 4.902e-07, + "loss": 0.0012, + "ppl": 1.1171875, + "reward": 0.36984387040138245, + "reward_std": 0.2512875497341156, + "rewards/perpo_ocr_edit_distance_reward": 0.36984390020370483, "step": 98, "temperature": 0.9 }, { - "advantages": -1.5497208494252845e-06, - "completion_length": 579.5, - "delta_ref_entropy_loss": 0.0090789794921875, - "delta_ref_ppl": -0.0035858154296875, - "entropy_loss": -0.107666015625, - "epoch": 0.0396, - "grad_norm": 1.6777388036431533, - "k1_kl": 0.0035858154296875, - "k3_kl": 0.0021209716796875, - "kimi_kl": 0.00213623046875, - "learning_rate": 4.802e-07, - "loss": 0.0001, - "ppl": 0.0509033203125, - "reward": 0.8540182411670685, - "reward_std": 0.06998404767364264, - "rewards/perpo_ocr_edit_distance_reward": 0.8540183305740356, + "advantages": -1.832417183322832e-05, + "completion_length": 599.0, + "delta_ref_entropy_loss": 0.00860595703125, + "delta_ref_ppl": -0.00811767578125, + "entropy_loss": -0.06689453125, + "epoch": 0.0198, + "grad_norm": 2.233144406824138, + "k1_kl": 0.00811767578125, + "k3_kl": 0.00408935546875, + "kimi_kl": 0.005645751953125, + "learning_rate": 4.901e-07, + "loss": 0.0002, + "ppl": 0.03857421875, + "reward": 0.984871506690979, + "reward_std": 0.0036115499678999186, + "rewards/perpo_ocr_edit_distance_reward": 0.9848716259002686, "step": 99, "temperature": 0.9 }, { - "advantages": -3.065381861233618e-06, - "completion_length": 253.5, - "delta_ref_entropy_loss": 0.0100860595703125, - "delta_ref_ppl": -0.0121917724609375, - "entropy_loss": -0.077392578125, - "epoch": 0.04, - "grad_norm": 1.7825333642631622, - "k1_kl": 0.012237548828125, - "k3_kl": 0.00547027587890625, - "kimi_kl": 0.00800323486328125, - "learning_rate": 4.8e-07, - "loss": 0.0002, - "ppl": 0.037628173828125, - "reward": 0.9595449268817902, - "reward_std": 0.04231062321923673, - "rewards/perpo_ocr_edit_distance_reward": 0.9595449864864349, + "advantages": -9.877342108666198e-07, + "completion_length": 423.0, + "delta_ref_entropy_loss": 0.0027618408203125, + "delta_ref_ppl": -0.004730224609375, + "entropy_loss": -0.032470703125, + "epoch": 0.02, + "grad_norm": 0.6604623655924516, + "k1_kl": 0.004730224609375, + "k3_kl": 0.0024871826171875, + "kimi_kl": 0.003662109375, + "learning_rate": 4.9e-07, + "loss": 0.0001, + "ppl": 0.0126953125, + "reward": 0.9309247136116028, + "reward_std": 0.03505643457174301, + "rewards/perpo_ocr_edit_distance_reward": 0.9309247136116028, "step": 100, "temperature": 0.9 }, { - "advantages": -1.27724248955019e-08, - "completion_length": 234.0, - "delta_ref_entropy_loss": -0.0330810546875, - "delta_ref_ppl": -0.016754150390625, - "entropy_loss": -0.4580078125, - "epoch": 0.0404, - "grad_norm": 9.950039061831538, - "k1_kl": 0.017425537109375, - "k3_kl": 0.0410614013671875, - "kimi_kl": 0.050750732421875, - "learning_rate": 4.797999999999999e-07, - "loss": 0.0016, - "ppl": 0.2535400390625, - "reward": 0.5755196064710617, - "reward_std": 0.1033979244530201, - "rewards/perpo_ocr_edit_distance_reward": 0.5755196735262871, + "advantages": 2.384185791015625e-07, + "completion_length": 734.0, + "delta_ref_entropy_loss": 0.005706787109375, + "delta_ref_ppl": -0.006683349609375, + "entropy_loss": -0.1865234375, + "epoch": 0.0202, + "grad_norm": 1.378156582147923, + "k1_kl": 0.006591796875, + "k3_kl": 0.003936767578125, + "kimi_kl": 0.005828857421875, + "learning_rate": 4.899e-07, + "loss": 0.0002, + "ppl": 0.09912109375, + "reward": 0.6043200492858887, + "reward_std": 0.036312129348516464, + "rewards/perpo_ocr_edit_distance_reward": 0.6043200492858887, "step": 101, "temperature": 0.9 }, { - "advantages": -0.00010081274689355268, - "completion_length": 712.5, - "delta_ref_entropy_loss": 0.0087127685546875, - "delta_ref_ppl": -0.004852294921875, - "entropy_loss": -0.0379638671875, - "epoch": 0.0408, - "grad_norm": 0.2925748636464845, - "k1_kl": 0.0048828125, - "k3_kl": 0.00208282470703125, - "kimi_kl": 0.00266265869140625, - "learning_rate": 4.796e-07, + "advantages": -0.00016458546451758593, + "completion_length": 958.0, + "delta_ref_entropy_loss": 0.0054931640625, + "delta_ref_ppl": -0.0023345947265625, + "entropy_loss": -0.0169677734375, + "epoch": 0.0204, + "grad_norm": 0.1908707767260534, + "k1_kl": 0.002349853515625, + "k3_kl": 0.0005950927734375, + "kimi_kl": 0.000774383544921875, + "learning_rate": 4.898e-07, "loss": 0.0002, - "ppl": 0.01483154296875, - "reward": 0.970119833946228, - "reward_std": 0.003818553188466467, - "rewards/perpo_ocr_edit_distance_reward": 0.9701198935508728, + "ppl": 0.00531005859375, + "reward": 0.9859115481376648, + "reward_std": 0.00010695091623347253, + "rewards/perpo_ocr_edit_distance_reward": 0.9859115481376648, "step": 102, "temperature": 0.9 }, { - "advantages": -5.300556324527861e-06, - "completion_length": 972.0, - "delta_ref_entropy_loss": 0.0078582763671875, - "delta_ref_ppl": -0.00696563720703125, - "entropy_loss": -0.1103515625, - "epoch": 0.0412, - "grad_norm": 1.975473391477817, - "k1_kl": 0.0068817138671875, - "k3_kl": 0.006317138671875, - "kimi_kl": 0.009674072265625, - "learning_rate": 4.794e-07, - "loss": 0.0003, - "ppl": 0.053466796875, - "reward": 0.9499002695083618, - "reward_std": 0.02546260121744126, - "rewards/perpo_ocr_edit_distance_reward": 0.9499003291130066, + "advantages": -2.7929033876716858e-06, + "completion_length": 537.0, + "delta_ref_entropy_loss": 0.01239013671875, + "delta_ref_ppl": -0.006866455078125, + "entropy_loss": -0.07373046875, + "epoch": 0.0206, + "grad_norm": 1.0835449588954633, + "k1_kl": 0.006927490234375, + "k3_kl": 0.0035552978515625, + "kimi_kl": 0.00439453125, + "learning_rate": 4.897e-07, + "loss": 0.0001, + "ppl": 0.039794921875, + "reward": 0.9653621912002563, + "reward_std": 0.024216897785663605, + "rewards/perpo_ocr_edit_distance_reward": 0.9653623104095459, "step": 103, "temperature": 0.9 }, { - "advantages": -1.1708056035786285e-06, - "completion_length": 532.0, - "delta_ref_entropy_loss": 0.00862884521484375, - "delta_ref_ppl": -0.0082855224609375, - "entropy_loss": -0.0670166015625, - "epoch": 0.0416, - "grad_norm": 0.4533921740649183, - "k1_kl": 0.008270263671875, - "k3_kl": 0.00362396240234375, - "kimi_kl": 0.00616455078125, - "learning_rate": 4.792e-07, + "advantages": -9.97952065517893e-06, + "completion_length": 457.0, + "delta_ref_entropy_loss": 0.01019287109375, + "delta_ref_ppl": -0.0052490234375, + "entropy_loss": -0.031982421875, + "epoch": 0.0208, + "grad_norm": 0.7730594709159068, + "k1_kl": 0.00518798828125, + "k3_kl": 0.0016937255859375, + "kimi_kl": 0.0023040771484375, + "learning_rate": 4.895999999999999e-07, "loss": 0.0001, - "ppl": 0.027587890625, - "reward": 0.921337366104126, - "reward_std": 0.18443143274635077, - "rewards/perpo_ocr_edit_distance_reward": 0.9213374257087708, + "ppl": 0.0130615234375, + "reward": 0.9835055470466614, + "reward_std": 0.001611186657100916, + "rewards/perpo_ocr_edit_distance_reward": 0.9835056066513062, "step": 104, "temperature": 0.9 }, { - "advantages": -7.322856845348724e-07, - "completion_length": 541.5, - "delta_ref_entropy_loss": 0.0082244873046875, - "delta_ref_ppl": -0.0046234130859375, - "entropy_loss": -0.0301513671875, - "epoch": 0.042, - "grad_norm": 0.7413757228778496, - "k1_kl": 0.0045928955078125, - "k3_kl": 0.00229644775390625, - "kimi_kl": 0.0036163330078125, - "learning_rate": 4.79e-07, - "loss": 0.0001, - "ppl": 0.011871337890625, - "reward": 0.9929729700088501, - "reward_std": 0.008669120259582996, - "rewards/perpo_ocr_edit_distance_reward": 0.9929729998111725, + "advantages": 3.4059798537100505e-08, + "completion_length": 102.0, + "delta_ref_entropy_loss": -0.08740234375, + "delta_ref_ppl": -0.177734375, + "entropy_loss": -1.203125, + "epoch": 0.021, + "grad_norm": 65.71607174592944, + "k1_kl": 0.177734375, + "k3_kl": 0.1875, + "kimi_kl": 0.6484375, + "learning_rate": 4.894999999999999e-07, + "loss": 0.0075, + "ppl": 0.7265625, + "reward": 0.06149999797344208, + "reward_std": 0.060573361814022064, + "rewards/perpo_ocr_edit_distance_reward": 0.06149999797344208, "step": 105, "temperature": 0.9 }, { - "advantages": -1.6178404615985187e-07, - "completion_length": 1368.0, - "delta_ref_entropy_loss": 0.0031280517578125, - "delta_ref_ppl": -0.005229949951171875, - "entropy_loss": -0.152587890625, - "epoch": 0.0424, - "grad_norm": 1.2249016913997686, - "k1_kl": 0.005260467529296875, - "k3_kl": 0.003208160400390625, - "kimi_kl": 0.005970001220703125, - "learning_rate": 4.788e-07, + "advantages": -7.425036073982483e-06, + "completion_length": 626.0, + "delta_ref_entropy_loss": 0.010498046875, + "delta_ref_ppl": -0.00738525390625, + "entropy_loss": -0.051513671875, + "epoch": 0.0212, + "grad_norm": 1.1889423616008061, + "k1_kl": 0.00738525390625, + "k3_kl": 0.00286865234375, + "kimi_kl": 0.0035400390625, + "learning_rate": 4.894e-07, "loss": 0.0001, - "ppl": 0.0753173828125, - "reward": 0.6106154769659042, - "reward_std": 0.08246493805199862, - "rewards/perpo_ocr_edit_distance_reward": 0.6106155216693878, + "ppl": 0.0257568359375, + "reward": 0.7666712403297424, + "reward_std": 0.006779659539461136, + "rewards/perpo_ocr_edit_distance_reward": 0.7666712999343872, "step": 106, "temperature": 0.9 }, { - "advantages": -4.512923403865443e-07, - "completion_length": 568.0, - "delta_ref_entropy_loss": 0.008575439453125, - "delta_ref_ppl": -0.0101318359375, - "entropy_loss": -0.12890625, - "epoch": 0.0428, - "grad_norm": 0.678277636547374, - "k1_kl": 0.0101470947265625, - "k3_kl": 0.0056610107421875, - "kimi_kl": 0.0111083984375, - "learning_rate": 4.786e-07, - "loss": 0.0002, - "ppl": 0.06854248046875, - "reward": 0.8721340894699097, - "reward_std": 0.1520935338921845, - "rewards/perpo_ocr_edit_distance_reward": 0.8721340894699097, + "advantages": -1.7029898913278885e-07, + "completion_length": 712.0, + "delta_ref_entropy_loss": -0.0048828125, + "delta_ref_ppl": 0.00848388671875, + "entropy_loss": -0.33984375, + "epoch": 0.0214, + "grad_norm": 13.236391227044209, + "k1_kl": -0.00823974609375, + "k3_kl": 0.029296875, + "kimi_kl": 0.019287109375, + "learning_rate": 4.893e-07, + "loss": 0.0012, + "ppl": 0.1904296875, + "reward": 0.7315413951873779, + "reward_std": 0.4308106601238251, + "rewards/perpo_ocr_edit_distance_reward": 0.7315413951873779, "step": 107, "temperature": 0.9 }, { - "advantages": -1.200607869122905e-06, - "completion_length": 1418.0, - "delta_ref_entropy_loss": 0.0049953460693359375, - "delta_ref_ppl": -0.002635955810546875, - "entropy_loss": -0.191162109375, - "epoch": 0.0432, - "grad_norm": 1.0023311592272526, - "k1_kl": 0.00266265869140625, - "k3_kl": 0.0012969970703125, - "kimi_kl": 0.001873016357421875, - "learning_rate": 4.783999999999999e-07, - "loss": 0.0001, - "ppl": 0.1239013671875, - "reward": 0.5387873388826847, - "reward_std": 0.02882529329508543, - "rewards/perpo_ocr_edit_distance_reward": 0.5387873724102974, + "advantages": -2.7247838829680404e-07, + "completion_length": 717.0, + "delta_ref_entropy_loss": 0.005401611328125, + "delta_ref_ppl": -0.0103759765625, + "entropy_loss": -0.1904296875, + "epoch": 0.0216, + "grad_norm": 141.1359961198395, + "k1_kl": 0.0101318359375, + "k3_kl": 0.009765625, + "kimi_kl": 0.01708984375, + "learning_rate": 4.892e-07, + "loss": 0.0004, + "ppl": 0.11279296875, + "reward": 0.6098195314407349, + "reward_std": 0.24552729725837708, + "rewards/perpo_ocr_edit_distance_reward": 0.6098195910453796, "step": 108, "temperature": 0.9 }, { - "advantages": -0.00010342257883166894, - "completion_length": 1042.5, - "delta_ref_entropy_loss": 0.004024505615234375, - "delta_ref_ppl": -0.0041046142578125, - "entropy_loss": -0.08575439453125, - "epoch": 0.0436, - "grad_norm": 0.49454798350664053, - "k1_kl": 0.0040740966796875, - "k3_kl": 0.0017986297607421875, - "kimi_kl": 0.0031585693359375, - "learning_rate": 4.782e-07, + "advantages": -1.8732889373040962e-07, + "completion_length": 508.0, + "delta_ref_entropy_loss": 0.004730224609375, + "delta_ref_ppl": -0.00860595703125, + "entropy_loss": -0.10693359375, + "epoch": 0.0218, + "grad_norm": 1.0121229652263055, + "k1_kl": 0.00860595703125, + "k3_kl": 0.00408935546875, + "kimi_kl": 0.006683349609375, + "learning_rate": 4.891e-07, "loss": 0.0002, - "ppl": 0.04571533203125, - "reward": 0.6473972946405411, - "reward_std": 0.025364869710756466, - "rewards/perpo_ocr_edit_distance_reward": 0.6473973393440247, + "ppl": 0.054443359375, + "reward": 0.8351684212684631, + "reward_std": 0.2367425560951233, + "rewards/perpo_ocr_edit_distance_reward": 0.8351684808731079, "step": 109, "temperature": 0.9 }, { - "advantages": -3.4613270827321685e-05, - "completion_length": 220.5, - "delta_ref_entropy_loss": 0.01824951171875, - "delta_ref_ppl": -0.01580810546875, - "entropy_loss": -0.06982421875, - "epoch": 0.044, - "grad_norm": 1.0567783686937502, - "k1_kl": 0.015777587890625, - "k3_kl": 0.0068206787109375, - "kimi_kl": 0.011688232421875, - "learning_rate": 4.779999999999999e-07, + "advantages": -1.94140852727287e-06, + "completion_length": 470.0, + "delta_ref_entropy_loss": 0.00994873046875, + "delta_ref_ppl": -0.01312255859375, + "entropy_loss": -0.043212890625, + "epoch": 0.022, + "grad_norm": 0.9318983595471432, + "k1_kl": 0.01312255859375, + "k3_kl": 0.00860595703125, + "kimi_kl": 0.013427734375, + "learning_rate": 4.89e-07, "loss": 0.0003, - "ppl": 0.03265380859375, - "reward": 0.9918895363807678, - "reward_std": 0.003885563579387963, - "rewards/perpo_ocr_edit_distance_reward": 0.9918895959854126, + "ppl": 0.0174560546875, + "reward": 0.9681140780448914, + "reward_std": 0.008693861775100231, + "rewards/perpo_ocr_edit_distance_reward": 0.9681140780448914, "step": 110, "temperature": 0.9 }, { - "advantages": -1.3155596789715673e-06, - "completion_length": 442.0, - "delta_ref_entropy_loss": -0.059906005859375, - "delta_ref_ppl": -0.00603485107421875, - "entropy_loss": -0.71044921875, - "epoch": 0.0444, - "grad_norm": 3.7532039968364432, - "k1_kl": 0.00620269775390625, - "k3_kl": 0.06865692138671875, - "kimi_kl": 0.06459808349609375, - "learning_rate": 4.778e-07, - "loss": 0.0028, - "ppl": 0.472900390625, - "reward": 0.5259968526661396, - "reward_std": 0.0231448570266366, - "rewards/perpo_ocr_edit_distance_reward": 0.525996882468462, + "advantages": 1.7029899268550253e-08, + "completion_length": 73.0, + "delta_ref_entropy_loss": 0.00885009765625, + "delta_ref_ppl": -0.01043701171875, + "entropy_loss": -0.0888671875, + "epoch": 0.0222, + "grad_norm": 5.389763421726432, + "k1_kl": 0.0103759765625, + "k3_kl": 0.005126953125, + "kimi_kl": 0.00830078125, + "learning_rate": 4.889e-07, + "loss": 0.0002, + "ppl": 0.04638671875, + "reward": 0.6548846364021301, + "reward_std": 0.3599124252796173, + "rewards/perpo_ocr_edit_distance_reward": 0.6548846960067749, "step": 111, "temperature": 0.9 }, { - "advantages": -1.693197685881387e-05, - "completion_length": 577.5, - "delta_ref_entropy_loss": 0.013641357421875, - "delta_ref_ppl": -0.00927734375, - "entropy_loss": -0.0638427734375, - "epoch": 0.0448, - "grad_norm": 0.6416822934826159, - "k1_kl": 0.0092926025390625, - "k3_kl": 0.0048675537109375, - "kimi_kl": 0.0062103271484375, - "learning_rate": 4.776e-07, - "loss": 0.0002, - "ppl": 0.0277099609375, - "reward": 0.9884665906429291, - "reward_std": 0.007306718733161688, - "rewards/perpo_ocr_edit_distance_reward": 0.9884666502475739, + "advantages": -1.319817215517105e-06, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.004180908203125, + "delta_ref_ppl": -0.00579833984375, + "entropy_loss": -0.1826171875, + "epoch": 0.0224, + "grad_norm": 4.184846936631057, + "k1_kl": 0.005706787109375, + "k3_kl": 0.00958251953125, + "kimi_kl": 0.0128173828125, + "learning_rate": 4.888e-07, + "loss": 0.0004, + "ppl": 0.1201171875, + "reward": 0.6185060739517212, + "reward_std": 0.025918347761034966, + "rewards/perpo_ocr_edit_distance_reward": 0.6185060739517212, "step": 112, "temperature": 0.9 }, { - "advantages": -8.855547832808952e-07, - "completion_length": 1785.5, - "delta_ref_entropy_loss": 0.002399444580078125, - "delta_ref_ppl": -0.00328826904296875, - "entropy_loss": -0.0360107421875, - "epoch": 0.0452, - "grad_norm": 2.057917191669822, - "k1_kl": 0.00330352783203125, - "k3_kl": 0.001983642578125, - "kimi_kl": 0.00323486328125, - "learning_rate": 4.774e-07, - "loss": 0.0001, - "ppl": 0.015289306640625, - "reward": 0.7189756035804749, - "reward_std": 0.06567863561213017, - "rewards/perpo_ocr_edit_distance_reward": 0.7189756631851196, + "advantages": -1.021793991640152e-06, + "completion_length": 943.0, + "delta_ref_entropy_loss": 0.0146484375, + "delta_ref_ppl": -0.00872802734375, + "entropy_loss": -0.130859375, + "epoch": 0.0226, + "grad_norm": 1.9135325612653398, + "k1_kl": 0.00860595703125, + "k3_kl": 0.004730224609375, + "kimi_kl": 0.006561279296875, + "learning_rate": 4.887e-07, + "loss": 0.0002, + "ppl": 0.06982421875, + "reward": 0.9547419548034668, + "reward_std": 0.05837469547986984, + "rewards/perpo_ocr_edit_distance_reward": 0.9547420144081116, "step": 113, "temperature": 0.9 }, { - "advantages": -3.6665374636868364e-05, - "completion_length": 1032.0, - "delta_ref_entropy_loss": 0.00421142578125, - "delta_ref_ppl": -0.00212860107421875, - "entropy_loss": -0.0272216796875, - "epoch": 0.0456, - "grad_norm": 0.4970586624658039, - "k1_kl": 0.0021266937255859375, - "k3_kl": 0.0011496543884277344, - "kimi_kl": 0.0017323493957519531, - "learning_rate": 4.772e-07, + "advantages": -8.53197980177356e-06, + "completion_length": 977.0, + "delta_ref_entropy_loss": 0.011474609375, + "delta_ref_ppl": -0.0050048828125, + "entropy_loss": -0.060791015625, + "epoch": 0.0228, + "grad_norm": 0.8970886397688408, + "k1_kl": 0.004974365234375, + "k3_kl": 0.00177001953125, + "kimi_kl": 0.0023040771484375, + "learning_rate": 4.886e-07, "loss": 0.0001, - "ppl": 0.011474609375, - "reward": 0.9831773340702057, - "reward_std": 0.008807169127976522, - "rewards/perpo_ocr_edit_distance_reward": 0.9831773638725281, + "ppl": 0.03125, + "reward": 0.9408336877822876, + "reward_std": 0.0069077699445188046, + "rewards/perpo_ocr_edit_distance_reward": 0.9408337473869324, "step": 114, "temperature": 0.9 }, { - "advantages": -5.977494765829761e-06, - "completion_length": 379.5, - "delta_ref_entropy_loss": 0.021392822265625, - "delta_ref_ppl": -0.022674560546875, - "entropy_loss": -0.0921630859375, - "epoch": 0.046, - "grad_norm": 1.0882635607673865, - "k1_kl": 0.02276611328125, - "k3_kl": 0.0104522705078125, - "kimi_kl": 0.01544189453125, - "learning_rate": 4.769999999999999e-07, - "loss": 0.0004, - "ppl": 0.04827880859375, - "reward": 0.9885063171386719, - "reward_std": 0.01045231451280415, - "rewards/perpo_ocr_edit_distance_reward": 0.9885063767433167, + "advantages": -1.7029898913278885e-07, + "completion_length": 562.0, + "delta_ref_entropy_loss": 0.00811767578125, + "delta_ref_ppl": -0.01214599609375, + "entropy_loss": -0.2275390625, + "epoch": 0.023, + "grad_norm": 2.660814682584569, + "k1_kl": 0.01226806640625, + "k3_kl": 0.006500244140625, + "kimi_kl": 0.00958251953125, + "learning_rate": 4.885e-07, + "loss": 0.0003, + "ppl": 0.123046875, + "reward": 0.6199653148651123, + "reward_std": 0.15550276637077332, + "rewards/perpo_ocr_edit_distance_reward": 0.6199653148651123, "step": 115, "temperature": 0.9 }, { - "advantages": 1.8439122641211725e-05, - "completion_length": 556.5, - "delta_ref_entropy_loss": 0.00946044921875, - "delta_ref_ppl": -0.010345458984375, - "entropy_loss": -0.0411376953125, - "epoch": 0.0464, - "grad_norm": 0.6710759290525795, - "k1_kl": 0.0103912353515625, - "k3_kl": 0.00621795654296875, - "kimi_kl": 0.0103607177734375, - "learning_rate": 4.768e-07, - "loss": 0.0002, - "ppl": 0.0184326171875, - "reward": 0.9846495389938354, - "reward_std": 0.008271434620837681, - "rewards/perpo_ocr_edit_distance_reward": 0.9846495687961578, + "advantages": 3.4059798537100505e-08, + "completion_length": 402.0, + "delta_ref_entropy_loss": 0.0118408203125, + "delta_ref_ppl": -0.0096435546875, + "entropy_loss": -0.0908203125, + "epoch": 0.0232, + "grad_norm": 1.6361514528606178, + "k1_kl": 0.00970458984375, + "k3_kl": 0.01611328125, + "kimi_kl": 0.0186767578125, + "learning_rate": 4.884e-07, + "loss": 0.0006, + "ppl": 0.0439453125, + "reward": 0.5718792676925659, + "reward_std": 0.30088669061660767, + "rewards/perpo_ocr_edit_distance_reward": 0.5718792676925659, "step": 116, "temperature": 0.9 }, { - "advantages": -6.748097689524002e-07, - "completion_length": 524.5, - "delta_ref_entropy_loss": 0.01226806640625, - "delta_ref_ppl": -0.0070037841796875, - "entropy_loss": -0.160400390625, - "epoch": 0.0468, - "grad_norm": 1.5591379210848368, - "k1_kl": 0.0068817138671875, - "k3_kl": 0.0058135986328125, - "kimi_kl": 0.0050506591796875, - "learning_rate": 4.766e-07, + "advantages": -5.747590876126196e-06, + "completion_length": 415.0, + "delta_ref_entropy_loss": 0.01904296875, + "delta_ref_ppl": -0.00970458984375, + "entropy_loss": -0.0289306640625, + "epoch": 0.0234, + "grad_norm": 1.4094432405475605, + "k1_kl": 0.009765625, + "k3_kl": 0.004669189453125, + "kimi_kl": 0.005889892578125, + "learning_rate": 4.882999999999999e-07, "loss": 0.0002, - "ppl": 0.092529296875, - "reward": 0.849820464849472, - "reward_std": 0.18830417841672897, - "rewards/perpo_ocr_edit_distance_reward": 0.8498205840587616, + "ppl": 0.0118408203125, + "reward": 0.983350932598114, + "reward_std": 0.0013814609264954925, + "rewards/perpo_ocr_edit_distance_reward": 0.983350932598114, "step": 117, "temperature": 0.9 }, { - "advantages": -4.9812455245046294e-06, - "completion_length": 569.0, - "delta_ref_entropy_loss": 0.0081939697265625, - "delta_ref_ppl": -0.0080413818359375, - "entropy_loss": -0.054443359375, - "epoch": 0.0472, - "grad_norm": 0.6389525634601253, - "k1_kl": 0.0080718994140625, - "k3_kl": 0.00458526611328125, - "kimi_kl": 0.010467529296875, - "learning_rate": 4.7639999999999995e-07, - "loss": 0.0002, - "ppl": 0.0216064453125, - "reward": 0.9695978760719299, - "reward_std": 0.022832303307950497, - "rewards/perpo_ocr_edit_distance_reward": 0.9695979356765747, + "advantages": -1.4730862858414184e-06, + "completion_length": 90.0, + "delta_ref_entropy_loss": -0.022216796875, + "delta_ref_ppl": -0.0595703125, + "entropy_loss": -0.19921875, + "epoch": 0.0236, + "grad_norm": 5.026060497694901, + "k1_kl": 0.059326171875, + "k3_kl": 0.057861328125, + "kimi_kl": 0.1416015625, + "learning_rate": 4.882e-07, + "loss": 0.0023, + "ppl": 0.14453125, + "reward": 0.9386433959007263, + "reward_std": 0.029163511469960213, + "rewards/perpo_ocr_edit_distance_reward": 0.9386434555053711, "step": 118, "temperature": 0.9 }, { - "advantages": -7.578305343258762e-07, - "completion_length": 382.0, - "delta_ref_entropy_loss": -0.08074188232421875, - "delta_ref_ppl": -0.0894927978515625, - "entropy_loss": -0.3443603515625, - "epoch": 0.0476, - "grad_norm": 83.53107572687934, - "k1_kl": 0.088470458984375, - "k3_kl": 0.3186798095703125, - "kimi_kl": 0.430816650390625, - "learning_rate": 4.762e-07, - "loss": 0.0128, - "ppl": 0.27886962890625, - "reward": 0.8806667923927307, - "reward_std": 0.2219400554895401, - "rewards/perpo_ocr_edit_distance_reward": 0.8806668817996979, + "advantages": -4.087175966560608e-07, + "completion_length": 292.0, + "delta_ref_entropy_loss": 0.0235595703125, + "delta_ref_ppl": -0.0213623046875, + "entropy_loss": -0.154296875, + "epoch": 0.0238, + "grad_norm": 3.1505175322071395, + "k1_kl": 0.021240234375, + "k3_kl": 0.01177978515625, + "kimi_kl": 0.02099609375, + "learning_rate": 4.880999999999999e-07, + "loss": 0.0005, + "ppl": 0.068359375, + "reward": 0.42785173654556274, + "reward_std": 0.04240371659398079, + "rewards/perpo_ocr_edit_distance_reward": 0.42785176634788513, "step": 119, "temperature": 0.9 }, { - "advantages": -3.065381974920456e-07, - "completion_length": 486.0, - "delta_ref_entropy_loss": 0.01241302490234375, - "delta_ref_ppl": -0.007843017578125, - "entropy_loss": -0.20556640625, - "epoch": 0.048, - "grad_norm": 1.1440585980489204, - "k1_kl": 0.0077972412109375, - "k3_kl": 0.005340576171875, - "kimi_kl": 0.005950927734375, - "learning_rate": 4.76e-07, + "advantages": -3.916876721632434e-06, + "completion_length": 669.0, + "delta_ref_entropy_loss": 0.01019287109375, + "delta_ref_ppl": -0.00732421875, + "entropy_loss": -0.032958984375, + "epoch": 0.024, + "grad_norm": 0.5983472104194533, + "k1_kl": 0.007354736328125, + "k3_kl": 0.005126953125, + "kimi_kl": 0.0128173828125, + "learning_rate": 4.879999999999999e-07, "loss": 0.0002, - "ppl": 0.10888671875, - "reward": 0.8015809953212738, - "reward_std": 0.1366170570254326, - "rewards/perpo_ocr_edit_distance_reward": 0.8015810549259186, + "ppl": 0.0130615234375, + "reward": 0.967214822769165, + "reward_std": 0.01737009733915329, + "rewards/perpo_ocr_edit_distance_reward": 0.9672148823738098, "step": 120, "temperature": 0.9 }, { - "advantages": -4.209791131870588e-05, - "completion_length": 808.5, - "delta_ref_entropy_loss": 0.006439208984375, - "delta_ref_ppl": -0.004364013671875, - "entropy_loss": -0.0286865234375, - "epoch": 0.0484, - "grad_norm": 1.1752313894428208, - "k1_kl": 0.004364013671875, - "k3_kl": 0.00205230712890625, - "kimi_kl": 0.0026397705078125, - "learning_rate": 4.7579999999999996e-07, - "loss": 0.0001, - "ppl": 0.00970458984375, - "reward": 0.9931942522525787, - "reward_std": 0.0035926015698350966, - "rewards/perpo_ocr_edit_distance_reward": 0.9931943714618683, + "advantages": -4.427773774295929e-07, + "completion_length": 967.0, + "delta_ref_entropy_loss": -0.005523681640625, + "delta_ref_ppl": 0.00017452239990234375, + "entropy_loss": -0.1083984375, + "epoch": 0.0242, + "grad_norm": 0.735582332199347, + "k1_kl": -0.0002765655517578125, + "k3_kl": 0.000850677490234375, + "kimi_kl": 0.001129150390625, + "learning_rate": 4.879e-07, + "loss": 0.0, + "ppl": 0.05615234375, + "reward": 0.3902623653411865, + "reward_std": 0.08899995684623718, + "rewards/perpo_ocr_edit_distance_reward": 0.3902624249458313, "step": 121, "temperature": 0.9 }, { - "advantages": -3.7465778177647735e-06, - "completion_length": 451.0, - "delta_ref_entropy_loss": 0.01564788818359375, - "delta_ref_ppl": -0.013153076171875, - "entropy_loss": -0.072998046875, - "epoch": 0.0488, - "grad_norm": 1.1673506864088428, - "k1_kl": 0.01312255859375, - "k3_kl": 0.0054779052734375, - "kimi_kl": 0.008056640625, - "learning_rate": 4.756e-07, - "loss": 0.0002, - "ppl": 0.032470703125, - "reward": 0.9676444828510284, - "reward_std": 0.02491148840636015, - "rewards/perpo_ocr_edit_distance_reward": 0.9676446616649628, + "advantages": -4.427773774295929e-07, + "completion_length": 520.0, + "delta_ref_entropy_loss": 0.02099609375, + "delta_ref_ppl": -0.016357421875, + "entropy_loss": -0.09619140625, + "epoch": 0.0244, + "grad_norm": 1.2713813219881465, + "k1_kl": 0.0164794921875, + "k3_kl": 0.00927734375, + "kimi_kl": 0.0155029296875, + "learning_rate": 4.878e-07, + "loss": 0.0004, + "ppl": 0.04345703125, + "reward": 0.8140645027160645, + "reward_std": 0.20631656050682068, + "rewards/perpo_ocr_edit_distance_reward": 0.8140645623207092, "step": 122, "temperature": 0.9 }, { - "advantages": -4.77433241030667e-05, - "completion_length": 291.5, - "delta_ref_entropy_loss": 0.017059326171875, - "delta_ref_ppl": -0.0097503662109375, - "entropy_loss": -0.01629638671875, - "epoch": 0.0492, - "grad_norm": 0.22890364536027163, - "k1_kl": 0.0097808837890625, - "k3_kl": 0.00484466552734375, - "kimi_kl": 0.0087127685546875, - "learning_rate": 4.754e-07, - "loss": 0.0002, - "ppl": 0.0034027099609375, - "reward": 0.9984558820724487, - "reward_std": 0.00012833959772251546, - "rewards/perpo_ocr_edit_distance_reward": 0.9984559118747711, + "advantages": -7.237706967089252e-08, + "completion_length": 1012.0, + "delta_ref_entropy_loss": 0.007659912109375, + "delta_ref_ppl": -0.00408935546875, + "entropy_loss": -0.212890625, + "epoch": 0.0246, + "grad_norm": 1.5258263498976863, + "k1_kl": 0.004058837890625, + "k3_kl": 0.0026397705078125, + "kimi_kl": 0.0033721923828125, + "learning_rate": 4.877e-07, + "loss": 0.0001, + "ppl": 0.10693359375, + "reward": 0.3632601797580719, + "reward_std": 0.1435529887676239, + "rewards/perpo_ocr_edit_distance_reward": 0.3632602095603943, "step": 123, "temperature": 0.9 }, { - "advantages": -1.903091231270082e-06, - "completion_length": 560.5, - "delta_ref_entropy_loss": 0.013641357421875, - "delta_ref_ppl": 0.0006103515625, - "entropy_loss": -0.0611572265625, - "epoch": 0.0496, - "grad_norm": 57786.98810815703, - "k1_kl": -0.00067138671875, - "k3_kl": 87.00205993652344, - "kimi_kl": 0.06610107421875, - "learning_rate": 4.7519999999999997e-07, - "loss": 3.4858, - "ppl": 0.0391845703125, - "reward": 0.938787966966629, - "reward_std": 0.018849065992981195, - "rewards/perpo_ocr_edit_distance_reward": 0.9387880265712738, + "advantages": -3.576278970740532e-07, + "completion_length": 370.0, + "delta_ref_entropy_loss": 0.01953125, + "delta_ref_ppl": -0.0128173828125, + "entropy_loss": -0.042236328125, + "epoch": 0.0248, + "grad_norm": 0.6154456660446834, + "k1_kl": 0.01275634765625, + "k3_kl": 0.005035400390625, + "kimi_kl": 0.00909423828125, + "learning_rate": 4.876e-07, + "loss": 0.0002, + "ppl": 0.01495361328125, + "reward": 0.905646026134491, + "reward_std": 0.18908004462718964, + "rewards/perpo_ocr_edit_distance_reward": 0.9056460857391357, "step": 124, "temperature": 0.9 }, { - "advantages": -1.5256661072271527e-05, - "completion_length": 694.0, - "delta_ref_entropy_loss": 0.0087127685546875, - "delta_ref_ppl": -0.004241943359375, - "entropy_loss": -0.03594970703125, - "epoch": 0.05, - "grad_norm": 1.0765645926661762, - "k1_kl": 0.004241943359375, - "k3_kl": 0.002117156982421875, - "kimi_kl": 0.002452850341796875, - "learning_rate": 4.7499999999999995e-07, - "loss": 0.0001, - "ppl": 0.0160980224609375, - "reward": 0.9936719834804535, - "reward_std": 0.0031671840260969475, - "rewards/perpo_ocr_edit_distance_reward": 0.9936720728874207, + "advantages": -4.2157516872975975e-05, + "completion_length": 351.0, + "delta_ref_entropy_loss": 0.01275634765625, + "delta_ref_ppl": -0.006378173828125, + "entropy_loss": -0.021240234375, + "epoch": 0.025, + "grad_norm": 0.6048379245315202, + "k1_kl": 0.00640869140625, + "k3_kl": 0.002777099609375, + "kimi_kl": 0.00433349609375, + "learning_rate": 4.875e-07, + "loss": 0.0002, + "ppl": 0.01007080078125, + "reward": 0.98166823387146, + "reward_std": 0.0011114974040538073, + "rewards/perpo_ocr_edit_distance_reward": 0.9816683530807495, "step": 125, "temperature": 0.9 }, { - "advantages": -5.768878565959312e-07, - "completion_length": 1226.5, - "delta_ref_entropy_loss": 0.008056640625, - "delta_ref_ppl": -0.0096282958984375, - "entropy_loss": -0.1884765625, - "epoch": 0.0504, - "grad_norm": 1.3323827752501822, - "k1_kl": 0.0096435546875, - "k3_kl": 0.00502777099609375, - "kimi_kl": 0.0079803466796875, - "learning_rate": 4.748e-07, - "loss": 0.0002, - "ppl": 0.0986328125, - "reward": 0.6541241258382797, - "reward_std": 0.029561500065028667, - "rewards/perpo_ocr_edit_distance_reward": 0.6541241407394409, + "advantages": -4.938671054333099e-07, + "completion_length": 1348.0, + "delta_ref_entropy_loss": 0.01275634765625, + "delta_ref_ppl": -0.004486083984375, + "entropy_loss": -0.11669921875, + "epoch": 0.0252, + "grad_norm": 1.8408778672247086, + "k1_kl": 0.00439453125, + "k3_kl": 0.0035400390625, + "kimi_kl": 0.003448486328125, + "learning_rate": 4.874e-07, + "loss": 0.0001, + "ppl": 0.05859375, + "reward": 0.6277509331703186, + "reward_std": 0.17655575275421143, + "rewards/perpo_ocr_edit_distance_reward": 0.6277509927749634, "step": 126, "temperature": 0.9 }, { - "advantages": -3.653339184950255e-05, - "completion_length": 857.0, - "delta_ref_entropy_loss": 0.00836181640625, - "delta_ref_ppl": -0.00588226318359375, - "entropy_loss": -0.1241455078125, - "epoch": 0.0508, - "grad_norm": 0.6242892946612723, - "k1_kl": 0.00577545166015625, - "k3_kl": 0.0038280487060546875, - "kimi_kl": 0.00751495361328125, - "learning_rate": 4.746e-07, - "loss": 0.0002, - "ppl": 0.0656280517578125, - "reward": 0.9269035160541534, - "reward_std": 0.03388843865832314, - "rewards/perpo_ocr_edit_distance_reward": 0.9269035756587982, + "advantages": -8.889607670425903e-06, + "completion_length": 1305.0, + "delta_ref_entropy_loss": 0.00921630859375, + "delta_ref_ppl": -0.005950927734375, + "entropy_loss": -0.03759765625, + "epoch": 0.0254, + "grad_norm": 0.5867607643974163, + "k1_kl": 0.00592041015625, + "k3_kl": 0.0025787353515625, + "kimi_kl": 0.004913330078125, + "learning_rate": 4.873e-07, + "loss": 0.0001, + "ppl": 0.0186767578125, + "reward": 0.9839828610420227, + "reward_std": 0.0008582557202316821, + "rewards/perpo_ocr_edit_distance_reward": 0.9839829206466675, "step": 127, "temperature": 0.9 }, { - "advantages": -1.3142825082468335e-05, - "completion_length": 176.5, - "delta_ref_entropy_loss": -0.01049041748046875, - "delta_ref_ppl": -0.02691650390625, - "entropy_loss": -0.040283203125, - "epoch": 0.0512, - "grad_norm": 0.48402453564744335, - "k1_kl": 0.02703857421875, - "k3_kl": 0.019287109375, - "kimi_kl": 0.0435791015625, - "learning_rate": 4.7439999999999996e-07, - "loss": 0.0008, - "ppl": 0.010101318359375, - "reward": 0.9963979125022888, - "reward_std": 0.0007588625303469598, - "rewards/perpo_ocr_edit_distance_reward": 0.9963979125022888, + "advantages": -4.982948667020537e-05, + "completion_length": 662.0, + "delta_ref_entropy_loss": 0.01519775390625, + "delta_ref_ppl": -0.0091552734375, + "entropy_loss": -0.0225830078125, + "epoch": 0.0256, + "grad_norm": 4.593582273997591, + "k1_kl": 0.0091552734375, + "k3_kl": 0.0029449462890625, + "kimi_kl": 0.004730224609375, + "learning_rate": 4.872e-07, + "loss": 0.0002, + "ppl": 0.0079345703125, + "reward": 0.9852816462516785, + "reward_std": 0.0012672662269324064, + "rewards/perpo_ocr_edit_distance_reward": 0.9852817058563232, "step": 128, "temperature": 0.9 }, { - "advantages": -5.141113547324494e-05, - "completion_length": 511.0, - "delta_ref_entropy_loss": 0.015106201171875, - "delta_ref_ppl": -0.00885009765625, - "entropy_loss": -0.0389404296875, - "epoch": 0.0516, - "grad_norm": 1.4455766497606544, - "k1_kl": 0.0088043212890625, - "k3_kl": 0.003162384033203125, - "kimi_kl": 0.0046539306640625, - "learning_rate": 4.742e-07, - "loss": 0.0002, - "ppl": 0.016357421875, - "reward": 0.9758402109146118, - "reward_std": 0.009786989801796153, - "rewards/perpo_ocr_edit_distance_reward": 0.9758402705192566, + "advantages": 1.5599387552356347e-05, + "completion_length": 107.0, + "delta_ref_entropy_loss": 0.024658203125, + "delta_ref_ppl": -0.037109375, + "entropy_loss": -0.0986328125, + "epoch": 0.0258, + "grad_norm": 2.4347256763592493, + "k1_kl": 0.036865234375, + "k3_kl": 0.01507568359375, + "kimi_kl": 0.0234375, + "learning_rate": 4.871e-07, + "loss": 0.0006, + "ppl": 0.041015625, + "reward": 0.9342697858810425, + "reward_std": 0.0009916768176481128, + "rewards/perpo_ocr_edit_distance_reward": 0.9342697858810425, "step": 129, "temperature": 0.9 }, { - "advantages": -6.667205596500025e-06, - "completion_length": 465.0, - "delta_ref_entropy_loss": 0.011383056640625, - "delta_ref_ppl": -0.0096893310546875, - "entropy_loss": -0.0587158203125, - "epoch": 0.052, - "grad_norm": 0.9154024779996199, - "k1_kl": 0.0096435546875, - "k3_kl": 0.00514984130859375, - "kimi_kl": 0.00714874267578125, - "learning_rate": 4.7399999999999993e-07, - "loss": 0.0002, - "ppl": 0.029541015625, - "reward": 0.9587422013282776, - "reward_std": 0.03414419433102012, - "rewards/perpo_ocr_edit_distance_reward": 0.9587422311306, + "advantages": 0.0, + "completion_length": 114.0, + "delta_ref_entropy_loss": -0.1865234375, + "delta_ref_ppl": -0.337890625, + "entropy_loss": -1.015625, + "epoch": 0.026, + "grad_norm": 13.857072498434855, + "k1_kl": 0.337890625, + "k3_kl": 0.310546875, + "kimi_kl": 0.88671875, + "learning_rate": 4.87e-07, + "loss": 0.0124, + "ppl": 0.6796875, + "reward": 0.13139329850673676, + "reward_std": 0.11160807311534882, + "rewards/perpo_ocr_edit_distance_reward": 0.13139329850673676, "step": 130, "temperature": 0.9 }, { - "advantages": -3.661428280565815e-07, - "completion_length": 353.5, - "delta_ref_entropy_loss": -0.051116943359375, - "delta_ref_ppl": -0.073883056640625, - "entropy_loss": -0.3056640625, - "epoch": 0.0524, - "grad_norm": 10.598512460344537, - "k1_kl": 0.074371337890625, - "k3_kl": 0.091156005859375, - "kimi_kl": 0.20001220703125, - "learning_rate": 4.7379999999999997e-07, - "loss": 0.0036, - "ppl": 0.156005859375, - "reward": 0.7368461191654205, - "reward_std": 0.28116921335458755, - "rewards/perpo_ocr_edit_distance_reward": 0.7368462085723877, + "advantages": -4.8373429308412597e-05, + "completion_length": 197.0, + "delta_ref_entropy_loss": 0.016357421875, + "delta_ref_ppl": -0.016357421875, + "entropy_loss": -0.07568359375, + "epoch": 0.0262, + "grad_norm": 1.2302608496685388, + "k1_kl": 0.016357421875, + "k3_kl": 0.00811767578125, + "kimi_kl": 0.012451171875, + "learning_rate": 4.868999999999999e-07, + "loss": 0.0004, + "ppl": 0.038330078125, + "reward": 0.9648076295852661, + "reward_std": 0.0013085345271974802, + "rewards/perpo_ocr_edit_distance_reward": 0.9648076891899109, "step": 131, "temperature": 0.9 }, { - "advantages": -6.8715642100869445e-06, - "completion_length": 408.5, - "delta_ref_entropy_loss": 0.003406524658203125, - "delta_ref_ppl": -0.0176849365234375, - "entropy_loss": -0.0401611328125, - "epoch": 0.0528, - "grad_norm": 2.875506565548719, - "k1_kl": 0.017608642578125, - "k3_kl": 0.01021575927734375, - "kimi_kl": 0.0159912109375, - "learning_rate": 4.736e-07, - "loss": 0.0004, - "ppl": 0.0158233642578125, - "reward": 0.9920375347137451, - "reward_std": 0.006021856679581106, - "rewards/perpo_ocr_edit_distance_reward": 0.9920376241207123, + "advantages": -8.598396379966289e-05, + "completion_length": 372.0, + "delta_ref_entropy_loss": 0.0107421875, + "delta_ref_ppl": -0.00811767578125, + "entropy_loss": -0.027587890625, + "epoch": 0.0264, + "grad_norm": 0.7016877797635227, + "k1_kl": 0.00811767578125, + "k3_kl": 0.0037841796875, + "kimi_kl": 0.00640869140625, + "learning_rate": 4.867999999999999e-07, + "loss": 0.0002, + "ppl": 0.01068115234375, + "reward": 0.9846912026405334, + "reward_std": 0.00039515376556664705, + "rewards/perpo_ocr_edit_distance_reward": 0.9846912622451782, "step": 132, "temperature": 0.9 }, { - "advantages": -9.702785324527952e-06, - "completion_length": 195.0, - "delta_ref_entropy_loss": 0.011871337890625, - "delta_ref_ppl": -0.009521484375, - "entropy_loss": -0.0718994140625, - "epoch": 0.0532, - "grad_norm": 3.259030314020455, - "k1_kl": 0.00951385498046875, - "k3_kl": 0.0140380859375, - "kimi_kl": 0.0174560546875, - "learning_rate": 4.734e-07, - "loss": 0.0006, - "ppl": 0.0382080078125, - "reward": 0.918494701385498, - "reward_std": 0.16987746371887624, - "rewards/perpo_ocr_edit_distance_reward": 0.9184947609901428, + "advantages": 2.7247838829680404e-07, + "completion_length": 1060.0, + "delta_ref_entropy_loss": 0.01190185546875, + "delta_ref_ppl": -0.01409912109375, + "entropy_loss": -0.04638671875, + "epoch": 0.0266, + "grad_norm": 1.2658008801289409, + "k1_kl": 0.01409912109375, + "k3_kl": 0.00811767578125, + "kimi_kl": 0.016357421875, + "learning_rate": 4.867e-07, + "loss": 0.0003, + "ppl": 0.02294921875, + "reward": 0.9341219663619995, + "reward_std": 0.03133729100227356, + "rewards/perpo_ocr_edit_distance_reward": 0.9341219663619995, "step": 133, "temperature": 0.9 }, { - "advantages": -1.575904263972916e-05, - "completion_length": 450.0, - "delta_ref_entropy_loss": 0.01922607421875, - "delta_ref_ppl": -0.011993408203125, - "entropy_loss": -0.104248046875, - "epoch": 0.0536, - "grad_norm": 1.1654039597935428, - "k1_kl": 0.011962890625, - "k3_kl": 0.0053863525390625, - "kimi_kl": 0.008941650390625, - "learning_rate": 4.732e-07, - "loss": 0.0002, - "ppl": 0.0477294921875, - "reward": 0.9256871938705444, - "reward_std": 0.023044971516355872, - "rewards/perpo_ocr_edit_distance_reward": 0.9256872534751892, + "advantages": -1.8460410501575097e-05, + "completion_length": 614.0, + "delta_ref_entropy_loss": 0.00775146484375, + "delta_ref_ppl": -0.0133056640625, + "entropy_loss": -0.025146484375, + "epoch": 0.0268, + "grad_norm": 0.7744654340004753, + "k1_kl": 0.0133056640625, + "k3_kl": 0.00848388671875, + "kimi_kl": 0.0216064453125, + "learning_rate": 4.865999999999999e-07, + "loss": 0.0004, + "ppl": 0.01025390625, + "reward": 0.9513046741485596, + "reward_std": 0.0035917749628424644, + "rewards/perpo_ocr_edit_distance_reward": 0.9513047337532043, "step": 134, "temperature": 0.9 }, { - "advantages": -2.069132766457926e-06, - "completion_length": 604.0, - "delta_ref_entropy_loss": 0.011871337890625, - "delta_ref_ppl": -0.00628662109375, - "entropy_loss": -0.06689453125, - "epoch": 0.054, - "grad_norm": 0.8834584314057794, - "k1_kl": 0.0063323974609375, - "k3_kl": 0.0026092529296875, - "kimi_kl": 0.00345611572265625, - "learning_rate": 4.7299999999999996e-07, - "loss": 0.0001, - "ppl": 0.029052734375, - "reward": 0.9755570888519287, - "reward_std": 0.002880120533518493, - "rewards/perpo_ocr_edit_distance_reward": 0.9755571186542511, + "advantages": -9.579318430041894e-06, + "completion_length": 342.0, + "delta_ref_entropy_loss": 0.00150299072265625, + "delta_ref_ppl": -0.0164794921875, + "entropy_loss": -0.083984375, + "epoch": 0.027, + "grad_norm": 1.1765222706430882, + "k1_kl": 0.0164794921875, + "k3_kl": 0.01324462890625, + "kimi_kl": 0.0255126953125, + "learning_rate": 4.864999999999999e-07, + "loss": 0.0005, + "ppl": 0.04248046875, + "reward": 0.9762692451477051, + "reward_std": 0.008796692825853825, + "rewards/perpo_ocr_edit_distance_reward": 0.9762693047523499, "step": 135, "temperature": 0.9 }, { - "advantages": -7.067408205330139e-07, - "completion_length": 349.5, - "delta_ref_entropy_loss": 0.0081939697265625, - "delta_ref_ppl": -0.01409912109375, - "entropy_loss": -0.137451171875, - "epoch": 0.0544, - "grad_norm": 1.6021337624063512, - "k1_kl": 0.014251708984375, - "k3_kl": 0.011871337890625, - "kimi_kl": 0.022247314453125, - "learning_rate": 4.728e-07, + "advantages": -2.8269632821320556e-05, + "completion_length": 822.0, + "delta_ref_entropy_loss": 0.019775390625, + "delta_ref_ppl": -0.0174560546875, + "entropy_loss": -0.033935546875, + "epoch": 0.0272, + "grad_norm": 0.508574542188369, + "k1_kl": 0.0174560546875, + "k3_kl": 0.01129150390625, + "kimi_kl": 0.041015625, + "learning_rate": 4.864e-07, "loss": 0.0005, - "ppl": 0.07763671875, - "reward": 0.9091335535049438, - "reward_std": 0.09843239560723305, - "rewards/perpo_ocr_edit_distance_reward": 0.909133642911911, + "ppl": 0.014404296875, + "reward": 0.976406455039978, + "reward_std": 0.0011045127175748348, + "rewards/perpo_ocr_edit_distance_reward": 0.9764065742492676, "step": 136, "temperature": 0.9 }, { - "advantages": -2.8482506309046585e-05, - "completion_length": 396.5, - "delta_ref_entropy_loss": 0.015838623046875, - "delta_ref_ppl": -0.00994873046875, - "entropy_loss": -0.102783203125, - "epoch": 0.0548, - "grad_norm": 1.1254723006493323, - "k1_kl": 0.00982666015625, - "k3_kl": 0.00392913818359375, - "kimi_kl": 0.00582122802734375, - "learning_rate": 4.726e-07, + "advantages": -2.043587983280304e-07, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.0025177001953125, + "delta_ref_ppl": -0.005157470703125, + "entropy_loss": -0.1728515625, + "epoch": 0.0274, + "grad_norm": 2.244570653915853, + "k1_kl": 0.005157470703125, + "k3_kl": 0.0042724609375, + "kimi_kl": 0.00537109375, + "learning_rate": 4.863e-07, "loss": 0.0002, - "ppl": 0.0537109375, - "reward": 0.9106909334659576, - "reward_std": 0.028415104665327817, - "rewards/perpo_ocr_edit_distance_reward": 0.9106909930706024, + "ppl": 0.07763671875, + "reward": 0.24311450123786926, + "reward_std": 0.0505901500582695, + "rewards/perpo_ocr_edit_distance_reward": 0.24311453104019165, "step": 137, "temperature": 0.9 }, { - "advantages": -0.00010817392259365732, - "completion_length": 804.5, - "delta_ref_entropy_loss": 0.014801025390625, - "delta_ref_ppl": -0.007354736328125, - "entropy_loss": -0.199462890625, - "epoch": 0.0552, - "grad_norm": 1.2026115635238577, - "k1_kl": 0.007354736328125, - "k3_kl": 0.0035686492919921875, - "kimi_kl": 0.00569915771484375, - "learning_rate": 4.7239999999999997e-07, - "loss": 0.0003, - "ppl": 0.1057281494140625, - "reward": 0.8473865389823914, - "reward_std": 0.013203137852542568, - "rewards/perpo_ocr_edit_distance_reward": 0.8473865687847137, + "advantages": -1.8732889373040962e-07, + "completion_length": 138.0, + "delta_ref_entropy_loss": -0.00653076171875, + "delta_ref_ppl": -0.041259765625, + "entropy_loss": -0.0390625, + "epoch": 0.0276, + "grad_norm": 1.7346522394531225, + "k1_kl": 0.04150390625, + "k3_kl": 0.0322265625, + "kimi_kl": 0.10595703125, + "learning_rate": 4.862e-07, + "loss": 0.0013, + "ppl": 0.009033203125, + "reward": 0.8942802548408508, + "reward_std": 0.1740618497133255, + "rewards/perpo_ocr_edit_distance_reward": 0.8942802548408508, "step": 138, "temperature": 0.9 }, { - "advantages": -1.1290823238141456e-05, - "completion_length": 564.0, - "delta_ref_entropy_loss": 0.015594482421875, - "delta_ref_ppl": -0.01190185546875, - "entropy_loss": -0.0650634765625, - "epoch": 0.0556, - "grad_norm": 0.9477885657204357, - "k1_kl": 0.011962890625, - "k3_kl": 0.00749969482421875, - "kimi_kl": 0.0171661376953125, - "learning_rate": 4.722e-07, - "loss": 0.0003, - "ppl": 0.03179931640625, - "reward": 0.9238964915275574, - "reward_std": 0.08518733968958259, - "rewards/perpo_ocr_edit_distance_reward": 0.9238965809345245, + "advantages": -8.514949634275126e-09, + "completion_length": 746.0, + "delta_ref_entropy_loss": 0.00494384765625, + "delta_ref_ppl": -0.0079345703125, + "entropy_loss": -0.0458984375, + "epoch": 0.0278, + "grad_norm": 0.731620591886102, + "k1_kl": 0.0079345703125, + "k3_kl": 0.005889892578125, + "kimi_kl": 0.0103759765625, + "learning_rate": 4.861e-07, + "loss": 0.0002, + "ppl": 0.0185546875, + "reward": 0.6121897101402283, + "reward_std": 0.09883197396993637, + "rewards/perpo_ocr_edit_distance_reward": 0.6121897101402283, "step": 139, "temperature": 0.9 }, { - "advantages": -4.159552872806671e-06, - "completion_length": 546.5, - "delta_ref_entropy_loss": 0.01239776611328125, - "delta_ref_ppl": -0.0130157470703125, - "entropy_loss": -0.16748046875, - "epoch": 0.056, - "grad_norm": 1.33490170900731, - "k1_kl": 0.0129852294921875, - "k3_kl": 0.00693511962890625, - "kimi_kl": 0.0141754150390625, - "learning_rate": 4.7199999999999994e-07, + "advantages": -3.5677639971254393e-06, + "completion_length": 414.0, + "delta_ref_entropy_loss": 0.01416015625, + "delta_ref_ppl": -0.01373291015625, + "entropy_loss": -0.0439453125, + "epoch": 0.028, + "grad_norm": 1.4448017394152526, + "k1_kl": 0.01373291015625, + "k3_kl": 0.00750732421875, + "kimi_kl": 0.0128173828125, + "learning_rate": 4.86e-07, "loss": 0.0003, - "ppl": 0.0892333984375, - "reward": 0.9070712924003601, - "reward_std": 0.014862296171486378, - "rewards/perpo_ocr_edit_distance_reward": 0.9070713222026825, + "ppl": 0.015869140625, + "reward": 0.9707626104354858, + "reward_std": 0.002270621480420232, + "rewards/perpo_ocr_edit_distance_reward": 0.9707626104354858, "step": 140, "temperature": 0.9 }, { - "advantages": -2.378438330197241e-05, - "completion_length": 271.5, - "delta_ref_entropy_loss": 0.038970947265625, - "delta_ref_ppl": -0.03387451171875, - "entropy_loss": -0.057861328125, - "epoch": 0.0564, - "grad_norm": 0.4475024107095267, - "k1_kl": 0.03387451171875, - "k3_kl": 0.0125274658203125, - "kimi_kl": 0.019134521484375, - "learning_rate": 4.718e-07, - "loss": 0.0005, - "ppl": 0.0185546875, - "reward": 0.9735345542430878, - "reward_std": 0.000486750592244789, - "rewards/perpo_ocr_edit_distance_reward": 0.9735345840454102, + "advantages": 8.174351933121216e-07, + "completion_length": 1056.0, + "delta_ref_entropy_loss": 0.0167236328125, + "delta_ref_ppl": -0.00830078125, + "entropy_loss": -0.04638671875, + "epoch": 0.0282, + "grad_norm": 1.1699191741245714, + "k1_kl": 0.00830078125, + "k3_kl": 0.003936767578125, + "kimi_kl": 0.005279541015625, + "learning_rate": 4.859e-07, + "loss": 0.0002, + "ppl": 0.0245361328125, + "reward": 0.9768544435501099, + "reward_std": 0.010185855440795422, + "rewards/perpo_ocr_edit_distance_reward": 0.9768545031547546, "step": 141, "temperature": 0.9 }, { - "advantages": -0.00031999179373087827, - "completion_length": 323.5, - "delta_ref_entropy_loss": 0.009521484375, - "delta_ref_ppl": -0.016082763671875, - "entropy_loss": -0.03778076171875, - "epoch": 0.0568, - "grad_norm": 0.34692538393470307, - "k1_kl": 0.01611328125, - "k3_kl": 0.0089263916015625, - "kimi_kl": 0.018341064453125, - "learning_rate": 4.716e-07, - "loss": 0.0007, - "ppl": 0.011993408203125, - "reward": 0.9963905811309814, - "reward_std": 0.0005313934525474906, - "rewards/perpo_ocr_edit_distance_reward": 0.9963906407356262, + "advantages": -0.000199760717805475, + "completion_length": 423.0, + "delta_ref_entropy_loss": 0.024169921875, + "delta_ref_ppl": -0.01519775390625, + "entropy_loss": -0.01318359375, + "epoch": 0.0284, + "grad_norm": 0.6325209527262582, + "k1_kl": 0.01519775390625, + "k3_kl": 0.00811767578125, + "kimi_kl": 0.01495361328125, + "learning_rate": 4.858e-07, + "loss": 0.0005, + "ppl": 0.00494384765625, + "reward": 0.9867319464683533, + "reward_std": 0.00024102107272483408, + "rewards/perpo_ocr_edit_distance_reward": 0.986732006072998, "step": 142, "temperature": 0.9 }, { - "advantages": -4.725797097648865e-06, - "completion_length": 1016.0, - "delta_ref_entropy_loss": 0.0125732421875, - "delta_ref_ppl": -0.00732421875, - "entropy_loss": -0.080810546875, - "epoch": 0.0572, - "grad_norm": 0.7496136635574357, - "k1_kl": 0.00732421875, - "k3_kl": 0.003170013427734375, - "kimi_kl": 0.00397491455078125, - "learning_rate": 4.7139999999999995e-07, - "loss": 0.0001, - "ppl": 0.0374755859375, - "reward": 0.9733289480209351, - "reward_std": 0.012209455017000437, - "rewards/perpo_ocr_edit_distance_reward": 0.9733290076255798, + "advantages": -5.3984781516192015e-06, + "completion_length": 1218.0, + "delta_ref_entropy_loss": 0.0140380859375, + "delta_ref_ppl": -0.00860595703125, + "entropy_loss": -0.0250244140625, + "epoch": 0.0286, + "grad_norm": 0.9395522173152546, + "k1_kl": 0.0086669921875, + "k3_kl": 0.004150390625, + "kimi_kl": 0.0079345703125, + "learning_rate": 4.857e-07, + "loss": 0.0002, + "ppl": 0.0103759765625, + "reward": 0.9720858335494995, + "reward_std": 0.010954543948173523, + "rewards/perpo_ocr_edit_distance_reward": 0.9720859527587891, "step": 143, "temperature": 0.9 }, { - "advantages": -3.6103386467090104e-06, - "completion_length": 191.5, - "delta_ref_entropy_loss": 0.0108184814453125, - "delta_ref_ppl": -0.0117034912109375, - "entropy_loss": -0.098388671875, - "epoch": 0.0576, - "grad_norm": 2.069454581267175, - "k1_kl": 0.0117034912109375, - "k3_kl": 0.0118408203125, - "kimi_kl": 0.01971435546875, - "learning_rate": 4.712e-07, - "loss": 0.0005, - "ppl": 0.0498046875, - "reward": 0.6401745975017548, - "reward_std": 0.18197144893929362, - "rewards/perpo_ocr_edit_distance_reward": 0.640174612402916, + "advantages": 1.004764044409967e-06, + "completion_length": 654.0, + "delta_ref_entropy_loss": 0.018798828125, + "delta_ref_ppl": -0.01104736328125, + "entropy_loss": -0.052978515625, + "epoch": 0.0288, + "grad_norm": 2.5150476357530556, + "k1_kl": 0.01104736328125, + "k3_kl": 0.004241943359375, + "kimi_kl": 0.00799560546875, + "learning_rate": 4.856e-07, + "loss": 0.0002, + "ppl": 0.0184326171875, + "reward": 0.9167401790618896, + "reward_std": 0.008382929489016533, + "rewards/perpo_ocr_edit_distance_reward": 0.9167401790618896, "step": 144, "temperature": 0.9 }, { - "advantages": -5.17708923553073e-06, - "completion_length": 472.5, - "delta_ref_entropy_loss": 0.0216064453125, - "delta_ref_ppl": -0.02001953125, - "entropy_loss": -0.107421875, - "epoch": 0.058, - "grad_norm": 8.504362299446928, - "k1_kl": 0.0198974609375, - "k3_kl": 0.009613037109375, - "kimi_kl": 0.016357421875, - "learning_rate": 4.7099999999999997e-07, - "loss": 0.0004, - "ppl": 0.0518798828125, - "reward": 0.9631153643131256, - "reward_std": 0.024834759766235948, - "rewards/perpo_ocr_edit_distance_reward": 0.963115394115448, + "advantages": -3.4059798537100505e-08, + "completion_length": 687.0, + "delta_ref_entropy_loss": 0.0140380859375, + "delta_ref_ppl": -0.01806640625, + "entropy_loss": -0.2294921875, + "epoch": 0.029, + "grad_norm": 1.2011720421582628, + "k1_kl": 0.01806640625, + "k3_kl": 0.011962890625, + "kimi_kl": 0.030029296875, + "learning_rate": 4.854999999999999e-07, + "loss": 0.0005, + "ppl": 0.119140625, + "reward": 0.6268883943557739, + "reward_std": 0.2742255926132202, + "rewards/perpo_ocr_edit_distance_reward": 0.6268884539604187, "step": 145, "temperature": 0.9 }, { - "advantages": -2.2556100930160028e-05, - "completion_length": 596.5, - "delta_ref_entropy_loss": 0.014434814453125, - "delta_ref_ppl": -0.0112152099609375, - "entropy_loss": -0.03240966796875, - "epoch": 0.0584, - "grad_norm": 0.6944914060701867, - "k1_kl": 0.0112762451171875, - "k3_kl": 0.00521087646484375, - "kimi_kl": 0.0091552734375, - "learning_rate": 4.7079999999999995e-07, - "loss": 0.0002, - "ppl": 0.01080322265625, - "reward": 0.9950167834758759, - "reward_std": 0.005412053142208606, - "rewards/perpo_ocr_edit_distance_reward": 0.9950168430805206, + "advantages": -1.1580331147342804e-06, + "completion_length": 135.0, + "delta_ref_entropy_loss": 0.07958984375, + "delta_ref_ppl": -0.037841796875, + "entropy_loss": -0.3828125, + "epoch": 0.0292, + "grad_norm": 6.872764280239594, + "k1_kl": 0.0380859375, + "k3_kl": 0.0203857421875, + "kimi_kl": 0.0322265625, + "learning_rate": 4.853999999999999e-07, + "loss": 0.0008, + "ppl": 0.2109375, + "reward": 0.46341845393180847, + "reward_std": 0.0299062617123127, + "rewards/perpo_ocr_edit_distance_reward": 0.46341851353645325, "step": 146, "temperature": 0.9 }, { - "advantages": 1.617840297285511e-07, - "completion_length": 691.0, - "delta_ref_entropy_loss": 0.011962890625, - "delta_ref_ppl": -0.0065765380859375, - "entropy_loss": -0.0433349609375, - "epoch": 0.0588, - "grad_norm": 0.8535642778802609, - "k1_kl": 0.0065765380859375, - "k3_kl": 0.00396728515625, - "kimi_kl": 0.0074920654296875, - "learning_rate": 4.706e-07, + "advantages": -3.8317273265420226e-07, + "completion_length": 919.0, + "delta_ref_entropy_loss": 0.000278472900390625, + "delta_ref_ppl": -0.00830078125, + "entropy_loss": -0.2490234375, + "epoch": 0.0294, + "grad_norm": 1.600203844697749, + "k1_kl": 0.00830078125, + "k3_kl": 0.005859375, + "kimi_kl": 0.006591796875, + "learning_rate": 4.853e-07, "loss": 0.0002, - "ppl": 0.017578125, - "reward": 0.92594975233078, - "reward_std": 0.10800753347575665, - "rewards/perpo_ocr_edit_distance_reward": 0.9259497821331024, + "ppl": 0.1376953125, + "reward": 0.4156799912452698, + "reward_std": 0.04391279071569443, + "rewards/perpo_ocr_edit_distance_reward": 0.41568002104759216, "step": 147, "temperature": 0.9 }, { - "advantages": -3.498367161114402e-05, - "completion_length": 1230.5, - "delta_ref_entropy_loss": 0.014678955078125, - "delta_ref_ppl": -0.011199951171875, - "entropy_loss": -0.2354736328125, - "epoch": 0.0592, - "grad_norm": 1.5144602086287924, - "k1_kl": 0.011474609375, - "k3_kl": 0.005645751953125, - "kimi_kl": 0.009002685546875, - "learning_rate": 4.704e-07, - "loss": 0.0003, - "ppl": 0.132415771484375, - "reward": 0.8467992544174194, - "reward_std": 0.10804072438622825, - "rewards/perpo_ocr_edit_distance_reward": 0.8467993438243866, + "advantages": -4.246405296726152e-05, + "completion_length": 322.0, + "delta_ref_entropy_loss": 0.0081787109375, + "delta_ref_ppl": -0.01385498046875, + "entropy_loss": -0.04541015625, + "epoch": 0.0296, + "grad_norm": 1.32573868148698, + "k1_kl": 0.0137939453125, + "k3_kl": 0.00860595703125, + "kimi_kl": 0.0194091796875, + "learning_rate": 4.852e-07, + "loss": 0.0004, + "ppl": 0.012939453125, + "reward": 0.9728103876113892, + "reward_std": 0.0007018144242465496, + "rewards/perpo_ocr_edit_distance_reward": 0.9728103876113892, "step": 148, "temperature": 0.9 }, { - "advantages": -4.862036154129612e-06, - "completion_length": 501.5, - "delta_ref_entropy_loss": 0.01708984375, - "delta_ref_ppl": -0.00926971435546875, - "entropy_loss": -0.1279296875, - "epoch": 0.0596, - "grad_norm": 1.2631639199265687, - "k1_kl": 0.00926971435546875, - "k3_kl": 0.00397491455078125, - "kimi_kl": 0.00495147705078125, - "learning_rate": 4.7019999999999996e-07, - "loss": 0.0002, - "ppl": 0.0675048828125, - "reward": 0.7891462445259094, - "reward_std": 0.1346940038492903, - "rewards/perpo_ocr_edit_distance_reward": 0.7891462445259094, + "advantages": -3.2356808787881164e-07, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.0205078125, + "delta_ref_ppl": -0.000278472900390625, + "entropy_loss": -0.240234375, + "epoch": 0.0298, + "grad_norm": 107.80897563372874, + "k1_kl": 0.0002040863037109375, + "k3_kl": 0.08056640625, + "kimi_kl": 0.0269775390625, + "learning_rate": 4.850999999999999e-07, + "loss": 0.0032, + "ppl": 0.1796875, + "reward": 0.4766707122325897, + "reward_std": 0.2038099318742752, + "rewards/perpo_ocr_edit_distance_reward": 0.4766707718372345, "step": 149, "temperature": 0.9 }, { - "advantages": -5.534717359978458e-07, - "completion_length": 1244.5, - "delta_ref_entropy_loss": 0.014007568359375, - "delta_ref_ppl": -0.006866455078125, - "entropy_loss": -0.1502685546875, - "epoch": 0.06, - "grad_norm": 1.355517333460202, - "k1_kl": 0.0068359375, - "k3_kl": 0.003360748291015625, - "kimi_kl": 0.00429534912109375, - "learning_rate": 4.6999999999999995e-07, + "advantages": -9.877342108666198e-07, + "completion_length": 1401.0, + "delta_ref_entropy_loss": 0.0155029296875, + "delta_ref_ppl": -0.006927490234375, + "entropy_loss": -0.0296630859375, + "epoch": 0.03, + "grad_norm": 0.38978679743102745, + "k1_kl": 0.006927490234375, + "k3_kl": 0.0028076171875, + "kimi_kl": 0.004547119140625, + "learning_rate": 4.85e-07, "loss": 0.0001, - "ppl": 0.0850830078125, - "reward": 0.7416155636310577, - "reward_std": 0.05619787611067295, - "rewards/perpo_ocr_edit_distance_reward": 0.7416156232357025, + "ppl": 0.01324462890625, + "reward": 0.6974523067474365, + "reward_std": 0.034212853759527206, + "rewards/perpo_ocr_edit_distance_reward": 0.6974523067474365, "step": 150, "temperature": 0.9 }, { - "advantages": -2.622604597490863e-06, - "completion_length": 193.5, - "delta_ref_entropy_loss": 0.017822265625, - "delta_ref_ppl": -0.02728271484375, - "entropy_loss": -0.06829833984375, - "epoch": 0.0604, - "grad_norm": 2.8891039035917188, - "k1_kl": 0.02752685546875, - "k3_kl": 0.014921188354492188, - "kimi_kl": 0.02291107177734375, - "learning_rate": 4.698e-07, + "advantages": -1.3794218602924957e-06, + "completion_length": 694.0, + "delta_ref_entropy_loss": 0.01708984375, + "delta_ref_ppl": -0.021240234375, + "entropy_loss": -0.0869140625, + "epoch": 0.0302, + "grad_norm": 1.2688731058504306, + "k1_kl": 0.0213623046875, + "k3_kl": 0.01507568359375, + "kimi_kl": 0.0242919921875, + "learning_rate": 4.849e-07, "loss": 0.0006, - "ppl": 0.0244903564453125, - "reward": 0.9899159669876099, - "reward_std": 0.0015721191884949803, - "rewards/perpo_ocr_edit_distance_reward": 0.9899159669876099, + "ppl": 0.04931640625, + "reward": 0.9490707516670227, + "reward_std": 0.024570491164922714, + "rewards/perpo_ocr_edit_distance_reward": 0.9490708112716675, "step": 151, "temperature": 0.9 }, { - "advantages": -9.010519761432079e-05, - "completion_length": 440.0, - "delta_ref_entropy_loss": 0.017822265625, - "delta_ref_ppl": -0.016143798828125, - "entropy_loss": -0.03271484375, - "epoch": 0.0608, - "grad_norm": 1.480626901310117, - "k1_kl": 0.016143798828125, - "k3_kl": 0.00974273681640625, - "kimi_kl": 0.0210113525390625, - "learning_rate": 4.6959999999999997e-07, - "loss": 0.0005, - "ppl": 0.012603759765625, - "reward": 0.9878802001476288, - "reward_std": 0.0026052493922179565, - "rewards/perpo_ocr_edit_distance_reward": 0.9878803193569183, + "advantages": -7.322856845348724e-07, + "completion_length": 207.0, + "delta_ref_entropy_loss": 0.0380859375, + "delta_ref_ppl": -0.040283203125, + "entropy_loss": -0.1494140625, + "epoch": 0.0304, + "grad_norm": 2.9279121285493366, + "k1_kl": 0.0400390625, + "k3_kl": 0.0230712890625, + "kimi_kl": 0.044189453125, + "learning_rate": 4.848e-07, + "loss": 0.0009, + "ppl": 0.0810546875, + "reward": 0.6372258067131042, + "reward_std": 0.023454943671822548, + "rewards/perpo_ocr_edit_distance_reward": 0.637225866317749, "step": 152, "temperature": 0.9 }, { - "advantages": -0.00029905353278536495, - "completion_length": 450.0, - "delta_ref_entropy_loss": 0.0137939453125, - "delta_ref_ppl": -0.0126190185546875, - "entropy_loss": -0.0374755859375, - "epoch": 0.0612, - "grad_norm": 0.4937118039312391, - "k1_kl": 0.0126953125, - "k3_kl": 0.00838470458984375, - "kimi_kl": 0.026004791259765625, - "learning_rate": 4.6939999999999995e-07, - "loss": 0.0006, - "ppl": 0.0130462646484375, - "reward": 0.9624097943305969, - "reward_std": 0.020520400255918503, - "rewards/perpo_ocr_edit_distance_reward": 0.9624098241329193, + "advantages": -0.00015048470231704414, + "completion_length": 388.0, + "delta_ref_entropy_loss": 0.0159912109375, + "delta_ref_ppl": -0.016845703125, + "entropy_loss": -0.02294921875, + "epoch": 0.0306, + "grad_norm": 0.7063107465011101, + "k1_kl": 0.0167236328125, + "k3_kl": 0.00933837890625, + "kimi_kl": 0.0166015625, + "learning_rate": 4.847e-07, + "loss": 0.0005, + "ppl": 0.00994873046875, + "reward": 0.9830837249755859, + "reward_std": 0.0004658602410927415, + "rewards/perpo_ocr_edit_distance_reward": 0.9830839037895203, "step": 153, "temperature": 0.9 }, { - "advantages": -1.972913846515212e-05, - "completion_length": 741.5, - "delta_ref_entropy_loss": 0.0143585205078125, - "delta_ref_ppl": -0.0079193115234375, - "entropy_loss": -0.0372314453125, - "epoch": 0.0616, - "grad_norm": 0.6245887884397563, - "k1_kl": 0.00792694091796875, - "k3_kl": 0.0046539306640625, - "kimi_kl": 0.0069122314453125, - "learning_rate": 4.692e-07, - "loss": 0.0002, - "ppl": 0.015045166015625, - "reward": 0.9772668480873108, - "reward_std": 0.046982452971860766, - "rewards/perpo_ocr_edit_distance_reward": 0.9772669076919556, + "advantages": -2.384185791015625e-07, + "completion_length": 1366.0, + "delta_ref_entropy_loss": 0.0194091796875, + "delta_ref_ppl": -0.01336669921875, + "entropy_loss": -0.1376953125, + "epoch": 0.0308, + "grad_norm": 1.7120687210821492, + "k1_kl": 0.0133056640625, + "k3_kl": 0.00970458984375, + "kimi_kl": 0.0128173828125, + "learning_rate": 4.846e-07, + "loss": 0.0004, + "ppl": 0.080078125, + "reward": 0.8972614407539368, + "reward_std": 0.13980989158153534, + "rewards/perpo_ocr_edit_distance_reward": 0.8972615003585815, "step": 154, "temperature": 0.9 }, { - "advantages": -1.9993102341686608e-05, - "completion_length": 703.5, - "delta_ref_entropy_loss": 0.0177001953125, - "delta_ref_ppl": -0.009490966796875, - "entropy_loss": -0.0521240234375, - "epoch": 0.062, - "grad_norm": 0.8075544760157188, - "k1_kl": 0.00946044921875, - "k3_kl": 0.00405120849609375, - "kimi_kl": 0.0068817138671875, - "learning_rate": 4.689999999999999e-07, - "loss": 0.0002, - "ppl": 0.02276611328125, - "reward": 0.9495241940021515, - "reward_std": 0.008764009224250913, - "rewards/perpo_ocr_edit_distance_reward": 0.949524313211441, + "advantages": -1.7029899268550253e-08, + "completion_length": 411.0, + "delta_ref_entropy_loss": 0.0291748046875, + "delta_ref_ppl": -0.016357421875, + "entropy_loss": -0.03662109375, + "epoch": 0.031, + "grad_norm": 1.2046030295984524, + "k1_kl": 0.016357421875, + "k3_kl": 0.0157470703125, + "kimi_kl": 0.018310546875, + "learning_rate": 4.845e-07, + "loss": 0.0006, + "ppl": 0.0189208984375, + "reward": 0.8421637415885925, + "reward_std": 0.1629801243543625, + "rewards/perpo_ocr_edit_distance_reward": 0.8421638011932373, "step": 155, "temperature": 0.9 }, { - "advantages": -7.086140885803616e-05, - "completion_length": 647.5, - "delta_ref_entropy_loss": 0.013519287109375, - "delta_ref_ppl": -0.0084075927734375, - "entropy_loss": -0.02020263671875, - "epoch": 0.0624, - "grad_norm": 0.37752331722388943, - "k1_kl": 0.0084228515625, - "k3_kl": 0.003452301025390625, - "kimi_kl": 0.006328582763671875, - "learning_rate": 4.6879999999999996e-07, - "loss": 0.0002, - "ppl": 0.00714111328125, - "reward": 0.9937191903591156, - "reward_std": 0.002218145724327769, - "rewards/perpo_ocr_edit_distance_reward": 0.9937192797660828, + "advantages": 0.0, + "completion_length": 436.0, + "delta_ref_entropy_loss": 0.017333984375, + "delta_ref_ppl": -0.007080078125, + "entropy_loss": -0.0093994140625, + "epoch": 0.0312, + "grad_norm": 0.3732033869801257, + "k1_kl": 0.007110595703125, + "k3_kl": 0.002410888671875, + "kimi_kl": 0.003936767578125, + "learning_rate": 4.844e-07, + "loss": 0.0001, + "ppl": 0.0026702880859375, + "reward": 0.9818252325057983, + "reward_std": 0.00026303649065084755, + "rewards/perpo_ocr_edit_distance_reward": 0.9818251729011536, "step": 156, "temperature": 0.9 }, { - "advantages": -6.565025978488848e-05, - "completion_length": 481.5, - "delta_ref_entropy_loss": 0.0145263671875, - "delta_ref_ppl": -0.0143890380859375, - "entropy_loss": -0.0634765625, - "epoch": 0.0628, - "grad_norm": 1.0529075181147118, - "k1_kl": 0.0143280029296875, - "k3_kl": 0.0070343017578125, - "kimi_kl": 0.012638092041015625, - "learning_rate": 4.686e-07, - "loss": 0.0003, - "ppl": 0.028289794921875, - "reward": 0.9875943958759308, - "reward_std": 0.0014176062250044197, - "rewards/perpo_ocr_edit_distance_reward": 0.9875945150852203, + "advantages": -1.3879367770641693e-06, + "completion_length": 690.0, + "delta_ref_entropy_loss": 0.01129150390625, + "delta_ref_ppl": -0.00439453125, + "entropy_loss": -0.01708984375, + "epoch": 0.0314, + "grad_norm": 0.6387493497306862, + "k1_kl": 0.00439453125, + "k3_kl": 0.003936767578125, + "kimi_kl": 0.004180908203125, + "learning_rate": 4.843e-07, + "loss": 0.0002, + "ppl": 0.0079345703125, + "reward": 0.9511628150939941, + "reward_std": 0.0366918221116066, + "rewards/perpo_ocr_edit_distance_reward": 0.9511628746986389, "step": 157, "temperature": 0.9 }, { - "advantages": -6.060941296937017e-05, - "completion_length": 793.0, - "delta_ref_entropy_loss": 0.0157470703125, - "delta_ref_ppl": -0.0104827880859375, - "entropy_loss": -0.0394287109375, - "epoch": 0.0632, - "grad_norm": 1.0288057218051057, - "k1_kl": 0.010467529296875, - "k3_kl": 0.00472259521484375, - "kimi_kl": 0.010166168212890625, - "learning_rate": 4.684e-07, - "loss": 0.0002, - "ppl": 0.016082763671875, - "reward": 0.9858962893486023, - "reward_std": 0.035954557766672224, - "rewards/perpo_ocr_edit_distance_reward": 0.9858963489532471, + "advantages": -3.448554707574658e-06, + "completion_length": 520.0, + "delta_ref_entropy_loss": 0.0162353515625, + "delta_ref_ppl": -0.0108642578125, + "entropy_loss": -0.0439453125, + "epoch": 0.0316, + "grad_norm": 0.7397024277419445, + "k1_kl": 0.01092529296875, + "k3_kl": 0.00946044921875, + "kimi_kl": 0.0115966796875, + "learning_rate": 4.842e-07, + "loss": 0.0004, + "ppl": 0.0230712890625, + "reward": 0.9761788845062256, + "reward_std": 0.00481023546308279, + "rewards/perpo_ocr_edit_distance_reward": 0.9761789441108704, "step": 158, "temperature": 0.9 }, { - "advantages": -7.149577561449405e-05, - "completion_length": 562.5, - "delta_ref_entropy_loss": 0.01287841796875, - "delta_ref_ppl": -0.006988525390625, - "entropy_loss": -0.09600830078125, - "epoch": 0.0636, - "grad_norm": 1.0476104302131288, - "k1_kl": 0.006988525390625, - "k3_kl": 0.003566741943359375, - "kimi_kl": 0.004375457763671875, - "learning_rate": 4.6819999999999997e-07, - "loss": 0.0002, - "ppl": 0.052825927734375, - "reward": 0.9234556257724762, - "reward_std": 0.0037815552786923945, - "rewards/perpo_ocr_edit_distance_reward": 0.9234557151794434, + "advantages": -4.257474756741431e-06, + "completion_length": 448.0, + "delta_ref_entropy_loss": 0.0260009765625, + "delta_ref_ppl": -0.03173828125, + "entropy_loss": -0.15625, + "epoch": 0.0318, + "grad_norm": 1.8630614729936628, + "k1_kl": 0.031982421875, + "k3_kl": 0.016357421875, + "kimi_kl": 0.029052734375, + "learning_rate": 4.841e-07, + "loss": 0.0007, + "ppl": 0.083984375, + "reward": 0.949544370174408, + "reward_std": 0.009907571598887444, + "rewards/perpo_ocr_edit_distance_reward": 0.9495444297790527, "step": 159, "temperature": 0.9 }, { - "advantages": -1.238073639342474e-05, - "completion_length": 549.5, - "delta_ref_entropy_loss": 0.02313232421875, - "delta_ref_ppl": -0.015960693359375, - "entropy_loss": -0.11279296875, - "epoch": 0.064, - "grad_norm": 1.4407846998361087, - "k1_kl": 0.016021728515625, - "k3_kl": 0.007598876953125, - "kimi_kl": 0.0110321044921875, - "learning_rate": 4.68e-07, - "loss": 0.0003, - "ppl": 0.0523681640625, - "reward": 0.9094379842281342, - "reward_std": 0.03242158598732203, - "rewards/perpo_ocr_edit_distance_reward": 0.9094380736351013, + "advantages": -1.021793991640152e-07, + "completion_length": 461.0, + "delta_ref_entropy_loss": 0.0157470703125, + "delta_ref_ppl": -0.0247802734375, + "entropy_loss": -0.1796875, + "epoch": 0.032, + "grad_norm": 3.5304575671348934, + "k1_kl": 0.024658203125, + "k3_kl": 0.022705078125, + "kimi_kl": 0.0294189453125, + "learning_rate": 4.839999999999999e-07, + "loss": 0.0009, + "ppl": 0.0888671875, + "reward": 0.6669535040855408, + "reward_std": 0.23938331007957458, + "rewards/perpo_ocr_edit_distance_reward": 0.6669535040855408, "step": 160, "temperature": 0.9 }, { - "advantages": -4.804560376214795e-06, - "completion_length": 713.5, - "delta_ref_entropy_loss": 0.010894775390625, - "delta_ref_ppl": -0.0043792724609375, - "entropy_loss": -0.03436279296875, - "epoch": 0.0644, - "grad_norm": 0.5523960756485353, - "k1_kl": 0.0044097900390625, - "k3_kl": 0.0022735595703125, - "kimi_kl": 0.002655029296875, - "learning_rate": 4.678e-07, + "advantages": -5.730560951633379e-05, + "completion_length": 793.0, + "delta_ref_entropy_loss": 0.00750732421875, + "delta_ref_ppl": -0.0038604736328125, + "entropy_loss": -0.0157470703125, + "epoch": 0.0322, + "grad_norm": 1.3939785762018684, + "k1_kl": 0.003875732421875, + "k3_kl": 0.001800537109375, + "kimi_kl": 0.003387451171875, + "learning_rate": 4.838999999999999e-07, "loss": 0.0001, - "ppl": 0.016021728515625, - "reward": 0.910999208688736, - "reward_std": 0.003914576955139637, - "rewards/perpo_ocr_edit_distance_reward": 0.9109992384910583, + "ppl": 0.0084228515625, + "reward": 0.9858599901199341, + "reward_std": 0.00034563394729048014, + "rewards/perpo_ocr_edit_distance_reward": 0.9858600497245789, "step": 161, "temperature": 0.9 }, { - "advantages": -2.1236284737824462e-05, - "completion_length": 227.5, - "delta_ref_entropy_loss": 0.0179443359375, - "delta_ref_ppl": -0.01348876953125, - "entropy_loss": -0.02880859375, - "epoch": 0.0648, - "grad_norm": 2.9021969995773156, - "k1_kl": 0.0135040283203125, - "k3_kl": 0.006866455078125, - "kimi_kl": 0.010589599609375, - "learning_rate": 4.676e-07, - "loss": 0.0003, - "ppl": 0.0125732421875, - "reward": 0.9780256748199463, - "reward_std": 0.002907509682700038, - "rewards/perpo_ocr_edit_distance_reward": 0.9780257940292358, + "advantages": -5.1753864681813866e-05, + "completion_length": 398.0, + "delta_ref_entropy_loss": 0.01904296875, + "delta_ref_ppl": -0.006988525390625, + "entropy_loss": -0.052490234375, + "epoch": 0.0324, + "grad_norm": 1.3286724816322324, + "k1_kl": 0.00701904296875, + "k3_kl": 0.0037689208984375, + "kimi_kl": 0.005218505859375, + "learning_rate": 4.838e-07, + "loss": 0.0002, + "ppl": 0.0269775390625, + "reward": 0.979089081287384, + "reward_std": 0.0007227185997180641, + "rewards/perpo_ocr_edit_distance_reward": 0.979089081287384, "step": 162, "temperature": 0.9 }, { - "advantages": -1.2942723515152466e-06, - "completion_length": 325.0, - "delta_ref_entropy_loss": 0.0313720703125, - "delta_ref_ppl": -0.0262451171875, - "entropy_loss": -0.107177734375, - "epoch": 0.0652, - "grad_norm": 0.8762578971908733, - "k1_kl": 0.0262451171875, - "k3_kl": 0.013427734375, - "kimi_kl": 0.02459716796875, - "learning_rate": 4.6739999999999996e-07, - "loss": 0.0005, - "ppl": 0.0490570068359375, - "reward": 0.9539096653461456, - "reward_std": 0.013178729452192783, - "rewards/perpo_ocr_edit_distance_reward": 0.953909695148468, + "advantages": 6.956714059924707e-06, + "completion_length": 909.0, + "delta_ref_entropy_loss": 0.020751953125, + "delta_ref_ppl": -0.01556396484375, + "entropy_loss": -0.05126953125, + "epoch": 0.0326, + "grad_norm": 1.4118181756080965, + "k1_kl": 0.01556396484375, + "k3_kl": 0.007476806640625, + "kimi_kl": 0.0142822265625, + "learning_rate": 4.837e-07, + "loss": 0.0003, + "ppl": 0.022705078125, + "reward": 0.9736489653587341, + "reward_std": 0.004805146250873804, + "rewards/perpo_ocr_edit_distance_reward": 0.9736489057540894, "step": 163, "temperature": 0.9 }, { - "advantages": -5.257981320028193e-05, - "completion_length": 832.0, - "delta_ref_entropy_loss": 0.011627197265625, - "delta_ref_ppl": -0.0082550048828125, - "entropy_loss": -0.0306396484375, - "epoch": 0.0656, - "grad_norm": 0.35628486823286565, - "k1_kl": 0.0082244873046875, - "k3_kl": 0.00424957275390625, - "kimi_kl": 0.0090484619140625, - "learning_rate": 4.672e-07, + "advantages": -2.09893505598302e-06, + "completion_length": 981.0, + "delta_ref_entropy_loss": 0.024169921875, + "delta_ref_ppl": -0.013671875, + "entropy_loss": -0.047119140625, + "epoch": 0.0328, + "grad_norm": 0.8123770904890595, + "k1_kl": 0.013671875, + "k3_kl": 0.005584716796875, + "kimi_kl": 0.0107421875, + "learning_rate": 4.835999999999999e-07, "loss": 0.0002, - "ppl": 0.0154571533203125, - "reward": 0.9947624802589417, - "reward_std": 0.000912553514353931, - "rewards/perpo_ocr_edit_distance_reward": 0.9947625994682312, + "ppl": 0.01708984375, + "reward": 0.9565356373786926, + "reward_std": 0.007953103631734848, + "rewards/perpo_ocr_edit_distance_reward": 0.9565356373786926, "step": 164, "temperature": 0.9 }, { - "advantages": -4.8826849706529174e-05, - "completion_length": 427.5, - "delta_ref_entropy_loss": 0.017333984375, - "delta_ref_ppl": -0.0138702392578125, - "entropy_loss": -0.035888671875, - "epoch": 0.066, - "grad_norm": 0.7825483446151406, - "k1_kl": 0.01387786865234375, - "k3_kl": 0.005542755126953125, - "kimi_kl": 0.009668350219726562, - "learning_rate": 4.67e-07, - "loss": 0.0003, - "ppl": 0.018768310546875, - "reward": 0.996010959148407, - "reward_std": 0.003559656222932972, - "rewards/perpo_ocr_edit_distance_reward": 0.9960109889507294, + "advantages": -3.2356808787881164e-07, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.01177978515625, + "delta_ref_ppl": -0.0111083984375, + "entropy_loss": -0.0208740234375, + "epoch": 0.033, + "grad_norm": 0.3305575264374262, + "k1_kl": 0.0111083984375, + "k3_kl": 0.0059814453125, + "kimi_kl": 0.01141357421875, + "learning_rate": 4.835e-07, + "loss": 0.0002, + "ppl": 0.0081787109375, + "reward": 0.8963108062744141, + "reward_std": 0.17501959204673767, + "rewards/perpo_ocr_edit_distance_reward": 0.8963108658790588, "step": 165, "temperature": 0.9 }, { - "advantages": -5.10896995820076e-07, - "completion_length": 1473.5, - "delta_ref_entropy_loss": 0.01385498046875, - "delta_ref_ppl": -0.0059356689453125, - "entropy_loss": -0.089599609375, - "epoch": 0.0664, - "grad_norm": 1.034627100137372, - "k1_kl": 0.0059356689453125, - "k3_kl": 0.005199432373046875, - "kimi_kl": 0.005336761474609375, - "learning_rate": 4.6679999999999997e-07, - "loss": 0.0002, - "ppl": 0.049102783203125, - "reward": 0.9760335683822632, - "reward_std": 0.025140190940874163, - "rewards/perpo_ocr_edit_distance_reward": 0.976033627986908, + "advantages": -7.770743104629219e-05, + "completion_length": 706.0, + "delta_ref_entropy_loss": 0.017333984375, + "delta_ref_ppl": -0.0068359375, + "entropy_loss": -0.0279541015625, + "epoch": 0.0332, + "grad_norm": 8.95087549492707, + "k1_kl": 0.006866455078125, + "k3_kl": 0.00469970703125, + "kimi_kl": 0.006683349609375, + "learning_rate": 4.834e-07, + "loss": 0.0003, + "ppl": 0.0120849609375, + "reward": 0.917645275592804, + "reward_std": 0.0006669690483249724, + "rewards/perpo_ocr_edit_distance_reward": 0.9176453351974487, "step": 166, "temperature": 0.9 }, { - "advantages": -1.2478658845793689e-05, - "completion_length": 405.0, - "delta_ref_entropy_loss": 0.023284912109375, - "delta_ref_ppl": -0.0160980224609375, - "entropy_loss": -0.13946533203125, - "epoch": 0.0668, - "grad_norm": 1.2155895009227848, - "k1_kl": 0.01617431640625, - "k3_kl": 0.008632659912109375, - "kimi_kl": 0.024200439453125, - "learning_rate": 4.666e-07, - "loss": 0.0004, - "ppl": 0.069854736328125, - "reward": 0.9464809000492096, - "reward_std": 0.006695618387311697, - "rewards/perpo_ocr_edit_distance_reward": 0.9464809894561768, + "advantages": -2.4012158519326476e-06, + "completion_length": 627.0, + "delta_ref_entropy_loss": 0.031494140625, + "delta_ref_ppl": -0.017333984375, + "entropy_loss": -0.0849609375, + "epoch": 0.0334, + "grad_norm": 1.6099417144050132, + "k1_kl": 0.0172119140625, + "k3_kl": 0.012939453125, + "kimi_kl": 0.0181884765625, + "learning_rate": 4.833e-07, + "loss": 0.0005, + "ppl": 0.041748046875, + "reward": 0.8016795516014099, + "reward_std": 0.052907075732946396, + "rewards/perpo_ocr_edit_distance_reward": 0.8016796708106995, "step": 167, "temperature": 0.9 }, { - "advantages": -2.1759954117683833e-05, - "completion_length": 498.5, - "delta_ref_entropy_loss": 0.048919677734375, - "delta_ref_ppl": -0.0186004638671875, - "entropy_loss": -0.0589599609375, - "epoch": 0.0672, - "grad_norm": 16.631535983021156, - "k1_kl": 0.01837158203125, - "k3_kl": 0.0245208740234375, - "kimi_kl": 0.0365753173828125, - "learning_rate": 4.6639999999999994e-07, - "loss": 0.001, - "ppl": 0.04156494140625, - "reward": 0.9219225645065308, - "reward_std": 0.18055324134184048, - "rewards/perpo_ocr_edit_distance_reward": 0.9219225645065308, + "advantages": -1.004764044409967e-06, + "completion_length": 626.0, + "delta_ref_entropy_loss": 0.044189453125, + "delta_ref_ppl": -0.0233154296875, + "entropy_loss": -0.328125, + "epoch": 0.0336, + "grad_norm": 2.5537583504500625, + "k1_kl": 0.0238037109375, + "k3_kl": 0.01043701171875, + "kimi_kl": 0.01416015625, + "learning_rate": 4.832e-07, + "loss": 0.0004, + "ppl": 0.1748046875, + "reward": 0.7903031706809998, + "reward_std": 0.06940200924873352, + "rewards/perpo_ocr_edit_distance_reward": 0.7903032898902893, "step": 168, "temperature": 0.9 }, { - "advantages": -1.4475413934178505e-07, - "completion_length": 474.5, - "delta_ref_entropy_loss": 0.01800537109375, - "delta_ref_ppl": -0.03802490234375, - "entropy_loss": -0.166748046875, - "epoch": 0.0676, - "grad_norm": 2.6817039233351303, - "k1_kl": 0.0380859375, - "k3_kl": 0.02691650390625, - "kimi_kl": 0.0902099609375, - "learning_rate": 4.662e-07, - "loss": 0.0011, - "ppl": 0.0848388671875, - "reward": 0.7679829895496368, - "reward_std": 0.2518530860543251, - "rewards/perpo_ocr_edit_distance_reward": 0.7679830193519592, + "advantages": -2.384185791015625e-07, + "completion_length": 1422.0, + "delta_ref_entropy_loss": 0.015869140625, + "delta_ref_ppl": -0.0164794921875, + "entropy_loss": -0.1416015625, + "epoch": 0.0338, + "grad_norm": 1.8531753765437369, + "k1_kl": 0.0164794921875, + "k3_kl": 0.0101318359375, + "kimi_kl": 0.023681640625, + "learning_rate": 4.831e-07, + "loss": 0.0004, + "ppl": 0.07568359375, + "reward": 0.5231530666351318, + "reward_std": 0.1744818091392517, + "rewards/perpo_ocr_edit_distance_reward": 0.5231531262397766, "step": 169, "temperature": 0.9 }, { - "advantages": -7.76116362430912e-05, - "completion_length": 486.5, - "delta_ref_entropy_loss": 0.019775390625, - "delta_ref_ppl": -0.01397705078125, - "entropy_loss": -0.0694580078125, - "epoch": 0.068, - "grad_norm": 0.7488763354392323, - "k1_kl": 0.013946533203125, - "k3_kl": 0.005889892578125, - "kimi_kl": 0.008697509765625, - "learning_rate": 4.66e-07, - "loss": 0.0003, - "ppl": 0.0335693359375, - "reward": 0.9732770919799805, - "reward_std": 0.0019430431420914829, - "rewards/perpo_ocr_edit_distance_reward": 0.9732771813869476, + "advantages": -2.384185791015625e-06, + "completion_length": 173.0, + "delta_ref_entropy_loss": 0.00762939453125, + "delta_ref_ppl": -0.0218505859375, + "entropy_loss": -0.048828125, + "epoch": 0.034, + "grad_norm": 2.557801707734872, + "k1_kl": 0.0218505859375, + "k3_kl": 0.014404296875, + "kimi_kl": 0.03564453125, + "learning_rate": 4.83e-07, + "loss": 0.0006, + "ppl": 0.0196533203125, + "reward": 0.9619178771972656, + "reward_std": 0.031936224550008774, + "rewards/perpo_ocr_edit_distance_reward": 0.9619179964065552, "step": 170, "temperature": 0.9 }, { - "advantages": -0.0001316496382059995, - "completion_length": 828.5, - "delta_ref_entropy_loss": 0.0085906982421875, - "delta_ref_ppl": -0.002567291259765625, - "entropy_loss": -0.0277099609375, - "epoch": 0.0684, - "grad_norm": 0.22243004012951584, - "k1_kl": 0.0025634765625, - "k3_kl": 0.000789642333984375, - "kimi_kl": 0.0011806488037109375, - "learning_rate": 4.6579999999999995e-07, - "loss": 0.0002, - "ppl": 0.0094757080078125, - "reward": 0.9986301064491272, - "reward_std": 0.00039482010470237583, - "rewards/perpo_ocr_edit_distance_reward": 0.998630166053772, + "advantages": -0.0003141675842925906, + "completion_length": 499.0, + "delta_ref_entropy_loss": 0.01239013671875, + "delta_ref_ppl": -0.0079345703125, + "entropy_loss": -0.028564453125, + "epoch": 0.0342, + "grad_norm": 0.8746336560501426, + "k1_kl": 0.0079345703125, + "k3_kl": 0.005462646484375, + "kimi_kl": 0.00823974609375, + "learning_rate": 4.829e-07, + "loss": 0.0005, + "ppl": 0.01385498046875, + "reward": 0.9731163382530212, + "reward_std": 0.0002794429019559175, + "rewards/perpo_ocr_edit_distance_reward": 0.973116397857666, "step": 171, "temperature": 0.9 }, { - "advantages": 9.034361710291705e-06, - "completion_length": 504.0, - "delta_ref_entropy_loss": 0.02197265625, - "delta_ref_ppl": -0.02032470703125, - "entropy_loss": -0.0662841796875, - "epoch": 0.0688, - "grad_norm": 1.606625227078067, - "k1_kl": 0.020263671875, - "k3_kl": 0.00933837890625, - "kimi_kl": 0.015228271484375, - "learning_rate": 4.656e-07, - "loss": 0.0004, - "ppl": 0.02947998046875, - "reward": 0.9832078814506531, - "reward_std": 0.0040947102243080735, - "rewards/perpo_ocr_edit_distance_reward": 0.9832078814506531, + "advantages": -1.611028528714087e-05, + "completion_length": 382.0, + "delta_ref_entropy_loss": 0.031982421875, + "delta_ref_ppl": -0.01953125, + "entropy_loss": -0.039794921875, + "epoch": 0.0344, + "grad_norm": 1.0478979659445449, + "k1_kl": 0.0196533203125, + "k3_kl": 0.010986328125, + "kimi_kl": 0.015625, + "learning_rate": 4.828e-07, + "loss": 0.0005, + "ppl": 0.0198974609375, + "reward": 0.9789817929267883, + "reward_std": 0.002014354569837451, + "rewards/perpo_ocr_edit_distance_reward": 0.9789817929267883, "step": 172, "temperature": 0.9 }, { - "advantages": -2.2726401425643417e-05, - "completion_length": 522.5, - "delta_ref_entropy_loss": 0.02960205078125, - "delta_ref_ppl": -0.01788330078125, - "entropy_loss": -0.07763671875, - "epoch": 0.0692, - "grad_norm": 0.643765226156834, - "k1_kl": 0.01788330078125, - "k3_kl": 0.00958251953125, - "kimi_kl": 0.018280029296875, - "learning_rate": 4.6539999999999997e-07, - "loss": 0.0004, - "ppl": 0.03515625, - "reward": 0.8342057764530182, - "reward_std": 0.017176850320538506, - "rewards/perpo_ocr_edit_distance_reward": 0.834205836057663, + "advantages": -4.104205800103955e-06, + "completion_length": 285.0, + "delta_ref_entropy_loss": 0.035888671875, + "delta_ref_ppl": -0.035400390625, + "entropy_loss": -0.1005859375, + "epoch": 0.0346, + "grad_norm": 2.1022191972585835, + "k1_kl": 0.035400390625, + "k3_kl": 0.0242919921875, + "kimi_kl": 0.0546875, + "learning_rate": 4.827e-07, + "loss": 0.001, + "ppl": 0.05224609375, + "reward": 0.9109518527984619, + "reward_std": 0.006141253747045994, + "rewards/perpo_ocr_edit_distance_reward": 0.9109519124031067, "step": 173, "temperature": 0.9 }, { - "advantages": -7.856744014134165e-05, - "completion_length": 432.0, - "delta_ref_entropy_loss": 0.017913818359375, - "delta_ref_ppl": -0.009674072265625, - "entropy_loss": -0.02508544921875, - "epoch": 0.0696, - "grad_norm": 0.3868902047066776, - "k1_kl": 0.00970458984375, - "k3_kl": 0.004852294921875, - "kimi_kl": 0.007415771484375, - "learning_rate": 4.6519999999999996e-07, - "loss": 0.0003, - "ppl": 0.0090179443359375, - "reward": 0.997760683298111, - "reward_std": 0.0006551960832439363, - "rewards/perpo_ocr_edit_distance_reward": 0.9977607429027557, + "advantages": -8.20841160020791e-06, + "completion_length": 408.0, + "delta_ref_entropy_loss": 0.0250244140625, + "delta_ref_ppl": -0.02001953125, + "entropy_loss": -0.045166015625, + "epoch": 0.0348, + "grad_norm": 1.0165751982968028, + "k1_kl": 0.0201416015625, + "k3_kl": 0.0113525390625, + "kimi_kl": 0.02587890625, + "learning_rate": 4.825999999999999e-07, + "loss": 0.0005, + "ppl": 0.0198974609375, + "reward": 0.9791194200515747, + "reward_std": 0.0019786201883107424, + "rewards/perpo_ocr_edit_distance_reward": 0.9791194200515747, "step": 174, "temperature": 0.9 }, { - "advantages": -4.655548582377378e-05, - "completion_length": 639.0, - "delta_ref_entropy_loss": 0.0135498046875, - "delta_ref_ppl": -0.0069427490234375, - "entropy_loss": -0.02935791015625, - "epoch": 0.07, - "grad_norm": 0.5708798065010661, - "k1_kl": 0.0069732666015625, - "k3_kl": 0.00266265869140625, - "kimi_kl": 0.00460052490234375, - "learning_rate": 4.65e-07, - "loss": 0.0002, - "ppl": 0.0113372802734375, - "reward": 0.9981619715690613, - "reward_std": 0.0006441561781684868, - "rewards/perpo_ocr_edit_distance_reward": 0.9981620013713837, + "advantages": -1.1597361663007177e-05, + "completion_length": 1258.0, + "delta_ref_entropy_loss": 0.01104736328125, + "delta_ref_ppl": -0.005767822265625, + "entropy_loss": -0.064453125, + "epoch": 0.035, + "grad_norm": 0.886999601364085, + "k1_kl": 0.005828857421875, + "k3_kl": 0.003021240234375, + "kimi_kl": 0.00433349609375, + "learning_rate": 4.824999999999999e-07, + "loss": 0.0001, + "ppl": 0.0308837890625, + "reward": 0.978661060333252, + "reward_std": 0.002832663245499134, + "rewards/perpo_ocr_edit_distance_reward": 0.978661060333252, "step": 175, "temperature": 0.9 }, { - "advantages": -4.190206735632529e-05, - "completion_length": 517.0, - "delta_ref_entropy_loss": 0.03338623046875, - "delta_ref_ppl": -0.034027099609375, - "entropy_loss": -0.100341796875, - "epoch": 0.0704, - "grad_norm": 1.81106112117433, - "k1_kl": 0.034149169921875, - "k3_kl": 0.020751953125, - "kimi_kl": 0.049896240234375, - "learning_rate": 4.648e-07, - "loss": 0.0009, - "ppl": 0.04547119140625, - "reward": 0.9087511599063873, - "reward_std": 0.017459047463489696, - "rewards/perpo_ocr_edit_distance_reward": 0.9087512195110321, + "advantages": 1.7029899268550253e-08, + "completion_length": 854.0, + "delta_ref_entropy_loss": 0.01336669921875, + "delta_ref_ppl": -0.006805419921875, + "entropy_loss": -0.01397705078125, + "epoch": 0.0352, + "grad_norm": 0.19645825209548243, + "k1_kl": 0.0068359375, + "k3_kl": 0.002197265625, + "kimi_kl": 0.003143310546875, + "learning_rate": 4.823999999999999e-07, + "loss": 0.0001, + "ppl": 0.0050048828125, + "reward": 0.9822638034820557, + "reward_std": 0.0001829309039749205, + "rewards/perpo_ocr_edit_distance_reward": 0.9822638630867004, "step": 176, "temperature": 0.9 }, { - "advantages": -1.532690930616809e-06, - "completion_length": 425.0, - "delta_ref_entropy_loss": 0.03826904296875, - "delta_ref_ppl": -0.0257568359375, - "entropy_loss": -0.138427734375, - "epoch": 0.0708, - "grad_norm": 1.4472894594910624, - "k1_kl": 0.02587890625, - "k3_kl": 0.0125732421875, - "kimi_kl": 0.023956298828125, - "learning_rate": 4.646e-07, - "loss": 0.0005, - "ppl": 0.064697265625, - "reward": 0.9031889140605927, - "reward_std": 0.019757644273340702, - "rewards/perpo_ocr_edit_distance_reward": 0.9031889736652374, + "advantages": -1.532690987460228e-07, + "completion_length": 66.0, + "delta_ref_entropy_loss": 0.0184326171875, + "delta_ref_ppl": -0.1494140625, + "entropy_loss": -0.353515625, + "epoch": 0.0354, + "grad_norm": 8.795073777709804, + "k1_kl": 0.1494140625, + "k3_kl": 0.115234375, + "kimi_kl": 0.439453125, + "learning_rate": 4.823e-07, + "loss": 0.0046, + "ppl": 0.1689453125, + "reward": 0.5751972198486328, + "reward_std": 0.09703845530748367, + "rewards/perpo_ocr_edit_distance_reward": 0.5751972198486328, "step": 177, "temperature": 0.9 }, { - "advantages": -2.120222369228486e-06, - "completion_length": 239.5, - "delta_ref_entropy_loss": 0.0125274658203125, - "delta_ref_ppl": -0.1329345703125, - "entropy_loss": -0.1962890625, - "epoch": 0.0712, - "grad_norm": 4.439293650874077, - "k1_kl": 0.1329345703125, - "k3_kl": 0.08734130859375, - "kimi_kl": 0.22509765625, - "learning_rate": 4.6439999999999995e-07, - "loss": 0.0035, - "ppl": 0.07080078125, - "reward": 0.553058534860611, - "reward_std": 0.06703525595366955, - "rewards/perpo_ocr_edit_distance_reward": 0.5530585795640945, + "advantages": -0.0005960464477539062, + "completion_length": 87.0, + "delta_ref_entropy_loss": 0.004638671875, + "delta_ref_ppl": -0.040771484375, + "entropy_loss": -0.041748046875, + "epoch": 0.0356, + "grad_norm": 0.044162065802231164, + "k1_kl": 0.040771484375, + "k3_kl": 0.027099609375, + "kimi_kl": 0.0634765625, + "learning_rate": 4.822e-07, + "loss": 0.0017, + "ppl": 0.005828857421875, + "reward": 0.9814814329147339, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9814815521240234, "step": 178, "temperature": 0.9 }, { - "advantages": -2.5032887833731365e-05, - "completion_length": 381.0, - "delta_ref_entropy_loss": 0.0501708984375, - "delta_ref_ppl": -0.03240966796875, - "entropy_loss": -0.1455078125, - "epoch": 0.0716, - "grad_norm": 1.2009683427386548, - "k1_kl": 0.0323486328125, - "k3_kl": 0.0148162841796875, - "kimi_kl": 0.025421142578125, - "learning_rate": 4.642e-07, - "loss": 0.0006, - "ppl": 0.06396484375, - "reward": 0.9184396862983704, - "reward_std": 0.0063326198142021894, - "rewards/perpo_ocr_edit_distance_reward": 0.9184397757053375, + "advantages": 1.7029899268550253e-08, + "completion_length": 542.0, + "delta_ref_entropy_loss": 0.01025390625, + "delta_ref_ppl": -0.007049560546875, + "entropy_loss": -0.0186767578125, + "epoch": 0.0358, + "grad_norm": 1.3103711403604457, + "k1_kl": 0.007049560546875, + "k3_kl": 0.0033111572265625, + "kimi_kl": 0.004638671875, + "learning_rate": 4.820999999999999e-07, + "loss": 0.0001, + "ppl": 0.00933837890625, + "reward": 0.9825423955917358, + "reward_std": 0.004074324853718281, + "rewards/perpo_ocr_edit_distance_reward": 0.9825423359870911, "step": 179, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 710.5, - "delta_ref_entropy_loss": 0.012725830078125, - "delta_ref_ppl": -0.006591796875, - "entropy_loss": -0.0096282958984375, - "epoch": 0.072, - "grad_norm": 0.01306779211588022, - "k1_kl": 0.006591796875, - "k3_kl": 0.00321197509765625, - "kimi_kl": 0.005859375, - "learning_rate": 4.64e-07, - "loss": 0.0001, - "ppl": 0.00243377685546875, - "reward": 0.9988544881343842, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9988545179367065, + "advantages": -3.405979782655777e-07, + "completion_length": 181.0, + "delta_ref_entropy_loss": 0.03076171875, + "delta_ref_ppl": -0.0169677734375, + "entropy_loss": -0.07421875, + "epoch": 0.036, + "grad_norm": 2.32039304523778, + "k1_kl": 0.0169677734375, + "k3_kl": 0.006744384765625, + "kimi_kl": 0.01116943359375, + "learning_rate": 4.82e-07, + "loss": 0.0003, + "ppl": 0.0291748046875, + "reward": 0.9371300339698792, + "reward_std": 0.1199227049946785, + "rewards/perpo_ocr_edit_distance_reward": 0.9371300935745239, "step": 180, "temperature": 0.9 }, { - "advantages": -1.4245511138710754e-05, - "completion_length": 545.5, - "delta_ref_entropy_loss": 0.02056884765625, - "delta_ref_ppl": -0.014007568359375, - "entropy_loss": -0.07318115234375, - "epoch": 0.0724, - "grad_norm": 0.635875599520095, - "k1_kl": 0.014068603515625, - "k3_kl": 0.00823974609375, - "kimi_kl": 0.0127105712890625, - "learning_rate": 4.6379999999999996e-07, - "loss": 0.0003, - "ppl": 0.041900634765625, - "reward": 0.8605246841907501, - "reward_std": 0.0742693436332047, - "rewards/perpo_ocr_edit_distance_reward": 0.8605246841907501, + "advantages": -0.00012268339924048632, + "completion_length": 452.0, + "delta_ref_entropy_loss": 0.015869140625, + "delta_ref_ppl": -0.01434326171875, + "entropy_loss": -0.0269775390625, + "epoch": 0.0362, + "grad_norm": 0.6889430676971464, + "k1_kl": 0.01434326171875, + "k3_kl": 0.00665283203125, + "kimi_kl": 0.0103759765625, + "learning_rate": 4.819e-07, + "loss": 0.0004, + "ppl": 0.01068115234375, + "reward": 0.955590546131134, + "reward_std": 0.0003164127410855144, + "rewards/perpo_ocr_edit_distance_reward": 0.955590546131134, "step": 181, "temperature": 0.9 }, { - "advantages": -1.0183879827430076e-05, - "completion_length": 1020.5, - "delta_ref_entropy_loss": 0.0196533203125, - "delta_ref_ppl": -0.007476806640625, - "entropy_loss": -0.04168701171875, - "epoch": 0.0728, - "grad_norm": 0.8056507336264632, - "k1_kl": 0.00750732421875, - "k3_kl": 0.00214385986328125, - "kimi_kl": 0.0029754638671875, - "learning_rate": 4.636e-07, - "loss": 0.0001, - "ppl": 0.0175323486328125, - "reward": 0.9810559749603271, - "reward_std": 0.03514953621197492, - "rewards/perpo_ocr_edit_distance_reward": 0.9810559749603271, + "advantages": -9.518010483589023e-05, + "completion_length": 631.0, + "delta_ref_entropy_loss": 0.0120849609375, + "delta_ref_ppl": -0.00872802734375, + "entropy_loss": -0.0235595703125, + "epoch": 0.0364, + "grad_norm": 0.5296791486773755, + "k1_kl": 0.00872802734375, + "k3_kl": 0.0054931640625, + "kimi_kl": 0.0120849609375, + "learning_rate": 4.818e-07, + "loss": 0.0003, + "ppl": 0.00872802734375, + "reward": 0.9805232286453247, + "reward_std": 0.0004366862413007766, + "rewards/perpo_ocr_edit_distance_reward": 0.9805232882499695, "step": 182, "temperature": 0.9 }, { - "advantages": -2.7472420697449706e-05, - "completion_length": 530.0, - "delta_ref_entropy_loss": 0.016937255859375, - "delta_ref_ppl": -0.01206207275390625, - "entropy_loss": -0.044921875, - "epoch": 0.0732, - "grad_norm": 0.772032117910025, - "k1_kl": 0.01207733154296875, - "k3_kl": 0.0061359405517578125, - "kimi_kl": 0.00836944580078125, - "learning_rate": 4.634e-07, - "loss": 0.0003, - "ppl": 0.0208282470703125, - "reward": 0.9922747313976288, - "reward_std": 0.002408603031653911, - "rewards/perpo_ocr_edit_distance_reward": 0.9922747313976288, + "advantages": -9.952273103408515e-05, + "completion_length": 223.0, + "delta_ref_entropy_loss": 0.004302978515625, + "delta_ref_ppl": -0.0203857421875, + "entropy_loss": -0.025390625, + "epoch": 0.0366, + "grad_norm": 1.0876084398456716, + "k1_kl": 0.020263671875, + "k3_kl": 0.01251220703125, + "kimi_kl": 0.025634765625, + "learning_rate": 4.817e-07, + "loss": 0.0006, + "ppl": 0.0093994140625, + "reward": 0.9749240279197693, + "reward_std": 0.0006701328093186021, + "rewards/perpo_ocr_edit_distance_reward": 0.9749240875244141, "step": 183, "temperature": 0.9 }, { - "advantages": 2.060617816823651e-06, - "completion_length": 510.0, - "delta_ref_entropy_loss": 0.02264404296875, - "delta_ref_ppl": -0.013031005859375, - "entropy_loss": -0.0517578125, - "epoch": 0.0736, - "grad_norm": 0.7214299163002048, - "k1_kl": 0.0129852294921875, - "k3_kl": 0.005950927734375, - "kimi_kl": 0.008819580078125, - "learning_rate": 4.6319999999999997e-07, - "loss": 0.0002, - "ppl": 0.02508544921875, - "reward": 0.8940887451171875, - "reward_std": 0.051461177063174546, - "rewards/perpo_ocr_edit_distance_reward": 0.8940887749195099, + "advantages": -2.023152046604082e-05, + "completion_length": 805.0, + "delta_ref_entropy_loss": 0.0303955078125, + "delta_ref_ppl": -0.0152587890625, + "entropy_loss": -0.1328125, + "epoch": 0.0368, + "grad_norm": 1.426636011282718, + "k1_kl": 0.0152587890625, + "k3_kl": 0.0064697265625, + "kimi_kl": 0.00836181640625, + "learning_rate": 4.816e-07, + "loss": 0.0003, + "ppl": 0.07275390625, + "reward": 0.7959010601043701, + "reward_std": 0.004108173307031393, + "rewards/perpo_ocr_edit_distance_reward": 0.7959011793136597, "step": 184, "temperature": 0.9 }, { - "advantages": -7.495284444303252e-05, - "completion_length": 578.5, - "delta_ref_entropy_loss": 0.02130126953125, - "delta_ref_ppl": -0.00677490234375, - "entropy_loss": -0.027587890625, - "epoch": 0.074, - "grad_norm": 0.4530558733794676, - "k1_kl": 0.006805419921875, - "k3_kl": 0.0018463134765625, - "kimi_kl": 0.00237274169921875, - "learning_rate": 4.63e-07, - "loss": 0.0001, - "ppl": 0.0093994140625, - "reward": 0.9986948072910309, - "reward_std": 0.0008351593423867598, - "rewards/perpo_ocr_edit_distance_reward": 0.9986948668956757, + "advantages": -0.0005960464477539062, + "completion_length": 312.0, + "delta_ref_entropy_loss": 0.0162353515625, + "delta_ref_ppl": -0.017578125, + "entropy_loss": -0.016845703125, + "epoch": 0.037, + "grad_norm": 0.06238695229722707, + "k1_kl": 0.0174560546875, + "k3_kl": 0.0126953125, + "kimi_kl": 0.0247802734375, + "learning_rate": 4.815e-07, + "loss": 0.0011, + "ppl": 0.00927734375, + "reward": 0.9811320304870605, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9811321496963501, "step": 185, "temperature": 0.9 }, { - "advantages": 2.299036623298889e-07, - "completion_length": 624.5, - "delta_ref_entropy_loss": 0.015472412109375, - "delta_ref_ppl": -0.01934814453125, - "entropy_loss": -0.150390625, - "epoch": 0.0744, - "grad_norm": 1.1008890507040092, - "k1_kl": 0.019287109375, - "k3_kl": 0.01043701171875, - "kimi_kl": 0.01666259765625, - "learning_rate": 4.628e-07, - "loss": 0.0004, - "ppl": 0.076171875, - "reward": 0.7479699552059174, - "reward_std": 0.03760120691731572, - "rewards/perpo_ocr_edit_distance_reward": 0.7479700148105621, + "advantages": -1.6178404393940582e-07, + "completion_length": 1652.0, + "delta_ref_entropy_loss": 0.0238037109375, + "delta_ref_ppl": -0.01361083984375, + "entropy_loss": -0.1337890625, + "epoch": 0.0372, + "grad_norm": 3.9104140493441, + "k1_kl": 0.0135498046875, + "k3_kl": 0.020751953125, + "kimi_kl": 0.0198974609375, + "learning_rate": 4.814e-07, + "loss": 0.0008, + "ppl": 0.07177734375, + "reward": 0.6837654709815979, + "reward_std": 0.2897385060787201, + "rewards/perpo_ocr_edit_distance_reward": 0.6837654709815979, "step": 186, "temperature": 0.9 }, { - "advantages": -3.6018236642121337e-06, - "completion_length": 736.5, - "delta_ref_entropy_loss": 0.017852783203125, - "delta_ref_ppl": -0.0203094482421875, - "entropy_loss": -0.0748291015625, - "epoch": 0.0748, - "grad_norm": 3.4716291406047106, - "k1_kl": 0.02032470703125, - "k3_kl": 0.01967620849609375, - "kimi_kl": 0.027557373046875, - "learning_rate": 4.6259999999999997e-07, - "loss": 0.0008, - "ppl": 0.03955078125, - "reward": 0.9279071986675262, - "reward_std": 0.1489367838948965, - "rewards/perpo_ocr_edit_distance_reward": 0.9279072880744934, + "advantages": -1.813684320950415e-05, + "completion_length": 512.0, + "delta_ref_entropy_loss": 0.0201416015625, + "delta_ref_ppl": -0.01708984375, + "entropy_loss": -0.06982421875, + "epoch": 0.0374, + "grad_norm": 1.2124728425099918, + "k1_kl": 0.01708984375, + "k3_kl": 0.011474609375, + "kimi_kl": 0.0257568359375, + "learning_rate": 4.813e-07, + "loss": 0.0005, + "ppl": 0.037109375, + "reward": 0.9606290459632874, + "reward_std": 0.005537636112421751, + "rewards/perpo_ocr_edit_distance_reward": 0.9606291055679321, "step": 187, "temperature": 0.9 }, { - "advantages": -5.991118541714968e-05, - "completion_length": 535.0, - "delta_ref_entropy_loss": 0.0128326416015625, - "delta_ref_ppl": -0.00518035888671875, - "entropy_loss": -0.023101806640625, - "epoch": 0.0752, - "grad_norm": 0.6409122289571603, - "k1_kl": 0.00518035888671875, - "k3_kl": 0.001861572265625, - "kimi_kl": 0.003021240234375, - "learning_rate": 4.6239999999999996e-07, - "loss": 0.0001, - "ppl": 0.009002685546875, - "reward": 0.9982403218746185, - "reward_std": 0.0006125848449300975, - "rewards/perpo_ocr_edit_distance_reward": 0.9982403814792633, + "advantages": -0.0005960464477539062, + "completion_length": 716.0, + "delta_ref_entropy_loss": 0.00830078125, + "delta_ref_ppl": -0.00469970703125, + "entropy_loss": -0.0093994140625, + "epoch": 0.0376, + "grad_norm": 0.004059456991962833, + "k1_kl": 0.00469970703125, + "k3_kl": 0.00170135498046875, + "kimi_kl": 0.0025787353515625, + "learning_rate": 4.812e-07, + "loss": 0.0007, + "ppl": 0.00189971923828125, + "reward": 0.9391691088676453, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9391692280769348, "step": 188, "temperature": 0.9 }, { - "advantages": -3.49751562680467e-05, - "completion_length": 525.0, - "delta_ref_entropy_loss": 0.016632080078125, - "delta_ref_ppl": -0.016265869140625, - "entropy_loss": -0.04248046875, - "epoch": 0.0756, - "grad_norm": 1.0944951138158092, - "k1_kl": 0.016265869140625, - "k3_kl": 0.0099945068359375, - "kimi_kl": 0.018310546875, - "learning_rate": 4.622e-07, + "advantages": -0.00012055465776938945, + "completion_length": 662.0, + "delta_ref_entropy_loss": 0.01513671875, + "delta_ref_ppl": -0.01556396484375, + "entropy_loss": -0.0211181640625, + "epoch": 0.0378, + "grad_norm": 0.39267249422078326, + "k1_kl": 0.0155029296875, + "k3_kl": 0.0081787109375, + "kimi_kl": 0.0189208984375, + "learning_rate": 4.811e-07, "loss": 0.0004, - "ppl": 0.021728515625, - "reward": 0.9922289550304413, - "reward_std": 0.0013757511042058468, - "rewards/perpo_ocr_edit_distance_reward": 0.9922290444374084, + "ppl": 0.00860595703125, + "reward": 0.9808907508850098, + "reward_std": 0.0005356052424758673, + "rewards/perpo_ocr_edit_distance_reward": 0.9808908700942993, "step": 189, "temperature": 0.9 }, { - "advantages": -1.404966639029226e-06, - "completion_length": 686.0, - "delta_ref_entropy_loss": 0.0118255615234375, - "delta_ref_ppl": -0.010498046875, - "entropy_loss": -0.088623046875, - "epoch": 0.076, - "grad_norm": 0.8906118771075668, - "k1_kl": 0.0105133056640625, - "k3_kl": 0.00592803955078125, - "kimi_kl": 0.01226806640625, - "learning_rate": 4.62e-07, + "advantages": 0.0, + "completion_length": 935.0, + "delta_ref_entropy_loss": 0.01458740234375, + "delta_ref_ppl": -0.00830078125, + "entropy_loss": -0.03662109375, + "epoch": 0.038, + "grad_norm": 2.261929205048932, + "k1_kl": 0.00823974609375, + "k3_kl": 0.0045166015625, + "kimi_kl": 0.007110595703125, + "learning_rate": 4.809999999999999e-07, "loss": 0.0002, - "ppl": 0.0462646484375, - "reward": 0.8789756298065186, - "reward_std": 0.08239121176302433, - "rewards/perpo_ocr_edit_distance_reward": 0.8789757490158081, + "ppl": 0.017822265625, + "reward": 0.9725815057754517, + "reward_std": 0.0016076330794021487, + "rewards/perpo_ocr_edit_distance_reward": 0.9725815057754517, "step": 190, "temperature": 0.9 }, { - "advantages": -2.724783954022314e-07, - "completion_length": 774.5, - "delta_ref_entropy_loss": 0.017961502075195312, - "delta_ref_ppl": -0.009307861328125, - "entropy_loss": -0.171630859375, - "epoch": 0.0764, - "grad_norm": 46.510629383441135, - "k1_kl": 0.009124755859375, - "k3_kl": 0.4127044677734375, - "kimi_kl": 0.013946533203125, - "learning_rate": 4.6179999999999997e-07, - "loss": 0.0165, - "ppl": 0.0943603515625, - "reward": 0.9024119973182678, - "reward_std": 0.0339131373912096, - "rewards/perpo_ocr_edit_distance_reward": 0.9024119973182678, + "advantages": -1.3964516938358429e-06, + "completion_length": 627.0, + "delta_ref_entropy_loss": 0.0167236328125, + "delta_ref_ppl": -0.0145263671875, + "entropy_loss": -0.033447265625, + "epoch": 0.0382, + "grad_norm": 1.0346151261216892, + "k1_kl": 0.01458740234375, + "k3_kl": 0.0111083984375, + "kimi_kl": 0.0296630859375, + "learning_rate": 4.809e-07, + "loss": 0.0004, + "ppl": 0.01300048828125, + "reward": 0.9368844628334045, + "reward_std": 0.042323529720306396, + "rewards/perpo_ocr_edit_distance_reward": 0.9368845224380493, "step": 191, "temperature": 0.9 }, { - "advantages": -1.319817215517105e-06, - "completion_length": 622.0, - "delta_ref_entropy_loss": 0.025177001953125, - "delta_ref_ppl": -0.010833740234375, - "entropy_loss": -0.087158203125, - "epoch": 0.0768, - "grad_norm": 2.0312868074111194, - "k1_kl": 0.01080322265625, - "k3_kl": 0.00499725341796875, - "kimi_kl": 0.0074310302734375, - "learning_rate": 4.616e-07, - "loss": 0.0002, - "ppl": 0.04461669921875, - "reward": 0.9450285732746124, - "reward_std": 0.007454175502061844, - "rewards/perpo_ocr_edit_distance_reward": 0.9450285732746124, + "advantages": -7.864407234592363e-05, + "completion_length": 583.0, + "delta_ref_entropy_loss": 0.018310546875, + "delta_ref_ppl": -0.0098876953125, + "entropy_loss": -0.020263671875, + "epoch": 0.0384, + "grad_norm": 0.9411011296040435, + "k1_kl": 0.0098876953125, + "k3_kl": 0.005523681640625, + "kimi_kl": 0.0098876953125, + "learning_rate": 4.808e-07, + "loss": 0.0003, + "ppl": 0.0076904296875, + "reward": 0.9829341173171997, + "reward_std": 0.00044128397712484, + "rewards/perpo_ocr_edit_distance_reward": 0.9829341173171997, "step": 192, "temperature": 0.9 }, { - "advantages": -5.568777133646563e-06, - "completion_length": 253.0, - "delta_ref_entropy_loss": 0.02276611328125, - "delta_ref_ppl": -0.0286865234375, - "entropy_loss": -0.1578369140625, - "epoch": 0.0772, - "grad_norm": 1.5668972099384257, - "k1_kl": 0.028778076171875, - "k3_kl": 0.016754150390625, - "kimi_kl": 0.0316162109375, - "learning_rate": 4.6139999999999994e-07, - "loss": 0.0007, - "ppl": 0.08367919921875, - "reward": 0.8983044028282166, - "reward_std": 0.029859457863494754, - "rewards/perpo_ocr_edit_distance_reward": 0.8983044326305389, + "advantages": -5.8795725635718554e-05, + "completion_length": 533.0, + "delta_ref_entropy_loss": 0.0322265625, + "delta_ref_ppl": -0.01904296875, + "entropy_loss": -0.09521484375, + "epoch": 0.0386, + "grad_norm": 1.2872279275604606, + "k1_kl": 0.01904296875, + "k3_kl": 0.0068359375, + "kimi_kl": 0.0091552734375, + "learning_rate": 4.807e-07, + "loss": 0.0003, + "ppl": 0.047119140625, + "reward": 0.9601398706436157, + "reward_std": 0.0010583141120150685, + "rewards/perpo_ocr_edit_distance_reward": 0.9601399898529053, "step": 193, "temperature": 0.9 }, { - "advantages": -2.830369248840725e-05, - "completion_length": 442.0, - "delta_ref_entropy_loss": 0.01373291015625, - "delta_ref_ppl": -0.0197906494140625, - "entropy_loss": -0.087890625, - "epoch": 0.0776, - "grad_norm": 1.6392941458008192, - "k1_kl": 0.0197906494140625, - "k3_kl": 0.009082794189453125, - "kimi_kl": 0.01485443115234375, - "learning_rate": 4.612e-07, - "loss": 0.0004, - "ppl": 0.049163818359375, - "reward": 0.9880252480506897, - "reward_std": 0.004627635818906128, - "rewards/perpo_ocr_edit_distance_reward": 0.9880253374576569, + "advantages": -2.843993206624873e-05, + "completion_length": 1710.0, + "delta_ref_entropy_loss": 0.00469970703125, + "delta_ref_ppl": -0.0042724609375, + "entropy_loss": -0.0072021484375, + "epoch": 0.0388, + "grad_norm": 0.224779662428725, + "k1_kl": 0.004241943359375, + "k3_kl": 0.0021209716796875, + "kimi_kl": 0.0037078857421875, + "learning_rate": 4.806e-07, + "loss": 0.0001, + "ppl": 0.00299072265625, + "reward": 0.9325183629989624, + "reward_std": 0.004093698691576719, + "rewards/perpo_ocr_edit_distance_reward": 0.932518482208252, "step": 194, "temperature": 0.9 }, { - "advantages": 3.078579993598396e-05, - "completion_length": 493.0, - "delta_ref_entropy_loss": 0.01904296875, - "delta_ref_ppl": -0.0078582763671875, - "entropy_loss": -0.0247802734375, - "epoch": 0.078, - "grad_norm": 0.47098957931024243, - "k1_kl": 0.0078277587890625, - "k3_kl": 0.002719879150390625, - "kimi_kl": 0.0037384033203125, - "learning_rate": 4.61e-07, - "loss": 0.0001, - "ppl": 0.00970458984375, - "reward": 0.987204521894455, - "reward_std": 0.0007125649572117254, - "rewards/perpo_ocr_edit_distance_reward": 0.9872045516967773, + "advantages": -2.2990363390817947e-07, + "completion_length": 404.0, + "delta_ref_entropy_loss": 0.02587890625, + "delta_ref_ppl": -0.0140380859375, + "entropy_loss": -0.1083984375, + "epoch": 0.039, + "grad_norm": 3.1337515682667094, + "k1_kl": 0.01416015625, + "k3_kl": 0.0067138671875, + "kimi_kl": 0.010009765625, + "learning_rate": 4.805e-07, + "loss": 0.0003, + "ppl": 0.0517578125, + "reward": 0.5131015777587891, + "reward_std": 0.3054526150226593, + "rewards/perpo_ocr_edit_distance_reward": 0.5131016969680786, "step": 195, "temperature": 0.9 }, { - "advantages": -1.5359904857348283e-05, - "completion_length": 1314.0, - "delta_ref_entropy_loss": 0.026123046875, - "delta_ref_ppl": -0.012908935546875, - "entropy_loss": -0.2099609375, - "epoch": 0.0784, - "grad_norm": 0.8310404188710109, - "k1_kl": 0.012786865234375, - "k3_kl": 0.0058135986328125, - "kimi_kl": 0.00751495361328125, - "learning_rate": 4.6079999999999994e-07, - "loss": 0.0002, - "ppl": 0.11236572265625, - "reward": 0.8256711065769196, - "reward_std": 0.1372755838674493, - "rewards/perpo_ocr_edit_distance_reward": 0.8256711959838867, + "advantages": -3.0994415283203125e-05, + "completion_length": 503.0, + "delta_ref_entropy_loss": 0.02099609375, + "delta_ref_ppl": -0.00836181640625, + "entropy_loss": -0.0712890625, + "epoch": 0.0392, + "grad_norm": 1.0558098150067177, + "k1_kl": 0.00830078125, + "k3_kl": 0.0028076171875, + "kimi_kl": 0.0030670166015625, + "learning_rate": 4.804e-07, + "loss": 0.0001, + "ppl": 0.0390625, + "reward": 0.9720739722251892, + "reward_std": 0.0018220635829493403, + "rewards/perpo_ocr_edit_distance_reward": 0.9720740914344788, "step": 196, "temperature": 0.9 }, { - "advantages": -7.833753315367176e-06, - "completion_length": 478.5, - "delta_ref_entropy_loss": 0.02789306640625, - "delta_ref_ppl": -0.01837158203125, - "entropy_loss": -0.1376953125, - "epoch": 0.0788, - "grad_norm": 2.8747804256065526, - "k1_kl": 0.01824951171875, - "k3_kl": 0.02197265625, - "kimi_kl": 0.02978515625, - "learning_rate": 4.606e-07, - "loss": 0.0009, - "ppl": 0.0855712890625, - "reward": 0.735127717256546, - "reward_std": 0.09580186495441012, - "rewards/perpo_ocr_edit_distance_reward": 0.7351277470588684, + "advantages": -3.644398475444177e-06, + "completion_length": 681.0, + "delta_ref_entropy_loss": 0.035400390625, + "delta_ref_ppl": -0.012451171875, + "entropy_loss": -0.11474609375, + "epoch": 0.0394, + "grad_norm": 0.92897109958176, + "k1_kl": 0.01263427734375, + "k3_kl": 0.005157470703125, + "kimi_kl": 0.0069580078125, + "learning_rate": 4.803e-07, + "loss": 0.0002, + "ppl": 0.060302734375, + "reward": 0.9623088836669922, + "reward_std": 0.004580899141728878, + "rewards/perpo_ocr_edit_distance_reward": 0.9623090028762817, "step": 197, "temperature": 0.9 }, { - "advantages": -8.659703780722339e-06, - "completion_length": 1313.0, - "delta_ref_entropy_loss": 0.017486572265625, - "delta_ref_ppl": -0.01175689697265625, - "entropy_loss": -0.09521484375, - "epoch": 0.0792, - "grad_norm": 1.884178650670653, - "k1_kl": 0.01181793212890625, - "k3_kl": 0.0120849609375, - "kimi_kl": 0.0144195556640625, - "learning_rate": 4.6039999999999997e-07, - "loss": 0.0005, - "ppl": 0.0582275390625, - "reward": 0.8529087007045746, - "reward_std": 0.015866302885115147, - "rewards/perpo_ocr_edit_distance_reward": 0.8529087901115417, + "advantages": -3.542219019436743e-06, + "completion_length": 719.0, + "delta_ref_entropy_loss": 0.01165771484375, + "delta_ref_ppl": -0.006500244140625, + "entropy_loss": -0.0654296875, + "epoch": 0.0396, + "grad_norm": 1.072143041713687, + "k1_kl": 0.00653076171875, + "k3_kl": 0.0028533935546875, + "kimi_kl": 0.0037078857421875, + "learning_rate": 4.802e-07, + "loss": 0.0001, + "ppl": 0.0263671875, + "reward": 0.814034104347229, + "reward_std": 0.019011618569493294, + "rewards/perpo_ocr_edit_distance_reward": 0.8140342235565186, "step": 198, "temperature": 0.9 }, { - "advantages": 6.249972880567611e-06, - "completion_length": 470.5, - "delta_ref_entropy_loss": 0.012939453125, - "delta_ref_ppl": -0.01515960693359375, - "entropy_loss": -0.0528564453125, - "epoch": 0.0796, - "grad_norm": 1.119018385724403, - "k1_kl": 0.01522064208984375, - "k3_kl": 0.011478424072265625, - "kimi_kl": 0.0292205810546875, - "learning_rate": 4.6019999999999995e-07, + "advantages": -3.065381861233618e-06, + "completion_length": 194.0, + "delta_ref_entropy_loss": 0.0184326171875, + "delta_ref_ppl": -0.026611328125, + "entropy_loss": -0.055419921875, + "epoch": 0.0398, + "grad_norm": 2.1638247781519624, + "k1_kl": 0.0267333984375, + "k3_kl": 0.0126953125, + "kimi_kl": 0.023193359375, + "learning_rate": 4.801e-07, "loss": 0.0005, - "ppl": 0.028106689453125, - "reward": 0.7889312505722046, - "reward_std": 0.20777997630648315, - "rewards/perpo_ocr_edit_distance_reward": 0.7889313101768494, + "ppl": 0.017822265625, + "reward": 0.9508359432220459, + "reward_std": 0.013740760274231434, + "rewards/perpo_ocr_edit_distance_reward": 0.9508360028266907, "step": 199, "temperature": 0.9 }, { - "advantages": -9.77005333879788e-05, - "completion_length": 801.5, - "delta_ref_entropy_loss": 0.01568603515625, - "delta_ref_ppl": -0.0085296630859375, - "entropy_loss": -0.05120849609375, - "epoch": 0.08, - "grad_norm": 0.6090220781005395, - "k1_kl": 0.0084991455078125, - "k3_kl": 0.00505828857421875, - "kimi_kl": 0.00939178466796875, - "learning_rate": 4.6e-07, - "loss": 0.0003, - "ppl": 0.022979736328125, - "reward": 0.9936193823814392, - "reward_std": 0.0020899637456750497, - "rewards/perpo_ocr_edit_distance_reward": 0.9936195015907288, + "advantages": -4.117829666938633e-05, + "completion_length": 324.0, + "delta_ref_entropy_loss": 0.027099609375, + "delta_ref_ppl": -0.01220703125, + "entropy_loss": -0.0250244140625, + "epoch": 0.04, + "grad_norm": 0.9946735401040014, + "k1_kl": 0.01220703125, + "k3_kl": 0.005035400390625, + "kimi_kl": 0.00860595703125, + "learning_rate": 4.8e-07, + "loss": 0.0002, + "ppl": 0.0084228515625, + "reward": 0.9805392622947693, + "reward_std": 0.0015545717906206846, + "rewards/perpo_ocr_edit_distance_reward": 0.9805393815040588, "step": 200, "temperature": 0.9 }, { - "advantages": -9.145055884118847e-06, - "completion_length": 538.0, - "delta_ref_entropy_loss": 0.01531982421875, - "delta_ref_ppl": -0.0196533203125, - "entropy_loss": -0.052978515625, - "epoch": 0.0804, - "grad_norm": 1.1835353861951101, - "k1_kl": 0.01971435546875, - "k3_kl": 0.011138916015625, - "kimi_kl": 0.0194091796875, - "learning_rate": 4.598e-07, - "loss": 0.0005, - "ppl": 0.025390625, - "reward": 0.9589465856552124, - "reward_std": 0.09648415085393935, - "rewards/perpo_ocr_edit_distance_reward": 0.9589466452598572, + "advantages": -2.2138868871479644e-07, + "completion_length": 50.0, + "delta_ref_entropy_loss": -0.019287109375, + "delta_ref_ppl": -0.11669921875, + "entropy_loss": -0.8671875, + "epoch": 0.0402, + "grad_norm": 13.45430691097955, + "k1_kl": 0.11767578125, + "k3_kl": 0.10205078125, + "kimi_kl": 0.208984375, + "learning_rate": 4.799e-07, + "loss": 0.0041, + "ppl": 0.51171875, + "reward": 0.20407061278820038, + "reward_std": 0.10758687555789948, + "rewards/perpo_ocr_edit_distance_reward": 0.20407064259052277, "step": 201, "temperature": 0.9 }, { - "advantages": -2.5936537255688563e-05, - "completion_length": 1324.5, - "delta_ref_entropy_loss": 0.01690673828125, - "delta_ref_ppl": -0.010986328125, - "entropy_loss": -0.12451171875, - "epoch": 0.0808, - "grad_norm": 1.8832303400709949, - "k1_kl": 0.0110015869140625, - "k3_kl": 0.00667572021484375, - "kimi_kl": 0.00969696044921875, - "learning_rate": 4.596e-07, - "loss": 0.0003, - "ppl": 0.0665283203125, - "reward": 0.8114360272884369, - "reward_std": 0.11748023657128215, - "rewards/perpo_ocr_edit_distance_reward": 0.811436116695404, + "advantages": -1.9328936105011962e-06, + "completion_length": 411.0, + "delta_ref_entropy_loss": 0.02294921875, + "delta_ref_ppl": -0.01708984375, + "entropy_loss": -0.051025390625, + "epoch": 0.0404, + "grad_norm": 1.5077941400303467, + "k1_kl": 0.01708984375, + "k3_kl": 0.00909423828125, + "kimi_kl": 0.0164794921875, + "learning_rate": 4.797999999999999e-07, + "loss": 0.0004, + "ppl": 0.0230712890625, + "reward": 0.9578314423561096, + "reward_std": 0.017539246007800102, + "rewards/perpo_ocr_edit_distance_reward": 0.9578315019607544, "step": 202, "temperature": 0.9 }, { - "advantages": -6.624631055274222e-06, - "completion_length": 893.5, - "delta_ref_entropy_loss": 0.02655029296875, - "delta_ref_ppl": -0.02294921875, - "entropy_loss": -0.087890625, - "epoch": 0.0812, - "grad_norm": 4.7747864200217, - "k1_kl": 0.02288818359375, - "k3_kl": 0.010772705078125, - "kimi_kl": 0.01959228515625, - "learning_rate": 4.5939999999999994e-07, - "loss": 0.0004, - "ppl": 0.04449462890625, - "reward": 0.9182808995246887, - "reward_std": 0.012810579966753721, - "rewards/perpo_ocr_edit_distance_reward": 0.9182809889316559, + "advantages": -0.00020893132023047656, + "completion_length": 606.0, + "delta_ref_entropy_loss": 0.013427734375, + "delta_ref_ppl": -0.012451171875, + "entropy_loss": -0.0181884765625, + "epoch": 0.0406, + "grad_norm": 0.6248571927879423, + "k1_kl": 0.01239013671875, + "k3_kl": 0.00787353515625, + "kimi_kl": 0.0150146484375, + "learning_rate": 4.797e-07, + "loss": 0.0005, + "ppl": 0.0079345703125, + "reward": 0.9836103320121765, + "reward_std": 0.00030754541512578726, + "rewards/perpo_ocr_edit_distance_reward": 0.9836104512214661, "step": 203, "temperature": 0.9 }, { - "advantages": -7.706029634846345e-06, - "completion_length": 1185.0, - "delta_ref_entropy_loss": 0.020648956298828125, - "delta_ref_ppl": -0.01071929931640625, - "entropy_loss": -0.041259765625, - "epoch": 0.0816, - "grad_norm": 1.883658579599835, - "k1_kl": 0.01070404052734375, - "k3_kl": 0.00518035888671875, - "kimi_kl": 0.00672149658203125, - "learning_rate": 4.592e-07, - "loss": 0.0002, - "ppl": 0.0198974609375, - "reward": 0.8276781439781189, - "reward_std": 0.01737295335624367, - "rewards/perpo_ocr_edit_distance_reward": 0.8276781439781189, + "advantages": -6.505421424662927e-06, + "completion_length": 842.0, + "delta_ref_entropy_loss": 0.0220947265625, + "delta_ref_ppl": -0.015625, + "entropy_loss": -0.04443359375, + "epoch": 0.0408, + "grad_norm": 0.7885439987653512, + "k1_kl": 0.01556396484375, + "k3_kl": 0.007080078125, + "kimi_kl": 0.01336669921875, + "learning_rate": 4.796e-07, + "loss": 0.0003, + "ppl": 0.021728515625, + "reward": 0.927276074886322, + "reward_std": 0.0038172942586243153, + "rewards/perpo_ocr_edit_distance_reward": 0.9272761344909668, "step": 204, "temperature": 0.9 }, { - "advantages": -2.4723156457184814e-05, - "completion_length": 266.0, - "delta_ref_entropy_loss": 0.01995849609375, - "delta_ref_ppl": -0.05596923828125, - "entropy_loss": -0.134765625, - "epoch": 0.082, - "grad_norm": 2.04828580309362, - "k1_kl": 0.05596923828125, - "k3_kl": 0.0307159423828125, - "kimi_kl": 0.047332763671875, - "learning_rate": 4.59e-07, - "loss": 0.0013, - "ppl": 0.05682373046875, - "reward": 0.8122312128543854, - "reward_std": 0.0029896608903072774, - "rewards/perpo_ocr_edit_distance_reward": 0.8122312426567078, + "advantages": -3.4059798537100505e-08, + "completion_length": 824.0, + "delta_ref_entropy_loss": 0.0198974609375, + "delta_ref_ppl": -0.01446533203125, + "entropy_loss": -0.060791015625, + "epoch": 0.041, + "grad_norm": 0.6206088679436752, + "k1_kl": 0.01446533203125, + "k3_kl": 0.00885009765625, + "kimi_kl": 0.0213623046875, + "learning_rate": 4.794999999999999e-07, + "loss": 0.0004, + "ppl": 0.0303955078125, + "reward": 0.8436829447746277, + "reward_std": 0.3451390862464905, + "rewards/perpo_ocr_edit_distance_reward": 0.8436830043792725, "step": 205, "temperature": 0.9 }, { - "advantages": -2.8320722513797136e-05, - "completion_length": 1240.0, - "delta_ref_entropy_loss": 0.02880859375, - "delta_ref_ppl": -0.01202392578125, - "entropy_loss": -0.1337890625, - "epoch": 0.0824, - "grad_norm": 1.8904516980507682, - "k1_kl": 0.01214599609375, - "k3_kl": 0.007915496826171875, - "kimi_kl": 0.01465606689453125, - "learning_rate": 4.5879999999999995e-07, - "loss": 0.0003, - "ppl": 0.071014404296875, - "reward": 0.8556314706802368, - "reward_std": 0.12730570568237454, - "rewards/perpo_ocr_edit_distance_reward": 0.8556315302848816, + "advantages": -3.3889500627992675e-06, + "completion_length": 1142.0, + "delta_ref_entropy_loss": 0.044921875, + "delta_ref_ppl": -0.018310546875, + "entropy_loss": -0.10205078125, + "epoch": 0.0412, + "grad_norm": 1.8273721447680744, + "k1_kl": 0.018310546875, + "k3_kl": 0.01611328125, + "kimi_kl": 0.0120849609375, + "learning_rate": 4.794e-07, + "loss": 0.0006, + "ppl": 0.0537109375, + "reward": 0.9140786528587341, + "reward_std": 0.017575865611433983, + "rewards/perpo_ocr_edit_distance_reward": 0.9140787720680237, "step": 206, "temperature": 0.9 }, { - "advantages": -0.0003381712122063618, - "completion_length": 485.0, - "delta_ref_entropy_loss": 0.02435302734375, - "delta_ref_ppl": -0.018157958984375, - "entropy_loss": -0.03387451171875, - "epoch": 0.0828, - "grad_norm": 0.18682344779969526, - "k1_kl": 0.018096923828125, - "k3_kl": 0.008453369140625, - "kimi_kl": 0.014312744140625, - "learning_rate": 4.586e-07, - "loss": 0.0007, - "ppl": 0.0113525390625, - "reward": 0.846202939748764, - "reward_std": 0.00026816889294423163, - "rewards/perpo_ocr_edit_distance_reward": 0.8462030291557312, - "step": 207, - "temperature": 0.9 - }, + "advantages": 0.0, + "completion_length": 584.0, + "delta_ref_entropy_loss": 0.0106201171875, + "delta_ref_ppl": -0.0106201171875, + "entropy_loss": -0.03662109375, + "epoch": 0.0414, + "grad_norm": 0.7320599287763567, + "k1_kl": 0.0106201171875, + "k3_kl": 0.005096435546875, + "kimi_kl": 0.00909423828125, + "learning_rate": 4.793e-07, + "loss": 0.0002, + "ppl": 0.01519775390625, + "reward": 0.9716980457305908, + "reward_std": 0.004056534264236689, + "rewards/perpo_ocr_edit_distance_reward": 0.9716981053352356, + "step": 207, + "temperature": 0.9 + }, { - "advantages": -2.9027463824604638e-05, - "completion_length": 427.0, - "delta_ref_entropy_loss": 0.0269775390625, - "delta_ref_ppl": -0.016357421875, - "entropy_loss": -0.03265380859375, - "epoch": 0.0832, - "grad_norm": 0.8364694808191355, - "k1_kl": 0.01629638671875, - "k3_kl": 0.00669097900390625, - "kimi_kl": 0.0118560791015625, - "learning_rate": 4.584e-07, - "loss": 0.0003, - "ppl": 0.013824462890625, - "reward": 0.9967804551124573, - "reward_std": 0.0011019092635251582, - "rewards/perpo_ocr_edit_distance_reward": 0.996780514717102, + "advantages": -1.1341912795614917e-05, + "completion_length": 480.0, + "delta_ref_entropy_loss": 0.032470703125, + "delta_ref_ppl": -0.021484375, + "entropy_loss": -0.07763671875, + "epoch": 0.0416, + "grad_norm": 1.1479084714297956, + "k1_kl": 0.0213623046875, + "k3_kl": 0.01171875, + "kimi_kl": 0.0181884765625, + "learning_rate": 4.792e-07, + "loss": 0.0005, + "ppl": 0.03369140625, + "reward": 0.9407750964164734, + "reward_std": 0.001401507412083447, + "rewards/perpo_ocr_edit_distance_reward": 0.9407750964164734, "step": 208, "temperature": 0.9 }, { - "advantages": -3.4059798537100505e-08, - "completion_length": 848.0, - "delta_ref_entropy_loss": 0.0098114013671875, - "delta_ref_ppl": -0.0172119140625, - "entropy_loss": -0.10205078125, - "epoch": 0.0836, - "grad_norm": 0.6707910296687131, - "k1_kl": 0.017181396484375, - "k3_kl": 0.011444091796875, - "kimi_kl": 0.02581787109375, - "learning_rate": 4.5819999999999996e-07, - "loss": 0.0005, - "ppl": 0.04962158203125, - "reward": 0.6958383917808533, - "reward_std": 0.16007207334041595, - "rewards/perpo_ocr_edit_distance_reward": 0.6958384215831757, + "advantages": -0.00011805126268882304, + "completion_length": 377.0, + "delta_ref_entropy_loss": 0.017333984375, + "delta_ref_ppl": -0.0133056640625, + "entropy_loss": -0.02734375, + "epoch": 0.0418, + "grad_norm": 0.9775012849496112, + "k1_kl": 0.01324462890625, + "k3_kl": 0.00811767578125, + "kimi_kl": 0.010986328125, + "learning_rate": 4.791e-07, + "loss": 0.0004, + "ppl": 0.01116943359375, + "reward": 0.9780649542808533, + "reward_std": 0.0006212948937900364, + "rewards/perpo_ocr_edit_distance_reward": 0.978065013885498, "step": 209, "temperature": 0.9 }, { - "advantages": 2.8141908558154682e-05, - "completion_length": 487.0, - "delta_ref_entropy_loss": 0.02410888671875, - "delta_ref_ppl": -0.0390625, - "entropy_loss": -0.056884765625, - "epoch": 0.084, - "grad_norm": 3.4859746261260196, - "k1_kl": 0.0390625, - "k3_kl": 0.02386474609375, - "kimi_kl": 0.046875, - "learning_rate": 4.58e-07, - "loss": 0.0009, - "ppl": 0.03009033203125, - "reward": 0.9621062874794006, - "reward_std": 0.007296456635231152, - "rewards/perpo_ocr_edit_distance_reward": 0.9621062576770782, + "advantages": -5.161762601346709e-05, + "completion_length": 712.0, + "delta_ref_entropy_loss": 0.0142822265625, + "delta_ref_ppl": -0.006561279296875, + "entropy_loss": -0.01806640625, + "epoch": 0.042, + "grad_norm": 0.3882597259056436, + "k1_kl": 0.006561279296875, + "k3_kl": 0.0034027099609375, + "kimi_kl": 0.00634765625, + "learning_rate": 4.79e-07, + "loss": 0.0002, + "ppl": 0.007049560546875, + "reward": 0.9874167442321777, + "reward_std": 0.00022982244263403118, + "rewards/perpo_ocr_edit_distance_reward": 0.9874167442321777, "step": 210, "temperature": 0.9 }, { - "advantages": -3.744449090703483e-05, - "completion_length": 962.5, - "delta_ref_entropy_loss": 0.014617919921875, - "delta_ref_ppl": -0.01416015625, - "entropy_loss": -0.07403564453125, - "epoch": 0.0844, - "grad_norm": 1.5464216928809935, - "k1_kl": 0.014129638671875, - "k3_kl": 0.0095367431640625, - "kimi_kl": 0.01495361328125, - "learning_rate": 4.578e-07, - "loss": 0.0004, - "ppl": 0.034454345703125, - "reward": 0.6886914968490601, - "reward_std": 0.09274141330388375, - "rewards/perpo_ocr_edit_distance_reward": 0.688691571354866, + "advantages": -2.043587983280304e-07, + "completion_length": 301.0, + "delta_ref_entropy_loss": 0.0830078125, + "delta_ref_ppl": -0.04345703125, + "entropy_loss": -0.546875, + "epoch": 0.0422, + "grad_norm": 2.7658145193174137, + "k1_kl": 0.04296875, + "k3_kl": 0.0184326171875, + "kimi_kl": 0.024658203125, + "learning_rate": 4.789e-07, + "loss": 0.0007, + "ppl": 0.3046875, + "reward": 0.5320717096328735, + "reward_std": 0.236982062458992, + "rewards/perpo_ocr_edit_distance_reward": 0.5320717692375183, "step": 211, "temperature": 0.9 }, { - "advantages": -2.1840845931819786e-06, - "completion_length": 1201.5, - "delta_ref_entropy_loss": 0.0279541015625, - "delta_ref_ppl": -0.015655517578125, - "entropy_loss": -0.047119140625, - "epoch": 0.0848, - "grad_norm": 12.79702809941224, - "k1_kl": 0.015655517578125, - "k3_kl": 0.012603759765625, - "kimi_kl": 0.017333984375, - "learning_rate": 4.5759999999999997e-07, + "advantages": -1.697880907158833e-05, + "completion_length": 718.0, + "delta_ref_entropy_loss": 0.0225830078125, + "delta_ref_ppl": -0.01953125, + "entropy_loss": -0.058837890625, + "epoch": 0.0424, + "grad_norm": 1.0736067969989447, + "k1_kl": 0.01953125, + "k3_kl": 0.011474609375, + "kimi_kl": 0.0279541015625, + "learning_rate": 4.788e-07, "loss": 0.0005, - "ppl": 0.02349853515625, - "reward": 0.8496841192245483, - "reward_std": 0.12211609585210681, - "rewards/perpo_ocr_edit_distance_reward": 0.8496842086315155, + "ppl": 0.022705078125, + "reward": 0.9433961510658264, + "reward_std": 0.005913665983825922, + "rewards/perpo_ocr_edit_distance_reward": 0.9433963298797607, "step": 212, "temperature": 0.9 }, { - "advantages": -6.220170831738869e-06, - "completion_length": 349.0, - "delta_ref_entropy_loss": 0.0384521484375, - "delta_ref_ppl": -0.071044921875, - "entropy_loss": -0.1728515625, - "epoch": 0.0852, - "grad_norm": 1.6178611918283425, - "k1_kl": 0.0711669921875, - "k3_kl": 0.0458984375, - "kimi_kl": 0.14453125, - "learning_rate": 4.5739999999999995e-07, - "loss": 0.0018, - "ppl": 0.0797119140625, - "reward": 0.8614193499088287, - "reward_std": 0.11313596670515835, - "rewards/perpo_ocr_edit_distance_reward": 0.8614194095134735, + "advantages": -1.805169267754536e-05, + "completion_length": 629.0, + "delta_ref_entropy_loss": 0.01361083984375, + "delta_ref_ppl": -0.00958251953125, + "entropy_loss": -0.0277099609375, + "epoch": 0.0426, + "grad_norm": 0.6155493027764732, + "k1_kl": 0.00958251953125, + "k3_kl": 0.007293701171875, + "kimi_kl": 0.010986328125, + "learning_rate": 4.787e-07, + "loss": 0.0003, + "ppl": 0.014892578125, + "reward": 0.9863788485527039, + "reward_std": 0.0003715285856742412, + "rewards/perpo_ocr_edit_distance_reward": 0.9863789677619934, "step": 213, "temperature": 0.9 }, { - "advantages": -2.773319101834204e-05, - "completion_length": 791.0, - "delta_ref_entropy_loss": 0.02850341796875, - "delta_ref_ppl": -0.02496337890625, - "entropy_loss": -0.099609375, - "epoch": 0.0856, - "grad_norm": 1.1600579976795329, - "k1_kl": 0.02496337890625, - "k3_kl": 0.014923095703125, - "kimi_kl": 0.03118896484375, - "learning_rate": 4.572e-07, - "loss": 0.0006, - "ppl": 0.05181884765625, - "reward": 0.9593010544776917, - "reward_std": 0.005468996678246185, - "rewards/perpo_ocr_edit_distance_reward": 0.9593011438846588, + "advantages": 0.0, + "completion_length": 393.0, + "delta_ref_entropy_loss": 0.049072265625, + "delta_ref_ppl": -0.0230712890625, + "entropy_loss": -0.1611328125, + "epoch": 0.0428, + "grad_norm": 2.2430808130766815, + "k1_kl": 0.023193359375, + "k3_kl": 0.0106201171875, + "kimi_kl": 0.016357421875, + "learning_rate": 4.786e-07, + "loss": 0.0004, + "ppl": 0.08349609375, + "reward": 0.9035291075706482, + "reward_std": 0.06925825774669647, + "rewards/perpo_ocr_edit_distance_reward": 0.9035290479660034, "step": 214, "temperature": 0.9 }, { - "advantages": -4.066314227202383e-05, - "completion_length": 635.5, - "delta_ref_entropy_loss": 0.029296875, - "delta_ref_ppl": -0.0145416259765625, - "entropy_loss": -0.04779052734375, - "epoch": 0.086, - "grad_norm": 0.5708098733466569, - "k1_kl": 0.014556884765625, - "k3_kl": 0.00656890869140625, - "kimi_kl": 0.0104217529296875, - "learning_rate": 4.57e-07, - "loss": 0.0003, - "ppl": 0.0233001708984375, - "reward": 0.9178333878517151, - "reward_std": 0.011611780111707048, - "rewards/perpo_ocr_edit_distance_reward": 0.9178334176540375, + "advantages": -1.3623919414840202e-07, + "completion_length": 793.0, + "delta_ref_entropy_loss": 0.0228271484375, + "delta_ref_ppl": -0.0091552734375, + "entropy_loss": -0.048828125, + "epoch": 0.043, + "grad_norm": 0.41655300697050746, + "k1_kl": 0.00909423828125, + "k3_kl": 0.002777099609375, + "kimi_kl": 0.003936767578125, + "learning_rate": 4.785e-07, + "loss": 0.0001, + "ppl": 0.0185546875, + "reward": 0.8953214883804321, + "reward_std": 0.22653722763061523, + "rewards/perpo_ocr_edit_distance_reward": 0.8953215479850769, "step": 215, "temperature": 0.9 }, { - "advantages": -8.191381311917212e-06, - "completion_length": 586.5, - "delta_ref_entropy_loss": 0.02655029296875, - "delta_ref_ppl": -0.012451171875, - "entropy_loss": -0.0616455078125, - "epoch": 0.0864, - "grad_norm": 0.8145072978131812, - "k1_kl": 0.012420654296875, - "k3_kl": 0.005340576171875, - "kimi_kl": 0.00775146484375, - "learning_rate": 4.5679999999999996e-07, + "advantages": -5.10896995820076e-08, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.004180908203125, + "delta_ref_ppl": -0.00628662109375, + "entropy_loss": -0.458984375, + "epoch": 0.0432, + "grad_norm": 2.142957586674671, + "k1_kl": 0.0067138671875, + "k3_kl": 0.004119873046875, + "kimi_kl": 0.005706787109375, + "learning_rate": 4.783999999999999e-07, "loss": 0.0002, - "ppl": 0.0306396484375, - "reward": 0.986598938703537, - "reward_std": 0.0034388480125926435, - "rewards/perpo_ocr_edit_distance_reward": 0.9865989983081818, + "ppl": 0.3125, + "reward": 0.11116766929626465, + "reward_std": 0.05731361731886864, + "rewards/perpo_ocr_edit_distance_reward": 0.11116767674684525, "step": 216, "temperature": 0.9 }, { - "advantages": -7.576602365588769e-05, - "completion_length": 377.5, - "delta_ref_entropy_loss": 0.025634765625, - "delta_ref_ppl": -0.0092010498046875, - "entropy_loss": -0.0186767578125, - "epoch": 0.0868, - "grad_norm": 0.3130924684191532, - "k1_kl": 0.0091705322265625, - "k3_kl": 0.002532958984375, - "kimi_kl": 0.00366973876953125, - "learning_rate": 4.566e-07, - "loss": 0.0002, - "ppl": 0.00555419921875, - "reward": 0.9992394149303436, - "reward_std": 0.00011856618948513642, - "rewards/perpo_ocr_edit_distance_reward": 0.9992394745349884, + "advantages": -0.00029792106943205, + "completion_length": 982.0, + "delta_ref_entropy_loss": 0.01483154296875, + "delta_ref_ppl": -0.00799560546875, + "entropy_loss": -0.0174560546875, + "epoch": 0.0434, + "grad_norm": 0.13940110458482044, + "k1_kl": 0.0079345703125, + "k3_kl": 0.0030975341796875, + "kimi_kl": 0.005889892578125, + "learning_rate": 4.782999999999999e-07, + "loss": 0.0004, + "ppl": 0.005523681640625, + "reward": 0.9889803528785706, + "reward_std": 0.00024298107018694282, + "rewards/perpo_ocr_edit_distance_reward": 0.9889804720878601, "step": 217, "temperature": 0.9 }, { - "advantages": -5.543232327909209e-05, - "completion_length": 949.5, - "delta_ref_entropy_loss": 0.01495361328125, - "delta_ref_ppl": -0.0106048583984375, - "entropy_loss": -0.02947998046875, - "epoch": 0.0872, - "grad_norm": 0.34234903178894616, - "k1_kl": 0.010528564453125, - "k3_kl": 0.00646209716796875, - "kimi_kl": 0.01493072509765625, - "learning_rate": 4.5639999999999993e-07, - "loss": 0.0003, - "ppl": 0.0125732421875, - "reward": 0.9524321258068085, - "reward_std": 0.0009246931585948914, - "rewards/perpo_ocr_edit_distance_reward": 0.9524322152137756, + "advantages": -8.514949740856537e-07, + "completion_length": 415.0, + "delta_ref_entropy_loss": 0.0220947265625, + "delta_ref_ppl": -0.0302734375, + "entropy_loss": -0.2099609375, + "epoch": 0.0436, + "grad_norm": 2.076658865257889, + "k1_kl": 0.0303955078125, + "k3_kl": 0.020263671875, + "kimi_kl": 0.06982421875, + "learning_rate": 4.782e-07, + "loss": 0.0008, + "ppl": 0.11572265625, + "reward": 0.30877047777175903, + "reward_std": 0.05632273852825165, + "rewards/perpo_ocr_edit_distance_reward": 0.3087705373764038, "step": 218, "temperature": 0.9 }, { - "advantages": -5.036592483520508e-06, - "completion_length": 1057.0, - "delta_ref_entropy_loss": 0.059112548828125, - "delta_ref_ppl": -0.09984588623046875, - "entropy_loss": -0.46630859375, - "epoch": 0.0876, - "grad_norm": 6.811256069052017, - "k1_kl": 0.099365234375, - "k3_kl": 0.062164306640625, - "kimi_kl": 0.11846160888671875, - "learning_rate": 4.5619999999999997e-07, - "loss": 0.0025, - "ppl": 0.2293701171875, - "reward": 0.40275101363658905, - "reward_std": 0.06385501989279874, - "rewards/perpo_ocr_edit_distance_reward": 0.40275105834007263, + "advantages": 3.899846888089087e-06, + "completion_length": 205.0, + "delta_ref_entropy_loss": 0.0252685546875, + "delta_ref_ppl": -0.03125, + "entropy_loss": -0.08203125, + "epoch": 0.0438, + "grad_norm": 2.6831039969410018, + "k1_kl": 0.03125, + "k3_kl": 0.0196533203125, + "kimi_kl": 0.045166015625, + "learning_rate": 4.781e-07, + "loss": 0.0008, + "ppl": 0.046630859375, + "reward": 0.8960029482841492, + "reward_std": 0.002077161567285657, + "rewards/perpo_ocr_edit_distance_reward": 0.8960029482841492, "step": 219, "temperature": 0.9 }, { - "advantages": -2.264976683363784e-05, - "completion_length": 597.5, - "delta_ref_entropy_loss": 0.016082763671875, - "delta_ref_ppl": -0.0179443359375, - "entropy_loss": -0.044921875, - "epoch": 0.088, - "grad_norm": 0.5991673861887113, - "k1_kl": 0.01806640625, - "k3_kl": 0.009857177734375, - "kimi_kl": 0.02093505859375, - "learning_rate": 4.56e-07, - "loss": 0.0004, - "ppl": 0.02276611328125, - "reward": 0.9316835999488831, - "reward_std": 0.015364864259026945, - "rewards/perpo_ocr_edit_distance_reward": 0.9316836595535278, + "advantages": 0.0, + "completion_length": 233.0, + "delta_ref_entropy_loss": 0.030517578125, + "delta_ref_ppl": -0.0177001953125, + "entropy_loss": -0.030029296875, + "epoch": 0.044, + "grad_norm": 1.71501671357038, + "k1_kl": 0.0177001953125, + "k3_kl": 0.00872802734375, + "kimi_kl": 0.0152587890625, + "learning_rate": 4.779999999999999e-07, + "loss": 0.0003, + "ppl": 0.01153564453125, + "reward": 0.9518201947212219, + "reward_std": 0.000800012843683362, + "rewards/perpo_ocr_edit_distance_reward": 0.9518202543258667, "step": 220, "temperature": 0.9 }, { - "advantages": -6.912861971386519e-05, - "completion_length": 649.5, - "delta_ref_entropy_loss": 0.016571044921875, - "delta_ref_ppl": -0.006927490234375, - "entropy_loss": -0.0172119140625, - "epoch": 0.0884, - "grad_norm": 0.5005360086699089, - "k1_kl": 0.0069122314453125, - "k3_kl": 0.0033626556396484375, - "kimi_kl": 0.00446319580078125, - "learning_rate": 4.5579999999999994e-07, - "loss": 0.0002, - "ppl": 0.0071868896484375, - "reward": 0.9846307039260864, - "reward_std": 0.003669788333354518, - "rewards/perpo_ocr_edit_distance_reward": 0.9846307933330536, + "advantages": -6.181853223097278e-06, + "completion_length": 750.0, + "delta_ref_entropy_loss": 0.026611328125, + "delta_ref_ppl": -0.02880859375, + "entropy_loss": -0.072265625, + "epoch": 0.0442, + "grad_norm": 0.9261835417961478, + "k1_kl": 0.02880859375, + "k3_kl": 0.0191650390625, + "kimi_kl": 0.033447265625, + "learning_rate": 4.779e-07, + "loss": 0.0008, + "ppl": 0.03662109375, + "reward": 0.9687142372131348, + "reward_std": 0.010904228314757347, + "rewards/perpo_ocr_edit_distance_reward": 0.9687142968177795, "step": 221, "temperature": 0.9 }, { - "advantages": -8.027043285085966e-05, - "completion_length": 894.5, - "delta_ref_entropy_loss": 0.0205078125, - "delta_ref_ppl": -0.01397705078125, - "entropy_loss": -0.03485107421875, - "epoch": 0.0888, - "grad_norm": 1.0267186278429332, - "k1_kl": 0.014007568359375, - "k3_kl": 0.007171630859375, - "kimi_kl": 0.01300048828125, - "learning_rate": 4.556e-07, - "loss": 0.0004, - "ppl": 0.01519775390625, - "reward": 0.9922086000442505, - "reward_std": 0.009859735073405318, - "rewards/perpo_ocr_edit_distance_reward": 0.9922086596488953, + "advantages": -3.4059798537100505e-08, + "completion_length": 2048.0, + "delta_ref_entropy_loss": -0.01092529296875, + "delta_ref_ppl": -0.044189453125, + "entropy_loss": -0.4140625, + "epoch": 0.0444, + "grad_norm": 5.907413424029178, + "k1_kl": 0.044677734375, + "k3_kl": 0.0390625, + "kimi_kl": 0.1279296875, + "learning_rate": 4.778e-07, + "loss": 0.0016, + "ppl": 0.232421875, + "reward": 0.14174658060073853, + "reward_std": 0.11963863670825958, + "rewards/perpo_ocr_edit_distance_reward": 0.14174659550189972, "step": 222, "temperature": 0.9 }, { - "advantages": -2.4071763391475542e-05, - "completion_length": 350.0, - "delta_ref_entropy_loss": 0.02587890625, - "delta_ref_ppl": -0.03021240234375, - "entropy_loss": -0.0653076171875, - "epoch": 0.0892, - "grad_norm": 1.121939156496563, - "k1_kl": 0.0302734375, - "k3_kl": 0.01959228515625, - "kimi_kl": 0.0540771484375, - "learning_rate": 4.554e-07, - "loss": 0.0008, - "ppl": 0.033905029296875, - "reward": 0.9615525007247925, - "reward_std": 0.011579394631553441, - "rewards/perpo_ocr_edit_distance_reward": 0.9615525901317596, + "advantages": -0.00015793528291396797, + "completion_length": 448.0, + "delta_ref_entropy_loss": 0.0252685546875, + "delta_ref_ppl": -0.01495361328125, + "entropy_loss": -0.036865234375, + "epoch": 0.0446, + "grad_norm": 0.45619432609267396, + "k1_kl": 0.0150146484375, + "k3_kl": 0.006103515625, + "kimi_kl": 0.0086669921875, + "learning_rate": 4.777e-07, + "loss": 0.0004, + "ppl": 0.0152587890625, + "reward": 0.9785297513008118, + "reward_std": 0.00016957592742983252, + "rewards/perpo_ocr_edit_distance_reward": 0.9785298109054565, "step": 223, "temperature": 0.9 }, { - "advantages": -8.514949634275126e-09, - "completion_length": 360.5, - "delta_ref_entropy_loss": 0.04052734375, - "delta_ref_ppl": -0.02801513671875, - "entropy_loss": -0.121826171875, - "epoch": 0.0896, - "grad_norm": 3.2479799274207743, - "k1_kl": 0.0279541015625, - "k3_kl": 0.01500701904296875, - "kimi_kl": 0.0367431640625, - "learning_rate": 4.5519999999999995e-07, - "loss": 0.0006, - "ppl": 0.06170654296875, - "reward": 0.9756431579589844, - "reward_std": 0.010837242938578129, - "rewards/perpo_ocr_edit_distance_reward": 0.9756431877613068, + "advantages": -4.4584277929970995e-05, + "completion_length": 708.0, + "delta_ref_entropy_loss": 0.0306396484375, + "delta_ref_ppl": -0.0189208984375, + "entropy_loss": -0.053955078125, + "epoch": 0.0448, + "grad_norm": 0.7873193188701759, + "k1_kl": 0.0189208984375, + "k3_kl": 0.0086669921875, + "kimi_kl": 0.015625, + "learning_rate": 4.776e-07, + "loss": 0.0004, + "ppl": 0.024169921875, + "reward": 0.9840461611747742, + "reward_std": 0.002001839457079768, + "rewards/perpo_ocr_edit_distance_reward": 0.9840461611747742, "step": 224, "temperature": 0.9 }, { - "advantages": -1.702989962382162e-07, - "completion_length": 721.5, - "delta_ref_entropy_loss": 0.03143310546875, - "delta_ref_ppl": -0.0289306640625, - "entropy_loss": -0.158447265625, - "epoch": 0.09, - "grad_norm": 0.7687467445629758, - "k1_kl": 0.028900146484375, - "k3_kl": 0.016265869140625, - "kimi_kl": 0.03936767578125, - "learning_rate": 4.55e-07, - "loss": 0.0007, - "ppl": 0.0797119140625, - "reward": 0.8294346332550049, - "reward_std": 0.24365413188934326, - "rewards/perpo_ocr_edit_distance_reward": 0.8294346630573273, + "advantages": -1.4219965578377014e-06, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.004913330078125, + "delta_ref_ppl": -0.004364013671875, + "entropy_loss": -0.036865234375, + "epoch": 0.045, + "grad_norm": 3.318327241759968, + "k1_kl": 0.004425048828125, + "k3_kl": 0.00738525390625, + "kimi_kl": 0.004486083984375, + "learning_rate": 4.775e-07, + "loss": 0.0003, + "ppl": 0.0177001953125, + "reward": 0.4912078082561493, + "reward_std": 0.027015281841158867, + "rewards/perpo_ocr_edit_distance_reward": 0.49120789766311646, "step": 225, "temperature": 0.9 }, { - "advantages": -0.00010601537880461365, - "completion_length": 457.0, - "delta_ref_entropy_loss": 0.016693115234375, - "delta_ref_ppl": -0.009552001953125, - "entropy_loss": -0.069580078125, - "epoch": 0.0904, - "grad_norm": 0.5486142216452746, - "k1_kl": 0.00957489013671875, - "k3_kl": 0.004856109619140625, - "kimi_kl": 0.008625030517578125, - "learning_rate": 4.5479999999999997e-07, - "loss": 0.0003, - "ppl": 0.0316314697265625, - "reward": 0.8890295028686523, - "reward_std": 0.13322991357563296, - "rewards/perpo_ocr_edit_distance_reward": 0.8890295922756195, + "advantages": -9.394118387717754e-05, + "completion_length": 1661.0, + "delta_ref_entropy_loss": 0.007476806640625, + "delta_ref_ppl": -0.005401611328125, + "entropy_loss": -0.02294921875, + "epoch": 0.0452, + "grad_norm": 0.11466474928900816, + "k1_kl": 0.005401611328125, + "k3_kl": 0.0024566650390625, + "kimi_kl": 0.00482177734375, + "learning_rate": 4.774e-07, + "loss": 0.0002, + "ppl": 0.007415771484375, + "reward": 0.9757578372955322, + "reward_std": 0.0003532619448378682, + "rewards/perpo_ocr_edit_distance_reward": 0.9757578372955322, "step": 226, "temperature": 0.9 }, { - "advantages": -5.790165751307086e-07, - "completion_length": 1238.5, - "delta_ref_entropy_loss": 0.023681640625, - "delta_ref_ppl": -0.011932373046875, - "entropy_loss": -0.15869140625, - "epoch": 0.0908, - "grad_norm": 0.9050388667855312, - "k1_kl": 0.011871337890625, - "k3_kl": 0.006561279296875, - "kimi_kl": 0.0104827880859375, - "learning_rate": 4.546e-07, - "loss": 0.0003, - "ppl": 0.0723876953125, - "reward": 0.86997851729393, - "reward_std": 0.09361708909273148, - "rewards/perpo_ocr_edit_distance_reward": 0.8699785470962524, + "advantages": -0.00012759651872329414, + "completion_length": 836.0, + "delta_ref_entropy_loss": 0.005706787109375, + "delta_ref_ppl": -0.001708984375, + "entropy_loss": -0.01116943359375, + "epoch": 0.0454, + "grad_norm": 0.2991980234470077, + "k1_kl": 0.001708984375, + "k3_kl": 0.0003452301025390625, + "kimi_kl": 0.0003948211669921875, + "learning_rate": 4.773e-07, + "loss": 0.0001, + "ppl": 0.00384521484375, + "reward": 0.9773914217948914, + "reward_std": 0.0005673120613209903, + "rewards/perpo_ocr_edit_distance_reward": 0.9773915410041809, "step": 227, "temperature": 0.9 }, { - "advantages": -6.92265426494032e-06, - "completion_length": 924.5, - "delta_ref_entropy_loss": 0.02288818359375, - "delta_ref_ppl": -0.011627197265625, - "entropy_loss": -0.0482177734375, - "epoch": 0.0912, - "grad_norm": 0.6582632332268742, - "k1_kl": 0.01165771484375, - "k3_kl": 0.00609588623046875, - "kimi_kl": 0.009521484375, - "learning_rate": 4.544e-07, - "loss": 0.0003, - "ppl": 0.024566650390625, - "reward": 0.8864035606384277, - "reward_std": 0.13202398095745593, - "rewards/perpo_ocr_edit_distance_reward": 0.8864035904407501, + "advantages": -8.259501555585302e-06, + "completion_length": 1204.0, + "delta_ref_entropy_loss": 0.006805419921875, + "delta_ref_ppl": -0.004425048828125, + "entropy_loss": -0.0303955078125, + "epoch": 0.0456, + "grad_norm": 1.0499567576061095, + "k1_kl": 0.00445556640625, + "k3_kl": 0.0025787353515625, + "kimi_kl": 0.00531005859375, + "learning_rate": 4.772e-07, + "loss": 0.0001, + "ppl": 0.0140380859375, + "reward": 0.9583025574684143, + "reward_std": 0.0081337234005332, + "rewards/perpo_ocr_edit_distance_reward": 0.9583025574684143, "step": 228, "temperature": 0.9 }, { - "advantages": -0.0003120132860203739, - "completion_length": 389.0, - "delta_ref_entropy_loss": 0.03253173828125, - "delta_ref_ppl": -0.0205078125, - "entropy_loss": -0.04339599609375, - "epoch": 0.0916, - "grad_norm": 0.2921186471462249, - "k1_kl": 0.0205078125, - "k3_kl": 0.009735107421875, - "kimi_kl": 0.01739501953125, - "learning_rate": 4.542e-07, - "loss": 0.0007, - "ppl": 0.016326904296875, - "reward": 0.9910625219345093, - "reward_std": 0.0002543460577726364, - "rewards/perpo_ocr_edit_distance_reward": 0.991062581539154, + "advantages": -1.884358425741084e-05, + "completion_length": 557.0, + "delta_ref_entropy_loss": 0.01953125, + "delta_ref_ppl": -0.023681640625, + "entropy_loss": -0.0303955078125, + "epoch": 0.0458, + "grad_norm": 0.9686067164230776, + "k1_kl": 0.0238037109375, + "k3_kl": 0.014404296875, + "kimi_kl": 0.02978515625, + "learning_rate": 4.771e-07, + "loss": 0.0006, + "ppl": 0.0159912109375, + "reward": 0.9853621125221252, + "reward_std": 0.0026101896073669195, + "rewards/perpo_ocr_edit_distance_reward": 0.9853622317314148, "step": 229, "temperature": 0.9 }, { - "advantages": -8.238213922595605e-05, - "completion_length": 646.5, - "delta_ref_entropy_loss": 0.020751953125, - "delta_ref_ppl": -0.012451171875, - "entropy_loss": -0.02508544921875, - "epoch": 0.092, - "grad_norm": 0.33266684525487944, - "k1_kl": 0.012481689453125, - "k3_kl": 0.00579071044921875, - "kimi_kl": 0.010528564453125, - "learning_rate": 4.54e-07, - "loss": 0.0003, - "ppl": 0.008270263671875, - "reward": 0.9911216199398041, - "reward_std": 0.005537450371775776, - "rewards/perpo_ocr_edit_distance_reward": 0.9911217093467712, + "advantages": -8.20841160020791e-06, + "completion_length": 222.0, + "delta_ref_entropy_loss": 0.0537109375, + "delta_ref_ppl": -0.049560546875, + "entropy_loss": -0.1015625, + "epoch": 0.046, + "grad_norm": 2.2031984718209627, + "k1_kl": 0.049560546875, + "k3_kl": 0.02197265625, + "kimi_kl": 0.034912109375, + "learning_rate": 4.769999999999999e-07, + "loss": 0.0009, + "ppl": 0.059326171875, + "reward": 0.9545519948005676, + "reward_std": 0.00197751191444695, + "rewards/perpo_ocr_edit_distance_reward": 0.9545520544052124, "step": 230, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 295.0, - "delta_ref_entropy_loss": 0.02191162109375, - "delta_ref_ppl": -0.01165771484375, - "entropy_loss": -0.01446533203125, - "epoch": 0.0924, - "grad_norm": 0.013702602243139453, - "k1_kl": 0.01171875, - "k3_kl": 0.00446319580078125, - "kimi_kl": 0.0066375732421875, - "learning_rate": 4.5379999999999995e-07, - "loss": 0.0005, - "ppl": 0.003021240234375, - "reward": 0.9992029368877411, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9992029964923859, + "advantages": -6.696156196994707e-05, + "completion_length": 326.0, + "delta_ref_entropy_loss": 0.0244140625, + "delta_ref_ppl": -0.029541015625, + "entropy_loss": -0.03369140625, + "epoch": 0.0462, + "grad_norm": 1.382213185775613, + "k1_kl": 0.0294189453125, + "k3_kl": 0.0167236328125, + "kimi_kl": 0.0279541015625, + "learning_rate": 4.768999999999999e-07, + "loss": 0.0007, + "ppl": 0.01708984375, + "reward": 0.9808425903320312, + "reward_std": 0.0009174183942377567, + "rewards/perpo_ocr_edit_distance_reward": 0.980842649936676, "step": 231, "temperature": 0.9 }, { - "advantages": -1.9584384336468474e-07, - "completion_length": 228.5, - "delta_ref_entropy_loss": 0.020647048950195312, - "delta_ref_ppl": -0.0350341796875, - "entropy_loss": -0.1002197265625, - "epoch": 0.0928, - "grad_norm": 2.002456497736413, - "k1_kl": 0.0350341796875, - "k3_kl": 0.02142333984375, - "kimi_kl": 0.06298828125, - "learning_rate": 4.536e-07, - "loss": 0.0009, - "ppl": 0.04620361328125, - "reward": 0.8164941370487213, - "reward_std": 0.04122406058013439, - "rewards/perpo_ocr_edit_distance_reward": 0.8164941668510437, + "advantages": -8.176054689101875e-05, + "completion_length": 786.0, + "delta_ref_entropy_loss": 0.0103759765625, + "delta_ref_ppl": -0.017333984375, + "entropy_loss": -0.031494140625, + "epoch": 0.0464, + "grad_norm": 0.6489220694347378, + "k1_kl": 0.017333984375, + "k3_kl": 0.01153564453125, + "kimi_kl": 0.026123046875, + "learning_rate": 4.768e-07, + "loss": 0.0005, + "ppl": 0.0140380859375, + "reward": 0.9728274345397949, + "reward_std": 0.0010454582516103983, + "rewards/perpo_ocr_edit_distance_reward": 0.9728275537490845, "step": 232, "temperature": 0.9 }, { - "advantages": -5.820819433211e-05, - "completion_length": 873.0, - "delta_ref_entropy_loss": 0.019287109375, - "delta_ref_ppl": -0.011932373046875, - "entropy_loss": -0.02911376953125, - "epoch": 0.0932, - "grad_norm": 0.6167293611909473, - "k1_kl": 0.011932373046875, - "k3_kl": 0.006134033203125, - "kimi_kl": 0.0101318359375, - "learning_rate": 4.534e-07, + "advantages": -3.5988436138723046e-05, + "completion_length": 615.0, + "delta_ref_entropy_loss": 0.03125, + "delta_ref_ppl": -0.015869140625, + "entropy_loss": -0.05126953125, + "epoch": 0.0466, + "grad_norm": 0.6124585619638575, + "k1_kl": 0.015869140625, + "k3_kl": 0.006622314453125, + "kimi_kl": 0.00909423828125, + "learning_rate": 4.767e-07, "loss": 0.0003, - "ppl": 0.0119476318359375, - "reward": 0.9408008754253387, - "reward_std": 0.005681859503965825, - "rewards/perpo_ocr_edit_distance_reward": 0.9408009350299835, + "ppl": 0.023681640625, + "reward": 0.9784474968910217, + "reward_std": 0.000609853770583868, + "rewards/perpo_ocr_edit_distance_reward": 0.9784475564956665, "step": 233, "temperature": 0.9 }, { - "advantages": -1.729386272586453e-05, - "completion_length": 301.5, - "delta_ref_entropy_loss": 0.02197265625, - "delta_ref_ppl": -0.03216552734375, - "entropy_loss": -0.072265625, - "epoch": 0.0936, - "grad_norm": 1.0597387950619164, - "k1_kl": 0.032196044921875, - "k3_kl": 0.02020263671875, - "kimi_kl": 0.0621337890625, - "learning_rate": 4.5319999999999996e-07, - "loss": 0.0008, - "ppl": 0.042633056640625, - "reward": 0.9804576933383942, - "reward_std": 0.04245276235451456, - "rewards/perpo_ocr_edit_distance_reward": 0.9804577231407166, + "advantages": 5.6837288866518065e-06, + "completion_length": 794.0, + "delta_ref_entropy_loss": 0.0267333984375, + "delta_ref_ppl": -0.01446533203125, + "entropy_loss": -0.09716796875, + "epoch": 0.0468, + "grad_norm": 2.5657718875629327, + "k1_kl": 0.01434326171875, + "k3_kl": 0.006378173828125, + "kimi_kl": 0.0093994140625, + "learning_rate": 4.766e-07, + "loss": 0.0002, + "ppl": 0.044189453125, + "reward": 0.9330213665962219, + "reward_std": 0.007401312235742807, + "rewards/perpo_ocr_edit_distance_reward": 0.9330214262008667, "step": 234, "temperature": 0.9 }, { - "advantages": -6.088188911235193e-06, - "completion_length": 961.5, - "delta_ref_entropy_loss": 0.04052734375, + "advantages": 3.099441755693988e-06, + "completion_length": 669.0, + "delta_ref_entropy_loss": 0.0225830078125, "delta_ref_ppl": -0.017333984375, - "entropy_loss": -0.17333984375, - "epoch": 0.094, - "grad_norm": 1.4693908729560996, - "k1_kl": 0.017364501953125, - "k3_kl": 0.00823974609375, - "kimi_kl": 0.014923095703125, - "learning_rate": 4.53e-07, - "loss": 0.0003, - "ppl": 0.100830078125, - "reward": 0.8966382741928101, - "reward_std": 0.011186235351487994, - "rewards/perpo_ocr_edit_distance_reward": 0.8966382741928101, + "entropy_loss": -0.07373046875, + "epoch": 0.047, + "grad_norm": 0.8252356747955554, + "k1_kl": 0.017333984375, + "k3_kl": 0.00958251953125, + "kimi_kl": 0.0234375, + "learning_rate": 4.7649999999999996e-07, + "loss": 0.0004, + "ppl": 0.0361328125, + "reward": 0.9335870146751404, + "reward_std": 0.01366392895579338, + "rewards/perpo_ocr_edit_distance_reward": 0.9335869550704956, "step": 235, "temperature": 0.9 }, { - "advantages": -8.52005832712166e-05, - "completion_length": 550.0, - "delta_ref_entropy_loss": 0.01495361328125, - "delta_ref_ppl": -0.0055999755859375, - "entropy_loss": -0.009857177734375, - "epoch": 0.0944, - "grad_norm": 0.10547404312320102, - "k1_kl": 0.0055999755859375, - "k3_kl": 0.00179290771484375, - "kimi_kl": 0.002849578857421875, - "learning_rate": 4.528e-07, - "loss": 0.0002, - "ppl": 0.002780914306640625, - "reward": 0.9986402094364166, - "reward_std": 0.0001248976623173803, - "rewards/perpo_ocr_edit_distance_reward": 0.998640239238739, + "advantages": -3.899847069988027e-05, + "completion_length": 465.0, + "delta_ref_entropy_loss": 0.0201416015625, + "delta_ref_ppl": -0.0147705078125, + "entropy_loss": -0.037109375, + "epoch": 0.0472, + "grad_norm": 0.8856841866228946, + "k1_kl": 0.01470947265625, + "k3_kl": 0.00726318359375, + "kimi_kl": 0.0142822265625, + "learning_rate": 4.7639999999999995e-07, + "loss": 0.0003, + "ppl": 0.0169677734375, + "reward": 0.9787198305130005, + "reward_std": 0.002083082217723131, + "rewards/perpo_ocr_edit_distance_reward": 0.9787198901176453, "step": 236, "temperature": 0.9 }, { - "advantages": -4.233633080730215e-05, - "completion_length": 400.5, - "delta_ref_entropy_loss": 0.02392578125, - "delta_ref_ppl": -0.01513671875, - "entropy_loss": -0.05841064453125, - "epoch": 0.0948, - "grad_norm": 0.9169497075120134, - "k1_kl": 0.015106201171875, - "k3_kl": 0.008270263671875, - "kimi_kl": 0.01165771484375, - "learning_rate": 4.5259999999999996e-07, - "loss": 0.0004, - "ppl": 0.0242919921875, - "reward": 0.9771941304206848, - "reward_std": 0.004520514863543212, - "rewards/perpo_ocr_edit_distance_reward": 0.977194219827652, + "advantages": -4.5299530029296875e-06, + "completion_length": 352.0, + "delta_ref_entropy_loss": 0.01116943359375, + "delta_ref_ppl": -0.0225830078125, + "entropy_loss": -0.038330078125, + "epoch": 0.0474, + "grad_norm": 1.2304397886600333, + "k1_kl": 0.022705078125, + "k3_kl": 0.01422119140625, + "kimi_kl": 0.039794921875, + "learning_rate": 4.763e-07, + "loss": 0.0006, + "ppl": 0.017578125, + "reward": 0.9501755237579346, + "reward_std": 0.007356105837970972, + "rewards/perpo_ocr_edit_distance_reward": 0.9501755833625793, "step": 237, "temperature": 0.9 }, { - "advantages": -1.3078963092993945e-05, - "completion_length": 401.0, - "delta_ref_entropy_loss": 0.025970458984375, - "delta_ref_ppl": -0.024749755859375, - "entropy_loss": -0.034210205078125, - "epoch": 0.0952, - "grad_norm": 0.9600187770033741, - "k1_kl": 0.0248870849609375, - "k3_kl": 0.014530181884765625, - "kimi_kl": 0.041339874267578125, - "learning_rate": 4.524e-07, - "loss": 0.0006, - "ppl": 0.01599884033203125, - "reward": 0.9395321309566498, - "reward_std": 0.00027525401674211025, - "rewards/perpo_ocr_edit_distance_reward": 0.9395321607589722, + "advantages": -1.004764044409967e-06, + "completion_length": 780.0, + "delta_ref_entropy_loss": 0.0142822265625, + "delta_ref_ppl": -0.019287109375, + "entropy_loss": -0.0400390625, + "epoch": 0.0476, + "grad_norm": 3.378059110564146, + "k1_kl": 0.019287109375, + "k3_kl": 0.012451171875, + "kimi_kl": 0.019775390625, + "learning_rate": 4.762e-07, + "loss": 0.0005, + "ppl": 0.0211181640625, + "reward": 0.8799481391906738, + "reward_std": 0.08555279672145844, + "rewards/perpo_ocr_edit_distance_reward": 0.8799481987953186, "step": 238, "temperature": 0.9 }, { - "advantages": -2.844631751486304e-05, - "completion_length": 802.5, - "delta_ref_entropy_loss": 0.036407470703125, - "delta_ref_ppl": -0.01715087890625, - "entropy_loss": -0.153076171875, - "epoch": 0.0956, - "grad_norm": 0.650710944813989, - "k1_kl": 0.017425537109375, - "k3_kl": 0.0076904296875, - "kimi_kl": 0.0140380859375, - "learning_rate": 4.522e-07, - "loss": 0.0003, - "ppl": 0.0802001953125, - "reward": 0.895981103181839, - "reward_std": 0.06588506512343884, - "rewards/perpo_ocr_edit_distance_reward": 0.8959811627864838, + "advantages": -4.427773774295929e-07, + "completion_length": 581.0, + "delta_ref_entropy_loss": 0.060546875, + "delta_ref_ppl": -0.0286865234375, + "entropy_loss": -0.1884765625, + "epoch": 0.0478, + "grad_norm": 4.494684695620358, + "k1_kl": 0.0284423828125, + "k3_kl": 0.0108642578125, + "kimi_kl": 0.0164794921875, + "learning_rate": 4.761e-07, + "loss": 0.0004, + "ppl": 0.09765625, + "reward": 0.824885904788971, + "reward_std": 0.05844740942120552, + "rewards/perpo_ocr_edit_distance_reward": 0.8248859643936157, "step": 239, "temperature": 0.9 }, { - "advantages": 4.019055268145166e-06, - "completion_length": 676.5, - "delta_ref_entropy_loss": 0.0218505859375, - "delta_ref_ppl": -0.0164031982421875, - "entropy_loss": -0.03228759765625, - "epoch": 0.096, - "grad_norm": 0.4445139240417506, - "k1_kl": 0.0164031982421875, - "k3_kl": 0.00908660888671875, - "kimi_kl": 0.019561767578125, - "learning_rate": 4.5199999999999997e-07, - "loss": 0.0004, - "ppl": 0.0134429931640625, - "reward": 0.9968936741352081, - "reward_std": 0.0014850420993752778, - "rewards/perpo_ocr_edit_distance_reward": 0.9968937635421753, + "advantages": -4.2574748704282683e-07, + "completion_length": 310.0, + "delta_ref_entropy_loss": 0.0186767578125, + "delta_ref_ppl": -0.0242919921875, + "entropy_loss": -0.06640625, + "epoch": 0.048, + "grad_norm": 1.516627188505278, + "k1_kl": 0.024169921875, + "k3_kl": 0.01287841796875, + "kimi_kl": 0.0223388671875, + "learning_rate": 4.76e-07, + "loss": 0.0005, + "ppl": 0.0291748046875, + "reward": 0.8649708032608032, + "reward_std": 0.03970362991094589, + "rewards/perpo_ocr_edit_distance_reward": 0.864970862865448, "step": 240, "temperature": 0.9 }, { - "advantages": -7.85078350418189e-06, - "completion_length": 705.0, - "delta_ref_entropy_loss": 0.01824951171875, - "delta_ref_ppl": -0.01220703125, - "entropy_loss": -0.01605224609375, - "epoch": 0.0964, - "grad_norm": 0.4505213353853611, - "k1_kl": 0.01220703125, - "k3_kl": 0.007080078125, - "kimi_kl": 0.0142822265625, - "learning_rate": 4.5179999999999996e-07, - "loss": 0.0003, - "ppl": 0.00714111328125, - "reward": 0.9577051401138306, - "reward_std": 0.01205615553772077, - "rewards/perpo_ocr_edit_distance_reward": 0.9577051997184753, + "advantages": -3.2612257200526074e-05, + "completion_length": 730.0, + "delta_ref_entropy_loss": 0.0147705078125, + "delta_ref_ppl": -0.0101318359375, + "entropy_loss": -0.0181884765625, + "epoch": 0.0482, + "grad_norm": 0.31147378916542867, + "k1_kl": 0.0101318359375, + "k3_kl": 0.00537109375, + "kimi_kl": 0.007110595703125, + "learning_rate": 4.7589999999999997e-07, + "loss": 0.0002, + "ppl": 0.0052490234375, + "reward": 0.9831122159957886, + "reward_std": 0.0012062618043273687, + "rewards/perpo_ocr_edit_distance_reward": 0.9831122159957886, "step": 241, "temperature": 0.9 }, { - "advantages": -3.0683620934723876e-05, - "completion_length": 628.0, - "delta_ref_entropy_loss": 0.0211181640625, - "delta_ref_ppl": -0.019683837890625, - "entropy_loss": -0.03472900390625, - "epoch": 0.0968, - "grad_norm": 0.6926111158874991, - "k1_kl": 0.019683837890625, - "k3_kl": 0.012359619140625, - "kimi_kl": 0.0301513671875, - "learning_rate": 4.516e-07, - "loss": 0.0005, - "ppl": 0.015045166015625, - "reward": 0.983815610408783, - "reward_std": 0.010191055538598448, - "rewards/perpo_ocr_edit_distance_reward": 0.9838157296180725, + "advantages": -2.1083014871692285e-05, + "completion_length": 905.0, + "delta_ref_entropy_loss": 0.01312255859375, + "delta_ref_ppl": -0.008056640625, + "entropy_loss": -0.029052734375, + "epoch": 0.0484, + "grad_norm": 0.49595372175083574, + "k1_kl": 0.00799560546875, + "k3_kl": 0.004058837890625, + "kimi_kl": 0.005950927734375, + "learning_rate": 4.7579999999999996e-07, + "loss": 0.0002, + "ppl": 0.01171875, + "reward": 0.9845479726791382, + "reward_std": 0.0011118358233943582, + "rewards/perpo_ocr_edit_distance_reward": 0.9845479726791382, "step": 242, "temperature": 0.9 }, { - "advantages": -5.4350923164747655e-05, - "completion_length": 695.0, - "delta_ref_entropy_loss": 0.018890380859375, - "delta_ref_ppl": -0.0100860595703125, - "entropy_loss": -0.023681640625, - "epoch": 0.0972, - "grad_norm": 0.39535053519167934, - "k1_kl": 0.0100860595703125, - "k3_kl": 0.0045166015625, - "kimi_kl": 0.007293701171875, - "learning_rate": 4.514e-07, + "advantages": -2.367155957472278e-06, + "completion_length": 413.0, + "delta_ref_entropy_loss": 0.044921875, + "delta_ref_ppl": -0.01531982421875, + "entropy_loss": -0.0478515625, + "epoch": 0.0486, + "grad_norm": 1.027823521419575, + "k1_kl": 0.015380859375, + "k3_kl": 0.004150390625, + "kimi_kl": 0.0052490234375, + "learning_rate": 4.757e-07, "loss": 0.0002, - "ppl": 0.00982666015625, - "reward": 0.9981549978256226, - "reward_std": 0.0003418362757656723, - "rewards/perpo_ocr_edit_distance_reward": 0.998155027627945, + "ppl": 0.0162353515625, + "reward": 0.9461151957511902, + "reward_std": 0.014307543635368347, + "rewards/perpo_ocr_edit_distance_reward": 0.9461153149604797, "step": 243, "temperature": 0.9 }, { - "advantages": -0.00010526606365601765, - "completion_length": 999.5, - "delta_ref_entropy_loss": 0.012847900390625, - "delta_ref_ppl": -0.0044708251953125, - "entropy_loss": -0.01971435546875, - "epoch": 0.0976, - "grad_norm": 0.3083267313236415, - "k1_kl": 0.00446319580078125, - "k3_kl": 0.001384735107421875, - "kimi_kl": 0.0020580291748046875, - "learning_rate": 4.5119999999999996e-07, - "loss": 0.0002, - "ppl": 0.0069580078125, - "reward": 0.9987558126449585, - "reward_std": 0.0003390357451280579, - "rewards/perpo_ocr_edit_distance_reward": 0.9987558424472809, + "advantages": -9.976114961318672e-05, + "completion_length": 423.0, + "delta_ref_entropy_loss": 0.0118408203125, + "delta_ref_ppl": -0.0174560546875, + "entropy_loss": -0.033447265625, + "epoch": 0.0488, + "grad_norm": 0.9841845104122952, + "k1_kl": 0.0174560546875, + "k3_kl": 0.0093994140625, + "kimi_kl": 0.01806640625, + "learning_rate": 4.756e-07, + "loss": 0.0005, + "ppl": 0.01251220703125, + "reward": 0.9773403406143188, + "reward_std": 0.0009241445804946125, + "rewards/perpo_ocr_edit_distance_reward": 0.9773404598236084, "step": 244, "temperature": 0.9 }, { - "advantages": -4.461833668756299e-06, - "completion_length": 529.0, - "delta_ref_entropy_loss": 0.025634765625, - "delta_ref_ppl": -0.012054443359375, - "entropy_loss": -0.0274658203125, - "epoch": 0.098, - "grad_norm": 0.924068137721977, - "k1_kl": 0.012054443359375, - "k3_kl": 0.00457763671875, - "kimi_kl": 0.0081329345703125, - "learning_rate": 4.51e-07, - "loss": 0.0002, - "ppl": 0.0107421875, - "reward": 0.9726462662220001, - "reward_std": 0.013215235434472561, - "rewards/perpo_ocr_edit_distance_reward": 0.9726463556289673, + "advantages": -0.00012065683404216543, + "completion_length": 384.0, + "delta_ref_entropy_loss": 0.014892578125, + "delta_ref_ppl": -0.0091552734375, + "entropy_loss": -0.013671875, + "epoch": 0.049, + "grad_norm": 0.5983258863499754, + "k1_kl": 0.00909423828125, + "k3_kl": 0.004150390625, + "kimi_kl": 0.007537841796875, + "learning_rate": 4.7549999999999994e-07, + "loss": 0.0003, + "ppl": 0.003692626953125, + "reward": 0.9843078851699829, + "reward_std": 0.0002528192417230457, + "rewards/perpo_ocr_edit_distance_reward": 0.9843079447746277, "step": 245, "temperature": 0.9 }, { - "advantages": -3.423009729885962e-06, - "completion_length": 988.0, - "delta_ref_entropy_loss": 0.01885986328125, - "delta_ref_ppl": -0.011474609375, - "entropy_loss": -0.0728759765625, - "epoch": 0.0984, - "grad_norm": 1.023342082376756, - "k1_kl": 0.011505126953125, - "k3_kl": 0.0060577392578125, - "kimi_kl": 0.0141448974609375, - "learning_rate": 4.5079999999999993e-07, - "loss": 0.0002, - "ppl": 0.0408935546875, - "reward": 0.6079402342438698, - "reward_std": 0.05848122062161565, - "rewards/perpo_ocr_edit_distance_reward": 0.6079402640461922, + "advantages": -0.0005960464477539062, + "completion_length": 199.0, + "delta_ref_entropy_loss": 0.0286865234375, + "delta_ref_ppl": -0.01251220703125, + "entropy_loss": -0.0093994140625, + "epoch": 0.0492, + "grad_norm": 0.0008881616491838757, + "k1_kl": 0.0125732421875, + "k3_kl": 0.006134033203125, + "kimi_kl": 0.0108642578125, + "learning_rate": 4.754e-07, + "loss": 0.0008, + "ppl": 0.0015106201171875, + "reward": 0.9795737266540527, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9795737862586975, "step": 246, "temperature": 0.9 }, { - "advantages": -3.5549913945942535e-07, - "completion_length": 768.0, - "delta_ref_entropy_loss": 0.03173828125, - "delta_ref_ppl": -0.02362060546875, - "entropy_loss": -0.06884765625, - "epoch": 0.0988, - "grad_norm": 1.0111979083435465, - "k1_kl": 0.0235595703125, - "k3_kl": 0.012939453125, - "kimi_kl": 0.02978515625, - "learning_rate": 4.5059999999999997e-07, - "loss": 0.0005, - "ppl": 0.034912109375, - "reward": 0.8860160708427429, - "reward_std": 0.0726259732618928, - "rewards/perpo_ocr_edit_distance_reward": 0.8860161304473877, - "step": 247, - "temperature": 0.9 - }, - { - "advantages": -6.565026268390284e-06, - "completion_length": 392.5, - "delta_ref_entropy_loss": 0.02496337890625, - "delta_ref_ppl": -0.04241943359375, - "entropy_loss": -0.11474609375, - "epoch": 0.0992, - "grad_norm": 4.334366306053073, - "k1_kl": 0.0423583984375, - "k3_kl": 0.035919189453125, - "kimi_kl": 0.11407470703125, - "learning_rate": 4.504e-07, - "loss": 0.0014, - "ppl": 0.0546875, - "reward": 0.6608403623104095, - "reward_std": 0.08070758334361017, - "rewards/perpo_ocr_edit_distance_reward": 0.6608404368162155, + "advantages": -6.120545731391758e-05, + "completion_length": 608.0, + "delta_ref_entropy_loss": 0.026611328125, + "delta_ref_ppl": -0.0145263671875, + "entropy_loss": -0.0177001953125, + "epoch": 0.0494, + "grad_norm": 0.5008366866161029, + "k1_kl": 0.0145263671875, + "k3_kl": 0.006317138671875, + "kimi_kl": 0.011474609375, + "learning_rate": 4.753e-07, + "loss": 0.0003, + "ppl": 0.007232666015625, + "reward": 0.9593369960784912, + "reward_std": 0.001152027864009142, + "rewards/perpo_ocr_edit_distance_reward": 0.959337055683136, + "step": 247, + "temperature": 0.9 + }, + { + "advantages": -5.245209194981726e-06, + "completion_length": 610.0, + "delta_ref_entropy_loss": 0.030517578125, + "delta_ref_ppl": -0.02978515625, + "entropy_loss": -0.0693359375, + "epoch": 0.0496, + "grad_norm": 1.947219574175417, + "k1_kl": 0.0299072265625, + "k3_kl": 0.016845703125, + "kimi_kl": 0.03662109375, + "learning_rate": 4.7519999999999997e-07, + "loss": 0.0007, + "ppl": 0.032958984375, + "reward": 0.8637540936470032, + "reward_std": 0.02106575109064579, + "rewards/perpo_ocr_edit_distance_reward": 0.8637542724609375, "step": 248, "temperature": 0.9 }, { - "advantages": -1.4483928907793597e-05, - "completion_length": 708.5, - "delta_ref_entropy_loss": 0.024169921875, - "delta_ref_ppl": -0.0191650390625, - "entropy_loss": -0.06689453125, - "epoch": 0.0996, - "grad_norm": 0.8134854072023933, - "k1_kl": 0.0191650390625, - "k3_kl": 0.01092529296875, - "kimi_kl": 0.018463134765625, - "learning_rate": 4.502e-07, - "loss": 0.0005, - "ppl": 0.0317535400390625, - "reward": 0.8554463982582092, - "reward_std": 0.011039601522497833, - "rewards/perpo_ocr_edit_distance_reward": 0.855446457862854, + "advantages": -3.823212409770349e-06, + "completion_length": 755.0, + "delta_ref_entropy_loss": 0.0111083984375, + "delta_ref_ppl": -0.004913330078125, + "entropy_loss": -0.0186767578125, + "epoch": 0.0498, + "grad_norm": 0.4260971997691509, + "k1_kl": 0.004913330078125, + "k3_kl": 0.0014495849609375, + "kimi_kl": 0.001922607421875, + "learning_rate": 4.751e-07, + "loss": 0.0001, + "ppl": 0.00750732421875, + "reward": 0.8057220578193665, + "reward_std": 0.008814733475446701, + "rewards/perpo_ocr_edit_distance_reward": 0.8057221174240112, "step": 249, "temperature": 0.9 }, { - "advantages": -2.3671560768434574e-05, - "completion_length": 563.0, - "delta_ref_entropy_loss": 0.017608642578125, - "delta_ref_ppl": -0.0157623291015625, - "entropy_loss": -0.031982421875, - "epoch": 0.1, - "grad_norm": 0.6594317582968713, - "k1_kl": 0.0157623291015625, - "k3_kl": 0.00806427001953125, - "kimi_kl": 0.0162811279296875, - "learning_rate": 4.5e-07, - "loss": 0.0003, - "ppl": 0.0155029296875, - "reward": 0.9879007041454315, - "reward_std": 0.015357919153757393, - "rewards/perpo_ocr_edit_distance_reward": 0.9879007637500763, + "advantages": -2.6558127501630224e-05, + "completion_length": 631.0, + "delta_ref_entropy_loss": 0.0223388671875, + "delta_ref_ppl": -0.01226806640625, + "entropy_loss": -0.033203125, + "epoch": 0.05, + "grad_norm": 0.6787055843285116, + "k1_kl": 0.01226806640625, + "k3_kl": 0.005401611328125, + "kimi_kl": 0.00787353515625, + "learning_rate": 4.7499999999999995e-07, + "loss": 0.0002, + "ppl": 0.01422119140625, + "reward": 0.9844616055488586, + "reward_std": 0.0005416158819571137, + "rewards/perpo_ocr_edit_distance_reward": 0.9844616651535034, "step": 250, "temperature": 0.9 }, { - "advantages": -4.74602010029912e-05, - "completion_length": 970.0, - "delta_ref_entropy_loss": 0.028076171875, - "delta_ref_ppl": -0.0128173828125, - "entropy_loss": -0.0753173828125, - "epoch": 0.1004, - "grad_norm": 0.934795873465931, - "k1_kl": 0.0128173828125, - "k3_kl": 0.005584716796875, - "kimi_kl": 0.00848388671875, - "learning_rate": 4.4979999999999996e-07, - "loss": 0.0003, - "ppl": 0.037841796875, - "reward": 0.9503952860832214, - "reward_std": 0.08530453435378149, - "rewards/perpo_ocr_edit_distance_reward": 0.9503953456878662, + "advantages": 0.0, + "completion_length": 1008.0, + "delta_ref_entropy_loss": 0.0478515625, + "delta_ref_ppl": -0.0185546875, + "entropy_loss": -0.220703125, + "epoch": 0.0502, + "grad_norm": 1.394176465051501, + "k1_kl": 0.0184326171875, + "k3_kl": 0.0087890625, + "kimi_kl": 0.01025390625, + "learning_rate": 4.7489999999999994e-07, + "loss": 0.0004, + "ppl": 0.11572265625, + "reward": 0.35639825463294983, + "reward_std": 0.0019641555845737457, + "rewards/perpo_ocr_edit_distance_reward": 0.35639825463294983, "step": 251, "temperature": 0.9 }, { - "advantages": -2.9717173219978577e-06, - "completion_length": 489.0, - "delta_ref_entropy_loss": 0.02874755859375, - "delta_ref_ppl": -0.014404296875, - "entropy_loss": -0.060394287109375, - "epoch": 0.1008, - "grad_norm": 0.606766737278751, - "k1_kl": 0.014404296875, - "k3_kl": 0.00867462158203125, - "kimi_kl": 0.01519775390625, - "learning_rate": 4.496e-07, - "loss": 0.0004, - "ppl": 0.0325164794921875, - "reward": 0.957019567489624, - "reward_std": 0.007829940877854824, - "rewards/perpo_ocr_edit_distance_reward": 0.9570196270942688, + "advantages": -2.3228782083606347e-05, + "completion_length": 402.0, + "delta_ref_entropy_loss": 0.033447265625, + "delta_ref_ppl": -0.021484375, + "entropy_loss": -0.0634765625, + "epoch": 0.0504, + "grad_norm": 0.9146828924900015, + "k1_kl": 0.021484375, + "k3_kl": 0.01239013671875, + "kimi_kl": 0.0213623046875, + "learning_rate": 4.748e-07, + "loss": 0.0005, + "ppl": 0.02880859375, + "reward": 0.9759218692779541, + "reward_std": 0.0013660675613209605, + "rewards/perpo_ocr_edit_distance_reward": 0.9759219288825989, "step": 252, "temperature": 0.9 }, { - "advantages": 3.8317273265420226e-07, - "completion_length": 596.0, - "delta_ref_entropy_loss": 0.017822265625, - "delta_ref_ppl": -0.00970458984375, - "entropy_loss": -0.02728271484375, - "epoch": 0.1012, - "grad_norm": 1.415232820906998, - "k1_kl": 0.0096435546875, - "k3_kl": 0.0056304931640625, - "kimi_kl": 0.010101318359375, - "learning_rate": 4.494e-07, - "loss": 0.0002, - "ppl": 0.013153076171875, - "reward": 0.9889457821846008, - "reward_std": 0.012122239626478404, - "rewards/perpo_ocr_edit_distance_reward": 0.9889458119869232, + "advantages": -2.384185791015625e-07, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.0184326171875, + "delta_ref_ppl": -0.01263427734375, + "entropy_loss": -0.28515625, + "epoch": 0.0506, + "grad_norm": 6.8063738921594075, + "k1_kl": 0.01239013671875, + "k3_kl": 0.0067138671875, + "kimi_kl": 0.0130615234375, + "learning_rate": 4.747e-07, + "loss": 0.0003, + "ppl": 0.15625, + "reward": 0.6968095302581787, + "reward_std": 0.17925064265727997, + "rewards/perpo_ocr_edit_distance_reward": 0.6968096494674683, "step": 253, "temperature": 0.9 }, { - "advantages": 7.01759594790019e-05, - "completion_length": 1401.5, - "delta_ref_entropy_loss": 0.021453857421875, - "delta_ref_ppl": -0.0137176513671875, - "entropy_loss": -0.056365966796875, - "epoch": 0.1016, - "grad_norm": 3.1587559892922483, - "k1_kl": 0.013702392578125, - "k3_kl": 0.0076904296875, - "kimi_kl": 0.01397705078125, - "learning_rate": 4.4919999999999997e-07, - "loss": 0.0002, - "ppl": 0.02850341796875, - "reward": 0.7876321375370026, - "reward_std": 0.09470455668270006, - "rewards/perpo_ocr_edit_distance_reward": 0.7876321375370026, + "advantages": -0.00010621548426570371, + "completion_length": 720.0, + "delta_ref_entropy_loss": 0.0164794921875, + "delta_ref_ppl": -0.005401611328125, + "entropy_loss": -0.01385498046875, + "epoch": 0.0508, + "grad_norm": 0.4626660480592606, + "k1_kl": 0.00537109375, + "k3_kl": 0.001068115234375, + "kimi_kl": 0.0013275146484375, + "learning_rate": 4.746e-07, + "loss": 0.0001, + "ppl": 0.00311279296875, + "reward": 0.9819371700286865, + "reward_std": 0.0001405343209626153, + "rewards/perpo_ocr_edit_distance_reward": 0.9819371700286865, "step": 254, "temperature": 0.9 }, { - "advantages": -4.207875100803449e-05, - "completion_length": 265.5, - "delta_ref_entropy_loss": 0.058349609375, - "delta_ref_ppl": -0.035888671875, - "entropy_loss": -0.229736328125, - "epoch": 0.102, - "grad_norm": 1.5892529945122502, - "k1_kl": 0.0361328125, - "k3_kl": 0.0185546875, - "kimi_kl": 0.0301513671875, - "learning_rate": 4.49e-07, - "loss": 0.0008, - "ppl": 0.128448486328125, - "reward": 0.9015490710735321, - "reward_std": 0.011082398879807442, - "rewards/perpo_ocr_edit_distance_reward": 0.9015491306781769, + "advantages": -0.0005960464477539062, + "completion_length": 82.0, + "delta_ref_entropy_loss": 0.0050048828125, + "delta_ref_ppl": -0.0458984375, + "entropy_loss": -0.0262451171875, + "epoch": 0.051, + "grad_norm": 0.02450240775893683, + "k1_kl": 0.0458984375, + "k3_kl": 0.033447265625, + "kimi_kl": 0.083984375, + "learning_rate": 4.7449999999999997e-07, + "loss": 0.0019, + "ppl": 0.00372314453125, + "reward": 0.9810725450515747, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9810726046562195, "step": 255, "temperature": 0.9 }, { - "advantages": 5.466597940539941e-06, - "completion_length": 623.5, - "delta_ref_entropy_loss": 0.019195556640625, - "delta_ref_ppl": -0.013916015625, - "entropy_loss": -0.022735595703125, - "epoch": 0.1024, - "grad_norm": 0.3932802657267044, - "k1_kl": 0.013885498046875, - "k3_kl": 0.0072479248046875, - "kimi_kl": 0.01727294921875, - "learning_rate": 4.4879999999999994e-07, - "loss": 0.0003, - "ppl": 0.007659912109375, - "reward": 0.9874660074710846, - "reward_std": 0.0005148641939740628, - "rewards/perpo_ocr_edit_distance_reward": 0.987466037273407, + "advantages": -8.988381159724668e-05, + "completion_length": 271.0, + "delta_ref_entropy_loss": 0.005706787109375, + "delta_ref_ppl": -0.018310546875, + "entropy_loss": -0.0196533203125, + "epoch": 0.0512, + "grad_norm": 0.708353288820473, + "k1_kl": 0.018310546875, + "k3_kl": 0.01226806640625, + "kimi_kl": 0.02685546875, + "learning_rate": 4.7439999999999996e-07, + "loss": 0.0006, + "ppl": 0.00567626953125, + "reward": 0.9818421006202698, + "reward_std": 0.00037361274007707834, + "rewards/perpo_ocr_edit_distance_reward": 0.9818421006202698, "step": 256, "temperature": 0.9 }, { - "advantages": -5.560261996606641e-06, - "completion_length": 1546.5, - "delta_ref_entropy_loss": 0.01519775390625, - "delta_ref_ppl": -0.00897216796875, - "entropy_loss": -0.07373046875, - "epoch": 0.1028, - "grad_norm": 1.4166802847400988, - "k1_kl": 0.0089874267578125, - "k3_kl": 0.0089111328125, - "kimi_kl": 0.01348876953125, - "learning_rate": 4.486e-07, + "advantages": -0.00024987972574308515, + "completion_length": 718.0, + "delta_ref_entropy_loss": 0.019287109375, + "delta_ref_ppl": -0.005767822265625, + "entropy_loss": -0.0230712890625, + "epoch": 0.0514, + "grad_norm": 0.8170165454923832, + "k1_kl": 0.00579833984375, + "k3_kl": 0.004180908203125, + "kimi_kl": 0.004547119140625, + "learning_rate": 4.7429999999999995e-07, "loss": 0.0004, - "ppl": 0.03948974609375, - "reward": 0.9429281651973724, - "reward_std": 0.010183615144342184, - "rewards/perpo_ocr_edit_distance_reward": 0.9429282248020172, + "ppl": 0.00909423828125, + "reward": 0.9877747297286987, + "reward_std": 0.0001385339564876631, + "rewards/perpo_ocr_edit_distance_reward": 0.9877748489379883, "step": 257, "temperature": 0.9 }, { - "advantages": -1.5650478275119895e-05, - "completion_length": 673.5, - "delta_ref_entropy_loss": 0.040771484375, - "delta_ref_ppl": -0.025390625, - "entropy_loss": -0.07086181640625, - "epoch": 0.1032, - "grad_norm": 0.9472834193589483, - "k1_kl": 0.025238037109375, - "k3_kl": 0.011871337890625, - "kimi_kl": 0.020416259765625, - "learning_rate": 4.484e-07, - "loss": 0.0005, - "ppl": 0.0331878662109375, - "reward": 0.9487087428569794, - "reward_std": 0.007942517753690481, - "rewards/perpo_ocr_edit_distance_reward": 0.9487087726593018, + "advantages": -1.9073486328125e-06, + "completion_length": 302.0, + "delta_ref_entropy_loss": 0.0213623046875, + "delta_ref_ppl": -0.0164794921875, + "entropy_loss": -0.034912109375, + "epoch": 0.0516, + "grad_norm": 1.371092826724407, + "k1_kl": 0.016357421875, + "k3_kl": 0.00994873046875, + "kimi_kl": 0.0206298828125, + "learning_rate": 4.742e-07, + "loss": 0.0004, + "ppl": 0.017822265625, + "reward": 0.9138448238372803, + "reward_std": 0.01322898082435131, + "rewards/perpo_ocr_edit_distance_reward": 0.9138448238372803, "step": 258, "temperature": 0.9 }, { - "advantages": -5.495974210134591e-05, - "completion_length": 860.0, - "delta_ref_entropy_loss": 0.02166748046875, - "delta_ref_ppl": -0.01220703125, - "entropy_loss": -0.0479736328125, - "epoch": 0.1036, - "grad_norm": 0.7331683576147927, - "k1_kl": 0.012237548828125, - "k3_kl": 0.0070037841796875, - "kimi_kl": 0.013885498046875, - "learning_rate": 4.4819999999999995e-07, - "loss": 0.0003, - "ppl": 0.025604248046875, - "reward": 0.9835996329784393, - "reward_std": 0.013240760774351656, - "rewards/perpo_ocr_edit_distance_reward": 0.9835997223854065, + "advantages": -3.916876812581904e-05, + "completion_length": 373.0, + "delta_ref_entropy_loss": 0.02099609375, + "delta_ref_ppl": -0.0201416015625, + "entropy_loss": -0.032958984375, + "epoch": 0.0518, + "grad_norm": 4.0253501390697375, + "k1_kl": 0.020263671875, + "k3_kl": 0.01300048828125, + "kimi_kl": 0.028564453125, + "learning_rate": 4.741e-07, + "loss": 0.0006, + "ppl": 0.016357421875, + "reward": 0.9827967882156372, + "reward_std": 0.0018568352097645402, + "rewards/perpo_ocr_edit_distance_reward": 0.982796847820282, "step": 259, "temperature": 0.9 }, { - "advantages": -3.610338613846409e-06, - "completion_length": 1085.5, - "delta_ref_entropy_loss": 0.02252197265625, - "delta_ref_ppl": -0.0100555419921875, - "entropy_loss": -0.05047607421875, - "epoch": 0.104, - "grad_norm": 514933.6486947483, - "k1_kl": 0.0100555419921875, - "k3_kl": 256.0034484863281, - "kimi_kl": 0.0499267578125, - "learning_rate": 4.48e-07, - "loss": 10.2498, - "ppl": 0.027557373046875, - "reward": 0.9854201376438141, - "reward_std": 0.0053211969789117575, - "rewards/perpo_ocr_edit_distance_reward": 0.9854201674461365, + "advantages": -1.2738364603137597e-05, + "completion_length": 555.0, + "delta_ref_entropy_loss": 0.0223388671875, + "delta_ref_ppl": -0.0167236328125, + "entropy_loss": -0.042236328125, + "epoch": 0.052, + "grad_norm": 0.9973456989444285, + "k1_kl": 0.0166015625, + "k3_kl": 0.00860595703125, + "kimi_kl": 0.01300048828125, + "learning_rate": 4.7399999999999993e-07, + "loss": 0.0004, + "ppl": 0.0194091796875, + "reward": 0.9023227095603943, + "reward_std": 0.0032427411060780287, + "rewards/perpo_ocr_edit_distance_reward": 0.9023227691650391, "step": 260, "temperature": 0.9 }, { - "advantages": -1.9933496204771473e-05, - "completion_length": 1205.0, - "delta_ref_entropy_loss": 0.0159912109375, - "delta_ref_ppl": -0.011077880859375, - "entropy_loss": -0.02874755859375, - "epoch": 0.1044, - "grad_norm": 0.39031133247235167, - "k1_kl": 0.011077880859375, - "k3_kl": 0.0056304931640625, - "kimi_kl": 0.012054443359375, - "learning_rate": 4.4779999999999997e-07, - "loss": 0.0002, - "ppl": 0.011749267578125, - "reward": 0.9103737473487854, - "reward_std": 0.11954859847901389, - "rewards/perpo_ocr_edit_distance_reward": 0.9103738367557526, + "advantages": -2.8456961445044726e-05, + "completion_length": 698.0, + "delta_ref_entropy_loss": 0.04833984375, + "delta_ref_ppl": -0.0264892578125, + "entropy_loss": -0.15625, + "epoch": 0.0522, + "grad_norm": 1.612318798586422, + "k1_kl": 0.0264892578125, + "k3_kl": 0.01116943359375, + "kimi_kl": 0.018310546875, + "learning_rate": 4.739e-07, + "loss": 0.0005, + "ppl": 0.07861328125, + "reward": 0.9360733032226562, + "reward_std": 0.001994886202737689, + "rewards/perpo_ocr_edit_distance_reward": 0.936073362827301, "step": 261, "temperature": 0.9 }, { - "advantages": -1.9916467635994195e-05, - "completion_length": 614.0, - "delta_ref_entropy_loss": 0.03802490234375, - "delta_ref_ppl": -0.019805908203125, - "entropy_loss": -0.06329345703125, - "epoch": 0.1048, - "grad_norm": 0.730923548831318, - "k1_kl": 0.019805908203125, - "k3_kl": 0.0098114013671875, - "kimi_kl": 0.017730712890625, - "learning_rate": 4.4759999999999996e-07, - "loss": 0.0004, - "ppl": 0.028045654296875, - "reward": 0.9655693769454956, - "reward_std": 0.010298507870174944, - "rewards/perpo_ocr_edit_distance_reward": 0.9655694663524628, + "advantages": 0.0, + "completion_length": 278.0, + "delta_ref_entropy_loss": 0.0211181640625, + "delta_ref_ppl": -0.035888671875, + "entropy_loss": -0.03466796875, + "epoch": 0.0524, + "grad_norm": 0.047923275422552064, + "k1_kl": 0.0361328125, + "k3_kl": 0.023681640625, + "kimi_kl": 0.07080078125, + "learning_rate": 4.7379999999999997e-07, + "loss": 0.001, + "ppl": 0.01202392578125, + "reward": 0.9634655117988586, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9634655714035034, "step": 262, "temperature": 0.9 }, { - "advantages": -0.0001412928213539999, - "completion_length": 733.5, - "delta_ref_entropy_loss": 0.02081298828125, - "delta_ref_ppl": -0.0167236328125, - "entropy_loss": -0.026031494140625, - "epoch": 0.1052, - "grad_norm": 0.42222771302534645, - "k1_kl": 0.0167236328125, - "k3_kl": 0.0090484619140625, - "kimi_kl": 0.01971435546875, - "learning_rate": 4.474e-07, - "loss": 0.0005, - "ppl": 0.00981903076171875, - "reward": 0.9995769262313843, - "reward_std": 0.0003544850624166429, - "rewards/perpo_ocr_edit_distance_reward": 0.9995770156383514, + "advantages": -1.7029899268550253e-08, + "completion_length": 103.0, + "delta_ref_entropy_loss": 0.01226806640625, + "delta_ref_ppl": -0.029052734375, + "entropy_loss": -0.051513671875, + "epoch": 0.0526, + "grad_norm": 2.868446437682325, + "k1_kl": 0.0291748046875, + "k3_kl": 0.0205078125, + "kimi_kl": 0.033447265625, + "learning_rate": 4.737e-07, + "loss": 0.0008, + "ppl": 0.0228271484375, + "reward": 0.9686716794967651, + "reward_std": 0.0010700615821406245, + "rewards/perpo_ocr_edit_distance_reward": 0.9686716794967651, "step": 263, "temperature": 0.9 }, { - "advantages": -8.927924426416212e-06, - "completion_length": 488.5, - "delta_ref_entropy_loss": 0.03875732421875, - "delta_ref_ppl": -0.033416748046875, - "entropy_loss": -0.0745849609375, - "epoch": 0.1056, - "grad_norm": 3.693539681976249, - "k1_kl": 0.03338623046875, - "k3_kl": 0.022125244140625, - "kimi_kl": 0.042572021484375, - "learning_rate": 4.472e-07, - "loss": 0.0009, - "ppl": 0.0362548828125, - "reward": 0.8041711747646332, - "reward_std": 0.04682193661574274, - "rewards/perpo_ocr_edit_distance_reward": 0.804171234369278, + "advantages": -0.00014327254029922187, + "completion_length": 713.0, + "delta_ref_entropy_loss": 0.01141357421875, + "delta_ref_ppl": -0.00811767578125, + "entropy_loss": -0.0167236328125, + "epoch": 0.0528, + "grad_norm": 0.19491020803276413, + "k1_kl": 0.00811767578125, + "k3_kl": 0.00396728515625, + "kimi_kl": 0.0068359375, + "learning_rate": 4.736e-07, + "loss": 0.0003, + "ppl": 0.005462646484375, + "reward": 0.9865026473999023, + "reward_std": 0.0003160368651151657, + "rewards/perpo_ocr_edit_distance_reward": 0.9865027070045471, "step": 264, "temperature": 0.9 }, { - "advantages": -5.8242255533969e-06, - "completion_length": 1200.0, - "delta_ref_entropy_loss": 0.02227783203125, - "delta_ref_ppl": -0.0233154296875, - "entropy_loss": -0.0499267578125, - "epoch": 0.106, - "grad_norm": 0.743216846595324, - "k1_kl": 0.02325439453125, - "k3_kl": 0.01348876953125, - "kimi_kl": 0.03765869140625, - "learning_rate": 4.4699999999999997e-07, - "loss": 0.0005, - "ppl": 0.026123046875, - "reward": 0.8680950105190277, - "reward_std": 0.09949037758633494, - "rewards/perpo_ocr_edit_distance_reward": 0.8680950701236725, + "advantages": -0.0001716954429866746, + "completion_length": 362.0, + "delta_ref_entropy_loss": 0.025390625, + "delta_ref_ppl": -0.020263671875, + "entropy_loss": -0.017333984375, + "epoch": 0.053, + "grad_norm": 0.5321895670310179, + "k1_kl": 0.020263671875, + "k3_kl": 0.01025390625, + "kimi_kl": 0.0172119140625, + "learning_rate": 4.7349999999999995e-07, + "loss": 0.0006, + "ppl": 0.005950927734375, + "reward": 0.981106698513031, + "reward_std": 0.00019756940309889615, + "rewards/perpo_ocr_edit_distance_reward": 0.9811067581176758, "step": 265, "temperature": 0.9 }, { - "advantages": -2.4608203830212005e-06, - "completion_length": 336.5, - "delta_ref_entropy_loss": 0.01226806640625, - "delta_ref_ppl": -0.0263671875, - "entropy_loss": -0.03546142578125, - "epoch": 0.1064, - "grad_norm": 1.3502599062623692, - "k1_kl": 0.02630615234375, - "k3_kl": 0.0164794921875, - "kimi_kl": 0.0341796875, - "learning_rate": 4.4679999999999995e-07, - "loss": 0.0007, - "ppl": 0.01739501953125, - "reward": 0.9667766988277435, - "reward_std": 0.035108954180032015, - "rewards/perpo_ocr_edit_distance_reward": 0.9667767584323883, + "advantages": -1.8732889373040962e-07, + "completion_length": 365.0, + "delta_ref_entropy_loss": 0.027099609375, + "delta_ref_ppl": -0.026611328125, + "entropy_loss": -0.0732421875, + "epoch": 0.0532, + "grad_norm": 1.0821039877768368, + "k1_kl": 0.026611328125, + "k3_kl": 0.01287841796875, + "kimi_kl": 0.0228271484375, + "learning_rate": 4.734e-07, + "loss": 0.0005, + "ppl": 0.034423828125, + "reward": 0.8582745790481567, + "reward_std": 0.28364482522010803, + "rewards/perpo_ocr_edit_distance_reward": 0.8582746386528015, "step": 266, "temperature": 0.9 }, { - "advantages": -3.976481457357295e-05, - "completion_length": 417.0, - "delta_ref_entropy_loss": 0.014984130859375, - "delta_ref_ppl": -0.0146636962890625, - "entropy_loss": -0.015655517578125, - "epoch": 0.1068, - "grad_norm": 0.20201576420555123, - "k1_kl": 0.0146636962890625, - "k3_kl": 0.008699417114257812, - "kimi_kl": 0.020786285400390625, - "learning_rate": 4.466e-07, + "advantages": -9.67298274190398e-06, + "completion_length": 563.0, + "delta_ref_entropy_loss": 0.0400390625, + "delta_ref_ppl": -0.0224609375, + "entropy_loss": -0.0830078125, + "epoch": 0.0534, + "grad_norm": 1.1312376908491775, + "k1_kl": 0.0225830078125, + "k3_kl": 0.010498046875, + "kimi_kl": 0.0169677734375, + "learning_rate": 4.733e-07, "loss": 0.0004, - "ppl": 0.00447845458984375, - "reward": 0.99908047914505, - "reward_std": 0.00011058803647756577, - "rewards/perpo_ocr_edit_distance_reward": 0.9990805089473724, + "ppl": 0.042724609375, + "reward": 0.9559832215309143, + "reward_std": 0.0034251122269779444, + "rewards/perpo_ocr_edit_distance_reward": 0.9559832811355591, "step": 267, "temperature": 0.9 }, { - "advantages": -5.330358817445813e-06, - "completion_length": 342.0, - "delta_ref_entropy_loss": 0.03302001953125, - "delta_ref_ppl": -0.02532958984375, - "entropy_loss": -0.0509033203125, - "epoch": 0.1072, - "grad_norm": 1.0830053164189992, - "k1_kl": 0.025390625, - "k3_kl": 0.0142822265625, - "kimi_kl": 0.02996826171875, - "learning_rate": 4.464e-07, - "loss": 0.0006, - "ppl": 0.0277099609375, - "reward": 0.9888673722743988, - "reward_std": 0.0025169606087729335, - "rewards/perpo_ocr_edit_distance_reward": 0.9888673722743988, + "advantages": 1.450095896871062e-05, + "completion_length": 358.0, + "delta_ref_entropy_loss": 0.044677734375, + "delta_ref_ppl": -0.0244140625, + "entropy_loss": -0.040771484375, + "epoch": 0.0536, + "grad_norm": 0.6127612549606402, + "k1_kl": 0.0245361328125, + "k3_kl": 0.0101318359375, + "kimi_kl": 0.01708984375, + "learning_rate": 4.732e-07, + "loss": 0.0004, + "ppl": 0.01434326171875, + "reward": 0.8801782131195068, + "reward_std": 0.0010738938581198454, + "rewards/perpo_ocr_edit_distance_reward": 0.8801782131195068, "step": 268, "temperature": 0.9 }, { - "advantages": -7.339886451518396e-06, - "completion_length": 443.5, - "delta_ref_entropy_loss": 0.028472900390625, - "delta_ref_ppl": -0.0296630859375, - "entropy_loss": -0.0496826171875, - "epoch": 0.1076, - "grad_norm": 0.7664468519437315, - "k1_kl": 0.0296630859375, - "k3_kl": 0.0160675048828125, - "kimi_kl": 0.03240966796875, - "learning_rate": 4.4619999999999996e-07, - "loss": 0.0006, - "ppl": 0.0225372314453125, - "reward": 0.9908979535102844, - "reward_std": 0.0006727728177793324, - "rewards/perpo_ocr_edit_distance_reward": 0.9908979535102844, + "advantages": -3.3634050851105712e-06, + "completion_length": 592.0, + "delta_ref_entropy_loss": 0.0216064453125, + "delta_ref_ppl": -0.0093994140625, + "entropy_loss": -0.04833984375, + "epoch": 0.0538, + "grad_norm": 1.3175954584044156, + "k1_kl": 0.00946044921875, + "k3_kl": 0.005706787109375, + "kimi_kl": 0.00775146484375, + "learning_rate": 4.731e-07, + "loss": 0.0002, + "ppl": 0.0267333984375, + "reward": 0.9656442999839783, + "reward_std": 0.015129424631595612, + "rewards/perpo_ocr_edit_distance_reward": 0.965644359588623, "step": 269, "temperature": 0.9 }, { - "advantages": -0.0003167774011672009, - "completion_length": 327.5, + "advantages": -5.234991112956777e-05, + "completion_length": 587.0, "delta_ref_entropy_loss": 0.0281982421875, - "delta_ref_ppl": -0.0384521484375, - "entropy_loss": -0.0538330078125, - "epoch": 0.108, - "grad_norm": 0.6068765193300061, - "k1_kl": 0.0384521484375, - "k3_kl": 0.025390625, - "kimi_kl": 0.07568359375, - "learning_rate": 4.46e-07, - "loss": 0.0013, - "ppl": 0.0227203369140625, - "reward": 0.9705975353717804, - "reward_std": 0.0007444061920978129, - "rewards/perpo_ocr_edit_distance_reward": 0.9705976247787476, + "delta_ref_ppl": -0.0164794921875, + "entropy_loss": -0.06689453125, + "epoch": 0.054, + "grad_norm": 0.9977374478824728, + "k1_kl": 0.0164794921875, + "k3_kl": 0.00885009765625, + "kimi_kl": 0.013916015625, + "learning_rate": 4.7299999999999996e-07, + "loss": 0.0004, + "ppl": 0.036376953125, + "reward": 0.9541707634925842, + "reward_std": 0.0013638336677104235, + "rewards/perpo_ocr_edit_distance_reward": 0.954170823097229, "step": 270, "temperature": 0.9 }, { - "advantages": -2.954687455591909e-06, - "completion_length": 1107.5, - "delta_ref_entropy_loss": 0.0245361328125, - "delta_ref_ppl": -0.012969970703125, + "advantages": -3.317424489068799e-05, + "completion_length": 392.0, + "delta_ref_entropy_loss": 0.0306396484375, + "delta_ref_ppl": -0.0267333984375, "entropy_loss": -0.0732421875, - "epoch": 0.1084, - "grad_norm": 1.2947022648977538, - "k1_kl": 0.012969970703125, - "k3_kl": 0.00604248046875, - "kimi_kl": 0.00933837890625, - "learning_rate": 4.4579999999999993e-07, - "loss": 0.0002, - "ppl": 0.0333251953125, - "reward": 0.9025858938694, - "reward_std": 0.14002793096005917, - "rewards/perpo_ocr_edit_distance_reward": 0.9025859832763672, + "epoch": 0.0542, + "grad_norm": 1.1008671392760454, + "k1_kl": 0.0267333984375, + "k3_kl": 0.0147705078125, + "kimi_kl": 0.028076171875, + "learning_rate": 4.7289999999999995e-07, + "loss": 0.0006, + "ppl": 0.034912109375, + "reward": 0.9619207382202148, + "reward_std": 0.0016968128038570285, + "rewards/perpo_ocr_edit_distance_reward": 0.9619208574295044, "step": 271, "temperature": 0.9 }, { - "advantages": 5.3218434103996515e-09, - "completion_length": 1026.5, - "delta_ref_entropy_loss": 0.01708984375, - "delta_ref_ppl": -0.014412879943847656, - "entropy_loss": -0.026123046875, - "epoch": 0.1088, - "grad_norm": 2839157.68998756, - "k1_kl": 0.014401435852050781, - "k3_kl": 5856.008239746094, - "kimi_kl": 0.052490234375, - "learning_rate": 4.4559999999999997e-07, - "loss": 234.2812, - "ppl": 0.015842437744140625, - "reward": 0.9629560112953186, - "reward_std": 0.003396668005734682, - "rewards/perpo_ocr_edit_distance_reward": 0.962956041097641, + "advantages": 0.0, + "completion_length": 266.0, + "delta_ref_entropy_loss": 0.047607421875, + "delta_ref_ppl": -0.0380859375, + "entropy_loss": -0.11083984375, + "epoch": 0.0544, + "grad_norm": 2.0895038475592806, + "k1_kl": 0.037841796875, + "k3_kl": 0.0220947265625, + "kimi_kl": 0.04443359375, + "learning_rate": 4.728e-07, + "loss": 0.0009, + "ppl": 0.064453125, + "reward": 0.9387266039848328, + "reward_std": 0.002555681625381112, + "rewards/perpo_ocr_edit_distance_reward": 0.9387266039848328, "step": 272, "temperature": 0.9 }, { - "advantages": -9.63892284744361e-06, - "completion_length": 912.5, - "delta_ref_entropy_loss": 0.012237548828125, - "delta_ref_ppl": -0.01357269287109375, - "entropy_loss": -0.03326416015625, - "epoch": 0.1092, - "grad_norm": 1.3810579924606468, - "k1_kl": 0.01357269287109375, - "k3_kl": 0.00652313232421875, - "kimi_kl": 0.0118560791015625, - "learning_rate": 4.454e-07, - "loss": 0.0003, - "ppl": 0.0145263671875, - "reward": 0.951031893491745, - "reward_std": 0.00789865548722446, - "rewards/perpo_ocr_edit_distance_reward": 0.9510319530963898, + "advantages": -6.811959565311554e-07, + "completion_length": 781.0, + "delta_ref_entropy_loss": 0.03515625, + "delta_ref_ppl": -0.03369140625, + "entropy_loss": -0.11328125, + "epoch": 0.0546, + "grad_norm": 1.1138223253002575, + "k1_kl": 0.033935546875, + "k3_kl": 0.0216064453125, + "kimi_kl": 0.04638671875, + "learning_rate": 4.727e-07, + "loss": 0.0009, + "ppl": 0.060791015625, + "reward": 0.8142251372337341, + "reward_std": 0.13777627050876617, + "rewards/perpo_ocr_edit_distance_reward": 0.8142252564430237, "step": 273, "temperature": 0.9 }, { - "advantages": -2.2432634068536572e-05, - "completion_length": 542.0, - "delta_ref_entropy_loss": 0.01873779296875, - "delta_ref_ppl": -0.015167236328125, - "entropy_loss": -0.03173828125, - "epoch": 0.1096, - "grad_norm": 0.4539672043095021, - "k1_kl": 0.015167236328125, - "k3_kl": 0.00777435302734375, - "kimi_kl": 0.015777587890625, - "learning_rate": 4.452e-07, - "loss": 0.0003, - "ppl": 0.0126190185546875, - "reward": 0.9962810277938843, - "reward_std": 0.0008204270852729678, - "rewards/perpo_ocr_edit_distance_reward": 0.9962810575962067, + "advantages": -4.4294767576502636e-05, + "completion_length": 217.0, + "delta_ref_entropy_loss": 0.04150390625, + "delta_ref_ppl": -0.01470947265625, + "entropy_loss": -0.046630859375, + "epoch": 0.0548, + "grad_norm": 1.234180210297072, + "k1_kl": 0.01470947265625, + "k3_kl": 0.0037994384765625, + "kimi_kl": 0.005126953125, + "learning_rate": 4.726e-07, + "loss": 0.0002, + "ppl": 0.018798828125, + "reward": 0.9594651460647583, + "reward_std": 0.001053261454217136, + "rewards/perpo_ocr_edit_distance_reward": 0.9594652056694031, "step": 274, "temperature": 0.9 }, { - "advantages": -3.984996510553174e-06, - "completion_length": 335.5, - "delta_ref_entropy_loss": 0.05224609375, - "delta_ref_ppl": -0.02130126953125, - "entropy_loss": -0.1173095703125, - "epoch": 0.11, - "grad_norm": 2.2046183897462446, - "k1_kl": 0.021026611328125, - "k3_kl": 0.0075836181640625, - "kimi_kl": 0.0099029541015625, - "learning_rate": 4.45e-07, - "loss": 0.0003, - "ppl": 0.06268310546875, - "reward": 0.8186160326004028, - "reward_std": 0.04718808038160205, - "rewards/perpo_ocr_edit_distance_reward": 0.8186160624027252, + "advantages": -5.966425305814482e-05, + "completion_length": 1012.0, + "delta_ref_entropy_loss": 0.015869140625, + "delta_ref_ppl": -0.00445556640625, + "entropy_loss": -0.0191650390625, + "epoch": 0.055, + "grad_norm": 0.5487708496112581, + "k1_kl": 0.00445556640625, + "k3_kl": 0.000949859619140625, + "kimi_kl": 0.00130462646484375, + "learning_rate": 4.725e-07, + "loss": 0.0001, + "ppl": 0.0067138671875, + "reward": 0.978066086769104, + "reward_std": 0.0003281377721577883, + "rewards/perpo_ocr_edit_distance_reward": 0.978066086769104, "step": 275, "temperature": 0.9 }, { - "advantages": -6.733196642016992e-06, - "completion_length": 924.0, - "delta_ref_entropy_loss": 0.02850341796875, - "delta_ref_ppl": -0.01702880859375, - "entropy_loss": -0.0684814453125, - "epoch": 0.1104, - "grad_norm": 0.9866907651970015, - "k1_kl": 0.01702880859375, - "k3_kl": 0.00982666015625, - "kimi_kl": 0.0168304443359375, - "learning_rate": 4.4479999999999996e-07, - "loss": 0.0004, - "ppl": 0.03741455078125, - "reward": 0.9883506894111633, - "reward_std": 0.0029629966302309185, - "rewards/perpo_ocr_edit_distance_reward": 0.9883507490158081, + "advantages": -1.1954989531659521e-05, + "completion_length": 589.0, + "delta_ref_entropy_loss": 0.06298828125, + "delta_ref_ppl": -0.0322265625, + "entropy_loss": -0.27734375, + "epoch": 0.0552, + "grad_norm": 1.9627593491072584, + "k1_kl": 0.032470703125, + "k3_kl": 0.018798828125, + "kimi_kl": 0.0257568359375, + "learning_rate": 4.7239999999999997e-07, + "loss": 0.0008, + "ppl": 0.1474609375, + "reward": 0.6575808525085449, + "reward_std": 0.004172904416918755, + "rewards/perpo_ocr_edit_distance_reward": 0.6575809717178345, "step": 276, "temperature": 0.9 }, { - "advantages": -1.102685956766436e-06, - "completion_length": 576.0, - "delta_ref_entropy_loss": 0.0369873046875, - "delta_ref_ppl": -0.02459716796875, - "entropy_loss": -0.179931640625, - "epoch": 0.1108, - "grad_norm": 0.7987665856356307, - "k1_kl": 0.02459716796875, - "k3_kl": 0.01373291015625, - "kimi_kl": 0.0225830078125, - "learning_rate": 4.446e-07, - "loss": 0.0005, - "ppl": 0.092529296875, - "reward": 0.8510172963142395, - "reward_std": 0.09552582073956728, - "rewards/perpo_ocr_edit_distance_reward": 0.8510173559188843, + "advantages": -2.4097307687043212e-05, + "completion_length": 628.0, + "delta_ref_entropy_loss": 0.035888671875, + "delta_ref_ppl": -0.020263671875, + "entropy_loss": -0.059326171875, + "epoch": 0.0554, + "grad_norm": 0.8040213212296721, + "k1_kl": 0.020263671875, + "k3_kl": 0.00994873046875, + "kimi_kl": 0.017822265625, + "learning_rate": 4.7229999999999996e-07, + "loss": 0.0004, + "ppl": 0.030029296875, + "reward": 0.8983544707298279, + "reward_std": 0.0027276836335659027, + "rewards/perpo_ocr_edit_distance_reward": 0.8983545899391174, "step": 277, "temperature": 0.9 }, { - "advantages": -8.514949634275126e-09, - "completion_length": 1087.5, - "delta_ref_entropy_loss": 0.031280517578125, - "delta_ref_ppl": -0.02603912353515625, - "entropy_loss": -0.03759765625, - "epoch": 0.1112, - "grad_norm": 1.2399423485068881, - "k1_kl": 0.02603912353515625, - "k3_kl": 0.0141143798828125, - "kimi_kl": 0.030792236328125, - "learning_rate": 4.444e-07, - "loss": 0.0006, - "ppl": 0.019683837890625, - "reward": 0.5771319195628166, - "reward_std": 0.01104651391506195, - "rewards/perpo_ocr_edit_distance_reward": 0.5771319568157196, + "advantages": -5.3252493671607226e-05, + "completion_length": 481.0, + "delta_ref_entropy_loss": 0.0196533203125, + "delta_ref_ppl": -0.0247802734375, + "entropy_loss": -0.04150390625, + "epoch": 0.0556, + "grad_norm": 0.8153923953968474, + "k1_kl": 0.0247802734375, + "k3_kl": 0.0157470703125, + "kimi_kl": 0.047119140625, + "learning_rate": 4.722e-07, + "loss": 0.0007, + "ppl": 0.0184326171875, + "reward": 0.9784340262413025, + "reward_std": 0.0014994678786024451, + "rewards/perpo_ocr_edit_distance_reward": 0.9784340858459473, "step": 278, "temperature": 0.9 }, { - "advantages": -1.0036996757056826e-06, - "completion_length": 764.0, - "delta_ref_entropy_loss": 0.021484375, - "delta_ref_ppl": -0.010284423828125, - "entropy_loss": -0.03662109375, - "epoch": 0.1116, - "grad_norm": 0.9884284289185911, - "k1_kl": 0.010284423828125, - "k3_kl": 0.0059661865234375, - "kimi_kl": 0.01507568359375, - "learning_rate": 4.4419999999999997e-07, - "loss": 0.0002, - "ppl": 0.018707275390625, - "reward": 0.986366480588913, - "reward_std": 0.010299902991391718, - "rewards/perpo_ocr_edit_distance_reward": 0.9863665103912354, + "advantages": -2.2309168343781494e-06, + "completion_length": 282.0, + "delta_ref_entropy_loss": 0.046142578125, + "delta_ref_ppl": -0.0400390625, + "entropy_loss": -0.17578125, + "epoch": 0.0558, + "grad_norm": 1.8703691236599862, + "k1_kl": 0.039794921875, + "k3_kl": 0.026611328125, + "kimi_kl": 0.080078125, + "learning_rate": 4.721e-07, + "loss": 0.0011, + "ppl": 0.0849609375, + "reward": 0.8622023463249207, + "reward_std": 0.01517539843916893, + "rewards/perpo_ocr_edit_distance_reward": 0.8622024059295654, "step": 279, "temperature": 0.9 }, { - "advantages": -2.5366034833496087e-05, - "completion_length": 604.0, - "delta_ref_entropy_loss": 0.02886962890625, - "delta_ref_ppl": -0.016387939453125, - "entropy_loss": -0.0465087890625, - "epoch": 0.112, - "grad_norm": 0.602923122204444, - "k1_kl": 0.016387939453125, - "k3_kl": 0.0068511962890625, - "kimi_kl": 0.01092529296875, - "learning_rate": 4.44e-07, - "loss": 0.0003, - "ppl": 0.0198974609375, - "reward": 0.7070087492465973, - "reward_std": 0.0010278466215822846, - "rewards/perpo_ocr_edit_distance_reward": 0.7070087790489197, + "advantages": -8.004052460819366e-07, + "completion_length": 814.0, + "delta_ref_entropy_loss": 0.0174560546875, + "delta_ref_ppl": -0.0179443359375, + "entropy_loss": -0.076171875, + "epoch": 0.056, + "grad_norm": 1.1004906174891989, + "k1_kl": 0.0179443359375, + "k3_kl": 0.01019287109375, + "kimi_kl": 0.023193359375, + "learning_rate": 4.7199999999999994e-07, + "loss": 0.0004, + "ppl": 0.0390625, + "reward": 0.8706241250038147, + "reward_std": 0.053131841123104095, + "rewards/perpo_ocr_edit_distance_reward": 0.8706242442131042, "step": 280, "temperature": 0.9 }, { - "advantages": -1.887764256025548e-05, - "completion_length": 659.0, - "delta_ref_entropy_loss": 0.041259765625, - "delta_ref_ppl": -0.0250244140625, - "entropy_loss": -0.0885009765625, - "epoch": 0.1124, - "grad_norm": 1.7062444386793965, - "k1_kl": 0.0250244140625, - "k3_kl": 0.016448974609375, - "kimi_kl": 0.0252685546875, - "learning_rate": 4.4379999999999994e-07, - "loss": 0.0007, - "ppl": 0.04522705078125, - "reward": 0.9442498981952667, - "reward_std": 0.020683443086454645, - "rewards/perpo_ocr_edit_distance_reward": 0.9442499577999115, + "advantages": -0.0005960464477539062, + "completion_length": 52.0, + "delta_ref_entropy_loss": 0.08251953125, + "delta_ref_ppl": -0.05712890625, + "entropy_loss": -0.05322265625, + "epoch": 0.0562, + "grad_norm": 0.0552328851212475, + "k1_kl": 0.05712890625, + "k3_kl": 0.019775390625, + "kimi_kl": 0.02734375, + "learning_rate": 4.719e-07, + "loss": 0.0014, + "ppl": 0.0107421875, + "reward": 0.9299999475479126, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9300000667572021, "step": 281, "temperature": 0.9 }, { - "advantages": -3.7806375985383056e-06, - "completion_length": 425.0, - "delta_ref_entropy_loss": 0.0341796875, - "delta_ref_ppl": -0.0322265625, - "entropy_loss": -0.1151123046875, - "epoch": 0.1128, - "grad_norm": 0.9335548842185103, - "k1_kl": 0.03204345703125, - "k3_kl": 0.016876220703125, - "kimi_kl": 0.0362548828125, - "learning_rate": 4.436e-07, - "loss": 0.0007, - "ppl": 0.0608673095703125, - "reward": 0.9763505160808563, - "reward_std": 0.0010750986402854323, - "rewards/perpo_ocr_edit_distance_reward": 0.9763504862785339, + "advantages": -0.0001005828453344293, + "completion_length": 491.0, + "delta_ref_entropy_loss": 0.025634765625, + "delta_ref_ppl": -0.0166015625, + "entropy_loss": -0.038818359375, + "epoch": 0.0564, + "grad_norm": 0.6814995706648468, + "k1_kl": 0.0167236328125, + "k3_kl": 0.00775146484375, + "kimi_kl": 0.015380859375, + "learning_rate": 4.718e-07, + "loss": 0.0004, + "ppl": 0.0185546875, + "reward": 0.961529016494751, + "reward_std": 0.0004078965575899929, + "rewards/perpo_ocr_edit_distance_reward": 0.9615290760993958, "step": 282, "temperature": 0.9 }, { - "advantages": -0.00015100411474122666, - "completion_length": 642.5, - "delta_ref_entropy_loss": 0.016815185546875, - "delta_ref_ppl": -0.0058746337890625, - "entropy_loss": -0.0189208984375, - "epoch": 0.1132, - "grad_norm": 2.632565066719861, - "k1_kl": 0.005889892578125, - "k3_kl": 0.0023956298828125, - "kimi_kl": 0.0030670166015625, - "learning_rate": 4.434e-07, - "loss": 0.0002, - "ppl": 0.00787353515625, - "reward": 0.9995838105678558, - "reward_std": 0.0004344392364146188, - "rewards/perpo_ocr_edit_distance_reward": 0.999583899974823, + "advantages": 0.0, + "completion_length": 153.0, + "delta_ref_entropy_loss": 0.0098876953125, + "delta_ref_ppl": -0.021240234375, + "entropy_loss": -0.015869140625, + "epoch": 0.0566, + "grad_norm": 0.003947297952407854, + "k1_kl": 0.021240234375, + "k3_kl": 0.0142822265625, + "kimi_kl": 0.0322265625, + "learning_rate": 4.7169999999999997e-07, + "loss": 0.0006, + "ppl": 0.002471923828125, + "reward": 0.8700564503669739, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.8700565099716187, "step": 283, "temperature": 0.9 }, { - "advantages": -5.655629456668976e-05, - "completion_length": 846.0, - "delta_ref_entropy_loss": 0.020599365234375, - "delta_ref_ppl": -0.02294921875, - "entropy_loss": -0.0567626953125, - "epoch": 0.1136, - "grad_norm": 0.798399963420269, - "k1_kl": 0.022918701171875, - "k3_kl": 0.014068603515625, - "kimi_kl": 0.02655029296875, - "learning_rate": 4.4319999999999995e-07, - "loss": 0.0006, - "ppl": 0.0267333984375, - "reward": 0.9842457175254822, - "reward_std": 0.003364513657288626, - "rewards/perpo_ocr_edit_distance_reward": 0.984245777130127, + "advantages": 0.0, + "completion_length": 493.0, + "delta_ref_entropy_loss": 0.036376953125, + "delta_ref_ppl": -0.021484375, + "entropy_loss": -0.029296875, + "epoch": 0.0568, + "grad_norm": 0.4564208148584788, + "k1_kl": 0.021484375, + "k3_kl": 0.0107421875, + "kimi_kl": 0.0216064453125, + "learning_rate": 4.716e-07, + "loss": 0.0004, + "ppl": 0.01171875, + "reward": 0.9780552983283997, + "reward_std": 0.000706045946571976, + "rewards/perpo_ocr_edit_distance_reward": 0.9780552983283997, "step": 284, "temperature": 0.9 }, { - "advantages": -3.483891697442232e-05, - "completion_length": 591.0, - "delta_ref_entropy_loss": 0.03082275390625, - "delta_ref_ppl": -0.024169921875, - "entropy_loss": -0.060791015625, - "epoch": 0.114, - "grad_norm": 0.6919540972296017, - "k1_kl": 0.024169921875, - "k3_kl": 0.01611328125, - "kimi_kl": 0.0465087890625, - "learning_rate": 4.43e-07, - "loss": 0.0007, - "ppl": 0.029754638671875, - "reward": 0.9736410677433014, - "reward_std": 0.006966193730477244, - "rewards/perpo_ocr_edit_distance_reward": 0.9736410975456238, + "advantages": -2.919776306953281e-05, + "completion_length": 1263.0, + "delta_ref_entropy_loss": 0.019287109375, + "delta_ref_ppl": -0.00897216796875, + "entropy_loss": -0.0732421875, + "epoch": 0.057, + "grad_norm": 1.0899345840181927, + "k1_kl": 0.009033203125, + "k3_kl": 0.004486083984375, + "kimi_kl": 0.005859375, + "learning_rate": 4.7149999999999995e-07, + "loss": 0.0002, + "ppl": 0.0380859375, + "reward": 0.9422287344932556, + "reward_std": 0.001066128141246736, + "rewards/perpo_ocr_edit_distance_reward": 0.9422287940979004, "step": 285, "temperature": 0.9 }, { - "advantages": -0.00030877760491421213, - "completion_length": 348.5, - "delta_ref_entropy_loss": 0.02801513671875, - "delta_ref_ppl": -0.03955078125, - "entropy_loss": -0.06640625, - "epoch": 0.1144, - "grad_norm": 0.819882853110852, - "k1_kl": 0.03955078125, - "k3_kl": 0.02508544921875, - "kimi_kl": 0.05908203125, - "learning_rate": 4.428e-07, - "loss": 0.0013, - "ppl": 0.032958984375, - "reward": 0.9640381336212158, - "reward_std": 0.0013357957359403372, - "rewards/perpo_ocr_edit_distance_reward": 0.964038223028183, + "advantages": -9.281295206164941e-06, + "completion_length": 696.0, + "delta_ref_entropy_loss": 0.03857421875, + "delta_ref_ppl": -0.031982421875, + "entropy_loss": -0.03759765625, + "epoch": 0.0572, + "grad_norm": 0.6580209507353945, + "k1_kl": 0.031982421875, + "k3_kl": 0.0159912109375, + "kimi_kl": 0.036376953125, + "learning_rate": 4.7139999999999995e-07, + "loss": 0.0006, + "ppl": 0.015625, + "reward": 0.9780932068824768, + "reward_std": 0.006304990034550428, + "rewards/perpo_ocr_edit_distance_reward": 0.9780933260917664, "step": 286, "temperature": 0.9 }, { - "advantages": -1.2329647120168374e-05, - "completion_length": 180.0, - "delta_ref_entropy_loss": 0.066925048828125, - "delta_ref_ppl": -0.075836181640625, - "entropy_loss": -0.062957763671875, - "epoch": 0.1148, - "grad_norm": 8.448348126356422, - "k1_kl": 0.076324462890625, - "k3_kl": 0.041534423828125, - "kimi_kl": 0.06781005859375, - "learning_rate": 4.4259999999999995e-07, - "loss": 0.0017, - "ppl": 0.03192138671875, - "reward": 0.972204178571701, - "reward_std": 0.005759851104812697, - "rewards/perpo_ocr_edit_distance_reward": 0.9722042083740234, + "advantages": -8.514949456639442e-08, + "completion_length": 1737.0, + "delta_ref_entropy_loss": 0.007598876953125, + "delta_ref_ppl": -0.005462646484375, + "entropy_loss": -0.03369140625, + "epoch": 0.0574, + "grad_norm": 7.09550255770005, + "k1_kl": 0.005523681640625, + "k3_kl": 0.00872802734375, + "kimi_kl": 0.0142822265625, + "learning_rate": 4.713e-07, + "loss": 0.0004, + "ppl": 0.02294921875, + "reward": 0.6900092959403992, + "reward_std": 0.27264291048049927, + "rewards/perpo_ocr_edit_distance_reward": 0.690009355545044, "step": 287, "temperature": 0.9 }, { - "advantages": -3.5928830470766115e-05, - "completion_length": 781.5, - "delta_ref_entropy_loss": 0.03955078125, - "delta_ref_ppl": -0.0279541015625, - "entropy_loss": -0.0675048828125, - "epoch": 0.1152, - "grad_norm": 0.82678514998743, - "k1_kl": 0.0279541015625, - "k3_kl": 0.014068603515625, - "kimi_kl": 0.02789306640625, - "learning_rate": 4.424e-07, - "loss": 0.0006, - "ppl": 0.03271484375, - "reward": 0.9256784617900848, - "reward_std": 0.013476643653120846, - "rewards/perpo_ocr_edit_distance_reward": 0.925678551197052, + "advantages": -2.5289400582551025e-05, + "completion_length": 312.0, + "delta_ref_entropy_loss": 0.050048828125, + "delta_ref_ppl": -0.041748046875, + "entropy_loss": -0.050048828125, + "epoch": 0.0576, + "grad_norm": 1.2152542309836682, + "k1_kl": 0.041748046875, + "k3_kl": 0.02294921875, + "kimi_kl": 0.050537109375, + "learning_rate": 4.712e-07, + "loss": 0.0009, + "ppl": 0.026123046875, + "reward": 0.9731051325798035, + "reward_std": 0.0022596896160393953, + "rewards/perpo_ocr_edit_distance_reward": 0.9731051921844482, "step": 288, "temperature": 0.9 }, { - "advantages": -2.043589120148681e-07, - "completion_length": 583.5, - "delta_ref_entropy_loss": 0.03363037109375, - "delta_ref_ppl": -0.01800537109375, - "entropy_loss": -0.0816650390625, - "epoch": 0.1156, - "grad_norm": 0.7963755514336934, - "k1_kl": 0.01806640625, - "k3_kl": 0.0083160400390625, - "kimi_kl": 0.016845703125, - "learning_rate": 4.422e-07, - "loss": 0.0003, - "ppl": 0.045654296875, - "reward": 0.9719229638576508, - "reward_std": 0.0015043944877106696, - "rewards/perpo_ocr_edit_distance_reward": 0.9719229638576508, + "advantages": -2.1168165403651074e-05, + "completion_length": 543.0, + "delta_ref_entropy_loss": 0.048828125, + "delta_ref_ppl": -0.03564453125, + "entropy_loss": -0.1044921875, + "epoch": 0.0578, + "grad_norm": 1.5930328746930136, + "k1_kl": 0.03564453125, + "k3_kl": 0.0228271484375, + "kimi_kl": 0.04345703125, + "learning_rate": 4.711e-07, + "loss": 0.0009, + "ppl": 0.061767578125, + "reward": 0.9470121264457703, + "reward_std": 0.001911866944283247, + "rewards/perpo_ocr_edit_distance_reward": 0.947012186050415, "step": 289, "temperature": 0.9 }, { - "advantages": -4.109314522793284e-05, - "completion_length": 1004.5, - "delta_ref_entropy_loss": 0.02001953125, - "delta_ref_ppl": -0.016632080078125, - "entropy_loss": -0.05120849609375, - "epoch": 0.116, - "grad_norm": 1.2209298894185587, - "k1_kl": 0.0166015625, - "k3_kl": 0.0092620849609375, - "kimi_kl": 0.025054931640625, - "learning_rate": 4.4199999999999996e-07, - "loss": 0.0004, - "ppl": 0.024200439453125, - "reward": 0.9803467392921448, - "reward_std": 0.007411918297293596, - "rewards/perpo_ocr_edit_distance_reward": 0.9803468286991119, + "advantages": -1.0320119145035278e-05, + "completion_length": 379.0, + "delta_ref_entropy_loss": 0.03955078125, + "delta_ref_ppl": -0.02392578125, + "entropy_loss": -0.0255126953125, + "epoch": 0.058, + "grad_norm": 0.9857188134221855, + "k1_kl": 0.02392578125, + "k3_kl": 0.010986328125, + "kimi_kl": 0.0179443359375, + "learning_rate": 4.7099999999999997e-07, + "loss": 0.0005, + "ppl": 0.00970458984375, + "reward": 0.9806636571884155, + "reward_std": 0.0031987042166292667, + "rewards/perpo_ocr_edit_distance_reward": 0.9806636571884155, "step": 290, "temperature": 0.9 }, { - "advantages": -5.676916725860792e-05, - "completion_length": 682.5, - "delta_ref_entropy_loss": 0.025482177734375, - "delta_ref_ppl": -0.01922607421875, - "entropy_loss": -0.03204345703125, - "epoch": 0.1164, - "grad_norm": 0.9231708845164699, - "k1_kl": 0.0193634033203125, - "k3_kl": 0.0091552734375, - "kimi_kl": 0.018524169921875, - "learning_rate": 4.418e-07, - "loss": 0.0004, - "ppl": 0.012603759765625, - "reward": 0.9981275498867035, - "reward_std": 0.000812564991065301, - "rewards/perpo_ocr_edit_distance_reward": 0.9981275498867035, + "advantages": -8.434057963313535e-05, + "completion_length": 1014.0, + "delta_ref_entropy_loss": 0.01104736328125, + "delta_ref_ppl": -0.010009765625, + "entropy_loss": -0.0120849609375, + "epoch": 0.0582, + "grad_norm": 0.575663264722131, + "k1_kl": 0.00994873046875, + "k3_kl": 0.006011962890625, + "kimi_kl": 0.01190185546875, + "learning_rate": 4.7089999999999996e-07, + "loss": 0.0003, + "ppl": 0.00494384765625, + "reward": 0.9889331459999084, + "reward_std": 0.0004047003749292344, + "rewards/perpo_ocr_edit_distance_reward": 0.988933265209198, "step": 291, "temperature": 0.9 }, { - "advantages": -9.034361366389021e-05, - "completion_length": 817.0, - "delta_ref_entropy_loss": 0.027587890625, - "delta_ref_ppl": -0.01929473876953125, - "entropy_loss": -0.1258544921875, - "epoch": 0.1168, - "grad_norm": 0.5160831248527725, - "k1_kl": 0.01917266845703125, - "k3_kl": 0.010402679443359375, - "kimi_kl": 0.019775390625, - "learning_rate": 4.416e-07, - "loss": 0.0005, - "ppl": 0.0703582763671875, - "reward": 0.9242553412914276, - "reward_std": 0.07756715531286318, - "rewards/perpo_ocr_edit_distance_reward": 0.9242554008960724, + "advantages": -0.00010412080155219883, + "completion_length": 180.0, + "delta_ref_entropy_loss": 0.03173828125, + "delta_ref_ppl": -0.020263671875, + "entropy_loss": -0.03466796875, + "epoch": 0.0584, + "grad_norm": 0.8224676376439125, + "k1_kl": 0.0201416015625, + "k3_kl": 0.0081787109375, + "kimi_kl": 0.01434326171875, + "learning_rate": 4.7079999999999995e-07, + "loss": 0.0004, + "ppl": 0.0091552734375, + "reward": 0.9823214411735535, + "reward_std": 0.00047246640315279365, + "rewards/perpo_ocr_edit_distance_reward": 0.9823215007781982, "step": 292, "temperature": 0.9 }, { - "advantages": -0.00030297040939331055, - "completion_length": 271.0, - "delta_ref_entropy_loss": 0.02618408203125, - "delta_ref_ppl": -0.033203125, - "entropy_loss": -0.02862548828125, - "epoch": 0.1172, - "grad_norm": 0.7603342457722344, - "k1_kl": 0.033203125, - "k3_kl": 0.019744873046875, - "kimi_kl": 0.05926513671875, - "learning_rate": 4.4139999999999997e-07, - "loss": 0.0011, - "ppl": 0.00970458984375, - "reward": 0.9911105632781982, - "reward_std": 0.0059866392984986305, - "rewards/perpo_ocr_edit_distance_reward": 0.991110622882843, + "advantages": -6.373439646267798e-06, + "completion_length": 738.0, + "delta_ref_entropy_loss": 0.017333984375, + "delta_ref_ppl": -0.01806640625, + "entropy_loss": -0.033203125, + "epoch": 0.0586, + "grad_norm": 0.9357403193598968, + "k1_kl": 0.01806640625, + "k3_kl": 0.0120849609375, + "kimi_kl": 0.02978515625, + "learning_rate": 4.707e-07, + "loss": 0.0005, + "ppl": 0.014404296875, + "reward": 0.9359477758407593, + "reward_std": 0.014605415053665638, + "rewards/perpo_ocr_edit_distance_reward": 0.9359478950500488, "step": 293, "temperature": 0.9 }, { - "advantages": -6.605898255429565e-05, - "completion_length": 451.0, - "delta_ref_entropy_loss": 0.04046630859375, - "delta_ref_ppl": -0.0201416015625, - "entropy_loss": -0.07208251953125, - "epoch": 0.1176, - "grad_norm": 1.0206481037945019, - "k1_kl": 0.0201416015625, - "k3_kl": 0.0091705322265625, - "kimi_kl": 0.0159912109375, - "learning_rate": 4.4119999999999995e-07, - "loss": 0.0004, - "ppl": 0.030975341796875, - "reward": 0.9723959863185883, - "reward_std": 0.0075319507595850155, - "rewards/perpo_ocr_edit_distance_reward": 0.9723960161209106, + "advantages": -3.065381861233618e-06, + "completion_length": 1135.0, + "delta_ref_entropy_loss": 0.0211181640625, + "delta_ref_ppl": -0.01092529296875, + "entropy_loss": -0.041259765625, + "epoch": 0.0588, + "grad_norm": 3.1139520624782135, + "k1_kl": 0.01092529296875, + "k3_kl": 0.006072998046875, + "kimi_kl": 0.00994873046875, + "learning_rate": 4.706e-07, + "loss": 0.0002, + "ppl": 0.0181884765625, + "reward": 0.9264222383499146, + "reward_std": 0.016513893380761147, + "rewards/perpo_ocr_edit_distance_reward": 0.9264222979545593, "step": 294, "temperature": 0.9 }, { - "advantages": -4.57806254416937e-05, - "completion_length": 420.5, - "delta_ref_entropy_loss": 0.0291748046875, - "delta_ref_ppl": -0.016754150390625, - "entropy_loss": -0.047027587890625, - "epoch": 0.118, - "grad_norm": 1.0474456490006685, - "k1_kl": 0.016754150390625, - "k3_kl": 0.0091400146484375, - "kimi_kl": 0.0191802978515625, - "learning_rate": 4.41e-07, - "loss": 0.0004, - "ppl": 0.0247955322265625, - "reward": 0.9871106743812561, - "reward_std": 0.0013308647758094594, - "rewards/perpo_ocr_edit_distance_reward": 0.9871107041835785, + "advantages": -2.9921533496235497e-05, + "completion_length": 415.0, + "delta_ref_entropy_loss": 0.026123046875, + "delta_ref_ppl": -0.02587890625, + "entropy_loss": -0.0245361328125, + "epoch": 0.059, + "grad_norm": 0.7019765779772572, + "k1_kl": 0.02587890625, + "k3_kl": 0.01513671875, + "kimi_kl": 0.03515625, + "learning_rate": 4.7049999999999993e-07, + "loss": 0.0006, + "ppl": 0.0098876953125, + "reward": 0.9781992435455322, + "reward_std": 0.0007537209312431514, + "rewards/perpo_ocr_edit_distance_reward": 0.9781992435455322, "step": 295, "temperature": 0.9 }, { - "advantages": -5.517687273481897e-05, - "completion_length": 1071.5, - "delta_ref_entropy_loss": 0.0221099853515625, - "delta_ref_ppl": -0.0104827880859375, - "entropy_loss": -0.07464599609375, - "epoch": 0.1184, - "grad_norm": 2.416932411534945, - "k1_kl": 0.01055908203125, - "k3_kl": 0.0056610107421875, - "kimi_kl": 0.011871337890625, - "learning_rate": 4.4080000000000003e-07, - "loss": 0.0003, - "ppl": 0.04266357421875, - "reward": 0.9247621595859528, - "reward_std": 0.0326470946893096, - "rewards/perpo_ocr_edit_distance_reward": 0.9247622191905975, + "advantages": -1.6008104921638733e-06, + "completion_length": 643.0, + "delta_ref_entropy_loss": 0.08544921875, + "delta_ref_ppl": -0.0400390625, + "entropy_loss": -0.54296875, + "epoch": 0.0592, + "grad_norm": 3.4695523116579983, + "k1_kl": 0.03955078125, + "k3_kl": 0.0191650390625, + "kimi_kl": 0.0262451171875, + "learning_rate": 4.704e-07, + "loss": 0.0008, + "ppl": 0.3125, + "reward": 0.76980060338974, + "reward_std": 0.01583702489733696, + "rewards/perpo_ocr_edit_distance_reward": 0.7698007225990295, "step": 296, "temperature": 0.9 }, { - "advantages": -2.3194723326014355e-05, - "completion_length": 723.5, - "delta_ref_entropy_loss": 0.02606201171875, - "delta_ref_ppl": -0.0128631591796875, - "entropy_loss": -0.04449462890625, - "epoch": 0.1188, - "grad_norm": 0.8753766622022568, - "k1_kl": 0.0128631591796875, - "k3_kl": 0.0056915283203125, - "kimi_kl": 0.0106201171875, - "learning_rate": 4.4059999999999996e-07, - "loss": 0.0003, - "ppl": 0.02117919921875, - "reward": 0.9931840002536774, - "reward_std": 0.0023515161592513323, - "rewards/perpo_ocr_edit_distance_reward": 0.9931840896606445, + "advantages": -2.7247838829680404e-07, + "completion_length": 1331.0, + "delta_ref_entropy_loss": 0.0191650390625, + "delta_ref_ppl": -0.01806640625, + "entropy_loss": -0.1904296875, + "epoch": 0.0594, + "grad_norm": 2.4456795049938314, + "k1_kl": 0.0181884765625, + "k3_kl": 0.00994873046875, + "kimi_kl": 0.01483154296875, + "learning_rate": 4.7029999999999997e-07, + "loss": 0.0004, + "ppl": 0.10400390625, + "reward": 0.786083996295929, + "reward_std": 0.09784162044525146, + "rewards/perpo_ocr_edit_distance_reward": 0.7860840559005737, "step": 297, "temperature": 0.9 }, { - "advantages": -0.0003230231159250252, - "completion_length": 924.5, - "delta_ref_entropy_loss": 0.02532958984375, - "delta_ref_ppl": -0.0151519775390625, - "entropy_loss": -0.02911376953125, - "epoch": 0.1192, - "grad_norm": 0.3134869903202289, - "k1_kl": 0.0151519775390625, - "k3_kl": 0.0089263916015625, - "kimi_kl": 0.02178955078125, - "learning_rate": 4.404e-07, - "loss": 0.0007, - "ppl": 0.010650634765625, - "reward": 0.963924765586853, - "reward_std": 0.00037566726678051054, - "rewards/perpo_ocr_edit_distance_reward": 0.9639248549938202, + "advantages": 1.7029899268550253e-08, + "completion_length": 311.0, + "delta_ref_entropy_loss": 0.052001953125, + "delta_ref_ppl": -0.0279541015625, + "entropy_loss": -0.056640625, + "epoch": 0.0596, + "grad_norm": 0.7772616726022774, + "k1_kl": 0.0279541015625, + "k3_kl": 0.01043701171875, + "kimi_kl": 0.019775390625, + "learning_rate": 4.7019999999999996e-07, + "loss": 0.0004, + "ppl": 0.025634765625, + "reward": 0.9739764332771301, + "reward_std": 0.0005259702447801828, + "rewards/perpo_ocr_edit_distance_reward": 0.9739764928817749, "step": 298, "temperature": 0.9 }, { - "advantages": -1.3428075703814102e-05, - "completion_length": 525.0, - "delta_ref_entropy_loss": 0.0501708984375, - "delta_ref_ppl": -0.0291748046875, - "entropy_loss": -0.080078125, - "epoch": 0.1196, - "grad_norm": 1.703391119927932, - "k1_kl": 0.02923583984375, - "k3_kl": 0.0126953125, - "kimi_kl": 0.020538330078125, - "learning_rate": 4.402e-07, - "loss": 0.0005, - "ppl": 0.04815673828125, - "reward": 0.9527095556259155, - "reward_std": 0.005284923157887533, - "rewards/perpo_ocr_edit_distance_reward": 0.9527096152305603, + "advantages": -1.4305115882962127e-06, + "completion_length": 393.0, + "delta_ref_entropy_loss": 0.0322265625, + "delta_ref_ppl": -0.0159912109375, + "entropy_loss": -0.029541015625, + "epoch": 0.0598, + "grad_norm": 0.9545958407986357, + "k1_kl": 0.0159912109375, + "k3_kl": 0.007720947265625, + "kimi_kl": 0.012451171875, + "learning_rate": 4.701e-07, + "loss": 0.0003, + "ppl": 0.01422119140625, + "reward": 0.9335646629333496, + "reward_std": 0.041216690093278885, + "rewards/perpo_ocr_edit_distance_reward": 0.9335647821426392, "step": 299, "temperature": 0.9 }, { - "advantages": -7.727316784666982e-06, - "completion_length": 693.5, - "delta_ref_entropy_loss": 0.03753662109375, - "delta_ref_ppl": -0.016845703125, - "entropy_loss": -0.0972900390625, - "epoch": 0.12, - "grad_norm": 0.9058468413911042, - "k1_kl": 0.01690673828125, - "k3_kl": 0.007049560546875, - "kimi_kl": 0.0115814208984375, - "learning_rate": 4.3999999999999997e-07, - "loss": 0.0003, - "ppl": 0.04815673828125, - "reward": 0.9202784895896912, - "reward_std": 0.02505368203856051, - "rewards/perpo_ocr_edit_distance_reward": 0.9202785491943359, + "advantages": 1.021793991640152e-07, + "completion_length": 1230.0, + "delta_ref_entropy_loss": 0.0908203125, + "delta_ref_ppl": -0.04736328125, + "entropy_loss": -0.396484375, + "epoch": 0.06, + "grad_norm": 199.7266677237443, + "k1_kl": 0.047119140625, + "k3_kl": 0.1884765625, + "kimi_kl": 0.047119140625, + "learning_rate": 4.6999999999999995e-07, + "loss": 0.0076, + "ppl": 0.2431640625, + "reward": 0.5992385745048523, + "reward_std": 0.09259671717882156, + "rewards/perpo_ocr_edit_distance_reward": 0.5992385745048523, "step": 300, "temperature": 0.9 }, { - "advantages": -1.862645227390658e-05, - "completion_length": 1657.5, - "delta_ref_entropy_loss": 0.0101165771484375, - "delta_ref_ppl": -0.008627891540527344, - "entropy_loss": -0.03173828125, - "epoch": 0.1204, - "grad_norm": 0.6036504963135798, - "k1_kl": 0.00856924057006836, - "k3_kl": 0.00476837158203125, - "kimi_kl": 0.00795745849609375, - "learning_rate": 4.398e-07, - "loss": 0.0002, - "ppl": 0.0111541748046875, - "reward": 0.4953632727265358, - "reward_std": 0.11557668575778735, - "rewards/perpo_ocr_edit_distance_reward": 0.4953632801771164, + "advantages": -9.945460988092236e-06, + "completion_length": 43.0, + "delta_ref_entropy_loss": 0.02734375, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.095703125, + "epoch": 0.0602, + "grad_norm": 5.640662343684089, + "k1_kl": 0.06591796875, + "k3_kl": 0.043701171875, + "kimi_kl": 0.0771484375, + "learning_rate": 4.6989999999999994e-07, + "loss": 0.0018, + "ppl": 0.060791015625, + "reward": 0.8813302516937256, + "reward_std": 0.004162959288805723, + "rewards/perpo_ocr_edit_distance_reward": 0.8813303709030151, "step": 301, "temperature": 0.9 }, { - "advantages": -0.0003028937749149918, - "completion_length": 589.5, - "delta_ref_entropy_loss": 0.02667236328125, - "delta_ref_ppl": -0.0284423828125, - "entropy_loss": -0.043853759765625, - "epoch": 0.1208, - "grad_norm": 0.5338060079337336, - "k1_kl": 0.028472900390625, - "k3_kl": 0.01605224609375, - "kimi_kl": 0.0456085205078125, - "learning_rate": 4.396e-07, - "loss": 0.0009, - "ppl": 0.0196990966796875, - "reward": 0.9793291687965393, - "reward_std": 0.002573086181655526, - "rewards/perpo_ocr_edit_distance_reward": 0.9793292284011841, + "advantages": 0.0, + "completion_length": 344.0, + "delta_ref_entropy_loss": 0.0230712890625, + "delta_ref_ppl": -0.00616455078125, + "entropy_loss": -0.0106201171875, + "epoch": 0.0604, + "grad_norm": 0.004125134900702612, + "k1_kl": 0.0062255859375, + "k3_kl": 0.0010986328125, + "kimi_kl": 0.00135040283203125, + "learning_rate": 4.698e-07, + "loss": 0.0, + "ppl": 0.0018768310546875, + "reward": 0.9855682253837585, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9855682253837585, "step": 302, "temperature": 0.9 }, { - "advantages": -1.832417137848097e-05, - "completion_length": 638.0, - "delta_ref_entropy_loss": 0.034515380859375, - "delta_ref_ppl": -0.017333984375, - "entropy_loss": -0.0924072265625, - "epoch": 0.1212, - "grad_norm": 1.0389841573223657, - "k1_kl": 0.0174560546875, - "k3_kl": 0.00775146484375, - "kimi_kl": 0.0117034912109375, - "learning_rate": 4.394e-07, - "loss": 0.0003, - "ppl": 0.048858642578125, - "reward": 0.9526292681694031, - "reward_std": 0.004868338000960648, - "rewards/perpo_ocr_edit_distance_reward": 0.9526293575763702, + "advantages": -2.889973984565586e-05, + "completion_length": 728.0, + "delta_ref_entropy_loss": 0.0294189453125, + "delta_ref_ppl": -0.0128173828125, + "entropy_loss": -0.02587890625, + "epoch": 0.0606, + "grad_norm": 0.3350145162615852, + "k1_kl": 0.0128173828125, + "k3_kl": 0.004150390625, + "kimi_kl": 0.00689697265625, + "learning_rate": 4.697e-07, + "loss": 0.0002, + "ppl": 0.011474609375, + "reward": 0.9742056131362915, + "reward_std": 0.0004896067548543215, + "rewards/perpo_ocr_edit_distance_reward": 0.9742056727409363, "step": 303, "temperature": 0.9 }, { - "advantages": -4.870550972313481e-06, - "completion_length": 1410.0, - "delta_ref_entropy_loss": 0.01812744140625, - "delta_ref_ppl": -0.043914794921875, - "entropy_loss": -0.23516845703125, - "epoch": 0.1216, - "grad_norm": 4.722336553264701, - "k1_kl": 0.043670654296875, - "k3_kl": 0.0297698974609375, - "kimi_kl": 0.067962646484375, - "learning_rate": 4.3919999999999996e-07, - "loss": 0.0012, - "ppl": 0.124420166015625, - "reward": 0.5685688331723213, - "reward_std": 0.07050525362137705, - "rewards/perpo_ocr_edit_distance_reward": 0.5685688629746437, + "advantages": -2.55448497910038e-08, + "completion_length": 154.0, + "delta_ref_entropy_loss": 0.0208740234375, + "delta_ref_ppl": -0.0380859375, + "entropy_loss": -0.028076171875, + "epoch": 0.0608, + "grad_norm": 1.5338123936681167, + "k1_kl": 0.038330078125, + "k3_kl": 0.025146484375, + "kimi_kl": 0.05517578125, + "learning_rate": 4.6959999999999997e-07, + "loss": 0.001, + "ppl": 0.0191650390625, + "reward": 0.9600179195404053, + "reward_std": 0.0015344400890171528, + "rewards/perpo_ocr_edit_distance_reward": 0.96001797914505, "step": 304, "temperature": 0.9 }, { - "advantages": -6.846019729778163e-06, - "completion_length": 120.0, - "delta_ref_entropy_loss": 0.090087890625, - "delta_ref_ppl": -0.1202392578125, - "entropy_loss": -0.12109375, - "epoch": 0.122, - "grad_norm": 5.4837349558569315, - "k1_kl": 0.12060546875, - "k3_kl": 0.073974609375, - "kimi_kl": 0.12994384765625, - "learning_rate": 4.39e-07, - "loss": 0.003, - "ppl": 0.05780029296875, - "reward": 0.940397322177887, - "reward_std": 0.018143439083360136, - "rewards/perpo_ocr_edit_distance_reward": 0.940397322177887, + "advantages": -1.2891633559775073e-05, + "completion_length": 311.0, + "delta_ref_entropy_loss": 0.0201416015625, + "delta_ref_ppl": -0.00909423828125, + "entropy_loss": -0.01953125, + "epoch": 0.061, + "grad_norm": 1.1209687561350268, + "k1_kl": 0.00909423828125, + "k3_kl": 0.0037994384765625, + "kimi_kl": 0.006500244140625, + "learning_rate": 4.6949999999999996e-07, + "loss": 0.0002, + "ppl": 0.00537109375, + "reward": 0.9619755744934082, + "reward_std": 0.0051924074068665504, + "rewards/perpo_ocr_edit_distance_reward": 0.9619756937026978, "step": 305, "temperature": 0.9 }, { - "advantages": -0.00042634351120796055, - "completion_length": 461.0, - "delta_ref_entropy_loss": 0.018157958984375, - "delta_ref_ppl": -0.01190185546875, - "entropy_loss": -0.021087646484375, - "epoch": 0.1224, - "grad_norm": 0.15724893906537551, - "k1_kl": 0.0118865966796875, - "k3_kl": 0.00553131103515625, - "kimi_kl": 0.0091400146484375, - "learning_rate": 4.388e-07, - "loss": 0.0006, - "ppl": 0.008056640625, - "reward": 0.9967456459999084, - "reward_std": 6.61259182379581e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.996745765209198, + "advantages": -2.4012158519326476e-06, + "completion_length": 591.0, + "delta_ref_entropy_loss": 0.0250244140625, + "delta_ref_ppl": -0.04443359375, + "entropy_loss": -0.0260009765625, + "epoch": 0.0612, + "grad_norm": 1.0914098749148131, + "k1_kl": 0.04443359375, + "k3_kl": 0.033935546875, + "kimi_kl": 0.140625, + "learning_rate": 4.6939999999999995e-07, + "loss": 0.0014, + "ppl": 0.01007080078125, + "reward": 0.9215838313102722, + "reward_std": 0.014049557968974113, + "rewards/perpo_ocr_edit_distance_reward": 0.921583890914917, "step": 306, "temperature": 0.9 }, { - "advantages": -9.916936323861592e-05, - "completion_length": 498.5, - "delta_ref_entropy_loss": 0.024169921875, - "delta_ref_ppl": -0.02496337890625, - "entropy_loss": -0.019073486328125, - "epoch": 0.1228, - "grad_norm": 0.5352552451705328, - "k1_kl": 0.024810791015625, - "k3_kl": 0.014373779296875, - "kimi_kl": 0.03045654296875, - "learning_rate": 4.3859999999999997e-07, - "loss": 0.0007, - "ppl": 0.0093536376953125, - "reward": 0.9712769687175751, - "reward_std": 0.0008792553271632642, - "rewards/perpo_ocr_edit_distance_reward": 0.9712770283222198, + "advantages": -2.54426686296938e-05, + "completion_length": 367.0, + "delta_ref_entropy_loss": 0.03515625, + "delta_ref_ppl": -0.0179443359375, + "entropy_loss": -0.025390625, + "epoch": 0.0614, + "grad_norm": 0.4818372183452776, + "k1_kl": 0.0179443359375, + "k3_kl": 0.00836181640625, + "kimi_kl": 0.01397705078125, + "learning_rate": 4.6929999999999995e-07, + "loss": 0.0004, + "ppl": 0.007354736328125, + "reward": 0.9709142446517944, + "reward_std": 0.0002346430701436475, + "rewards/perpo_ocr_edit_distance_reward": 0.9709142446517944, "step": 307, "temperature": 0.9 }, { - "advantages": -2.7780022492152057e-05, - "completion_length": 785.0, - "delta_ref_entropy_loss": 0.0504150390625, - "delta_ref_ppl": -0.023956298828125, - "entropy_loss": -0.1175537109375, - "epoch": 0.1232, - "grad_norm": 0.9011763615862445, - "k1_kl": 0.023956298828125, - "k3_kl": 0.0096588134765625, - "kimi_kl": 0.0149383544921875, - "learning_rate": 4.384e-07, - "loss": 0.0004, - "ppl": 0.056793212890625, - "reward": 0.942359983921051, - "reward_std": 0.0033622587216086686, - "rewards/perpo_ocr_edit_distance_reward": 0.9423600137233734, + "advantages": -7.099765207385644e-05, + "completion_length": 1116.0, + "delta_ref_entropy_loss": 0.01055908203125, + "delta_ref_ppl": -0.00787353515625, + "entropy_loss": -0.0238037109375, + "epoch": 0.0616, + "grad_norm": 0.44739607405937737, + "k1_kl": 0.0078125, + "k3_kl": 0.004119873046875, + "kimi_kl": 0.0084228515625, + "learning_rate": 4.692e-07, + "loss": 0.0002, + "ppl": 0.01092529296875, + "reward": 0.9690865874290466, + "reward_std": 0.0006196221802383661, + "rewards/perpo_ocr_edit_distance_reward": 0.9690867066383362, "step": 308, "temperature": 0.9 }, { - "advantages": -3.090075188083574e-05, - "completion_length": 604.0, - "delta_ref_entropy_loss": 0.044677734375, - "delta_ref_ppl": -0.0313720703125, - "entropy_loss": -0.105224609375, - "epoch": 0.1236, - "grad_norm": 11.031098877864563, - "k1_kl": 0.0313720703125, - "k3_kl": 0.019927978515625, - "kimi_kl": 0.0433349609375, - "learning_rate": 4.3819999999999994e-07, - "loss": 0.0008, - "ppl": 0.0556640625, - "reward": 0.9032276272773743, - "reward_std": 0.0577557539800182, - "rewards/perpo_ocr_edit_distance_reward": 0.9032276570796967, + "advantages": -2.213887000834802e-06, + "completion_length": 388.0, + "delta_ref_entropy_loss": 0.033203125, + "delta_ref_ppl": -0.01708984375, + "entropy_loss": -0.04541015625, + "epoch": 0.0618, + "grad_norm": 1.3326838668589691, + "k1_kl": 0.0172119140625, + "k3_kl": 0.0101318359375, + "kimi_kl": 0.01806640625, + "learning_rate": 4.691e-07, + "loss": 0.0004, + "ppl": 0.01904296875, + "reward": 0.885827362537384, + "reward_std": 0.011383569799363613, + "rewards/perpo_ocr_edit_distance_reward": 0.885827362537384, "step": 309, "temperature": 0.9 }, { - "advantages": -7.846526204957627e-05, - "completion_length": 424.5, - "delta_ref_entropy_loss": 0.02734375, - "delta_ref_ppl": -0.017120361328125, - "entropy_loss": -0.02398681640625, - "epoch": 0.124, - "grad_norm": 0.7980775694697723, - "k1_kl": 0.017120361328125, - "k3_kl": 0.00821685791015625, - "kimi_kl": 0.0174407958984375, - "learning_rate": 4.38e-07, - "loss": 0.0004, - "ppl": 0.0091705322265625, - "reward": 0.995046079158783, - "reward_std": 0.0005629721999866888, - "rewards/perpo_ocr_edit_distance_reward": 0.9950461387634277, + "advantages": -7.152557373046875e-05, + "completion_length": 1040.0, + "delta_ref_entropy_loss": 0.025390625, + "delta_ref_ppl": -0.0189208984375, + "entropy_loss": -0.031982421875, + "epoch": 0.062, + "grad_norm": 0.5208651573753452, + "k1_kl": 0.0189208984375, + "k3_kl": 0.01080322265625, + "kimi_kl": 0.027587890625, + "learning_rate": 4.689999999999999e-07, + "loss": 0.0005, + "ppl": 0.0145263671875, + "reward": 0.9758915901184082, + "reward_std": 0.00025721060228534043, + "rewards/perpo_ocr_edit_distance_reward": 0.975891649723053, "step": 310, "temperature": 0.9 }, { - "advantages": -1.148666729022807e-05, - "completion_length": 196.5, - "delta_ref_entropy_loss": 0.0382080078125, - "delta_ref_ppl": -0.0501708984375, - "entropy_loss": -0.053466796875, - "epoch": 0.1244, - "grad_norm": 2.3056581911540905, - "k1_kl": 0.0499267578125, - "k3_kl": 0.03466796875, - "kimi_kl": 0.085205078125, - "learning_rate": 4.378e-07, - "loss": 0.0014, - "ppl": 0.023681640625, - "reward": 0.9910635650157928, - "reward_std": 0.00441104406490922, - "rewards/perpo_ocr_edit_distance_reward": 0.9910636246204376, + "advantages": -3.37191995640751e-05, + "completion_length": 737.0, + "delta_ref_entropy_loss": 0.0181884765625, + "delta_ref_ppl": -0.0213623046875, + "entropy_loss": -0.0167236328125, + "epoch": 0.0622, + "grad_norm": 0.4022815894139989, + "k1_kl": 0.021484375, + "k3_kl": 0.01409912109375, + "kimi_kl": 0.033935546875, + "learning_rate": 4.6889999999999997e-07, + "loss": 0.0006, + "ppl": 0.00604248046875, + "reward": 0.9823548793792725, + "reward_std": 0.0006579436012543738, + "rewards/perpo_ocr_edit_distance_reward": 0.982354998588562, "step": 311, "temperature": 0.9 }, { - "advantages": -0.00012554867680591997, - "completion_length": 809.5, - "delta_ref_entropy_loss": 0.01708984375, - "delta_ref_ppl": -0.0076751708984375, - "entropy_loss": -0.008087158203125, - "epoch": 0.1248, - "grad_norm": 0.1527684171794072, - "k1_kl": 0.007659912109375, - "k3_kl": 0.00305938720703125, - "kimi_kl": 0.004608154296875, - "learning_rate": 4.3759999999999995e-07, - "loss": 0.0002, - "ppl": 0.00342559814453125, - "reward": 0.9991215169429779, - "reward_std": 0.00021482439478859305, - "rewards/perpo_ocr_edit_distance_reward": 0.9991215765476227, + "advantages": -2.86272606899729e-05, + "completion_length": 576.0, + "delta_ref_entropy_loss": 0.0208740234375, + "delta_ref_ppl": -0.01318359375, + "entropy_loss": -0.0137939453125, + "epoch": 0.0624, + "grad_norm": 0.5527065932272903, + "k1_kl": 0.01312255859375, + "k3_kl": 0.00732421875, + "kimi_kl": 0.0191650390625, + "learning_rate": 4.6879999999999996e-07, + "loss": 0.0003, + "ppl": 0.00714111328125, + "reward": 0.9806252121925354, + "reward_std": 0.0010896550957113504, + "rewards/perpo_ocr_edit_distance_reward": 0.9806252121925354, "step": 312, "temperature": 0.9 }, { - "advantages": -0.00030225515365600586, - "completion_length": 779.0, - "delta_ref_entropy_loss": 0.01861572265625, - "delta_ref_ppl": -0.0110931396484375, - "entropy_loss": -0.026763916015625, - "epoch": 0.1252, - "grad_norm": 0.30992162587433053, - "k1_kl": 0.0110321044921875, - "k3_kl": 0.00531768798828125, - "kimi_kl": 0.00998687744140625, - "learning_rate": 4.374e-07, - "loss": 0.0005, - "ppl": 0.01239013671875, - "reward": 0.9832404553890228, - "reward_std": 0.00095651630545035, - "rewards/perpo_ocr_edit_distance_reward": 0.98324054479599, + "advantages": 0.0, + "completion_length": 291.0, + "delta_ref_entropy_loss": 0.032470703125, + "delta_ref_ppl": -0.033203125, + "entropy_loss": -0.07568359375, + "epoch": 0.0626, + "grad_norm": 1.7153016841541848, + "k1_kl": 0.033203125, + "k3_kl": 0.0189208984375, + "kimi_kl": 0.0361328125, + "learning_rate": 4.687e-07, + "loss": 0.0008, + "ppl": 0.03759765625, + "reward": 0.9702068567276001, + "reward_std": 0.001278554555028677, + "rewards/perpo_ocr_edit_distance_reward": 0.9702068567276001, "step": 313, "temperature": 0.9 }, { - "advantages": 2.971717354860459e-06, - "completion_length": 645.0, - "delta_ref_entropy_loss": 0.0208740234375, - "delta_ref_ppl": -0.009521484375, - "entropy_loss": -0.016754150390625, - "epoch": 0.1256, - "grad_norm": 0.8607317027535218, - "k1_kl": 0.009521484375, - "k3_kl": 0.0047607421875, - "kimi_kl": 0.00847625732421875, - "learning_rate": 4.3719999999999997e-07, - "loss": 0.0002, - "ppl": 0.0061798095703125, - "reward": 0.9729126691818237, - "reward_std": 0.0029163289509597234, - "rewards/perpo_ocr_edit_distance_reward": 0.9729126691818237, + "advantages": 2.1287374085687816e-09, + "completion_length": 675.0, + "delta_ref_entropy_loss": 0.014404296875, + "delta_ref_ppl": -0.00726318359375, + "entropy_loss": -0.037109375, + "epoch": 0.0628, + "grad_norm": 2.396389375711101, + "k1_kl": 0.007293701171875, + "k3_kl": 0.0027618408203125, + "kimi_kl": 0.005157470703125, + "learning_rate": 4.686e-07, + "loss": 0.0001, + "ppl": 0.01434326171875, + "reward": 0.9739382266998291, + "reward_std": 0.006958744954317808, + "rewards/perpo_ocr_edit_distance_reward": 0.9739382266998291, "step": 314, "temperature": 0.9 }, { - "advantages": -2.196857010972053e-06, - "completion_length": 1242.0, - "delta_ref_entropy_loss": 0.02239990234375, - "delta_ref_ppl": -0.02984619140625, - "entropy_loss": -0.496337890625, - "epoch": 0.126, - "grad_norm": 2.215865084550417, - "k1_kl": 0.02996826171875, - "k3_kl": 0.024658203125, - "kimi_kl": 0.0498046875, - "learning_rate": 4.3699999999999996e-07, - "loss": 0.001, - "ppl": 0.2860107421875, - "reward": 0.6294701546430588, - "reward_std": 0.036582003347575665, - "rewards/perpo_ocr_edit_distance_reward": 0.6294701993465424, + "advantages": -8.105380402412266e-05, + "completion_length": 1191.0, + "delta_ref_entropy_loss": 0.01385498046875, + "delta_ref_ppl": -0.005126953125, + "entropy_loss": -0.0286865234375, + "epoch": 0.063, + "grad_norm": 0.43582237561287684, + "k1_kl": 0.005126953125, + "k3_kl": 0.0022430419921875, + "kimi_kl": 0.0025787353515625, + "learning_rate": 4.685e-07, + "loss": 0.0002, + "ppl": 0.01214599609375, + "reward": 0.9733056426048279, + "reward_std": 0.0003202120424248278, + "rewards/perpo_ocr_edit_distance_reward": 0.9733057022094727, "step": 315, "temperature": 0.9 }, { - "advantages": -1.243182850885205e-06, - "completion_length": 425.0, - "delta_ref_entropy_loss": 0.096923828125, - "delta_ref_ppl": -0.0936279296875, - "entropy_loss": -0.13525390625, - "epoch": 0.1264, - "grad_norm": 4.184896782620395, - "k1_kl": 0.0941162109375, - "k3_kl": 0.073974609375, - "kimi_kl": 0.205322265625, - "learning_rate": 4.368e-07, - "loss": 0.003, - "ppl": 0.06640625, - "reward": 0.8799799978733063, - "reward_std": 0.0039819442899897695, - "rewards/perpo_ocr_edit_distance_reward": 0.8799799978733063, + "advantages": -2.7247838829680404e-07, + "completion_length": 397.0, + "delta_ref_entropy_loss": 0.033203125, + "delta_ref_ppl": -0.0299072265625, + "entropy_loss": -0.052978515625, + "epoch": 0.0632, + "grad_norm": 1.9864692445343028, + "k1_kl": 0.0299072265625, + "k3_kl": 0.01544189453125, + "kimi_kl": 0.0390625, + "learning_rate": 4.684e-07, + "loss": 0.0006, + "ppl": 0.0216064453125, + "reward": 0.9319255352020264, + "reward_std": 0.09320632368326187, + "rewards/perpo_ocr_edit_distance_reward": 0.9319255352020264, "step": 316, "temperature": 0.9 }, { - "advantages": -9.611675028509126e-05, - "completion_length": 491.5, - "delta_ref_entropy_loss": 0.0452880859375, - "delta_ref_ppl": -0.027252197265625, - "entropy_loss": -0.18182373046875, - "epoch": 0.1268, - "grad_norm": 0.7768321554229509, - "k1_kl": 0.0276336669921875, - "k3_kl": 0.013031005859375, - "kimi_kl": 0.0238494873046875, - "learning_rate": 4.366e-07, - "loss": 0.0006, - "ppl": 0.09920501708984375, - "reward": 0.8636569678783417, - "reward_std": 0.07669390023511369, - "rewards/perpo_ocr_edit_distance_reward": 0.8636569976806641, + "advantages": -3.7235873605823144e-05, + "completion_length": 547.0, + "delta_ref_entropy_loss": 0.01324462890625, + "delta_ref_ppl": -0.00439453125, + "entropy_loss": -0.01385498046875, + "epoch": 0.0634, + "grad_norm": 0.8586169368163539, + "k1_kl": 0.004425048828125, + "k3_kl": 0.00145721435546875, + "kimi_kl": 0.00174713134765625, + "learning_rate": 4.683e-07, + "loss": 0.0001, + "ppl": 0.0103759765625, + "reward": 0.977669358253479, + "reward_std": 0.0008145045721903443, + "rewards/perpo_ocr_edit_distance_reward": 0.977669358253479, "step": 317, "temperature": 0.9 }, { - "advantages": -2.289882804973331e-05, - "completion_length": 1199.5, - "delta_ref_entropy_loss": 0.039154052734375, - "delta_ref_ppl": -0.0194244384765625, - "entropy_loss": -0.06689453125, - "epoch": 0.1272, - "grad_norm": 1.2471930814790142, - "k1_kl": 0.0194091796875, - "k3_kl": 0.0081329345703125, - "kimi_kl": 0.0142364501953125, - "learning_rate": 4.364e-07, - "loss": 0.0003, - "ppl": 0.031890869140625, - "reward": 0.6838484555482864, - "reward_std": 0.024970633734483272, - "rewards/perpo_ocr_edit_distance_reward": 0.6838484853506088, + "advantages": -8.821487426757812e-06, + "completion_length": 570.0, + "delta_ref_entropy_loss": 0.044189453125, + "delta_ref_ppl": -0.0247802734375, + "entropy_loss": -0.1630859375, + "epoch": 0.0636, + "grad_norm": 1.9949308490154567, + "k1_kl": 0.0250244140625, + "k3_kl": 0.0145263671875, + "kimi_kl": 0.0196533203125, + "learning_rate": 4.6819999999999997e-07, + "loss": 0.0006, + "ppl": 0.09375, + "reward": 0.7961018681526184, + "reward_std": 0.0037623976822942495, + "rewards/perpo_ocr_edit_distance_reward": 0.7961018681526184, "step": 318, "temperature": 0.9 }, { - "advantages": -9.843281759458478e-06, - "completion_length": 552.5, - "delta_ref_entropy_loss": 0.02935791015625, - "delta_ref_ppl": -0.017303466796875, - "entropy_loss": -0.0306396484375, - "epoch": 0.1276, - "grad_norm": 0.5264459266383973, - "k1_kl": 0.0172119140625, - "k3_kl": 0.0094146728515625, - "kimi_kl": 0.0177001953125, - "learning_rate": 4.3619999999999995e-07, - "loss": 0.0004, - "ppl": 0.0128173828125, - "reward": 0.9848928451538086, - "reward_std": 0.004516394576057792, - "rewards/perpo_ocr_edit_distance_reward": 0.984892874956131, + "advantages": -3.109659519395791e-05, + "completion_length": 343.0, + "delta_ref_entropy_loss": 0.06591796875, + "delta_ref_ppl": -0.05322265625, + "entropy_loss": -0.078125, + "epoch": 0.0638, + "grad_norm": 1.809298425057354, + "k1_kl": 0.05322265625, + "k3_kl": 0.03173828125, + "kimi_kl": 0.0693359375, + "learning_rate": 4.681e-07, + "loss": 0.0013, + "ppl": 0.040283203125, + "reward": 0.9426714777946472, + "reward_std": 0.002089958405122161, + "rewards/perpo_ocr_edit_distance_reward": 0.942671537399292, "step": 319, "temperature": 0.9 }, { - "advantages": -3.962005939683877e-05, - "completion_length": 1261.0, - "delta_ref_entropy_loss": 0.0167236328125, - "delta_ref_ppl": -0.00091552734375, - "entropy_loss": -0.05645751953125, - "epoch": 0.128, - "grad_norm": 24594.76513547605, - "k1_kl": 0.0009002685546875, - "k3_kl": 91.0011978149414, - "kimi_kl": 0.04021453857421875, - "learning_rate": 4.36e-07, - "loss": 3.6427, - "ppl": 0.044281005859375, - "reward": 0.9532942473888397, - "reward_std": 0.04038518897868926, - "rewards/perpo_ocr_edit_distance_reward": 0.9532943665981293, + "advantages": -8.514949634275126e-09, + "completion_length": 941.0, + "delta_ref_entropy_loss": 0.0252685546875, + "delta_ref_ppl": -0.0322265625, + "entropy_loss": -0.10205078125, + "epoch": 0.064, + "grad_norm": 1.0890518592034892, + "k1_kl": 0.0322265625, + "k3_kl": 0.0191650390625, + "kimi_kl": 0.037109375, + "learning_rate": 4.68e-07, + "loss": 0.0008, + "ppl": 0.0498046875, + "reward": 0.798669695854187, + "reward_std": 0.16967986524105072, + "rewards/perpo_ocr_edit_distance_reward": 0.798669695854187, "step": 320, "temperature": 0.9 }, { - "advantages": -0.00033297709160251543, - "completion_length": 411.5, - "delta_ref_entropy_loss": 0.03857421875, - "delta_ref_ppl": -0.02032470703125, - "entropy_loss": -0.043975830078125, - "epoch": 0.1284, - "grad_norm": 1.0004395196822151, - "k1_kl": 0.0203857421875, - "k3_kl": 0.008758544921875, - "kimi_kl": 0.012908935546875, - "learning_rate": 4.358e-07, - "loss": 0.0007, - "ppl": 0.0221710205078125, - "reward": 0.9927371442317963, - "reward_std": 0.00025454984279349446, - "rewards/perpo_ocr_edit_distance_reward": 0.992737203836441, + "advantages": -0.00016777004930190742, + "completion_length": 800.0, + "delta_ref_entropy_loss": 0.0113525390625, + "delta_ref_ppl": -0.00885009765625, + "entropy_loss": -0.0252685546875, + "epoch": 0.0642, + "grad_norm": 0.5497859079252367, + "k1_kl": 0.00885009765625, + "k3_kl": 0.006134033203125, + "kimi_kl": 0.01080322265625, + "learning_rate": 4.6789999999999995e-07, + "loss": 0.0004, + "ppl": 0.01385498046875, + "reward": 0.9875165820121765, + "reward_std": 0.0003568007377907634, + "rewards/perpo_ocr_edit_distance_reward": 0.9875167012214661, "step": 321, "temperature": 0.9 }, { - "advantages": -0.00032408748484158423, - "completion_length": 550.0, - "delta_ref_entropy_loss": 0.056396484375, - "delta_ref_ppl": -0.031585693359375, - "entropy_loss": -0.0257568359375, - "epoch": 0.1288, - "grad_norm": 0.5222694969446897, - "k1_kl": 0.0316162109375, - "k3_kl": 0.0144195556640625, - "kimi_kl": 0.02557373046875, - "learning_rate": 4.3559999999999996e-07, - "loss": 0.0009, - "ppl": 0.00738525390625, - "reward": 0.996869295835495, - "reward_std": 0.0006853101658634841, - "rewards/perpo_ocr_edit_distance_reward": 0.9968693554401398, + "advantages": -3.162452412652783e-05, + "completion_length": 633.0, + "delta_ref_entropy_loss": 0.020751953125, + "delta_ref_ppl": -0.022216796875, + "entropy_loss": -0.036865234375, + "epoch": 0.0644, + "grad_norm": 0.8308092350396762, + "k1_kl": 0.022216796875, + "k3_kl": 0.01409912109375, + "kimi_kl": 0.034423828125, + "learning_rate": 4.678e-07, + "loss": 0.0006, + "ppl": 0.0208740234375, + "reward": 0.8160135746002197, + "reward_std": 0.0028631461318582296, + "rewards/perpo_ocr_edit_distance_reward": 0.8160136342048645, "step": 322, "temperature": 0.9 }, { - "advantages": -4.730906169037041e-05, - "completion_length": 582.5, - "delta_ref_entropy_loss": 0.023681640625, - "delta_ref_ppl": -0.031219482421875, - "entropy_loss": -0.0560302734375, - "epoch": 0.1292, - "grad_norm": 10.329411313338783, - "k1_kl": 0.031280517578125, - "k3_kl": 0.03887939453125, - "kimi_kl": 0.0477294921875, - "learning_rate": 4.354e-07, - "loss": 0.0016, - "ppl": 0.03350830078125, - "reward": 0.9614442586898804, - "reward_std": 0.004805227741599083, - "rewards/perpo_ocr_edit_distance_reward": 0.9614443182945251, + "advantages": 8.634158803033642e-06, + "completion_length": 101.0, + "delta_ref_entropy_loss": 0.02392578125, + "delta_ref_ppl": -0.0194091796875, + "entropy_loss": -0.021484375, + "epoch": 0.0646, + "grad_norm": 4.9699127725163645, + "k1_kl": 0.01953125, + "k3_kl": 0.01318359375, + "kimi_kl": 0.0185546875, + "learning_rate": 4.677e-07, + "loss": 0.0005, + "ppl": 0.0093994140625, + "reward": 0.9644225835800171, + "reward_std": 0.001869525178335607, + "rewards/perpo_ocr_edit_distance_reward": 0.9644225239753723, "step": 323, "temperature": 0.9 }, { - "advantages": -1.9218241244622902e-05, - "completion_length": 796.5, - "delta_ref_entropy_loss": 0.03167724609375, - "delta_ref_ppl": -0.0164794921875, - "entropy_loss": -0.02874755859375, - "epoch": 0.1296, - "grad_norm": 0.40806951837586264, - "k1_kl": 0.0164794921875, - "k3_kl": 0.00726318359375, - "kimi_kl": 0.0139923095703125, - "learning_rate": 4.352e-07, - "loss": 0.0003, - "ppl": 0.0113525390625, - "reward": 0.9892276525497437, - "reward_std": 0.0012664224705076776, - "rewards/perpo_ocr_edit_distance_reward": 0.9892277121543884, + "advantages": -0.0005960464477539062, + "completion_length": 347.0, + "delta_ref_entropy_loss": 0.0284423828125, + "delta_ref_ppl": -0.0242919921875, + "entropy_loss": -0.0157470703125, + "epoch": 0.0648, + "grad_norm": 0.006216941866370149, + "k1_kl": 0.0244140625, + "k3_kl": 0.01165771484375, + "kimi_kl": 0.0208740234375, + "learning_rate": 4.676e-07, + "loss": 0.0011, + "ppl": 0.002777099609375, + "reward": 0.9326177835464478, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9326178431510925, "step": 324, "temperature": 0.9 }, { - "advantages": -7.141275318645057e-05, - "completion_length": 1004.0, - "delta_ref_entropy_loss": 0.02252197265625, - "delta_ref_ppl": -0.01055908203125, - "entropy_loss": -0.02655029296875, - "epoch": 0.13, - "grad_norm": 0.3558421781650099, - "k1_kl": 0.010589599609375, - "k3_kl": 0.00437164306640625, - "kimi_kl": 0.0063018798828125, - "learning_rate": 4.3499999999999996e-07, - "loss": 0.0002, - "ppl": 0.012359619140625, - "reward": 0.9982692301273346, - "reward_std": 0.0006752536428393796, - "rewards/perpo_ocr_edit_distance_reward": 0.998269259929657, - "step": 325, - "temperature": 0.9 + "advantages": -2.1925995952187805e-06, + "completion_length": 500.0, + "delta_ref_entropy_loss": 0.09033203125, + "delta_ref_ppl": -0.072265625, + "entropy_loss": -0.1357421875, + "epoch": 0.065, + "grad_norm": 1.6924035113652003, + "k1_kl": 0.07177734375, + "k3_kl": 0.042236328125, + "kimi_kl": 0.10009765625, + "learning_rate": 4.675e-07, + "loss": 0.0017, + "ppl": 0.06640625, + "reward": 0.9020984768867493, + "reward_std": 0.0038139314856380224, + "rewards/perpo_ocr_edit_distance_reward": 0.9020984768867493, + "step": 325, + "temperature": 0.9 }, { - "advantages": -1.8996852304553613e-05, - "completion_length": 489.5, - "delta_ref_entropy_loss": 0.043212890625, - "delta_ref_ppl": -0.0250244140625, - "entropy_loss": -0.0543212890625, - "epoch": 0.1304, - "grad_norm": 0.5778289830634603, - "k1_kl": 0.025146484375, - "k3_kl": 0.010955810546875, - "kimi_kl": 0.016845703125, - "learning_rate": 4.348e-07, - "loss": 0.0005, - "ppl": 0.026519775390625, - "reward": 0.9734722375869751, - "reward_std": 0.0008096366364043206, - "rewards/perpo_ocr_edit_distance_reward": 0.9734722375869751, + "advantages": -0.0005960464477539062, + "completion_length": 146.0, + "delta_ref_entropy_loss": 0.0240478515625, + "delta_ref_ppl": -0.0230712890625, + "entropy_loss": -0.0130615234375, + "epoch": 0.0652, + "grad_norm": 0.004812844814135917, + "k1_kl": 0.02294921875, + "k3_kl": 0.0113525390625, + "kimi_kl": 0.021240234375, + "learning_rate": 4.6739999999999996e-07, + "loss": 0.001, + "ppl": 0.0020751953125, + "reward": 0.9415855407714844, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9415856003761292, "step": 326, "temperature": 0.9 }, { - "advantages": -3.559248807505355e-05, - "completion_length": 540.5, - "delta_ref_entropy_loss": 0.0418701171875, - "delta_ref_ppl": -0.027618408203125, - "entropy_loss": -0.135955810546875, - "epoch": 0.1308, - "grad_norm": 1.1248273567696268, - "k1_kl": 0.027618408203125, - "k3_kl": 0.01385498046875, - "kimi_kl": 0.0254974365234375, - "learning_rate": 4.346e-07, - "loss": 0.0006, - "ppl": 0.07228851318359375, - "reward": 0.9309137761592865, - "reward_std": 0.005466920556500554, - "rewards/perpo_ocr_edit_distance_reward": 0.930913895368576, + "advantages": -3.4059798053931445e-05, + "completion_length": 1218.0, + "delta_ref_entropy_loss": 0.01434326171875, + "delta_ref_ppl": -0.0089111328125, + "entropy_loss": -0.0291748046875, + "epoch": 0.0654, + "grad_norm": 0.45685988772343245, + "k1_kl": 0.0089111328125, + "k3_kl": 0.00469970703125, + "kimi_kl": 0.0123291015625, + "learning_rate": 4.6729999999999996e-07, + "loss": 0.0002, + "ppl": 0.0130615234375, + "reward": 0.97788405418396, + "reward_std": 0.0009003636660054326, + "rewards/perpo_ocr_edit_distance_reward": 0.9778841137886047, "step": 327, "temperature": 0.9 }, { - "advantages": -4.2489600673434325e-06, - "completion_length": 785.5, - "delta_ref_entropy_loss": 0.0230712890625, - "delta_ref_ppl": -0.0073699951171875, - "entropy_loss": -0.03155517578125, - "epoch": 0.1312, - "grad_norm": 0.5470819958629513, - "k1_kl": 0.007354736328125, - "k3_kl": 0.00376129150390625, - "kimi_kl": 0.0038604736328125, - "learning_rate": 4.3439999999999997e-07, - "loss": 0.0002, - "ppl": 0.01575469970703125, - "reward": 0.9804536402225494, - "reward_std": 0.0009526016656309366, - "rewards/perpo_ocr_edit_distance_reward": 0.9804536700248718, + "advantages": -0.00013892140123061836, + "completion_length": 444.0, + "delta_ref_entropy_loss": 0.01458740234375, + "delta_ref_ppl": -0.0086669921875, + "entropy_loss": -0.023681640625, + "epoch": 0.0656, + "grad_norm": 0.44334621868351304, + "k1_kl": 0.0086669921875, + "k3_kl": 0.006011962890625, + "kimi_kl": 0.010498046875, + "learning_rate": 4.672e-07, + "loss": 0.0004, + "ppl": 0.007598876953125, + "reward": 0.8595132231712341, + "reward_std": 0.0005741544300690293, + "rewards/perpo_ocr_edit_distance_reward": 0.8595132827758789, "step": 328, "temperature": 0.9 }, { - "advantages": -8.802967931842431e-05, - "completion_length": 1263.0, - "delta_ref_entropy_loss": 0.02459716796875, - "delta_ref_ppl": -0.02001953125, - "entropy_loss": -0.101318359375, - "epoch": 0.1316, - "grad_norm": 7.557298239259151, - "k1_kl": 0.02001953125, - "k3_kl": 0.020416259765625, - "kimi_kl": 0.04840087890625, - "learning_rate": 4.3419999999999996e-07, - "loss": 0.0009, - "ppl": 0.066650390625, - "reward": 0.4538957476615906, - "reward_std": 0.0007550326554337516, - "rewards/perpo_ocr_edit_distance_reward": 0.45389582216739655, + "advantages": -0.00010831015970325097, + "completion_length": 693.0, + "delta_ref_entropy_loss": 0.0206298828125, + "delta_ref_ppl": -0.01611328125, + "entropy_loss": -0.01019287109375, + "epoch": 0.0658, + "grad_norm": 0.4973273165510867, + "k1_kl": 0.01611328125, + "k3_kl": 0.0089111328125, + "kimi_kl": 0.02099609375, + "learning_rate": 4.671e-07, + "loss": 0.0005, + "ppl": 0.004058837890625, + "reward": 0.9862247109413147, + "reward_std": 0.0002930171031039208, + "rewards/perpo_ocr_edit_distance_reward": 0.9862248301506042, "step": 329, "temperature": 0.9 }, { - "advantages": -0.00022719587832398247, - "completion_length": 679.0, - "delta_ref_entropy_loss": 0.02532958984375, - "delta_ref_ppl": -0.0318603515625, - "entropy_loss": -0.0223388671875, - "epoch": 0.132, - "grad_norm": 0.67185411862634, - "k1_kl": 0.0318603515625, - "k3_kl": 0.019744873046875, - "kimi_kl": 0.05096435546875, - "learning_rate": 4.34e-07, - "loss": 0.001, - "ppl": 0.009979248046875, - "reward": 0.9978067278862, - "reward_std": 0.0005214148259256035, - "rewards/perpo_ocr_edit_distance_reward": 0.9978068172931671, + "advantages": -2.588544703030493e-06, + "completion_length": 162.0, + "delta_ref_entropy_loss": 0.03173828125, + "delta_ref_ppl": -0.0203857421875, + "entropy_loss": -0.043212890625, + "epoch": 0.066, + "grad_norm": 1.7577106882540343, + "k1_kl": 0.0205078125, + "k3_kl": 0.01068115234375, + "kimi_kl": 0.01611328125, + "learning_rate": 4.67e-07, + "loss": 0.0004, + "ppl": 0.02978515625, + "reward": 0.8890848755836487, + "reward_std": 0.006551342085003853, + "rewards/perpo_ocr_edit_distance_reward": 0.8890848755836487, "step": 330, "temperature": 0.9 }, { - "advantages": -3.951788312406279e-05, - "completion_length": 1234.5, - "delta_ref_entropy_loss": 0.017425537109375, - "delta_ref_ppl": -0.0069580078125, - "entropy_loss": -0.02197265625, - "epoch": 0.1324, - "grad_norm": 0.23118816890074712, - "k1_kl": 0.0069427490234375, - "k3_kl": 0.00237274169921875, - "kimi_kl": 0.003631591796875, - "learning_rate": 4.338e-07, - "loss": 0.0001, - "ppl": 0.00872802734375, - "reward": 0.9984358847141266, - "reward_std": 0.0003588192048482597, - "rewards/perpo_ocr_edit_distance_reward": 0.998435914516449, + "advantages": -0.0001094681938411668, + "completion_length": 1009.0, + "delta_ref_entropy_loss": 0.0172119140625, + "delta_ref_ppl": -0.0091552734375, + "entropy_loss": -0.018798828125, + "epoch": 0.0662, + "grad_norm": 0.46938860720002523, + "k1_kl": 0.0091552734375, + "k3_kl": 0.0033111572265625, + "kimi_kl": 0.005279541015625, + "learning_rate": 4.669e-07, + "loss": 0.0002, + "ppl": 0.00653076171875, + "reward": 0.9733612537384033, + "reward_std": 0.00044453312875702977, + "rewards/perpo_ocr_edit_distance_reward": 0.9733613133430481, "step": 331, "temperature": 0.9 }, { - "advantages": -0.0003148487649013987, - "completion_length": 652.0, - "delta_ref_entropy_loss": 0.013916015625, - "delta_ref_ppl": -0.012115478515625, - "entropy_loss": -0.01422119140625, - "epoch": 0.1328, - "grad_norm": 0.36283573614975106, - "k1_kl": 0.012115478515625, - "k3_kl": 0.006755828857421875, - "kimi_kl": 0.01564788818359375, - "learning_rate": 4.3359999999999997e-07, - "loss": 0.0006, - "ppl": 0.0048980712890625, - "reward": 0.9922923743724823, - "reward_std": 7.6464049925562e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9922924637794495, + "advantages": -4.938671054333099e-07, + "completion_length": 1933.0, + "delta_ref_entropy_loss": 0.029052734375, + "delta_ref_ppl": -0.01300048828125, + "entropy_loss": -0.1396484375, + "epoch": 0.0664, + "grad_norm": 2.102473525124725, + "k1_kl": 0.012939453125, + "k3_kl": 0.01129150390625, + "kimi_kl": 0.01416015625, + "learning_rate": 4.6679999999999997e-07, + "loss": 0.0005, + "ppl": 0.08544921875, + "reward": 0.926608145236969, + "reward_std": 0.03489316254854202, + "rewards/perpo_ocr_edit_distance_reward": 0.9266082048416138, "step": 332, "temperature": 0.9 }, { - "advantages": -6.6629479178459405e-06, - "completion_length": 677.0, - "delta_ref_entropy_loss": 0.02569580078125, - "delta_ref_ppl": -0.017120361328125, - "entropy_loss": -0.0623779296875, - "epoch": 0.1332, - "grad_norm": 0.7025536976931706, - "k1_kl": 0.017059326171875, - "k3_kl": 0.0085296630859375, - "kimi_kl": 0.01922607421875, - "learning_rate": 4.334e-07, - "loss": 0.0003, - "ppl": 0.0392608642578125, - "reward": 0.8444679081439972, - "reward_std": 0.11869276530342177, - "rewards/perpo_ocr_edit_distance_reward": 0.8444678783416748, + "advantages": -1.8051692904919037e-06, + "completion_length": 436.0, + "delta_ref_entropy_loss": 0.07763671875, + "delta_ref_ppl": -0.036865234375, + "entropy_loss": -0.2109375, + "epoch": 0.0666, + "grad_norm": 2.301308107719421, + "k1_kl": 0.036865234375, + "k3_kl": 0.0186767578125, + "kimi_kl": 0.035888671875, + "learning_rate": 4.6669999999999996e-07, + "loss": 0.0007, + "ppl": 0.11083984375, + "reward": 0.9042505621910095, + "reward_std": 0.03823498636484146, + "rewards/perpo_ocr_edit_distance_reward": 0.9042506217956543, "step": 333, "temperature": 0.9 }, { - "advantages": -0.00031113198929233477, - "completion_length": 1219.5, - "delta_ref_entropy_loss": 0.01153564453125, - "delta_ref_ppl": -0.0080108642578125, - "entropy_loss": -0.018524169921875, - "epoch": 0.1336, - "grad_norm": 0.6810512564529999, - "k1_kl": 0.0079803466796875, - "k3_kl": 0.00457763671875, - "kimi_kl": 0.009246826171875, - "learning_rate": 4.3319999999999994e-07, - "loss": 0.0005, - "ppl": 0.009266853332519531, - "reward": 0.9949658811092377, - "reward_std": 0.0007618192466907203, - "rewards/perpo_ocr_edit_distance_reward": 0.9949659407138824, + "advantages": -0.0002283198555232957, + "completion_length": 416.0, + "delta_ref_entropy_loss": 0.0166015625, + "delta_ref_ppl": -0.008056640625, + "entropy_loss": -0.017333984375, + "epoch": 0.0668, + "grad_norm": 0.5027629446280353, + "k1_kl": 0.00811767578125, + "k3_kl": 0.004058837890625, + "kimi_kl": 0.006988525390625, + "learning_rate": 4.666e-07, + "loss": 0.0004, + "ppl": 0.007659912109375, + "reward": 0.9798383116722107, + "reward_std": 0.0003102186310570687, + "rewards/perpo_ocr_edit_distance_reward": 0.9798384308815002, "step": 334, "temperature": 0.9 }, { - "advantages": -1.417739076714497e-05, - "completion_length": 499.5, - "delta_ref_entropy_loss": 0.0374755859375, - "delta_ref_ppl": -0.0185546875, - "entropy_loss": -0.0386962890625, - "epoch": 0.134, - "grad_norm": 1.1521509190552857, - "k1_kl": 0.0185546875, - "k3_kl": 0.0072021484375, - "kimi_kl": 0.0108184814453125, - "learning_rate": 4.3299999999999997e-07, - "loss": 0.0003, - "ppl": 0.017547607421875, - "reward": 0.9875519871711731, - "reward_std": 0.0046280191745609045, - "rewards/perpo_ocr_edit_distance_reward": 0.9875521063804626, + "advantages": -9.082045289687812e-05, + "completion_length": 611.0, + "delta_ref_entropy_loss": 0.0228271484375, + "delta_ref_ppl": -0.0101318359375, + "entropy_loss": -0.0400390625, + "epoch": 0.067, + "grad_norm": 0.8322483187953027, + "k1_kl": 0.01019287109375, + "k3_kl": 0.00396728515625, + "kimi_kl": 0.00634765625, + "learning_rate": 4.665e-07, + "loss": 0.0002, + "ppl": 0.0198974609375, + "reward": 0.9816701412200928, + "reward_std": 0.0006499605369754136, + "rewards/perpo_ocr_edit_distance_reward": 0.9816701412200928, "step": 335, "temperature": 0.9 }, { - "advantages": -4.1378398236702196e-05, - "completion_length": 1281.0, - "delta_ref_entropy_loss": 0.01904296875, - "delta_ref_ppl": -0.008758544921875, - "entropy_loss": -0.05810546875, - "epoch": 0.1344, - "grad_norm": 0.7390392348104182, - "k1_kl": 0.008758544921875, - "k3_kl": 0.005523681640625, - "kimi_kl": 0.00909423828125, - "learning_rate": 4.328e-07, - "loss": 0.0003, - "ppl": 0.0286865234375, - "reward": 0.9684523642063141, - "reward_std": 0.0016408856899943203, - "rewards/perpo_ocr_edit_distance_reward": 0.9684524238109589, + "advantages": -7.56638401071541e-05, + "completion_length": 397.0, + "delta_ref_entropy_loss": 0.04345703125, + "delta_ref_ppl": -0.0194091796875, + "entropy_loss": -0.0247802734375, + "epoch": 0.0672, + "grad_norm": 0.5784793646112949, + "k1_kl": 0.0194091796875, + "k3_kl": 0.01336669921875, + "kimi_kl": 0.0257568359375, + "learning_rate": 4.6639999999999994e-07, + "loss": 0.0006, + "ppl": 0.01483154296875, + "reward": 0.9678988456726074, + "reward_std": 0.0006878364947624505, + "rewards/perpo_ocr_edit_distance_reward": 0.9678989052772522, "step": 336, "temperature": 0.9 }, { - "advantages": -6.414311376801152e-05, - "completion_length": 483.0, - "delta_ref_entropy_loss": 0.023284912109375, - "delta_ref_ppl": -0.080047607421875, - "entropy_loss": -0.126800537109375, - "epoch": 0.1348, - "grad_norm": 1.1707548299612367, - "k1_kl": 0.0800628662109375, - "k3_kl": 0.06420135498046875, - "kimi_kl": 0.34136962890625, - "learning_rate": 4.3259999999999994e-07, - "loss": 0.0026, - "ppl": 0.058074951171875, - "reward": 0.6113853007555008, - "reward_std": 0.03760181233519688, - "rewards/perpo_ocr_edit_distance_reward": 0.6113853603601456, + "advantages": 4.087175966560608e-07, + "completion_length": 298.0, + "delta_ref_entropy_loss": 0.0654296875, + "delta_ref_ppl": -0.072265625, + "entropy_loss": -0.11181640625, + "epoch": 0.0674, + "grad_norm": 2.3223870221236687, + "k1_kl": 0.072265625, + "k3_kl": 0.0478515625, + "kimi_kl": 0.1591796875, + "learning_rate": 4.663e-07, + "loss": 0.0019, + "ppl": 0.053466796875, + "reward": 0.7899038791656494, + "reward_std": 0.03991587087512016, + "rewards/perpo_ocr_edit_distance_reward": 0.7899039387702942, "step": 337, "temperature": 0.9 }, { - "advantages": -0.00032245900001726113, - "completion_length": 355.5, - "delta_ref_entropy_loss": 0.046142578125, - "delta_ref_ppl": -0.037109375, - "entropy_loss": -0.0595703125, - "epoch": 0.1352, - "grad_norm": 0.5447352496990263, - "k1_kl": 0.03717041015625, - "k3_kl": 0.019989013671875, - "kimi_kl": 0.0489501953125, - "learning_rate": 4.324e-07, - "loss": 0.0011, - "ppl": 0.027923583984375, - "reward": 0.9847497940063477, - "reward_std": 0.0005600288859568536, - "rewards/perpo_ocr_edit_distance_reward": 0.98474982380867, + "advantages": -5.817413693876006e-05, + "completion_length": 654.0, + "delta_ref_entropy_loss": 0.03173828125, + "delta_ref_ppl": -0.03271484375, + "entropy_loss": -0.05859375, + "epoch": 0.0676, + "grad_norm": 1.5028333056584708, + "k1_kl": 0.03271484375, + "k3_kl": 0.0223388671875, + "kimi_kl": 0.06494140625, + "learning_rate": 4.662e-07, + "loss": 0.001, + "ppl": 0.0281982421875, + "reward": 0.9608334898948669, + "reward_std": 0.0010710820788517594, + "rewards/perpo_ocr_edit_distance_reward": 0.9608336091041565, "step": 338, "temperature": 0.9 }, { - "advantages": -3.443232685640396e-06, - "completion_length": 721.0, - "delta_ref_entropy_loss": 0.033203125, - "delta_ref_ppl": -0.02557373046875, - "entropy_loss": -0.04345703125, - "epoch": 0.1356, - "grad_norm": 0.7871705509857889, - "k1_kl": 0.02557373046875, - "k3_kl": 0.0164794921875, - "kimi_kl": 0.036865234375, - "learning_rate": 4.3219999999999997e-07, - "loss": 0.0007, - "ppl": 0.02178955078125, - "reward": 0.8846765458583832, - "reward_std": 0.09519649914000183, - "rewards/perpo_ocr_edit_distance_reward": 0.884676605463028, + "advantages": -1.8392290712654358e-06, + "completion_length": 615.0, + "delta_ref_entropy_loss": 0.03369140625, + "delta_ref_ppl": -0.025146484375, + "entropy_loss": -0.07958984375, + "epoch": 0.0678, + "grad_norm": 1.2629959391694578, + "k1_kl": 0.025146484375, + "k3_kl": 0.01544189453125, + "kimi_kl": 0.0302734375, + "learning_rate": 4.6609999999999997e-07, + "loss": 0.0006, + "ppl": 0.04052734375, + "reward": 0.9266980886459351, + "reward_std": 0.004527832847088575, + "rewards/perpo_ocr_edit_distance_reward": 0.9266980886459351, "step": 339, "temperature": 0.9 }, { - "advantages": -7.37948139430955e-05, - "completion_length": 531.5, - "delta_ref_entropy_loss": 0.019683837890625, - "delta_ref_ppl": -0.0108489990234375, - "entropy_loss": -0.011932373046875, - "epoch": 0.136, - "grad_norm": 0.2546980308771171, - "k1_kl": 0.01085662841796875, - "k3_kl": 0.00447845458984375, - "kimi_kl": 0.0067043304443359375, - "learning_rate": 4.3199999999999995e-07, - "loss": 0.0003, - "ppl": 0.003902435302734375, - "reward": 0.998714029788971, - "reward_std": 0.00020961022528354079, - "rewards/perpo_ocr_edit_distance_reward": 0.9987140893936157, + "advantages": -0.00010408674279460683, + "completion_length": 336.0, + "delta_ref_entropy_loss": 0.0322265625, + "delta_ref_ppl": -0.017333984375, + "entropy_loss": -0.034423828125, + "epoch": 0.068, + "grad_norm": 0.970992129935341, + "k1_kl": 0.017333984375, + "k3_kl": 0.00836181640625, + "kimi_kl": 0.0123291015625, + "learning_rate": 4.66e-07, + "loss": 0.0004, + "ppl": 0.01544189453125, + "reward": 0.9380544424057007, + "reward_std": 0.0009633310837671161, + "rewards/perpo_ocr_edit_distance_reward": 0.9380545616149902, "step": 340, "temperature": 0.9 }, { - "advantages": -3.225888622182538e-05, - "completion_length": 521.5, - "delta_ref_entropy_loss": 0.02459716796875, - "delta_ref_ppl": -0.0194091796875, - "entropy_loss": -0.018310546875, - "epoch": 0.1364, - "grad_norm": 0.4716121460846033, - "k1_kl": 0.01934814453125, - "k3_kl": 0.01116943359375, - "kimi_kl": 0.025177001953125, - "learning_rate": 4.318e-07, - "loss": 0.0005, - "ppl": 0.0080108642578125, - "reward": 0.9978277683258057, - "reward_std": 0.0013708840706385672, - "rewards/perpo_ocr_edit_distance_reward": 0.997827798128128, + "advantages": -8.122410508804023e-05, + "completion_length": 1166.0, + "delta_ref_entropy_loss": 0.012451171875, + "delta_ref_ppl": -0.0118408203125, + "entropy_loss": -0.034423828125, + "epoch": 0.0682, + "grad_norm": 0.6472448412404253, + "k1_kl": 0.0118408203125, + "k3_kl": 0.006988525390625, + "kimi_kl": 0.0172119140625, + "learning_rate": 4.6589999999999996e-07, + "loss": 0.0004, + "ppl": 0.01611328125, + "reward": 0.9821191430091858, + "reward_std": 0.0006340371328406036, + "rewards/perpo_ocr_edit_distance_reward": 0.9821192622184753, "step": 341, "temperature": 0.9 }, { - "advantages": -9.09226291696541e-05, - "completion_length": 699.0, - "delta_ref_entropy_loss": 0.0318603515625, - "delta_ref_ppl": -0.01971435546875, - "entropy_loss": -0.03546142578125, - "epoch": 0.1368, - "grad_norm": 0.5701547946141302, - "k1_kl": 0.019775390625, - "k3_kl": 0.008880615234375, - "kimi_kl": 0.01654052734375, - "learning_rate": 4.316e-07, - "loss": 0.0004, - "ppl": 0.015716552734375, - "reward": 0.9892893433570862, - "reward_std": 0.0007676621025893837, - "rewards/perpo_ocr_edit_distance_reward": 0.989289402961731, + "advantages": -7.213865319499746e-05, + "completion_length": 495.0, + "delta_ref_entropy_loss": 0.017822265625, + "delta_ref_ppl": -0.006072998046875, + "entropy_loss": -0.016845703125, + "epoch": 0.0684, + "grad_norm": 0.6272010191329703, + "k1_kl": 0.006072998046875, + "k3_kl": 0.0035858154296875, + "kimi_kl": 0.0064697265625, + "learning_rate": 4.6579999999999995e-07, + "loss": 0.0002, + "ppl": 0.006988525390625, + "reward": 0.9878751635551453, + "reward_std": 0.0007262814906425774, + "rewards/perpo_ocr_edit_distance_reward": 0.98787522315979, "step": 342, "temperature": 0.9 }, { - "advantages": -5.351645813789219e-05, - "completion_length": 479.0, - "delta_ref_entropy_loss": 0.04315185546875, - "delta_ref_ppl": -0.019805908203125, - "entropy_loss": -0.05889892578125, - "epoch": 0.1372, - "grad_norm": 0.6455615104680683, - "k1_kl": 0.019744873046875, - "k3_kl": 0.0092620849609375, - "kimi_kl": 0.014190673828125, - "learning_rate": 4.314e-07, - "loss": 0.0004, - "ppl": 0.0299072265625, - "reward": 0.9863541424274445, - "reward_std": 0.0009837666875682771, - "rewards/perpo_ocr_edit_distance_reward": 0.9863542020320892, + "advantages": -6.342786218738183e-05, + "completion_length": 457.0, + "delta_ref_entropy_loss": 0.035400390625, + "delta_ref_ppl": -0.03759765625, + "entropy_loss": -0.06298828125, + "epoch": 0.0686, + "grad_norm": 0.9708591017868796, + "k1_kl": 0.037841796875, + "k3_kl": 0.021240234375, + "kimi_kl": 0.043212890625, + "learning_rate": 4.657e-07, + "loss": 0.0009, + "ppl": 0.034423828125, + "reward": 0.9623990058898926, + "reward_std": 0.0011082212440669537, + "rewards/perpo_ocr_edit_distance_reward": 0.9623990654945374, "step": 343, "temperature": 0.9 }, { - "advantages": -1.2048653843521606e-05, - "completion_length": 850.0, - "delta_ref_entropy_loss": 0.015594482421875, - "delta_ref_ppl": -0.021331787109375, - "entropy_loss": -0.0367431640625, - "epoch": 0.1376, - "grad_norm": 0.6428805659063953, - "k1_kl": 0.021331787109375, - "k3_kl": 0.01413726806640625, - "kimi_kl": 0.027374267578125, - "learning_rate": 4.312e-07, + "advantages": 8.514949634275126e-09, + "completion_length": 550.0, + "delta_ref_entropy_loss": 0.041748046875, + "delta_ref_ppl": -0.033203125, + "entropy_loss": -0.03173828125, + "epoch": 0.0688, + "grad_norm": 1.4387088292659913, + "k1_kl": 0.033203125, + "k3_kl": 0.01544189453125, + "kimi_kl": 0.02978515625, + "learning_rate": 4.656e-07, "loss": 0.0006, - "ppl": 0.0157470703125, - "reward": 0.9634031653404236, - "reward_std": 0.0009449085046071559, - "rewards/perpo_ocr_edit_distance_reward": 0.9634032547473907, + "ppl": 0.013671875, + "reward": 0.9589172601699829, + "reward_std": 0.0005017620278522372, + "rewards/perpo_ocr_edit_distance_reward": 0.9589172601699829, "step": 344, "temperature": 0.9 }, { - "advantages": -9.558456531522097e-05, - "completion_length": 564.5, - "delta_ref_entropy_loss": 0.04620361328125, - "delta_ref_ppl": -0.0203857421875, - "entropy_loss": -0.04766845703125, - "epoch": 0.138, - "grad_norm": 0.6126479518666609, - "k1_kl": 0.0203857421875, - "k3_kl": 0.0086517333984375, - "kimi_kl": 0.013153076171875, - "learning_rate": 4.31e-07, - "loss": 0.0004, - "ppl": 0.02264404296875, - "reward": 0.936176985502243, - "reward_std": 0.0012763642880599946, - "rewards/perpo_ocr_edit_distance_reward": 0.9361770749092102, + "advantages": -1.7029899268550253e-08, + "completion_length": 495.0, + "delta_ref_entropy_loss": 0.046875, + "delta_ref_ppl": -0.032470703125, + "entropy_loss": -0.062255859375, + "epoch": 0.069, + "grad_norm": 0.9718230283290603, + "k1_kl": 0.032470703125, + "k3_kl": 0.01904296875, + "kimi_kl": 0.035888671875, + "learning_rate": 4.655e-07, + "loss": 0.0008, + "ppl": 0.0296630859375, + "reward": 0.632843017578125, + "reward_std": 0.017206033691763878, + "rewards/perpo_ocr_edit_distance_reward": 0.632843017578125, "step": 345, "temperature": 0.9 }, { - "advantages": -3.7235875161911736e-05, - "completion_length": 180.0, - "delta_ref_entropy_loss": 0.09130859375, - "delta_ref_ppl": -0.12939453125, - "entropy_loss": -0.107421875, - "epoch": 0.1384, - "grad_norm": 5.882733841275777, - "k1_kl": 0.1290283203125, - "k3_kl": 0.0777587890625, - "kimi_kl": 0.1881103515625, - "learning_rate": 4.308e-07, - "loss": 0.0031, - "ppl": 0.0526123046875, - "reward": 0.8490214347839355, - "reward_std": 0.0846224655979313, - "rewards/perpo_ocr_edit_distance_reward": 0.8490214645862579, + "advantages": -6.401538848876953e-05, + "completion_length": 557.0, + "delta_ref_entropy_loss": 0.0458984375, + "delta_ref_ppl": -0.0284423828125, + "entropy_loss": -0.05322265625, + "epoch": 0.0692, + "grad_norm": 0.7666816843931862, + "k1_kl": 0.0284423828125, + "k3_kl": 0.014404296875, + "kimi_kl": 0.02880859375, + "learning_rate": 4.6539999999999997e-07, + "loss": 0.0006, + "ppl": 0.025146484375, + "reward": 0.9696755409240723, + "reward_std": 0.0010969823924824595, + "rewards/perpo_ocr_edit_distance_reward": 0.969675600528717, "step": 346, "temperature": 0.9 }, { - "advantages": -2.7929033876716858e-06, - "completion_length": 239.0, - "delta_ref_entropy_loss": 0.0557861328125, - "delta_ref_ppl": -0.111328125, - "entropy_loss": -0.0953369140625, - "epoch": 0.1388, - "grad_norm": 0.8795423821969325, - "k1_kl": 0.111328125, - "k3_kl": 0.075927734375, - "kimi_kl": 0.216552734375, - "learning_rate": 4.3059999999999995e-07, - "loss": 0.003, - "ppl": 0.041748046875, - "reward": 0.9795485734939575, - "reward_std": 0.001470924005843699, - "rewards/perpo_ocr_edit_distance_reward": 0.9795486032962799, + "advantages": -2.346720066270791e-05, + "completion_length": 402.0, + "delta_ref_entropy_loss": 0.02783203125, + "delta_ref_ppl": -0.0125732421875, + "entropy_loss": -0.01348876953125, + "epoch": 0.0694, + "grad_norm": 0.6069534743888487, + "k1_kl": 0.01263427734375, + "k3_kl": 0.005889892578125, + "kimi_kl": 0.0107421875, + "learning_rate": 4.6529999999999996e-07, + "loss": 0.0003, + "ppl": 0.004241943359375, + "reward": 0.9840024709701538, + "reward_std": 0.0035283740144222975, + "rewards/perpo_ocr_edit_distance_reward": 0.9840025901794434, "step": 347, "temperature": 0.9 }, { - "advantages": -7.785218735989474e-05, - "completion_length": 666.0, - "delta_ref_entropy_loss": 0.02862548828125, - "delta_ref_ppl": -0.014556884765625, - "entropy_loss": -0.0347900390625, - "epoch": 0.1392, - "grad_norm": 0.5983184643084509, - "k1_kl": 0.014556884765625, - "k3_kl": 0.0058441162109375, - "kimi_kl": 0.010101318359375, - "learning_rate": 4.304e-07, - "loss": 0.0003, - "ppl": 0.018585205078125, - "reward": 0.986127644777298, - "reward_std": 0.009734862615005113, - "rewards/perpo_ocr_edit_distance_reward": 0.9861277639865875, + "advantages": -7.774148798489477e-06, + "completion_length": 463.0, + "delta_ref_entropy_loss": 0.016357421875, + "delta_ref_ppl": -0.0167236328125, + "entropy_loss": -0.02587890625, + "epoch": 0.0696, + "grad_norm": 0.8349099460477435, + "k1_kl": 0.0167236328125, + "k3_kl": 0.01031494140625, + "kimi_kl": 0.0181884765625, + "learning_rate": 4.6519999999999996e-07, + "loss": 0.0004, + "ppl": 0.01177978515625, + "reward": 0.9744512438774109, + "reward_std": 0.0009954898850992322, + "rewards/perpo_ocr_edit_distance_reward": 0.9744512438774109, "step": 348, "temperature": 0.9 }, { - "advantages": -6.188665338413557e-05, - "completion_length": 967.5, - "delta_ref_entropy_loss": 0.0185546875, - "delta_ref_ppl": -0.00860595703125, - "entropy_loss": -0.0216064453125, - "epoch": 0.1396, - "grad_norm": 0.6527812873579396, - "k1_kl": 0.008544921875, - "k3_kl": 0.003387451171875, - "kimi_kl": 0.00667572021484375, - "learning_rate": 4.3020000000000003e-07, - "loss": 0.0002, - "ppl": 0.010833740234375, - "reward": 0.9988672435283661, - "reward_std": 0.0006334694116958417, - "rewards/perpo_ocr_edit_distance_reward": 0.9988673031330109, + "advantages": -6.817068788222969e-05, + "completion_length": 620.0, + "delta_ref_entropy_loss": 0.0177001953125, + "delta_ref_ppl": -0.01177978515625, + "entropy_loss": -0.0152587890625, + "epoch": 0.0698, + "grad_norm": 0.4054405642797217, + "k1_kl": 0.01177978515625, + "k3_kl": 0.0057373046875, + "kimi_kl": 0.011474609375, + "learning_rate": 4.651e-07, + "loss": 0.0003, + "ppl": 0.005462646484375, + "reward": 0.9837831854820251, + "reward_std": 0.00027481457800604403, + "rewards/perpo_ocr_edit_distance_reward": 0.9837832450866699, "step": 349, "temperature": 0.9 }, { - "advantages": -8.926221926230937e-05, - "completion_length": 521.5, - "delta_ref_entropy_loss": 0.06536865234375, - "delta_ref_ppl": -0.083892822265625, - "entropy_loss": -0.13323974609375, - "epoch": 0.14, - "grad_norm": 5.609134238455716, - "k1_kl": 0.084381103515625, - "k3_kl": 0.05224609375, - "kimi_kl": 0.2173309326171875, - "learning_rate": 4.2999999999999996e-07, - "loss": 0.0022, - "ppl": 0.0676727294921875, - "reward": 0.7682211697101593, - "reward_std": 0.05361894426459912, - "rewards/perpo_ocr_edit_distance_reward": 0.7682212591171265, + "advantages": -1.6842570403241552e-05, + "completion_length": 672.0, + "delta_ref_entropy_loss": 0.0203857421875, + "delta_ref_ppl": -0.0107421875, + "entropy_loss": -0.040771484375, + "epoch": 0.07, + "grad_norm": 0.7869704788721783, + "k1_kl": 0.0107421875, + "k3_kl": 0.004791259765625, + "kimi_kl": 0.00982666015625, + "learning_rate": 4.65e-07, + "loss": 0.0002, + "ppl": 0.01904296875, + "reward": 0.9645707607269287, + "reward_std": 0.0009110545506700873, + "rewards/perpo_ocr_edit_distance_reward": 0.9645708799362183, "step": 350, "temperature": 0.9 }, { - "advantages": -6.0596639741561376e-05, - "completion_length": 800.5, - "delta_ref_entropy_loss": 0.02178955078125, - "delta_ref_ppl": -0.017974853515625, - "entropy_loss": -0.03692626953125, - "epoch": 0.1404, - "grad_norm": 1.3612923878989993, - "k1_kl": 0.01800537109375, - "k3_kl": 0.021148681640625, - "kimi_kl": 0.030517578125, - "learning_rate": 4.298e-07, - "loss": 0.0009, - "ppl": 0.021331787109375, - "reward": 0.9982765913009644, - "reward_std": 0.0006405531021300703, - "rewards/perpo_ocr_edit_distance_reward": 0.9982767105102539, + "advantages": -1.3879367543268017e-05, + "completion_length": 880.0, + "delta_ref_entropy_loss": 0.023681640625, + "delta_ref_ppl": -0.0174560546875, + "entropy_loss": -0.0380859375, + "epoch": 0.0702, + "grad_norm": 0.6061322342514825, + "k1_kl": 0.0174560546875, + "k3_kl": 0.01434326171875, + "kimi_kl": 0.0311279296875, + "learning_rate": 4.6489999999999993e-07, + "loss": 0.0006, + "ppl": 0.02392578125, + "reward": 0.9311241507530212, + "reward_std": 0.0054257092997431755, + "rewards/perpo_ocr_edit_distance_reward": 0.9311242699623108, "step": 351, "temperature": 0.9 }, { - "advantages": -0.00011063900046792696, - "completion_length": 1021.0, - "delta_ref_entropy_loss": 0.044708251953125, - "delta_ref_ppl": -0.01776123046875, - "entropy_loss": -0.0904541015625, - "epoch": 0.1408, - "grad_norm": 0.8561836866815018, - "k1_kl": 0.0177764892578125, - "k3_kl": 0.008960723876953125, - "kimi_kl": 0.014064788818359375, - "learning_rate": 4.296e-07, - "loss": 0.0005, - "ppl": 0.04925537109375, - "reward": 0.9655544757843018, - "reward_std": 0.005694209263310768, - "rewards/perpo_ocr_edit_distance_reward": 0.9655545651912689, + "advantages": -2.1287374352141342e-07, + "completion_length": 178.0, + "delta_ref_entropy_loss": 0.08740234375, + "delta_ref_ppl": -0.08056640625, + "entropy_loss": -0.140625, + "epoch": 0.0704, + "grad_norm": 3.36658503082193, + "k1_kl": 0.08056640625, + "k3_kl": 0.0458984375, + "kimi_kl": 0.1005859375, + "learning_rate": 4.648e-07, + "loss": 0.0018, + "ppl": 0.07666015625, + "reward": 0.831501841545105, + "reward_std": 0.03919536992907524, + "rewards/perpo_ocr_edit_distance_reward": 0.831501841545105, "step": 352, "temperature": 0.9 }, { - "advantages": -9.91906454146374e-05, - "completion_length": 591.5, - "delta_ref_entropy_loss": 0.036376953125, - "delta_ref_ppl": -0.019775390625, - "entropy_loss": -0.03216552734375, - "epoch": 0.1412, - "grad_norm": 0.6586837741036383, - "k1_kl": 0.019775390625, - "k3_kl": 0.0094146728515625, - "kimi_kl": 0.028778076171875, - "learning_rate": 4.2939999999999997e-07, - "loss": 0.0005, - "ppl": 0.010894775390625, - "reward": 0.6604785323143005, - "reward_std": 0.00023443334794137627, - "rewards/perpo_ocr_edit_distance_reward": 0.6604785621166229, + "advantages": -2.244540701212827e-05, + "completion_length": 528.0, + "delta_ref_entropy_loss": 0.042236328125, + "delta_ref_ppl": -0.03515625, + "entropy_loss": -0.04833984375, + "epoch": 0.0706, + "grad_norm": 0.8927324693388149, + "k1_kl": 0.03515625, + "k3_kl": 0.0184326171875, + "kimi_kl": 0.04052734375, + "learning_rate": 4.6469999999999997e-07, + "loss": 0.0008, + "ppl": 0.0272216796875, + "reward": 0.9750760197639465, + "reward_std": 0.0010382115142419934, + "rewards/perpo_ocr_edit_distance_reward": 0.9750760197639465, "step": 353, "temperature": 0.9 }, { - "advantages": -1.3581344319391064e-05, - "completion_length": 366.5, - "delta_ref_entropy_loss": 0.02886962890625, - "delta_ref_ppl": -0.019775390625, - "entropy_loss": -0.021148681640625, - "epoch": 0.1416, - "grad_norm": 0.5036800315374234, - "k1_kl": 0.01983642578125, - "k3_kl": 0.010345458984375, - "kimi_kl": 0.019378662109375, - "learning_rate": 4.292e-07, - "loss": 0.0004, - "ppl": 0.00830841064453125, - "reward": 0.9991015195846558, - "reward_std": 0.0007335889968089759, - "rewards/perpo_ocr_edit_distance_reward": 0.9991015493869781, + "advantages": -6.897109301462478e-07, + "completion_length": 423.0, + "delta_ref_entropy_loss": 0.080078125, + "delta_ref_ppl": -0.0595703125, + "entropy_loss": -0.1337890625, + "epoch": 0.0708, + "grad_norm": 1.7958507930319914, + "k1_kl": 0.0595703125, + "k3_kl": 0.03857421875, + "kimi_kl": 0.0830078125, + "learning_rate": 4.646e-07, + "loss": 0.0015, + "ppl": 0.0625, + "reward": 0.7725416421890259, + "reward_std": 0.060408852994441986, + "rewards/perpo_ocr_edit_distance_reward": 0.7725417613983154, "step": 354, "temperature": 0.9 }, { - "advantages": 1.7992089397012023e-05, - "completion_length": 798.5, - "delta_ref_entropy_loss": 0.03887939453125, - "delta_ref_ppl": -0.04193115234375, - "entropy_loss": -0.0570068359375, - "epoch": 0.142, - "grad_norm": 0.885441153354592, - "k1_kl": 0.0419158935546875, - "k3_kl": 0.029388427734375, - "kimi_kl": 0.1182708740234375, - "learning_rate": 4.29e-07, - "loss": 0.0012, - "ppl": 0.02880859375, - "reward": 0.953163355588913, - "reward_std": 0.0026471267046872526, - "rewards/perpo_ocr_edit_distance_reward": 0.9531633853912354, + "advantages": -1.819644785427954e-05, + "completion_length": 459.0, + "delta_ref_entropy_loss": 0.060791015625, + "delta_ref_ppl": -0.06396484375, + "entropy_loss": -0.10693359375, + "epoch": 0.071, + "grad_norm": 1.4942619675245887, + "k1_kl": 0.06396484375, + "k3_kl": 0.042724609375, + "kimi_kl": 0.1357421875, + "learning_rate": 4.645e-07, + "loss": 0.0017, + "ppl": 0.058349609375, + "reward": 0.9526141881942749, + "reward_std": 0.0022401621099561453, + "rewards/perpo_ocr_edit_distance_reward": 0.9526143074035645, "step": 355, "temperature": 0.9 }, { - "advantages": -3.0270645083874115e-06, - "completion_length": 293.5, - "delta_ref_entropy_loss": 0.068115234375, - "delta_ref_ppl": -0.110107421875, - "entropy_loss": -0.140380859375, - "epoch": 0.1424, - "grad_norm": 4.746379171827118, - "k1_kl": 0.11016845703125, - "k3_kl": 0.065673828125, - "kimi_kl": 0.15960693359375, - "learning_rate": 4.288e-07, - "loss": 0.0026, - "ppl": 0.059967041015625, - "reward": 0.9291689097881317, - "reward_std": 0.011241161730140448, - "rewards/perpo_ocr_edit_distance_reward": 0.9291689991950989, + "advantages": 0.0, + "completion_length": 14.0, + "delta_ref_entropy_loss": 0.02734375, + "delta_ref_ppl": -0.263671875, + "entropy_loss": -0.1826171875, + "epoch": 0.0712, + "grad_norm": 0.11525394529779238, + "k1_kl": 0.263671875, + "k3_kl": 0.1787109375, + "kimi_kl": 0.431640625, + "learning_rate": 4.6439999999999995e-07, + "loss": 0.0072, + "ppl": 0.036376953125, + "reward": 0.09395972639322281, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.0939597338438034, "step": 356, "temperature": 0.9 }, { - "advantages": -0.0002980828285217285, - "completion_length": 387.5, - "delta_ref_entropy_loss": 0.0472412109375, - "delta_ref_ppl": -0.03436279296875, - "entropy_loss": -0.0264892578125, - "epoch": 0.1428, - "grad_norm": 0.7586387692576178, - "k1_kl": 0.03436279296875, - "k3_kl": 0.016082763671875, - "kimi_kl": 0.0341796875, - "learning_rate": 4.2859999999999996e-07, - "loss": 0.0009, - "ppl": 0.00750732421875, - "reward": 0.9565792679786682, - "reward_std": 0.07028304040431976, - "rewards/perpo_ocr_edit_distance_reward": 0.956579327583313, + "advantages": -2.5033950805664062e-06, + "completion_length": 585.0, + "delta_ref_entropy_loss": 0.05419921875, + "delta_ref_ppl": -0.0299072265625, + "entropy_loss": -0.150390625, + "epoch": 0.0714, + "grad_norm": 1.5306055336172133, + "k1_kl": 0.02978515625, + "k3_kl": 0.01904296875, + "kimi_kl": 0.02978515625, + "learning_rate": 4.643e-07, + "loss": 0.0008, + "ppl": 0.080078125, + "reward": 0.8151390552520752, + "reward_std": 0.006699356250464916, + "rewards/perpo_ocr_edit_distance_reward": 0.81513911485672, "step": 357, "temperature": 0.9 }, { - "advantages": -3.1411649355383986e-05, - "completion_length": 279.5, - "delta_ref_entropy_loss": 0.0550537109375, - "delta_ref_ppl": -0.0416259765625, - "entropy_loss": -0.0625, - "epoch": 0.1432, - "grad_norm": 1.0348618507911524, - "k1_kl": 0.041748046875, - "k3_kl": 0.019683837890625, - "kimi_kl": 0.034912109375, - "learning_rate": 4.284e-07, - "loss": 0.0008, - "ppl": 0.03363037109375, - "reward": 0.991060197353363, - "reward_std": 0.0014541984419338405, - "rewards/perpo_ocr_edit_distance_reward": 0.9910602867603302, + "advantages": -2.5919505787896924e-05, + "completion_length": 192.0, + "delta_ref_entropy_loss": 0.09716796875, + "delta_ref_ppl": -0.08251953125, + "entropy_loss": -0.09130859375, + "epoch": 0.0716, + "grad_norm": 1.4445029594311203, + "k1_kl": 0.08251953125, + "k3_kl": 0.049560546875, + "kimi_kl": 0.09228515625, + "learning_rate": 4.642e-07, + "loss": 0.002, + "ppl": 0.03955078125, + "reward": 0.9613186120986938, + "reward_std": 0.001869239378720522, + "rewards/perpo_ocr_edit_distance_reward": 0.9613187313079834, "step": 358, "temperature": 0.9 }, { - "advantages": -5.93492002032292e-06, - "completion_length": 324.5, - "delta_ref_entropy_loss": 0.05914306640625, - "delta_ref_ppl": -0.07110595703125, - "entropy_loss": -0.1097412109375, - "epoch": 0.1436, - "grad_norm": 3.7472722790497635, - "k1_kl": 0.07135009765625, - "k3_kl": 0.038970947265625, - "kimi_kl": 0.07208251953125, - "learning_rate": 4.282e-07, - "loss": 0.0016, - "ppl": 0.0755767822265625, - "reward": 0.9786360561847687, - "reward_std": 0.00377067866793368, - "rewards/perpo_ocr_edit_distance_reward": 0.9786361157894135, + "advantages": -0.00014298729365691543, + "completion_length": 435.0, + "delta_ref_entropy_loss": 0.018310546875, + "delta_ref_ppl": -0.01043701171875, + "entropy_loss": -0.007171630859375, + "epoch": 0.0718, + "grad_norm": 0.5087382234728676, + "k1_kl": 0.010498046875, + "k3_kl": 0.006195068359375, + "kimi_kl": 0.013427734375, + "learning_rate": 4.641e-07, + "loss": 0.0004, + "ppl": 0.0023193359375, + "reward": 0.9802615642547607, + "reward_std": 0.00019771509687416255, + "rewards/perpo_ocr_edit_distance_reward": 0.9802616834640503, "step": 359, "temperature": 0.9 }, { - "advantages": -4.036086011183215e-06, - "completion_length": 529.5, - "delta_ref_entropy_loss": 0.06201171875, - "delta_ref_ppl": -0.03143310546875, - "entropy_loss": -0.11712646484375, - "epoch": 0.144, - "grad_norm": 1.095065038509751, - "k1_kl": 0.03143310546875, - "k3_kl": 0.0161285400390625, - "kimi_kl": 0.04052734375, - "learning_rate": 4.2799999999999997e-07, - "loss": 0.0006, - "ppl": 0.064300537109375, - "reward": 0.957771360874176, - "reward_std": 0.004158557392656803, - "rewards/perpo_ocr_edit_distance_reward": 0.9577713906764984, + "advantages": -1.8085753254126757e-05, + "completion_length": 987.0, + "delta_ref_entropy_loss": 0.008544921875, + "delta_ref_ppl": -0.009521484375, + "entropy_loss": -0.01190185546875, + "epoch": 0.072, + "grad_norm": 0.594313649871122, + "k1_kl": 0.00946044921875, + "k3_kl": 0.006256103515625, + "kimi_kl": 0.0145263671875, + "learning_rate": 4.64e-07, + "loss": 0.0003, + "ppl": 0.0048828125, + "reward": 0.9713489413261414, + "reward_std": 0.0008422419195994735, + "rewards/perpo_ocr_edit_distance_reward": 0.9713490009307861, "step": 360, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 576.5, - "delta_ref_entropy_loss": 0.0457763671875, - "delta_ref_ppl": -0.042449951171875, - "entropy_loss": -0.03216552734375, - "epoch": 0.1444, - "grad_norm": 0.24494955525753823, - "k1_kl": 0.04248046875, - "k3_kl": 0.02447509765625, - "kimi_kl": 0.0533447265625, - "learning_rate": 4.278e-07, - "loss": 0.0013, - "ppl": 0.01226806640625, - "reward": 0.8177970051765442, - "reward_std": 0.005106570664793253, - "rewards/perpo_ocr_edit_distance_reward": 0.817797064781189, + "advantages": -6.215913117557648e-07, + "completion_length": 781.0, + "delta_ref_entropy_loss": 0.0257568359375, + "delta_ref_ppl": -0.0166015625, + "entropy_loss": -0.041015625, + "epoch": 0.0722, + "grad_norm": 0.8795920148941162, + "k1_kl": 0.0166015625, + "k3_kl": 0.0113525390625, + "kimi_kl": 0.0247802734375, + "learning_rate": 4.6389999999999997e-07, + "loss": 0.0005, + "ppl": 0.021484375, + "reward": 0.839087188243866, + "reward_std": 0.0807306095957756, + "rewards/perpo_ocr_edit_distance_reward": 0.8390872478485107, "step": 361, "temperature": 0.9 }, { - "advantages": -9.093540370486153e-05, - "completion_length": 288.5, - "delta_ref_entropy_loss": 0.059326171875, - "delta_ref_ppl": -0.0654296875, - "entropy_loss": -0.0955810546875, - "epoch": 0.1448, - "grad_norm": 1.4107565593176132, - "k1_kl": 0.0654296875, - "k3_kl": 0.0416259765625, - "kimi_kl": 0.129150390625, - "learning_rate": 4.2759999999999994e-07, - "loss": 0.0018, - "ppl": 0.04656982421875, - "reward": 0.9435113072395325, - "reward_std": 0.006486251804744825, - "rewards/perpo_ocr_edit_distance_reward": 0.9435113966464996, + "advantages": -0.0002717461029533297, + "completion_length": 332.0, + "delta_ref_entropy_loss": 0.035888671875, + "delta_ref_ppl": -0.037841796875, + "entropy_loss": -0.02197265625, + "epoch": 0.0724, + "grad_norm": 0.8209012093315153, + "k1_kl": 0.03759765625, + "k3_kl": 0.022216796875, + "kimi_kl": 0.046142578125, + "learning_rate": 4.6379999999999996e-07, + "loss": 0.0012, + "ppl": 0.01190185546875, + "reward": 0.9821520447731018, + "reward_std": 0.0002759834169410169, + "rewards/perpo_ocr_edit_distance_reward": 0.9821521639823914, "step": 362, "temperature": 0.9 }, { - "advantages": -2.282857985846931e-05, - "completion_length": 605.0, - "delta_ref_entropy_loss": 0.025421142578125, - "delta_ref_ppl": -0.014923095703125, - "entropy_loss": -0.02508544921875, - "epoch": 0.1452, - "grad_norm": 0.6651799914811717, - "k1_kl": 0.014923095703125, - "k3_kl": 0.0076141357421875, - "kimi_kl": 0.0140380859375, - "learning_rate": 4.274e-07, - "loss": 0.0003, - "ppl": 0.0102386474609375, - "reward": 0.9967541992664337, - "reward_std": 0.0008627987117506564, - "rewards/perpo_ocr_edit_distance_reward": 0.9967542290687561, + "advantages": 1.7029899268550253e-08, + "completion_length": 655.0, + "delta_ref_entropy_loss": 0.0201416015625, + "delta_ref_ppl": -0.01806640625, + "entropy_loss": -0.0177001953125, + "epoch": 0.0726, + "grad_norm": 0.3538900696838897, + "k1_kl": 0.01806640625, + "k3_kl": 0.01116943359375, + "kimi_kl": 0.0228271484375, + "learning_rate": 4.637e-07, + "loss": 0.0004, + "ppl": 0.008056640625, + "reward": 0.91062331199646, + "reward_std": 0.1840764284133911, + "rewards/perpo_ocr_edit_distance_reward": 0.9106233716011047, "step": 363, "temperature": 0.9 }, { - "advantages": -8.97475729288999e-06, - "completion_length": 566.0, - "delta_ref_entropy_loss": 0.0279541015625, - "delta_ref_ppl": -0.013702392578125, - "entropy_loss": -0.0196533203125, - "epoch": 0.1456, - "grad_norm": 0.4012836219841366, - "k1_kl": 0.01373291015625, - "k3_kl": 0.00585174560546875, - "kimi_kl": 0.00927734375, - "learning_rate": 4.272e-07, + "advantages": -4.938671054333099e-07, + "completion_length": 1320.0, + "delta_ref_entropy_loss": 0.0291748046875, + "delta_ref_ppl": -0.01104736328125, + "entropy_loss": -0.050048828125, + "epoch": 0.0728, + "grad_norm": 1.2162253047327047, + "k1_kl": 0.01104736328125, + "k3_kl": 0.0052490234375, + "kimi_kl": 0.00653076171875, + "learning_rate": 4.636e-07, "loss": 0.0002, - "ppl": 0.007720947265625, - "reward": 0.9975687861442566, - "reward_std": 0.002555995713919401, - "rewards/perpo_ocr_edit_distance_reward": 0.9975688457489014, + "ppl": 0.0250244140625, + "reward": 0.8971972465515137, + "reward_std": 0.11818893253803253, + "rewards/perpo_ocr_edit_distance_reward": 0.8971973061561584, "step": 364, "temperature": 0.9 }, { - "advantages": -7.938487505043668e-05, - "completion_length": 570.5, - "delta_ref_entropy_loss": 0.0272216796875, - "delta_ref_ppl": -0.02691650390625, - "entropy_loss": -0.033935546875, - "epoch": 0.146, - "grad_norm": 5.695529338854812, - "k1_kl": 0.02691650390625, - "k3_kl": 0.014617919921875, - "kimi_kl": 0.04095458984375, - "learning_rate": 4.2699999999999995e-07, - "loss": 0.0007, - "ppl": 0.0178680419921875, - "reward": 0.9790785908699036, - "reward_std": 0.020534135721391067, - "rewards/perpo_ocr_edit_distance_reward": 0.9790786504745483, + "advantages": 5.2962986956117675e-06, + "completion_length": 449.0, + "delta_ref_entropy_loss": 0.040771484375, + "delta_ref_ppl": -0.041259765625, + "entropy_loss": -0.052490234375, + "epoch": 0.073, + "grad_norm": 1.0935152999823254, + "k1_kl": 0.04150390625, + "k3_kl": 0.02490234375, + "kimi_kl": 0.056884765625, + "learning_rate": 4.635e-07, + "loss": 0.001, + "ppl": 0.0223388671875, + "reward": 0.9337871670722961, + "reward_std": 0.0015116935828700662, + "rewards/perpo_ocr_edit_distance_reward": 0.9337872266769409, "step": 365, "temperature": 0.9 }, { - "advantages": -5.705016405954666e-07, - "completion_length": 346.5, - "delta_ref_entropy_loss": 0.0982666015625, - "delta_ref_ppl": -0.12762451171875, - "entropy_loss": -0.0872802734375, - "epoch": 0.1464, - "grad_norm": 0.5619555322962595, - "k1_kl": 0.12811279296875, - "k3_kl": 0.084991455078125, - "kimi_kl": 0.19732666015625, - "learning_rate": 4.268e-07, - "loss": 0.0034, - "ppl": 0.03924560546875, - "reward": 0.5654055774211884, - "reward_std": 0.0037303082644939423, - "rewards/perpo_ocr_edit_distance_reward": 0.5654056072235107, + "advantages": -1.7029899268550253e-08, + "completion_length": 607.0, + "delta_ref_entropy_loss": 0.01007080078125, + "delta_ref_ppl": -0.010986328125, + "entropy_loss": -0.017822265625, + "epoch": 0.0732, + "grad_norm": 0.7365994790450126, + "k1_kl": 0.010986328125, + "k3_kl": 0.00811767578125, + "kimi_kl": 0.0186767578125, + "learning_rate": 4.634e-07, + "loss": 0.0003, + "ppl": 0.007568359375, + "reward": 0.9747307896614075, + "reward_std": 0.0005064686993137002, + "rewards/perpo_ocr_edit_distance_reward": 0.9747307896614075, "step": 366, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 467.5, - "delta_ref_entropy_loss": 0.0623779296875, - "delta_ref_ppl": -0.0528564453125, - "entropy_loss": -0.027496337890625, - "epoch": 0.1468, - "grad_norm": 0.025437045581856447, - "k1_kl": 0.0526123046875, - "k3_kl": 0.027801513671875, - "kimi_kl": 0.06256103515625, - "learning_rate": 4.2659999999999997e-07, - "loss": 0.0014, - "ppl": 0.0068359375, - "reward": 0.9815984964370728, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9815985560417175, + "advantages": -4.2038307583425194e-05, + "completion_length": 287.0, + "delta_ref_entropy_loss": 0.0208740234375, + "delta_ref_ppl": -0.024169921875, + "entropy_loss": -0.034912109375, + "epoch": 0.0734, + "grad_norm": 1.5435788069859881, + "k1_kl": 0.024169921875, + "k3_kl": 0.0159912109375, + "kimi_kl": 0.036376953125, + "learning_rate": 4.6329999999999997e-07, + "loss": 0.0007, + "ppl": 0.0162353515625, + "reward": 0.9738151431083679, + "reward_std": 0.0009127480443567038, + "rewards/perpo_ocr_edit_distance_reward": 0.9738150835037231, "step": 367, "temperature": 0.9 }, { - "advantages": -0.0001502164772659853, - "completion_length": 839.5, - "delta_ref_entropy_loss": 0.06341552734375, - "delta_ref_ppl": -0.03271484375, - "entropy_loss": -0.14996337890625, - "epoch": 0.1472, - "grad_norm": 1.3835621147592934, - "k1_kl": 0.03265380859375, - "k3_kl": 0.0189208984375, - "kimi_kl": 0.0484619140625, - "learning_rate": 4.264e-07, - "loss": 0.0009, - "ppl": 0.086395263671875, - "reward": 0.8982080221176147, - "reward_std": 0.01675136417907197, - "rewards/perpo_ocr_edit_distance_reward": 0.8982080817222595, + "advantages": -1.4679772903036792e-05, + "completion_length": 734.0, + "delta_ref_entropy_loss": 0.0341796875, + "delta_ref_ppl": -0.0250244140625, + "entropy_loss": -0.038818359375, + "epoch": 0.0736, + "grad_norm": 0.7683194014197563, + "k1_kl": 0.0250244140625, + "k3_kl": 0.0123291015625, + "kimi_kl": 0.022216796875, + "learning_rate": 4.6319999999999997e-07, + "loss": 0.0005, + "ppl": 0.01806640625, + "reward": 0.8376683592796326, + "reward_std": 0.0010576305212453008, + "rewards/perpo_ocr_edit_distance_reward": 0.8376683592796326, "step": 368, "temperature": 0.9 }, { - "advantages": -7.369688972858057e-06, - "completion_length": 384.5, - "delta_ref_entropy_loss": 0.0648193359375, - "delta_ref_ppl": -0.02972412109375, - "entropy_loss": -0.238525390625, - "epoch": 0.1476, - "grad_norm": 2.5564963821046924, - "k1_kl": 0.02984619140625, - "k3_kl": 0.012939453125, - "kimi_kl": 0.01751708984375, - "learning_rate": 4.262e-07, + "advantages": -1.611028528714087e-05, + "completion_length": 413.0, + "delta_ref_entropy_loss": 0.036865234375, + "delta_ref_ppl": -0.0216064453125, + "entropy_loss": -0.0189208984375, + "epoch": 0.0738, + "grad_norm": 0.5740033368564337, + "k1_kl": 0.0216064453125, + "k3_kl": 0.01141357421875, + "kimi_kl": 0.029052734375, + "learning_rate": 4.631e-07, "loss": 0.0005, - "ppl": 0.14208984375, - "reward": 0.7197246998548508, - "reward_std": 0.007815061486326158, - "rewards/perpo_ocr_edit_distance_reward": 0.7197247445583344, + "ppl": 0.007110595703125, + "reward": 0.9871824383735657, + "reward_std": 0.00042785468394868076, + "rewards/perpo_ocr_edit_distance_reward": 0.9871824979782104, "step": 369, "temperature": 0.9 }, { - "advantages": -1.1452607111550606e-06, - "completion_length": 513.0, - "delta_ref_entropy_loss": 0.022308349609375, - "delta_ref_ppl": -0.03157806396484375, - "entropy_loss": -0.056396484375, - "epoch": 0.148, - "grad_norm": 1.6727269577054702, - "k1_kl": 0.031585693359375, - "k3_kl": 0.01840972900390625, - "kimi_kl": 0.043548583984375, - "learning_rate": 4.26e-07, + "advantages": -3.5592489439295605e-05, + "completion_length": 748.0, + "delta_ref_entropy_loss": 0.018798828125, + "delta_ref_ppl": -0.02392578125, + "entropy_loss": -0.0238037109375, + "epoch": 0.074, + "grad_norm": 0.582255910515967, + "k1_kl": 0.02392578125, + "k3_kl": 0.0167236328125, + "kimi_kl": 0.048583984375, + "learning_rate": 4.63e-07, "loss": 0.0007, - "ppl": 0.03110504150390625, - "reward": 0.7210165858268738, - "reward_std": 0.021837297827005386, - "rewards/perpo_ocr_edit_distance_reward": 0.7210166603326797, + "ppl": 0.01177978515625, + "reward": 0.9851158857345581, + "reward_std": 0.0006173787987791002, + "rewards/perpo_ocr_edit_distance_reward": 0.9851158857345581, "step": 370, "temperature": 0.9 }, { - "advantages": -3.533704010294514e-06, - "completion_length": 715.5, - "delta_ref_entropy_loss": 0.03466796875, - "delta_ref_ppl": -0.03045654296875, - "entropy_loss": -0.04534912109375, - "epoch": 0.1484, - "grad_norm": 1.5621767596739473, - "k1_kl": 0.03045654296875, - "k3_kl": 0.01953125, - "kimi_kl": 0.0628662109375, - "learning_rate": 4.258e-07, - "loss": 0.0008, - "ppl": 0.025421142578125, - "reward": 0.9125703573226929, - "reward_std": 0.13454571564216167, - "rewards/perpo_ocr_edit_distance_reward": 0.9125704169273376, + "advantages": -1.691068973741494e-05, + "completion_length": 409.0, + "delta_ref_entropy_loss": 0.045654296875, + "delta_ref_ppl": -0.044921875, + "entropy_loss": -0.111328125, + "epoch": 0.0742, + "grad_norm": 1.4883938159492545, + "k1_kl": 0.044921875, + "k3_kl": 0.026611328125, + "kimi_kl": 0.0458984375, + "learning_rate": 4.6289999999999994e-07, + "loss": 0.0011, + "ppl": 0.06640625, + "reward": 0.9134544730186462, + "reward_std": 0.002922044601291418, + "rewards/perpo_ocr_edit_distance_reward": 0.913454532623291, "step": 371, "temperature": 0.9 }, { - "advantages": -0.00035899877912015654, - "completion_length": 300.5, - "delta_ref_entropy_loss": 0.0323486328125, - "delta_ref_ppl": -0.01690673828125, - "entropy_loss": -0.0115203857421875, - "epoch": 0.1488, - "grad_norm": 0.5132331364762296, - "k1_kl": 0.01690673828125, - "k3_kl": 0.0082244873046875, - "kimi_kl": 0.013824462890625, - "learning_rate": 4.2559999999999995e-07, - "loss": 0.0007, - "ppl": 0.0045166015625, - "reward": 0.9968791007995605, - "reward_std": 0.0002991225919686258, - "rewards/perpo_ocr_edit_distance_reward": 0.9968791902065277, + "advantages": 3.2356808787881164e-07, + "completion_length": 202.0, + "delta_ref_entropy_loss": 0.030517578125, + "delta_ref_ppl": -0.051513671875, + "entropy_loss": -0.208984375, + "epoch": 0.0744, + "grad_norm": 2.0176409609074804, + "k1_kl": 0.051513671875, + "k3_kl": 0.033203125, + "kimi_kl": 0.07861328125, + "learning_rate": 4.628e-07, + "loss": 0.0013, + "ppl": 0.1064453125, + "reward": 0.5688599944114685, + "reward_std": 0.03903498128056526, + "rewards/perpo_ocr_edit_distance_reward": 0.5688599944114685, "step": 372, "temperature": 0.9 }, { - "advantages": -5.621995452642636e-05, - "completion_length": 662.0, - "delta_ref_entropy_loss": 0.015045166015625, - "delta_ref_ppl": -0.022796630859375, - "entropy_loss": -0.044189453125, - "epoch": 0.1492, - "grad_norm": 1.158682918964072, - "k1_kl": 0.022796630859375, - "k3_kl": 0.0157318115234375, - "kimi_kl": 0.05242919921875, - "learning_rate": 4.254e-07, - "loss": 0.0007, - "ppl": 0.0198211669921875, - "reward": 0.7819819152355194, - "reward_std": 0.04129365465632873, - "rewards/perpo_ocr_edit_distance_reward": 0.7819819450378418, + "advantages": -4.311970405979082e-05, + "completion_length": 138.0, + "delta_ref_entropy_loss": 0.057861328125, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.08642578125, + "epoch": 0.0746, + "grad_norm": 2.692481877234277, + "k1_kl": 0.06640625, + "k3_kl": 0.036865234375, + "kimi_kl": 0.09130859375, + "learning_rate": 4.627e-07, + "loss": 0.0015, + "ppl": 0.0478515625, + "reward": 0.9266667366027832, + "reward_std": 0.0012829964980483055, + "rewards/perpo_ocr_edit_distance_reward": 0.9266667366027832, "step": 373, "temperature": 0.9 }, { - "advantages": -9.801984197110869e-05, - "completion_length": 899.5, - "delta_ref_entropy_loss": 0.011474609375, - "delta_ref_ppl": -0.0051174163818359375, - "entropy_loss": -0.0177001953125, - "epoch": 0.1496, - "grad_norm": 33714.704669184204, - "k1_kl": 0.0051136016845703125, - "k3_kl": 740.0023956298828, - "kimi_kl": 0.0179595947265625, - "learning_rate": 4.252e-07, - "loss": 29.5341, - "ppl": 0.0152587890625, - "reward": 0.9977097511291504, - "reward_std": 0.0006405595631804317, - "rewards/perpo_ocr_edit_distance_reward": 0.9977098107337952, + "advantages": -4.138265467190649e-06, + "completion_length": 1386.0, + "delta_ref_entropy_loss": 0.0111083984375, + "delta_ref_ppl": -0.0057373046875, + "entropy_loss": -0.02880859375, + "epoch": 0.0748, + "grad_norm": 0.4299122241838666, + "k1_kl": 0.005767822265625, + "k3_kl": 0.003082275390625, + "kimi_kl": 0.006591796875, + "learning_rate": 4.6259999999999997e-07, + "loss": 0.0001, + "ppl": 0.0146484375, + "reward": 0.9657732844352722, + "reward_std": 0.006089001893997192, + "rewards/perpo_ocr_edit_distance_reward": 0.965773344039917, "step": 374, "temperature": 0.9 }, { - "advantages": -2.2206988489870128e-05, - "completion_length": 660.5, - "delta_ref_entropy_loss": 0.05560302734375, - "delta_ref_ppl": -0.03948974609375, - "entropy_loss": -0.0791015625, - "epoch": 0.15, - "grad_norm": 0.9156043661762396, - "k1_kl": 0.03948974609375, - "k3_kl": 0.0218505859375, - "kimi_kl": 0.05511474609375, - "learning_rate": 4.2499999999999995e-07, - "loss": 0.0009, - "ppl": 0.037933349609375, - "reward": 0.8838109076023102, - "reward_std": 0.013928488129749894, - "rewards/perpo_ocr_edit_distance_reward": 0.8838109970092773, + "advantages": -1.5280076695489697e-05, + "completion_length": 520.0, + "delta_ref_entropy_loss": 0.0194091796875, + "delta_ref_ppl": -0.02392578125, + "entropy_loss": -0.035400390625, + "epoch": 0.075, + "grad_norm": 0.7698823420077766, + "k1_kl": 0.02392578125, + "k3_kl": 0.0167236328125, + "kimi_kl": 0.037353515625, + "learning_rate": 4.625e-07, + "loss": 0.0007, + "ppl": 0.019775390625, + "reward": 0.9741180539131165, + "reward_std": 0.0010145696578547359, + "rewards/perpo_ocr_edit_distance_reward": 0.9741181135177612, "step": 375, "temperature": 0.9 }, { - "advantages": -4.163810558566183e-06, - "completion_length": 678.5, - "delta_ref_entropy_loss": 0.06103515625, - "delta_ref_ppl": -0.0406494140625, - "entropy_loss": -0.1156005859375, - "epoch": 0.1504, - "grad_norm": 0.8291408660127758, - "k1_kl": 0.040771484375, - "k3_kl": 0.02496337890625, - "kimi_kl": 0.04736328125, - "learning_rate": 4.248e-07, - "loss": 0.001, - "ppl": 0.06280517578125, - "reward": 0.7873359620571136, - "reward_std": 0.03367983899079263, - "rewards/perpo_ocr_edit_distance_reward": 0.7873359620571136, + "advantages": -0.00026938747032545507, + "completion_length": 554.0, + "delta_ref_entropy_loss": 0.01177978515625, + "delta_ref_ppl": -0.01385498046875, + "entropy_loss": -0.0091552734375, + "epoch": 0.0752, + "grad_norm": 0.3756213207226167, + "k1_kl": 0.01385498046875, + "k3_kl": 0.00787353515625, + "kimi_kl": 0.01708984375, + "learning_rate": 4.6239999999999996e-07, + "loss": 0.0006, + "ppl": 0.00421142578125, + "reward": 0.9849671721458435, + "reward_std": 0.00012126556975999847, + "rewards/perpo_ocr_edit_distance_reward": 0.9849672913551331, "step": 376, "temperature": 0.9 }, { - "advantages": -5.5628166592214257e-05, - "completion_length": 406.0, - "delta_ref_entropy_loss": 0.0338134765625, - "delta_ref_ppl": -0.042236328125, - "entropy_loss": -0.04931640625, - "epoch": 0.1508, - "grad_norm": 0.9505574430179894, - "k1_kl": 0.04217529296875, - "k3_kl": 0.029541015625, - "kimi_kl": 0.09521484375, - "learning_rate": 4.246e-07, - "loss": 0.0012, - "ppl": 0.023345947265625, - "reward": 0.9835275411605835, - "reward_std": 0.0009356806112919003, - "rewards/perpo_ocr_edit_distance_reward": 0.9835275709629059, + "advantages": 7.62939453125e-06, + "completion_length": 395.0, + "delta_ref_entropy_loss": 0.0220947265625, + "delta_ref_ppl": -0.0301513671875, + "entropy_loss": -0.0341796875, + "epoch": 0.0754, + "grad_norm": 0.8797028343816296, + "k1_kl": 0.0301513671875, + "k3_kl": 0.0225830078125, + "kimi_kl": 0.05517578125, + "learning_rate": 4.6229999999999995e-07, + "loss": 0.0009, + "ppl": 0.0162353515625, + "reward": 0.9750298857688904, + "reward_std": 0.0010154939955100417, + "rewards/perpo_ocr_edit_distance_reward": 0.9750300049781799, "step": 377, "temperature": 0.9 }, { - "advantages": -3.605655313343448e-05, - "completion_length": 710.0, - "delta_ref_entropy_loss": 0.04962158203125, - "delta_ref_ppl": -0.02288818359375, - "entropy_loss": -0.104248046875, - "epoch": 0.1512, - "grad_norm": 1.022154565274923, - "k1_kl": 0.022918701171875, - "k3_kl": 0.01085662841796875, - "kimi_kl": 0.01519775390625, - "learning_rate": 4.2439999999999996e-07, - "loss": 0.0005, - "ppl": 0.0513916015625, - "reward": 0.9759455323219299, - "reward_std": 0.001805495732696727, - "rewards/perpo_ocr_edit_distance_reward": 0.9759455621242523, + "advantages": -7.983616524143144e-05, + "completion_length": 666.0, + "delta_ref_entropy_loss": 0.031982421875, + "delta_ref_ppl": -0.0311279296875, + "entropy_loss": -0.026123046875, + "epoch": 0.0756, + "grad_norm": 0.3458632462045254, + "k1_kl": 0.0311279296875, + "k3_kl": 0.018798828125, + "kimi_kl": 0.046630859375, + "learning_rate": 4.622e-07, + "loss": 0.0008, + "ppl": 0.0108642578125, + "reward": 0.9819808006286621, + "reward_std": 0.00032653467496857047, + "rewards/perpo_ocr_edit_distance_reward": 0.9819809198379517, "step": 378, "temperature": 0.9 }, { - "advantages": -0.0002942106584669091, - "completion_length": 603.0, - "delta_ref_entropy_loss": 0.036376953125, - "delta_ref_ppl": -0.016693115234375, - "entropy_loss": -0.02154541015625, - "epoch": 0.1516, - "grad_norm": 0.245580417662043, - "k1_kl": 0.0167236328125, - "k3_kl": 0.0059967041015625, - "kimi_kl": 0.009490966796875, - "learning_rate": 4.242e-07, + "advantages": -1.4134816410660278e-06, + "completion_length": 555.0, + "delta_ref_entropy_loss": 0.020263671875, + "delta_ref_ppl": -0.017578125, + "entropy_loss": -0.0322265625, + "epoch": 0.0758, + "grad_norm": 1.0981608482807368, + "k1_kl": 0.017578125, + "k3_kl": 0.01214599609375, + "kimi_kl": 0.02685546875, + "learning_rate": 4.621e-07, "loss": 0.0005, - "ppl": 0.009368896484375, - "reward": 0.9987429678440094, - "reward_std": 0.00039669139368925244, - "rewards/perpo_ocr_edit_distance_reward": 0.9987430572509766, + "ppl": 0.0177001953125, + "reward": 0.9564575552940369, + "reward_std": 0.07350556552410126, + "rewards/perpo_ocr_edit_distance_reward": 0.9564576148986816, "step": 379, "temperature": 0.9 }, { - "advantages": -1.2895891359221423e-05, - "completion_length": 138.0, - "delta_ref_entropy_loss": 0.03009033203125, - "delta_ref_ppl": -0.0673828125, - "entropy_loss": -0.053955078125, - "epoch": 0.152, - "grad_norm": 1.6134996936712087, - "k1_kl": 0.067626953125, - "k3_kl": 0.045166015625, - "kimi_kl": 0.118896484375, - "learning_rate": 4.24e-07, - "loss": 0.0018, - "ppl": 0.023468017578125, - "reward": 0.9786071479320526, - "reward_std": 0.005073045438621193, - "rewards/perpo_ocr_edit_distance_reward": 0.9786072075366974, + "advantages": -2.520425141483429e-06, + "completion_length": 672.0, + "delta_ref_entropy_loss": 0.050048828125, + "delta_ref_ppl": -0.04736328125, + "entropy_loss": -0.08740234375, + "epoch": 0.076, + "grad_norm": 1.0265844536576436, + "k1_kl": 0.047119140625, + "k3_kl": 0.0279541015625, + "kimi_kl": 0.07275390625, + "learning_rate": 4.62e-07, + "loss": 0.0011, + "ppl": 0.046875, + "reward": 0.9217580556869507, + "reward_std": 0.04684457555413246, + "rewards/perpo_ocr_edit_distance_reward": 0.9217581748962402, "step": 380, "temperature": 0.9 }, { - "advantages": -3.5890511753677856e-06, - "completion_length": 512.0, - "delta_ref_entropy_loss": 0.02777099609375, - "delta_ref_ppl": -0.03094482421875, - "entropy_loss": -0.0291748046875, - "epoch": 0.1524, - "grad_norm": 0.5660293876283722, - "k1_kl": 0.03094482421875, - "k3_kl": 0.01715087890625, - "kimi_kl": 0.03564453125, - "learning_rate": 4.2379999999999997e-07, - "loss": 0.0007, - "ppl": 0.01300048828125, - "reward": 0.992169588804245, - "reward_std": 0.0020282777259126306, - "rewards/perpo_ocr_edit_distance_reward": 0.992169588804245, + "advantages": -5.27926886206842e-07, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.0791015625, + "delta_ref_ppl": -0.03369140625, + "entropy_loss": -0.208984375, + "epoch": 0.0762, + "grad_norm": 1.775880333312857, + "k1_kl": 0.033447265625, + "k3_kl": 0.0233154296875, + "kimi_kl": 0.026123046875, + "learning_rate": 4.619e-07, + "loss": 0.0009, + "ppl": 0.1201171875, + "reward": 0.8026613593101501, + "reward_std": 0.08284352719783783, + "rewards/perpo_ocr_edit_distance_reward": 0.8026613593101501, "step": 381, "temperature": 0.9 }, { - "advantages": 5.896602488064673e-06, - "completion_length": 392.5, - "delta_ref_entropy_loss": 0.0889892578125, - "delta_ref_ppl": -0.0782470703125, - "entropy_loss": -0.0869140625, - "epoch": 0.1528, - "grad_norm": 2.41618401710342, - "k1_kl": 0.07879638671875, - "k3_kl": 0.0435638427734375, - "kimi_kl": 0.086700439453125, - "learning_rate": 4.2359999999999995e-07, - "loss": 0.0017, - "ppl": 0.0313720703125, - "reward": 0.980213463306427, - "reward_std": 0.0026660190778784454, - "rewards/perpo_ocr_edit_distance_reward": 0.980213463306427, + "advantages": -4.2574748704282683e-07, + "completion_length": 185.0, + "delta_ref_entropy_loss": 0.006500244140625, + "delta_ref_ppl": -0.062255859375, + "entropy_loss": -0.04052734375, + "epoch": 0.0764, + "grad_norm": 2.8448156372743254, + "k1_kl": 0.06201171875, + "k3_kl": 0.05224609375, + "kimi_kl": 0.150390625, + "learning_rate": 4.6179999999999997e-07, + "loss": 0.0021, + "ppl": 0.01953125, + "reward": 0.9300492405891418, + "reward_std": 0.060899391770362854, + "rewards/perpo_ocr_edit_distance_reward": 0.9300493001937866, "step": 382, "temperature": 0.9 }, { - "advantages": -7.481021862076886e-05, - "completion_length": 565.5, - "delta_ref_entropy_loss": 0.04010009765625, - "delta_ref_ppl": -0.022674560546875, - "entropy_loss": -0.02789306640625, - "epoch": 0.1532, - "grad_norm": 0.5237593131548305, - "k1_kl": 0.02276611328125, - "k3_kl": 0.0099945068359375, - "kimi_kl": 0.017913818359375, - "learning_rate": 4.234e-07, - "loss": 0.0005, - "ppl": 0.010498046875, - "reward": 0.9983398914337158, - "reward_std": 0.00079406175063923, - "rewards/perpo_ocr_edit_distance_reward": 0.9983399212360382, + "advantages": -6.198883511387976e-06, + "completion_length": 619.0, + "delta_ref_entropy_loss": 0.06201171875, + "delta_ref_ppl": -0.0296630859375, + "entropy_loss": -0.123046875, + "epoch": 0.0766, + "grad_norm": 2.278887265662404, + "k1_kl": 0.02978515625, + "k3_kl": 0.015869140625, + "kimi_kl": 0.0244140625, + "learning_rate": 4.6169999999999996e-07, + "loss": 0.0006, + "ppl": 0.07080078125, + "reward": 0.8900998830795288, + "reward_std": 0.01500013843178749, + "rewards/perpo_ocr_edit_distance_reward": 0.8901000022888184, "step": 383, "temperature": 0.9 }, { - "advantages": -3.6473786167334765e-05, - "completion_length": 636.0, - "delta_ref_entropy_loss": 0.03363037109375, - "delta_ref_ppl": -0.01519775390625, - "entropy_loss": -0.0228271484375, - "epoch": 0.1536, - "grad_norm": 1.9440248167300258, - "k1_kl": 0.01519775390625, - "k3_kl": 0.00506591796875, - "kimi_kl": 0.0079345703125, - "learning_rate": 4.232e-07, - "loss": 0.0002, - "ppl": 0.0093994140625, - "reward": 0.9992599189281464, - "reward_std": 0.0006088503578212112, - "rewards/perpo_ocr_edit_distance_reward": 0.9992599785327911, + "advantages": 1.2772424270224292e-05, + "completion_length": 608.0, + "delta_ref_entropy_loss": 0.0162353515625, + "delta_ref_ppl": -0.01422119140625, + "entropy_loss": -0.0189208984375, + "epoch": 0.0768, + "grad_norm": 0.5248898151055786, + "k1_kl": 0.01422119140625, + "k3_kl": 0.0086669921875, + "kimi_kl": 0.01611328125, + "learning_rate": 4.616e-07, + "loss": 0.0003, + "ppl": 0.0089111328125, + "reward": 0.9874200224876404, + "reward_std": 0.0012328601442277431, + "rewards/perpo_ocr_edit_distance_reward": 0.9874199628829956, "step": 384, "temperature": 0.9 }, { - "advantages": -8.404255481764267e-06, - "completion_length": 671.0, - "delta_ref_entropy_loss": 0.0333251953125, - "delta_ref_ppl": -0.036376953125, - "entropy_loss": -0.036376953125, - "epoch": 0.154, - "grad_norm": 0.7095492911757467, - "k1_kl": 0.036376953125, - "k3_kl": 0.025848388671875, - "kimi_kl": 0.1026611328125, - "learning_rate": 4.2299999999999996e-07, - "loss": 0.001, - "ppl": 0.019195556640625, - "reward": 0.995845228433609, - "reward_std": 0.003434610436670482, - "rewards/perpo_ocr_edit_distance_reward": 0.9958453178405762, + "advantages": -7.033348538243445e-06, + "completion_length": 273.0, + "delta_ref_entropy_loss": 0.07080078125, + "delta_ref_ppl": -0.09130859375, + "entropy_loss": -0.2333984375, + "epoch": 0.077, + "grad_norm": 4.686239262173633, + "k1_kl": 0.0908203125, + "k3_kl": 0.0576171875, + "kimi_kl": 0.126953125, + "learning_rate": 4.615e-07, + "loss": 0.0023, + "ppl": 0.142578125, + "reward": 0.8319596648216248, + "reward_std": 0.010785029269754887, + "rewards/perpo_ocr_edit_distance_reward": 0.8319597840309143, "step": 385, "temperature": 0.9 }, { - "advantages": -1.2785196986442315e-05, - "completion_length": 849.5, - "delta_ref_entropy_loss": 0.07489013671875, - "delta_ref_ppl": -0.045654296875, - "entropy_loss": -0.1644287109375, - "epoch": 0.1544, - "grad_norm": 1.7236794395315636, - "k1_kl": 0.04583740234375, - "k3_kl": 0.022796630859375, - "kimi_kl": 0.04241943359375, - "learning_rate": 4.228e-07, - "loss": 0.0009, - "ppl": 0.09228515625, - "reward": 0.9097210466861725, - "reward_std": 0.009973035892471671, - "rewards/perpo_ocr_edit_distance_reward": 0.9097211062908173, + "advantages": -9.92843160929624e-06, + "completion_length": 255.0, + "delta_ref_entropy_loss": 0.03369140625, + "delta_ref_ppl": -0.0233154296875, + "entropy_loss": -0.048583984375, + "epoch": 0.0772, + "grad_norm": 1.6923619161493697, + "k1_kl": 0.0233154296875, + "k3_kl": 0.01214599609375, + "kimi_kl": 0.0233154296875, + "learning_rate": 4.6139999999999994e-07, + "loss": 0.0005, + "ppl": 0.0244140625, + "reward": 0.9469040632247925, + "reward_std": 0.004189536906778812, + "rewards/perpo_ocr_edit_distance_reward": 0.9469040632247925, "step": 386, "temperature": 0.9 }, { - "advantages": -1.4134816410660278e-06, - "completion_length": 718.5, - "delta_ref_entropy_loss": 0.03759765625, - "delta_ref_ppl": -0.05126953125, - "entropy_loss": -0.0738525390625, - "epoch": 0.1548, - "grad_norm": 2.287455888794989, - "k1_kl": 0.0511474609375, - "k3_kl": 0.0357666015625, - "kimi_kl": 0.0966796875, - "learning_rate": 4.2259999999999993e-07, - "loss": 0.0014, - "ppl": 0.0391845703125, - "reward": 0.9060720503330231, - "reward_std": 0.03362188208848238, - "rewards/perpo_ocr_edit_distance_reward": 0.9060721099376678, + "advantages": -4.785401870321948e-06, + "completion_length": 740.0, + "delta_ref_entropy_loss": 0.010009765625, + "delta_ref_ppl": -0.012939453125, + "entropy_loss": -0.034912109375, + "epoch": 0.0774, + "grad_norm": 1.3271691896430506, + "k1_kl": 0.01300048828125, + "k3_kl": 0.00836181640625, + "kimi_kl": 0.014892578125, + "learning_rate": 4.613e-07, + "loss": 0.0003, + "ppl": 0.020263671875, + "reward": 0.9584107398986816, + "reward_std": 0.005240830592811108, + "rewards/perpo_ocr_edit_distance_reward": 0.9584107398986816, "step": 387, "temperature": 0.9 }, { - "advantages": -1.1239733339607483e-06, - "completion_length": 445.5, - "delta_ref_entropy_loss": 0.05059814453125, - "delta_ref_ppl": -0.033447265625, - "entropy_loss": -0.065185546875, - "epoch": 0.1552, - "grad_norm": 0.9013210361538868, - "k1_kl": 0.0333251953125, - "k3_kl": 0.015167236328125, - "kimi_kl": 0.027923583984375, - "learning_rate": 4.2239999999999997e-07, - "loss": 0.0006, - "ppl": 0.033435821533203125, - "reward": 0.9393859207630157, - "reward_std": 0.009390298277139664, - "rewards/perpo_ocr_edit_distance_reward": 0.9393859803676605, + "advantages": -3.168412877130322e-05, + "completion_length": 159.0, + "delta_ref_entropy_loss": 0.036865234375, + "delta_ref_ppl": -0.059814453125, + "entropy_loss": -0.12158203125, + "epoch": 0.0776, + "grad_norm": 3.4166410464629857, + "k1_kl": 0.06005859375, + "k3_kl": 0.046875, + "kimi_kl": 0.09765625, + "learning_rate": 4.612e-07, + "loss": 0.0019, + "ppl": 0.0712890625, + "reward": 0.9714285135269165, + "reward_std": 0.0015118669252842665, + "rewards/perpo_ocr_edit_distance_reward": 0.971428632736206, "step": 388, "temperature": 0.9 }, { - "advantages": -1.924378580042685e-06, - "completion_length": 549.5, - "delta_ref_entropy_loss": 0.02618408203125, - "delta_ref_ppl": -0.02777099609375, - "entropy_loss": -0.09393310546875, - "epoch": 0.1556, - "grad_norm": 7.105591463963455, - "k1_kl": 0.02777099609375, - "k3_kl": 0.01763916015625, - "kimi_kl": 0.040771484375, - "learning_rate": 4.222e-07, - "loss": 0.0007, - "ppl": 0.0455322265625, - "reward": 0.8311065137386322, - "reward_std": 0.06759785022586584, - "rewards/perpo_ocr_edit_distance_reward": 0.8311066031455994, + "advantages": -5.3771906095789745e-05, + "completion_length": 700.0, + "delta_ref_entropy_loss": 0.023681640625, + "delta_ref_ppl": -0.025390625, + "entropy_loss": -0.0224609375, + "epoch": 0.0778, + "grad_norm": 0.6294024378381755, + "k1_kl": 0.025390625, + "k3_kl": 0.0145263671875, + "kimi_kl": 0.037841796875, + "learning_rate": 4.6109999999999997e-07, + "loss": 0.0006, + "ppl": 0.00885009765625, + "reward": 0.9854747653007507, + "reward_std": 0.0005334603483788669, + "rewards/perpo_ocr_edit_distance_reward": 0.9854748249053955, "step": 389, "temperature": 0.9 }, { - "advantages": -1.9512007270350296e-05, - "completion_length": 407.0, - "delta_ref_entropy_loss": 0.02923583984375, - "delta_ref_ppl": -0.01336669921875, - "entropy_loss": -0.021484375, - "epoch": 0.156, - "grad_norm": 0.7073804070066548, - "k1_kl": 0.0133056640625, - "k3_kl": 0.0046539306640625, - "kimi_kl": 0.0064544677734375, - "learning_rate": 4.2199999999999994e-07, - "loss": 0.0002, - "ppl": 0.009796142578125, - "reward": 0.9807721972465515, - "reward_std": 0.005223827960435301, - "rewards/perpo_ocr_edit_distance_reward": 0.9807722568511963, + "advantages": -4.485675526666455e-05, + "completion_length": 289.0, + "delta_ref_entropy_loss": 0.022705078125, + "delta_ref_ppl": -0.031982421875, + "entropy_loss": -0.0174560546875, + "epoch": 0.078, + "grad_norm": 1.6246639943567185, + "k1_kl": 0.031982421875, + "k3_kl": 0.0211181640625, + "kimi_kl": 0.061279296875, + "learning_rate": 4.61e-07, + "loss": 0.0009, + "ppl": 0.007659912109375, + "reward": 0.9548873901367188, + "reward_std": 0.000849742500577122, + "rewards/perpo_ocr_edit_distance_reward": 0.9548874497413635, "step": 390, "temperature": 0.9 }, { - "advantages": -2.6764615085994592e-05, - "completion_length": 394.0, - "delta_ref_entropy_loss": 0.0360107421875, - "delta_ref_ppl": -0.0213623046875, - "entropy_loss": -0.018310546875, - "epoch": 0.1564, - "grad_norm": 1.0785254826656856, - "k1_kl": 0.02130126953125, - "k3_kl": 0.009063720703125, - "kimi_kl": 0.014129638671875, - "learning_rate": 4.218e-07, - "loss": 0.0004, - "ppl": 0.0076141357421875, - "reward": 0.9920126497745514, - "reward_std": 0.002073103649308905, - "rewards/perpo_ocr_edit_distance_reward": 0.9920126497745514, + "advantages": -3.167561317241052e-06, + "completion_length": 578.0, + "delta_ref_entropy_loss": 0.125, + "delta_ref_ppl": -0.058837890625, + "entropy_loss": -0.28515625, + "epoch": 0.0782, + "grad_norm": 2.8107801376721877, + "k1_kl": 0.05859375, + "k3_kl": 0.0283203125, + "kimi_kl": 0.06396484375, + "learning_rate": 4.6089999999999995e-07, + "loss": 0.0011, + "ppl": 0.1484375, + "reward": 0.8038432002067566, + "reward_std": 0.02381136827170849, + "rewards/perpo_ocr_edit_distance_reward": 0.8038432598114014, "step": 391, "temperature": 0.9 }, { - "advantages": -0.00010152374670724384, - "completion_length": 561.5, - "delta_ref_entropy_loss": 0.028564453125, - "delta_ref_ppl": -0.0137939453125, - "entropy_loss": -0.02459716796875, - "epoch": 0.1568, - "grad_norm": 0.512071512790775, - "k1_kl": 0.013824462890625, - "k3_kl": 0.0055694580078125, - "kimi_kl": 0.00836181640625, - "learning_rate": 4.2159999999999996e-07, + "advantages": -2.8205769922351465e-05, + "completion_length": 589.0, + "delta_ref_entropy_loss": 0.020751953125, + "delta_ref_ppl": -0.01214599609375, + "entropy_loss": -0.021484375, + "epoch": 0.0784, + "grad_norm": 0.9587923658505031, + "k1_kl": 0.01214599609375, + "k3_kl": 0.007415771484375, + "kimi_kl": 0.0108642578125, + "learning_rate": 4.6079999999999994e-07, "loss": 0.0003, - "ppl": 0.0101776123046875, - "reward": 0.9981580078601837, - "reward_std": 0.0005941468552919105, - "rewards/perpo_ocr_edit_distance_reward": 0.9981580674648285, + "ppl": 0.01141357421875, + "reward": 0.9475516080856323, + "reward_std": 0.0017112414352595806, + "rewards/perpo_ocr_edit_distance_reward": 0.9475517272949219, "step": 392, "temperature": 0.9 }, { - "advantages": -1.762594592946698e-06, - "completion_length": 544.5, - "delta_ref_entropy_loss": 0.05419921875, - "delta_ref_ppl": -0.024169921875, - "entropy_loss": -0.082794189453125, - "epoch": 0.1572, - "grad_norm": 0.797192509993641, - "k1_kl": 0.0242919921875, - "k3_kl": 0.0087432861328125, - "kimi_kl": 0.01373291015625, - "learning_rate": 4.214e-07, - "loss": 0.0004, - "ppl": 0.0427398681640625, - "reward": 0.9359504580497742, - "reward_std": 0.004757922142744064, - "rewards/perpo_ocr_edit_distance_reward": 0.9359504282474518, + "advantages": 8.514949634275126e-09, + "completion_length": 661.0, + "delta_ref_entropy_loss": 0.03271484375, + "delta_ref_ppl": -0.031982421875, + "entropy_loss": -0.0341796875, + "epoch": 0.0786, + "grad_norm": 3.438653929955905, + "k1_kl": 0.031982421875, + "k3_kl": 0.0225830078125, + "kimi_kl": 0.08935546875, + "learning_rate": 4.607e-07, + "loss": 0.0009, + "ppl": 0.0196533203125, + "reward": 0.9817342758178711, + "reward_std": 0.0009695396292954683, + "rewards/perpo_ocr_edit_distance_reward": 0.9817343354225159, "step": 393, "temperature": 0.9 }, { - "advantages": -1.0473388556420105e-06, - "completion_length": 291.5, - "delta_ref_entropy_loss": 0.05584716796875, - "delta_ref_ppl": -0.06842041015625, - "entropy_loss": -0.05218505859375, - "epoch": 0.1576, - "grad_norm": 5.196712928621414, - "k1_kl": 0.068115234375, - "k3_kl": 0.03741455078125, - "kimi_kl": 0.0684814453125, - "learning_rate": 4.212e-07, - "loss": 0.0015, - "ppl": 0.01792144775390625, - "reward": 0.9846635460853577, - "reward_std": 0.01610644906759262, - "rewards/perpo_ocr_edit_distance_reward": 0.9846636056900024, + "advantages": -1.4475413934178505e-07, + "completion_length": 303.0, + "delta_ref_entropy_loss": 0.050537109375, + "delta_ref_ppl": -0.03662109375, + "entropy_loss": -0.25, + "epoch": 0.0788, + "grad_norm": 2.8000949803357966, + "k1_kl": 0.037109375, + "k3_kl": 0.0206298828125, + "kimi_kl": 0.039306640625, + "learning_rate": 4.606e-07, + "loss": 0.0008, + "ppl": 0.17578125, + "reward": 0.6146252751350403, + "reward_std": 0.20856298506259918, + "rewards/perpo_ocr_edit_distance_reward": 0.6146252751350403, "step": 394, "temperature": 0.9 }, { - "advantages": -1.2602125707417144e-06, - "completion_length": 641.5, - "delta_ref_entropy_loss": 0.04248046875, - "delta_ref_ppl": -0.03668212890625, - "entropy_loss": -0.082763671875, - "epoch": 0.158, - "grad_norm": 1.2631192261991728, - "k1_kl": 0.03656005859375, - "k3_kl": 0.0216064453125, - "kimi_kl": 0.04644775390625, - "learning_rate": 4.2099999999999997e-07, - "loss": 0.0009, - "ppl": 0.0400390625, - "reward": 0.9372349977493286, - "reward_std": 0.037473696283996105, - "rewards/perpo_ocr_edit_distance_reward": 0.937235027551651, + "advantages": -1.2091228427379974e-06, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.0208740234375, + "delta_ref_ppl": -0.01458740234375, + "entropy_loss": -0.0830078125, + "epoch": 0.079, + "grad_norm": 63.27472320274864, + "k1_kl": 0.0145263671875, + "k3_kl": 0.01226806640625, + "kimi_kl": 0.016357421875, + "learning_rate": 4.605e-07, + "loss": 0.0005, + "ppl": 0.056884765625, + "reward": 0.7157791256904602, + "reward_std": 0.04209188371896744, + "rewards/perpo_ocr_edit_distance_reward": 0.715779185295105, "step": 395, "temperature": 0.9 }, { - "advantages": -0.0002983638218552187, - "completion_length": 597.0, - "delta_ref_entropy_loss": 0.02349853515625, - "delta_ref_ppl": -0.019683837890625, - "entropy_loss": -0.0205078125, - "epoch": 0.1584, - "grad_norm": 0.6779341454012556, - "k1_kl": 0.019683837890625, - "k3_kl": 0.01194000244140625, - "kimi_kl": 0.02425384521484375, - "learning_rate": 4.208e-07, - "loss": 0.0008, - "ppl": 0.0093231201171875, - "reward": 0.9926855564117432, - "reward_std": 0.006576570216566324, - "rewards/perpo_ocr_edit_distance_reward": 0.9926856160163879, + "advantages": 4.495893335842993e-06, + "completion_length": 542.0, + "delta_ref_entropy_loss": 0.049560546875, + "delta_ref_ppl": -0.0400390625, + "entropy_loss": -0.07666015625, + "epoch": 0.0792, + "grad_norm": 1.3948383620937606, + "k1_kl": 0.0400390625, + "k3_kl": 0.02587890625, + "kimi_kl": 0.050537109375, + "learning_rate": 4.6039999999999997e-07, + "loss": 0.001, + "ppl": 0.044921875, + "reward": 0.9688094854354858, + "reward_std": 0.005569769069552422, + "rewards/perpo_ocr_edit_distance_reward": 0.9688094258308411, "step": 396, "temperature": 0.9 }, { - "advantages": -5.1611239314297563e-05, - "completion_length": 650.5, - "delta_ref_entropy_loss": 0.05755615234375, - "delta_ref_ppl": -0.03143310546875, - "entropy_loss": -0.1116943359375, - "epoch": 0.1588, - "grad_norm": 0.9269153113519093, - "k1_kl": 0.03143310546875, - "k3_kl": 0.016632080078125, - "kimi_kl": 0.05706787109375, - "learning_rate": 4.2059999999999994e-07, - "loss": 0.0007, - "ppl": 0.060089111328125, - "reward": 0.9570842683315277, - "reward_std": 0.010508210616535507, - "rewards/perpo_ocr_edit_distance_reward": 0.9570842981338501, + "advantages": -6.355558434734121e-05, + "completion_length": 744.0, + "delta_ref_entropy_loss": 0.01251220703125, + "delta_ref_ppl": -0.01422119140625, + "entropy_loss": -0.01416015625, + "epoch": 0.0794, + "grad_norm": 0.5437637390982826, + "k1_kl": 0.01422119140625, + "k3_kl": 0.00994873046875, + "kimi_kl": 0.025634765625, + "learning_rate": 4.6029999999999996e-07, + "loss": 0.0005, + "ppl": 0.00616455078125, + "reward": 0.9851335883140564, + "reward_std": 0.0004358270380180329, + "rewards/perpo_ocr_edit_distance_reward": 0.9851336479187012, "step": 397, "temperature": 0.9 }, { - "advantages": 1.6178404393940582e-07, - "completion_length": 505.5, - "delta_ref_entropy_loss": 0.11572265625, - "delta_ref_ppl": -0.0770263671875, - "entropy_loss": -0.21533203125, - "epoch": 0.1592, - "grad_norm": 14.666158943461992, - "k1_kl": 0.0770263671875, - "k3_kl": 0.05206298828125, - "kimi_kl": 0.105712890625, - "learning_rate": 4.204e-07, - "loss": 0.0021, - "ppl": 0.135009765625, - "reward": 0.8379810452461243, - "reward_std": 0.25370998680591583, - "rewards/perpo_ocr_edit_distance_reward": 0.8379810452461243, + "advantages": -2.895082786835701e-07, + "completion_length": 194.0, + "delta_ref_entropy_loss": 0.04345703125, + "delta_ref_ppl": -0.0576171875, + "entropy_loss": -0.125, + "epoch": 0.0796, + "grad_norm": 0.9550042853273342, + "k1_kl": 0.057373046875, + "k3_kl": 0.036376953125, + "kimi_kl": 0.1044921875, + "learning_rate": 4.6019999999999995e-07, + "loss": 0.0015, + "ppl": 0.07421875, + "reward": 0.7380915880203247, + "reward_std": 0.26384082436561584, + "rewards/perpo_ocr_edit_distance_reward": 0.7380916476249695, "step": 398, "temperature": 0.9 }, { - "advantages": -3.0006682209204882e-05, - "completion_length": 547.0, - "delta_ref_entropy_loss": 0.035308837890625, - "delta_ref_ppl": -0.0216064453125, - "entropy_loss": -0.043212890625, - "epoch": 0.1596, - "grad_norm": 0.50991445573931, - "k1_kl": 0.021728515625, - "k3_kl": 0.00885009765625, - "kimi_kl": 0.0128173828125, - "learning_rate": 4.202e-07, - "loss": 0.0004, - "ppl": 0.02178955078125, - "reward": 0.9834192395210266, - "reward_std": 0.002290479256771505, - "rewards/perpo_ocr_edit_distance_reward": 0.9834192991256714, + "advantages": -5.376339322538115e-05, + "completion_length": 511.0, + "delta_ref_entropy_loss": 0.0189208984375, + "delta_ref_ppl": -0.0186767578125, + "entropy_loss": -0.0185546875, + "epoch": 0.0798, + "grad_norm": 0.4233558156119653, + "k1_kl": 0.018798828125, + "k3_kl": 0.0128173828125, + "kimi_kl": 0.031494140625, + "learning_rate": 4.601e-07, + "loss": 0.0006, + "ppl": 0.0096435546875, + "reward": 0.9851284027099609, + "reward_std": 0.00037507241358980536, + "rewards/perpo_ocr_edit_distance_reward": 0.9851284623146057, "step": 399, "temperature": 0.9 }, { - "advantages": -7.275172902154736e-05, - "completion_length": 306.0, - "delta_ref_entropy_loss": 0.0504150390625, - "delta_ref_ppl": -0.033447265625, - "entropy_loss": -0.03765869140625, - "epoch": 0.16, - "grad_norm": 1.0506503409276367, - "k1_kl": 0.0335693359375, - "k3_kl": 0.017578125, - "kimi_kl": 0.03253173828125, - "learning_rate": 4.1999999999999995e-07, + "advantages": -4.0820668800733984e-05, + "completion_length": 1088.0, + "delta_ref_entropy_loss": 0.031982421875, + "delta_ref_ppl": -0.027587890625, + "entropy_loss": -0.0654296875, + "epoch": 0.08, + "grad_norm": 0.8768228466881665, + "k1_kl": 0.0274658203125, + "k3_kl": 0.0184326171875, + "kimi_kl": 0.04296875, + "learning_rate": 4.6e-07, "loss": 0.0008, - "ppl": 0.016845703125, - "reward": 0.9833307564258575, - "reward_std": 0.001380076602799818, - "rewards/perpo_ocr_edit_distance_reward": 0.9833308756351471, + "ppl": 0.032470703125, + "reward": 0.9786263108253479, + "reward_std": 0.0015674388268962502, + "rewards/perpo_ocr_edit_distance_reward": 0.9786264300346375, "step": 400, "temperature": 0.9 }, { - "advantages": -4.947611627414972e-05, - "completion_length": 555.5, - "delta_ref_entropy_loss": 0.0272216796875, - "delta_ref_ppl": -0.016693115234375, - "entropy_loss": -0.021240234375, - "epoch": 0.1604, - "grad_norm": 0.8247906256022787, - "k1_kl": 0.016632080078125, - "k3_kl": 0.007598876953125, - "kimi_kl": 0.011505126953125, - "learning_rate": 4.198e-07, - "loss": 0.0004, - "ppl": 0.010162353515625, - "reward": 0.9587680101394653, - "reward_std": 0.03392167278798297, - "rewards/perpo_ocr_edit_distance_reward": 0.9587680697441101, + "advantages": -0.0005960464477539062, + "completion_length": 425.0, + "delta_ref_entropy_loss": 0.02880859375, + "delta_ref_ppl": -0.0286865234375, + "entropy_loss": -0.019287109375, + "epoch": 0.0802, + "grad_norm": 0.014730854924329458, + "k1_kl": 0.0289306640625, + "k3_kl": 0.0146484375, + "kimi_kl": 0.030029296875, + "learning_rate": 4.5989999999999993e-07, + "loss": 0.0012, + "ppl": 0.0038299560546875, + "reward": 0.9822874665260315, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9822875261306763, "step": 401, "temperature": 0.9 }, { - "advantages": -8.429797162534669e-07, - "completion_length": 622.0, - "delta_ref_entropy_loss": 0.0284423828125, - "delta_ref_ppl": -0.0206298828125, - "entropy_loss": -0.0203857421875, - "epoch": 0.1608, - "grad_norm": 2.682543888331396, - "k1_kl": 0.02056884765625, - "k3_kl": 0.01190185546875, - "kimi_kl": 0.03033447265625, - "learning_rate": 4.1959999999999997e-07, - "loss": 0.0005, - "ppl": 0.0077362060546875, - "reward": 0.9956031143665314, - "reward_std": 0.0016496583702974021, - "rewards/perpo_ocr_edit_distance_reward": 0.9956032037734985, + "advantages": -5.8565823565004393e-05, + "completion_length": 656.0, + "delta_ref_entropy_loss": 0.0235595703125, + "delta_ref_ppl": -0.031494140625, + "entropy_loss": -0.038818359375, + "epoch": 0.0804, + "grad_norm": 1.49432556544746, + "k1_kl": 0.031494140625, + "k3_kl": 0.02294921875, + "kimi_kl": 0.0498046875, + "learning_rate": 4.598e-07, + "loss": 0.001, + "ppl": 0.0201416015625, + "reward": 0.978786826133728, + "reward_std": 0.000917784112971276, + "rewards/perpo_ocr_edit_distance_reward": 0.9787869453430176, "step": 402, "temperature": 0.9 }, { - "advantages": -4.1284732494872856e-05, - "completion_length": 1210.5, - "delta_ref_entropy_loss": 0.0234375, - "delta_ref_ppl": -0.024505615234375, - "entropy_loss": -0.169281005859375, - "epoch": 0.1612, - "grad_norm": 119.40953825240193, - "k1_kl": 0.024383544921875, - "k3_kl": 0.06103515625, - "kimi_kl": 0.056884765625, - "learning_rate": 4.1939999999999996e-07, - "loss": 0.0025, - "ppl": 0.1171875, - "reward": 0.7427677661180496, - "reward_std": 0.02956487135088537, - "rewards/perpo_ocr_edit_distance_reward": 0.7427678406238556, + "advantages": 5.10896995820076e-08, + "completion_length": 894.0, + "delta_ref_entropy_loss": 0.06298828125, + "delta_ref_ppl": -0.037109375, + "entropy_loss": -0.1064453125, + "epoch": 0.0806, + "grad_norm": 0.7727909291671892, + "k1_kl": 0.037353515625, + "k3_kl": 0.0230712890625, + "kimi_kl": 0.041259765625, + "learning_rate": 4.5969999999999997e-07, + "loss": 0.0009, + "ppl": 0.057373046875, + "reward": 0.785999059677124, + "reward_std": 0.20019511878490448, + "rewards/perpo_ocr_edit_distance_reward": 0.785999059677124, "step": 403, "temperature": 0.9 }, { - "advantages": -3.090075239242651e-05, - "completion_length": 158.5, - "delta_ref_entropy_loss": 0.03857421875, - "delta_ref_ppl": -0.03179931640625, - "entropy_loss": -0.03460693359375, - "epoch": 0.1616, - "grad_norm": 2.957216352796761, - "k1_kl": 0.031829833984375, - "k3_kl": 0.018707275390625, - "kimi_kl": 0.044342041015625, - "learning_rate": 4.192e-07, - "loss": 0.0008, - "ppl": 0.01763916015625, - "reward": 0.9612475633621216, - "reward_std": 0.014439716993365437, - "rewards/perpo_ocr_edit_distance_reward": 0.9612476229667664, + "advantages": -8.091756899375468e-05, + "completion_length": 1435.0, + "delta_ref_entropy_loss": 0.0230712890625, + "delta_ref_ppl": -0.01177978515625, + "entropy_loss": -0.04931640625, + "epoch": 0.0808, + "grad_norm": 2.540763069024993, + "k1_kl": 0.0118408203125, + "k3_kl": 0.006317138671875, + "kimi_kl": 0.0107421875, + "learning_rate": 4.596e-07, + "loss": 0.0003, + "ppl": 0.0228271484375, + "reward": 0.9744581580162048, + "reward_std": 0.0010577262146398425, + "rewards/perpo_ocr_edit_distance_reward": 0.9744582176208496, "step": 404, "temperature": 0.9 }, { - "advantages": -6.009425578668015e-05, - "completion_length": 664.0, - "delta_ref_entropy_loss": 0.02667236328125, - "delta_ref_ppl": -0.024749755859375, - "entropy_loss": -0.04315185546875, - "epoch": 0.162, - "grad_norm": 1.8258200063431496, - "k1_kl": 0.0247802734375, - "k3_kl": 0.0169830322265625, - "kimi_kl": 0.04766845703125, - "learning_rate": 4.19e-07, - "loss": 0.0007, - "ppl": 0.020050048828125, - "reward": 0.9917895197868347, - "reward_std": 0.0017109858454205096, - "rewards/perpo_ocr_edit_distance_reward": 0.9917896389961243, + "advantages": 1.3726098586630542e-05, + "completion_length": 758.0, + "delta_ref_entropy_loss": 0.06396484375, + "delta_ref_ppl": -0.035888671875, + "entropy_loss": -0.09130859375, + "epoch": 0.081, + "grad_norm": 5.99277091854032, + "k1_kl": 0.03564453125, + "k3_kl": 0.015869140625, + "kimi_kl": 0.029541015625, + "learning_rate": 4.595e-07, + "loss": 0.0006, + "ppl": 0.0478515625, + "reward": 0.791968822479248, + "reward_std": 0.0023834356106817722, + "rewards/perpo_ocr_edit_distance_reward": 0.7919687628746033, "step": 405, "temperature": 0.9 }, { - "advantages": 1.806020787853413e-05, - "completion_length": 416.5, - "delta_ref_entropy_loss": 0.0728759765625, - "delta_ref_ppl": -0.0408935546875, - "entropy_loss": -0.1585693359375, - "epoch": 0.1624, - "grad_norm": 1.067630796437961, - "k1_kl": 0.040771484375, - "k3_kl": 0.01861572265625, - "kimi_kl": 0.03277587890625, - "learning_rate": 4.1879999999999996e-07, - "loss": 0.0007, - "ppl": 0.0802001953125, - "reward": 0.8469056785106659, - "reward_std": 0.021956250129733235, - "rewards/perpo_ocr_edit_distance_reward": 0.8469057381153107, + "advantages": -3.4059798537100505e-08, + "completion_length": 1130.0, + "delta_ref_entropy_loss": 0.030029296875, + "delta_ref_ppl": -0.03564453125, + "entropy_loss": -0.08203125, + "epoch": 0.0812, + "grad_norm": 1.9010781750348473, + "k1_kl": 0.035400390625, + "k3_kl": 0.0206298828125, + "kimi_kl": 0.04443359375, + "learning_rate": 4.5939999999999994e-07, + "loss": 0.0008, + "ppl": 0.04443359375, + "reward": 0.7670911550521851, + "reward_std": 0.2863900065422058, + "rewards/perpo_ocr_edit_distance_reward": 0.7670911550521851, "step": 406, "temperature": 0.9 }, { - "advantages": -6.224428034329321e-06, - "completion_length": 753.5, - "delta_ref_entropy_loss": 0.028228759765625, - "delta_ref_ppl": -0.0226898193359375, - "entropy_loss": -0.02197265625, - "epoch": 0.1628, - "grad_norm": 3.1380373876877883, - "k1_kl": 0.022552490234375, - "k3_kl": 0.01065826416015625, - "kimi_kl": 0.01834869384765625, - "learning_rate": 4.186e-07, - "loss": 0.0004, - "ppl": 0.0086822509765625, - "reward": 0.9978683590888977, - "reward_std": 0.00063407450215891, - "rewards/perpo_ocr_edit_distance_reward": 0.9978683590888977, + "advantages": -9.383474207425024e-06, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.007171630859375, + "delta_ref_ppl": -0.005157470703125, + "entropy_loss": -0.028076171875, + "epoch": 0.0814, + "grad_norm": 4.4971615165817225, + "k1_kl": 0.005126953125, + "k3_kl": 0.0032501220703125, + "kimi_kl": 0.0087890625, + "learning_rate": 4.593e-07, + "loss": 0.0001, + "ppl": 0.01214599609375, + "reward": 0.6309975385665894, + "reward_std": 0.0026239994913339615, + "rewards/perpo_ocr_edit_distance_reward": 0.6309975385665894, "step": 407, "temperature": 0.9 }, { - "advantages": -1.3027872682869202e-06, - "completion_length": 328.0, - "delta_ref_entropy_loss": 0.0601806640625, - "delta_ref_ppl": -0.03631591796875, - "entropy_loss": -0.066162109375, - "epoch": 0.1632, - "grad_norm": 1.1813671356367876, - "k1_kl": 0.03643798828125, - "k3_kl": 0.015625, - "kimi_kl": 0.0263671875, - "learning_rate": 4.184e-07, - "loss": 0.0006, - "ppl": 0.02996826171875, - "reward": 0.9090254008769989, - "reward_std": 0.011675736168399453, - "rewards/perpo_ocr_edit_distance_reward": 0.9090254604816437, + "advantages": -1.5752656281620148e-06, + "completion_length": 325.0, + "delta_ref_entropy_loss": 0.043212890625, + "delta_ref_ppl": -0.036376953125, + "entropy_loss": -0.039794921875, + "epoch": 0.0816, + "grad_norm": 0.8845951421001647, + "k1_kl": 0.036376953125, + "k3_kl": 0.021240234375, + "kimi_kl": 0.03369140625, + "learning_rate": 4.592e-07, + "loss": 0.0009, + "ppl": 0.0218505859375, + "reward": 0.9522662162780762, + "reward_std": 0.03794095665216446, + "rewards/perpo_ocr_edit_distance_reward": 0.952266275882721, "step": 408, "temperature": 0.9 }, { - "advantages": -6.965228816824265e-06, - "completion_length": 747.5, - "delta_ref_entropy_loss": 0.047607421875, - "delta_ref_ppl": -0.041259765625, - "entropy_loss": -0.065185546875, - "epoch": 0.1636, - "grad_norm": 1.1672276103230161, - "k1_kl": 0.0413818359375, - "k3_kl": 0.022613525390625, - "kimi_kl": 0.05120849609375, - "learning_rate": 4.1819999999999997e-07, - "loss": 0.0009, - "ppl": 0.03411865234375, - "reward": 0.8011177480220795, - "reward_std": 0.04786531603895128, - "rewards/perpo_ocr_edit_distance_reward": 0.8011178076267242, + "advantages": -3.864935570163652e-05, + "completion_length": 467.0, + "delta_ref_entropy_loss": 0.038818359375, + "delta_ref_ppl": -0.029296875, + "entropy_loss": -0.0419921875, + "epoch": 0.0818, + "grad_norm": 0.8466967400432771, + "k1_kl": 0.029296875, + "k3_kl": 0.015869140625, + "kimi_kl": 0.035400390625, + "learning_rate": 4.591e-07, + "loss": 0.0007, + "ppl": 0.0260009765625, + "reward": 0.9651530981063843, + "reward_std": 0.00078131805639714, + "rewards/perpo_ocr_edit_distance_reward": 0.9651532173156738, "step": 409, "temperature": 0.9 }, { - "advantages": -1.3266291716718115e-05, - "completion_length": 266.5, - "delta_ref_entropy_loss": 0.042724609375, - "delta_ref_ppl": -0.079833984375, - "entropy_loss": -0.048583984375, - "epoch": 0.164, - "grad_norm": 1.3740032820321317, - "k1_kl": 0.079833984375, - "k3_kl": 0.056060791015625, - "kimi_kl": 0.1903076171875, - "learning_rate": 4.1799999999999996e-07, - "loss": 0.0023, - "ppl": 0.0238037109375, - "reward": 0.9755150675773621, - "reward_std": 0.0030839144019410014, - "rewards/perpo_ocr_edit_distance_reward": 0.9755151867866516, + "advantages": 3.831727553915698e-06, + "completion_length": 67.0, + "delta_ref_entropy_loss": 0.0791015625, + "delta_ref_ppl": -0.1376953125, + "entropy_loss": -0.150390625, + "epoch": 0.082, + "grad_norm": 3.587018687452, + "k1_kl": 0.1376953125, + "k3_kl": 0.07275390625, + "kimi_kl": 0.12255859375, + "learning_rate": 4.59e-07, + "loss": 0.0029, + "ppl": 0.06494140625, + "reward": 0.5634644031524658, + "reward_std": 0.004367819521576166, + "rewards/perpo_ocr_edit_distance_reward": 0.5634644031524658, "step": 410, "temperature": 0.9 }, { - "advantages": -0.0003920176241081208, - "completion_length": 409.0, - "delta_ref_entropy_loss": 0.024658203125, - "delta_ref_ppl": -0.0127105712890625, - "entropy_loss": -0.01202392578125, - "epoch": 0.1644, - "grad_norm": 0.21342432810889014, - "k1_kl": 0.0127105712890625, - "k3_kl": 0.00476837158203125, - "kimi_kl": 0.00665283203125, - "learning_rate": 4.178e-07, - "loss": 0.0006, - "ppl": 0.00470733642578125, - "reward": 0.9986997246742249, - "reward_std": 0.0001311697269557044, - "rewards/perpo_ocr_edit_distance_reward": 0.9986997544765472, + "advantages": -1.4798982192587573e-05, + "completion_length": 433.0, + "delta_ref_entropy_loss": 0.040771484375, + "delta_ref_ppl": -0.0179443359375, + "entropy_loss": -0.019775390625, + "epoch": 0.0822, + "grad_norm": 0.6400253124232754, + "k1_kl": 0.0179443359375, + "k3_kl": 0.007720947265625, + "kimi_kl": 0.01324462890625, + "learning_rate": 4.5889999999999996e-07, + "loss": 0.0003, + "ppl": 0.0089111328125, + "reward": 0.9815413355827332, + "reward_std": 0.00105045095551759, + "rewards/perpo_ocr_edit_distance_reward": 0.9815413951873779, "step": 411, "temperature": 0.9 }, { - "advantages": -4.811372127733193e-05, - "completion_length": 349.0, - "delta_ref_entropy_loss": 0.03375244140625, - "delta_ref_ppl": -0.013427734375, - "entropy_loss": -0.01666259765625, - "epoch": 0.1648, - "grad_norm": 0.588913120574555, - "k1_kl": 0.013458251953125, - "k3_kl": 0.00469207763671875, - "kimi_kl": 0.0066986083984375, - "learning_rate": 4.1760000000000003e-07, - "loss": 0.0002, - "ppl": 0.00527191162109375, - "reward": 0.9713294208049774, - "reward_std": 0.00021548732183873653, - "rewards/perpo_ocr_edit_distance_reward": 0.9713294506072998, + "advantages": 3.448554707574658e-06, + "completion_length": 1138.0, + "delta_ref_entropy_loss": 0.083984375, + "delta_ref_ppl": -0.041259765625, + "entropy_loss": -0.1953125, + "epoch": 0.0824, + "grad_norm": 3.440410676259875, + "k1_kl": 0.04150390625, + "k3_kl": 0.0255126953125, + "kimi_kl": 0.052490234375, + "learning_rate": 4.5879999999999995e-07, + "loss": 0.001, + "ppl": 0.1123046875, + "reward": 0.8406593203544617, + "reward_std": 0.0023842232767492533, + "rewards/perpo_ocr_edit_distance_reward": 0.8406593203544617, "step": 412, "temperature": 0.9 }, { - "advantages": 4.257474728319721e-08, - "completion_length": 982.0, - "delta_ref_entropy_loss": 0.044158935546875, - "delta_ref_ppl": -0.0324249267578125, - "entropy_loss": -0.136199951171875, - "epoch": 0.1652, - "grad_norm": 1.9225217504443266, - "k1_kl": 0.0324249267578125, - "k3_kl": 0.0234832763671875, - "kimi_kl": 0.04753875732421875, - "learning_rate": 4.1739999999999997e-07, - "loss": 0.0009, - "ppl": 0.07613372802734375, - "reward": 0.9059461355209351, - "reward_std": 0.07889758795499802, - "rewards/perpo_ocr_edit_distance_reward": 0.9059461355209351, + "advantages": -3.0824117857264355e-05, + "completion_length": 375.0, + "delta_ref_entropy_loss": 0.040771484375, + "delta_ref_ppl": -0.041015625, + "entropy_loss": -0.02392578125, + "epoch": 0.0826, + "grad_norm": 0.6658890173310599, + "k1_kl": 0.041015625, + "k3_kl": 0.025146484375, + "kimi_kl": 0.04833984375, + "learning_rate": 4.587e-07, + "loss": 0.001, + "ppl": 0.0115966796875, + "reward": 0.97581547498703, + "reward_std": 0.00045284561929292977, + "rewards/perpo_ocr_edit_distance_reward": 0.9758155345916748, "step": 413, "temperature": 0.9 }, { - "advantages": -7.096784611348994e-05, - "completion_length": 570.0, - "delta_ref_entropy_loss": 0.0562744140625, - "delta_ref_ppl": -0.0260009765625, - "entropy_loss": -0.060791015625, - "epoch": 0.1656, - "grad_norm": 0.7724068989368181, - "k1_kl": 0.0260009765625, - "k3_kl": 0.0098876953125, - "kimi_kl": 0.015380859375, - "learning_rate": 4.172e-07, + "advantages": -7.435253792209551e-05, + "completion_length": 694.0, + "delta_ref_entropy_loss": 0.0272216796875, + "delta_ref_ppl": -0.0194091796875, + "entropy_loss": -0.042724609375, + "epoch": 0.0828, + "grad_norm": 0.6338118227565973, + "k1_kl": 0.0194091796875, + "k3_kl": 0.01116943359375, + "kimi_kl": 0.02001953125, + "learning_rate": 4.586e-07, "loss": 0.0005, - "ppl": 0.02984619140625, - "reward": 0.9866729378700256, - "reward_std": 0.0007404318021144718, - "rewards/perpo_ocr_edit_distance_reward": 0.9866729974746704, + "ppl": 0.0185546875, + "reward": 0.97786945104599, + "reward_std": 0.0009306828142143786, + "rewards/perpo_ocr_edit_distance_reward": 0.9778695106506348, "step": 414, "temperature": 0.9 }, { - "advantages": 3.974352785007795e-05, - "completion_length": 838.5, - "delta_ref_entropy_loss": 0.03179931640625, - "delta_ref_ppl": -0.02069091796875, - "entropy_loss": -0.0360107421875, - "epoch": 0.166, - "grad_norm": 0.4276389646752694, - "k1_kl": 0.0206298828125, - "k3_kl": 0.0090484619140625, - "kimi_kl": 0.016082763671875, - "learning_rate": 4.17e-07, - "loss": 0.0003, - "ppl": 0.01448822021484375, - "reward": 0.9832623302936554, - "reward_std": 0.0007014449438429438, - "rewards/perpo_ocr_edit_distance_reward": 0.9832623600959778, + "advantages": -9.298324584960938e-06, + "completion_length": 600.0, + "delta_ref_entropy_loss": 0.03271484375, + "delta_ref_ppl": -0.020263671875, + "entropy_loss": -0.028076171875, + "epoch": 0.083, + "grad_norm": 0.8217492095854754, + "k1_kl": 0.0201416015625, + "k3_kl": 0.00994873046875, + "kimi_kl": 0.017822265625, + "learning_rate": 4.585e-07, + "loss": 0.0004, + "ppl": 0.01336669921875, + "reward": 0.9757429957389832, + "reward_std": 0.0008154110400937498, + "rewards/perpo_ocr_edit_distance_reward": 0.9757430553436279, "step": 415, "temperature": 0.9 }, { - "advantages": -0.0001702943866774831, - "completion_length": 436.5, - "delta_ref_entropy_loss": 0.03594970703125, - "delta_ref_ppl": -0.02740478515625, - "entropy_loss": -0.025634765625, - "epoch": 0.1664, - "grad_norm": 1.2974476152949903, - "k1_kl": 0.0274658203125, - "k3_kl": 0.015625, - "kimi_kl": 0.03619384765625, - "learning_rate": 4.1679999999999997e-07, - "loss": 0.0008, - "ppl": 0.009613037109375, - "reward": 0.7591368854045868, - "reward_std": 0.09202849283078862, - "rewards/perpo_ocr_edit_distance_reward": 0.7591369152069092, + "advantages": -7.990429003257304e-05, + "completion_length": 257.0, + "delta_ref_entropy_loss": 0.043212890625, + "delta_ref_ppl": -0.035400390625, + "entropy_loss": -0.02392578125, + "epoch": 0.0832, + "grad_norm": 1.059215020340178, + "k1_kl": 0.035400390625, + "k3_kl": 0.0198974609375, + "kimi_kl": 0.039794921875, + "learning_rate": 4.584e-07, + "loss": 0.0009, + "ppl": 0.0146484375, + "reward": 0.9770569801330566, + "reward_std": 0.0006459528813138604, + "rewards/perpo_ocr_edit_distance_reward": 0.9770570397377014, "step": 416, "temperature": 0.9 }, { - "advantages": -5.4155084399099e-06, - "completion_length": 349.0, - "delta_ref_entropy_loss": 0.0487060546875, - "delta_ref_ppl": -0.04022216796875, - "entropy_loss": -0.05181884765625, - "epoch": 0.1668, - "grad_norm": 1.2831979395528252, - "k1_kl": 0.04010009765625, - "k3_kl": 0.019317626953125, - "kimi_kl": 0.0404052734375, - "learning_rate": 4.166e-07, - "loss": 0.0008, - "ppl": 0.026123046875, - "reward": 0.9912316799163818, - "reward_std": 0.0009611002606106922, - "rewards/perpo_ocr_edit_distance_reward": 0.9912317395210266, + "advantages": -1.393045749864541e-05, + "completion_length": 809.0, + "delta_ref_entropy_loss": 0.055908203125, + "delta_ref_ppl": -0.043212890625, + "entropy_loss": -0.12060546875, + "epoch": 0.0834, + "grad_norm": 1.7590602673068592, + "k1_kl": 0.043212890625, + "k3_kl": 0.025146484375, + "kimi_kl": 0.05712890625, + "learning_rate": 4.5829999999999997e-07, + "loss": 0.001, + "ppl": 0.0634765625, + "reward": 0.9554242491722107, + "reward_std": 0.005403791554272175, + "rewards/perpo_ocr_edit_distance_reward": 0.9554243683815002, "step": 417, "temperature": 0.9 }, { - "advantages": -7.486343747586943e-05, - "completion_length": 609.5, - "delta_ref_entropy_loss": 0.0394287109375, - "delta_ref_ppl": -0.02435302734375, - "entropy_loss": -0.046630859375, - "epoch": 0.1672, - "grad_norm": 0.6823322866666103, - "k1_kl": 0.02435302734375, - "k3_kl": 0.011444091796875, - "kimi_kl": 0.022216796875, - "learning_rate": 4.164e-07, - "loss": 0.0005, - "ppl": 0.02490234375, - "reward": 0.9854454398155212, - "reward_std": 0.0012921103334520012, - "rewards/perpo_ocr_edit_distance_reward": 0.9854455292224884, + "advantages": -3.789152515309979e-06, + "completion_length": 565.0, + "delta_ref_entropy_loss": 0.022216796875, + "delta_ref_ppl": -0.0390625, + "entropy_loss": -0.03955078125, + "epoch": 0.0836, + "grad_norm": 4.744744790389578, + "k1_kl": 0.0390625, + "k3_kl": 0.027587890625, + "kimi_kl": 0.1025390625, + "learning_rate": 4.5819999999999996e-07, + "loss": 0.0011, + "ppl": 0.018310546875, + "reward": 0.6902033090591431, + "reward_std": 0.008894486352801323, + "rewards/perpo_ocr_edit_distance_reward": 0.6902033686637878, "step": 418, "temperature": 0.9 }, { - "advantages": -3.065381861233618e-05, - "completion_length": 608.0, - "delta_ref_entropy_loss": 0.04296875, - "delta_ref_ppl": -0.03076171875, - "entropy_loss": -0.02874755859375, - "epoch": 0.1676, - "grad_norm": 1.9129418622760648, - "k1_kl": 0.0306396484375, - "k3_kl": 0.01580810546875, - "kimi_kl": 0.0355224609375, - "learning_rate": 4.162e-07, - "loss": 0.0007, - "ppl": 0.0134429931640625, - "reward": 0.9832972586154938, - "reward_std": 0.028002482373267412, - "rewards/perpo_ocr_edit_distance_reward": 0.9832974076271057, + "advantages": -7.31008403818123e-05, + "completion_length": 908.0, + "delta_ref_entropy_loss": 0.0269775390625, + "delta_ref_ppl": -0.01611328125, + "entropy_loss": -0.0361328125, + "epoch": 0.0838, + "grad_norm": 1.6466309007730862, + "k1_kl": 0.01611328125, + "k3_kl": 0.0084228515625, + "kimi_kl": 0.0167236328125, + "learning_rate": 4.581e-07, + "loss": 0.0004, + "ppl": 0.02099609375, + "reward": 0.972606897354126, + "reward_std": 0.00048234520363621414, + "rewards/perpo_ocr_edit_distance_reward": 0.972606897354126, "step": 419, "temperature": 0.9 }, { - "advantages": -0.00016527304478586302, - "completion_length": 691.0, - "delta_ref_entropy_loss": 0.0362548828125, - "delta_ref_ppl": -0.0350341796875, - "entropy_loss": -0.0498046875, - "epoch": 0.168, - "grad_norm": 0.6977844568962203, - "k1_kl": 0.0350341796875, - "k3_kl": 0.0220947265625, - "kimi_kl": 0.064697265625, - "learning_rate": 4.1599999999999997e-07, - "loss": 0.0011, - "ppl": 0.023895263671875, - "reward": 0.988583892583847, - "reward_std": 0.0009376117959618568, - "rewards/perpo_ocr_edit_distance_reward": 0.9885839223861694, + "advantages": 3.916876778475853e-07, + "completion_length": 76.0, + "delta_ref_entropy_loss": 0.042236328125, + "delta_ref_ppl": -0.13671875, + "entropy_loss": -0.154296875, + "epoch": 0.084, + "grad_norm": 4.778306571892695, + "k1_kl": 0.13671875, + "k3_kl": 0.1005859375, + "kimi_kl": 0.220703125, + "learning_rate": 4.58e-07, + "loss": 0.004, + "ppl": 0.095703125, + "reward": 0.8999999761581421, + "reward_std": 0.021942678838968277, + "rewards/perpo_ocr_edit_distance_reward": 0.8999999761581421, "step": 420, "temperature": 0.9 }, { - "advantages": -0.00040016855928115547, - "completion_length": 436.5, - "delta_ref_entropy_loss": 0.0224609375, - "delta_ref_ppl": -0.014678955078125, - "entropy_loss": -0.0075531005859375, - "epoch": 0.1684, - "grad_norm": 0.08671775226247044, - "k1_kl": 0.014739990234375, - "k3_kl": 0.0061187744140625, - "kimi_kl": 0.01007080078125, - "learning_rate": 4.158e-07, + "advantages": -1.3623919414840202e-07, + "completion_length": 733.0, + "delta_ref_entropy_loss": 0.025634765625, + "delta_ref_ppl": -0.0230712890625, + "entropy_loss": -0.05615234375, + "epoch": 0.0842, + "grad_norm": 0.9027632351609498, + "k1_kl": 0.0230712890625, + "k3_kl": 0.0157470703125, + "kimi_kl": 0.0277099609375, + "learning_rate": 4.5789999999999994e-07, "loss": 0.0006, - "ppl": 0.001495361328125, - "reward": 0.9980114102363586, - "reward_std": 9.588102693669498e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9980114698410034, + "ppl": 0.0255126953125, + "reward": 0.5127891898155212, + "reward_std": 0.03725038841366768, + "rewards/perpo_ocr_edit_distance_reward": 0.512789249420166, "step": 421, "temperature": 0.9 }, { - "advantages": -0.00029854689330477413, - "completion_length": 947.0, - "delta_ref_entropy_loss": 0.0706787109375, - "delta_ref_ppl": -0.0887603759765625, - "entropy_loss": -0.079345703125, - "epoch": 0.1688, - "grad_norm": 62.90419638225752, - "k1_kl": 0.088775634765625, - "k3_kl": 0.0677490234375, - "kimi_kl": 0.10552978515625, - "learning_rate": 4.156e-07, - "loss": 0.003, - "ppl": 0.032470703125, - "reward": 0.9143741130828857, - "reward_std": 0.03252432867884636, - "rewards/perpo_ocr_edit_distance_reward": 0.9143742322921753, + "advantages": -1.3070447494101245e-05, + "completion_length": 620.0, + "delta_ref_entropy_loss": 0.028564453125, + "delta_ref_ppl": -0.025146484375, + "entropy_loss": -0.023681640625, + "epoch": 0.0844, + "grad_norm": 1.0801511479515233, + "k1_kl": 0.025146484375, + "k3_kl": 0.0140380859375, + "kimi_kl": 0.03125, + "learning_rate": 4.578e-07, + "loss": 0.0006, + "ppl": 0.01153564453125, + "reward": 0.9787060022354126, + "reward_std": 0.00185338722076267, + "rewards/perpo_ocr_edit_distance_reward": 0.9787061214447021, "step": 422, "temperature": 0.9 }, { - "advantages": -4.180840278422693e-06, - "completion_length": 511.0, - "delta_ref_entropy_loss": 0.08251953125, - "delta_ref_ppl": -0.0499267578125, - "entropy_loss": -0.19775390625, - "epoch": 0.1692, - "grad_norm": 1.6122167297132413, - "k1_kl": 0.0499267578125, - "k3_kl": 0.0281982421875, - "kimi_kl": 0.0701904296875, - "learning_rate": 4.154e-07, - "loss": 0.0011, - "ppl": 0.110595703125, - "reward": 0.8682056665420532, - "reward_std": 0.008558978792279959, - "rewards/perpo_ocr_edit_distance_reward": 0.8682056963443756, + "advantages": -7.144042683648877e-06, + "completion_length": 907.0, + "delta_ref_entropy_loss": 0.045654296875, + "delta_ref_ppl": -0.026611328125, + "entropy_loss": -0.035888671875, + "epoch": 0.0846, + "grad_norm": 1.9382797826842066, + "k1_kl": 0.0264892578125, + "k3_kl": 0.0135498046875, + "kimi_kl": 0.026123046875, + "learning_rate": 4.577e-07, + "loss": 0.0006, + "ppl": 0.0169677734375, + "reward": 0.904590904712677, + "reward_std": 0.0046593728475272655, + "rewards/perpo_ocr_edit_distance_reward": 0.9045909643173218, "step": 423, "temperature": 0.9 }, { - "advantages": 9.621893468647613e-07, - "completion_length": 853.0, - "delta_ref_entropy_loss": 0.03125, - "delta_ref_ppl": -0.0194091796875, - "entropy_loss": -0.03289794921875, - "epoch": 0.1696, - "grad_norm": 0.8412924847348104, - "k1_kl": 0.01934814453125, - "k3_kl": 0.0103759765625, - "kimi_kl": 0.0211181640625, - "learning_rate": 4.152e-07, - "loss": 0.0004, - "ppl": 0.01593017578125, - "reward": 0.9796364009380341, - "reward_std": 0.005221313505899161, - "rewards/perpo_ocr_edit_distance_reward": 0.9796363711357117, + "advantages": -5.642857286147773e-05, + "completion_length": 1504.0, + "delta_ref_entropy_loss": 0.03662109375, + "delta_ref_ppl": -0.0230712890625, + "entropy_loss": -0.034423828125, + "epoch": 0.0848, + "grad_norm": 2.0057981303865904, + "k1_kl": 0.0230712890625, + "k3_kl": 0.0111083984375, + "kimi_kl": 0.018310546875, + "learning_rate": 4.5759999999999997e-07, + "loss": 0.0005, + "ppl": 0.0206298828125, + "reward": 0.9717778563499451, + "reward_std": 0.0015599685721099377, + "rewards/perpo_ocr_edit_distance_reward": 0.9717779755592346, "step": 424, "temperature": 0.9 }, { - "advantages": -0.0001005572985377512, - "completion_length": 459.0, - "delta_ref_entropy_loss": 0.038818359375, - "delta_ref_ppl": -0.02642822265625, - "entropy_loss": -0.0594482421875, - "epoch": 0.17, - "grad_norm": 0.589259420381936, - "k1_kl": 0.02642822265625, - "k3_kl": 0.014862060546875, - "kimi_kl": 0.03094482421875, - "learning_rate": 4.1499999999999994e-07, - "loss": 0.0007, - "ppl": 0.02777099609375, - "reward": 0.9224341511726379, - "reward_std": 0.0019896655721822754, - "rewards/perpo_ocr_edit_distance_reward": 0.9224341809749603, + "advantages": -4.002026230409683e-07, + "completion_length": 227.0, + "delta_ref_entropy_loss": 0.09619140625, + "delta_ref_ppl": -0.12890625, + "entropy_loss": -0.2197265625, + "epoch": 0.085, + "grad_norm": 3.2301226142935584, + "k1_kl": 0.12890625, + "k3_kl": 0.078125, + "kimi_kl": 0.248046875, + "learning_rate": 4.575e-07, + "loss": 0.0031, + "ppl": 0.11767578125, + "reward": 0.7097199559211731, + "reward_std": 0.12592320144176483, + "rewards/perpo_ocr_edit_distance_reward": 0.7097200155258179, "step": 425, "temperature": 0.9 }, { - "advantages": -3.523911800584756e-05, - "completion_length": 513.5, - "delta_ref_entropy_loss": 0.03033447265625, - "delta_ref_ppl": -0.0250244140625, - "entropy_loss": -0.0230712890625, - "epoch": 0.1704, - "grad_norm": 0.335799126778236, - "k1_kl": 0.02508544921875, - "k3_kl": 0.0142822265625, - "kimi_kl": 0.034423828125, - "learning_rate": 4.148e-07, - "loss": 0.0006, - "ppl": 0.009674072265625, - "reward": 0.9947977662086487, - "reward_std": 0.0004999950615456328, - "rewards/perpo_ocr_edit_distance_reward": 0.9947978258132935, + "advantages": -6.556511493727157e-07, + "completion_length": 269.0, + "delta_ref_entropy_loss": 0.059814453125, + "delta_ref_ppl": -0.0654296875, + "entropy_loss": -0.10546875, + "epoch": 0.0852, + "grad_norm": 2.029850166674256, + "k1_kl": 0.06591796875, + "k3_kl": 0.0390625, + "kimi_kl": 0.09521484375, + "learning_rate": 4.5739999999999995e-07, + "loss": 0.0016, + "ppl": 0.06787109375, + "reward": 0.7733151316642761, + "reward_std": 0.07901666313409805, + "rewards/perpo_ocr_edit_distance_reward": 0.7733151316642761, "step": 426, "temperature": 0.9 }, { - "advantages": -1.2938465943079791e-05, - "completion_length": 631.0, - "delta_ref_entropy_loss": 0.040283203125, - "delta_ref_ppl": -0.024658203125, - "entropy_loss": -0.077392578125, - "epoch": 0.1708, - "grad_norm": 1.1193583545515766, - "k1_kl": 0.02459716796875, - "k3_kl": 0.01239013671875, - "kimi_kl": 0.02703857421875, - "learning_rate": 4.146e-07, - "loss": 0.0005, - "ppl": 0.03961181640625, - "reward": 0.8793007135391235, - "reward_std": 0.012286526151001453, - "rewards/perpo_ocr_edit_distance_reward": 0.8793007731437683, + "advantages": -4.669598274631426e-05, + "completion_length": 813.0, + "delta_ref_entropy_loss": 0.057373046875, + "delta_ref_ppl": -0.044677734375, + "entropy_loss": -0.09521484375, + "epoch": 0.0854, + "grad_norm": 1.168773820717993, + "k1_kl": 0.044921875, + "k3_kl": 0.026611328125, + "kimi_kl": 0.0556640625, + "learning_rate": 4.5729999999999995e-07, + "loss": 0.0011, + "ppl": 0.053466796875, + "reward": 0.9132817983627319, + "reward_std": 0.0015402344288304448, + "rewards/perpo_ocr_edit_distance_reward": 0.9132819175720215, "step": 427, "temperature": 0.9 }, { - "advantages": -0.0003027915954589844, - "completion_length": 458.5, - "delta_ref_entropy_loss": 0.04296875, - "delta_ref_ppl": -0.01947021484375, - "entropy_loss": -0.046173095703125, - "epoch": 0.1712, - "grad_norm": 0.5791189947662473, - "k1_kl": 0.01953125, - "k3_kl": 0.008758544921875, - "kimi_kl": 0.014892578125, - "learning_rate": 4.1439999999999995e-07, - "loss": 0.0007, - "ppl": 0.020229339599609375, - "reward": 0.9745430052280426, - "reward_std": 0.0012858828995376825, - "rewards/perpo_ocr_edit_distance_reward": 0.974543035030365, + "advantages": -4.0241651731776074e-05, + "completion_length": 724.0, + "delta_ref_entropy_loss": 0.04833984375, + "delta_ref_ppl": -0.043212890625, + "entropy_loss": -0.053955078125, + "epoch": 0.0856, + "grad_norm": 1.3026489047998038, + "k1_kl": 0.043212890625, + "k3_kl": 0.0234375, + "kimi_kl": 0.052978515625, + "learning_rate": 4.572e-07, + "loss": 0.001, + "ppl": 0.0250244140625, + "reward": 0.9441294074058533, + "reward_std": 0.0018044031457975507, + "rewards/perpo_ocr_edit_distance_reward": 0.944129467010498, "step": 428, "temperature": 0.9 }, { - "advantages": -0.000298018966402136, - "completion_length": 156.0, - "delta_ref_entropy_loss": 0.08831787109375, - "delta_ref_ppl": -0.19134521484375, - "entropy_loss": -0.18255615234375, - "epoch": 0.1716, - "grad_norm": 5.174464200976846, - "k1_kl": 0.19134521484375, - "k3_kl": 0.145172119140625, - "kimi_kl": 0.84429931640625, - "learning_rate": 4.142e-07, - "loss": 0.0061, - "ppl": 0.09423828125, - "reward": 0.6678064167499542, - "reward_std": 0.06267401576042175, - "rewards/perpo_ocr_edit_distance_reward": 0.6678064465522766, + "advantages": -1.7029898913278885e-07, + "completion_length": 654.0, + "delta_ref_entropy_loss": 0.05078125, + "delta_ref_ppl": -0.028076171875, + "entropy_loss": -0.068359375, + "epoch": 0.0858, + "grad_norm": 1.6150077077560887, + "k1_kl": 0.028076171875, + "k3_kl": 0.0167236328125, + "kimi_kl": 0.038818359375, + "learning_rate": 4.571e-07, + "loss": 0.0007, + "ppl": 0.035400390625, + "reward": 0.7695047855377197, + "reward_std": 0.09253177791833878, + "rewards/perpo_ocr_edit_distance_reward": 0.769504725933075, "step": 429, "temperature": 0.9 }, { - "advantages": -1.1444092478996026e-05, - "completion_length": 533.5, - "delta_ref_entropy_loss": 0.03741455078125, - "delta_ref_ppl": -0.024810791015625, - "entropy_loss": -0.047119140625, - "epoch": 0.172, - "grad_norm": 0.6602958890049054, - "k1_kl": 0.02484130859375, - "k3_kl": 0.0114593505859375, - "kimi_kl": 0.022674560546875, - "learning_rate": 4.14e-07, - "loss": 0.0005, - "ppl": 0.0226287841796875, - "reward": 0.9738067090511322, - "reward_std": 0.005826429056469351, - "rewards/perpo_ocr_edit_distance_reward": 0.9738067984580994, + "advantages": -5.876166687812656e-05, + "completion_length": 653.0, + "delta_ref_entropy_loss": 0.0185546875, + "delta_ref_ppl": -0.01171875, + "entropy_loss": -0.01708984375, + "epoch": 0.086, + "grad_norm": 0.3678044408372858, + "k1_kl": 0.01171875, + "k3_kl": 0.005615234375, + "kimi_kl": 0.00836181640625, + "learning_rate": 4.57e-07, + "loss": 0.0003, + "ppl": 0.006988525390625, + "reward": 0.9785735011100769, + "reward_std": 0.0003347682359162718, + "rewards/perpo_ocr_edit_distance_reward": 0.9785735607147217, "step": 430, "temperature": 0.9 }, { - "advantages": -0.00011501142216729932, - "completion_length": 574.0, - "delta_ref_entropy_loss": 0.01641845703125, - "delta_ref_ppl": -0.015869140625, - "entropy_loss": -0.025390625, - "epoch": 0.1724, - "grad_norm": 0.5334028192420339, - "k1_kl": 0.01580810546875, - "k3_kl": 0.0086822509765625, - "kimi_kl": 0.019439697265625, - "learning_rate": 4.1379999999999996e-07, - "loss": 0.0005, - "ppl": 0.0125579833984375, - "reward": 0.9965759217739105, - "reward_std": 0.0007600736571475863, - "rewards/perpo_ocr_edit_distance_reward": 0.9965760111808777, + "advantages": -2.35353199968813e-05, + "completion_length": 475.0, + "delta_ref_entropy_loss": 0.0478515625, + "delta_ref_ppl": -0.019287109375, + "entropy_loss": -0.06201171875, + "epoch": 0.0862, + "grad_norm": 1.0001200829250916, + "k1_kl": 0.019287109375, + "k3_kl": 0.00970458984375, + "kimi_kl": 0.0126953125, + "learning_rate": 4.5689999999999997e-07, + "loss": 0.0004, + "ppl": 0.03662109375, + "reward": 0.9544315338134766, + "reward_std": 0.0024307083804160357, + "rewards/perpo_ocr_edit_distance_reward": 0.9544315934181213, "step": 431, "temperature": 0.9 }, { - "advantages": -0.00011484112474136055, - "completion_length": 474.0, - "delta_ref_entropy_loss": 0.02532958984375, - "delta_ref_ppl": -0.02545166015625, - "entropy_loss": -0.021240234375, - "epoch": 0.1728, - "grad_norm": 0.6176682435678621, - "k1_kl": 0.02545166015625, - "k3_kl": 0.016021728515625, - "kimi_kl": 0.041534423828125, - "learning_rate": 4.136e-07, - "loss": 0.0008, - "ppl": 0.006622314453125, - "reward": 0.9984827041625977, - "reward_std": 0.0004120187513763085, - "rewards/perpo_ocr_edit_distance_reward": 0.9984827637672424, + "advantages": -1.0030610610556323e-05, + "completion_length": 709.0, + "delta_ref_entropy_loss": 0.031494140625, + "delta_ref_ppl": -0.0184326171875, + "entropy_loss": -0.0262451171875, + "epoch": 0.0864, + "grad_norm": 0.4422294311649597, + "k1_kl": 0.0184326171875, + "k3_kl": 0.009765625, + "kimi_kl": 0.0169677734375, + "learning_rate": 4.5679999999999996e-07, + "loss": 0.0004, + "ppl": 0.01202392578125, + "reward": 0.9655459523200989, + "reward_std": 0.0015955190174281597, + "rewards/perpo_ocr_edit_distance_reward": 0.9655459523200989, "step": 432, "temperature": 0.9 }, { - "advantages": -5.516835881280713e-05, - "completion_length": 617.0, - "delta_ref_entropy_loss": 0.02655029296875, - "delta_ref_ppl": -0.017486572265625, - "entropy_loss": -0.02874755859375, - "epoch": 0.1732, - "grad_norm": 0.5115109091100853, - "k1_kl": 0.0174560546875, - "k3_kl": 0.00750732421875, - "kimi_kl": 0.0120849609375, - "learning_rate": 4.134e-07, - "loss": 0.0004, - "ppl": 0.013824462890625, - "reward": 0.977005124092102, - "reward_std": 0.001224179199198261, - "rewards/perpo_ocr_edit_distance_reward": 0.9770051836967468, + "advantages": 7.93603976489976e-05, + "completion_length": 329.0, + "delta_ref_entropy_loss": 0.037109375, + "delta_ref_ppl": -0.01434326171875, + "entropy_loss": -0.015869140625, + "epoch": 0.0866, + "grad_norm": 0.028727342381650457, + "k1_kl": 0.014404296875, + "k3_kl": 0.005096435546875, + "kimi_kl": 0.007720947265625, + "learning_rate": 4.5669999999999995e-07, + "loss": 0.0001, + "ppl": 0.00494384765625, + "reward": 0.979762852191925, + "reward_std": 7.291969723155489e-06, + "rewards/perpo_ocr_edit_distance_reward": 0.9797629117965698, "step": 433, "temperature": 0.9 }, { - "advantages": -0.0003091011731157778, - "completion_length": 599.5, - "delta_ref_entropy_loss": 0.04296875, - "delta_ref_ppl": -0.02972412109375, - "entropy_loss": -0.03961181640625, - "epoch": 0.1736, - "grad_norm": 0.4890147602443322, - "k1_kl": 0.02972412109375, - "k3_kl": 0.0159912109375, - "kimi_kl": 0.03228759765625, - "learning_rate": 4.1319999999999997e-07, - "loss": 0.0009, - "ppl": 0.0204010009765625, - "reward": 0.9904180467128754, - "reward_std": 0.000718887138646096, - "rewards/perpo_ocr_edit_distance_reward": 0.9904181361198425, + "advantages": -6.387915345840156e-05, + "completion_length": 446.0, + "delta_ref_entropy_loss": 0.02197265625, + "delta_ref_ppl": -0.016357421875, + "entropy_loss": -0.022216796875, + "epoch": 0.0868, + "grad_norm": 0.5289210897469095, + "k1_kl": 0.016357421875, + "k3_kl": 0.0107421875, + "kimi_kl": 0.014892578125, + "learning_rate": 4.566e-07, + "loss": 0.0005, + "ppl": 0.009765625, + "reward": 0.97920161485672, + "reward_std": 0.00029986980371177197, + "rewards/perpo_ocr_edit_distance_reward": 0.97920161485672, "step": 434, "temperature": 0.9 }, { - "advantages": -0.0001221554703079164, - "completion_length": 1127.0, - "delta_ref_entropy_loss": 0.0400390625, - "delta_ref_ppl": -0.02069091796875, - "entropy_loss": -0.0701904296875, - "epoch": 0.174, - "grad_norm": 0.7918350317560543, - "k1_kl": 0.020538330078125, - "k3_kl": 0.012939453125, - "kimi_kl": 0.020172119140625, - "learning_rate": 4.1299999999999995e-07, - "loss": 0.0006, - "ppl": 0.03687286376953125, - "reward": 0.9851274788379669, - "reward_std": 0.0014121146305114962, - "rewards/perpo_ocr_edit_distance_reward": 0.9851275682449341, + "advantages": -3.320830364827998e-05, + "completion_length": 430.0, + "delta_ref_entropy_loss": 0.0255126953125, + "delta_ref_ppl": -0.0220947265625, + "entropy_loss": -0.0291748046875, + "epoch": 0.087, + "grad_norm": 0.6174112118783716, + "k1_kl": 0.0220947265625, + "k3_kl": 0.0123291015625, + "kimi_kl": 0.028564453125, + "learning_rate": 4.565e-07, + "loss": 0.0005, + "ppl": 0.011962890625, + "reward": 0.9744426012039185, + "reward_std": 0.0024639342445880175, + "rewards/perpo_ocr_edit_distance_reward": 0.9744426608085632, "step": 435, "temperature": 0.9 }, { - "advantages": 1.1086464610343683e-05, - "completion_length": 811.5, - "delta_ref_entropy_loss": 0.03179931640625, - "delta_ref_ppl": -0.0181884765625, - "entropy_loss": -0.05560302734375, - "epoch": 0.1744, - "grad_norm": 0.6829203513236265, - "k1_kl": 0.01812744140625, - "k3_kl": 0.0080718994140625, - "kimi_kl": 0.01336669921875, - "learning_rate": 4.128e-07, - "loss": 0.0003, - "ppl": 0.029937744140625, - "reward": 0.9741853773593903, - "reward_std": 0.003857252770103514, - "rewards/perpo_ocr_edit_distance_reward": 0.9741854071617126, + "advantages": -3.760201798286289e-05, + "completion_length": 1488.0, + "delta_ref_entropy_loss": 0.01116943359375, + "delta_ref_ppl": -0.005218505859375, + "entropy_loss": -0.0281982421875, + "epoch": 0.0872, + "grad_norm": 0.3544445138581285, + "k1_kl": 0.005218505859375, + "k3_kl": 0.0025177001953125, + "kimi_kl": 0.00439453125, + "learning_rate": 4.5639999999999993e-07, + "loss": 0.0001, + "ppl": 0.01239013671875, + "reward": 0.8968610167503357, + "reward_std": 0.0021636656019836664, + "rewards/perpo_ocr_edit_distance_reward": 0.8968610763549805, "step": 436, "temperature": 0.9 }, { - "advantages": -1.762594592946698e-06, - "completion_length": 362.5, - "delta_ref_entropy_loss": 0.03497314453125, - "delta_ref_ppl": -0.230712890625, - "entropy_loss": -0.160858154296875, - "epoch": 0.1748, - "grad_norm": 0.26449608668460084, - "k1_kl": 0.229736328125, - "k3_kl": 0.14627838134765625, - "kimi_kl": 0.382080078125, - "learning_rate": 4.1260000000000003e-07, - "loss": 0.0059, - "ppl": 0.042327880859375, - "reward": 0.5373173914849758, - "reward_std": 0.00479854503646493, - "rewards/perpo_ocr_edit_distance_reward": 0.5373174250125885, + "advantages": -1.4628683857154101e-05, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.01068115234375, + "delta_ref_ppl": -0.00579833984375, + "entropy_loss": -0.05322265625, + "epoch": 0.0874, + "grad_norm": 0.824017036077703, + "k1_kl": 0.005859375, + "k3_kl": 0.004974365234375, + "kimi_kl": 0.0081787109375, + "learning_rate": 4.563e-07, + "loss": 0.0002, + "ppl": 0.0272216796875, + "reward": 0.3348921239376068, + "reward_std": 0.0010653588688001037, + "rewards/perpo_ocr_edit_distance_reward": 0.3348921537399292, "step": 437, "temperature": 0.9 }, { - "advantages": -1.1908156920981128e-05, - "completion_length": 459.0, - "delta_ref_entropy_loss": 0.05718994140625, - "delta_ref_ppl": -0.0322265625, - "entropy_loss": -0.04852294921875, - "epoch": 0.1752, - "grad_norm": 0.7731453019834386, - "k1_kl": 0.0322265625, - "k3_kl": 0.01519775390625, - "kimi_kl": 0.033447265625, - "learning_rate": 4.1239999999999996e-07, - "loss": 0.0006, - "ppl": 0.02471923828125, - "reward": 0.7487287074327469, - "reward_std": 0.002312093216460198, - "rewards/perpo_ocr_edit_distance_reward": 0.7487287521362305, + "advantages": -1.7029899268550253e-08, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.1748046875, + "delta_ref_ppl": -0.21875, + "entropy_loss": -0.30078125, + "epoch": 0.0876, + "grad_norm": 2.7298519628528437, + "k1_kl": 0.21875, + "k3_kl": 0.119140625, + "kimi_kl": 0.2177734375, + "learning_rate": 4.5619999999999997e-07, + "loss": 0.0048, + "ppl": 0.1396484375, + "reward": 0.4285678267478943, + "reward_std": 0.18097823858261108, + "rewards/perpo_ocr_edit_distance_reward": 0.4285678565502167, "step": 438, "temperature": 0.9 }, { - "advantages": -3.3753260403557306e-05, - "completion_length": 379.5, - "delta_ref_entropy_loss": 0.0694580078125, - "delta_ref_ppl": -0.04815673828125, - "entropy_loss": -0.1064453125, - "epoch": 0.1756, - "grad_norm": 0.7105125438141334, - "k1_kl": 0.04815673828125, - "k3_kl": 0.023651123046875, - "kimi_kl": 0.050018310546875, - "learning_rate": 4.122e-07, - "loss": 0.001, - "ppl": 0.05657958984375, - "reward": 0.9019496142864227, - "reward_std": 0.11170799526735209, - "rewards/perpo_ocr_edit_distance_reward": 0.9019496440887451, + "advantages": -3.3038004403351806e-06, + "completion_length": 583.0, + "delta_ref_entropy_loss": 0.01953125, + "delta_ref_ppl": -0.0211181640625, + "entropy_loss": -0.04345703125, + "epoch": 0.0878, + "grad_norm": 1.1151427279478918, + "k1_kl": 0.02099609375, + "k3_kl": 0.0125732421875, + "kimi_kl": 0.0238037109375, + "learning_rate": 4.5609999999999996e-07, + "loss": 0.0005, + "ppl": 0.0216064453125, + "reward": 0.8821086883544922, + "reward_std": 0.02071418985724449, + "rewards/perpo_ocr_edit_distance_reward": 0.882108747959137, "step": 439, "temperature": 0.9 }, { - "advantages": -1.9810030153166736e-05, - "completion_length": 534.5, - "delta_ref_entropy_loss": 0.0377197265625, - "delta_ref_ppl": -0.014739990234375, - "entropy_loss": -0.04510498046875, - "epoch": 0.176, - "grad_norm": 0.7252238892471847, - "k1_kl": 0.0147705078125, - "k3_kl": 0.0072784423828125, - "kimi_kl": 0.0117034912109375, - "learning_rate": 4.12e-07, - "loss": 0.0003, - "ppl": 0.021575927734375, - "reward": 0.9838152527809143, - "reward_std": 0.0012211665161885321, - "rewards/perpo_ocr_edit_distance_reward": 0.9838153719902039, + "advantages": -4.76837158203125e-07, + "completion_length": 684.0, + "delta_ref_entropy_loss": 0.031982421875, + "delta_ref_ppl": -0.0260009765625, + "entropy_loss": -0.053466796875, + "epoch": 0.088, + "grad_norm": 0.7415358480031647, + "k1_kl": 0.0260009765625, + "k3_kl": 0.01385498046875, + "kimi_kl": 0.0274658203125, + "learning_rate": 4.56e-07, + "loss": 0.0006, + "ppl": 0.03369140625, + "reward": 0.8099309206008911, + "reward_std": 0.12185259163379669, + "rewards/perpo_ocr_edit_distance_reward": 0.8099309802055359, "step": 440, "temperature": 0.9 }, { - "advantages": -1.14526073957677e-05, - "completion_length": 675.0, - "delta_ref_entropy_loss": 0.03363037109375, - "delta_ref_ppl": -0.01947021484375, - "entropy_loss": -0.0291748046875, - "epoch": 0.1764, - "grad_norm": 0.9269084093144283, - "k1_kl": 0.01959228515625, - "k3_kl": 0.0099334716796875, - "kimi_kl": 0.017242431640625, - "learning_rate": 4.1179999999999997e-07, - "loss": 0.0004, - "ppl": 0.01104736328125, - "reward": 0.9532181024551392, - "reward_std": 0.002539320499636233, - "rewards/perpo_ocr_edit_distance_reward": 0.9532181024551392, + "advantages": -3.8538662920473143e-05, + "completion_length": 690.0, + "delta_ref_entropy_loss": 0.0157470703125, + "delta_ref_ppl": -0.00567626953125, + "entropy_loss": -0.0166015625, + "epoch": 0.0882, + "grad_norm": 0.270646266877717, + "k1_kl": 0.005645751953125, + "k3_kl": 0.0022125244140625, + "kimi_kl": 0.0026092529296875, + "learning_rate": 4.559e-07, + "loss": 0.0001, + "ppl": 0.00787353515625, + "reward": 0.983831524848938, + "reward_std": 0.0005628874059766531, + "rewards/perpo_ocr_edit_distance_reward": 0.983831524848938, "step": 441, "temperature": 0.9 }, { - "advantages": -7.513165684258638e-05, - "completion_length": 471.5, - "delta_ref_entropy_loss": 0.0501708984375, - "delta_ref_ppl": -0.036865234375, - "entropy_loss": -0.027587890625, - "epoch": 0.1768, - "grad_norm": 0.6853510292630716, - "k1_kl": 0.036865234375, - "k3_kl": 0.02032470703125, - "kimi_kl": 0.0616455078125, - "learning_rate": 4.116e-07, - "loss": 0.0009, - "ppl": 0.012237548828125, - "reward": 0.9906066060066223, - "reward_std": 0.010227507154922932, - "rewards/perpo_ocr_edit_distance_reward": 0.9906066954135895, + "advantages": -1.1410032129788306e-05, + "completion_length": 615.0, + "delta_ref_entropy_loss": 0.03369140625, + "delta_ref_ppl": -0.018310546875, + "entropy_loss": -0.0113525390625, + "epoch": 0.0884, + "grad_norm": 1.331756709949405, + "k1_kl": 0.018310546875, + "k3_kl": 0.0091552734375, + "kimi_kl": 0.0238037109375, + "learning_rate": 4.5579999999999994e-07, + "loss": 0.0004, + "ppl": 0.0029449462890625, + "reward": 0.9507551789283752, + "reward_std": 0.005137949250638485, + "rewards/perpo_ocr_edit_distance_reward": 0.9507552981376648, "step": 442, "temperature": 0.9 }, { - "advantages": -5.228179134064703e-06, - "completion_length": 491.0, - "delta_ref_entropy_loss": 0.0457763671875, - "delta_ref_ppl": -0.0286865234375, - "entropy_loss": -0.083343505859375, - "epoch": 0.1772, - "grad_norm": 1.137018372574404, - "k1_kl": 0.028564453125, - "k3_kl": 0.01531982421875, - "kimi_kl": 0.0384521484375, - "learning_rate": 4.114e-07, - "loss": 0.0006, - "ppl": 0.044586181640625, - "reward": 0.9087736904621124, - "reward_std": 0.011184065137058496, - "rewards/perpo_ocr_edit_distance_reward": 0.9087737202644348, + "advantages": -9.806241723708808e-05, + "completion_length": 680.0, + "delta_ref_entropy_loss": 0.02685546875, + "delta_ref_ppl": -0.0223388671875, + "entropy_loss": -0.0220947265625, + "epoch": 0.0886, + "grad_norm": 0.4846449159001709, + "k1_kl": 0.0223388671875, + "k3_kl": 0.01519775390625, + "kimi_kl": 0.033203125, + "learning_rate": 4.557e-07, + "loss": 0.0007, + "ppl": 0.00799560546875, + "reward": 0.9851812720298767, + "reward_std": 0.00024736131308600307, + "rewards/perpo_ocr_edit_distance_reward": 0.9851813912391663, "step": 443, "temperature": 0.9 }, { - "advantages": -4.2574748704282683e-07, - "completion_length": 151.5, - "delta_ref_entropy_loss": 0.0430908203125, - "delta_ref_ppl": -0.0657958984375, - "entropy_loss": -0.044921875, - "epoch": 0.1776, - "grad_norm": 1.0410733926137647, - "k1_kl": 0.06573486328125, - "k3_kl": 0.04217529296875, - "kimi_kl": 0.13818359375, - "learning_rate": 4.112e-07, - "loss": 0.0017, - "ppl": 0.0201568603515625, - "reward": 0.9763459265232086, - "reward_std": 0.004882039036601782, - "rewards/perpo_ocr_edit_distance_reward": 0.9763459861278534, + "advantages": -7.37564914743416e-05, + "completion_length": 1170.0, + "delta_ref_entropy_loss": 0.0283203125, + "delta_ref_ppl": -0.0205078125, + "entropy_loss": -0.0361328125, + "epoch": 0.0888, + "grad_norm": 1.402597268006113, + "k1_kl": 0.0203857421875, + "k3_kl": 0.01214599609375, + "kimi_kl": 0.020751953125, + "learning_rate": 4.556e-07, + "loss": 0.0006, + "ppl": 0.017822265625, + "reward": 0.9767359495162964, + "reward_std": 0.0014011268503963947, + "rewards/perpo_ocr_edit_distance_reward": 0.9767361283302307, "step": 444, "temperature": 0.9 }, { - "advantages": -6.811959707420101e-08, - "completion_length": 380.0, - "delta_ref_entropy_loss": 0.0126953125, - "delta_ref_ppl": -0.2818603515625, - "entropy_loss": -0.28759765625, - "epoch": 0.178, - "grad_norm": 0.7630778338815882, - "k1_kl": 0.280029296875, - "k3_kl": 0.17529296875, - "kimi_kl": 0.3402099609375, - "learning_rate": 4.1099999999999996e-07, - "loss": 0.007, - "ppl": 0.11798095703125, - "reward": 0.3928178669884801, - "reward_std": 0.09648311138153076, - "rewards/perpo_ocr_edit_distance_reward": 0.39281789772212505, + "advantages": -6.74384000376449e-06, + "completion_length": 518.0, + "delta_ref_entropy_loss": 0.048095703125, + "delta_ref_ppl": -0.052734375, + "entropy_loss": -0.059814453125, + "epoch": 0.089, + "grad_norm": 1.4177782749159513, + "k1_kl": 0.052734375, + "k3_kl": 0.0296630859375, + "kimi_kl": 0.07470703125, + "learning_rate": 4.5549999999999997e-07, + "loss": 0.0012, + "ppl": 0.0303955078125, + "reward": 0.9186623692512512, + "reward_std": 0.01756577380001545, + "rewards/perpo_ocr_edit_distance_reward": 0.9186625480651855, "step": 445, "temperature": 0.9 }, { - "advantages": -0.00010215385350420547, - "completion_length": 531.5, - "delta_ref_entropy_loss": 0.06402587890625, - "delta_ref_ppl": -0.03369140625, - "entropy_loss": -0.112060546875, - "epoch": 0.1784, - "grad_norm": 1.4306797619694065, - "k1_kl": 0.0338134765625, - "k3_kl": 0.017608642578125, - "kimi_kl": 0.043212890625, - "learning_rate": 4.108e-07, - "loss": 0.0008, - "ppl": 0.0605010986328125, - "reward": 0.9052285850048065, - "reward_std": 0.02744645633356413, - "rewards/perpo_ocr_edit_distance_reward": 0.9052286744117737, + "advantages": -6.762573320884258e-05, + "completion_length": 264.0, + "delta_ref_entropy_loss": 0.03564453125, + "delta_ref_ppl": -0.05419921875, + "entropy_loss": -0.02490234375, + "epoch": 0.0892, + "grad_norm": 0.6754683890832346, + "k1_kl": 0.054443359375, + "k3_kl": 0.0390625, + "kimi_kl": 0.09521484375, + "learning_rate": 4.554e-07, + "loss": 0.0016, + "ppl": 0.0113525390625, + "reward": 0.9726095199584961, + "reward_std": 0.001032993313856423, + "rewards/perpo_ocr_edit_distance_reward": 0.9726095795631409, "step": 446, "temperature": 0.9 }, { - "advantages": 8.059400101956271e-06, - "completion_length": 541.5, - "delta_ref_entropy_loss": 0.0609130859375, - "delta_ref_ppl": -0.03692626953125, - "entropy_loss": -0.1162109375, - "epoch": 0.1788, - "grad_norm": 1.1626106758101544, - "k1_kl": 0.036865234375, - "k3_kl": 0.02325439453125, - "kimi_kl": 0.03594970703125, - "learning_rate": 4.106e-07, - "loss": 0.0009, - "ppl": 0.0654296875, - "reward": 0.9171877801418304, - "reward_std": 0.003722499532159418, - "rewards/perpo_ocr_edit_distance_reward": 0.9171877801418304, + "advantages": -0.0005960464477539062, + "completion_length": 63.0, + "delta_ref_entropy_loss": 0.033935546875, + "delta_ref_ppl": -0.04345703125, + "entropy_loss": -0.04248046875, + "epoch": 0.0894, + "grad_norm": 0.2643044892937474, + "k1_kl": 0.0439453125, + "k3_kl": 0.031982421875, + "kimi_kl": 0.057373046875, + "learning_rate": 4.5529999999999995e-07, + "loss": 0.0019, + "ppl": 0.0228271484375, + "reward": 0.9775280952453613, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9775281548500061, "step": 447, "temperature": 0.9 }, { - "advantages": -2.518296332709724e-05, - "completion_length": 1160.0, - "delta_ref_entropy_loss": 0.0482177734375, - "delta_ref_ppl": -0.037841796875, - "entropy_loss": -0.09765625, - "epoch": 0.1792, - "grad_norm": 7.1643882989193965, - "k1_kl": 0.0377197265625, - "k3_kl": 0.02960205078125, - "kimi_kl": 0.0479736328125, - "learning_rate": 4.1039999999999997e-07, - "loss": 0.0012, - "ppl": 0.057861328125, - "reward": 0.9558930397033691, - "reward_std": 0.0031752126524224877, - "rewards/perpo_ocr_edit_distance_reward": 0.9558931589126587, + "advantages": -8.404254913330078e-06, + "completion_length": 654.0, + "delta_ref_entropy_loss": 0.0751953125, + "delta_ref_ppl": -0.031005859375, + "entropy_loss": -0.1171875, + "epoch": 0.0896, + "grad_norm": 2.9772482717146596, + "k1_kl": 0.031005859375, + "k3_kl": 0.01251220703125, + "kimi_kl": 0.0230712890625, + "learning_rate": 4.5519999999999995e-07, + "loss": 0.0005, + "ppl": 0.056884765625, + "reward": 0.9446247220039368, + "reward_std": 0.001929540536366403, + "rewards/perpo_ocr_edit_distance_reward": 0.9446247220039368, "step": 448, "temperature": 0.9 }, { - "advantages": -0.00031306062464864226, - "completion_length": 720.5, - "delta_ref_entropy_loss": 0.0240478515625, - "delta_ref_ppl": -0.015899658203125, - "entropy_loss": -0.025604248046875, - "epoch": 0.1796, - "grad_norm": 1.7720031452785048, - "k1_kl": 0.015960693359375, - "k3_kl": 0.0079345703125, - "kimi_kl": 0.01776123046875, - "learning_rate": 4.102e-07, - "loss": 0.0006, - "ppl": 0.01171875, - "reward": 0.8918212652206421, - "reward_std": 0.0007995109772309661, - "rewards/perpo_ocr_edit_distance_reward": 0.8918213248252869, + "advantages": -2.7247838829680404e-07, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.025634765625, + "delta_ref_ppl": -0.018798828125, + "entropy_loss": -0.150390625, + "epoch": 0.0898, + "grad_norm": 2.1007579260205005, + "k1_kl": 0.0186767578125, + "k3_kl": 0.0096435546875, + "kimi_kl": 0.0179443359375, + "learning_rate": 4.551e-07, + "loss": 0.0004, + "ppl": 0.080078125, + "reward": 0.7702589631080627, + "reward_std": 0.21119622886180878, + "rewards/perpo_ocr_edit_distance_reward": 0.7702590823173523, "step": 449, "temperature": 0.9 }, { - "advantages": -2.4267605880368137e-07, - "completion_length": 127.0, - "delta_ref_entropy_loss": 0.061309814453125, - "delta_ref_ppl": -0.1187744140625, - "entropy_loss": -0.12353515625, - "epoch": 0.18, - "grad_norm": 7.294707187424293, - "k1_kl": 0.11883544921875, - "k3_kl": 0.076416015625, - "kimi_kl": 0.1588134765625, - "learning_rate": 4.0999999999999994e-07, - "loss": 0.0031, - "ppl": 0.066314697265625, - "reward": 0.6439372897148132, - "reward_std": 0.11943146958947182, - "rewards/perpo_ocr_edit_distance_reward": 0.6439374089241028, + "advantages": -2.213886909885332e-05, + "completion_length": 407.0, + "delta_ref_entropy_loss": 0.1015625, + "delta_ref_ppl": -0.0810546875, + "entropy_loss": -0.1748046875, + "epoch": 0.09, + "grad_norm": 1.563061683810121, + "k1_kl": 0.08154296875, + "k3_kl": 0.04443359375, + "kimi_kl": 0.1064453125, + "learning_rate": 4.55e-07, + "loss": 0.0018, + "ppl": 0.0927734375, + "reward": 0.8048640489578247, + "reward_std": 0.0033607580699026585, + "rewards/perpo_ocr_edit_distance_reward": 0.8048641085624695, "step": 450, "temperature": 0.9 }, { - "advantages": -0.0003681864109239541, - "completion_length": 231.5, - "delta_ref_entropy_loss": 0.0570068359375, - "delta_ref_ppl": -0.04107666015625, - "entropy_loss": -0.039794921875, - "epoch": 0.1804, - "grad_norm": 0.3546411625608517, - "k1_kl": 0.0408935546875, - "k3_kl": 0.02435302734375, - "kimi_kl": 0.0672607421875, - "learning_rate": 4.098e-07, - "loss": 0.0013, - "ppl": 0.0193939208984375, - "reward": 0.7719060778617859, - "reward_std": 0.00016236088413279504, - "rewards/perpo_ocr_edit_distance_reward": 0.771906167268753, + "advantages": -5.790165573671402e-07, + "completion_length": 387.0, + "delta_ref_entropy_loss": 0.047607421875, + "delta_ref_ppl": -0.0361328125, + "entropy_loss": -0.1005859375, + "epoch": 0.0902, + "grad_norm": 1.9719261268091444, + "k1_kl": 0.0361328125, + "k3_kl": 0.0224609375, + "kimi_kl": 0.038818359375, + "learning_rate": 4.5490000000000003e-07, + "loss": 0.0009, + "ppl": 0.055419921875, + "reward": 0.8285498023033142, + "reward_std": 0.1413556933403015, + "rewards/perpo_ocr_edit_distance_reward": 0.8285499215126038, "step": 451, "temperature": 0.9 }, { - "advantages": -6.292122088780161e-05, - "completion_length": 437.0, - "delta_ref_entropy_loss": 0.0308837890625, - "delta_ref_ppl": -0.08544921875, - "entropy_loss": -0.08203125, - "epoch": 0.1808, - "grad_norm": 1.6244956969523, - "k1_kl": 0.08544921875, - "k3_kl": 0.0565185546875, - "kimi_kl": 0.146240234375, - "learning_rate": 4.096e-07, - "loss": 0.0023, - "ppl": 0.025726318359375, - "reward": 0.6671512871980667, - "reward_std": 0.0011583643063204363, - "rewards/perpo_ocr_edit_distance_reward": 0.6671513319015503, + "advantages": -2.627713547553867e-05, + "completion_length": 558.0, + "delta_ref_entropy_loss": 0.01434326171875, + "delta_ref_ppl": -0.00811767578125, + "entropy_loss": -0.0191650390625, + "epoch": 0.0904, + "grad_norm": 0.6284605392005902, + "k1_kl": 0.00811767578125, + "k3_kl": 0.006134033203125, + "kimi_kl": 0.01116943359375, + "learning_rate": 4.5479999999999997e-07, + "loss": 0.0003, + "ppl": 0.010009765625, + "reward": 0.9835861921310425, + "reward_std": 0.001195251476019621, + "rewards/perpo_ocr_edit_distance_reward": 0.983586311340332, "step": 452, "temperature": 0.9 }, { - "advantages": -2.6685852390073705e-05, - "completion_length": 277.0, - "delta_ref_entropy_loss": 0.0836181640625, - "delta_ref_ppl": -0.0565185546875, - "entropy_loss": -0.10400390625, - "epoch": 0.1812, - "grad_norm": 1.4050111782149874, - "k1_kl": 0.0567626953125, - "k3_kl": 0.02520751953125, - "kimi_kl": 0.04345703125, - "learning_rate": 4.0939999999999995e-07, - "loss": 0.001, - "ppl": 0.055419921875, - "reward": 0.9651956558227539, - "reward_std": 0.0018432127544656396, - "rewards/perpo_ocr_edit_distance_reward": 0.9651957154273987, + "advantages": 3.4059798537100505e-08, + "completion_length": 942.0, + "delta_ref_entropy_loss": 0.051513671875, + "delta_ref_ppl": -0.0203857421875, + "entropy_loss": -0.255859375, + "epoch": 0.0906, + "grad_norm": 0.9896487717790329, + "k1_kl": 0.0205078125, + "k3_kl": 0.0098876953125, + "kimi_kl": 0.0142822265625, + "learning_rate": 4.5469999999999996e-07, + "loss": 0.0004, + "ppl": 0.126953125, + "reward": 0.8004648685455322, + "reward_std": 0.25852832198143005, + "rewards/perpo_ocr_edit_distance_reward": 0.8004648685455322, "step": 453, "temperature": 0.9 }, { - "advantages": -0.00029893432349581417, - "completion_length": 458.5, - "delta_ref_entropy_loss": 0.02203369140625, - "delta_ref_ppl": -0.0238037109375, - "entropy_loss": -0.0155181884765625, - "epoch": 0.1816, - "grad_norm": 0.4662933529509379, - "k1_kl": 0.023773193359375, - "k3_kl": 0.01494598388671875, - "kimi_kl": 0.0578155517578125, - "learning_rate": 4.092e-07, - "loss": 0.0009, - "ppl": 0.005558013916015625, - "reward": 0.9810854494571686, - "reward_std": 0.006954526994377375, - "rewards/perpo_ocr_edit_distance_reward": 0.9810855090618134, + "advantages": -2.4778503302513855e-06, + "completion_length": 440.0, + "delta_ref_entropy_loss": 0.0303955078125, + "delta_ref_ppl": -0.03564453125, + "entropy_loss": -0.046630859375, + "epoch": 0.0908, + "grad_norm": 1.139247721108332, + "k1_kl": 0.03564453125, + "k3_kl": 0.024169921875, + "kimi_kl": 0.07275390625, + "learning_rate": 4.546e-07, + "loss": 0.001, + "ppl": 0.021484375, + "reward": 0.9040901064872742, + "reward_std": 0.0033434629440307617, + "rewards/perpo_ocr_edit_distance_reward": 0.904090166091919, "step": 454, "temperature": 0.9 }, { - "advantages": -0.0003333432359795552, - "completion_length": 450.5, - "delta_ref_entropy_loss": 0.0189208984375, - "delta_ref_ppl": -0.0235595703125, - "entropy_loss": -0.019683837890625, - "epoch": 0.182, - "grad_norm": 0.3527214245847551, - "k1_kl": 0.023681640625, - "k3_kl": 0.015869140625, - "kimi_kl": 0.038818359375, - "learning_rate": 4.0899999999999997e-07, - "loss": 0.001, - "ppl": 0.00946044921875, - "reward": 0.997545450925827, - "reward_std": 0.0003116075531579554, - "rewards/perpo_ocr_edit_distance_reward": 0.9975455105304718, + "advantages": -4.567419091472402e-05, + "completion_length": 660.0, + "delta_ref_entropy_loss": 0.024658203125, + "delta_ref_ppl": -0.0400390625, + "entropy_loss": -0.028076171875, + "epoch": 0.091, + "grad_norm": 0.3764662786000261, + "k1_kl": 0.0400390625, + "k3_kl": 0.0306396484375, + "kimi_kl": 0.1357421875, + "learning_rate": 4.545e-07, + "loss": 0.0013, + "ppl": 0.01312255859375, + "reward": 0.9828822612762451, + "reward_std": 0.0006457444396801293, + "rewards/perpo_ocr_edit_distance_reward": 0.9828823804855347, "step": 455, "temperature": 0.9 }, { - "advantages": -6.258275347548192e-05, - "completion_length": 466.5, - "delta_ref_entropy_loss": 0.03857421875, - "delta_ref_ppl": -0.1380615234375, - "entropy_loss": -0.195556640625, - "epoch": 0.1824, - "grad_norm": 7.242234195228801, - "k1_kl": 0.13714599609375, - "k3_kl": 0.09466552734375, - "kimi_kl": 0.2816162109375, - "learning_rate": 4.0879999999999995e-07, - "loss": 0.0038, - "ppl": 0.092193603515625, - "reward": 0.6508784741163254, - "reward_std": 0.07724606202100404, - "rewards/perpo_ocr_edit_distance_reward": 0.650878518819809, + "advantages": -3.804479638347402e-05, + "completion_length": 1207.0, + "delta_ref_entropy_loss": 0.04052734375, + "delta_ref_ppl": -0.031005859375, + "entropy_loss": -0.052734375, + "epoch": 0.0912, + "grad_norm": 1.024833530612664, + "k1_kl": 0.0311279296875, + "k3_kl": 0.0169677734375, + "kimi_kl": 0.03515625, + "learning_rate": 4.544e-07, + "loss": 0.0007, + "ppl": 0.026611328125, + "reward": 0.965644359588623, + "reward_std": 0.0019139588112011552, + "rewards/perpo_ocr_edit_distance_reward": 0.9656444191932678, "step": 456, "temperature": 0.9 }, { - "advantages": -0.0003403340087970719, - "completion_length": 368.5, - "delta_ref_entropy_loss": 0.030517578125, - "delta_ref_ppl": -0.0191650390625, - "entropy_loss": -0.0338134765625, - "epoch": 0.1828, - "grad_norm": 0.880513370806339, - "k1_kl": 0.0191650390625, - "k3_kl": 0.010040283203125, - "kimi_kl": 0.01507568359375, - "learning_rate": 4.086e-07, - "loss": 0.0007, - "ppl": 0.0163421630859375, - "reward": 0.9970274269580841, - "reward_std": 0.00035257526906207204, - "rewards/perpo_ocr_edit_distance_reward": 0.9970275163650513, + "advantages": -4.2004245187854394e-05, + "completion_length": 315.0, + "delta_ref_entropy_loss": 0.05908203125, + "delta_ref_ppl": -0.042236328125, + "entropy_loss": -0.046875, + "epoch": 0.0914, + "grad_norm": 1.515774506082453, + "k1_kl": 0.042236328125, + "k3_kl": 0.0245361328125, + "kimi_kl": 0.05126953125, + "learning_rate": 4.543e-07, + "loss": 0.001, + "ppl": 0.021240234375, + "reward": 0.9716734290122986, + "reward_std": 0.0003054398111999035, + "rewards/perpo_ocr_edit_distance_reward": 0.9716734886169434, "step": 457, "temperature": 0.9 }, { - "advantages": 4.512922942012665e-07, - "completion_length": 592.5, - "delta_ref_entropy_loss": 0.105712890625, - "delta_ref_ppl": -0.05419921875, - "entropy_loss": -0.20263671875, - "epoch": 0.1832, - "grad_norm": 1.9727333612233713, - "k1_kl": 0.0540771484375, - "k3_kl": 0.02630615234375, - "kimi_kl": 0.0501708984375, - "learning_rate": 4.084e-07, - "loss": 0.0011, - "ppl": 0.117431640625, - "reward": 0.7146188020706177, - "reward_std": 0.011104196310043335, - "rewards/perpo_ocr_edit_distance_reward": 0.7146188318729401, + "advantages": -0.00012369499017950147, + "completion_length": 462.0, + "delta_ref_entropy_loss": 0.032958984375, + "delta_ref_ppl": -0.01806640625, + "entropy_loss": -0.021240234375, + "epoch": 0.0916, + "grad_norm": 0.8010279659868748, + "k1_kl": 0.0181884765625, + "k3_kl": 0.01287841796875, + "kimi_kl": 0.0191650390625, + "learning_rate": 4.542e-07, + "loss": 0.0006, + "ppl": 0.010009765625, + "reward": 0.9794973134994507, + "reward_std": 0.0003818439145106822, + "rewards/perpo_ocr_edit_distance_reward": 0.9794974327087402, "step": 458, "temperature": 0.9 }, { - "advantages": -9.597625830792822e-05, - "completion_length": 992.5, - "delta_ref_entropy_loss": 0.02056884765625, - "delta_ref_ppl": -0.010284423828125, - "entropy_loss": -0.03173828125, - "epoch": 0.1836, - "grad_norm": 0.3742464211631865, - "k1_kl": 0.010284423828125, - "k3_kl": 0.00450897216796875, - "kimi_kl": 0.00762939453125, - "learning_rate": 4.0819999999999996e-07, - "loss": 0.0003, - "ppl": 0.015289306640625, - "reward": 0.9963189363479614, - "reward_std": 0.0007770252414047718, - "rewards/perpo_ocr_edit_distance_reward": 0.9963189959526062, + "advantages": -1.2125287867092993e-05, + "completion_length": 590.0, + "delta_ref_entropy_loss": 0.0302734375, + "delta_ref_ppl": -0.033447265625, + "entropy_loss": -0.0177001953125, + "epoch": 0.0918, + "grad_norm": 0.39739848762496466, + "k1_kl": 0.033447265625, + "k3_kl": 0.02392578125, + "kimi_kl": 0.1025390625, + "learning_rate": 4.5409999999999997e-07, + "loss": 0.001, + "ppl": 0.00750732421875, + "reward": 0.9836231470108032, + "reward_std": 0.0006027533672749996, + "rewards/perpo_ocr_edit_distance_reward": 0.983623206615448, "step": 459, "temperature": 0.9 }, { - "advantages": 4.470348642371391e-07, - "completion_length": 591.5, - "delta_ref_entropy_loss": 0.08404541015625, - "delta_ref_ppl": -0.04345703125, - "entropy_loss": -0.1573486328125, - "epoch": 0.184, - "grad_norm": 1.918501387261608, - "k1_kl": 0.04345703125, - "k3_kl": 0.020050048828125, - "kimi_kl": 0.031707763671875, - "learning_rate": 4.0799999999999995e-07, + "advantages": -0.0001023965232889168, + "completion_length": 721.0, + "delta_ref_entropy_loss": 0.0247802734375, + "delta_ref_ppl": -0.02880859375, + "entropy_loss": -0.0244140625, + "epoch": 0.092, + "grad_norm": 0.7559793452763748, + "k1_kl": 0.02880859375, + "k3_kl": 0.0167236328125, + "kimi_kl": 0.0390625, + "learning_rate": 4.54e-07, "loss": 0.0008, - "ppl": 0.083953857421875, - "reward": 0.4975941553711891, - "reward_std": 0.0047471304424107075, - "rewards/perpo_ocr_edit_distance_reward": 0.4975941553711891, + "ppl": 0.00958251953125, + "reward": 0.9828248023986816, + "reward_std": 0.0005653434200212359, + "rewards/perpo_ocr_edit_distance_reward": 0.9828249216079712, "step": 460, "temperature": 0.9 }, { - "advantages": -4.9020564802049194e-05, - "completion_length": 1194.0, - "delta_ref_entropy_loss": 0.02197265625, - "delta_ref_ppl": -0.015838623046875, - "entropy_loss": -0.044921875, - "epoch": 0.1844, - "grad_norm": 1.1014170375513561, - "k1_kl": 0.01580810546875, - "k3_kl": 0.0093231201171875, - "kimi_kl": 0.037139892578125, - "learning_rate": 4.078e-07, - "loss": 0.0004, - "ppl": 0.02374267578125, - "reward": 0.9961119294166565, - "reward_std": 0.0011700853356160223, - "rewards/perpo_ocr_edit_distance_reward": 0.9961119890213013, + "advantages": -0.0005960464477539062, + "completion_length": 328.0, + "delta_ref_entropy_loss": 0.021728515625, + "delta_ref_ppl": -0.0162353515625, + "entropy_loss": -0.01177978515625, + "epoch": 0.0922, + "grad_norm": 0.011403703610522002, + "k1_kl": 0.0162353515625, + "k3_kl": 0.00799560546875, + "kimi_kl": 0.0146484375, + "learning_rate": 4.539e-07, + "loss": 0.0009, + "ppl": 0.0023193359375, + "reward": 0.9842848777770996, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9842849969863892, "step": 461, "temperature": 0.9 }, { - "advantages": -9.541852549421748e-05, - "completion_length": 851.5, - "delta_ref_entropy_loss": 0.0562744140625, - "delta_ref_ppl": -0.0328369140625, - "entropy_loss": -0.14129638671875, - "epoch": 0.1848, - "grad_norm": 2.7676603327117744, - "k1_kl": 0.0330810546875, - "k3_kl": 0.016021728515625, - "kimi_kl": 0.03387451171875, - "learning_rate": 4.076e-07, - "loss": 0.0007, - "ppl": 0.0792236328125, - "reward": 0.6575115025043488, - "reward_std": 0.09472301689675078, - "rewards/perpo_ocr_edit_distance_reward": 0.6575115323066711, + "advantages": -0.0001368097000522539, + "completion_length": 267.0, + "delta_ref_entropy_loss": 0.028564453125, + "delta_ref_ppl": -0.01226806640625, + "entropy_loss": -0.0208740234375, + "epoch": 0.0924, + "grad_norm": 1.299632023040164, + "k1_kl": 0.01239013671875, + "k3_kl": 0.0032196044921875, + "kimi_kl": 0.00408935546875, + "learning_rate": 4.5379999999999995e-07, + "loss": 0.0003, + "ppl": 0.00830078125, + "reward": 0.9794649481773376, + "reward_std": 0.0005223894258961082, + "rewards/perpo_ocr_edit_distance_reward": 0.9794650077819824, "step": 462, "temperature": 0.9 }, { - "advantages": -1.0762896636151709e-05, - "completion_length": 835.5, - "delta_ref_entropy_loss": 0.0174560546875, - "delta_ref_ppl": -0.0064697265625, - "entropy_loss": -0.0216064453125, - "epoch": 0.1852, - "grad_norm": 0.2696337106021107, - "k1_kl": 0.006439208984375, - "k3_kl": 0.0026702880859375, - "kimi_kl": 0.00339508056640625, - "learning_rate": 4.0739999999999996e-07, - "loss": 0.0001, - "ppl": 0.00902557373046875, - "reward": 0.9954689145088196, - "reward_std": 0.0017293124692514539, - "rewards/perpo_ocr_edit_distance_reward": 0.9954689741134644, + "advantages": -2.384185791015625e-07, + "completion_length": 493.0, + "delta_ref_entropy_loss": 0.07177734375, + "delta_ref_ppl": -0.05322265625, + "entropy_loss": -0.1259765625, + "epoch": 0.0926, + "grad_norm": 3.6127149597190007, + "k1_kl": 0.052978515625, + "k3_kl": 0.0283203125, + "kimi_kl": 0.04638671875, + "learning_rate": 4.537e-07, + "loss": 0.0011, + "ppl": 0.0654296875, + "reward": 0.7696003913879395, + "reward_std": 0.03801080584526062, + "rewards/perpo_ocr_edit_distance_reward": 0.7696004509925842, "step": 463, "temperature": 0.9 }, { - "advantages": -0.00040411097870673984, - "completion_length": 373.0, - "delta_ref_entropy_loss": 0.02496337890625, - "delta_ref_ppl": -0.01190185546875, - "entropy_loss": -0.0196533203125, - "epoch": 0.1856, - "grad_norm": 0.30230717451450545, - "k1_kl": 0.011810302734375, - "k3_kl": 0.004913330078125, - "kimi_kl": 0.008026123046875, - "learning_rate": 4.072e-07, - "loss": 0.0006, - "ppl": 0.0081024169921875, - "reward": 0.9961651563644409, - "reward_std": 0.00017073377966880798, - "rewards/perpo_ocr_edit_distance_reward": 0.9961652159690857, + "advantages": -8.514949456639442e-08, + "completion_length": 242.0, + "delta_ref_entropy_loss": 0.0235595703125, + "delta_ref_ppl": -0.05322265625, + "entropy_loss": -0.1591796875, + "epoch": 0.0928, + "grad_norm": 1.919813792551111, + "k1_kl": 0.052978515625, + "k3_kl": 0.04248046875, + "kimi_kl": 0.13671875, + "learning_rate": 4.536e-07, + "loss": 0.0017, + "ppl": 0.07666015625, + "reward": 0.6941400766372681, + "reward_std": 0.4293837249279022, + "rewards/perpo_ocr_edit_distance_reward": 0.6941400766372681, "step": 464, "temperature": 0.9 }, { - "advantages": -3.684418538796308e-05, - "completion_length": 269.5, - "delta_ref_entropy_loss": 0.0399169921875, - "delta_ref_ppl": -0.040283203125, - "entropy_loss": -0.07769775390625, - "epoch": 0.186, - "grad_norm": 1.00224083029551, - "k1_kl": 0.0400390625, - "k3_kl": 0.02581024169921875, - "kimi_kl": 0.0677947998046875, - "learning_rate": 4.07e-07, - "loss": 0.0011, - "ppl": 0.0386962890625, - "reward": 0.3178934268653393, - "reward_std": 0.005339631345123053, - "rewards/perpo_ocr_edit_distance_reward": 0.3178934305906296, + "advantages": -8.004052460819366e-07, + "completion_length": 1003.0, + "delta_ref_entropy_loss": 0.02783203125, + "delta_ref_ppl": -0.02099609375, + "entropy_loss": -0.041259765625, + "epoch": 0.093, + "grad_norm": 1.1961267526131556, + "k1_kl": 0.02099609375, + "k3_kl": 0.0120849609375, + "kimi_kl": 0.0233154296875, + "learning_rate": 4.535e-07, + "loss": 0.0005, + "ppl": 0.0216064453125, + "reward": 0.8722906112670898, + "reward_std": 0.07357166707515717, + "rewards/perpo_ocr_edit_distance_reward": 0.8722906708717346, "step": 465, "temperature": 0.9 }, { - "advantages": -6.769384981453186e-07, - "completion_length": 577.0, - "delta_ref_entropy_loss": 0.0992431640625, - "delta_ref_ppl": -0.061767578125, - "entropy_loss": -0.2138671875, - "epoch": 0.1864, - "grad_norm": 1.9909509967440162, - "k1_kl": 0.061767578125, - "k3_kl": 0.03057861328125, - "kimi_kl": 0.05029296875, - "learning_rate": 4.0679999999999996e-07, - "loss": 0.0012, - "ppl": 0.11474609375, - "reward": 0.49700942263007164, - "reward_std": 0.03016636474058032, - "rewards/perpo_ocr_edit_distance_reward": 0.4970094822347164, + "advantages": -0.00021336760255508125, + "completion_length": 909.0, + "delta_ref_entropy_loss": 0.0274658203125, + "delta_ref_ppl": -0.0177001953125, + "entropy_loss": -0.032958984375, + "epoch": 0.0932, + "grad_norm": 0.31416837363362937, + "k1_kl": 0.0177001953125, + "k3_kl": 0.00830078125, + "kimi_kl": 0.0150146484375, + "learning_rate": 4.534e-07, + "loss": 0.0005, + "ppl": 0.0133056640625, + "reward": 0.9864624738693237, + "reward_std": 0.00033899079426191747, + "rewards/perpo_ocr_edit_distance_reward": 0.9864625930786133, "step": 466, "temperature": 0.9 }, { - "advantages": -2.4961575036286376e-05, - "completion_length": 440.0, - "delta_ref_entropy_loss": 0.0401611328125, - "delta_ref_ppl": -0.0416259765625, - "entropy_loss": -0.07012939453125, - "epoch": 0.1868, - "grad_norm": 1.1370981774077498, - "k1_kl": 0.0413818359375, - "k3_kl": 0.0247802734375, - "kimi_kl": 0.0496826171875, - "learning_rate": 4.066e-07, - "loss": 0.001, - "ppl": 0.039520263671875, - "reward": 0.9797825515270233, - "reward_std": 0.017828369804192334, - "rewards/perpo_ocr_edit_distance_reward": 0.9797825813293457, + "advantages": -0.00014268075756262988, + "completion_length": 397.0, + "delta_ref_entropy_loss": 0.023681640625, + "delta_ref_ppl": -0.01068115234375, + "entropy_loss": -0.01043701171875, + "epoch": 0.0934, + "grad_norm": 0.26829081302859914, + "k1_kl": 0.01068115234375, + "k3_kl": 0.0036773681640625, + "kimi_kl": 0.00531005859375, + "learning_rate": 4.5329999999999996e-07, + "loss": 0.0003, + "ppl": 0.0036773681640625, + "reward": 0.98345547914505, + "reward_std": 0.0002580330765340477, + "rewards/perpo_ocr_edit_distance_reward": 0.9834555387496948, "step": 467, "temperature": 0.9 }, { - "advantages": -0.0002983127321556367, - "completion_length": 1135.0, - "delta_ref_entropy_loss": 0.038818359375, - "delta_ref_ppl": -0.023223876953125, - "entropy_loss": -0.0722198486328125, - "epoch": 0.1872, - "grad_norm": 2.0629410514013085, - "k1_kl": 0.0232391357421875, - "k3_kl": 0.01375579833984375, - "kimi_kl": 0.02944183349609375, - "learning_rate": 4.064e-07, - "loss": 0.0008, - "ppl": 0.03585052490234375, - "reward": 0.9779467582702637, - "reward_std": 0.03704526275396347, - "rewards/perpo_ocr_edit_distance_reward": 0.9779467880725861, + "advantages": 3.916876778475853e-07, + "completion_length": 208.0, + "delta_ref_entropy_loss": 0.042724609375, + "delta_ref_ppl": -0.05517578125, + "entropy_loss": -0.134765625, + "epoch": 0.0936, + "grad_norm": 2.5953106779647928, + "k1_kl": 0.054931640625, + "k3_kl": 0.0771484375, + "kimi_kl": 0.09423828125, + "learning_rate": 4.5319999999999996e-07, + "loss": 0.0031, + "ppl": 0.072265625, + "reward": 0.8988445401191711, + "reward_std": 0.04361499845981598, + "rewards/perpo_ocr_edit_distance_reward": 0.8988445401191711, "step": 468, "temperature": 0.9 }, { - "advantages": -5.859562692700493e-05, - "completion_length": 692.0, - "delta_ref_entropy_loss": 0.03057861328125, - "delta_ref_ppl": -0.01776123046875, - "entropy_loss": -0.03240966796875, - "epoch": 0.1876, - "grad_norm": 0.4718810859692344, - "k1_kl": 0.01776123046875, - "k3_kl": 0.0096435546875, - "kimi_kl": 0.014251708984375, - "learning_rate": 4.0619999999999997e-07, - "loss": 0.0004, - "ppl": 0.016815185546875, - "reward": 0.9961452484130859, - "reward_std": 0.0003423873567953706, - "rewards/perpo_ocr_edit_distance_reward": 0.9961452782154083, + "advantages": -6.372588541125879e-05, + "completion_length": 873.0, + "delta_ref_entropy_loss": 0.039794921875, + "delta_ref_ppl": -0.0213623046875, + "entropy_loss": -0.059326171875, + "epoch": 0.0938, + "grad_norm": 8.97893707288307, + "k1_kl": 0.0213623046875, + "k3_kl": 0.037353515625, + "kimi_kl": 0.0361328125, + "learning_rate": 4.531e-07, + "loss": 0.0016, + "ppl": 0.040771484375, + "reward": 0.9722188711166382, + "reward_std": 0.0015034318203106523, + "rewards/perpo_ocr_edit_distance_reward": 0.9722189903259277, "step": 469, "temperature": 0.9 }, { - "advantages": 1.8690311662794556e-06, - "completion_length": 378.5, - "delta_ref_entropy_loss": 0.067626953125, - "delta_ref_ppl": -0.0709228515625, - "entropy_loss": -0.06573486328125, - "epoch": 0.188, - "grad_norm": 1.0500389223971085, - "k1_kl": 0.0706787109375, - "k3_kl": 0.04278564453125, - "kimi_kl": 0.1669921875, - "learning_rate": 4.06e-07, - "loss": 0.0017, - "ppl": 0.032379150390625, - "reward": 0.9890789091587067, - "reward_std": 0.0024166194489225745, - "rewards/perpo_ocr_edit_distance_reward": 0.9890789687633514, + "advantages": 6.982258469179214e-07, + "completion_length": 790.0, + "delta_ref_entropy_loss": 0.10400390625, + "delta_ref_ppl": -0.0556640625, + "entropy_loss": -0.26171875, + "epoch": 0.094, + "grad_norm": 3.511728345527181, + "k1_kl": 0.0556640625, + "k3_kl": 0.031982421875, + "kimi_kl": 0.046630859375, + "learning_rate": 4.53e-07, + "loss": 0.0013, + "ppl": 0.15625, + "reward": 0.7886508107185364, + "reward_std": 0.024271439760923386, + "rewards/perpo_ocr_edit_distance_reward": 0.7886508703231812, "step": 470, "temperature": 0.9 }, { - "advantages": -2.196005607402185e-05, - "completion_length": 766.5, - "delta_ref_entropy_loss": 0.071533203125, - "delta_ref_ppl": -0.046600341796875, - "entropy_loss": -0.1019287109375, - "epoch": 0.1884, - "grad_norm": 1.2728142092393515, - "k1_kl": 0.046630859375, - "k3_kl": 0.0247802734375, - "kimi_kl": 0.057342529296875, - "learning_rate": 4.058e-07, - "loss": 0.001, - "ppl": 0.052978515625, - "reward": 0.9262180030345917, - "reward_std": 0.004153226822381839, - "rewards/perpo_ocr_edit_distance_reward": 0.9262180626392365, + "advantages": 0.0, + "completion_length": 322.0, + "delta_ref_entropy_loss": 0.024658203125, + "delta_ref_ppl": -0.015380859375, + "entropy_loss": -0.00958251953125, + "epoch": 0.0942, + "grad_norm": 0.009392375776053645, + "k1_kl": 0.01544189453125, + "k3_kl": 0.007415771484375, + "kimi_kl": 0.0152587890625, + "learning_rate": 4.529e-07, + "loss": 0.0003, + "ppl": 0.00157928466796875, + "reward": 0.9809182286262512, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.980918288230896, "step": 471, "temperature": 0.9 }, { - "advantages": -7.31221298337914e-05, - "completion_length": 374.5, - "delta_ref_entropy_loss": 0.034515380859375, - "delta_ref_ppl": -0.0390625, - "entropy_loss": -0.04815673828125, - "epoch": 0.1888, - "grad_norm": 0.6339246537338763, - "k1_kl": 0.0391845703125, - "k3_kl": 0.023468017578125, - "kimi_kl": 0.0513916015625, - "learning_rate": 4.056e-07, - "loss": 0.001, - "ppl": 0.0226287841796875, - "reward": 0.6883965134620667, - "reward_std": 0.0006947017755010165, - "rewards/perpo_ocr_edit_distance_reward": 0.6883966028690338, + "advantages": -0.00017195088730659336, + "completion_length": 782.0, + "delta_ref_entropy_loss": 0.0145263671875, + "delta_ref_ppl": -0.009765625, + "entropy_loss": -0.007293701171875, + "epoch": 0.0944, + "grad_norm": 0.24585163124999718, + "k1_kl": 0.009765625, + "k3_kl": 0.004425048828125, + "kimi_kl": 0.007720947265625, + "learning_rate": 4.528e-07, + "loss": 0.0003, + "ppl": 0.0034942626953125, + "reward": 0.9863900542259216, + "reward_std": 0.00024663633666932583, + "rewards/perpo_ocr_edit_distance_reward": 0.9863901734352112, "step": 472, "temperature": 0.9 }, { - "advantages": -2.2436892095356598e-06, - "completion_length": 275.5, - "delta_ref_entropy_loss": 0.162109375, - "delta_ref_ppl": -0.08251953125, - "entropy_loss": -0.24609375, - "epoch": 0.1892, - "grad_norm": 1.9444645241967906, - "k1_kl": 0.08251953125, - "k3_kl": 0.041015625, - "kimi_kl": 0.0732421875, - "learning_rate": 4.0539999999999996e-07, - "loss": 0.0016, - "ppl": 0.13916015625, - "reward": 0.7749434411525726, - "reward_std": 0.015718692680820823, - "rewards/perpo_ocr_edit_distance_reward": 0.774943470954895, + "advantages": -3.600120544433594e-05, + "completion_length": 412.0, + "delta_ref_entropy_loss": 0.0205078125, + "delta_ref_ppl": -0.037353515625, + "entropy_loss": -0.024658203125, + "epoch": 0.0946, + "grad_norm": 1.1225173675646518, + "k1_kl": 0.037353515625, + "k3_kl": 0.0250244140625, + "kimi_kl": 0.06396484375, + "learning_rate": 4.5269999999999997e-07, + "loss": 0.001, + "ppl": 0.011962890625, + "reward": 0.9727954268455505, + "reward_std": 0.001083211856894195, + "rewards/perpo_ocr_edit_distance_reward": 0.9727955460548401, "step": 473, "temperature": 0.9 }, { - "advantages": -4.661934781324817e-06, - "completion_length": 722.5, - "delta_ref_entropy_loss": 0.02532958984375, - "delta_ref_ppl": -0.0211181640625, - "entropy_loss": -0.0389404296875, - "epoch": 0.1896, - "grad_norm": 1.4589110898161672, - "k1_kl": 0.02105712890625, - "k3_kl": 0.0135498046875, - "kimi_kl": 0.0386962890625, - "learning_rate": 4.052e-07, - "loss": 0.0005, - "ppl": 0.01983642578125, - "reward": 0.8253108263015747, - "reward_std": 0.009964405093342066, - "rewards/perpo_ocr_edit_distance_reward": 0.8253109455108643, + "advantages": -1.482452717027627e-05, + "completion_length": 394.0, + "delta_ref_entropy_loss": 0.046875, + "delta_ref_ppl": -0.053466796875, + "entropy_loss": -0.07177734375, + "epoch": 0.0948, + "grad_norm": 1.322455729374095, + "k1_kl": 0.053466796875, + "k3_kl": 0.033447265625, + "kimi_kl": 0.076171875, + "learning_rate": 4.5259999999999996e-07, + "loss": 0.0013, + "ppl": 0.038330078125, + "reward": 0.9408402442932129, + "reward_std": 0.0016236844239756465, + "rewards/perpo_ocr_edit_distance_reward": 0.9408403635025024, "step": 474, "temperature": 0.9 }, { - "advantages": -2.189193637036624e-05, - "completion_length": 499.5, - "delta_ref_entropy_loss": 0.02947998046875, - "delta_ref_ppl": -0.024200439453125, - "entropy_loss": -0.02178955078125, - "epoch": 0.19, - "grad_norm": 0.7735989150428033, - "k1_kl": 0.024169921875, - "k3_kl": 0.01451873779296875, - "kimi_kl": 0.04361724853515625, - "learning_rate": 4.05e-07, - "loss": 0.0006, - "ppl": 0.0098724365234375, - "reward": 0.9866104125976562, - "reward_std": 0.020063563395524397, - "rewards/perpo_ocr_edit_distance_reward": 0.986610472202301, + "advantages": 1.437323498976184e-05, + "completion_length": 241.0, + "delta_ref_entropy_loss": 0.06689453125, + "delta_ref_ppl": -0.07177734375, + "entropy_loss": -0.03955078125, + "epoch": 0.095, + "grad_norm": 1.3288811575399397, + "k1_kl": 0.07177734375, + "k3_kl": 0.042236328125, + "kimi_kl": 0.109375, + "learning_rate": 4.525e-07, + "loss": 0.0017, + "ppl": 0.01904296875, + "reward": 0.8735764622688293, + "reward_std": 0.0004923645174130797, + "rewards/perpo_ocr_edit_distance_reward": 0.8735764622688293, "step": 475, "temperature": 0.9 }, { - "advantages": -0.0002981935228660859, - "completion_length": 507.5, - "delta_ref_entropy_loss": 0.0614013671875, - "delta_ref_ppl": -0.02813720703125, - "entropy_loss": -0.135986328125, - "epoch": 0.1904, - "grad_norm": 0.6372668356265663, - "k1_kl": 0.0283203125, - "k3_kl": 0.013671875, - "kimi_kl": 0.01904296875, - "learning_rate": 4.0479999999999997e-07, - "loss": 0.0008, - "ppl": 0.07586669921875, - "reward": 0.9328307211399078, - "reward_std": 0.0647805705666542, - "rewards/perpo_ocr_edit_distance_reward": 0.932830810546875, + "advantages": -0.00021274175378493965, + "completion_length": 564.0, + "delta_ref_entropy_loss": 0.016357421875, + "delta_ref_ppl": -0.006072998046875, + "entropy_loss": -0.0089111328125, + "epoch": 0.0952, + "grad_norm": 0.30830977014371175, + "k1_kl": 0.006072998046875, + "k3_kl": 0.002655029296875, + "kimi_kl": 0.005615234375, + "learning_rate": 4.524e-07, + "loss": 0.0003, + "ppl": 0.00225830078125, + "reward": 0.9871808290481567, + "reward_std": 0.00018018369155470282, + "rewards/perpo_ocr_edit_distance_reward": 0.9871808886528015, "step": 476, "temperature": 0.9 }, { - "advantages": 1.0388239388703369e-06, - "completion_length": 244.0, - "delta_ref_entropy_loss": 0.05712890625, - "delta_ref_ppl": -0.0635986328125, - "entropy_loss": -0.06298828125, - "epoch": 0.1908, - "grad_norm": 0.914529592520111, - "k1_kl": 0.063720703125, - "k3_kl": 0.041015625, - "kimi_kl": 0.130859375, - "learning_rate": 4.046e-07, - "loss": 0.0016, - "ppl": 0.03009033203125, - "reward": 0.8849352300167084, - "reward_std": 0.0037875293637625873, - "rewards/perpo_ocr_edit_distance_reward": 0.8849352896213531, + "advantages": -1.2431826235115295e-06, + "completion_length": 990.0, + "delta_ref_entropy_loss": 0.0189208984375, + "delta_ref_ppl": -0.0108642578125, + "entropy_loss": -0.049072265625, + "epoch": 0.0954, + "grad_norm": 0.8760669097966192, + "k1_kl": 0.01092529296875, + "k3_kl": 0.005950927734375, + "kimi_kl": 0.01251220703125, + "learning_rate": 4.5229999999999994e-07, + "loss": 0.0002, + "ppl": 0.0284423828125, + "reward": 0.9458158612251282, + "reward_std": 0.01359253004193306, + "rewards/perpo_ocr_edit_distance_reward": 0.945815920829773, "step": 477, "temperature": 0.9 }, { - "advantages": -7.345847188844346e-05, - "completion_length": 400.0, - "delta_ref_entropy_loss": 0.02923583984375, - "delta_ref_ppl": -0.016357421875, - "entropy_loss": -0.014007568359375, - "epoch": 0.1912, - "grad_norm": 0.40947312314755785, - "k1_kl": 0.01629638671875, - "k3_kl": 0.0078582763671875, - "kimi_kl": 0.012603759765625, - "learning_rate": 4.0439999999999994e-07, - "loss": 0.0004, - "ppl": 0.005401611328125, - "reward": 0.9966090321540833, - "reward_std": 0.0009988982346840203, - "rewards/perpo_ocr_edit_distance_reward": 0.996609091758728, + "advantages": -2.55448497910038e-07, + "completion_length": 613.0, + "delta_ref_entropy_loss": 0.1044921875, + "delta_ref_ppl": -0.048095703125, + "entropy_loss": -0.220703125, + "epoch": 0.0956, + "grad_norm": 1.3794078019760256, + "k1_kl": 0.048583984375, + "k3_kl": 0.02197265625, + "kimi_kl": 0.033935546875, + "learning_rate": 4.522e-07, + "loss": 0.0009, + "ppl": 0.1220703125, + "reward": 0.7197683453559875, + "reward_std": 0.15791921317577362, + "rewards/perpo_ocr_edit_distance_reward": 0.7197684049606323, "step": 478, "temperature": 0.9 }, { - "advantages": -9.210065968545678e-06, - "completion_length": 475.0, - "delta_ref_entropy_loss": 0.067138671875, - "delta_ref_ppl": -0.03778076171875, - "entropy_loss": -0.14404296875, - "epoch": 0.1916, - "grad_norm": 1.3635393967908396, - "k1_kl": 0.0377197265625, - "k3_kl": 0.022674560546875, - "kimi_kl": 0.0728759765625, - "learning_rate": 4.042e-07, - "loss": 0.0009, - "ppl": 0.079345703125, - "reward": 0.9296967089176178, - "reward_std": 0.021070991351734847, - "rewards/perpo_ocr_edit_distance_reward": 0.9296967685222626, + "advantages": -6.823029252700508e-05, + "completion_length": 886.0, + "delta_ref_entropy_loss": 0.01409912109375, + "delta_ref_ppl": -0.01287841796875, + "entropy_loss": -0.0186767578125, + "epoch": 0.0958, + "grad_norm": 0.48701235425178996, + "k1_kl": 0.01287841796875, + "k3_kl": 0.006988525390625, + "kimi_kl": 0.01336669921875, + "learning_rate": 4.521e-07, + "loss": 0.0003, + "ppl": 0.00653076171875, + "reward": 0.979035496711731, + "reward_std": 0.00039918479160405695, + "rewards/perpo_ocr_edit_distance_reward": 0.9790355563163757, "step": 479, "temperature": 0.9 }, { - "advantages": -1.1333398106216919e-05, - "completion_length": 609.5, - "delta_ref_entropy_loss": 0.02349853515625, - "delta_ref_ppl": -0.02093505859375, - "entropy_loss": -0.040283203125, - "epoch": 0.192, - "grad_norm": 1.2075866852411128, - "k1_kl": 0.0208740234375, - "k3_kl": 0.0128173828125, - "kimi_kl": 0.0213623046875, - "learning_rate": 4.04e-07, - "loss": 0.0005, - "ppl": 0.0224609375, - "reward": 0.9879380166530609, - "reward_std": 0.0009987110388465226, - "rewards/perpo_ocr_edit_distance_reward": 0.9879380464553833, + "advantages": -5.7297096645925194e-05, + "completion_length": 473.0, + "delta_ref_entropy_loss": 0.0478515625, + "delta_ref_ppl": -0.034423828125, + "entropy_loss": -0.0303955078125, + "epoch": 0.096, + "grad_norm": 0.5077887225257064, + "k1_kl": 0.03466796875, + "k3_kl": 0.0205078125, + "kimi_kl": 0.04296875, + "learning_rate": 4.5199999999999997e-07, + "loss": 0.0009, + "ppl": 0.01141357421875, + "reward": 0.9848908185958862, + "reward_std": 0.0003459099098108709, + "rewards/perpo_ocr_edit_distance_reward": 0.9848908185958862, "step": 480, "temperature": 0.9 }, { - "advantages": -0.0004240955749992281, - "completion_length": 744.5, - "delta_ref_entropy_loss": 0.017333984375, - "delta_ref_ppl": -0.0124359130859375, - "entropy_loss": -0.0124969482421875, - "epoch": 0.1924, - "grad_norm": 0.16833730282039652, - "k1_kl": 0.0124969482421875, - "k3_kl": 0.00661468505859375, - "kimi_kl": 0.01497650146484375, - "learning_rate": 4.0379999999999995e-07, - "loss": 0.0007, - "ppl": 0.003650665283203125, - "reward": 0.9990531504154205, - "reward_std": 0.00011885133426403627, - "rewards/perpo_ocr_edit_distance_reward": 0.9990532398223877, + "advantages": 6.184408266562968e-05, + "completion_length": 509.0, + "delta_ref_entropy_loss": 0.0250244140625, + "delta_ref_ppl": -0.0255126953125, + "entropy_loss": -0.011962890625, + "epoch": 0.0962, + "grad_norm": 0.38181197239965536, + "k1_kl": 0.0255126953125, + "k3_kl": 0.01470947265625, + "kimi_kl": 0.033203125, + "learning_rate": 4.519e-07, + "loss": 0.0005, + "ppl": 0.005767822265625, + "reward": 0.9869068264961243, + "reward_std": 0.0003130900440737605, + "rewards/perpo_ocr_edit_distance_reward": 0.986906886100769, "step": 481, "temperature": 0.9 }, { - "advantages": -0.0005960464477539062, - "completion_length": 89.0, - "delta_ref_entropy_loss": 0.0504150390625, - "delta_ref_ppl": -0.0616455078125, - "entropy_loss": -0.0250244140625, - "epoch": 0.1928, - "grad_norm": 0.17456996033844097, - "k1_kl": 0.061767578125, - "k3_kl": 0.04156494140625, - "kimi_kl": 0.097900390625, - "learning_rate": 4.036e-07, - "loss": 0.0023, - "ppl": 0.012969970703125, - "reward": 0.9908961951732635, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9908962547779083, - "step": 482, - "temperature": 0.9 - }, - { - "advantages": -1.4560563954546524e-06, - "completion_length": 745.0, - "delta_ref_entropy_loss": 0.0465087890625, - "delta_ref_ppl": -0.0436859130859375, - "entropy_loss": -0.0859375, - "epoch": 0.1932, - "grad_norm": 3.2004276661119135, - "k1_kl": 0.043670654296875, - "k3_kl": 0.022705078125, - "kimi_kl": 0.038360595703125, - "learning_rate": 4.0339999999999997e-07, - "loss": 0.0009, - "ppl": 0.0382080078125, - "reward": 0.9305991530418396, - "reward_std": 0.008480908814817667, - "rewards/perpo_ocr_edit_distance_reward": 0.930599182844162, - "step": 483, + "advantages": -0.0002454349014442414, + "completion_length": 906.0, + "delta_ref_entropy_loss": 0.0208740234375, + "delta_ref_ppl": -0.0089111328125, + "entropy_loss": -0.013671875, + "epoch": 0.0964, + "grad_norm": 0.33571993543765855, + "k1_kl": 0.0089111328125, + "k3_kl": 0.00360107421875, + "kimi_kl": 0.0050048828125, + "learning_rate": 4.5179999999999996e-07, + "loss": 0.0004, + "ppl": 0.004608154296875, + "reward": 0.8922232389450073, + "reward_std": 0.00028165412368252873, + "rewards/perpo_ocr_edit_distance_reward": 0.8922233581542969, + "step": 482, "temperature": 0.9 }, { - "advantages": -4.8522438873987994e-05, - "completion_length": 949.5, - "delta_ref_entropy_loss": 0.0550537109375, - "delta_ref_ppl": -0.02850341796875, - "entropy_loss": -0.107177734375, - "epoch": 0.1936, - "grad_norm": 0.9525029328451622, - "k1_kl": 0.028472900390625, - "k3_kl": 0.0137481689453125, - "kimi_kl": 0.0321044921875, - "learning_rate": 4.032e-07, + "advantages": -2.7869429686688818e-05, + "completion_length": 663.0, + "delta_ref_entropy_loss": 0.03173828125, + "delta_ref_ppl": -0.0230712890625, + "entropy_loss": -0.042236328125, + "epoch": 0.0966, + "grad_norm": 0.5716728073441979, + "k1_kl": 0.0230712890625, + "k3_kl": 0.01507568359375, + "kimi_kl": 0.039794921875, + "learning_rate": 4.5169999999999995e-07, "loss": 0.0006, - "ppl": 0.0570068359375, - "reward": 0.9210259914398193, - "reward_std": 0.008654517063405365, - "rewards/perpo_ocr_edit_distance_reward": 0.9210260808467865, + "ppl": 0.0216064453125, + "reward": 0.980970561504364, + "reward_std": 0.0008168932981789112, + "rewards/perpo_ocr_edit_distance_reward": 0.9809706211090088, + "step": 483, + "temperature": 0.9 + }, + { + "advantages": -4.435437222127803e-05, + "completion_length": 602.0, + "delta_ref_entropy_loss": 0.031494140625, + "delta_ref_ppl": -0.04345703125, + "entropy_loss": -0.0235595703125, + "epoch": 0.0968, + "grad_norm": 0.8813978956372625, + "k1_kl": 0.04345703125, + "k3_kl": 0.027587890625, + "kimi_kl": 0.06787109375, + "learning_rate": 4.516e-07, + "loss": 0.0011, + "ppl": 0.01312255859375, + "reward": 0.9833354949951172, + "reward_std": 0.0010517156915739179, + "rewards/perpo_ocr_edit_distance_reward": 0.983335554599762, "step": 484, "temperature": 0.9 }, { - "advantages": 5.5568561425189955e-05, - "completion_length": 943.5, - "delta_ref_entropy_loss": 0.0633544921875, - "delta_ref_ppl": -0.039031982421875, - "entropy_loss": -0.10833740234375, - "epoch": 0.194, - "grad_norm": 5.785746661577999, - "k1_kl": 0.0390472412109375, - "k3_kl": 0.02240753173828125, - "kimi_kl": 0.055755615234375, - "learning_rate": 4.03e-07, - "loss": 0.0008, - "ppl": 0.0594940185546875, - "reward": 0.8707817494869232, - "reward_std": 0.11264554678928107, - "rewards/perpo_ocr_edit_distance_reward": 0.8707817494869232, + "advantages": -4.346030254964717e-05, + "completion_length": 495.0, + "delta_ref_entropy_loss": 0.0269775390625, + "delta_ref_ppl": -0.0225830078125, + "entropy_loss": -0.037841796875, + "epoch": 0.097, + "grad_norm": 0.9646289718845747, + "k1_kl": 0.022705078125, + "k3_kl": 0.01220703125, + "kimi_kl": 0.0206298828125, + "learning_rate": 4.515e-07, + "loss": 0.0005, + "ppl": 0.019775390625, + "reward": 0.9808200597763062, + "reward_std": 0.0016633451450616121, + "rewards/perpo_ocr_edit_distance_reward": 0.9808201789855957, "step": 485, "temperature": 0.9 }, { - "advantages": -2.7103083994006738e-05, - "completion_length": 1134.5, - "delta_ref_entropy_loss": 0.04119873046875, - "delta_ref_ppl": -0.03070068359375, - "entropy_loss": -0.064697265625, - "epoch": 0.1944, - "grad_norm": 1.3915163943380844, - "k1_kl": 0.03070068359375, - "k3_kl": 0.019134521484375, - "kimi_kl": 0.03564453125, - "learning_rate": 4.028e-07, - "loss": 0.0008, - "ppl": 0.032470703125, - "reward": 0.953559160232544, - "reward_std": 0.018193969794083387, - "rewards/perpo_ocr_edit_distance_reward": 0.9535592198371887, + "advantages": -6.16908073425293e-05, + "completion_length": 896.0, + "delta_ref_entropy_loss": 0.00982666015625, + "delta_ref_ppl": -0.01153564453125, + "entropy_loss": -0.017578125, + "epoch": 0.0972, + "grad_norm": 0.3373526887457087, + "k1_kl": 0.01153564453125, + "k3_kl": 0.007476806640625, + "kimi_kl": 0.0198974609375, + "learning_rate": 4.514e-07, + "loss": 0.0004, + "ppl": 0.00701904296875, + "reward": 0.9772942066192627, + "reward_std": 0.000589982548262924, + "rewards/perpo_ocr_edit_distance_reward": 0.9772942662239075, "step": 486, "temperature": 0.9 }, { - "advantages": -2.247946895295172e-06, - "completion_length": 470.5, - "delta_ref_entropy_loss": 0.04779052734375, - "delta_ref_ppl": -0.04119873046875, - "entropy_loss": -0.0701904296875, - "epoch": 0.1948, - "grad_norm": 1.5766888839445097, - "k1_kl": 0.04107666015625, - "k3_kl": 0.024383544921875, - "kimi_kl": 0.0478515625, - "learning_rate": 4.026e-07, - "loss": 0.001, - "ppl": 0.0386962890625, - "reward": 0.9912620484828949, - "reward_std": 0.0014059583190828562, - "rewards/perpo_ocr_edit_distance_reward": 0.9912620484828949, + "advantages": -0.00022271701891440898, + "completion_length": 1413.0, + "delta_ref_entropy_loss": 0.011474609375, + "delta_ref_ppl": -0.007293701171875, + "entropy_loss": -0.016845703125, + "epoch": 0.0974, + "grad_norm": 1.0060104062202984, + "k1_kl": 0.00732421875, + "k3_kl": 0.00433349609375, + "kimi_kl": 0.011962890625, + "learning_rate": 4.5129999999999997e-07, + "loss": 0.0004, + "ppl": 0.00726318359375, + "reward": 0.9829495549201965, + "reward_std": 0.000511647667735815, + "rewards/perpo_ocr_edit_distance_reward": 0.9829497337341309, "step": 487, "temperature": 0.9 }, { - "advantages": -5.568777112330281e-06, - "completion_length": 339.5, - "delta_ref_entropy_loss": 0.03936767578125, - "delta_ref_ppl": -0.0517578125, - "entropy_loss": -0.078125, - "epoch": 0.1952, - "grad_norm": 1.6442001522747633, - "k1_kl": 0.0517578125, - "k3_kl": 0.03790283203125, - "kimi_kl": 0.130615234375, - "learning_rate": 4.0239999999999995e-07, - "loss": 0.0015, - "ppl": 0.0421142578125, - "reward": 0.9531689882278442, - "reward_std": 0.023465996142476797, - "rewards/perpo_ocr_edit_distance_reward": 0.953169047832489, + "advantages": -0.0002584968460723758, + "completion_length": 587.0, + "delta_ref_entropy_loss": 0.021484375, + "delta_ref_ppl": -0.0174560546875, + "entropy_loss": -0.02001953125, + "epoch": 0.0976, + "grad_norm": 0.3598960887266872, + "k1_kl": 0.0174560546875, + "k3_kl": 0.00836181640625, + "kimi_kl": 0.015869140625, + "learning_rate": 4.5119999999999996e-07, + "loss": 0.0006, + "ppl": 0.007598876953125, + "reward": 0.9824277758598328, + "reward_std": 0.0002294081059517339, + "rewards/perpo_ocr_edit_distance_reward": 0.9824278950691223, "step": 488, "temperature": 0.9 }, { - "advantages": -1.5241759818707123e-06, - "completion_length": 198.0, - "delta_ref_entropy_loss": -0.255126953125, - "delta_ref_ppl": -0.30419921875, - "entropy_loss": -0.56298828125, - "epoch": 0.1956, - "grad_norm": 16.893788754153224, - "k1_kl": 0.30615234375, - "k3_kl": 0.198486328125, - "kimi_kl": 0.576416015625, - "learning_rate": 4.022e-07, - "loss": 0.0079, - "ppl": 0.18701171875, - "reward": 0.5058235935866833, - "reward_std": 0.09397748950868845, - "rewards/perpo_ocr_edit_distance_reward": 0.505823627114296, + "advantages": -0.00015376295777969062, + "completion_length": 324.0, + "delta_ref_entropy_loss": 0.021484375, + "delta_ref_ppl": -0.0242919921875, + "entropy_loss": -0.0174560546875, + "epoch": 0.0978, + "grad_norm": 1.04260121690362, + "k1_kl": 0.0242919921875, + "k3_kl": 0.01708984375, + "kimi_kl": 0.03515625, + "learning_rate": 4.5109999999999996e-07, + "loss": 0.0008, + "ppl": 0.0067138671875, + "reward": 0.9175525307655334, + "reward_std": 0.0003429779608268291, + "rewards/perpo_ocr_edit_distance_reward": 0.9175525903701782, "step": 489, "temperature": 0.9 }, { - "advantages": -3.2101361284730956e-05, - "completion_length": 404.5, - "delta_ref_entropy_loss": 0.03564453125, - "delta_ref_ppl": -0.0169677734375, - "entropy_loss": -0.023681640625, - "epoch": 0.196, - "grad_norm": 0.3516327739934454, - "k1_kl": 0.016937255859375, - "k3_kl": 0.008556365966796875, - "kimi_kl": 0.0152435302734375, - "learning_rate": 4.02e-07, - "loss": 0.0004, - "ppl": 0.01116943359375, - "reward": 0.9972618222236633, - "reward_std": 0.0004142585094086826, - "rewards/perpo_ocr_edit_distance_reward": 0.9972618520259857, + "advantages": -2.5664057829999365e-05, + "completion_length": 743.0, + "delta_ref_entropy_loss": 0.039794921875, + "delta_ref_ppl": -0.023193359375, + "entropy_loss": -0.0277099609375, + "epoch": 0.098, + "grad_norm": 1.0934313304569068, + "k1_kl": 0.0233154296875, + "k3_kl": 0.01153564453125, + "kimi_kl": 0.0296630859375, + "learning_rate": 4.51e-07, + "loss": 0.0005, + "ppl": 0.0120849609375, + "reward": 0.9797518849372864, + "reward_std": 0.0008950839983299375, + "rewards/perpo_ocr_edit_distance_reward": 0.9797518849372864, "step": 490, "temperature": 0.9 }, { - "advantages": -4.136562404255528e-05, - "completion_length": 688.5, - "delta_ref_entropy_loss": 0.053955078125, - "delta_ref_ppl": -0.040008544921875, - "entropy_loss": -0.06060791015625, - "epoch": 0.1964, - "grad_norm": 1.2546511029272762, - "k1_kl": 0.040252685546875, - "k3_kl": 0.02398681640625, - "kimi_kl": 0.067962646484375, - "learning_rate": 4.0179999999999996e-07, - "loss": 0.001, - "ppl": 0.03338623046875, - "reward": 0.9611924588680267, - "reward_std": 0.01245338813168928, - "rewards/perpo_ocr_edit_distance_reward": 0.9611925184726715, + "advantages": -2.874646997952368e-05, + "completion_length": 1332.0, + "delta_ref_entropy_loss": 0.03125, + "delta_ref_ppl": -0.0181884765625, + "entropy_loss": -0.0439453125, + "epoch": 0.0982, + "grad_norm": 1.3417381467539016, + "k1_kl": 0.0181884765625, + "k3_kl": 0.01007080078125, + "kimi_kl": 0.01806640625, + "learning_rate": 4.509e-07, + "loss": 0.0004, + "ppl": 0.022705078125, + "reward": 0.9681804180145264, + "reward_std": 0.0004926334368065, + "rewards/perpo_ocr_edit_distance_reward": 0.9681804180145264, "step": 491, "temperature": 0.9 }, { - "advantages": -4.5695475705542776e-05, - "completion_length": 413.0, - "delta_ref_entropy_loss": 0.06646728515625, - "delta_ref_ppl": -0.0628662109375, - "entropy_loss": -0.17694091796875, - "epoch": 0.1968, - "grad_norm": 1.619620524528021, - "k1_kl": 0.0628662109375, - "k3_kl": 0.04193115234375, - "kimi_kl": 0.1573486328125, - "learning_rate": 4.016e-07, - "loss": 0.0017, - "ppl": 0.085693359375, - "reward": 0.6313896328210831, - "reward_std": 0.007571198788355105, - "rewards/perpo_ocr_edit_distance_reward": 0.6313896924257278, + "advantages": -5.960464477539063e-08, + "completion_length": 605.0, + "delta_ref_entropy_loss": 0.0233154296875, + "delta_ref_ppl": -0.027099609375, + "entropy_loss": -0.12158203125, + "epoch": 0.0984, + "grad_norm": 1.5793717822092448, + "k1_kl": 0.0272216796875, + "k3_kl": 0.0174560546875, + "kimi_kl": 0.041015625, + "learning_rate": 4.5079999999999993e-07, + "loss": 0.0007, + "ppl": 0.0859375, + "reward": 0.30982011556625366, + "reward_std": 0.14893417060375214, + "rewards/perpo_ocr_edit_distance_reward": 0.30982014536857605, "step": 492, "temperature": 0.9 }, { - "advantages": -5.364418029785156e-07, - "completion_length": 424.5, - "delta_ref_entropy_loss": 0.06787109375, - "delta_ref_ppl": -0.04150390625, - "entropy_loss": -0.16748046875, - "epoch": 0.1972, - "grad_norm": 1.7248120430476066, - "k1_kl": 0.0413818359375, - "k3_kl": 0.02154541015625, - "kimi_kl": 0.03314208984375, - "learning_rate": 4.014e-07, - "loss": 0.0009, - "ppl": 0.08843994140625, - "reward": 0.8671410977840424, - "reward_std": 0.01595904678106308, - "rewards/perpo_ocr_edit_distance_reward": 0.8671411275863647, + "advantages": -2.0776476503669983e-06, + "completion_length": 816.0, + "delta_ref_entropy_loss": 0.0274658203125, + "delta_ref_ppl": -0.01556396484375, + "entropy_loss": -0.06494140625, + "epoch": 0.0986, + "grad_norm": 3.404686741701666, + "k1_kl": 0.01556396484375, + "k3_kl": 0.0201416015625, + "kimi_kl": 0.0235595703125, + "learning_rate": 4.507e-07, + "loss": 0.0008, + "ppl": 0.0498046875, + "reward": 0.9214508533477783, + "reward_std": 0.0203873198479414, + "rewards/perpo_ocr_edit_distance_reward": 0.9214509129524231, "step": 493, "temperature": 0.9 }, { - "advantages": -3.124347858829424e-05, - "completion_length": 503.0, - "delta_ref_entropy_loss": 0.05523681640625, - "delta_ref_ppl": -0.04547119140625, - "entropy_loss": -0.0301513671875, - "epoch": 0.1976, - "grad_norm": 1.1894288568112916, - "k1_kl": 0.04547119140625, - "k3_kl": 0.0259552001953125, - "kimi_kl": 0.060272216796875, - "learning_rate": 4.0119999999999997e-07, - "loss": 0.0011, - "ppl": 0.015167236328125, - "reward": 0.9958218336105347, - "reward_std": 0.0008414728799834847, - "rewards/perpo_ocr_edit_distance_reward": 0.9958218932151794, + "advantages": -8.514949634275126e-09, + "completion_length": 824.0, + "delta_ref_entropy_loss": 0.059814453125, + "delta_ref_ppl": -0.041015625, + "entropy_loss": -0.056396484375, + "epoch": 0.0988, + "grad_norm": 4.256604632447755, + "k1_kl": 0.041015625, + "k3_kl": 0.0205078125, + "kimi_kl": 0.054443359375, + "learning_rate": 4.5059999999999997e-07, + "loss": 0.0008, + "ppl": 0.020263671875, + "reward": 0.9308803677558899, + "reward_std": 0.03041127510368824, + "rewards/perpo_ocr_edit_distance_reward": 0.9308804273605347, "step": 494, "temperature": 0.9 }, { - "advantages": -0.000300645828474444, - "completion_length": 471.5, - "delta_ref_entropy_loss": 0.02587890625, - "delta_ref_ppl": -0.01995849609375, - "entropy_loss": -0.0321044921875, - "epoch": 0.198, - "grad_norm": 1.0346651319518179, - "k1_kl": 0.02001953125, - "k3_kl": 0.010711669921875, - "kimi_kl": 0.02215576171875, - "learning_rate": 4.01e-07, - "loss": 0.0007, - "ppl": 0.015625, - "reward": 0.993540495634079, - "reward_std": 0.0015654797898605466, - "rewards/perpo_ocr_edit_distance_reward": 0.9935405552387238, + "advantages": -4.5299530029296875e-06, + "completion_length": 601.0, + "delta_ref_entropy_loss": 0.062255859375, + "delta_ref_ppl": -0.042236328125, + "entropy_loss": -0.09765625, + "epoch": 0.099, + "grad_norm": 1.566741826224861, + "k1_kl": 0.042236328125, + "k3_kl": 0.0206298828125, + "kimi_kl": 0.036865234375, + "learning_rate": 4.505e-07, + "loss": 0.0008, + "ppl": 0.04931640625, + "reward": 0.8530126214027405, + "reward_std": 0.011156733147799969, + "rewards/perpo_ocr_edit_distance_reward": 0.8530126810073853, "step": 495, "temperature": 0.9 }, { - "advantages": -0.0001300530774130948, - "completion_length": 637.5, - "delta_ref_entropy_loss": 0.0264892578125, - "delta_ref_ppl": -0.02679443359375, - "entropy_loss": -0.0296630859375, - "epoch": 0.1984, - "grad_norm": 0.9226359757426283, - "k1_kl": 0.02667236328125, - "k3_kl": 0.018402099609375, - "kimi_kl": 0.0517578125, - "learning_rate": 4.008e-07, - "loss": 0.0009, - "ppl": 0.01275634765625, - "reward": 0.9597093760967255, - "reward_std": 0.007962141149619129, - "rewards/perpo_ocr_edit_distance_reward": 0.9597094058990479, + "advantages": -2.895082786835701e-07, + "completion_length": 65.0, + "delta_ref_entropy_loss": 0.032958984375, + "delta_ref_ppl": -0.15234375, + "entropy_loss": -0.1982421875, + "epoch": 0.0992, + "grad_norm": 5.705112480088513, + "k1_kl": 0.1513671875, + "k3_kl": 0.1181640625, + "kimi_kl": 0.39453125, + "learning_rate": 4.504e-07, + "loss": 0.0047, + "ppl": 0.10205078125, + "reward": 0.45818623900413513, + "reward_std": 0.08974876254796982, + "rewards/perpo_ocr_edit_distance_reward": 0.4581862986087799, "step": 496, "temperature": 0.9 }, { - "advantages": -2.9163702492951415e-06, - "completion_length": 250.0, - "delta_ref_entropy_loss": 0.052093505859375, - "delta_ref_ppl": -0.03759765625, - "entropy_loss": -0.054290771484375, - "epoch": 0.1988, - "grad_norm": 0.7537224704815101, - "k1_kl": 0.03778076171875, - "k3_kl": 0.02239990234375, - "kimi_kl": 0.0611572265625, - "learning_rate": 4.006e-07, - "loss": 0.0009, - "ppl": 0.0258941650390625, - "reward": 0.9399915337562561, - "reward_std": 0.0014122307766228914, - "rewards/perpo_ocr_edit_distance_reward": 0.9399915635585785, + "advantages": -1.1124781849503051e-05, + "completion_length": 959.0, + "delta_ref_entropy_loss": 0.06787109375, + "delta_ref_ppl": -0.048828125, + "entropy_loss": -0.0927734375, + "epoch": 0.0994, + "grad_norm": 3.3630168579692348, + "k1_kl": 0.048828125, + "k3_kl": 0.0284423828125, + "kimi_kl": 0.0634765625, + "learning_rate": 4.5029999999999995e-07, + "loss": 0.0012, + "ppl": 0.044677734375, + "reward": 0.8575456142425537, + "reward_std": 0.0014323792420327663, + "rewards/perpo_ocr_edit_distance_reward": 0.8575456738471985, "step": 497, "temperature": 0.9 }, { - "advantages": -6.532563020300586e-06, - "completion_length": 465.0, - "delta_ref_entropy_loss": 0.0400390625, - "delta_ref_ppl": -0.030029296875, - "entropy_loss": -0.03240966796875, - "epoch": 0.1992, - "grad_norm": 0.5809373934609422, - "k1_kl": 0.030029296875, - "k3_kl": 0.01739501953125, - "kimi_kl": 0.035888671875, - "learning_rate": 4.0039999999999996e-07, + "advantages": -0.00015349900058936328, + "completion_length": 452.0, + "delta_ref_entropy_loss": 0.0224609375, + "delta_ref_ppl": -0.02490234375, + "entropy_loss": -0.01275634765625, + "epoch": 0.0996, + "grad_norm": 0.5232392637879392, + "k1_kl": 0.0250244140625, + "k3_kl": 0.01251220703125, + "kimi_kl": 0.0213623046875, + "learning_rate": 4.502e-07, "loss": 0.0007, - "ppl": 0.01666259765625, - "reward": 0.9965218305587769, - "reward_std": 0.0007992405153345317, - "rewards/perpo_ocr_edit_distance_reward": 0.9965218305587769, + "ppl": 0.0037689208984375, + "reward": 0.8092965483665466, + "reward_std": 0.00023282927577383816, + "rewards/perpo_ocr_edit_distance_reward": 0.8092966675758362, "step": 498, "temperature": 0.9 }, { - "advantages": -8.089202196970291e-08, - "completion_length": 989.5, - "delta_ref_entropy_loss": 0.0269775390625, - "delta_ref_ppl": -0.0189208984375, - "entropy_loss": -0.06829833984375, - "epoch": 0.1996, - "grad_norm": 2.775702660757998, - "k1_kl": 0.0189208984375, - "k3_kl": 0.011810302734375, - "kimi_kl": 0.02423095703125, - "learning_rate": 4.002e-07, - "loss": 0.0005, - "ppl": 0.03631591796875, - "reward": 0.8909993469715118, - "reward_std": 0.1109224408864975, - "rewards/perpo_ocr_edit_distance_reward": 0.8909993767738342, + "advantages": -8.26971881906502e-05, + "completion_length": 819.0, + "delta_ref_entropy_loss": 0.01397705078125, + "delta_ref_ppl": -0.0068359375, + "entropy_loss": -0.0238037109375, + "epoch": 0.0998, + "grad_norm": 0.7807032015457236, + "k1_kl": 0.006805419921875, + "k3_kl": 0.0023345947265625, + "kimi_kl": 0.00347900390625, + "learning_rate": 4.501e-07, + "loss": 0.0002, + "ppl": 0.01055908203125, + "reward": 0.9852652549743652, + "reward_std": 0.0009297474171034992, + "rewards/perpo_ocr_edit_distance_reward": 0.98526531457901, "step": 499, "temperature": 0.9 }, { - "advantages": -7.33648079744853e-05, - "completion_length": 1006.0, - "delta_ref_entropy_loss": 0.018585205078125, - "delta_ref_ppl": -0.01300048828125, - "entropy_loss": -0.02703857421875, - "epoch": 0.2, - "grad_norm": 1.2375130456691128, - "k1_kl": 0.01300048828125, - "k3_kl": 0.0073089599609375, - "kimi_kl": 0.0177001953125, - "learning_rate": 4e-07, - "loss": 0.0004, - "ppl": 0.010986328125, - "reward": 0.9615689218044281, - "reward_std": 0.09005109563440783, - "rewards/perpo_ocr_edit_distance_reward": 0.9615689814090729, + "advantages": -2.023152046604082e-05, + "completion_length": 313.0, + "delta_ref_entropy_loss": 0.0419921875, + "delta_ref_ppl": -0.040283203125, + "entropy_loss": -0.0291748046875, + "epoch": 0.1, + "grad_norm": 0.745447119714977, + "k1_kl": 0.040283203125, + "k3_kl": 0.020751953125, + "kimi_kl": 0.038818359375, + "learning_rate": 4.5e-07, + "loss": 0.0008, + "ppl": 0.01092529296875, + "reward": 0.9747920036315918, + "reward_std": 0.0007425532676279545, + "rewards/perpo_ocr_edit_distance_reward": 0.9747920036315918, "step": 500, "temperature": 0.9 }, { - "advantages": 6.67231415718561e-05, - "completion_length": 557.5, - "delta_ref_entropy_loss": 0.02056884765625, - "delta_ref_ppl": -0.014129638671875, - "entropy_loss": -0.0230712890625, - "epoch": 0.2004, - "grad_norm": 0.3094117603996625, - "k1_kl": 0.0140838623046875, - "k3_kl": 0.00737762451171875, - "kimi_kl": 0.0138702392578125, - "learning_rate": 3.9979999999999997e-07, - "loss": 0.0002, - "ppl": 0.008941650390625, - "reward": 0.9979169368743896, - "reward_std": 0.0003015190304722637, - "rewards/perpo_ocr_edit_distance_reward": 0.9979168772697449, + "advantages": -3.9283720980165526e-05, + "completion_length": 787.0, + "delta_ref_entropy_loss": 0.0250244140625, + "delta_ref_ppl": -0.0157470703125, + "entropy_loss": -0.03515625, + "epoch": 0.1002, + "grad_norm": 0.6443221976855779, + "k1_kl": 0.015625, + "k3_kl": 0.007293701171875, + "kimi_kl": 0.01153564453125, + "learning_rate": 4.499e-07, + "loss": 0.0003, + "ppl": 0.0198974609375, + "reward": 0.9702938199043274, + "reward_std": 0.0012003935407847166, + "rewards/perpo_ocr_edit_distance_reward": 0.9702938795089722, "step": 501, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 419.0, - "delta_ref_entropy_loss": 0.02960205078125, - "delta_ref_ppl": -0.01361083984375, - "entropy_loss": -0.0198974609375, - "epoch": 0.2008, - "grad_norm": 0.03955543957016081, - "k1_kl": 0.013641357421875, - "k3_kl": 0.00537109375, - "kimi_kl": 0.00860595703125, - "learning_rate": 3.996e-07, - "loss": 0.0005, - "ppl": 0.009521484375, - "reward": 0.9980981349945068, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9980981945991516, + "advantages": -1.8805265426635742e-05, + "completion_length": 1109.0, + "delta_ref_entropy_loss": 0.05859375, + "delta_ref_ppl": -0.0247802734375, + "entropy_loss": -0.09375, + "epoch": 0.1004, + "grad_norm": 1.265004161236, + "k1_kl": 0.0247802734375, + "k3_kl": 0.0269775390625, + "kimi_kl": 0.0218505859375, + "learning_rate": 4.4979999999999996e-07, + "loss": 0.0011, + "ppl": 0.05126953125, + "reward": 0.9614647030830383, + "reward_std": 0.0017121266573667526, + "rewards/perpo_ocr_edit_distance_reward": 0.9614647030830383, "step": 502, "temperature": 0.9 }, { - "advantages": -8.566039241486578e-06, - "completion_length": 424.5, - "delta_ref_entropy_loss": 0.04095458984375, - "delta_ref_ppl": -0.0411376953125, - "entropy_loss": -0.03759765625, - "epoch": 0.2012, - "grad_norm": 0.6216371756334678, - "k1_kl": 0.041015625, - "k3_kl": 0.02349853515625, - "kimi_kl": 0.0648193359375, - "learning_rate": 3.9939999999999994e-07, - "loss": 0.0009, - "ppl": 0.0159912109375, - "reward": 0.9628293514251709, - "reward_std": 0.0035308445803821087, - "rewards/perpo_ocr_edit_distance_reward": 0.9628293812274933, + "advantages": -8.514949456639442e-08, + "completion_length": 474.0, + "delta_ref_entropy_loss": 0.06298828125, + "delta_ref_ppl": -0.051513671875, + "entropy_loss": -0.0859375, + "epoch": 0.1006, + "grad_norm": 7.508604363258282, + "k1_kl": 0.051513671875, + "k3_kl": 0.03466796875, + "kimi_kl": 0.07568359375, + "learning_rate": 4.4969999999999996e-07, + "loss": 0.0014, + "ppl": 0.056396484375, + "reward": 0.7847900390625, + "reward_std": 0.3072626292705536, + "rewards/perpo_ocr_edit_distance_reward": 0.7847900986671448, "step": 503, "temperature": 0.9 }, { - "advantages": -6.075416749240503e-06, - "completion_length": 338.5, - "delta_ref_entropy_loss": 0.01104736328125, - "delta_ref_ppl": -0.067626953125, - "entropy_loss": -0.085205078125, - "epoch": 0.2016, - "grad_norm": 2.5464651438484767, - "k1_kl": 0.0675048828125, - "k3_kl": 0.0509033203125, - "kimi_kl": 0.1917724609375, - "learning_rate": 3.992e-07, - "loss": 0.002, - "ppl": 0.035888671875, - "reward": 0.7253714054822922, - "reward_std": 0.055718544172123075, - "rewards/perpo_ocr_edit_distance_reward": 0.7253714799880981, + "advantages": 0.0, + "completion_length": 488.0, + "delta_ref_entropy_loss": 0.0223388671875, + "delta_ref_ppl": -0.0157470703125, + "entropy_loss": -0.00811767578125, + "epoch": 0.1008, + "grad_norm": 0.24416283911995762, + "k1_kl": 0.015869140625, + "k3_kl": 0.00970458984375, + "kimi_kl": 0.0220947265625, + "learning_rate": 4.496e-07, + "loss": 0.0004, + "ppl": 0.002105712890625, + "reward": 0.989541232585907, + "reward_std": 0.0001572403562022373, + "rewards/perpo_ocr_edit_distance_reward": 0.989541232585907, "step": 504, "temperature": 0.9 }, { - "advantages": -1.8992595357758546e-05, - "completion_length": 397.5, - "delta_ref_entropy_loss": 0.0487060546875, - "delta_ref_ppl": -0.0262451171875, - "entropy_loss": -0.03912353515625, - "epoch": 0.202, - "grad_norm": 0.6699104027450946, - "k1_kl": 0.0262451171875, - "k3_kl": 0.013519287109375, - "kimi_kl": 0.0294189453125, - "learning_rate": 3.99e-07, - "loss": 0.0006, - "ppl": 0.01885986328125, - "reward": 0.907953679561615, - "reward_std": 0.015205805393634364, - "rewards/perpo_ocr_edit_distance_reward": 0.9079537391662598, + "advantages": -2.9597964385175146e-05, + "completion_length": 502.0, + "delta_ref_entropy_loss": 0.025390625, + "delta_ref_ppl": -0.0155029296875, + "entropy_loss": -0.0245361328125, + "epoch": 0.101, + "grad_norm": 0.4583969509711391, + "k1_kl": 0.0155029296875, + "k3_kl": 0.007720947265625, + "kimi_kl": 0.01214599609375, + "learning_rate": 4.495e-07, + "loss": 0.0003, + "ppl": 0.01226806640625, + "reward": 0.9669157862663269, + "reward_std": 0.001340034301392734, + "rewards/perpo_ocr_edit_distance_reward": 0.9669158458709717, "step": 505, "temperature": 0.9 }, { - "advantages": -1.743010216159746e-05, - "completion_length": 315.5, - "delta_ref_entropy_loss": 0.0377197265625, - "delta_ref_ppl": -0.0335693359375, - "entropy_loss": -0.03143310546875, - "epoch": 0.2024, - "grad_norm": 0.3687912731213098, - "k1_kl": 0.03363037109375, - "k3_kl": 0.02117919921875, - "kimi_kl": 0.04620361328125, - "learning_rate": 3.9879999999999994e-07, - "loss": 0.0009, - "ppl": 0.0157470703125, - "reward": 0.9931910634040833, - "reward_std": 0.0003162567736580968, - "rewards/perpo_ocr_edit_distance_reward": 0.9931910634040833, + "advantages": -5.7935718359658495e-05, + "completion_length": 651.0, + "delta_ref_entropy_loss": 0.026611328125, + "delta_ref_ppl": -0.0198974609375, + "entropy_loss": -0.0150146484375, + "epoch": 0.1012, + "grad_norm": 0.278911257969409, + "k1_kl": 0.0198974609375, + "k3_kl": 0.01336669921875, + "kimi_kl": 0.039306640625, + "learning_rate": 4.494e-07, + "loss": 0.0006, + "ppl": 0.006011962890625, + "reward": 0.9391129016876221, + "reward_std": 0.0003408058255445212, + "rewards/perpo_ocr_edit_distance_reward": 0.9391129016876221, "step": 506, "temperature": 0.9 }, { - "advantages": -0.0003055632118957874, - "completion_length": 1148.5, - "delta_ref_entropy_loss": 0.0213623046875, - "delta_ref_ppl": -0.016571044921875, - "entropy_loss": -0.02783203125, - "epoch": 0.2028, - "grad_norm": 0.3299951768482563, - "k1_kl": 0.016571044921875, - "k3_kl": 0.0094451904296875, - "kimi_kl": 0.0193634033203125, - "learning_rate": 3.986e-07, - "loss": 0.0007, - "ppl": 0.012939453125, - "reward": 0.9974353909492493, - "reward_std": 0.00023226122721098363, - "rewards/perpo_ocr_edit_distance_reward": 0.9974354207515717, + "advantages": -3.6614283089875244e-06, + "completion_length": 1539.0, + "delta_ref_entropy_loss": 0.049072265625, + "delta_ref_ppl": -0.0308837890625, + "entropy_loss": -0.09130859375, + "epoch": 0.1014, + "grad_norm": 1.3847473960973933, + "k1_kl": 0.0308837890625, + "k3_kl": 0.01513671875, + "kimi_kl": 0.028076171875, + "learning_rate": 4.493e-07, + "loss": 0.0006, + "ppl": 0.04931640625, + "reward": 0.6269674301147461, + "reward_std": 0.0068742684088647366, + "rewards/perpo_ocr_edit_distance_reward": 0.6269674301147461, "step": 507, "temperature": 0.9 }, { - "advantages": 8.429800800513476e-06, - "completion_length": 666.0, - "delta_ref_entropy_loss": 0.05657958984375, - "delta_ref_ppl": -0.03936767578125, - "entropy_loss": -0.069183349609375, - "epoch": 0.2032, - "grad_norm": 0.9662471006931008, - "k1_kl": 0.039154052734375, - "k3_kl": 0.02185821533203125, - "kimi_kl": 0.05545806884765625, - "learning_rate": 3.9839999999999997e-07, - "loss": 0.0009, - "ppl": 0.03826904296875, - "reward": 0.9632874727249146, - "reward_std": 0.001164186847745441, - "rewards/perpo_ocr_edit_distance_reward": 0.9632875323295593, + "advantages": -0.00012711543240584433, + "completion_length": 1220.0, + "delta_ref_entropy_loss": 0.012451171875, + "delta_ref_ppl": -0.01226806640625, + "entropy_loss": -0.01483154296875, + "epoch": 0.1016, + "grad_norm": 0.3243893911384248, + "k1_kl": 0.01226806640625, + "k3_kl": 0.00811767578125, + "kimi_kl": 0.017333984375, + "learning_rate": 4.4919999999999997e-07, + "loss": 0.0005, + "ppl": 0.0050048828125, + "reward": 0.9799706339836121, + "reward_std": 0.00016792710812296718, + "rewards/perpo_ocr_edit_distance_reward": 0.9799706935882568, "step": 508, "temperature": 0.9 }, { - "advantages": -3.49496097555857e-05, - "completion_length": 1076.0, - "delta_ref_entropy_loss": 0.01934814453125, - "delta_ref_ppl": -0.009429931640625, - "entropy_loss": -0.0443115234375, - "epoch": 0.2036, - "grad_norm": 1.221537052497446, - "k1_kl": 0.009429931640625, - "k3_kl": 0.00366973876953125, - "kimi_kl": 0.0056915283203125, - "learning_rate": 3.982e-07, - "loss": 0.0002, - "ppl": 0.020721435546875, - "reward": 0.9058766067028046, - "reward_std": 0.05450112899416126, - "rewards/perpo_ocr_edit_distance_reward": 0.9058766663074493, + "advantages": -2.8669835955952294e-05, + "completion_length": 171.0, + "delta_ref_entropy_loss": 0.08203125, + "delta_ref_ppl": -0.061767578125, + "entropy_loss": -0.0703125, + "epoch": 0.1018, + "grad_norm": 3.496021673928595, + "k1_kl": 0.061767578125, + "k3_kl": 0.033447265625, + "kimi_kl": 0.06005859375, + "learning_rate": 4.4909999999999996e-07, + "loss": 0.0014, + "ppl": 0.032470703125, + "reward": 0.928822934627533, + "reward_std": 0.0013851854018867016, + "rewards/perpo_ocr_edit_distance_reward": 0.9288229942321777, "step": 509, "temperature": 0.9 }, { - "advantages": -0.00041921224328689277, - "completion_length": 553.0, - "delta_ref_entropy_loss": 0.017242431640625, - "delta_ref_ppl": -0.0054473876953125, - "entropy_loss": -0.014495849609375, - "epoch": 0.204, - "grad_norm": 0.25705661557914417, - "k1_kl": 0.00543212890625, - "k3_kl": 0.0036773681640625, - "kimi_kl": 0.005126953125, - "learning_rate": 3.98e-07, - "loss": 0.0006, - "ppl": 0.00650787353515625, - "reward": 0.9965207874774933, - "reward_std": 9.052583482116461e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9965208768844604, + "advantages": -4.0616309888719115e-06, + "completion_length": 414.0, + "delta_ref_entropy_loss": 0.1611328125, + "delta_ref_ppl": -0.08984375, + "entropy_loss": -0.283203125, + "epoch": 0.102, + "grad_norm": 3.203630633704987, + "k1_kl": 0.08984375, + "k3_kl": 0.051025390625, + "kimi_kl": 0.087890625, + "learning_rate": 4.49e-07, + "loss": 0.002, + "ppl": 0.16796875, + "reward": 0.7520169615745544, + "reward_std": 0.004096608608961105, + "rewards/perpo_ocr_edit_distance_reward": 0.7520169615745544, "step": 510, "temperature": 0.9 }, { - "advantages": -1.2846930530940881e-05, - "completion_length": 373.5, - "delta_ref_entropy_loss": 0.09375, - "delta_ref_ppl": -0.0753173828125, - "entropy_loss": -0.142822265625, - "epoch": 0.2044, - "grad_norm": 1.3622714689324877, - "k1_kl": 0.0753173828125, - "k3_kl": 0.04315185546875, - "kimi_kl": 0.111083984375, - "learning_rate": 3.978e-07, - "loss": 0.0017, - "ppl": 0.0745849609375, - "reward": 0.9005874991416931, - "reward_std": 0.004234011052176356, - "rewards/perpo_ocr_edit_distance_reward": 0.9005875587463379, + "advantages": 0.0, + "completion_length": 793.0, + "delta_ref_entropy_loss": 0.01544189453125, + "delta_ref_ppl": -0.0089111328125, + "entropy_loss": -0.0120849609375, + "epoch": 0.1022, + "grad_norm": 0.3235826859279163, + "k1_kl": 0.0089111328125, + "k3_kl": 0.00469970703125, + "kimi_kl": 0.009521484375, + "learning_rate": 4.489e-07, + "loss": 0.0002, + "ppl": 0.005706787109375, + "reward": 0.9188182950019836, + "reward_std": 0.0051379939541220665, + "rewards/perpo_ocr_edit_distance_reward": 0.9188182950019836, "step": 511, "temperature": 0.9 }, { - "advantages": -0.000298018966402136, - "completion_length": 475.5, - "delta_ref_entropy_loss": 0.027099609375, - "delta_ref_ppl": -0.0137939453125, - "entropy_loss": -0.026214599609375, - "epoch": 0.2048, - "grad_norm": 0.5558445677772904, - "k1_kl": 0.0138092041015625, - "k3_kl": 0.00628662109375, - "kimi_kl": 0.0133056640625, - "learning_rate": 3.976e-07, - "loss": 0.0005, - "ppl": 0.012359619140625, - "reward": 0.9794330298900604, - "reward_std": 0.001399434171617031, - "rewards/perpo_ocr_edit_distance_reward": 0.9794330894947052, + "advantages": -1.812407026591245e-05, + "completion_length": 457.0, + "delta_ref_entropy_loss": 0.0306396484375, + "delta_ref_ppl": -0.025146484375, + "entropy_loss": -0.0306396484375, + "epoch": 0.1024, + "grad_norm": 0.491910030898367, + "k1_kl": 0.025146484375, + "k3_kl": 0.0147705078125, + "kimi_kl": 0.034912109375, + "learning_rate": 4.4879999999999994e-07, + "loss": 0.0006, + "ppl": 0.013916015625, + "reward": 0.9445765018463135, + "reward_std": 0.00036982682649977505, + "rewards/perpo_ocr_edit_distance_reward": 0.9445764422416687, "step": 512, "temperature": 0.9 }, { - "advantages": -0.00011285288826456963, - "completion_length": 431.0, - "delta_ref_entropy_loss": 0.0430908203125, - "delta_ref_ppl": -0.0343017578125, - "entropy_loss": -0.0384521484375, - "epoch": 0.2052, - "grad_norm": 1.587221937704333, - "k1_kl": 0.0341796875, - "k3_kl": 0.02069091796875, - "kimi_kl": 0.036865234375, - "learning_rate": 3.9739999999999995e-07, - "loss": 0.0009, - "ppl": 0.0194091796875, - "reward": 0.9457361996173859, - "reward_std": 0.02188937876780983, - "rewards/perpo_ocr_edit_distance_reward": 0.9457363188266754, + "advantages": -6.982258582866052e-06, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.01336669921875, + "delta_ref_ppl": -0.0084228515625, + "entropy_loss": -0.058837890625, + "epoch": 0.1026, + "grad_norm": 1.20774236396967, + "k1_kl": 0.00830078125, + "k3_kl": 0.0062255859375, + "kimi_kl": 0.01171875, + "learning_rate": 4.487e-07, + "loss": 0.0003, + "ppl": 0.02880859375, + "reward": 0.8994972705841064, + "reward_std": 0.010855902917683125, + "rewards/perpo_ocr_edit_distance_reward": 0.8994973301887512, "step": 513, "temperature": 0.9 }, { - "advantages": -6.069669143471401e-05, - "completion_length": 231.0, - "delta_ref_entropy_loss": 0.05206298828125, - "delta_ref_ppl": -0.03790283203125, - "entropy_loss": -0.04473876953125, - "epoch": 0.2056, - "grad_norm": 1.0525803709511918, - "k1_kl": 0.03790283203125, - "k3_kl": 0.017852783203125, - "kimi_kl": 0.0318603515625, - "learning_rate": 3.972e-07, - "loss": 0.0008, - "ppl": 0.022552490234375, - "reward": 0.9932445883750916, - "reward_std": 0.0008234070846810937, - "rewards/perpo_ocr_edit_distance_reward": 0.9932447075843811, + "advantages": 2.6055745365738403e-06, + "completion_length": 1056.0, + "delta_ref_entropy_loss": 0.0294189453125, + "delta_ref_ppl": -0.02685546875, + "entropy_loss": -0.054931640625, + "epoch": 0.1028, + "grad_norm": 2.475484103584878, + "k1_kl": 0.0267333984375, + "k3_kl": 0.0179443359375, + "kimi_kl": 0.040771484375, + "learning_rate": 4.486e-07, + "loss": 0.0007, + "ppl": 0.0294189453125, + "reward": 0.961101770401001, + "reward_std": 0.003168633906170726, + "rewards/perpo_ocr_edit_distance_reward": 0.9611017107963562, "step": 514, "temperature": 0.9 }, { - "advantages": -3.351484065206023e-05, - "completion_length": 604.5, - "delta_ref_entropy_loss": 0.041015625, - "delta_ref_ppl": -0.02484130859375, - "entropy_loss": -0.04052734375, - "epoch": 0.206, - "grad_norm": 0.5833459652748019, - "k1_kl": 0.02490234375, - "k3_kl": 0.01263427734375, - "kimi_kl": 0.0306396484375, - "learning_rate": 3.97e-07, - "loss": 0.0005, - "ppl": 0.01983642578125, - "reward": 0.9919541478157043, - "reward_std": 0.0010999829974025488, - "rewards/perpo_ocr_edit_distance_reward": 0.9919542074203491, + "advantages": -3.2867706067918334e-06, + "completion_length": 457.0, + "delta_ref_entropy_loss": 0.078125, + "delta_ref_ppl": -0.0390625, + "entropy_loss": -0.1123046875, + "epoch": 0.103, + "grad_norm": 1.9643225056770148, + "k1_kl": 0.0390625, + "k3_kl": 0.0245361328125, + "kimi_kl": 0.031982421875, + "learning_rate": 4.4849999999999997e-07, + "loss": 0.001, + "ppl": 0.0625, + "reward": 0.8706088662147522, + "reward_std": 0.010190699249505997, + "rewards/perpo_ocr_edit_distance_reward": 0.870608925819397, "step": 515, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 178.0, - "delta_ref_entropy_loss": 0.03216552734375, - "delta_ref_ppl": -0.056396484375, - "entropy_loss": -0.03741455078125, - "epoch": 0.2064, - "grad_norm": 0.33882644497224046, - "k1_kl": 0.056396484375, - "k3_kl": 0.047458648681640625, - "kimi_kl": 0.098358154296875, - "learning_rate": 3.9679999999999995e-07, - "loss": 0.0022, - "ppl": 0.017208099365234375, - "reward": 0.9894366264343262, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9894366562366486, + "advantages": -0.00010189840395469218, + "completion_length": 884.0, + "delta_ref_entropy_loss": 0.021484375, + "delta_ref_ppl": -0.01385498046875, + "entropy_loss": -0.016845703125, + "epoch": 0.1032, + "grad_norm": 0.26745896115671053, + "k1_kl": 0.013916015625, + "k3_kl": 0.0064697265625, + "kimi_kl": 0.01123046875, + "learning_rate": 4.484e-07, + "loss": 0.0004, + "ppl": 0.005828857421875, + "reward": 0.9873022437095642, + "reward_std": 0.00015070165682118386, + "rewards/perpo_ocr_edit_distance_reward": 0.987302303314209, "step": 516, "temperature": 0.9 }, { - "advantages": -0.0002981349825930124, - "completion_length": 364.5, - "delta_ref_entropy_loss": 0.03955078125, - "delta_ref_ppl": -0.042236328125, - "entropy_loss": -0.061279296875, - "epoch": 0.2068, - "grad_norm": 1.2142803958494193, - "k1_kl": 0.042236328125, - "k3_kl": 0.02862548828125, - "kimi_kl": 0.064697265625, - "learning_rate": 3.966e-07, - "loss": 0.0014, - "ppl": 0.02978515625, - "reward": 0.698532909154892, - "reward_std": 0.029635369777679443, - "rewards/perpo_ocr_edit_distance_reward": 0.6985329389572144, + "advantages": -4.938670826959424e-06, + "completion_length": 1209.0, + "delta_ref_entropy_loss": 0.02490234375, + "delta_ref_ppl": -0.01495361328125, + "entropy_loss": -0.06640625, + "epoch": 0.1034, + "grad_norm": 0.8995022450801439, + "k1_kl": 0.01483154296875, + "k3_kl": 0.00927734375, + "kimi_kl": 0.0244140625, + "learning_rate": 4.4829999999999996e-07, + "loss": 0.0004, + "ppl": 0.03759765625, + "reward": 0.9602838158607483, + "reward_std": 0.01726643182337284, + "rewards/perpo_ocr_edit_distance_reward": 0.9602838754653931, "step": 517, "temperature": 0.9 }, { - "advantages": -0.0003597395771066658, - "completion_length": 766.5, - "delta_ref_entropy_loss": 0.076904296875, - "delta_ref_ppl": -0.144561767578125, - "entropy_loss": -0.04791259765625, - "epoch": 0.2072, - "grad_norm": 0.6152985100549362, - "k1_kl": 0.145538330078125, - "k3_kl": 0.110809326171875, - "kimi_kl": 0.2580108642578125, - "learning_rate": 3.964e-07, - "loss": 0.0048, - "ppl": 0.024505615234375, - "reward": 0.9880383014678955, - "reward_std": 0.00012247564154677093, - "rewards/perpo_ocr_edit_distance_reward": 0.9880383312702179, + "advantages": -0.0003416538529563695, + "completion_length": 512.0, + "delta_ref_entropy_loss": 0.031494140625, + "delta_ref_ppl": -0.0269775390625, + "entropy_loss": -0.028076171875, + "epoch": 0.1036, + "grad_norm": 0.7068479443909457, + "k1_kl": 0.0269775390625, + "k3_kl": 0.01318359375, + "kimi_kl": 0.028076171875, + "learning_rate": 4.4819999999999995e-07, + "loss": 0.0009, + "ppl": 0.01141357421875, + "reward": 0.9828385710716248, + "reward_std": 0.0002240154572064057, + "rewards/perpo_ocr_edit_distance_reward": 0.9828386902809143, "step": 518, "temperature": 0.9 }, { - "advantages": -2.6600701858114917e-05, - "completion_length": 864.0, - "delta_ref_entropy_loss": 0.021820068359375, - "delta_ref_ppl": -0.0163421630859375, - "entropy_loss": -0.02105712890625, - "epoch": 0.2076, - "grad_norm": 0.32797997745197577, - "k1_kl": 0.016326904296875, - "k3_kl": 0.01043701171875, - "kimi_kl": 0.0267181396484375, - "learning_rate": 3.9619999999999996e-07, - "loss": 0.0004, - "ppl": 0.009796142578125, - "reward": 0.9991983771324158, - "reward_std": 0.00030494316888507456, - "rewards/perpo_ocr_edit_distance_reward": 0.9991983771324158, + "advantages": -5.568776941800024e-06, + "completion_length": 1489.0, + "delta_ref_entropy_loss": 0.0294189453125, + "delta_ref_ppl": -0.022216796875, + "entropy_loss": -0.07958984375, + "epoch": 0.1038, + "grad_norm": 3.298485237508824, + "k1_kl": 0.0223388671875, + "k3_kl": 0.01513671875, + "kimi_kl": 0.060302734375, + "learning_rate": 4.481e-07, + "loss": 0.0006, + "ppl": 0.043212890625, + "reward": 0.9571765065193176, + "reward_std": 0.006030827760696411, + "rewards/perpo_ocr_edit_distance_reward": 0.9571765065193176, "step": 519, "temperature": 0.9 }, { - "advantages": -0.00029769539830226677, - "completion_length": 918.5, - "delta_ref_entropy_loss": 0.02336883544921875, - "delta_ref_ppl": -0.029541015625, - "entropy_loss": -0.0576171875, - "epoch": 0.208, - "grad_norm": 0.7751541570132482, - "k1_kl": 0.02947998046875, - "k3_kl": 0.0203857421875, - "kimi_kl": 0.05224609375, - "learning_rate": 3.96e-07, - "loss": 0.0011, - "ppl": 0.041015625, - "reward": 0.8947668075561523, - "reward_std": 0.019682615995407104, - "rewards/perpo_ocr_edit_distance_reward": 0.8947668075561523, + "advantages": -0.0003021104203071445, + "completion_length": 700.0, + "delta_ref_entropy_loss": 0.02685546875, + "delta_ref_ppl": -0.0198974609375, + "entropy_loss": -0.0281982421875, + "epoch": 0.104, + "grad_norm": 0.438364934282148, + "k1_kl": 0.0198974609375, + "k3_kl": 0.01043701171875, + "kimi_kl": 0.0238037109375, + "learning_rate": 4.48e-07, + "loss": 0.0007, + "ppl": 0.01123046875, + "reward": 0.98382568359375, + "reward_std": 0.00023820468049962074, + "rewards/perpo_ocr_edit_distance_reward": 0.9838257431983948, "step": 520, "temperature": 0.9 }, { - "advantages": -7.234727153093701e-05, - "completion_length": 409.5, - "delta_ref_entropy_loss": 0.05792236328125, - "delta_ref_ppl": -0.04486083984375, - "entropy_loss": -0.144256591796875, - "epoch": 0.2084, - "grad_norm": 2.4680788036663577, - "k1_kl": 0.04510498046875, - "k3_kl": 0.029052734375, - "kimi_kl": 0.0928955078125, - "learning_rate": 3.958e-07, - "loss": 0.0012, - "ppl": 0.072540283203125, - "reward": 0.856945812702179, - "reward_std": 0.027902278052351903, - "rewards/perpo_ocr_edit_distance_reward": 0.8569458723068237, - "step": 521, - "temperature": 0.9 - }, - { - "advantages": -4.257474817137563e-09, - "completion_length": 580.5, - "delta_ref_entropy_loss": 0.0347900390625, - "delta_ref_ppl": -0.021270751953125, - "entropy_loss": -0.0179443359375, - "epoch": 0.2088, - "grad_norm": 0.367191169934795, - "k1_kl": 0.021270751953125, - "k3_kl": 0.010009765625, - "kimi_kl": 0.017822265625, - "learning_rate": 3.9559999999999997e-07, - "loss": 0.0004, - "ppl": 0.0058135986328125, - "reward": 0.996457040309906, - "reward_std": 0.0002787433040793985, - "rewards/perpo_ocr_edit_distance_reward": 0.9964570701122284, + "advantages": -2.946172571682837e-06, + "completion_length": 1747.0, + "delta_ref_entropy_loss": 0.018798828125, + "delta_ref_ppl": -0.0123291015625, + "entropy_loss": -0.028564453125, + "epoch": 0.1042, + "grad_norm": 1.2875639819694045, + "k1_kl": 0.0123291015625, + "k3_kl": 0.00836181640625, + "kimi_kl": 0.019775390625, + "learning_rate": 4.479e-07, + "loss": 0.0003, + "ppl": 0.0140380859375, + "reward": 0.8907753229141235, + "reward_std": 0.017387844622135162, + "rewards/perpo_ocr_edit_distance_reward": 0.8907753825187683, + "step": 521, + "temperature": 0.9 + }, + { + "advantages": -2.1798271063744323e-06, + "completion_length": 693.0, + "delta_ref_entropy_loss": 0.0286865234375, + "delta_ref_ppl": -0.0172119140625, + "entropy_loss": -0.0247802734375, + "epoch": 0.1044, + "grad_norm": 0.5764983443037421, + "k1_kl": 0.0172119140625, + "k3_kl": 0.008056640625, + "kimi_kl": 0.0142822265625, + "learning_rate": 4.4779999999999997e-07, + "loss": 0.0003, + "ppl": 0.01068115234375, + "reward": 0.9013009667396545, + "reward_std": 0.030960163101553917, + "rewards/perpo_ocr_edit_distance_reward": 0.9013011455535889, "step": 522, "temperature": 0.9 }, { - "advantages": -4.334961029428541e-05, - "completion_length": 682.5, - "delta_ref_entropy_loss": 0.02642822265625, - "delta_ref_ppl": -0.033447265625, - "entropy_loss": -0.0343017578125, - "epoch": 0.2092, - "grad_norm": 0.5288178704050617, - "k1_kl": 0.033447265625, - "k3_kl": 0.02044677734375, - "kimi_kl": 0.0628662109375, - "learning_rate": 3.9539999999999995e-07, - "loss": 0.0009, - "ppl": 0.01739501953125, - "reward": 0.9062467813491821, - "reward_std": 0.03115716713364236, - "rewards/perpo_ocr_edit_distance_reward": 0.9062468409538269, + "advantages": -6.267002845561365e-06, + "completion_length": 479.0, + "delta_ref_entropy_loss": 0.07861328125, + "delta_ref_ppl": -0.041015625, + "entropy_loss": -0.078125, + "epoch": 0.1046, + "grad_norm": 1.309419390538037, + "k1_kl": 0.041015625, + "k3_kl": 0.01953125, + "kimi_kl": 0.044921875, + "learning_rate": 4.4769999999999997e-07, + "loss": 0.0008, + "ppl": 0.038330078125, + "reward": 0.9712048768997192, + "reward_std": 0.0039755189791321754, + "rewards/perpo_ocr_edit_distance_reward": 0.971204936504364, "step": 523, "temperature": 0.9 }, { - "advantages": -6.399410267476924e-05, - "completion_length": 601.0, - "delta_ref_entropy_loss": 0.0498046875, - "delta_ref_ppl": -0.0458984375, - "entropy_loss": -0.0498046875, - "epoch": 0.2096, - "grad_norm": 0.5423121689746305, - "k1_kl": 0.0458984375, - "k3_kl": 0.0272216796875, - "kimi_kl": 0.0672607421875, - "learning_rate": 3.952e-07, - "loss": 0.0012, - "ppl": 0.02618408203125, - "reward": 0.9809284508228302, - "reward_std": 0.0014387058618012816, - "rewards/perpo_ocr_edit_distance_reward": 0.9809285402297974, + "advantages": -0.00012534856796264648, + "completion_length": 805.0, + "delta_ref_entropy_loss": 0.0234375, + "delta_ref_ppl": -0.01116943359375, + "entropy_loss": -0.0263671875, + "epoch": 0.1048, + "grad_norm": 2.306671055958338, + "k1_kl": 0.01123046875, + "k3_kl": 0.0380859375, + "kimi_kl": 0.020263671875, + "learning_rate": 4.4759999999999996e-07, + "loss": 0.0017, + "ppl": 0.0277099609375, + "reward": 0.8916634917259216, + "reward_std": 0.0006471259403042495, + "rewards/perpo_ocr_edit_distance_reward": 0.8916636109352112, "step": 524, "temperature": 0.9 }, { - "advantages": -0.0001396025971871495, - "completion_length": 1251.5, - "delta_ref_entropy_loss": 0.029571533203125, - "delta_ref_ppl": -0.0182342529296875, - "entropy_loss": -0.06756591796875, - "epoch": 0.21, - "grad_norm": 0.9376231756043371, - "k1_kl": 0.01824951171875, - "k3_kl": 0.01009368896484375, - "kimi_kl": 0.01812744140625, - "learning_rate": 3.95e-07, - "loss": 0.0005, - "ppl": 0.034088134765625, - "reward": 0.9487049281597137, - "reward_std": 0.0525626953021856, - "rewards/perpo_ocr_edit_distance_reward": 0.9487049877643585, + "advantages": -7.018021278781816e-05, + "completion_length": 417.0, + "delta_ref_entropy_loss": 0.023193359375, + "delta_ref_ppl": -0.0244140625, + "entropy_loss": -0.013916015625, + "epoch": 0.105, + "grad_norm": 0.3162319136715054, + "k1_kl": 0.0242919921875, + "k3_kl": 0.0157470703125, + "kimi_kl": 0.036376953125, + "learning_rate": 4.475e-07, + "loss": 0.0007, + "ppl": 0.004486083984375, + "reward": 0.9849660992622375, + "reward_std": 0.0002640819875523448, + "rewards/perpo_ocr_edit_distance_reward": 0.9849661588668823, "step": 525, "temperature": 0.9 }, { - "advantages": -0.0003078537329201936, - "completion_length": 708.5, - "delta_ref_entropy_loss": 0.02471923828125, - "delta_ref_ppl": -0.0100555419921875, - "entropy_loss": -0.027099609375, - "epoch": 0.2104, - "grad_norm": 1.2171570631468998, - "k1_kl": 0.0100555419921875, - "k3_kl": 0.0039234161376953125, - "kimi_kl": 0.006134033203125, - "learning_rate": 3.9479999999999996e-07, - "loss": 0.0005, - "ppl": 0.0133819580078125, - "reward": 0.9961785674095154, - "reward_std": 0.0012490830849856138, - "rewards/perpo_ocr_edit_distance_reward": 0.9961786270141602, + "advantages": -2.6106836230610497e-05, + "completion_length": 1054.0, + "delta_ref_entropy_loss": 0.02734375, + "delta_ref_ppl": -0.016845703125, + "entropy_loss": -0.041015625, + "epoch": 0.1052, + "grad_norm": 1.1932763319051647, + "k1_kl": 0.0167236328125, + "k3_kl": 0.00811767578125, + "kimi_kl": 0.0169677734375, + "learning_rate": 4.474e-07, + "loss": 0.0004, + "ppl": 0.019287109375, + "reward": 0.9704614281654358, + "reward_std": 0.0021814180072396994, + "rewards/perpo_ocr_edit_distance_reward": 0.9704614877700806, "step": 526, "temperature": 0.9 }, { - "advantages": -0.0003096205855399603, - "completion_length": 523.0, - "delta_ref_entropy_loss": 0.0438232421875, - "delta_ref_ppl": -0.0216064453125, - "entropy_loss": -0.0250396728515625, - "epoch": 0.2108, - "grad_norm": 0.35073970871100746, - "k1_kl": 0.021728515625, - "k3_kl": 0.008920669555664062, - "kimi_kl": 0.01641845703125, - "learning_rate": 3.946e-07, - "loss": 0.0007, - "ppl": 0.009935379028320312, - "reward": 0.997566819190979, - "reward_std": 0.0005004872218705714, - "rewards/perpo_ocr_edit_distance_reward": 0.9975669085979462, + "advantages": -4.717281990451738e-05, + "completion_length": 892.0, + "delta_ref_entropy_loss": 0.023193359375, + "delta_ref_ppl": -0.01324462890625, + "entropy_loss": -0.04638671875, + "epoch": 0.1054, + "grad_norm": 2.3707845239324055, + "k1_kl": 0.0133056640625, + "k3_kl": 0.00897216796875, + "kimi_kl": 0.017822265625, + "learning_rate": 4.4729999999999994e-07, + "loss": 0.0004, + "ppl": 0.0220947265625, + "reward": 0.8245874643325806, + "reward_std": 0.0009835289092734456, + "rewards/perpo_ocr_edit_distance_reward": 0.8245874643325806, "step": 527, "temperature": 0.9 }, { - "advantages": -8.514949634275126e-09, - "completion_length": 346.5, - "delta_ref_entropy_loss": 0.0223388671875, - "delta_ref_ppl": -0.02228546142578125, - "entropy_loss": -0.022003173828125, - "epoch": 0.2112, - "grad_norm": 3.0045381468024575, - "k1_kl": 0.02239990234375, - "k3_kl": 0.015309333801269531, - "kimi_kl": 0.029577255249023438, - "learning_rate": 3.9439999999999993e-07, - "loss": 0.0006, - "ppl": 0.01058197021484375, - "reward": 0.9797367453575134, - "reward_std": 0.006766095291823149, - "rewards/perpo_ocr_edit_distance_reward": 0.9797367453575134, + "advantages": -2.043587983280304e-06, + "completion_length": 161.0, + "delta_ref_entropy_loss": 0.08154296875, + "delta_ref_ppl": -0.0810546875, + "entropy_loss": -0.09375, + "epoch": 0.1056, + "grad_norm": 4.313336520898188, + "k1_kl": 0.0810546875, + "k3_kl": 0.047607421875, + "kimi_kl": 0.09765625, + "learning_rate": 4.472e-07, + "loss": 0.0019, + "ppl": 0.043212890625, + "reward": 0.8201309442520142, + "reward_std": 0.020687034353613853, + "rewards/perpo_ocr_edit_distance_reward": 0.8201309442520142, "step": 528, "temperature": 0.9 }, { - "advantages": -1.4181648907651834e-05, - "completion_length": 840.0, - "delta_ref_entropy_loss": 0.03814697265625, - "delta_ref_ppl": -0.03265380859375, - "entropy_loss": -0.057373046875, - "epoch": 0.2116, - "grad_norm": 1.3061134498190379, - "k1_kl": 0.03265380859375, - "k3_kl": 0.0225830078125, - "kimi_kl": 0.0567626953125, - "learning_rate": 3.9419999999999997e-07, - "loss": 0.0009, - "ppl": 0.0351409912109375, - "reward": 0.9922629594802856, - "reward_std": 0.0032943502301350236, - "rewards/perpo_ocr_edit_distance_reward": 0.9922630190849304, + "advantages": -3.525189185893396e-06, + "completion_length": 886.0, + "delta_ref_entropy_loss": 0.03857421875, + "delta_ref_ppl": -0.04248046875, + "entropy_loss": -0.04150390625, + "epoch": 0.1058, + "grad_norm": 0.6935027196222647, + "k1_kl": 0.04248046875, + "k3_kl": 0.0245361328125, + "kimi_kl": 0.08642578125, + "learning_rate": 4.4709999999999997e-07, + "loss": 0.001, + "ppl": 0.01806640625, + "reward": 0.9342042207717896, + "reward_std": 0.0023111007176339626, + "rewards/perpo_ocr_edit_distance_reward": 0.9342042803764343, "step": 529, "temperature": 0.9 }, { - "advantages": -3.414654565858655e-05, - "completion_length": 707.0, - "delta_ref_entropy_loss": 0.04278564453125, - "delta_ref_ppl": -0.025909423828125, - "entropy_loss": -0.036376953125, - "epoch": 0.212, - "grad_norm": 0.48724843338725443, - "k1_kl": 0.02587890625, - "k3_kl": 0.0129547119140625, - "kimi_kl": 0.0234832763671875, - "learning_rate": 3.94e-07, + "advantages": -3.4059798537100505e-08, + "completion_length": 1095.0, + "delta_ref_entropy_loss": 0.0198974609375, + "delta_ref_ppl": -0.02294921875, + "entropy_loss": -0.056396484375, + "epoch": 0.106, + "grad_norm": 1.2135616301656817, + "k1_kl": 0.02294921875, + "k3_kl": 0.01507568359375, + "kimi_kl": 0.03369140625, + "learning_rate": 4.4699999999999997e-07, "loss": 0.0006, - "ppl": 0.0157012939453125, - "reward": 0.9958505928516388, - "reward_std": 0.0003659391513792798, - "rewards/perpo_ocr_edit_distance_reward": 0.9958505928516388, + "ppl": 0.0267333984375, + "reward": 0.9542679786682129, + "reward_std": 0.017364468425512314, + "rewards/perpo_ocr_edit_distance_reward": 0.9542680978775024, "step": 530, "temperature": 0.9 }, { - "advantages": -2.450602551107295e-05, - "completion_length": 574.0, - "delta_ref_entropy_loss": 0.05108642578125, - "delta_ref_ppl": -0.04595947265625, - "entropy_loss": -0.105712890625, - "epoch": 0.2124, - "grad_norm": 0.9549028119555978, - "k1_kl": 0.04620361328125, - "k3_kl": 0.02679443359375, - "kimi_kl": 0.057769775390625, - "learning_rate": 3.9379999999999994e-07, - "loss": 0.0011, - "ppl": 0.05908203125, - "reward": 0.9607450366020203, - "reward_std": 0.007520139159169048, - "rewards/perpo_ocr_edit_distance_reward": 0.960745096206665, + "advantages": -2.8610231765924254e-06, + "completion_length": 105.0, + "delta_ref_entropy_loss": 0.0213623046875, + "delta_ref_ppl": -0.0732421875, + "entropy_loss": -0.0269775390625, + "epoch": 0.1062, + "grad_norm": 1.8264398040707177, + "k1_kl": 0.0732421875, + "k3_kl": 0.052978515625, + "kimi_kl": 0.1513671875, + "learning_rate": 4.469e-07, + "loss": 0.0021, + "ppl": 0.0084228515625, + "reward": 0.7960386872291565, + "reward_std": 0.00891952309757471, + "rewards/perpo_ocr_edit_distance_reward": 0.796038806438446, "step": 531, "temperature": 0.9 }, { - "advantages": -7.254737226958241e-06, - "completion_length": 527.5, - "delta_ref_entropy_loss": 0.0576171875, - "delta_ref_ppl": -0.03228759765625, - "entropy_loss": -0.0615234375, - "epoch": 0.2128, - "grad_norm": 1.020704000631232, - "k1_kl": 0.03228759765625, - "k3_kl": 0.0139007568359375, - "kimi_kl": 0.026397705078125, - "learning_rate": 3.936e-07, - "loss": 0.0006, - "ppl": 0.0296630859375, - "reward": 0.9631005525588989, - "reward_std": 0.03915685007814318, - "rewards/perpo_ocr_edit_distance_reward": 0.9631005525588989, + "advantages": -2.920627775893081e-05, + "completion_length": 572.0, + "delta_ref_entropy_loss": 0.0390625, + "delta_ref_ppl": -0.02294921875, + "entropy_loss": -0.046875, + "epoch": 0.1064, + "grad_norm": 0.7721794772106999, + "k1_kl": 0.02294921875, + "k3_kl": 0.01123046875, + "kimi_kl": 0.01953125, + "learning_rate": 4.4679999999999995e-07, + "loss": 0.0005, + "ppl": 0.0233154296875, + "reward": 0.9825990200042725, + "reward_std": 0.00048305816017091274, + "rewards/perpo_ocr_edit_distance_reward": 0.982599139213562, "step": 532, "temperature": 0.9 }, { - "advantages": -0.0002973846026748106, - "completion_length": 445.0, - "delta_ref_entropy_loss": 0.05926513671875, - "delta_ref_ppl": -0.04901123046875, - "entropy_loss": -0.08489990234375, - "epoch": 0.2132, - "grad_norm": 1.3158231633674475, - "k1_kl": 0.04901123046875, - "k3_kl": 0.028564453125, - "kimi_kl": 0.08935546875, - "learning_rate": 3.934e-07, - "loss": 0.0014, - "ppl": 0.04522705078125, - "reward": 0.8436188697814941, - "reward_std": 0.013269953429698944, - "rewards/perpo_ocr_edit_distance_reward": 0.8436189293861389, + "advantages": -0.00012711116869468242, + "completion_length": 300.0, + "delta_ref_entropy_loss": 0.014404296875, + "delta_ref_ppl": -0.0294189453125, + "entropy_loss": -0.0198974609375, + "epoch": 0.1066, + "grad_norm": 0.9102186268260124, + "k1_kl": 0.0294189453125, + "k3_kl": 0.02197265625, + "kimi_kl": 0.052734375, + "learning_rate": 4.4669999999999994e-07, + "loss": 0.001, + "ppl": 0.006500244140625, + "reward": 0.9795918464660645, + "reward_std": 0.00036888642353005707, + "rewards/perpo_ocr_edit_distance_reward": 0.9795919060707092, "step": 533, "temperature": 0.9 }, { - "advantages": -6.289993120844883e-05, - "completion_length": 463.5, - "delta_ref_entropy_loss": 0.0386962890625, - "delta_ref_ppl": -0.0355224609375, - "entropy_loss": -0.02032470703125, - "epoch": 0.2136, - "grad_norm": 0.47632398047904506, - "k1_kl": 0.03564453125, - "k3_kl": 0.01995849609375, - "kimi_kl": 0.0484619140625, - "learning_rate": 3.932e-07, - "loss": 0.0009, - "ppl": 0.0073394775390625, - "reward": 0.9107362329959869, - "reward_std": 0.003480447303445544, - "rewards/perpo_ocr_edit_distance_reward": 0.9107363224029541, + "advantages": -2.219421548943501e-05, + "completion_length": 537.0, + "delta_ref_entropy_loss": 0.014404296875, + "delta_ref_ppl": -0.0146484375, + "entropy_loss": -0.0146484375, + "epoch": 0.1068, + "grad_norm": 0.504858045574357, + "k1_kl": 0.0146484375, + "k3_kl": 0.0076904296875, + "kimi_kl": 0.0142822265625, + "learning_rate": 4.466e-07, + "loss": 0.0003, + "ppl": 0.005462646484375, + "reward": 0.9806478023529053, + "reward_std": 0.0002837240172084421, + "rewards/perpo_ocr_edit_distance_reward": 0.98064786195755, "step": 534, "temperature": 0.9 }, { - "advantages": -1.794099911700542e-05, - "completion_length": 594.5, - "delta_ref_entropy_loss": 0.07177734375, - "delta_ref_ppl": -0.039306640625, - "entropy_loss": -0.0802001953125, - "epoch": 0.214, - "grad_norm": 0.6929880684021503, - "k1_kl": 0.039306640625, - "k3_kl": 0.016876220703125, - "kimi_kl": 0.0433349609375, - "learning_rate": 3.93e-07, - "loss": 0.0007, - "ppl": 0.0389404296875, - "reward": 0.636625349521637, - "reward_std": 0.0005868783337064087, - "rewards/perpo_ocr_edit_distance_reward": 0.6366253942251205, + "advantages": -2.665179181349231e-06, + "completion_length": 173.0, + "delta_ref_entropy_loss": 0.0303955078125, + "delta_ref_ppl": -0.057373046875, + "entropy_loss": -0.0390625, + "epoch": 0.107, + "grad_norm": 1.2745634140843927, + "k1_kl": 0.057373046875, + "k3_kl": 0.036376953125, + "kimi_kl": 0.07373046875, + "learning_rate": 4.465e-07, + "loss": 0.0015, + "ppl": 0.020263671875, + "reward": 0.9603174924850464, + "reward_std": 0.003078240668401122, + "rewards/perpo_ocr_edit_distance_reward": 0.9603174924850464, "step": 535, "temperature": 0.9 }, { - "advantages": -8.812972737359814e-06, - "completion_length": 253.5, - "delta_ref_entropy_loss": 0.0452880859375, - "delta_ref_ppl": -0.086181640625, - "entropy_loss": -0.053955078125, - "epoch": 0.2144, - "grad_norm": 0.40812472858397303, - "k1_kl": 0.0863037109375, - "k3_kl": 0.063232421875, - "kimi_kl": 0.28857421875, - "learning_rate": 3.9279999999999997e-07, - "loss": 0.0025, - "ppl": 0.021636962890625, - "reward": 0.9935643672943115, - "reward_std": 0.0018819567048922181, - "rewards/perpo_ocr_edit_distance_reward": 0.9935644268989563, + "advantages": -8.872577382135205e-06, + "completion_length": 552.0, + "delta_ref_entropy_loss": 0.05078125, + "delta_ref_ppl": -0.039306640625, + "entropy_loss": -0.056396484375, + "epoch": 0.1072, + "grad_norm": 0.8550236647160752, + "k1_kl": 0.039306640625, + "k3_kl": 0.022216796875, + "kimi_kl": 0.049560546875, + "learning_rate": 4.464e-07, + "loss": 0.0009, + "ppl": 0.032470703125, + "reward": 0.980435311794281, + "reward_std": 0.003734671277925372, + "rewards/perpo_ocr_edit_distance_reward": 0.9804354310035706, "step": 536, "temperature": 0.9 }, { - "advantages": 2.0406076885137736e-05, - "completion_length": 538.0, - "delta_ref_entropy_loss": 0.0384521484375, - "delta_ref_ppl": -0.02978515625, - "entropy_loss": -0.04931640625, - "epoch": 0.2148, - "grad_norm": 0.4782474025707296, - "k1_kl": 0.02984619140625, - "k3_kl": 0.017120361328125, - "kimi_kl": 0.0487060546875, - "learning_rate": 3.926e-07, - "loss": 0.0007, - "ppl": 0.0244140625, - "reward": 0.9909096360206604, - "reward_std": 0.0028219990781508386, - "rewards/perpo_ocr_edit_distance_reward": 0.9909096658229828, + "advantages": -0.00011347021791152656, + "completion_length": 663.0, + "delta_ref_entropy_loss": 0.018310546875, + "delta_ref_ppl": -0.02197265625, + "entropy_loss": -0.0216064453125, + "epoch": 0.1074, + "grad_norm": 0.7933061811009878, + "k1_kl": 0.0220947265625, + "k3_kl": 0.0123291015625, + "kimi_kl": 0.0302734375, + "learning_rate": 4.4629999999999997e-07, + "loss": 0.0006, + "ppl": 0.0081787109375, + "reward": 0.9627328515052795, + "reward_std": 0.0004251878126524389, + "rewards/perpo_ocr_edit_distance_reward": 0.9627329707145691, "step": 537, "temperature": 0.9 }, { - "advantages": -1.037120864566532e-05, - "completion_length": 130.5, - "delta_ref_entropy_loss": 0.083740234375, - "delta_ref_ppl": -0.07568359375, - "entropy_loss": -0.06689453125, - "epoch": 0.2152, - "grad_norm": 2.5039564335599294, - "k1_kl": 0.07568359375, - "k3_kl": 0.0445556640625, - "kimi_kl": 0.0794677734375, - "learning_rate": 3.924e-07, - "loss": 0.0018, - "ppl": 0.03363037109375, - "reward": 0.9552958011627197, - "reward_std": 0.005421391106210649, - "rewards/perpo_ocr_edit_distance_reward": 0.9552958309650421, + "advantages": -2.2487982278107665e-05, + "completion_length": 232.0, + "delta_ref_entropy_loss": 0.055419921875, + "delta_ref_ppl": -0.08056640625, + "entropy_loss": -0.05419921875, + "epoch": 0.1076, + "grad_norm": 2.8135833798064755, + "k1_kl": 0.080078125, + "k3_kl": 0.04833984375, + "kimi_kl": 0.115234375, + "learning_rate": 4.4619999999999996e-07, + "loss": 0.002, + "ppl": 0.0223388671875, + "reward": 0.9630398750305176, + "reward_std": 0.0010358967119827867, + "rewards/perpo_ocr_edit_distance_reward": 0.9630398750305176, "step": 538, "temperature": 0.9 }, { - "advantages": -0.00037735700607299805, - "completion_length": 469.0, - "delta_ref_entropy_loss": 0.03607177734375, - "delta_ref_ppl": -0.039093017578125, - "entropy_loss": -0.035400390625, - "epoch": 0.2156, - "grad_norm": 0.6534182530395453, - "k1_kl": 0.0388336181640625, - "k3_kl": 0.0250396728515625, - "kimi_kl": 0.049346923828125, - "learning_rate": 3.922e-07, - "loss": 0.0014, - "ppl": 0.01934814453125, - "reward": 0.9734097719192505, - "reward_std": 0.0001109950608224608, - "rewards/perpo_ocr_edit_distance_reward": 0.9734098613262177, + "advantages": -3.676755295600742e-05, + "completion_length": 152.0, + "delta_ref_entropy_loss": 0.00567626953125, + "delta_ref_ppl": -0.09130859375, + "entropy_loss": -0.03955078125, + "epoch": 0.1078, + "grad_norm": 2.3548593555028177, + "k1_kl": 0.09130859375, + "k3_kl": 0.0791015625, + "kimi_kl": 0.271484375, + "learning_rate": 4.4609999999999995e-07, + "loss": 0.0032, + "ppl": 0.02099609375, + "reward": 0.9650350213050842, + "reward_std": 0.0029114270582795143, + "rewards/perpo_ocr_edit_distance_reward": 0.965035080909729, "step": 539, "temperature": 0.9 }, { - "advantages": -7.346485758086274e-05, - "completion_length": 443.5, - "delta_ref_entropy_loss": 0.04827880859375, - "delta_ref_ppl": -0.074462890625, - "entropy_loss": -0.08349609375, - "epoch": 0.216, - "grad_norm": 4.122113985412811, - "k1_kl": 0.074462890625, - "k3_kl": 0.04473876953125, - "kimi_kl": 0.1038818359375, - "learning_rate": 3.92e-07, - "loss": 0.0019, - "ppl": 0.045166015625, - "reward": 0.9798128306865692, - "reward_std": 0.018454552002367564, - "rewards/perpo_ocr_edit_distance_reward": 0.979812890291214, + "advantages": -3.007480154337827e-05, + "completion_length": 512.0, + "delta_ref_entropy_loss": 0.056396484375, + "delta_ref_ppl": -0.04638671875, + "entropy_loss": -0.07470703125, + "epoch": 0.108, + "grad_norm": 1.6431048151291339, + "k1_kl": 0.04638671875, + "k3_kl": 0.029296875, + "kimi_kl": 0.060791015625, + "learning_rate": 4.46e-07, + "loss": 0.0012, + "ppl": 0.040771484375, + "reward": 0.9265285730361938, + "reward_std": 0.0007500573992729187, + "rewards/perpo_ocr_edit_distance_reward": 0.9265286922454834, "step": 540, "temperature": 0.9 }, { - "advantages": -9.932688328717632e-06, - "completion_length": 909.5, - "delta_ref_entropy_loss": 0.02374267578125, - "delta_ref_ppl": -0.01116943359375, - "entropy_loss": -0.05889892578125, - "epoch": 0.2164, - "grad_norm": 1.1534997324701917, - "k1_kl": 0.011199951171875, - "k3_kl": 0.00537109375, - "kimi_kl": 0.010009765625, - "learning_rate": 3.9179999999999995e-07, - "loss": 0.0002, - "ppl": 0.03521728515625, - "reward": 0.9112117886543274, - "reward_std": 0.06695519085042179, - "rewards/perpo_ocr_edit_distance_reward": 0.9112118780612946, + "advantages": -7.918903065728955e-06, + "completion_length": 1347.0, + "delta_ref_entropy_loss": 0.03564453125, + "delta_ref_ppl": -0.021484375, + "entropy_loss": -0.08349609375, + "epoch": 0.1082, + "grad_norm": 1.807459831064944, + "k1_kl": 0.0213623046875, + "k3_kl": 0.0125732421875, + "kimi_kl": 0.0201416015625, + "learning_rate": 4.459e-07, + "loss": 0.0005, + "ppl": 0.040771484375, + "reward": 0.9464644193649292, + "reward_std": 0.008504091762006283, + "rewards/perpo_ocr_edit_distance_reward": 0.9464645385742188, "step": 541, "temperature": 0.9 }, { - "advantages": -1.8366745806019935e-05, - "completion_length": 607.0, - "delta_ref_entropy_loss": 0.04400634765625, - "delta_ref_ppl": -0.05560302734375, - "entropy_loss": -0.05108642578125, - "epoch": 0.2168, - "grad_norm": 4.649272114759572, - "k1_kl": 0.05584716796875, - "k3_kl": 0.03331756591796875, - "kimi_kl": 0.11065673828125, - "learning_rate": 3.916e-07, - "loss": 0.0013, - "ppl": 0.022003173828125, - "reward": 0.8903753161430359, - "reward_std": 0.09178018500097096, - "rewards/perpo_ocr_edit_distance_reward": 0.8903753757476807, + "advantages": -1.055853772413684e-06, + "completion_length": 998.0, + "delta_ref_entropy_loss": 0.037109375, + "delta_ref_ppl": -0.017578125, + "entropy_loss": -0.034423828125, + "epoch": 0.1084, + "grad_norm": 0.7951268775855402, + "k1_kl": 0.017578125, + "k3_kl": 0.006866455078125, + "kimi_kl": 0.00927734375, + "learning_rate": 4.4579999999999993e-07, + "loss": 0.0003, + "ppl": 0.01544189453125, + "reward": 0.8047666549682617, + "reward_std": 0.04000210762023926, + "rewards/perpo_ocr_edit_distance_reward": 0.8047667145729065, "step": 542, "temperature": 0.9 }, { - "advantages": 5.577292085945373e-07, - "completion_length": 1041.5, - "delta_ref_entropy_loss": 0.079833984375, - "delta_ref_ppl": -0.0526123046875, - "entropy_loss": -0.1322021484375, - "epoch": 0.2172, - "grad_norm": 51.557661668734184, - "k1_kl": 0.052734375, - "k3_kl": 0.1134033203125, - "kimi_kl": 0.05078125, - "learning_rate": 3.914e-07, - "loss": 0.0046, - "ppl": 0.08642578125, - "reward": 0.8774810135364532, - "reward_std": 0.07641451479867101, - "rewards/perpo_ocr_edit_distance_reward": 0.8774810135364532, + "advantages": -3.704854680108838e-05, + "completion_length": 144.0, + "delta_ref_entropy_loss": 0.015869140625, + "delta_ref_ppl": -0.041259765625, + "entropy_loss": -0.04638671875, + "epoch": 0.1086, + "grad_norm": 3.3007089319824017, + "k1_kl": 0.041259765625, + "k3_kl": 0.028564453125, + "kimi_kl": 0.06982421875, + "learning_rate": 4.457e-07, + "loss": 0.0012, + "ppl": 0.0264892578125, + "reward": 0.9868850708007812, + "reward_std": 0.0028897589072585106, + "rewards/perpo_ocr_edit_distance_reward": 0.986885130405426, "step": 543, "temperature": 0.9 }, { - "advantages": -6.930317613296211e-05, - "completion_length": 860.5, - "delta_ref_entropy_loss": 0.0316162109375, - "delta_ref_ppl": -0.01263427734375, - "entropy_loss": -0.04876708984375, - "epoch": 0.2176, - "grad_norm": 0.8718495613555288, - "k1_kl": 0.0125732421875, - "k3_kl": 0.00494384765625, - "kimi_kl": 0.0071868896484375, - "learning_rate": 3.9119999999999996e-07, - "loss": 0.0003, - "ppl": 0.0232696533203125, - "reward": 0.9781278967857361, - "reward_std": 0.0016947225885814987, - "rewards/perpo_ocr_edit_distance_reward": 0.9781280159950256, + "advantages": -7.833753556951706e-07, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.01068115234375, + "delta_ref_ppl": -0.0101318359375, + "entropy_loss": -0.044189453125, + "epoch": 0.1088, + "grad_norm": 0.8158230351689097, + "k1_kl": 0.0101318359375, + "k3_kl": 0.00604248046875, + "kimi_kl": 0.01348876953125, + "learning_rate": 4.4559999999999997e-07, + "loss": 0.0002, + "ppl": 0.019287109375, + "reward": 0.9556329250335693, + "reward_std": 0.02154909260571003, + "rewards/perpo_ocr_edit_distance_reward": 0.9556330442428589, "step": 544, "temperature": 0.9 }, { - "advantages": -1.532690987460228e-07, - "completion_length": 1169.0, - "delta_ref_entropy_loss": 0.0770263671875, - "delta_ref_ppl": -0.0386962890625, - "entropy_loss": -0.164794921875, - "epoch": 0.218, - "grad_norm": 0.8437242270247177, - "k1_kl": 0.0386962890625, - "k3_kl": 0.01873779296875, - "kimi_kl": 0.0306396484375, - "learning_rate": 3.91e-07, - "loss": 0.0007, - "ppl": 0.08740234375, - "reward": 0.7664267122745514, - "reward_std": 0.09007960930466652, - "rewards/perpo_ocr_edit_distance_reward": 0.7664267420768738, + "advantages": -2.3330962903855834e-06, + "completion_length": 1488.0, + "delta_ref_entropy_loss": 0.00994873046875, + "delta_ref_ppl": -0.004669189453125, + "entropy_loss": -0.0169677734375, + "epoch": 0.109, + "grad_norm": 0.42913596613707256, + "k1_kl": 0.004669189453125, + "k3_kl": 0.0021514892578125, + "kimi_kl": 0.0032806396484375, + "learning_rate": 4.455e-07, + "loss": 0.0001, + "ppl": 0.007354736328125, + "reward": 0.8329014778137207, + "reward_std": 0.021832309663295746, + "rewards/perpo_ocr_edit_distance_reward": 0.8329015374183655, "step": 545, "temperature": 0.9 }, { - "advantages": -6.790246516175102e-05, - "completion_length": 891.5, - "delta_ref_entropy_loss": 0.025909423828125, - "delta_ref_ppl": -0.01617431640625, - "entropy_loss": -0.017822265625, - "epoch": 0.2184, - "grad_norm": 0.6955683815203922, - "k1_kl": 0.01624298095703125, - "k3_kl": 0.009202957153320312, - "kimi_kl": 0.019487380981445312, - "learning_rate": 3.908e-07, - "loss": 0.0004, - "ppl": 0.0080108642578125, - "reward": 0.8599339425563812, - "reward_std": 0.0003525315987644717, - "rewards/perpo_ocr_edit_distance_reward": 0.8599339723587036, + "advantages": -2.7128629881190136e-05, + "completion_length": 354.0, + "delta_ref_entropy_loss": 0.042724609375, + "delta_ref_ppl": -0.04638671875, + "entropy_loss": -0.049560546875, + "epoch": 0.1092, + "grad_norm": 1.0473421257698985, + "k1_kl": 0.04638671875, + "k3_kl": 0.03125, + "kimi_kl": 0.091796875, + "learning_rate": 4.454e-07, + "loss": 0.0013, + "ppl": 0.025390625, + "reward": 0.9095988273620605, + "reward_std": 0.0036678253673017025, + "rewards/perpo_ocr_edit_distance_reward": 0.9095989465713501, "step": 546, "temperature": 0.9 }, { - "advantages": -0.00029821055277068353, - "completion_length": 523.5, - "delta_ref_entropy_loss": 0.0341796875, - "delta_ref_ppl": -0.02191162109375, - "entropy_loss": -0.02435302734375, - "epoch": 0.2188, - "grad_norm": 0.7276906871009587, - "k1_kl": 0.02191162109375, - "k3_kl": 0.0098419189453125, - "kimi_kl": 0.0189208984375, - "learning_rate": 3.9059999999999996e-07, - "loss": 0.0007, - "ppl": 0.0109710693359375, - "reward": 0.968519002199173, - "reward_std": 0.043783050030469894, - "rewards/perpo_ocr_edit_distance_reward": 0.9685190916061401, + "advantages": -0.00012228319246787578, + "completion_length": 532.0, + "delta_ref_entropy_loss": 0.02099609375, + "delta_ref_ppl": -0.0186767578125, + "entropy_loss": -0.0167236328125, + "epoch": 0.1094, + "grad_norm": 0.6111833932645676, + "k1_kl": 0.0185546875, + "k3_kl": 0.010009765625, + "kimi_kl": 0.0213623046875, + "learning_rate": 4.4529999999999994e-07, + "loss": 0.0005, + "ppl": 0.0067138671875, + "reward": 0.9881152510643005, + "reward_std": 0.0005267145461402833, + "rewards/perpo_ocr_edit_distance_reward": 0.9881153106689453, "step": 547, "temperature": 0.9 }, { - "advantages": -0.000298121145796415, - "completion_length": 609.0, + "advantages": 9.383474207425024e-06, + "completion_length": 552.0, "delta_ref_entropy_loss": 0.030517578125, - "delta_ref_ppl": -0.019683837890625, - "entropy_loss": -0.0302734375, - "epoch": 0.2192, - "grad_norm": 0.43480058759049767, - "k1_kl": 0.01971435546875, - "k3_kl": 0.0098876953125, - "kimi_kl": 0.016510009765625, - "learning_rate": 3.904e-07, - "loss": 0.0007, - "ppl": 0.0125274658203125, - "reward": 0.8472175300121307, - "reward_std": 0.058721281588077545, - "rewards/perpo_ocr_edit_distance_reward": 0.8472175896167755, + "delta_ref_ppl": -0.0267333984375, + "entropy_loss": -0.0361328125, + "epoch": 0.1096, + "grad_norm": 0.5676512479951641, + "k1_kl": 0.02685546875, + "k3_kl": 0.01348876953125, + "kimi_kl": 0.02490234375, + "learning_rate": 4.452e-07, + "loss": 0.0005, + "ppl": 0.015380859375, + "reward": 0.9668015241622925, + "reward_std": 0.0008080659317784011, + "rewards/perpo_ocr_edit_distance_reward": 0.9668015241622925, "step": 548, "temperature": 0.9 }, { - "advantages": -0.00032782554808363784, - "completion_length": 539.0, - "delta_ref_entropy_loss": 0.0325927734375, - "delta_ref_ppl": -0.027587890625, - "entropy_loss": -0.032318115234375, - "epoch": 0.2196, - "grad_norm": 0.5651231574264782, - "k1_kl": 0.0274658203125, - "k3_kl": 0.015838623046875, - "kimi_kl": 0.0361328125, - "learning_rate": 3.902e-07, - "loss": 0.001, - "ppl": 0.0155487060546875, - "reward": 0.9973914623260498, - "reward_std": 0.0005931418272666633, - "rewards/perpo_ocr_edit_distance_reward": 0.997391551733017, + "advantages": 0.0, + "completion_length": 101.0, + "delta_ref_entropy_loss": 0.04052734375, + "delta_ref_ppl": -0.05859375, + "entropy_loss": -0.02978515625, + "epoch": 0.1098, + "grad_norm": 0.08687410963996496, + "k1_kl": 0.058837890625, + "k3_kl": 0.032958984375, + "kimi_kl": 0.07861328125, + "learning_rate": 4.451e-07, + "loss": 0.0013, + "ppl": 0.01116943359375, + "reward": 0.9616934657096863, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.961693525314331, "step": 549, "temperature": 0.9 }, { - "advantages": -0.0001590251940797316, - "completion_length": 663.0, - "delta_ref_entropy_loss": 0.0238037109375, - "delta_ref_ppl": -0.02410888671875, - "entropy_loss": -0.0317535400390625, - "epoch": 0.22, - "grad_norm": 0.9182632696763307, - "k1_kl": 0.0240478515625, - "k3_kl": 0.016448974609375, - "kimi_kl": 0.042236328125, - "learning_rate": 3.8999999999999997e-07, - "loss": 0.0008, - "ppl": 0.02257537841796875, - "reward": 0.9904227554798126, - "reward_std": 0.0007811894320184365, - "rewards/perpo_ocr_edit_distance_reward": 0.9904228448867798, + "advantages": -1.2125287867092993e-05, + "completion_length": 577.0, + "delta_ref_entropy_loss": 0.1318359375, + "delta_ref_ppl": -0.0673828125, + "entropy_loss": -0.2373046875, + "epoch": 0.11, + "grad_norm": 1.8771267270073493, + "k1_kl": 0.0673828125, + "k3_kl": 0.0289306640625, + "kimi_kl": 0.04541015625, + "learning_rate": 4.45e-07, + "loss": 0.0012, + "ppl": 0.134765625, + "reward": 0.5637111067771912, + "reward_std": 0.005518836434930563, + "rewards/perpo_ocr_edit_distance_reward": 0.5637111663818359, "step": 550, "temperature": 0.9 }, { - "advantages": -3.193957763869548e-05, - "completion_length": 626.5, - "delta_ref_entropy_loss": 0.04803466796875, - "delta_ref_ppl": -0.024200439453125, - "entropy_loss": -0.057861328125, - "epoch": 0.2204, - "grad_norm": 1.3218946483194844, - "k1_kl": 0.02423095703125, - "k3_kl": 0.01141357421875, - "kimi_kl": 0.0229949951171875, - "learning_rate": 3.8979999999999996e-07, - "loss": 0.0005, - "ppl": 0.030517578125, - "reward": 0.9742090106010437, - "reward_std": 0.003839332581264898, - "rewards/perpo_ocr_edit_distance_reward": 0.9742090106010437, + "advantages": -4.691737103712512e-06, + "completion_length": 1283.0, + "delta_ref_entropy_loss": 0.06591796875, + "delta_ref_ppl": -0.03759765625, + "entropy_loss": -0.09814453125, + "epoch": 0.1102, + "grad_norm": 1.8470711094603909, + "k1_kl": 0.03759765625, + "k3_kl": 0.0205078125, + "kimi_kl": 0.041015625, + "learning_rate": 4.449e-07, + "loss": 0.0008, + "ppl": 0.0537109375, + "reward": 0.9686251282691956, + "reward_std": 0.0017118859104812145, + "rewards/perpo_ocr_edit_distance_reward": 0.9686251282691956, "step": 551, "temperature": 0.9 }, { - "advantages": -2.48125634243479e-05, - "completion_length": 917.5, - "delta_ref_entropy_loss": 0.0364990234375, - "delta_ref_ppl": -0.02862548828125, - "entropy_loss": -0.046630859375, - "epoch": 0.2208, - "grad_norm": 0.4585433425925787, - "k1_kl": 0.02850341796875, - "k3_kl": 0.01715087890625, - "kimi_kl": 0.05029296875, - "learning_rate": 3.896e-07, - "loss": 0.0007, - "ppl": 0.0230712890625, - "reward": 0.9641164541244507, - "reward_std": 0.0014180318976286799, - "rewards/perpo_ocr_edit_distance_reward": 0.9641165435314178, + "advantages": -1.7029898913278885e-07, + "completion_length": 567.0, + "delta_ref_entropy_loss": 0.0064697265625, + "delta_ref_ppl": -0.0101318359375, + "entropy_loss": -0.06787109375, + "epoch": 0.1104, + "grad_norm": 0.7163341989256852, + "k1_kl": 0.01007080078125, + "k3_kl": 0.007171630859375, + "kimi_kl": 0.01153564453125, + "learning_rate": 4.4479999999999996e-07, + "loss": 0.0003, + "ppl": 0.03515625, + "reward": 0.7220337390899658, + "reward_std": 0.3153943419456482, + "rewards/perpo_ocr_edit_distance_reward": 0.7220337986946106, "step": 552, "temperature": 0.9 }, { - "advantages": 1.2568065230311731e-05, - "completion_length": 797.0, - "delta_ref_entropy_loss": 0.04644775390625, - "delta_ref_ppl": -0.033233642578125, - "entropy_loss": -0.08984375, - "epoch": 0.2212, - "grad_norm": 1.4547233108080775, - "k1_kl": 0.033233642578125, - "k3_kl": 0.01983642578125, - "kimi_kl": 0.0355224609375, - "learning_rate": 3.8940000000000003e-07, - "loss": 0.0008, - "ppl": 0.05615234375, - "reward": 0.8690693974494934, - "reward_std": 0.08897696173517033, - "rewards/perpo_ocr_edit_distance_reward": 0.8690694570541382, + "advantages": -1.234667706739856e-06, + "completion_length": 601.0, + "delta_ref_entropy_loss": 0.095703125, + "delta_ref_ppl": -0.057373046875, + "entropy_loss": -0.2138671875, + "epoch": 0.1106, + "grad_norm": 1.9554395904425619, + "k1_kl": 0.057861328125, + "k3_kl": 0.0341796875, + "kimi_kl": 0.061767578125, + "learning_rate": 4.4469999999999995e-07, + "loss": 0.0014, + "ppl": 0.11669921875, + "reward": 0.8222081661224365, + "reward_std": 0.04123027250170708, + "rewards/perpo_ocr_edit_distance_reward": 0.8222081661224365, "step": 553, "temperature": 0.9 }, { - "advantages": -0.0003009012768870889, - "completion_length": 639.0, - "delta_ref_entropy_loss": 0.03582763671875, - "delta_ref_ppl": -0.027191162109375, - "entropy_loss": -0.0380859375, - "epoch": 0.2216, - "grad_norm": 0.43296722131735155, - "k1_kl": 0.02716064453125, - "k3_kl": 0.011474609375, - "kimi_kl": 0.01837158203125, - "learning_rate": 3.8919999999999996e-07, - "loss": 0.0008, - "ppl": 0.013397216796875, - "reward": 0.9859186112880707, - "reward_std": 0.005869992543011904, - "rewards/perpo_ocr_edit_distance_reward": 0.9859187006950378, + "advantages": -2.0350730665086303e-06, + "completion_length": 577.0, + "delta_ref_entropy_loss": 0.025146484375, + "delta_ref_ppl": -0.022705078125, + "entropy_loss": -0.1103515625, + "epoch": 0.1108, + "grad_norm": 1.2337400167491535, + "k1_kl": 0.022705078125, + "k3_kl": 0.01300048828125, + "kimi_kl": 0.028564453125, + "learning_rate": 4.446e-07, + "loss": 0.0005, + "ppl": 0.06103515625, + "reward": 0.9129314422607422, + "reward_std": 0.02087954618036747, + "rewards/perpo_ocr_edit_distance_reward": 0.9129315614700317, "step": 554, "temperature": 0.9 }, { - "advantages": -0.0001119204971473664, - "completion_length": 789.0, - "delta_ref_entropy_loss": 0.02734375, - "delta_ref_ppl": -0.020751953125, - "entropy_loss": -0.01947021484375, - "epoch": 0.222, - "grad_norm": 0.2389793152189085, - "k1_kl": 0.020751953125, - "k3_kl": 0.011077880859375, - "kimi_kl": 0.0313720703125, - "learning_rate": 3.89e-07, - "loss": 0.0006, - "ppl": 0.0072174072265625, - "reward": 0.9975292980670929, - "reward_std": 0.00043937520240433514, - "rewards/perpo_ocr_edit_distance_reward": 0.9975294172763824, + "advantages": 4.666192580771167e-06, + "completion_length": 131.0, + "delta_ref_entropy_loss": 0.04052734375, + "delta_ref_ppl": -0.07763671875, + "entropy_loss": -0.04150390625, + "epoch": 0.111, + "grad_norm": 2.2918978689290417, + "k1_kl": 0.07763671875, + "k3_kl": 0.04638671875, + "kimi_kl": 0.10009765625, + "learning_rate": 4.445e-07, + "loss": 0.0019, + "ppl": 0.025390625, + "reward": 0.9774149656295776, + "reward_std": 0.003551355330273509, + "rewards/perpo_ocr_edit_distance_reward": 0.9774149656295776, "step": 555, "temperature": 0.9 }, { - "advantages": -6.74384000376449e-05, - "completion_length": 233.0, - "delta_ref_entropy_loss": 0.0682373046875, - "delta_ref_ppl": -0.045166015625, - "entropy_loss": -0.03759765625, - "epoch": 0.2224, - "grad_norm": 1.2078164927305182, - "k1_kl": 0.045166015625, - "k3_kl": 0.0224609375, - "kimi_kl": 0.0418701171875, - "learning_rate": 3.888e-07, - "loss": 0.001, - "ppl": 0.016632080078125, - "reward": 0.9318532049655914, - "reward_std": 0.0010927258990705013, - "rewards/perpo_ocr_edit_distance_reward": 0.9318532943725586, + "advantages": -5.705016405954666e-07, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.0196533203125, + "delta_ref_ppl": -0.0091552734375, + "entropy_loss": -0.05224609375, + "epoch": 0.1112, + "grad_norm": 5.911563464996492, + "k1_kl": 0.009033203125, + "k3_kl": 0.01019287109375, + "kimi_kl": 0.0137939453125, + "learning_rate": 4.444e-07, + "loss": 0.0004, + "ppl": 0.03466796875, + "reward": 0.16213613748550415, + "reward_std": 0.022923942655324936, + "rewards/perpo_ocr_edit_distance_reward": 0.16213615238666534, "step": 556, "temperature": 0.9 }, { - "advantages": -0.0002980657986242363, - "completion_length": 294.5, - "delta_ref_entropy_loss": 0.08148193359375, - "delta_ref_ppl": -0.072021484375, - "entropy_loss": -0.09979248046875, - "epoch": 0.2228, - "grad_norm": 3.2514161391714027, - "k1_kl": 0.0718994140625, - "k3_kl": 0.04119873046875, - "kimi_kl": 0.1204833984375, - "learning_rate": 3.8859999999999997e-07, - "loss": 0.0019, - "ppl": 0.0504608154296875, - "reward": 0.7519968748092651, - "reward_std": 0.09528856724500656, - "rewards/perpo_ocr_edit_distance_reward": 0.7519969642162323, + "advantages": 8.65118909132434e-06, + "completion_length": 1064.0, + "delta_ref_entropy_loss": 0.0198974609375, + "delta_ref_ppl": -0.01318359375, + "entropy_loss": -0.037353515625, + "epoch": 0.1114, + "grad_norm": 1.6422167038897906, + "k1_kl": 0.01318359375, + "k3_kl": 0.011962890625, + "kimi_kl": 0.0169677734375, + "learning_rate": 4.443e-07, + "loss": 0.0005, + "ppl": 0.0274658203125, + "reward": 0.976546585559845, + "reward_std": 0.0038353740237653255, + "rewards/perpo_ocr_edit_distance_reward": 0.976546585559845, "step": 557, "temperature": 0.9 }, { - "advantages": -2.404621773166582e-05, - "completion_length": 718.0, - "delta_ref_entropy_loss": 0.0562744140625, - "delta_ref_ppl": -0.03619384765625, - "entropy_loss": -0.1131591796875, - "epoch": 0.2232, - "grad_norm": 1.2489127432042826, - "k1_kl": 0.0362548828125, - "k3_kl": 0.019439697265625, - "kimi_kl": 0.043914794921875, - "learning_rate": 3.884e-07, - "loss": 0.0008, - "ppl": 0.0589599609375, - "reward": 0.9470407664775848, - "reward_std": 0.0024534365511499345, - "rewards/perpo_ocr_edit_distance_reward": 0.9470407962799072, + "advantages": -2.5289400582551025e-06, + "completion_length": 581.0, + "delta_ref_entropy_loss": 0.0296630859375, + "delta_ref_ppl": -0.0264892578125, + "entropy_loss": -0.039306640625, + "epoch": 0.1116, + "grad_norm": 1.1807126479385985, + "k1_kl": 0.0264892578125, + "k3_kl": 0.0157470703125, + "kimi_kl": 0.03857421875, + "learning_rate": 4.4419999999999997e-07, + "loss": 0.0006, + "ppl": 0.01708984375, + "reward": 0.9863255620002747, + "reward_std": 0.006578738335520029, + "rewards/perpo_ocr_edit_distance_reward": 0.9863255620002747, "step": 558, "temperature": 0.9 }, { - "advantages": -2.8456961445044726e-05, - "completion_length": 410.0, - "delta_ref_entropy_loss": 0.03302001953125, - "delta_ref_ppl": -0.032196044921875, - "entropy_loss": -0.025543212890625, - "epoch": 0.2236, - "grad_norm": 0.2729698591111787, - "k1_kl": 0.032196044921875, - "k3_kl": 0.01686859130859375, - "kimi_kl": 0.036304473876953125, - "learning_rate": 3.882e-07, - "loss": 0.0007, - "ppl": 0.008209228515625, - "reward": 0.9936455488204956, - "reward_std": 0.0003240410587750375, - "rewards/perpo_ocr_edit_distance_reward": 0.993645578622818, + "advantages": -2.9376576549111633e-06, + "completion_length": 910.0, + "delta_ref_entropy_loss": 0.0284423828125, + "delta_ref_ppl": -0.02783203125, + "entropy_loss": -0.058349609375, + "epoch": 0.1118, + "grad_norm": 1.0980455983893778, + "k1_kl": 0.0277099609375, + "k3_kl": 0.0155029296875, + "kimi_kl": 0.029052734375, + "learning_rate": 4.4409999999999996e-07, + "loss": 0.0006, + "ppl": 0.032958984375, + "reward": 0.9668051600456238, + "reward_std": 0.005736985709518194, + "rewards/perpo_ocr_edit_distance_reward": 0.9668052196502686, "step": 559, "temperature": 0.9 }, { - "advantages": -0.0001113883163270657, - "completion_length": 670.5, - "delta_ref_entropy_loss": 0.02496337890625, - "delta_ref_ppl": -0.0112152099609375, - "entropy_loss": -0.02362060546875, - "epoch": 0.224, - "grad_norm": 1.8373167461840756, - "k1_kl": 0.011199951171875, - "k3_kl": 0.0046234130859375, - "kimi_kl": 0.006999969482421875, - "learning_rate": 3.88e-07, - "loss": 0.0003, - "ppl": 0.011474609375, - "reward": 0.9905715882778168, - "reward_std": 0.0013075583992758766, - "rewards/perpo_ocr_edit_distance_reward": 0.9905716180801392, + "advantages": 0.0, + "completion_length": 337.0, + "delta_ref_entropy_loss": 0.056396484375, + "delta_ref_ppl": -0.033203125, + "entropy_loss": -0.05908203125, + "epoch": 0.112, + "grad_norm": 1.5710954657968035, + "k1_kl": 0.033203125, + "k3_kl": 0.02099609375, + "kimi_kl": 0.0498046875, + "learning_rate": 4.44e-07, + "loss": 0.0008, + "ppl": 0.033447265625, + "reward": 0.4077821671962738, + "reward_std": 0.001791209913790226, + "rewards/perpo_ocr_edit_distance_reward": 0.4077821671962738, "step": 560, "temperature": 0.9 }, { - "advantages": -1.3385501006268896e-05, - "completion_length": 730.0, - "delta_ref_entropy_loss": 0.03729248046875, - "delta_ref_ppl": -0.023162841796875, - "entropy_loss": -0.053466796875, - "epoch": 0.2244, - "grad_norm": 1.9827019411691098, - "k1_kl": 0.023284912109375, - "k3_kl": 0.0121307373046875, - "kimi_kl": 0.021575927734375, - "learning_rate": 3.8779999999999997e-07, - "loss": 0.0005, - "ppl": 0.02685546875, - "reward": 0.9373556971549988, - "reward_std": 0.01281717885285616, - "rewards/perpo_ocr_edit_distance_reward": 0.9373558461666107, + "advantages": -3.096035652561113e-05, + "completion_length": 574.0, + "delta_ref_entropy_loss": 0.0390625, + "delta_ref_ppl": -0.03759765625, + "entropy_loss": -0.0654296875, + "epoch": 0.1122, + "grad_norm": 0.9313174754057886, + "k1_kl": 0.03759765625, + "k3_kl": 0.0235595703125, + "kimi_kl": 0.049072265625, + "learning_rate": 4.439e-07, + "loss": 0.001, + "ppl": 0.036865234375, + "reward": 0.95748370885849, + "reward_std": 0.002374294213950634, + "rewards/perpo_ocr_edit_distance_reward": 0.9574837684631348, "step": 561, "temperature": 0.9 }, { - "advantages": -5.6922440307971556e-06, - "completion_length": 831.0, - "delta_ref_entropy_loss": 0.012420654296875, - "delta_ref_ppl": -0.012664794921875, - "entropy_loss": -0.04449462890625, - "epoch": 0.2248, - "grad_norm": 0.914833809964045, - "k1_kl": 0.012664794921875, - "k3_kl": 0.008026123046875, - "kimi_kl": 0.0185546875, - "learning_rate": 3.876e-07, - "loss": 0.0003, - "ppl": 0.021881103515625, - "reward": 0.6622755080461502, - "reward_std": 0.0831866025691852, - "rewards/perpo_ocr_edit_distance_reward": 0.6622755378484726, + "advantages": -3.405979782655777e-07, + "completion_length": 785.0, + "delta_ref_entropy_loss": 0.053466796875, + "delta_ref_ppl": -0.033447265625, + "entropy_loss": -0.1123046875, + "epoch": 0.1124, + "grad_norm": 1.8823680085598722, + "k1_kl": 0.033447265625, + "k3_kl": 0.0198974609375, + "kimi_kl": 0.04541015625, + "learning_rate": 4.4379999999999994e-07, + "loss": 0.0008, + "ppl": 0.0595703125, + "reward": 0.8305776119232178, + "reward_std": 0.12270724773406982, + "rewards/perpo_ocr_edit_distance_reward": 0.8305776715278625, "step": 562, "temperature": 0.9 }, { - "advantages": -1.3198172382544726e-05, - "completion_length": 456.0, - "delta_ref_entropy_loss": 0.03009033203125, - "delta_ref_ppl": -0.0255126953125, - "entropy_loss": -0.0404052734375, - "epoch": 0.2252, - "grad_norm": 2.8144441664819735, - "k1_kl": 0.02545166015625, - "k3_kl": 0.01519775390625, - "kimi_kl": 0.03216552734375, - "learning_rate": 3.874e-07, + "advantages": -4.711321525974199e-05, + "completion_length": 209.0, + "delta_ref_entropy_loss": 0.0341796875, + "delta_ref_ppl": -0.0262451171875, + "entropy_loss": -0.026611328125, + "epoch": 0.1126, + "grad_norm": 1.1941670882031847, + "k1_kl": 0.0264892578125, + "k3_kl": 0.01470947265625, + "kimi_kl": 0.033447265625, + "learning_rate": 4.437e-07, "loss": 0.0006, - "ppl": 0.0216064453125, - "reward": 0.9952777624130249, - "reward_std": 0.0012299322406761348, - "rewards/perpo_ocr_edit_distance_reward": 0.9952778518199921, + "ppl": 0.00982666015625, + "reward": 0.9590492248535156, + "reward_std": 0.0006226484547369182, + "rewards/perpo_ocr_edit_distance_reward": 0.9590492248535156, "step": 563, "temperature": 0.9 }, { - "advantages": -4.451189775522835e-05, - "completion_length": 727.5, - "delta_ref_entropy_loss": 0.01904296875, - "delta_ref_ppl": -0.01593017578125, - "entropy_loss": -0.02374267578125, - "epoch": 0.2256, - "grad_norm": 3.1749598087295268, - "k1_kl": 0.015869140625, - "k3_kl": 0.009735107421875, - "kimi_kl": 0.020904541015625, - "learning_rate": 3.8719999999999997e-07, - "loss": 0.0004, - "ppl": 0.012847900390625, - "reward": 0.9881956279277802, - "reward_std": 0.02799290034454316, - "rewards/perpo_ocr_edit_distance_reward": 0.9881957173347473, + "advantages": 3.2356808787881164e-07, + "completion_length": 658.0, + "delta_ref_entropy_loss": 0.06005859375, + "delta_ref_ppl": -0.050048828125, + "entropy_loss": -0.15625, + "epoch": 0.1128, + "grad_norm": 0.798309025286671, + "k1_kl": 0.0498046875, + "k3_kl": 0.0311279296875, + "kimi_kl": 0.058349609375, + "learning_rate": 4.436e-07, + "loss": 0.0012, + "ppl": 0.08740234375, + "reward": 0.5737996101379395, + "reward_std": 0.11682352423667908, + "rewards/perpo_ocr_edit_distance_reward": 0.5737996101379395, "step": 564, "temperature": 0.9 }, { - "advantages": -7.607255767094756e-05, - "completion_length": 823.5, - "delta_ref_entropy_loss": 0.02813720703125, - "delta_ref_ppl": -0.012603759765625, - "entropy_loss": -0.0177001953125, - "epoch": 0.226, - "grad_norm": 0.2107718158524253, - "k1_kl": 0.0125732421875, - "k3_kl": 0.00444793701171875, - "kimi_kl": 0.0067291259765625, - "learning_rate": 3.87e-07, - "loss": 0.0003, - "ppl": 0.0070037841796875, - "reward": 0.998479425907135, - "reward_std": 0.00048773227899800986, - "rewards/perpo_ocr_edit_distance_reward": 0.9984794855117798, + "advantages": -0.00010099155770149082, + "completion_length": 1036.0, + "delta_ref_entropy_loss": 0.0101318359375, + "delta_ref_ppl": -0.011962890625, + "entropy_loss": -0.0164794921875, + "epoch": 0.113, + "grad_norm": 1.4448440950752115, + "k1_kl": 0.01202392578125, + "k3_kl": 0.00787353515625, + "kimi_kl": 0.0211181640625, + "learning_rate": 4.4349999999999997e-07, + "loss": 0.0004, + "ppl": 0.0068359375, + "reward": 0.9860268831253052, + "reward_std": 0.00023720302851870656, + "rewards/perpo_ocr_edit_distance_reward": 0.9860270023345947, "step": 565, "temperature": 0.9 }, { - "advantages": -9.853499796008691e-05, - "completion_length": 684.0, - "delta_ref_entropy_loss": 0.027587890625, - "delta_ref_ppl": -0.018585205078125, - "entropy_loss": -0.025390625, - "epoch": 0.2264, - "grad_norm": 1.1357936542309275, - "k1_kl": 0.018524169921875, - "k3_kl": 0.01119232177734375, - "kimi_kl": 0.022308349609375, - "learning_rate": 3.8679999999999994e-07, - "loss": 0.0005, - "ppl": 0.0115966796875, - "reward": 0.9986551702022552, - "reward_std": 0.00067309764563106, - "rewards/perpo_ocr_edit_distance_reward": 0.9986552596092224, + "advantages": -1.011576023302041e-05, + "completion_length": 261.0, + "delta_ref_entropy_loss": 0.0211181640625, + "delta_ref_ppl": -0.025390625, + "entropy_loss": -0.0361328125, + "epoch": 0.1132, + "grad_norm": 2.23532166176894, + "k1_kl": 0.025390625, + "k3_kl": 0.019775390625, + "kimi_kl": 0.039306640625, + "learning_rate": 4.434e-07, + "loss": 0.0008, + "ppl": 0.0238037109375, + "reward": 0.9873991012573242, + "reward_std": 0.004114842973649502, + "rewards/perpo_ocr_edit_distance_reward": 0.987399160861969, "step": 566, "temperature": 0.9 }, { - "advantages": -2.772467541944934e-05, - "completion_length": 365.0, - "delta_ref_entropy_loss": 0.048583984375, - "delta_ref_ppl": -0.02642822265625, - "entropy_loss": -0.04443359375, - "epoch": 0.2268, - "grad_norm": 0.9920449010426747, - "k1_kl": 0.0263671875, - "k3_kl": 0.012298583984375, - "kimi_kl": 0.022705078125, - "learning_rate": 3.866e-07, - "loss": 0.0005, - "ppl": 0.0213623046875, - "reward": 0.8981716334819794, - "reward_std": 0.0010410965187475085, - "rewards/perpo_ocr_edit_distance_reward": 0.8981717228889465, + "advantages": -4.793916559719946e-06, + "completion_length": 748.0, + "delta_ref_entropy_loss": 0.0211181640625, + "delta_ref_ppl": -0.0181884765625, + "entropy_loss": -0.056884765625, + "epoch": 0.1134, + "grad_norm": 1.0844183397531666, + "k1_kl": 0.0181884765625, + "k3_kl": 0.01025390625, + "kimi_kl": 0.0220947265625, + "learning_rate": 4.433e-07, + "loss": 0.0004, + "ppl": 0.030517578125, + "reward": 0.9786426424980164, + "reward_std": 0.005197714548557997, + "rewards/perpo_ocr_edit_distance_reward": 0.9786426424980164, "step": 567, "temperature": 0.9 }, { - "advantages": -1.3470650628732983e-05, - "completion_length": 476.5, - "delta_ref_entropy_loss": 0.028564453125, - "delta_ref_ppl": -0.0413818359375, - "entropy_loss": -0.0242919921875, - "epoch": 0.2272, - "grad_norm": 1.2732030844649531, - "k1_kl": 0.0413818359375, - "k3_kl": 0.02447509765625, - "kimi_kl": 0.0509033203125, - "learning_rate": 3.864e-07, - "loss": 0.001, - "ppl": 0.010589599609375, - "reward": 0.9573783576488495, - "reward_std": 0.0007094333413988352, - "rewards/perpo_ocr_edit_distance_reward": 0.9573783874511719, + "advantages": -7.987022399902344e-06, + "completion_length": 964.0, + "delta_ref_entropy_loss": 0.049560546875, + "delta_ref_ppl": -0.04638671875, + "entropy_loss": -0.087890625, + "epoch": 0.1136, + "grad_norm": 1.2082581869002933, + "k1_kl": 0.04638671875, + "k3_kl": 0.0301513671875, + "kimi_kl": 0.056396484375, + "learning_rate": 4.4319999999999995e-07, + "loss": 0.0012, + "ppl": 0.0498046875, + "reward": 0.972919225692749, + "reward_std": 0.004160560201853514, + "rewards/perpo_ocr_edit_distance_reward": 0.9729192852973938, "step": 568, "temperature": 0.9 }, { - "advantages": -0.00036347338755149394, - "completion_length": 577.5, - "delta_ref_entropy_loss": 0.01666259765625, - "delta_ref_ppl": -0.01025390625, - "entropy_loss": -0.0074615478515625, - "epoch": 0.2276, - "grad_norm": 0.2209727806398196, - "k1_kl": 0.01024627685546875, - "k3_kl": 0.005828857421875, - "kimi_kl": 0.02115631103515625, - "learning_rate": 3.8619999999999995e-07, - "loss": 0.0006, - "ppl": 0.0028858184814453125, - "reward": 0.9990736246109009, - "reward_std": 0.0002752800064627081, - "rewards/perpo_ocr_edit_distance_reward": 0.9990737438201904, + "advantages": -7.18661749488092e-06, + "completion_length": 475.0, + "delta_ref_entropy_loss": 0.0625, + "delta_ref_ppl": -0.048828125, + "entropy_loss": -0.09423828125, + "epoch": 0.1138, + "grad_norm": 1.690469143772312, + "k1_kl": 0.049072265625, + "k3_kl": 0.0277099609375, + "kimi_kl": 0.058837890625, + "learning_rate": 4.431e-07, + "loss": 0.0011, + "ppl": 0.049072265625, + "reward": 0.9471304416656494, + "reward_std": 0.004611714277416468, + "rewards/perpo_ocr_edit_distance_reward": 0.9471305012702942, "step": 569, "temperature": 0.9 }, { - "advantages": -2.5459700111696293e-05, - "completion_length": 251.0, - "delta_ref_entropy_loss": 0.066162109375, - "delta_ref_ppl": -0.0509033203125, - "entropy_loss": -0.070068359375, - "epoch": 0.228, - "grad_norm": 3.43791746535565, - "k1_kl": 0.0511474609375, - "k3_kl": 0.0279541015625, - "kimi_kl": 0.044677734375, - "learning_rate": 3.86e-07, + "advantages": -1.2491431334638037e-05, + "completion_length": 777.0, + "delta_ref_entropy_loss": 0.038330078125, + "delta_ref_ppl": -0.044189453125, + "entropy_loss": -0.05615234375, + "epoch": 0.114, + "grad_norm": 4.036358348782312, + "k1_kl": 0.04443359375, + "k3_kl": 0.0277099609375, + "kimi_kl": 0.08154296875, + "learning_rate": 4.43e-07, "loss": 0.0011, - "ppl": 0.039306640625, - "reward": 0.9834538400173187, - "reward_std": 0.0032831517746672034, - "rewards/perpo_ocr_edit_distance_reward": 0.9834539592266083, + "ppl": 0.0286865234375, + "reward": 0.9708544611930847, + "reward_std": 0.003305662889033556, + "rewards/perpo_ocr_edit_distance_reward": 0.9708544611930847, "step": 570, "temperature": 0.9 }, { - "advantages": -3.102847585978452e-05, - "completion_length": 770.0, - "delta_ref_entropy_loss": 0.046539306640625, - "delta_ref_ppl": -0.03118896484375, - "entropy_loss": -0.086181640625, - "epoch": 0.2284, - "grad_norm": 1.4661209624354987, - "k1_kl": 0.03131103515625, - "k3_kl": 0.015354156494140625, - "kimi_kl": 0.02680206298828125, - "learning_rate": 3.858e-07, - "loss": 0.0006, - "ppl": 0.050933837890625, - "reward": 0.8708511590957642, - "reward_std": 0.0017592764925211668, - "rewards/perpo_ocr_edit_distance_reward": 0.8708512187004089, + "advantages": -1.559087286295835e-05, + "completion_length": 265.0, + "delta_ref_entropy_loss": 0.03662109375, + "delta_ref_ppl": -0.054443359375, + "entropy_loss": -0.04052734375, + "epoch": 0.1142, + "grad_norm": 1.2047871697168653, + "k1_kl": 0.0546875, + "k3_kl": 0.03857421875, + "kimi_kl": 0.09521484375, + "learning_rate": 4.429e-07, + "loss": 0.0016, + "ppl": 0.021728515625, + "reward": 0.9774227142333984, + "reward_std": 0.004263988696038723, + "rewards/perpo_ocr_edit_distance_reward": 0.977422833442688, "step": 571, "temperature": 0.9 }, { - "advantages": -2.9759749367030963e-06, - "completion_length": 907.0, - "delta_ref_entropy_loss": 0.03436279296875, - "delta_ref_ppl": -0.0193634033203125, - "entropy_loss": -0.0750732421875, - "epoch": 0.2288, - "grad_norm": 9.851172457631767, - "k1_kl": 0.0194091796875, - "k3_kl": 0.02667236328125, - "kimi_kl": 0.027099609375, - "learning_rate": 3.8559999999999996e-07, - "loss": 0.0011, - "ppl": 0.0496826171875, - "reward": 0.902690052986145, - "reward_std": 0.11949722352437675, - "rewards/perpo_ocr_edit_distance_reward": 0.9026901125907898, + "advantages": -4.0309772884938866e-05, + "completion_length": 450.0, + "delta_ref_entropy_loss": 0.0693359375, + "delta_ref_ppl": -0.07080078125, + "entropy_loss": -0.10595703125, + "epoch": 0.1144, + "grad_norm": 1.1544926636730257, + "k1_kl": 0.07080078125, + "k3_kl": 0.04541015625, + "kimi_kl": 0.09326171875, + "learning_rate": 4.428e-07, + "loss": 0.0019, + "ppl": 0.058349609375, + "reward": 0.9215528964996338, + "reward_std": 0.0011671031825244427, + "rewards/perpo_ocr_edit_distance_reward": 0.9215529561042786, "step": 572, "temperature": 0.9 }, { - "advantages": -3.015143634632622e-05, - "completion_length": 462.0, - "delta_ref_entropy_loss": 0.0443115234375, - "delta_ref_ppl": -0.03240966796875, - "entropy_loss": -0.02825927734375, - "epoch": 0.2292, - "grad_norm": 0.4067834641009614, - "k1_kl": 0.03253173828125, - "k3_kl": 0.01678466796875, - "kimi_kl": 0.03192138671875, - "learning_rate": 3.854e-07, - "loss": 0.0007, - "ppl": 0.013397216796875, - "reward": 0.984885960817337, - "reward_std": 0.00045506823516916484, - "rewards/perpo_ocr_edit_distance_reward": 0.9848860204219818, + "advantages": -4.174028435954824e-05, + "completion_length": 355.0, + "delta_ref_entropy_loss": 0.0230712890625, + "delta_ref_ppl": -0.0478515625, + "entropy_loss": -0.025634765625, + "epoch": 0.1146, + "grad_norm": 0.7541393964431489, + "k1_kl": 0.0478515625, + "k3_kl": 0.0306396484375, + "kimi_kl": 0.0869140625, + "learning_rate": 4.4269999999999996e-07, + "loss": 0.0013, + "ppl": 0.01287841796875, + "reward": 0.9912450313568115, + "reward_std": 0.000715952250175178, + "rewards/perpo_ocr_edit_distance_reward": 0.9912450909614563, "step": 573, "temperature": 0.9 }, { - "advantages": -2.6243074444209924e-05, - "completion_length": 843.5, - "delta_ref_entropy_loss": 0.020263671875, - "delta_ref_ppl": -0.011962890625, - "entropy_loss": -0.01800537109375, - "epoch": 0.2296, - "grad_norm": 0.3804415321896807, - "k1_kl": 0.011962890625, - "k3_kl": 0.00579833984375, - "kimi_kl": 0.0111083984375, - "learning_rate": 3.852e-07, - "loss": 0.0003, - "ppl": 0.008392333984375, - "reward": 0.99599289894104, - "reward_std": 0.0019342911546118557, - "rewards/perpo_ocr_edit_distance_reward": 0.9959929287433624, + "advantages": -2.3058482838678174e-05, + "completion_length": 25.0, + "delta_ref_entropy_loss": 0.154296875, + "delta_ref_ppl": -0.3203125, + "entropy_loss": -0.11328125, + "epoch": 0.1148, + "grad_norm": 12.56905303321582, + "k1_kl": 0.318359375, + "k3_kl": 0.2109375, + "kimi_kl": 0.58203125, + "learning_rate": 4.4259999999999995e-07, + "loss": 0.0085, + "ppl": 0.03271484375, + "reward": 0.665413498878479, + "reward_std": 0.0024865989107638597, + "rewards/perpo_ocr_edit_distance_reward": 0.6654136180877686, "step": 574, "temperature": 0.9 }, { - "advantages": -3.8181035577622424e-05, - "completion_length": 350.5, - "delta_ref_entropy_loss": 0.0347900390625, - "delta_ref_ppl": -0.101348876953125, - "entropy_loss": -0.169677734375, - "epoch": 0.23, - "grad_norm": 2.8304789356282907, - "k1_kl": 0.101348876953125, - "k3_kl": 0.0785675048828125, - "kimi_kl": 0.227264404296875, - "learning_rate": 3.8499999999999997e-07, - "loss": 0.0032, - "ppl": 0.066192626953125, - "reward": 0.8314715027809143, - "reward_std": 0.04077840957324952, - "rewards/perpo_ocr_edit_distance_reward": 0.8314715921878815, + "advantages": -4.036086011183215e-06, + "completion_length": 776.0, + "delta_ref_entropy_loss": 0.043701171875, + "delta_ref_ppl": -0.037109375, + "entropy_loss": -0.049560546875, + "epoch": 0.115, + "grad_norm": 1.0908753844579044, + "k1_kl": 0.037109375, + "k3_kl": 0.0206298828125, + "kimi_kl": 0.044677734375, + "learning_rate": 4.425e-07, + "loss": 0.0008, + "ppl": 0.025634765625, + "reward": 0.9386169910430908, + "reward_std": 0.016677793115377426, + "rewards/perpo_ocr_edit_distance_reward": 0.9386170506477356, "step": 575, "temperature": 0.9 }, { - "advantages": -0.00032579047547187656, - "completion_length": 1059.0, - "delta_ref_entropy_loss": 0.0252685546875, - "delta_ref_ppl": -0.012115478515625, - "entropy_loss": -0.039276123046875, - "epoch": 0.2304, - "grad_norm": 0.5276272704521737, - "k1_kl": 0.01214599609375, - "k3_kl": 0.00542449951171875, - "kimi_kl": 0.0094757080078125, - "learning_rate": 3.8479999999999995e-07, - "loss": 0.0005, - "ppl": 0.0180206298828125, - "reward": 0.9966422319412231, - "reward_std": 0.0007933140150271356, - "rewards/perpo_ocr_edit_distance_reward": 0.9966423511505127, + "advantages": -6.709780336677795e-06, + "completion_length": 890.0, + "delta_ref_entropy_loss": 0.07861328125, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.09521484375, + "epoch": 0.1152, + "grad_norm": 1.2931078354667986, + "k1_kl": 0.0703125, + "k3_kl": 0.046875, + "kimi_kl": 0.1201171875, + "learning_rate": 4.424e-07, + "loss": 0.0019, + "ppl": 0.0546875, + "reward": 0.8717122673988342, + "reward_std": 0.004987405613064766, + "rewards/perpo_ocr_edit_distance_reward": 0.871712327003479, "step": 576, "temperature": 0.9 }, { - "advantages": -7.450580596923828e-05, - "completion_length": 145.0, - "delta_ref_entropy_loss": 0.0523681640625, - "delta_ref_ppl": -0.0509033203125, - "entropy_loss": -0.028564453125, - "epoch": 0.2308, - "grad_norm": 0.039962299014137975, - "k1_kl": 0.0509033203125, - "k3_kl": 0.02703857421875, - "kimi_kl": 0.0556640625, - "learning_rate": 3.846e-07, - "loss": 0.0012, - "ppl": 0.012603759765625, - "reward": 0.6188769415020943, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.6188769489526749, + "advantages": -7.162775727920234e-05, + "completion_length": 772.0, + "delta_ref_entropy_loss": 0.064453125, + "delta_ref_ppl": -0.042236328125, + "entropy_loss": -0.12158203125, + "epoch": 0.1154, + "grad_norm": 0.9867844916656731, + "k1_kl": 0.042236328125, + "k3_kl": 0.0250244140625, + "kimi_kl": 0.055419921875, + "learning_rate": 4.423e-07, + "loss": 0.0011, + "ppl": 0.06396484375, + "reward": 0.9355924725532532, + "reward_std": 0.0007318864809349179, + "rewards/perpo_ocr_edit_distance_reward": 0.935592532157898, "step": 577, "temperature": 0.9 }, { - "advantages": -2.9593707040476147e-05, - "completion_length": 1079.5, - "delta_ref_entropy_loss": 0.02532958984375, - "delta_ref_ppl": -0.014434814453125, - "entropy_loss": -0.0311279296875, - "epoch": 0.2312, - "grad_norm": 1.2950828112730985, - "k1_kl": 0.014373779296875, - "k3_kl": 0.007598876953125, - "kimi_kl": 0.015960693359375, - "learning_rate": 3.8440000000000003e-07, - "loss": 0.0003, - "ppl": 0.0133056640625, - "reward": 0.968134194612503, - "reward_std": 0.006209185463376343, - "rewards/perpo_ocr_edit_distance_reward": 0.9681343138217926, + "advantages": -4.01054148824187e-06, + "completion_length": 481.0, + "delta_ref_entropy_loss": 0.0361328125, + "delta_ref_ppl": -0.0308837890625, + "entropy_loss": -0.04638671875, + "epoch": 0.1156, + "grad_norm": 1.2557278772199718, + "k1_kl": 0.0308837890625, + "k3_kl": 0.0213623046875, + "kimi_kl": 0.049560546875, + "learning_rate": 4.422e-07, + "loss": 0.0009, + "ppl": 0.026611328125, + "reward": 0.9884302616119385, + "reward_std": 0.004134356044232845, + "rewards/perpo_ocr_edit_distance_reward": 0.9884303212165833, "step": 578, "temperature": 0.9 }, { - "advantages": -7.109982846031926e-06, - "completion_length": 451.5, - "delta_ref_entropy_loss": 0.095703125, - "delta_ref_ppl": -0.0574951171875, - "entropy_loss": -0.0986328125, - "epoch": 0.2316, - "grad_norm": 1.1228014328223592, - "k1_kl": 0.0574951171875, - "k3_kl": 0.02838134765625, - "kimi_kl": 0.0496826171875, - "learning_rate": 3.8419999999999996e-07, - "loss": 0.0011, - "ppl": 0.05267333984375, - "reward": 0.7776265442371368, - "reward_std": 0.00780639355070889, - "rewards/perpo_ocr_edit_distance_reward": 0.777626633644104, + "advantages": -1.348768000752898e-05, + "completion_length": 263.0, + "delta_ref_entropy_loss": 0.01055908203125, + "delta_ref_ppl": -0.0478515625, + "entropy_loss": -0.04052734375, + "epoch": 0.1158, + "grad_norm": 2.186896910378309, + "k1_kl": 0.0478515625, + "k3_kl": 0.035400390625, + "kimi_kl": 0.10302734375, + "learning_rate": 4.4209999999999997e-07, + "loss": 0.0014, + "ppl": 0.0223388671875, + "reward": 0.9774789214134216, + "reward_std": 0.003684730501845479, + "rewards/perpo_ocr_edit_distance_reward": 0.9774790406227112, "step": 579, "temperature": 0.9 }, { - "advantages": -0.00010834421755134827, - "completion_length": 430.5, - "delta_ref_entropy_loss": 0.0509033203125, - "delta_ref_ppl": -0.04132080078125, - "entropy_loss": -0.0692138671875, - "epoch": 0.232, - "grad_norm": 0.7918993453790903, - "k1_kl": 0.0411376953125, - "k3_kl": 0.0250244140625, - "kimi_kl": 0.0721435546875, - "learning_rate": 3.84e-07, - "loss": 0.0011, - "ppl": 0.03692626953125, - "reward": 0.9669206738471985, - "reward_std": 0.0006808157631894574, - "rewards/perpo_ocr_edit_distance_reward": 0.9669207632541656, + "advantages": -1.0848046258615796e-05, + "completion_length": 1795.0, + "delta_ref_entropy_loss": 0.04345703125, + "delta_ref_ppl": -0.03369140625, + "entropy_loss": -0.0947265625, + "epoch": 0.116, + "grad_norm": 1.9737223021513999, + "k1_kl": 0.03369140625, + "k3_kl": 0.0238037109375, + "kimi_kl": 0.06201171875, + "learning_rate": 4.4199999999999996e-07, + "loss": 0.001, + "ppl": 0.051025390625, + "reward": 0.9600529670715332, + "reward_std": 0.006985116750001907, + "rewards/perpo_ocr_edit_distance_reward": 0.960053026676178, "step": 580, "temperature": 0.9 }, { - "advantages": -1.7021384337567724e-05, - "completion_length": 558.0, - "delta_ref_entropy_loss": 0.0372314453125, - "delta_ref_ppl": -0.032623291015625, - "entropy_loss": -0.02825927734375, - "epoch": 0.2324, - "grad_norm": 0.7252439538412511, - "k1_kl": 0.03265380859375, - "k3_kl": 0.0195159912109375, - "kimi_kl": 0.04388427734375, - "learning_rate": 3.838e-07, - "loss": 0.0008, - "ppl": 0.0115203857421875, - "reward": 0.9453526735305786, - "reward_std": 0.0013673290814040229, - "rewards/perpo_ocr_edit_distance_reward": 0.9453526735305786, + "advantages": 0.0, + "completion_length": 962.0, + "delta_ref_entropy_loss": 0.011962890625, + "delta_ref_ppl": -0.013671875, + "entropy_loss": -0.02197265625, + "epoch": 0.1162, + "grad_norm": 0.5607778454767902, + "k1_kl": 0.013671875, + "k3_kl": 0.01055908203125, + "kimi_kl": 0.0224609375, + "learning_rate": 4.419e-07, + "loss": 0.0004, + "ppl": 0.0113525390625, + "reward": 0.9846698641777039, + "reward_std": 0.003219959558919072, + "rewards/perpo_ocr_edit_distance_reward": 0.9846699237823486, "step": 581, "temperature": 0.9 }, { - "advantages": -3.66142822727511e-07, - "completion_length": 585.5, - "delta_ref_entropy_loss": 0.0428466796875, - "delta_ref_ppl": -0.06964111328125, - "entropy_loss": -0.13916015625, - "epoch": 0.2328, - "grad_norm": 1.5389058024345539, - "k1_kl": 0.06951904296875, - "k3_kl": 0.05218505859375, - "kimi_kl": 0.192626953125, - "learning_rate": 3.8359999999999997e-07, - "loss": 0.0021, - "ppl": 0.077392578125, - "reward": 0.7726686298847198, - "reward_std": 0.07885318249464035, - "rewards/perpo_ocr_edit_distance_reward": 0.7726686894893646, + "advantages": -1.8835067749023438e-05, + "completion_length": 428.0, + "delta_ref_entropy_loss": 0.05712890625, + "delta_ref_ppl": -0.068359375, + "entropy_loss": -0.068359375, + "epoch": 0.1164, + "grad_norm": 1.554960031208612, + "k1_kl": 0.06884765625, + "k3_kl": 0.041015625, + "kimi_kl": 0.09716796875, + "learning_rate": 4.418e-07, + "loss": 0.0017, + "ppl": 0.037109375, + "reward": 0.9903354644775391, + "reward_std": 0.002160867443308234, + "rewards/perpo_ocr_edit_distance_reward": 0.9903355836868286, "step": 582, "temperature": 0.9 }, { - "advantages": -2.86144877463812e-05, - "completion_length": 638.5, - "delta_ref_entropy_loss": 0.06787109375, - "delta_ref_ppl": -0.0426025390625, - "entropy_loss": -0.0703125, - "epoch": 0.2332, - "grad_norm": 0.6892186907504712, - "k1_kl": 0.04248046875, - "k3_kl": 0.021484375, - "kimi_kl": 0.0516357421875, - "learning_rate": 3.834e-07, - "loss": 0.0009, - "ppl": 0.03765869140625, - "reward": 0.9725810587406158, - "reward_std": 0.0010961980442516506, - "rewards/perpo_ocr_edit_distance_reward": 0.9725811183452606, + "advantages": -1.151221204054309e-05, + "completion_length": 1083.0, + "delta_ref_entropy_loss": 0.01373291015625, + "delta_ref_ppl": -0.0166015625, + "entropy_loss": -0.023681640625, + "epoch": 0.1166, + "grad_norm": 0.5280820187361193, + "k1_kl": 0.0167236328125, + "k3_kl": 0.00921630859375, + "kimi_kl": 0.018310546875, + "learning_rate": 4.4169999999999994e-07, + "loss": 0.0004, + "ppl": 0.0126953125, + "reward": 0.9861733317375183, + "reward_std": 0.005080812145024538, + "rewards/perpo_ocr_edit_distance_reward": 0.9861734509468079, "step": 583, "temperature": 0.9 }, { - "advantages": -0.00013882561013645045, - "completion_length": 677.5, - "delta_ref_entropy_loss": 0.03173828125, - "delta_ref_ppl": -0.026947021484375, - "entropy_loss": -0.031585693359375, - "epoch": 0.2336, - "grad_norm": 0.804955298370257, - "k1_kl": 0.026947021484375, - "k3_kl": 0.01513671875, - "kimi_kl": 0.0491790771484375, - "learning_rate": 3.832e-07, - "loss": 0.0007, - "ppl": 0.01413726806640625, - "reward": 0.9476704299449921, - "reward_std": 0.024955672677606344, - "rewards/perpo_ocr_edit_distance_reward": 0.9476704895496368, + "advantages": -9.434564162802417e-06, + "completion_length": 593.0, + "delta_ref_entropy_loss": 0.0947265625, + "delta_ref_ppl": -0.08642578125, + "entropy_loss": -0.232421875, + "epoch": 0.1168, + "grad_norm": 1.9644600122980425, + "k1_kl": 0.0869140625, + "k3_kl": 0.051513671875, + "kimi_kl": 0.10693359375, + "learning_rate": 4.416e-07, + "loss": 0.0021, + "ppl": 0.1259765625, + "reward": 0.8719926476478577, + "reward_std": 0.003506341017782688, + "rewards/perpo_ocr_edit_distance_reward": 0.8719927668571472, "step": 584, "temperature": 0.9 }, { - "advantages": -1.328332150052347e-06, - "completion_length": 403.5, - "delta_ref_entropy_loss": 0.09619140625, - "delta_ref_ppl": -0.057373046875, - "entropy_loss": -0.169189453125, - "epoch": 0.234, - "grad_norm": 1.5284717372681382, - "k1_kl": 0.0574951171875, - "k3_kl": 0.0306396484375, - "kimi_kl": 0.0760498046875, - "learning_rate": 3.83e-07, - "loss": 0.0012, - "ppl": 0.0948486328125, - "reward": 0.8298505544662476, - "reward_std": 0.11041353456676006, - "rewards/perpo_ocr_edit_distance_reward": 0.8298506438732147, + "advantages": -0.00022768124472349882, + "completion_length": 290.0, + "delta_ref_entropy_loss": 0.0654296875, + "delta_ref_ppl": -0.11376953125, + "entropy_loss": -0.050048828125, + "epoch": 0.117, + "grad_norm": 0.8644200212017725, + "k1_kl": 0.11376953125, + "k3_kl": 0.072265625, + "kimi_kl": 0.2138671875, + "learning_rate": 4.415e-07, + "loss": 0.0031, + "ppl": 0.0172119140625, + "reward": 0.907101571559906, + "reward_std": 0.0003862053854390979, + "rewards/perpo_ocr_edit_distance_reward": 0.9071016907691956, "step": 585, "temperature": 0.9 }, { - "advantages": -7.399491187243257e-05, - "completion_length": 1120.5, - "delta_ref_entropy_loss": 0.0428466796875, - "delta_ref_ppl": -0.02996826171875, - "entropy_loss": -0.0389404296875, - "epoch": 0.2344, - "grad_norm": 1.0584756734460872, - "k1_kl": 0.030029296875, - "k3_kl": 0.01470947265625, - "kimi_kl": 0.02685546875, - "learning_rate": 3.8279999999999996e-07, - "loss": 0.0007, - "ppl": 0.02105712890625, - "reward": 0.9976195991039276, - "reward_std": 0.0006910289666848257, - "rewards/perpo_ocr_edit_distance_reward": 0.9976197183132172, + "advantages": 4.344327317085117e-05, + "completion_length": 288.0, + "delta_ref_entropy_loss": 0.052734375, + "delta_ref_ppl": -0.06884765625, + "entropy_loss": -0.0302734375, + "epoch": 0.1172, + "grad_norm": 0.7048619248558865, + "k1_kl": 0.06884765625, + "k3_kl": 0.042236328125, + "kimi_kl": 0.10693359375, + "learning_rate": 4.4139999999999997e-07, + "loss": 0.0017, + "ppl": 0.01092529296875, + "reward": 0.9795359373092651, + "reward_std": 0.0004877088067587465, + "rewards/perpo_ocr_edit_distance_reward": 0.9795359373092651, "step": 586, "temperature": 0.9 }, { - "advantages": -2.396532659076911e-05, - "completion_length": 430.5, - "delta_ref_entropy_loss": 0.05487060546875, - "delta_ref_ppl": -0.056884765625, - "entropy_loss": -0.0841064453125, - "epoch": 0.2348, - "grad_norm": 0.8172908311456861, - "k1_kl": 0.056884765625, - "k3_kl": 0.0340576171875, - "kimi_kl": 0.0904541015625, - "learning_rate": 3.826e-07, - "loss": 0.0014, - "ppl": 0.045318603515625, - "reward": 0.8822779357433319, - "reward_std": 0.007002691156230867, - "rewards/perpo_ocr_edit_distance_reward": 0.8822779953479767, + "advantages": -7.322856845348724e-07, + "completion_length": 386.0, + "delta_ref_entropy_loss": 0.09423828125, + "delta_ref_ppl": -0.0751953125, + "entropy_loss": -0.134765625, + "epoch": 0.1174, + "grad_norm": 1.6139585575395092, + "k1_kl": 0.0751953125, + "k3_kl": 0.056396484375, + "kimi_kl": 0.12109375, + "learning_rate": 4.413e-07, + "loss": 0.0023, + "ppl": 0.07861328125, + "reward": 0.8905905485153198, + "reward_std": 0.011235273443162441, + "rewards/perpo_ocr_edit_distance_reward": 0.8905905485153198, "step": 587, "temperature": 0.9 }, { - "advantages": -6.518193913507275e-06, - "completion_length": 558.0, - "delta_ref_entropy_loss": 0.0364990234375, - "delta_ref_ppl": -0.03826904296875, - "entropy_loss": -0.0389404296875, - "epoch": 0.2352, - "grad_norm": 0.9309684448899371, - "k1_kl": 0.0382080078125, - "k3_kl": 0.022857666015625, - "kimi_kl": 0.0516357421875, - "learning_rate": 3.824e-07, - "loss": 0.0009, - "ppl": 0.01898193359375, - "reward": 0.9774890840053558, - "reward_std": 0.008190528955310583, - "rewards/perpo_ocr_edit_distance_reward": 0.977489173412323, + "advantages": -2.3296901417779736e-05, + "completion_length": 576.0, + "delta_ref_entropy_loss": 0.05322265625, + "delta_ref_ppl": -0.052001953125, + "entropy_loss": -0.0281982421875, + "epoch": 0.1176, + "grad_norm": 0.773763621715305, + "k1_kl": 0.052001953125, + "k3_kl": 0.03173828125, + "kimi_kl": 0.09375, + "learning_rate": 4.4119999999999995e-07, + "loss": 0.0013, + "ppl": 0.01202392578125, + "reward": 0.9904341101646423, + "reward_std": 0.0009963181801140308, + "rewards/perpo_ocr_edit_distance_reward": 0.9904341697692871, "step": 588, "temperature": 0.9 }, { - "advantages": -1.1972019137829193e-05, - "completion_length": 342.5, - "delta_ref_entropy_loss": 0.093505859375, - "delta_ref_ppl": -0.059814453125, - "entropy_loss": -0.117919921875, - "epoch": 0.2356, - "grad_norm": 1.9144760654069108, - "k1_kl": 0.059814453125, - "k3_kl": 0.0296630859375, - "kimi_kl": 0.060546875, - "learning_rate": 3.8219999999999997e-07, + "advantages": -1.813684320950415e-05, + "completion_length": 564.0, + "delta_ref_entropy_loss": 0.0478515625, + "delta_ref_ppl": -0.05126953125, + "entropy_loss": -0.01708984375, + "epoch": 0.1178, + "grad_norm": 0.8497558493033519, + "k1_kl": 0.051513671875, + "k3_kl": 0.0306396484375, + "kimi_kl": 0.07373046875, + "learning_rate": 4.4109999999999995e-07, "loss": 0.0012, - "ppl": 0.0694580078125, - "reward": 0.9155579805374146, - "reward_std": 0.006548156728968024, - "rewards/perpo_ocr_edit_distance_reward": 0.9155580401420593, + "ppl": 0.007171630859375, + "reward": 0.9859557151794434, + "reward_std": 0.0036489227786660194, + "rewards/perpo_ocr_edit_distance_reward": 0.9859558343887329, "step": 589, "temperature": 0.9 }, { - "advantages": -0.00029882362912303506, - "completion_length": 905.5, - "delta_ref_entropy_loss": 0.0216064453125, - "delta_ref_ppl": -0.01263427734375, - "entropy_loss": -0.0284423828125, - "epoch": 0.236, - "grad_norm": 20332754.154163472, - "k1_kl": 0.01263427734375, - "k3_kl": 56320.004150390625, - "kimi_kl": 0.11181640625, - "learning_rate": 3.82e-07, - "loss": 2250.4321, - "ppl": 0.022151947021484375, - "reward": 0.9902025163173676, - "reward_std": 0.007912581786513329, - "rewards/perpo_ocr_edit_distance_reward": 0.9902025759220123, + "advantages": -2.5468214516877197e-05, + "completion_length": 317.0, + "delta_ref_entropy_loss": 0.06396484375, + "delta_ref_ppl": -0.09326171875, + "entropy_loss": -0.0927734375, + "epoch": 0.118, + "grad_norm": 2.0378647633555262, + "k1_kl": 0.09375, + "k3_kl": 0.06591796875, + "kimi_kl": 0.1650390625, + "learning_rate": 4.41e-07, + "loss": 0.0027, + "ppl": 0.054443359375, + "reward": 0.9673790335655212, + "reward_std": 0.002905852161347866, + "rewards/perpo_ocr_edit_distance_reward": 0.9673791527748108, "step": 590, "temperature": 0.9 }, { - "advantages": -3.2859191378520336e-05, - "completion_length": 541.5, - "delta_ref_entropy_loss": 0.0872802734375, - "delta_ref_ppl": -0.0496826171875, - "entropy_loss": -0.1087646484375, - "epoch": 0.2364, - "grad_norm": 1.0965790505489317, - "k1_kl": 0.0496826171875, - "k3_kl": 0.0230712890625, - "kimi_kl": 0.0396728515625, - "learning_rate": 3.8179999999999994e-07, - "loss": 0.001, - "ppl": 0.06072998046875, - "reward": 0.9598098397254944, - "reward_std": 0.0028676943329628557, - "rewards/perpo_ocr_edit_distance_reward": 0.9598099291324615, + "advantages": -5.177089406060986e-06, + "completion_length": 1043.0, + "delta_ref_entropy_loss": 0.00775146484375, + "delta_ref_ppl": -0.017822265625, + "entropy_loss": -0.024658203125, + "epoch": 0.1182, + "grad_norm": 0.5146914815492182, + "k1_kl": 0.017822265625, + "k3_kl": 0.0128173828125, + "kimi_kl": 0.03662109375, + "learning_rate": 4.409e-07, + "loss": 0.0005, + "ppl": 0.01202392578125, + "reward": 0.9710227847099304, + "reward_std": 0.00154297414701432, + "rewards/perpo_ocr_edit_distance_reward": 0.9710227847099304, "step": 591, "temperature": 0.9 }, { - "advantages": -0.0003077387809753418, - "completion_length": 407.0, - "delta_ref_entropy_loss": 0.03472900390625, - "delta_ref_ppl": -0.03192138671875, - "entropy_loss": -0.0544281005859375, - "epoch": 0.2368, - "grad_norm": 0.7836462312765282, - "k1_kl": 0.03192138671875, - "k3_kl": 0.019611358642578125, - "kimi_kl": 0.03641510009765625, - "learning_rate": 3.816e-07, - "loss": 0.0011, - "ppl": 0.02725982666015625, - "reward": 0.987002968788147, - "reward_std": 0.0014845019904896617, - "rewards/perpo_ocr_edit_distance_reward": 0.9870030879974365, + "advantages": -2.7077539925812744e-06, + "completion_length": 1133.0, + "delta_ref_entropy_loss": 0.06787109375, + "delta_ref_ppl": -0.0390625, + "entropy_loss": -0.10009765625, + "epoch": 0.1184, + "grad_norm": 2.5474134162785846, + "k1_kl": 0.0390625, + "k3_kl": 0.022705078125, + "kimi_kl": 0.03515625, + "learning_rate": 4.4080000000000003e-07, + "loss": 0.0009, + "ppl": 0.05419921875, + "reward": 0.6004337072372437, + "reward_std": 0.012573358602821827, + "rewards/perpo_ocr_edit_distance_reward": 0.6004337668418884, "step": 592, "temperature": 0.9 }, { - "advantages": -0.0003268889031460276, - "completion_length": 521.0, - "delta_ref_entropy_loss": 0.0247802734375, - "delta_ref_ppl": -0.015594482421875, - "entropy_loss": -0.01519775390625, - "epoch": 0.2372, - "grad_norm": 0.8958987381073081, - "k1_kl": 0.015594482421875, - "k3_kl": 0.0079498291015625, - "kimi_kl": 0.014801025390625, - "learning_rate": 3.814e-07, - "loss": 0.0006, - "ppl": 0.00714111328125, - "reward": 0.9962724149227142, - "reward_std": 0.00031878476147539914, - "rewards/perpo_ocr_edit_distance_reward": 0.9962725043296814, + "advantages": 3.984996510553174e-06, + "completion_length": 468.0, + "delta_ref_entropy_loss": 0.08154296875, + "delta_ref_ppl": -0.07080078125, + "entropy_loss": -0.058349609375, + "epoch": 0.1186, + "grad_norm": 1.596421430989461, + "k1_kl": 0.07080078125, + "k3_kl": 0.039794921875, + "kimi_kl": 0.078125, + "learning_rate": 4.4069999999999997e-07, + "loss": 0.0016, + "ppl": 0.02783203125, + "reward": 0.9792989492416382, + "reward_std": 0.004189714323729277, + "rewards/perpo_ocr_edit_distance_reward": 0.9792988896369934, "step": 593, "temperature": 0.9 }, { - "advantages": -4.8143526782951085e-05, - "completion_length": 829.5, - "delta_ref_entropy_loss": 0.023773193359375, - "delta_ref_ppl": -0.01922607421875, - "entropy_loss": -0.0384521484375, - "epoch": 0.2376, - "grad_norm": 0.6711484870651444, - "k1_kl": 0.019134521484375, - "k3_kl": 0.0114593505859375, - "kimi_kl": 0.02655029296875, - "learning_rate": 3.8119999999999995e-07, - "loss": 0.0005, - "ppl": 0.0191650390625, - "reward": 0.9962091147899628, - "reward_std": 0.002795918844640255, - "rewards/perpo_ocr_edit_distance_reward": 0.9962091743946075, + "advantages": -2.717120514716953e-05, + "completion_length": 1041.0, + "delta_ref_entropy_loss": 0.020263671875, + "delta_ref_ppl": -0.020263671875, + "entropy_loss": -0.035400390625, + "epoch": 0.1188, + "grad_norm": 0.9108899744324487, + "k1_kl": 0.020263671875, + "k3_kl": 0.0162353515625, + "kimi_kl": 0.030517578125, + "learning_rate": 4.4059999999999996e-07, + "loss": 0.0007, + "ppl": 0.01904296875, + "reward": 0.981270432472229, + "reward_std": 0.0008405420230701566, + "rewards/perpo_ocr_edit_distance_reward": 0.9812705516815186, "step": 594, "temperature": 0.9 }, { - "advantages": -1.987389234159309e-05, - "completion_length": 1251.5, - "delta_ref_entropy_loss": 0.0931396484375, - "delta_ref_ppl": -0.1328125, - "entropy_loss": -0.1922607421875, - "epoch": 0.238, - "grad_norm": 2.5684657068296604, - "k1_kl": 0.13330078125, - "k3_kl": 0.09136962890625, - "kimi_kl": 0.3228759765625, - "learning_rate": 3.81e-07, - "loss": 0.0037, - "ppl": 0.10205078125, - "reward": 0.6280926614999771, - "reward_std": 0.04895892782951705, - "rewards/perpo_ocr_edit_distance_reward": 0.6280926764011383, + "advantages": -2.1917479898547754e-05, + "completion_length": 1375.0, + "delta_ref_entropy_loss": 0.01373291015625, + "delta_ref_ppl": -0.01531982421875, + "entropy_loss": -0.055419921875, + "epoch": 0.119, + "grad_norm": 1.0585933470166464, + "k1_kl": 0.0152587890625, + "k3_kl": 0.013671875, + "kimi_kl": 0.0294189453125, + "learning_rate": 4.405e-07, + "loss": 0.0006, + "ppl": 0.0286865234375, + "reward": 0.9892856478691101, + "reward_std": 0.002232712460681796, + "rewards/perpo_ocr_edit_distance_reward": 0.9892857670783997, "step": 595, "temperature": 0.9 }, { - "advantages": -6.986696462263353e-05, - "completion_length": 667.5, - "delta_ref_entropy_loss": 0.02471923828125, - "delta_ref_ppl": -0.016876220703125, - "entropy_loss": -0.02130126953125, - "epoch": 0.2384, - "grad_norm": 0.35187821185306145, - "k1_kl": 0.016876220703125, - "k3_kl": 0.009002685546875, - "kimi_kl": 0.019805908203125, - "learning_rate": 3.808e-07, - "loss": 0.0004, - "ppl": 0.011688232421875, - "reward": 0.9974454939365387, - "reward_std": 0.00044767851068172604, - "rewards/perpo_ocr_edit_distance_reward": 0.9974455833435059, + "advantages": -7.813317643012851e-05, + "completion_length": 589.0, + "delta_ref_entropy_loss": 0.07421875, + "delta_ref_ppl": -0.0556640625, + "entropy_loss": -0.0311279296875, + "epoch": 0.1192, + "grad_norm": 0.541758147924899, + "k1_kl": 0.0556640625, + "k3_kl": 0.031494140625, + "kimi_kl": 0.0712890625, + "learning_rate": 4.404e-07, + "loss": 0.0013, + "ppl": 0.01019287109375, + "reward": 0.8610711693763733, + "reward_std": 0.0006629105191677809, + "rewards/perpo_ocr_edit_distance_reward": 0.8610712885856628, "step": 596, "temperature": 0.9 }, { - "advantages": -0.00029842768398680164, - "completion_length": 242.5, - "delta_ref_entropy_loss": 0.02142333984375, - "delta_ref_ppl": -0.024169921875, - "entropy_loss": -0.08197021484375, - "epoch": 0.2388, - "grad_norm": 1.103478505306422, - "k1_kl": 0.024169921875, - "k3_kl": 0.013458251953125, - "kimi_kl": 0.023468017578125, - "learning_rate": 3.8059999999999995e-07, - "loss": 0.0008, - "ppl": 0.043670654296875, - "reward": 0.830331951379776, - "reward_std": 0.031910449266433716, - "rewards/perpo_ocr_edit_distance_reward": 0.8303320407867432, + "advantages": 3.137758994853357e-06, + "completion_length": 1006.0, + "delta_ref_entropy_loss": 0.0908203125, + "delta_ref_ppl": -0.056396484375, + "entropy_loss": -0.1337890625, + "epoch": 0.1194, + "grad_norm": 3.687165055106055, + "k1_kl": 0.056396484375, + "k3_kl": 0.0263671875, + "kimi_kl": 0.048095703125, + "learning_rate": 4.403e-07, + "loss": 0.001, + "ppl": 0.078125, + "reward": 0.8895310163497925, + "reward_std": 0.008068199269473553, + "rewards/perpo_ocr_edit_distance_reward": 0.8895310163497925, "step": 597, "temperature": 0.9 }, { - "advantages": -6.650175691902405e-06, - "completion_length": 651.0, - "delta_ref_entropy_loss": 0.0615234375, - "delta_ref_ppl": -0.05029296875, - "entropy_loss": -0.109619140625, - "epoch": 0.2392, - "grad_norm": 1.3282638826420192, - "k1_kl": 0.0504150390625, - "k3_kl": 0.025634765625, - "kimi_kl": 0.056640625, - "learning_rate": 3.804e-07, - "loss": 0.001, - "ppl": 0.0604248046875, - "reward": 0.9035579860210419, - "reward_std": 0.007942597614601254, - "rewards/perpo_ocr_edit_distance_reward": 0.9035580456256866, + "advantages": -7.169587661337573e-06, + "completion_length": 120.0, + "delta_ref_entropy_loss": 0.042236328125, + "delta_ref_ppl": -0.1669921875, + "entropy_loss": -0.07275390625, + "epoch": 0.1196, + "grad_norm": 2.55744142238455, + "k1_kl": 0.1669921875, + "k3_kl": 0.12255859375, + "kimi_kl": 0.353515625, + "learning_rate": 4.402e-07, + "loss": 0.0049, + "ppl": 0.042724609375, + "reward": 0.9660245776176453, + "reward_std": 0.0022752429358661175, + "rewards/perpo_ocr_edit_distance_reward": 0.96602463722229, "step": 598, "temperature": 0.9 }, { - "advantages": -3.537961583788274e-05, - "completion_length": 603.0, - "delta_ref_entropy_loss": 0.0400390625, - "delta_ref_ppl": -0.0245361328125, - "entropy_loss": -0.0477294921875, - "epoch": 0.2396, - "grad_norm": 0.5724283969950906, - "k1_kl": 0.0245361328125, - "k3_kl": 0.0111083984375, - "kimi_kl": 0.017547607421875, - "learning_rate": 3.802e-07, - "loss": 0.0005, - "ppl": 0.02691650390625, - "reward": 0.9121042191982269, - "reward_std": 0.0005552714719669893, - "rewards/perpo_ocr_edit_distance_reward": 0.9121042191982269, + "advantages": -5.27926886206842e-07, + "completion_length": 851.0, + "delta_ref_entropy_loss": 0.10693359375, + "delta_ref_ppl": -0.0693359375, + "entropy_loss": -0.16015625, + "epoch": 0.1198, + "grad_norm": 1.4640426497887722, + "k1_kl": 0.0693359375, + "k3_kl": 0.03955078125, + "kimi_kl": 0.0927734375, + "learning_rate": 4.401e-07, + "loss": 0.0016, + "ppl": 0.0810546875, + "reward": 0.8749405145645142, + "reward_std": 0.04744841158390045, + "rewards/perpo_ocr_edit_distance_reward": 0.8749405741691589, "step": 599, "temperature": 0.9 }, { - "advantages": -1.3623919414840202e-07, - "completion_length": 1128.0, - "delta_ref_entropy_loss": 0.026336669921875, - "delta_ref_ppl": -0.022735595703125, - "entropy_loss": -0.078826904296875, - "epoch": 0.24, - "grad_norm": 0.7609818298640336, - "k1_kl": 0.022735595703125, - "k3_kl": 0.01397705078125, - "kimi_kl": 0.0433349609375, - "learning_rate": 3.7999999999999996e-07, - "loss": 0.0006, - "ppl": 0.051532745361328125, - "reward": 0.8586277663707733, - "reward_std": 0.07536132633686066, - "rewards/perpo_ocr_edit_distance_reward": 0.8586277961730957, + "advantages": -1.158033137471648e-05, + "completion_length": 571.0, + "delta_ref_entropy_loss": 0.044921875, + "delta_ref_ppl": -0.057373046875, + "entropy_loss": -0.033935546875, + "epoch": 0.12, + "grad_norm": 0.6943082743146942, + "k1_kl": 0.0576171875, + "k3_kl": 0.037353515625, + "kimi_kl": 0.1279296875, + "learning_rate": 4.3999999999999997e-07, + "loss": 0.0015, + "ppl": 0.0157470703125, + "reward": 0.9905495047569275, + "reward_std": 0.002836356870830059, + "rewards/perpo_ocr_edit_distance_reward": 0.9905495643615723, "step": 600, "temperature": 0.9 }, { - "advantages": -2.487642541382229e-05, - "completion_length": 582.5, - "delta_ref_entropy_loss": 0.02386474609375, - "delta_ref_ppl": -0.020751953125, - "entropy_loss": -0.01947021484375, - "epoch": 0.2404, - "grad_norm": 0.33357160702530825, - "k1_kl": 0.020751953125, - "k3_kl": 0.0121917724609375, - "kimi_kl": 0.030731201171875, - "learning_rate": 3.798e-07, - "loss": 0.0005, - "ppl": 0.009246826171875, - "reward": 0.9968328773975372, - "reward_std": 0.000783987867180258, - "rewards/perpo_ocr_edit_distance_reward": 0.996832937002182, + "advantages": -4.9216409934160765e-06, + "completion_length": 1437.0, + "delta_ref_entropy_loss": 0.0439453125, + "delta_ref_ppl": -0.04150390625, + "entropy_loss": -0.056884765625, + "epoch": 0.1202, + "grad_norm": 6.273057717459858, + "k1_kl": 0.04150390625, + "k3_kl": 0.040283203125, + "kimi_kl": 0.060791015625, + "learning_rate": 4.399e-07, + "loss": 0.0016, + "ppl": 0.03271484375, + "reward": 0.9504016041755676, + "reward_std": 0.015465348958969116, + "rewards/perpo_ocr_edit_distance_reward": 0.950401782989502, "step": 601, "temperature": 0.9 }, { - "advantages": 1.6071967365860473e-06, - "completion_length": 342.0, - "delta_ref_entropy_loss": 0.0986328125, - "delta_ref_ppl": -0.093505859375, - "entropy_loss": -0.077880859375, - "epoch": 0.2408, - "grad_norm": 0.7093971303800936, - "k1_kl": 0.09326171875, - "k3_kl": 0.0517578125, - "kimi_kl": 0.12158203125, - "learning_rate": 3.796e-07, - "loss": 0.0021, - "ppl": 0.0367431640625, - "reward": 0.9083070755004883, - "reward_std": 0.0012711063027381897, - "rewards/perpo_ocr_edit_distance_reward": 0.9083071053028107, + "advantages": -6.811959707420101e-08, + "completion_length": 770.0, + "delta_ref_entropy_loss": 0.00872802734375, + "delta_ref_ppl": -0.0230712890625, + "entropy_loss": -0.060791015625, + "epoch": 0.1204, + "grad_norm": 1.1962676792441898, + "k1_kl": 0.0230712890625, + "k3_kl": 0.0166015625, + "kimi_kl": 0.0419921875, + "learning_rate": 4.398e-07, + "loss": 0.0007, + "ppl": 0.0264892578125, + "reward": 0.5776922106742859, + "reward_std": 0.40602022409439087, + "rewards/perpo_ocr_edit_distance_reward": 0.5776922106742859, "step": 602, "temperature": 0.9 }, { - "advantages": -2.8780530101357726e-06, - "completion_length": 662.0, - "delta_ref_entropy_loss": 0.03363037109375, - "delta_ref_ppl": -0.0142822265625, - "entropy_loss": -0.0225830078125, - "epoch": 0.2412, - "grad_norm": 0.23318034406995644, - "k1_kl": 0.0142822265625, - "k3_kl": 0.005035400390625, - "kimi_kl": 0.007904052734375, - "learning_rate": 3.794e-07, - "loss": 0.0002, - "ppl": 0.010101318359375, - "reward": 0.9972159564495087, - "reward_std": 0.0036553703248500824, - "rewards/perpo_ocr_edit_distance_reward": 0.9972159564495087, + "advantages": -2.6191984943579882e-05, + "completion_length": 495.0, + "delta_ref_entropy_loss": 0.056640625, + "delta_ref_ppl": -0.06787109375, + "entropy_loss": -0.017822265625, + "epoch": 0.1206, + "grad_norm": 0.9529199034926497, + "k1_kl": 0.06787109375, + "k3_kl": 0.0400390625, + "kimi_kl": 0.1103515625, + "learning_rate": 4.3969999999999995e-07, + "loss": 0.0016, + "ppl": 0.0079345703125, + "reward": 0.9950564503669739, + "reward_std": 0.0034702832344919443, + "rewards/perpo_ocr_edit_distance_reward": 0.9950566291809082, "step": 603, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 371.5, - "delta_ref_entropy_loss": 0.0389404296875, - "delta_ref_ppl": -0.03497314453125, - "entropy_loss": -0.02593994140625, - "epoch": 0.2416, - "grad_norm": 0.061300923634345826, - "k1_kl": 0.03497314453125, - "k3_kl": 0.021728515625, - "kimi_kl": 0.0479736328125, - "learning_rate": 3.7919999999999995e-07, - "loss": 0.0012, - "ppl": 0.010955810546875, - "reward": 0.995302826166153, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9953028857707977, + "advantages": -2.620901432237588e-05, + "completion_length": 735.0, + "delta_ref_entropy_loss": 0.08837890625, + "delta_ref_ppl": -0.10302734375, + "entropy_loss": -0.08740234375, + "epoch": 0.1208, + "grad_norm": 1.4281639076002972, + "k1_kl": 0.103515625, + "k3_kl": 0.060302734375, + "kimi_kl": 0.16015625, + "learning_rate": 4.396e-07, + "loss": 0.0024, + "ppl": 0.044921875, + "reward": 0.9588574767112732, + "reward_std": 0.0018501071026548743, + "rewards/perpo_ocr_edit_distance_reward": 0.9588575959205627, "step": 604, "temperature": 0.9 }, { - "advantages": -3.099015884799883e-05, - "completion_length": 568.0, - "delta_ref_entropy_loss": 0.0445556640625, - "delta_ref_ppl": -0.0201416015625, - "entropy_loss": -0.0537109375, - "epoch": 0.242, - "grad_norm": 1.1000631880881115, - "k1_kl": 0.02013397216796875, - "k3_kl": 0.00849151611328125, - "kimi_kl": 0.01331329345703125, - "learning_rate": 3.79e-07, - "loss": 0.0004, - "ppl": 0.026641845703125, - "reward": 0.9621269702911377, - "reward_std": 0.0022766528418287635, - "rewards/perpo_ocr_edit_distance_reward": 0.9621270596981049, + "advantages": 1.3777188542007934e-05, + "completion_length": 723.0, + "delta_ref_entropy_loss": 0.04150390625, + "delta_ref_ppl": -0.041015625, + "entropy_loss": -0.025634765625, + "epoch": 0.121, + "grad_norm": 0.9675056351722572, + "k1_kl": 0.041015625, + "k3_kl": 0.0225830078125, + "kimi_kl": 0.058349609375, + "learning_rate": 4.395e-07, + "loss": 0.0009, + "ppl": 0.0123291015625, + "reward": 0.9962537288665771, + "reward_std": 0.000516532629262656, + "rewards/perpo_ocr_edit_distance_reward": 0.9962537288665771, "step": 605, "temperature": 0.9 }, { - "advantages": -0.00035760232640313916, - "completion_length": 742.0, - "delta_ref_entropy_loss": 0.0347900390625, - "delta_ref_ppl": -0.018707275390625, - "entropy_loss": -0.02655029296875, - "epoch": 0.2424, - "grad_norm": 0.3124441753156252, - "k1_kl": 0.018707275390625, - "k3_kl": 0.0082244873046875, - "kimi_kl": 0.011962890625, - "learning_rate": 3.7880000000000003e-07, - "loss": 0.0007, - "ppl": 0.01141357421875, - "reward": 0.9930682182312012, - "reward_std": 0.0002358549099881202, - "rewards/perpo_ocr_edit_distance_reward": 0.9930683076381683, + "advantages": -8.97475729288999e-06, + "completion_length": 662.0, + "delta_ref_entropy_loss": 0.11669921875, + "delta_ref_ppl": -0.08056640625, + "entropy_loss": -0.1484375, + "epoch": 0.1212, + "grad_norm": 2.281814517447984, + "k1_kl": 0.08056640625, + "k3_kl": 0.039306640625, + "kimi_kl": 0.06640625, + "learning_rate": 4.394e-07, + "loss": 0.0016, + "ppl": 0.07568359375, + "reward": 0.877986490726471, + "reward_std": 0.004632795695215464, + "rewards/perpo_ocr_edit_distance_reward": 0.8779866099357605, "step": 606, "temperature": 0.9 }, { - "advantages": -6.743840231138165e-05, - "completion_length": 788.5, - "delta_ref_entropy_loss": 0.0487060546875, - "delta_ref_ppl": -0.02288818359375, - "entropy_loss": -0.03076171875, - "epoch": 0.2428, - "grad_norm": 0.4849245533816222, - "k1_kl": 0.02276611328125, - "k3_kl": 0.009307861328125, - "kimi_kl": 0.02032470703125, - "learning_rate": 3.7859999999999996e-07, - "loss": 0.0004, - "ppl": 0.0133056640625, - "reward": 0.9917984008789062, - "reward_std": 0.002502526345779188, - "rewards/perpo_ocr_edit_distance_reward": 0.9917984306812286, + "advantages": -5.10896995820076e-08, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.0228271484375, + "delta_ref_ppl": -0.03369140625, + "entropy_loss": -0.2578125, + "epoch": 0.1214, + "grad_norm": 3.8800703763098103, + "k1_kl": 0.033935546875, + "k3_kl": 0.02880859375, + "kimi_kl": 0.0615234375, + "learning_rate": 4.393e-07, + "loss": 0.0011, + "ppl": 0.134765625, + "reward": 0.05404976010322571, + "reward_std": 0.07386479526758194, + "rewards/perpo_ocr_edit_distance_reward": 0.05404976010322571, "step": 607, "temperature": 0.9 }, { - "advantages": -1.3756100997852627e-05, - "completion_length": 478.0, - "delta_ref_entropy_loss": 0.04705810546875, - "delta_ref_ppl": -0.0478515625, - "entropy_loss": -0.044189453125, - "epoch": 0.2432, - "grad_norm": 0.5068490854745467, - "k1_kl": 0.0478515625, - "k3_kl": 0.031005859375, - "kimi_kl": 0.08251953125, - "learning_rate": 3.784e-07, - "loss": 0.0013, - "ppl": 0.024749755859375, - "reward": 0.9797166287899017, - "reward_std": 0.0014199241995811462, - "rewards/perpo_ocr_edit_distance_reward": 0.9797166585922241, + "advantages": -1.648494253458921e-05, + "completion_length": 802.0, + "delta_ref_entropy_loss": 0.0262451171875, + "delta_ref_ppl": -0.0260009765625, + "entropy_loss": -0.0272216796875, + "epoch": 0.1216, + "grad_norm": 0.8386017125802034, + "k1_kl": 0.0260009765625, + "k3_kl": 0.013916015625, + "kimi_kl": 0.027099609375, + "learning_rate": 4.3919999999999996e-07, + "loss": 0.0006, + "ppl": 0.01373291015625, + "reward": 0.9946218729019165, + "reward_std": 0.001964505994692445, + "rewards/perpo_ocr_edit_distance_reward": 0.9946218729019165, "step": 608, "temperature": 0.9 }, { - "advantages": -3.175650326170398e-05, - "completion_length": 1261.0, - "delta_ref_entropy_loss": 0.02294921875, - "delta_ref_ppl": -0.0128936767578125, - "entropy_loss": -0.08349609375, - "epoch": 0.2436, - "grad_norm": 3.7685545009058172, - "k1_kl": 0.012908935546875, - "k3_kl": 0.0076751708984375, - "kimi_kl": 0.01214599609375, - "learning_rate": 3.782e-07, - "loss": 0.0003, - "ppl": 0.04443359375, - "reward": 0.8830425441265106, - "reward_std": 0.0883642453700304, - "rewards/perpo_ocr_edit_distance_reward": 0.883042573928833, + "advantages": -3.6060812362848083e-06, + "completion_length": 228.0, + "delta_ref_entropy_loss": 0.068359375, + "delta_ref_ppl": -0.115234375, + "entropy_loss": -0.076171875, + "epoch": 0.1218, + "grad_norm": 5.111751969627682, + "k1_kl": 0.11474609375, + "k3_kl": 0.078125, + "kimi_kl": 0.20703125, + "learning_rate": 4.3909999999999995e-07, + "loss": 0.0031, + "ppl": 0.0390625, + "reward": 0.9622306823730469, + "reward_std": 0.006970512680709362, + "rewards/perpo_ocr_edit_distance_reward": 0.9622307419776917, "step": 609, "temperature": 0.9 }, { - "advantages": -8.011715760858351e-05, - "completion_length": 891.5, - "delta_ref_entropy_loss": 0.059326171875, - "delta_ref_ppl": -0.02276611328125, - "entropy_loss": -0.062957763671875, - "epoch": 0.244, - "grad_norm": 1.5553460944699113, - "k1_kl": 0.022674560546875, - "k3_kl": 0.018718719482421875, - "kimi_kl": 0.029541015625, - "learning_rate": 3.7799999999999997e-07, - "loss": 0.0008, - "ppl": 0.043975830078125, - "reward": 0.9522573947906494, - "reward_std": 0.021951328351860866, - "rewards/perpo_ocr_edit_distance_reward": 0.9522574841976166, + "advantages": -2.2309168343781494e-06, + "completion_length": 33.0, + "delta_ref_entropy_loss": 0.267578125, + "delta_ref_ppl": -0.5625, + "entropy_loss": -0.1806640625, + "epoch": 0.122, + "grad_norm": 15.207780231325826, + "k1_kl": 0.5625, + "k3_kl": 0.39453125, + "kimi_kl": 1.0546875, + "learning_rate": 4.39e-07, + "loss": 0.0157, + "ppl": 0.0654296875, + "reward": 0.9502288699150085, + "reward_std": 0.007567933294922113, + "rewards/perpo_ocr_edit_distance_reward": 0.9502288699150085, "step": 610, "temperature": 0.9 }, { - "advantages": -5.15239607921103e-05, - "completion_length": 479.0, + "advantages": -6.522451258206274e-06, + "completion_length": 177.0, "delta_ref_entropy_loss": 0.044189453125, - "delta_ref_ppl": -0.028900146484375, - "entropy_loss": -0.02447509765625, - "epoch": 0.2444, - "grad_norm": 0.55901335922648, - "k1_kl": 0.028900146484375, - "k3_kl": 0.01885986328125, - "kimi_kl": 0.03753662109375, - "learning_rate": 3.778e-07, - "loss": 0.0008, - "ppl": 0.0120391845703125, - "reward": 0.9983359277248383, - "reward_std": 0.0003817859978880733, - "rewards/perpo_ocr_edit_distance_reward": 0.9983359575271606, + "delta_ref_ppl": -0.11669921875, + "entropy_loss": -0.04833984375, + "epoch": 0.1222, + "grad_norm": 2.097684564856428, + "k1_kl": 0.11669921875, + "k3_kl": 0.083984375, + "kimi_kl": 0.2099609375, + "learning_rate": 4.389e-07, + "loss": 0.0034, + "ppl": 0.0255126953125, + "reward": 0.956733226776123, + "reward_std": 0.009045506827533245, + "rewards/perpo_ocr_edit_distance_reward": 0.9567332863807678, "step": 611, "temperature": 0.9 }, { - "advantages": -0.00011564153101062402, - "completion_length": 700.5, - "delta_ref_entropy_loss": 0.0321044921875, - "delta_ref_ppl": -0.0196533203125, - "entropy_loss": -0.026611328125, - "epoch": 0.2448, - "grad_norm": 0.2695140627741631, - "k1_kl": 0.01971435546875, - "k3_kl": 0.00970458984375, - "kimi_kl": 0.0223541259765625, - "learning_rate": 3.776e-07, - "loss": 0.0005, - "ppl": 0.012847900390625, - "reward": 0.9950104057788849, - "reward_std": 9.725719428388402e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9950104653835297, + "advantages": -8.863637049216777e-05, + "completion_length": 805.0, + "delta_ref_entropy_loss": 0.042236328125, + "delta_ref_ppl": -0.052001953125, + "entropy_loss": -0.0218505859375, + "epoch": 0.1224, + "grad_norm": 1.1964196725988028, + "k1_kl": 0.0517578125, + "k3_kl": 0.0281982421875, + "kimi_kl": 0.0654296875, + "learning_rate": 4.388e-07, + "loss": 0.0012, + "ppl": 0.01324462890625, + "reward": 0.9961286187171936, + "reward_std": 0.0004765127378050238, + "rewards/perpo_ocr_edit_distance_reward": 0.9961286783218384, "step": 612, "temperature": 0.9 }, { - "advantages": -2.89508284367912e-06, - "completion_length": 320.5, - "delta_ref_entropy_loss": 0.0631103515625, - "delta_ref_ppl": -0.0709228515625, - "entropy_loss": -0.06903076171875, - "epoch": 0.2452, - "grad_norm": 0.7240998245450267, - "k1_kl": 0.071044921875, - "k3_kl": 0.04498291015625, - "kimi_kl": 0.1298828125, - "learning_rate": 3.774e-07, - "loss": 0.0018, - "ppl": 0.035064697265625, - "reward": 0.9515924453735352, - "reward_std": 0.0021641727071255445, - "rewards/perpo_ocr_edit_distance_reward": 0.9515924751758575, + "advantages": -3.784894943237305e-05, + "completion_length": 385.0, + "delta_ref_entropy_loss": 0.04736328125, + "delta_ref_ppl": -0.080078125, + "entropy_loss": -0.0184326171875, + "epoch": 0.1226, + "grad_norm": 0.4604582063316459, + "k1_kl": 0.08056640625, + "k3_kl": 0.052490234375, + "kimi_kl": 0.1494140625, + "learning_rate": 4.387e-07, + "loss": 0.0021, + "ppl": 0.00616455078125, + "reward": 0.8885799646377563, + "reward_std": 0.0008001684909686446, + "rewards/perpo_ocr_edit_distance_reward": 0.8885800242424011, "step": 613, "temperature": 0.9 }, { - "advantages": -0.0003097653398071998, - "completion_length": 411.0, - "delta_ref_entropy_loss": 0.0379638671875, - "delta_ref_ppl": -0.029296875, - "entropy_loss": -0.0302734375, - "epoch": 0.2456, - "grad_norm": 0.2571646581544969, - "k1_kl": 0.029296875, - "k3_kl": 0.013427734375, - "kimi_kl": 0.02349853515625, - "learning_rate": 3.7719999999999996e-07, - "loss": 0.0008, - "ppl": 0.016815185546875, - "reward": 0.9905266463756561, - "reward_std": 0.00103818962816149, - "rewards/perpo_ocr_edit_distance_reward": 0.9905267357826233, + "advantages": -9.645734826335683e-05, + "completion_length": 688.0, + "delta_ref_entropy_loss": 0.049560546875, + "delta_ref_ppl": -0.041015625, + "entropy_loss": -0.0252685546875, + "epoch": 0.1228, + "grad_norm": 0.7417983572995157, + "k1_kl": 0.041015625, + "k3_kl": 0.0244140625, + "kimi_kl": 0.06298828125, + "learning_rate": 4.3859999999999997e-07, + "loss": 0.0011, + "ppl": 0.01165771484375, + "reward": 0.9799075722694397, + "reward_std": 0.0010472639696672559, + "rewards/perpo_ocr_edit_distance_reward": 0.9799076318740845, "step": 614, "temperature": 0.9 }, { - "advantages": -8.042370154726086e-06, - "completion_length": 229.5, - "delta_ref_entropy_loss": 0.0882568359375, - "delta_ref_ppl": -0.146484375, - "entropy_loss": -0.03662109375, - "epoch": 0.246, - "grad_norm": 3.293417924909283, - "k1_kl": 0.1455078125, - "k3_kl": 0.093017578125, - "kimi_kl": 0.2880859375, - "learning_rate": 3.77e-07, - "loss": 0.0037, - "ppl": 0.015777587890625, - "reward": 0.9947121143341064, - "reward_std": 0.0064892994705587626, - "rewards/perpo_ocr_edit_distance_reward": 0.9947122037410736, + "advantages": -1.7029899268550253e-08, + "completion_length": 493.0, + "delta_ref_entropy_loss": 0.138671875, + "delta_ref_ppl": -0.0810546875, + "entropy_loss": -0.1796875, + "epoch": 0.123, + "grad_norm": 1.7367445804955, + "k1_kl": 0.08056640625, + "k3_kl": 0.037841796875, + "kimi_kl": 0.0712890625, + "learning_rate": 4.3849999999999996e-07, + "loss": 0.0015, + "ppl": 0.087890625, + "reward": 0.7926297783851624, + "reward_std": 0.21185334026813507, + "rewards/perpo_ocr_edit_distance_reward": 0.7926298379898071, "step": 615, "temperature": 0.9 }, { - "advantages": -9.328126907348633e-06, - "completion_length": 1211.0, - "delta_ref_entropy_loss": 0.068115234375, - "delta_ref_ppl": -0.04327392578125, - "entropy_loss": -0.12548828125, - "epoch": 0.2464, - "grad_norm": 2.1777345167799718, - "k1_kl": 0.04345703125, - "k3_kl": 0.035858154296875, - "kimi_kl": 0.05377197265625, - "learning_rate": 3.768e-07, - "loss": 0.0014, - "ppl": 0.070068359375, - "reward": 0.9557304084300995, - "reward_std": 0.028858465841040015, - "rewards/perpo_ocr_edit_distance_reward": 0.9557304978370667, + "advantages": -1.341956067335559e-05, + "completion_length": 1169.0, + "delta_ref_entropy_loss": 0.03369140625, + "delta_ref_ppl": -0.022705078125, + "entropy_loss": -0.0390625, + "epoch": 0.1232, + "grad_norm": 0.6093689533267288, + "k1_kl": 0.022705078125, + "k3_kl": 0.01025390625, + "kimi_kl": 0.0198974609375, + "learning_rate": 4.384e-07, + "loss": 0.0004, + "ppl": 0.016357421875, + "reward": 0.9914526343345642, + "reward_std": 0.005609150510281324, + "rewards/perpo_ocr_edit_distance_reward": 0.9914527535438538, "step": 616, "temperature": 0.9 }, { - "advantages": -5.270753852926191e-06, - "completion_length": 906.5, - "delta_ref_entropy_loss": 0.0574951171875, - "delta_ref_ppl": -0.03857421875, - "entropy_loss": -0.044921875, - "epoch": 0.2468, - "grad_norm": 0.8218683274517669, - "k1_kl": 0.0384521484375, - "k3_kl": 0.02020263671875, - "kimi_kl": 0.0440673828125, - "learning_rate": 3.7659999999999997e-07, - "loss": 0.0008, - "ppl": 0.023681640625, - "reward": 0.8888521492481232, - "reward_std": 0.09001200331840664, - "rewards/perpo_ocr_edit_distance_reward": 0.8888522088527679, + "advantages": -3.065381974920456e-07, + "completion_length": 1671.0, + "delta_ref_entropy_loss": 0.07421875, + "delta_ref_ppl": -0.0634765625, + "entropy_loss": -0.203125, + "epoch": 0.1234, + "grad_norm": 2.252084971287791, + "k1_kl": 0.06298828125, + "k3_kl": 0.04052734375, + "kimi_kl": 0.08154296875, + "learning_rate": 4.383e-07, + "loss": 0.0016, + "ppl": 0.111328125, + "reward": 0.7496101260185242, + "reward_std": 0.14216305315494537, + "rewards/perpo_ocr_edit_distance_reward": 0.7496102452278137, "step": 617, "temperature": 0.9 }, { - "advantages": -4.6023302303410674e-05, - "completion_length": 365.5, - "delta_ref_entropy_loss": 0.115234375, - "delta_ref_ppl": -0.0784912109375, - "entropy_loss": -0.1795654296875, - "epoch": 0.2472, - "grad_norm": 1.6868037555693634, - "k1_kl": 0.0782470703125, - "k3_kl": 0.0447998046875, - "kimi_kl": 0.1910400390625, - "learning_rate": 3.764e-07, - "loss": 0.0018, - "ppl": 0.09765625, - "reward": 0.7089489847421646, - "reward_std": 0.010225629055639729, - "rewards/perpo_ocr_edit_distance_reward": 0.7089490592479706, + "advantages": -4.775183697347529e-05, + "completion_length": 324.0, + "delta_ref_entropy_loss": 0.054931640625, + "delta_ref_ppl": -0.056640625, + "entropy_loss": -0.032958984375, + "epoch": 0.1236, + "grad_norm": 0.9260482285913012, + "k1_kl": 0.056640625, + "k3_kl": 0.037109375, + "kimi_kl": 0.1142578125, + "learning_rate": 4.3819999999999994e-07, + "loss": 0.0015, + "ppl": 0.015869140625, + "reward": 0.9957611560821533, + "reward_std": 0.0007913972367532551, + "rewards/perpo_ocr_edit_distance_reward": 0.9957612156867981, "step": 618, "temperature": 0.9 }, { - "advantages": -4.030977106594946e-05, - "completion_length": 270.5, - "delta_ref_entropy_loss": 0.0447998046875, - "delta_ref_ppl": -0.036376953125, - "entropy_loss": -0.046875, - "epoch": 0.2476, - "grad_norm": 1.7024495425762707, - "k1_kl": 0.036376953125, - "k3_kl": 0.021484375, - "kimi_kl": 0.0526123046875, - "learning_rate": 3.7619999999999994e-07, - "loss": 0.0009, - "ppl": 0.02410888671875, - "reward": 0.9675824046134949, - "reward_std": 0.0010274768574163318, - "rewards/perpo_ocr_edit_distance_reward": 0.967582494020462, + "advantages": -1.9933497242163867e-05, + "completion_length": 702.0, + "delta_ref_entropy_loss": 0.03857421875, + "delta_ref_ppl": -0.035400390625, + "entropy_loss": -0.031982421875, + "epoch": 0.1238, + "grad_norm": 0.7696137886491436, + "k1_kl": 0.035400390625, + "k3_kl": 0.0205078125, + "kimi_kl": 0.06298828125, + "learning_rate": 4.381e-07, + "loss": 0.0008, + "ppl": 0.01385498046875, + "reward": 0.9436801075935364, + "reward_std": 0.0007543478277511895, + "rewards/perpo_ocr_edit_distance_reward": 0.9436801075935364, "step": 619, "temperature": 0.9 }, { - "advantages": -2.690724159037927e-06, - "completion_length": 1091.5, - "delta_ref_entropy_loss": 0.03375244140625, - "delta_ref_ppl": -0.023468017578125, - "entropy_loss": -0.0537109375, - "epoch": 0.248, - "grad_norm": 4834.0594430263645, - "k1_kl": 0.023223876953125, - "k3_kl": 8.821044921875, - "kimi_kl": 0.0423583984375, - "learning_rate": 3.76e-07, - "loss": 0.3538, - "ppl": 0.0535888671875, - "reward": 0.9903966784477234, - "reward_std": 0.006300801411271095, - "rewards/perpo_ocr_edit_distance_reward": 0.9903967380523682, + "advantages": -5.366972618503496e-05, + "completion_length": 196.0, + "delta_ref_entropy_loss": 0.056396484375, + "delta_ref_ppl": -0.09619140625, + "entropy_loss": -0.049560546875, + "epoch": 0.124, + "grad_norm": 1.3710865132147951, + "k1_kl": 0.095703125, + "k3_kl": 0.06640625, + "kimi_kl": 0.1748046875, + "learning_rate": 4.38e-07, + "loss": 0.0027, + "ppl": 0.0224609375, + "reward": 0.9656596183776855, + "reward_std": 0.001486936816945672, + "rewards/perpo_ocr_edit_distance_reward": 0.9656597375869751, "step": 620, "temperature": 0.9 }, { - "advantages": -2.1038311388110742e-05, - "completion_length": 457.5, - "delta_ref_entropy_loss": 0.0294189453125, - "delta_ref_ppl": -0.0152587890625, - "entropy_loss": -0.013336181640625, - "epoch": 0.2484, - "grad_norm": 0.4303252900472392, - "k1_kl": 0.015289306640625, - "k3_kl": 0.00732421875, - "kimi_kl": 0.0135498046875, - "learning_rate": 3.758e-07, - "loss": 0.0003, - "ppl": 0.00543212890625, - "reward": 0.9987873136997223, - "reward_std": 0.00035471239243634045, - "rewards/perpo_ocr_edit_distance_reward": 0.9987873136997223, + "advantages": -5.306516686687246e-05, + "completion_length": 130.0, + "delta_ref_entropy_loss": 0.1181640625, + "delta_ref_ppl": -0.1689453125, + "entropy_loss": -0.04931640625, + "epoch": 0.1242, + "grad_norm": 3.3019699379814402, + "k1_kl": 0.1689453125, + "k3_kl": 0.11962890625, + "kimi_kl": 0.349609375, + "learning_rate": 4.3789999999999997e-07, + "loss": 0.0048, + "ppl": 0.0135498046875, + "reward": 0.9683418273925781, + "reward_std": 0.0018258239142596722, + "rewards/perpo_ocr_edit_distance_reward": 0.9683418869972229, "step": 621, "temperature": 0.9 }, { - "advantages": -4.208939571981318e-05, - "completion_length": 522.5, - "delta_ref_entropy_loss": 0.06103515625, - "delta_ref_ppl": -0.070556640625, - "entropy_loss": -0.03472900390625, - "epoch": 0.2488, - "grad_norm": 0.358666277713106, - "k1_kl": 0.070556640625, - "k3_kl": 0.04473876953125, - "kimi_kl": 0.130157470703125, - "learning_rate": 3.7559999999999995e-07, - "loss": 0.0018, - "ppl": 0.01434326171875, - "reward": 0.9950494766235352, - "reward_std": 0.0001522657839814201, - "rewards/perpo_ocr_edit_distance_reward": 0.9950495362281799, + "advantages": -1.7029899268550253e-08, + "completion_length": 308.0, + "delta_ref_entropy_loss": 0.055419921875, + "delta_ref_ppl": -0.0537109375, + "entropy_loss": -0.037353515625, + "epoch": 0.1244, + "grad_norm": 1.1373251254567796, + "k1_kl": 0.053466796875, + "k3_kl": 0.0322265625, + "kimi_kl": 0.07470703125, + "learning_rate": 4.378e-07, + "loss": 0.0013, + "ppl": 0.0177001953125, + "reward": 0.9917661547660828, + "reward_std": 0.0013705944875255227, + "rewards/perpo_ocr_edit_distance_reward": 0.9917661547660828, "step": 622, "temperature": 0.9 }, { - "advantages": 1.5820776695818495e-05, - "completion_length": 647.5, - "delta_ref_entropy_loss": 0.044189453125, - "delta_ref_ppl": -0.026641845703125, - "entropy_loss": -0.05535888671875, - "epoch": 0.2492, - "grad_norm": 1.1808034766906237, - "k1_kl": 0.026641845703125, - "k3_kl": 0.0135498046875, - "kimi_kl": 0.02630615234375, - "learning_rate": 3.754e-07, - "loss": 0.0005, - "ppl": 0.02593994140625, - "reward": 0.9661305248737335, - "reward_std": 0.015052659888169728, - "rewards/perpo_ocr_edit_distance_reward": 0.9661305546760559, + "advantages": -1.2423311090969946e-05, + "completion_length": 1239.0, + "delta_ref_entropy_loss": 0.0260009765625, + "delta_ref_ppl": -0.0267333984375, + "entropy_loss": -0.01470947265625, + "epoch": 0.1246, + "grad_norm": 0.5584503088601297, + "k1_kl": 0.0267333984375, + "k3_kl": 0.0167236328125, + "kimi_kl": 0.0498046875, + "learning_rate": 4.3769999999999996e-07, + "loss": 0.0007, + "ppl": 0.006561279296875, + "reward": 0.980307400226593, + "reward_std": 0.0026458112988620996, + "rewards/perpo_ocr_edit_distance_reward": 0.9803074598312378, "step": 623, "temperature": 0.9 }, { - "advantages": -2.9121128477527236e-06, - "completion_length": 463.5, - "delta_ref_entropy_loss": 0.0533447265625, - "delta_ref_ppl": -0.0687255859375, - "entropy_loss": -0.0648193359375, - "epoch": 0.2496, - "grad_norm": 1.9142617103960633, - "k1_kl": 0.0684814453125, - "k3_kl": 0.04510498046875, - "kimi_kl": 0.12841796875, - "learning_rate": 3.7519999999999997e-07, - "loss": 0.0018, - "ppl": 0.0352783203125, - "reward": 0.9334573149681091, - "reward_std": 0.014357049018144608, - "rewards/perpo_ocr_edit_distance_reward": 0.9334573447704315, + "advantages": -2.995559225382749e-05, + "completion_length": 502.0, + "delta_ref_entropy_loss": 0.039794921875, + "delta_ref_ppl": -0.042236328125, + "entropy_loss": -0.017333984375, + "epoch": 0.1248, + "grad_norm": 0.5462272179495649, + "k1_kl": 0.042236328125, + "k3_kl": 0.027099609375, + "kimi_kl": 0.06982421875, + "learning_rate": 4.3759999999999995e-07, + "loss": 0.0011, + "ppl": 0.00897216796875, + "reward": 0.9858402609825134, + "reward_std": 0.0013217803789302707, + "rewards/perpo_ocr_edit_distance_reward": 0.9858403205871582, "step": 624, "temperature": 0.9 }, { - "advantages": -3.159046229939122e-06, - "completion_length": 347.5, - "delta_ref_entropy_loss": 0.1048583984375, - "delta_ref_ppl": -0.079833984375, - "entropy_loss": -0.1884765625, - "epoch": 0.25, - "grad_norm": 3.452506095637201, - "k1_kl": 0.079833984375, - "k3_kl": 0.04638671875, - "kimi_kl": 0.119140625, - "learning_rate": 3.75e-07, - "loss": 0.0019, - "ppl": 0.1048583984375, - "reward": 0.8641857504844666, - "reward_std": 0.013872999814338982, - "rewards/perpo_ocr_edit_distance_reward": 0.8641857504844666, + "advantages": -1.1299338439130224e-05, + "completion_length": 881.0, + "delta_ref_entropy_loss": 0.060302734375, + "delta_ref_ppl": -0.0673828125, + "entropy_loss": -0.05810546875, + "epoch": 0.125, + "grad_norm": 1.4445352037695933, + "k1_kl": 0.0673828125, + "k3_kl": 0.048828125, + "kimi_kl": 0.115234375, + "learning_rate": 4.375e-07, + "loss": 0.002, + "ppl": 0.0308837890625, + "reward": 0.9817020297050476, + "reward_std": 0.002914630575105548, + "rewards/perpo_ocr_edit_distance_reward": 0.9817021489143372, "step": 625, "temperature": 0.9 }, { - "advantages": -0.00012463970779208466, - "completion_length": 963.0, - "delta_ref_entropy_loss": 0.0302734375, - "delta_ref_ppl": -0.023468017578125, - "entropy_loss": -0.0291748046875, - "epoch": 0.2504, - "grad_norm": 0.46260760874927476, - "k1_kl": 0.023468017578125, - "k3_kl": 0.0129852294921875, - "kimi_kl": 0.031982421875, - "learning_rate": 3.748e-07, - "loss": 0.0006, - "ppl": 0.01507568359375, - "reward": 0.9986006617546082, - "reward_std": 0.0006440706492867321, - "rewards/perpo_ocr_edit_distance_reward": 0.9986007213592529, + "advantages": -1.9763197997235693e-05, + "completion_length": 769.0, + "delta_ref_entropy_loss": 0.03515625, + "delta_ref_ppl": -0.033447265625, + "entropy_loss": -0.016845703125, + "epoch": 0.1252, + "grad_norm": 0.5227026477008301, + "k1_kl": 0.033447265625, + "k3_kl": 0.02001953125, + "kimi_kl": 0.059326171875, + "learning_rate": 4.374e-07, + "loss": 0.0008, + "ppl": 0.00775146484375, + "reward": 0.9960106015205383, + "reward_std": 0.0007624838035553694, + "rewards/perpo_ocr_edit_distance_reward": 0.9960106015205383, "step": 626, "temperature": 0.9 }, { - "advantages": -1.5667507113903412e-06, - "completion_length": 650.5, - "delta_ref_entropy_loss": 0.103515625, - "delta_ref_ppl": -0.0751953125, - "entropy_loss": -0.146240234375, - "epoch": 0.2508, - "grad_norm": 1.2148367633239043, - "k1_kl": 0.0753173828125, - "k3_kl": 0.0462646484375, - "kimi_kl": 0.087890625, - "learning_rate": 3.746e-07, - "loss": 0.0018, - "ppl": 0.0802001953125, - "reward": 0.9522692263126373, - "reward_std": 0.005397487431764603, - "rewards/perpo_ocr_edit_distance_reward": 0.9522691965103149, + "advantages": -3.4570696243463317e-06, + "completion_length": 782.0, + "delta_ref_entropy_loss": 0.061279296875, + "delta_ref_ppl": -0.0576171875, + "entropy_loss": -0.0296630859375, + "epoch": 0.1254, + "grad_norm": 1.7041828684935647, + "k1_kl": 0.057861328125, + "k3_kl": 0.03271484375, + "kimi_kl": 0.0869140625, + "learning_rate": 4.373e-07, + "loss": 0.0013, + "ppl": 0.012451171875, + "reward": 0.9258041977882385, + "reward_std": 0.01220795325934887, + "rewards/perpo_ocr_edit_distance_reward": 0.9258042573928833, "step": 627, "temperature": 0.9 }, { - "advantages": -6.3947270518838195e-06, - "completion_length": 787.5, - "delta_ref_entropy_loss": 0.087158203125, - "delta_ref_ppl": -0.0499267578125, - "entropy_loss": -0.104736328125, - "epoch": 0.2512, - "grad_norm": 1.578964891305769, - "k1_kl": 0.0496826171875, - "k3_kl": 0.026824951171875, - "kimi_kl": 0.0626220703125, - "learning_rate": 3.744e-07, - "loss": 0.0011, - "ppl": 0.05712890625, - "reward": 0.834266185760498, - "reward_std": 0.012151638977229595, - "rewards/perpo_ocr_edit_distance_reward": 0.8342662453651428, + "advantages": 6.130763722467236e-06, + "completion_length": 630.0, + "delta_ref_entropy_loss": 0.04931640625, + "delta_ref_ppl": -0.044921875, + "entropy_loss": -0.027587890625, + "epoch": 0.1256, + "grad_norm": 0.7357576777041647, + "k1_kl": 0.044921875, + "k3_kl": 0.024658203125, + "kimi_kl": 0.060546875, + "learning_rate": 4.3719999999999997e-07, + "loss": 0.001, + "ppl": 0.013427734375, + "reward": 0.9934478998184204, + "reward_std": 0.0012892164522781968, + "rewards/perpo_ocr_edit_distance_reward": 0.9934478998184204, "step": 628, "temperature": 0.9 }, { - "advantages": 1.2763909353452618e-05, - "completion_length": 720.0, - "delta_ref_entropy_loss": 0.03179931640625, - "delta_ref_ppl": -0.02325439453125, - "entropy_loss": -0.02789306640625, - "epoch": 0.2516, - "grad_norm": 0.3778459100869354, - "k1_kl": 0.023193359375, - "k3_kl": 0.012359619140625, - "kimi_kl": 0.032562255859375, - "learning_rate": 3.7419999999999995e-07, - "loss": 0.0005, - "ppl": 0.01336669921875, - "reward": 0.9905956089496613, - "reward_std": 0.002807794517138973, - "rewards/perpo_ocr_edit_distance_reward": 0.9905956387519836, + "advantages": -6.130763949840912e-07, + "completion_length": 842.0, + "delta_ref_entropy_loss": 0.09912109375, + "delta_ref_ppl": -0.10791015625, + "entropy_loss": -1.0625, + "epoch": 0.1258, + "grad_norm": 7.1904625362113075, + "k1_kl": 0.10693359375, + "k3_kl": 0.07958984375, + "kimi_kl": 0.146484375, + "learning_rate": 4.3709999999999996e-07, + "loss": 0.0032, + "ppl": 0.625, + "reward": 0.3089968264102936, + "reward_std": 0.029121115803718567, + "rewards/perpo_ocr_edit_distance_reward": 0.30899685621261597, "step": 629, "temperature": 0.9 }, { - "advantages": -0.000300645828474444, - "completion_length": 453.5, - "delta_ref_entropy_loss": 0.056396484375, - "delta_ref_ppl": -0.02850341796875, - "entropy_loss": -0.03265380859375, - "epoch": 0.252, - "grad_norm": 0.4089712104774523, - "k1_kl": 0.02850341796875, - "k3_kl": 0.012054443359375, - "kimi_kl": 0.022125244140625, - "learning_rate": 3.74e-07, - "loss": 0.0008, - "ppl": 0.018035888671875, - "reward": 0.9713785350322723, - "reward_std": 0.004000721964985132, - "rewards/perpo_ocr_edit_distance_reward": 0.9713785946369171, + "advantages": -1.6817026335047558e-05, + "completion_length": 1418.0, + "delta_ref_entropy_loss": 0.052978515625, + "delta_ref_ppl": -0.03173828125, + "entropy_loss": -0.1259765625, + "epoch": 0.126, + "grad_norm": 233.0054359724522, + "k1_kl": 0.03173828125, + "k3_kl": 0.2158203125, + "kimi_kl": 0.0693359375, + "learning_rate": 4.3699999999999996e-07, + "loss": 0.0087, + "ppl": 0.068359375, + "reward": 0.885877251625061, + "reward_std": 0.00243182061240077, + "rewards/perpo_ocr_edit_distance_reward": 0.8858773112297058, "step": 630, "temperature": 0.9 }, { - "advantages": -5.4674491821060656e-05, - "completion_length": 573.0, - "delta_ref_entropy_loss": 0.0279541015625, - "delta_ref_ppl": -0.0157470703125, - "entropy_loss": -0.02252197265625, - "epoch": 0.2524, - "grad_norm": 0.4232360790652247, - "k1_kl": 0.015777587890625, - "k3_kl": 0.007171630859375, - "kimi_kl": 0.0130615234375, - "learning_rate": 3.738e-07, - "loss": 0.0003, - "ppl": 0.009307861328125, - "reward": 0.9984015226364136, - "reward_std": 0.001385110619594343, - "rewards/perpo_ocr_edit_distance_reward": 0.9984015822410583, + "advantages": 1.7540796761750244e-06, + "completion_length": 858.0, + "delta_ref_entropy_loss": 0.058837890625, + "delta_ref_ppl": -0.064453125, + "entropy_loss": -0.09912109375, + "epoch": 0.1262, + "grad_norm": 1.4080704637390138, + "k1_kl": 0.064453125, + "k3_kl": 0.041259765625, + "kimi_kl": 0.08935546875, + "learning_rate": 4.369e-07, + "loss": 0.0016, + "ppl": 0.0625, + "reward": 0.7945806384086609, + "reward_std": 0.009626274928450584, + "rewards/perpo_ocr_edit_distance_reward": 0.7945806384086609, "step": 631, "temperature": 0.9 }, { - "advantages": -7.405451885666992e-05, - "completion_length": 1080.5, - "delta_ref_entropy_loss": 0.022064208984375, - "delta_ref_ppl": -0.02056884765625, - "entropy_loss": -0.03228759765625, - "epoch": 0.2528, - "grad_norm": 1.5886696355858798, - "k1_kl": 0.0206298828125, - "k3_kl": 0.012420654296875, - "kimi_kl": 0.02838134765625, - "learning_rate": 3.7359999999999996e-07, - "loss": 0.0006, - "ppl": 0.01715087890625, - "reward": 0.888468861579895, - "reward_std": 0.07013953913701698, - "rewards/perpo_ocr_edit_distance_reward": 0.8884689509868622, + "advantages": -3.9679666770098265e-06, + "completion_length": 40.0, + "delta_ref_entropy_loss": 0.25390625, + "delta_ref_ppl": -0.53125, + "entropy_loss": -0.1591796875, + "epoch": 0.1264, + "grad_norm": 13.118077635233153, + "k1_kl": 0.53125, + "k3_kl": 0.39453125, + "kimi_kl": 1.3046875, + "learning_rate": 4.368e-07, + "loss": 0.0158, + "ppl": 0.068359375, + "reward": 0.5778853893280029, + "reward_std": 0.010676965117454529, + "rewards/perpo_ocr_edit_distance_reward": 0.5778854489326477, "step": 632, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 546.0, - "delta_ref_entropy_loss": 0.01953125, - "delta_ref_ppl": -0.010498046875, - "entropy_loss": -0.008514404296875, - "epoch": 0.2532, - "grad_norm": 0.012846949828352508, - "k1_kl": 0.010528564453125, - "k3_kl": 0.0043487548828125, - "kimi_kl": 0.00780487060546875, - "learning_rate": 3.734e-07, - "loss": 0.0005, - "ppl": 0.00286102294921875, - "reward": 0.9986217617988586, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9986218214035034, + "advantages": -6.811959707420101e-08, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.1513671875, + "delta_ref_ppl": -0.1083984375, + "entropy_loss": -0.314453125, + "epoch": 0.1266, + "grad_norm": 1.9045689761818192, + "k1_kl": 0.1083984375, + "k3_kl": 0.0595703125, + "kimi_kl": 0.11865234375, + "learning_rate": 4.3669999999999993e-07, + "loss": 0.0024, + "ppl": 0.16015625, + "reward": 0.6490818858146667, + "reward_std": 0.17510345578193665, + "rewards/perpo_ocr_edit_distance_reward": 0.6490819454193115, "step": 633, "temperature": 0.9 }, { - "advantages": -3.026851663889829e-05, - "completion_length": 291.5, - "delta_ref_entropy_loss": 0.04254150390625, - "delta_ref_ppl": -0.077911376953125, - "entropy_loss": -0.0418701171875, - "epoch": 0.2536, - "grad_norm": 2.6782568261644073, - "k1_kl": 0.078399658203125, - "k3_kl": 0.057647705078125, - "kimi_kl": 0.160247802734375, - "learning_rate": 3.732e-07, - "loss": 0.0023, - "ppl": 0.021820068359375, - "reward": 0.9904105365276337, - "reward_std": 0.002304893860127777, - "rewards/perpo_ocr_edit_distance_reward": 0.9904105961322784, + "advantages": -2.5204250050592236e-05, + "completion_length": 454.0, + "delta_ref_entropy_loss": 0.03857421875, + "delta_ref_ppl": -0.0712890625, + "entropy_loss": -0.0245361328125, + "epoch": 0.1268, + "grad_norm": 0.9054515972028698, + "k1_kl": 0.0712890625, + "k3_kl": 0.04931640625, + "kimi_kl": 0.1611328125, + "learning_rate": 4.366e-07, + "loss": 0.002, + "ppl": 0.01434326171875, + "reward": 0.9923312067985535, + "reward_std": 0.001252303016372025, + "rewards/perpo_ocr_edit_distance_reward": 0.992331326007843, "step": 634, "temperature": 0.9 }, { - "advantages": -4.036511745653115e-05, - "completion_length": 765.0, - "delta_ref_entropy_loss": 0.0579833984375, - "delta_ref_ppl": -0.0379638671875, - "entropy_loss": -0.04412841796875, - "epoch": 0.254, - "grad_norm": 0.48012565084027636, - "k1_kl": 0.0379638671875, - "k3_kl": 0.0188140869140625, - "kimi_kl": 0.03790283203125, - "learning_rate": 3.7299999999999997e-07, - "loss": 0.0008, - "ppl": 0.0226287841796875, - "reward": 0.9818963706493378, - "reward_std": 0.0005574118258664384, - "rewards/perpo_ocr_edit_distance_reward": 0.9818964004516602, + "advantages": -3.7465778746081924e-07, + "completion_length": 452.0, + "delta_ref_entropy_loss": 0.060302734375, + "delta_ref_ppl": -0.049072265625, + "entropy_loss": -0.0771484375, + "epoch": 0.127, + "grad_norm": 1.4674686630373999, + "k1_kl": 0.049072265625, + "k3_kl": 0.0277099609375, + "kimi_kl": 0.06982421875, + "learning_rate": 4.3649999999999997e-07, + "loss": 0.0011, + "ppl": 0.038818359375, + "reward": 0.29561346769332886, + "reward_std": 0.03256215900182724, + "rewards/perpo_ocr_edit_distance_reward": 0.29561346769332886, "step": 635, "temperature": 0.9 }, { - "advantages": -0.00012540817260742188, - "completion_length": 528.5, - "delta_ref_entropy_loss": 0.03485107421875, - "delta_ref_ppl": -0.0264892578125, - "entropy_loss": -0.037353515625, - "epoch": 0.2544, - "grad_norm": 4.746003520273472, - "k1_kl": 0.0264892578125, - "k3_kl": 0.014068603515625, - "kimi_kl": 0.028076171875, - "learning_rate": 3.728e-07, - "loss": 0.0007, - "ppl": 0.0201416015625, - "reward": 0.9982081949710846, - "reward_std": 0.0005103218136355281, - "rewards/perpo_ocr_edit_distance_reward": 0.9982082843780518, + "advantages": -2.89508284367912e-06, + "completion_length": 392.0, + "delta_ref_entropy_loss": 0.109375, + "delta_ref_ppl": -0.10302734375, + "entropy_loss": -0.09228515625, + "epoch": 0.1272, + "grad_norm": 1.2166752257632816, + "k1_kl": 0.10302734375, + "k3_kl": 0.06103515625, + "kimi_kl": 0.1533203125, + "learning_rate": 4.364e-07, + "loss": 0.0024, + "ppl": 0.053955078125, + "reward": 0.9021952152252197, + "reward_std": 0.02635793574154377, + "rewards/perpo_ocr_edit_distance_reward": 0.9021952748298645, "step": 636, "temperature": 0.9 }, { - "advantages": -7.531472817845497e-06, - "completion_length": 673.5, - "delta_ref_entropy_loss": 0.037353515625, - "delta_ref_ppl": -0.03314208984375, - "entropy_loss": -0.050201416015625, - "epoch": 0.2548, - "grad_norm": 1.9846646302195428, - "k1_kl": 0.03326416015625, - "k3_kl": 0.023468017578125, - "kimi_kl": 0.049468994140625, - "learning_rate": 3.726e-07, - "loss": 0.0009, - "ppl": 0.0250091552734375, - "reward": 0.9005014002323151, - "reward_std": 0.021199702110607177, - "rewards/perpo_ocr_edit_distance_reward": 0.9005014300346375, + "advantages": -6.190368731040508e-05, + "completion_length": 480.0, + "delta_ref_entropy_loss": 0.068359375, + "delta_ref_ppl": -0.05859375, + "entropy_loss": -0.03271484375, + "epoch": 0.1274, + "grad_norm": 0.7216368222613576, + "k1_kl": 0.058349609375, + "k3_kl": 0.034912109375, + "kimi_kl": 0.091796875, + "learning_rate": 4.363e-07, + "loss": 0.0015, + "ppl": 0.01397705078125, + "reward": 0.9976449012756348, + "reward_std": 0.0016880433540791273, + "rewards/perpo_ocr_edit_distance_reward": 0.9976450204849243, "step": 637, "temperature": 0.9 }, { - "advantages": -4.982948621545802e-05, - "completion_length": 745.5, - "delta_ref_entropy_loss": 0.035888671875, - "delta_ref_ppl": -0.02001953125, - "entropy_loss": -0.02679443359375, - "epoch": 0.2552, - "grad_norm": 8.410138844679125, - "k1_kl": 0.02001953125, - "k3_kl": 0.0201416015625, - "kimi_kl": 0.022857666015625, - "learning_rate": 3.7239999999999997e-07, - "loss": 0.0009, - "ppl": 0.0157470703125, - "reward": 0.9961423575878143, - "reward_std": 0.0012514686677604914, - "rewards/perpo_ocr_edit_distance_reward": 0.9961424171924591, + "advantages": 2.86272606899729e-05, + "completion_length": 707.0, + "delta_ref_entropy_loss": 0.06884765625, + "delta_ref_ppl": -0.07177734375, + "entropy_loss": -0.05810546875, + "epoch": 0.1276, + "grad_norm": 1.1496551479734474, + "k1_kl": 0.0712890625, + "k3_kl": 0.043212890625, + "kimi_kl": 0.12060546875, + "learning_rate": 4.3619999999999995e-07, + "loss": 0.0017, + "ppl": 0.033447265625, + "reward": 0.9547366499900818, + "reward_std": 0.0007919708732515574, + "rewards/perpo_ocr_edit_distance_reward": 0.954736590385437, "step": 638, "temperature": 0.9 }, { - "advantages": -1.7396042494510766e-05, - "completion_length": 400.5, - "delta_ref_entropy_loss": 0.0478515625, - "delta_ref_ppl": -0.02532958984375, - "entropy_loss": -0.03961181640625, - "epoch": 0.2556, - "grad_norm": 0.6214688501150515, - "k1_kl": 0.025390625, - "k3_kl": 0.011322021484375, - "kimi_kl": 0.016357421875, - "learning_rate": 3.7219999999999996e-07, - "loss": 0.0005, - "ppl": 0.02117919921875, - "reward": 0.9762330055236816, - "reward_std": 0.0005567544139921665, - "rewards/perpo_ocr_edit_distance_reward": 0.9762330651283264, + "advantages": -8.194841029762756e-06, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.033447265625, + "delta_ref_ppl": 0.00341796875, + "entropy_loss": -0.1015625, + "epoch": 0.1278, + "grad_norm": 11107456.048851905, + "k1_kl": -0.003326416015625, + "k3_kl": 19840.0, + "kimi_kl": 0.169921875, + "learning_rate": 4.361e-07, + "loss": 792.2836, + "ppl": 0.09228515625, + "reward": 0.8962746262550354, + "reward_std": 0.006132605019956827, + "rewards/perpo_ocr_edit_distance_reward": 0.8962746858596802, "step": 639, "temperature": 0.9 }, { - "advantages": -8.707174856681377e-05, - "completion_length": 1044.5, - "delta_ref_entropy_loss": 0.018798828125, - "delta_ref_ppl": -0.007171630859375, - "entropy_loss": -0.0230712890625, - "epoch": 0.256, - "grad_norm": 0.6487624767021442, - "k1_kl": 0.0072021484375, - "k3_kl": 0.0025177001953125, - "kimi_kl": 0.00360870361328125, - "learning_rate": 3.72e-07, - "loss": 0.0002, - "ppl": 0.00946044921875, - "reward": 0.9985106885433197, - "reward_std": 0.0006615009624511003, - "rewards/perpo_ocr_edit_distance_reward": 0.9985107183456421, + "advantages": -8.97475729288999e-06, + "completion_length": 759.0, + "delta_ref_entropy_loss": 0.0341796875, + "delta_ref_ppl": -0.05126953125, + "entropy_loss": -0.0419921875, + "epoch": 0.128, + "grad_norm": 0.9408952652364095, + "k1_kl": 0.05126953125, + "k3_kl": 0.03125, + "kimi_kl": 0.0830078125, + "learning_rate": 4.36e-07, + "loss": 0.0013, + "ppl": 0.0203857421875, + "reward": 0.9689788818359375, + "reward_std": 0.00559088634327054, + "rewards/perpo_ocr_edit_distance_reward": 0.9689789414405823, "step": 640, "temperature": 0.9 }, { - "advantages": -0.00038682563172187656, - "completion_length": 811.0, - "delta_ref_entropy_loss": 0.0206298828125, - "delta_ref_ppl": -0.0089111328125, - "entropy_loss": -0.02056884765625, - "epoch": 0.2564, - "grad_norm": 0.9362992438745806, - "k1_kl": 0.0089111328125, - "k3_kl": 0.003520965576171875, - "kimi_kl": 0.0060272216796875, - "learning_rate": 3.718e-07, - "loss": 0.0005, - "ppl": 0.00955963134765625, - "reward": 0.998029500246048, - "reward_std": 0.000237678483244963, - "rewards/perpo_ocr_edit_distance_reward": 0.9980295896530151, + "advantages": -0.00010272434883518144, + "completion_length": 157.0, + "delta_ref_entropy_loss": 0.033447265625, + "delta_ref_ppl": -0.16015625, + "entropy_loss": -0.042236328125, + "epoch": 0.1282, + "grad_norm": 2.1931473316495467, + "k1_kl": 0.1591796875, + "k3_kl": 0.123046875, + "kimi_kl": 0.453125, + "learning_rate": 4.359e-07, + "loss": 0.005, + "ppl": 0.01806640625, + "reward": 0.9410299062728882, + "reward_std": 0.0011435897322371602, + "rewards/perpo_ocr_edit_distance_reward": 0.9410300254821777, "step": 641, "temperature": 0.9 }, { - "advantages": -2.301165181961551e-06, - "completion_length": 491.5, - "delta_ref_entropy_loss": 0.11700439453125, - "delta_ref_ppl": -0.079833984375, - "entropy_loss": -0.1802978515625, - "epoch": 0.2568, - "grad_norm": 1.7579440873518595, - "k1_kl": 0.079833984375, - "k3_kl": 0.0419921875, - "kimi_kl": 0.087158203125, - "learning_rate": 3.7159999999999997e-07, - "loss": 0.0017, - "ppl": 0.10333251953125, - "reward": 0.8465489447116852, - "reward_std": 0.01105393678881228, - "rewards/perpo_ocr_edit_distance_reward": 0.8465490341186523, + "advantages": -2.3177692128228955e-05, + "completion_length": 711.0, + "delta_ref_entropy_loss": 0.0703125, + "delta_ref_ppl": -0.052001953125, + "entropy_loss": -0.0703125, + "epoch": 0.1284, + "grad_norm": 0.9185560662948964, + "k1_kl": 0.052001953125, + "k3_kl": 0.02392578125, + "kimi_kl": 0.050048828125, + "learning_rate": 4.358e-07, + "loss": 0.001, + "ppl": 0.03662109375, + "reward": 0.9824516773223877, + "reward_std": 0.001736653153784573, + "rewards/perpo_ocr_edit_distance_reward": 0.9824517369270325, "step": 642, "temperature": 0.9 }, { - "advantages": -4.112720645821355e-05, - "completion_length": 684.0, - "delta_ref_entropy_loss": 0.061767578125, - "delta_ref_ppl": -0.0372314453125, - "entropy_loss": -0.0806884765625, - "epoch": 0.2572, - "grad_norm": 0.7103409216206392, - "k1_kl": 0.0372314453125, - "k3_kl": 0.020263671875, - "kimi_kl": 0.04364013671875, - "learning_rate": 3.714e-07, - "loss": 0.0009, - "ppl": 0.041748046875, - "reward": 0.9274450540542603, - "reward_std": 0.056338436814257875, - "rewards/perpo_ocr_edit_distance_reward": 0.927445113658905, + "advantages": -1.6902175048016943e-05, + "completion_length": 122.0, + "delta_ref_entropy_loss": 0.1640625, + "delta_ref_ppl": -0.1630859375, + "entropy_loss": -0.06494140625, + "epoch": 0.1286, + "grad_norm": 2.0774773565082167, + "k1_kl": 0.1630859375, + "k3_kl": 0.09765625, + "kimi_kl": 0.2265625, + "learning_rate": 4.3569999999999996e-07, + "loss": 0.0039, + "ppl": 0.0201416015625, + "reward": 0.8502304553985596, + "reward_std": 0.0014100084081292152, + "rewards/perpo_ocr_edit_distance_reward": 0.8502304553985596, "step": 643, "temperature": 0.9 }, { - "advantages": -7.419927010232641e-05, - "completion_length": 505.5, - "delta_ref_entropy_loss": 0.0631103515625, - "delta_ref_ppl": -0.0352783203125, - "entropy_loss": -0.091552734375, - "epoch": 0.2576, - "grad_norm": 1.0417436955651946, - "k1_kl": 0.03515625, - "k3_kl": 0.01666259765625, - "kimi_kl": 0.02978515625, - "learning_rate": 3.7119999999999994e-07, - "loss": 0.0007, - "ppl": 0.04815673828125, - "reward": 0.9529081583023071, - "reward_std": 0.0035632049839477986, - "rewards/perpo_ocr_edit_distance_reward": 0.9529081881046295, + "advantages": -6.61611557006836e-05, + "completion_length": 1090.0, + "delta_ref_entropy_loss": 0.060791015625, + "delta_ref_ppl": -0.046875, + "entropy_loss": -0.044677734375, + "epoch": 0.1288, + "grad_norm": 2.219379117396189, + "k1_kl": 0.046875, + "k3_kl": 0.026611328125, + "kimi_kl": 0.052734375, + "learning_rate": 4.3559999999999996e-07, + "loss": 0.0011, + "ppl": 0.0262451171875, + "reward": 0.9886207580566406, + "reward_std": 0.0008009625016711652, + "rewards/perpo_ocr_edit_distance_reward": 0.9886208176612854, "step": 644, "temperature": 0.9 }, { - "advantages": -0.00031823771496419795, - "completion_length": 342.0, - "delta_ref_entropy_loss": 0.0623779296875, - "delta_ref_ppl": -0.05548095703125, - "entropy_loss": -0.044921875, - "epoch": 0.258, - "grad_norm": 0.880309590638856, - "k1_kl": 0.05523681640625, - "k3_kl": 0.03365325927734375, - "kimi_kl": 0.0694427490234375, - "learning_rate": 3.71e-07, - "loss": 0.0017, - "ppl": 0.021274566650390625, - "reward": 0.9917130768299103, - "reward_std": 0.000582071312237531, - "rewards/perpo_ocr_edit_distance_reward": 0.991713136434555, + "advantages": 8.514949634275126e-09, + "completion_length": 215.0, + "delta_ref_entropy_loss": 0.045166015625, + "delta_ref_ppl": -0.1279296875, + "entropy_loss": -0.09326171875, + "epoch": 0.129, + "grad_norm": 3.6066689933765845, + "k1_kl": 0.12890625, + "k3_kl": 0.0947265625, + "kimi_kl": 0.283203125, + "learning_rate": 4.355e-07, + "loss": 0.0038, + "ppl": 0.046630859375, + "reward": 0.9847234487533569, + "reward_std": 0.003492597257718444, + "rewards/perpo_ocr_edit_distance_reward": 0.9847233891487122, "step": 645, "temperature": 0.9 }, { - "advantages": -3.864509790219017e-05, - "completion_length": 880.0, - "delta_ref_entropy_loss": 0.08758544921875, - "delta_ref_ppl": -0.04547119140625, - "entropy_loss": -0.1104736328125, - "epoch": 0.2584, - "grad_norm": 1.3009920497632033, - "k1_kl": 0.045501708984375, - "k3_kl": 0.0211029052734375, - "kimi_kl": 0.03656005859375, - "learning_rate": 3.708e-07, - "loss": 0.0009, - "ppl": 0.05999755859375, - "reward": 0.8258322775363922, - "reward_std": 0.0063342577195726335, - "rewards/perpo_ocr_edit_distance_reward": 0.825832337141037, + "advantages": -8.39574022393208e-06, + "completion_length": 1040.0, + "delta_ref_entropy_loss": 0.052978515625, + "delta_ref_ppl": -0.0478515625, + "entropy_loss": -0.083984375, + "epoch": 0.1292, + "grad_norm": 2911.3797810660462, + "k1_kl": 0.0478515625, + "k3_kl": 40.5, + "kimi_kl": 0.134765625, + "learning_rate": 4.354e-07, + "loss": 1.6198, + "ppl": 0.06494140625, + "reward": 0.9177994728088379, + "reward_std": 0.010052372701466084, + "rewards/perpo_ocr_edit_distance_reward": 0.9177995920181274, "step": 646, "temperature": 0.9 }, { - "advantages": -2.2696598307447857e-05, - "completion_length": 801.5, - "delta_ref_entropy_loss": 0.0274658203125, - "delta_ref_ppl": -0.017486572265625, - "entropy_loss": -0.04248046875, - "epoch": 0.2588, - "grad_norm": 0.6269155584455197, - "k1_kl": 0.01751708984375, - "k3_kl": 0.0079498291015625, - "kimi_kl": 0.0128936767578125, - "learning_rate": 3.7059999999999994e-07, - "loss": 0.0003, - "ppl": 0.019439697265625, - "reward": 0.99521803855896, - "reward_std": 0.004550379468128085, - "rewards/perpo_ocr_edit_distance_reward": 0.9952181279659271, + "advantages": -3.4494059946155176e-05, + "completion_length": 931.0, + "delta_ref_entropy_loss": 0.056396484375, + "delta_ref_ppl": -0.049560546875, + "entropy_loss": -0.02880859375, + "epoch": 0.1294, + "grad_norm": 0.8360237485317412, + "k1_kl": 0.049560546875, + "k3_kl": 0.026611328125, + "kimi_kl": 0.07080078125, + "learning_rate": 4.353e-07, + "loss": 0.0011, + "ppl": 0.0123291015625, + "reward": 0.9881618618965149, + "reward_std": 0.0006407225737348199, + "rewards/perpo_ocr_edit_distance_reward": 0.9881619215011597, "step": 647, "temperature": 0.9 }, { - "advantages": -3.9722238255990305e-05, - "completion_length": 466.5, - "delta_ref_entropy_loss": 0.04547119140625, - "delta_ref_ppl": -0.02264404296875, - "entropy_loss": -0.036346435546875, - "epoch": 0.2592, - "grad_norm": 0.5317079984103736, - "k1_kl": 0.022705078125, - "k3_kl": 0.00885009765625, - "kimi_kl": 0.0189208984375, - "learning_rate": 3.704e-07, - "loss": 0.0004, - "ppl": 0.0166778564453125, - "reward": 0.8803386092185974, - "reward_std": 0.01921824848977849, - "rewards/perpo_ocr_edit_distance_reward": 0.8803386390209198, + "advantages": -1.7029898913278885e-07, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.055419921875, + "delta_ref_ppl": -0.0546875, + "entropy_loss": -0.04150390625, + "epoch": 0.1296, + "grad_norm": 0.4084398157345301, + "k1_kl": 0.054931640625, + "k3_kl": 0.031982421875, + "kimi_kl": 0.0869140625, + "learning_rate": 4.352e-07, + "loss": 0.0013, + "ppl": 0.02001953125, + "reward": 0.9014979600906372, + "reward_std": 0.21355877816677094, + "rewards/perpo_ocr_edit_distance_reward": 0.901498019695282, "step": 648, "temperature": 0.9 }, { - "advantages": -4.002026301463957e-07, - "completion_length": 658.5, - "delta_ref_entropy_loss": 0.0343017578125, - "delta_ref_ppl": -0.0596923828125, - "entropy_loss": -0.15087890625, - "epoch": 0.2596, - "grad_norm": 5.770909886446307, - "k1_kl": 0.05975341796875, - "k3_kl": 0.059326171875, - "kimi_kl": 0.10302734375, - "learning_rate": 3.7019999999999997e-07, - "loss": 0.0024, - "ppl": 0.08349609375, - "reward": 0.6433302015066147, - "reward_std": 0.12096176855266094, - "rewards/perpo_ocr_edit_distance_reward": 0.6433302611112595, + "advantages": -3.2356808787881164e-06, + "completion_length": 1177.0, + "delta_ref_entropy_loss": 0.031982421875, + "delta_ref_ppl": -0.028564453125, + "entropy_loss": -0.04345703125, + "epoch": 0.1298, + "grad_norm": 0.8513952849226604, + "k1_kl": 0.028564453125, + "k3_kl": 0.016845703125, + "kimi_kl": 0.03515625, + "learning_rate": 4.3509999999999997e-07, + "loss": 0.0007, + "ppl": 0.02392578125, + "reward": 0.9889258146286011, + "reward_std": 0.002531585516408086, + "rewards/perpo_ocr_edit_distance_reward": 0.9889258146286011, "step": 649, "temperature": 0.9 }, { - "advantages": -0.00029802109513954456, - "completion_length": 456.0, - "delta_ref_entropy_loss": 0.03387451171875, - "delta_ref_ppl": -0.022186279296875, - "entropy_loss": -0.0208740234375, - "epoch": 0.26, - "grad_norm": 0.32021582178287533, - "k1_kl": 0.022125244140625, - "k3_kl": 0.010498046875, - "kimi_kl": 0.021881103515625, - "learning_rate": 3.7e-07, + "advantages": -4.763262768392451e-05, + "completion_length": 915.0, + "delta_ref_entropy_loss": 0.049072265625, + "delta_ref_ppl": -0.03466796875, + "entropy_loss": -0.026611328125, + "epoch": 0.13, + "grad_norm": 0.4893454685538378, + "k1_kl": 0.03466796875, + "k3_kl": 0.017333984375, + "kimi_kl": 0.049072265625, + "learning_rate": 4.3499999999999996e-07, "loss": 0.0007, - "ppl": 0.00921630859375, - "reward": 0.9825462400913239, - "reward_std": 0.0007507888367399573, - "rewards/perpo_ocr_edit_distance_reward": 0.9825462698936462, + "ppl": 0.01287841796875, + "reward": 0.9954606294631958, + "reward_std": 0.0006147799431346357, + "rewards/perpo_ocr_edit_distance_reward": 0.9954606890678406, "step": 650, "temperature": 0.9 }, { - "advantages": -3.0432429682036855e-05, - "completion_length": 334.0, - "delta_ref_entropy_loss": 0.043212890625, - "delta_ref_ppl": -0.02691650390625, - "entropy_loss": -0.0511474609375, - "epoch": 0.2604, - "grad_norm": 1.0024155481538388, - "k1_kl": 0.02685546875, - "k3_kl": 0.013275146484375, - "kimi_kl": 0.0279541015625, - "learning_rate": 3.698e-07, - "loss": 0.0006, - "ppl": 0.023712158203125, - "reward": 0.9414300918579102, - "reward_std": 0.04507019731681794, - "rewards/perpo_ocr_edit_distance_reward": 0.9414302110671997, + "advantages": -1.5991075997590087e-05, + "completion_length": 382.0, + "delta_ref_entropy_loss": 0.0673828125, + "delta_ref_ppl": -0.09326171875, + "entropy_loss": -0.055908203125, + "epoch": 0.1302, + "grad_norm": 1.168283169570988, + "k1_kl": 0.0927734375, + "k3_kl": 0.061767578125, + "kimi_kl": 0.19921875, + "learning_rate": 4.349e-07, + "loss": 0.0025, + "ppl": 0.036865234375, + "reward": 0.9856542348861694, + "reward_std": 0.0020297879818826914, + "rewards/perpo_ocr_edit_distance_reward": 0.9856542348861694, "step": 651, "temperature": 0.9 }, { - "advantages": -0.00029907056273259514, - "completion_length": 374.0, - "delta_ref_entropy_loss": 0.0518798828125, - "delta_ref_ppl": -0.03192138671875, - "entropy_loss": -0.040283203125, - "epoch": 0.2608, - "grad_norm": 0.4760231074389208, - "k1_kl": 0.031982421875, - "k3_kl": 0.02105712890625, - "kimi_kl": 0.04449462890625, - "learning_rate": 3.696e-07, - "loss": 0.0011, - "ppl": 0.0230712890625, - "reward": 0.9932600855827332, - "reward_std": 0.0060847485437989235, - "rewards/perpo_ocr_edit_distance_reward": 0.9932601451873779, + "advantages": -0.00010064670641440898, + "completion_length": 661.0, + "delta_ref_entropy_loss": 0.08837890625, + "delta_ref_ppl": -0.07666015625, + "entropy_loss": -0.07373046875, + "epoch": 0.1304, + "grad_norm": 0.9907466136128208, + "k1_kl": 0.07666015625, + "k3_kl": 0.042236328125, + "kimi_kl": 0.1044921875, + "learning_rate": 4.348e-07, + "loss": 0.0018, + "ppl": 0.0380859375, + "reward": 0.9484524726867676, + "reward_std": 0.0005767357652075589, + "rewards/perpo_ocr_edit_distance_reward": 0.9484525322914124, "step": 652, "temperature": 0.9 }, { - "advantages": 5.10896995820076e-08, - "completion_length": 368.0, - "delta_ref_entropy_loss": 0.05029296875, - "delta_ref_ppl": -0.0556640625, - "entropy_loss": -0.119140625, - "epoch": 0.2612, - "grad_norm": 1.0927865585530083, - "k1_kl": 0.0555419921875, - "k3_kl": 0.0343017578125, - "kimi_kl": 0.08154296875, - "learning_rate": 3.694e-07, - "loss": 0.0014, - "ppl": 0.06304931640625, - "reward": 0.7915156781673431, - "reward_std": 0.13803626596927643, - "rewards/perpo_ocr_edit_distance_reward": 0.7915157079696655, + "advantages": -7.4931558629032224e-06, + "completion_length": 631.0, + "delta_ref_entropy_loss": 0.1533203125, + "delta_ref_ppl": -0.1396484375, + "entropy_loss": -0.248046875, + "epoch": 0.1306, + "grad_norm": 1.9033327100017268, + "k1_kl": 0.1396484375, + "k3_kl": 0.07861328125, + "kimi_kl": 0.169921875, + "learning_rate": 4.3469999999999994e-07, + "loss": 0.0031, + "ppl": 0.13671875, + "reward": 0.8239113688468933, + "reward_std": 0.004438977688550949, + "rewards/perpo_ocr_edit_distance_reward": 0.8239114284515381, "step": 653, "temperature": 0.9 }, { - "advantages": -7.799693594279233e-05, - "completion_length": 1203.5, - "delta_ref_entropy_loss": 0.01513671875, - "delta_ref_ppl": -0.0092620849609375, - "entropy_loss": -0.0086669921875, - "epoch": 0.2616, - "grad_norm": 2.344289361180001, - "k1_kl": 0.0092010498046875, - "k3_kl": 0.00499725341796875, - "kimi_kl": 0.01630401611328125, - "learning_rate": 3.6919999999999994e-07, - "loss": 0.0003, - "ppl": 0.00374603271484375, - "reward": 0.9960548877716064, - "reward_std": 0.001547989435493946, - "rewards/perpo_ocr_edit_distance_reward": 0.9960549473762512, + "advantages": -3.0313220122479834e-05, + "completion_length": 606.0, + "delta_ref_entropy_loss": 0.060302734375, + "delta_ref_ppl": -0.064453125, + "entropy_loss": -0.021484375, + "epoch": 0.1308, + "grad_norm": 0.30932708825502886, + "k1_kl": 0.064453125, + "k3_kl": 0.0341796875, + "kimi_kl": 0.09765625, + "learning_rate": 4.346e-07, + "loss": 0.0014, + "ppl": 0.00872802734375, + "reward": 0.9839943051338196, + "reward_std": 0.000461452582385391, + "rewards/perpo_ocr_edit_distance_reward": 0.9839943647384644, "step": 654, "temperature": 0.9 }, { - "advantages": -9.0650150809779e-05, - "completion_length": 923.5, - "delta_ref_entropy_loss": 0.08245849609375, - "delta_ref_ppl": -0.0400390625, - "entropy_loss": -0.08673095703125, - "epoch": 0.262, - "grad_norm": 1.7369171112862338, - "k1_kl": 0.040069580078125, - "k3_kl": 0.0199127197265625, - "kimi_kl": 0.0377197265625, - "learning_rate": 3.69e-07, - "loss": 0.0009, - "ppl": 0.044677734375, - "reward": 0.9084738492965698, - "reward_std": 0.02433584387472365, - "rewards/perpo_ocr_edit_distance_reward": 0.908473938703537, + "advantages": -1.7540796761750244e-05, + "completion_length": 575.0, + "delta_ref_entropy_loss": 0.046630859375, + "delta_ref_ppl": -0.03857421875, + "entropy_loss": -0.0206298828125, + "epoch": 0.131, + "grad_norm": 0.44473622661027545, + "k1_kl": 0.03857421875, + "k3_kl": 0.0205078125, + "kimi_kl": 0.05078125, + "learning_rate": 4.345e-07, + "loss": 0.0008, + "ppl": 0.00726318359375, + "reward": 0.9972589015960693, + "reward_std": 0.0008709515677765012, + "rewards/perpo_ocr_edit_distance_reward": 0.9972589015960693, "step": 655, "temperature": 0.9 }, { - "advantages": -8.726971918804338e-05, - "completion_length": 266.0, - "delta_ref_entropy_loss": 0.048095703125, - "delta_ref_ppl": -0.0396728515625, - "entropy_loss": -0.02093505859375, - "epoch": 0.2624, - "grad_norm": 0.564015452678888, - "k1_kl": 0.039794921875, - "k3_kl": 0.02191162109375, - "kimi_kl": 0.037841796875, - "learning_rate": 3.688e-07, - "loss": 0.001, - "ppl": 0.00823974609375, - "reward": 0.99561408162117, - "reward_std": 0.0010528592974878848, - "rewards/perpo_ocr_edit_distance_reward": 0.9956141710281372, + "advantages": -2.183233118557837e-05, + "completion_length": 1138.0, + "delta_ref_entropy_loss": 0.0625, + "delta_ref_ppl": -0.05126953125, + "entropy_loss": -0.0732421875, + "epoch": 0.1312, + "grad_norm": 1.4555769020236968, + "k1_kl": 0.05126953125, + "k3_kl": 0.0361328125, + "kimi_kl": 0.06298828125, + "learning_rate": 4.3439999999999997e-07, + "loss": 0.0015, + "ppl": 0.045654296875, + "reward": 0.9422322511672974, + "reward_std": 0.0018502166494727135, + "rewards/perpo_ocr_edit_distance_reward": 0.9422323107719421, "step": 656, "temperature": 0.9 }, { - "advantages": -3.5170999979072803e-05, - "completion_length": 1253.5, - "delta_ref_entropy_loss": 0.03021240234375, - "delta_ref_ppl": -0.0222320556640625, - "entropy_loss": -0.06585693359375, - "epoch": 0.2628, - "grad_norm": 1.3264209516172967, - "k1_kl": 0.022247314453125, - "k3_kl": 0.01399993896484375, - "kimi_kl": 0.0370025634765625, - "learning_rate": 3.6859999999999995e-07, - "loss": 0.0006, - "ppl": 0.0399169921875, - "reward": 0.8395728766918182, - "reward_std": 0.017699646850815043, - "rewards/perpo_ocr_edit_distance_reward": 0.839572936296463, + "advantages": -2.9887473829148803e-06, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.0213623046875, + "delta_ref_ppl": -0.0269775390625, + "entropy_loss": -0.1796875, + "epoch": 0.1314, + "grad_norm": 17.529482440199125, + "k1_kl": 0.0272216796875, + "k3_kl": 0.1318359375, + "kimi_kl": 0.0673828125, + "learning_rate": 4.343e-07, + "loss": 0.0053, + "ppl": 0.130859375, + "reward": 0.6323057413101196, + "reward_std": 0.005586305167526007, + "rewards/perpo_ocr_edit_distance_reward": 0.6323057413101196, "step": 657, "temperature": 0.9 }, { - "advantages": -3.6605767945729895e-05, - "completion_length": 1210.0, - "delta_ref_entropy_loss": 0.03204345703125, - "delta_ref_ppl": -0.016204833984375, - "entropy_loss": -0.054168701171875, - "epoch": 0.2632, - "grad_norm": 0.857248045935611, - "k1_kl": 0.016143798828125, - "k3_kl": 0.0081939697265625, - "kimi_kl": 0.013580322265625, - "learning_rate": 3.684e-07, - "loss": 0.0004, - "ppl": 0.02825927734375, - "reward": 0.8178778886795044, - "reward_std": 0.0026831769791897386, - "rewards/perpo_ocr_edit_distance_reward": 0.8178779482841492, + "advantages": -3.150531426854286e-07, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.033203125, + "delta_ref_ppl": -0.043701171875, + "entropy_loss": -0.09375, + "epoch": 0.1316, + "grad_norm": 6.544203442402772, + "k1_kl": 0.043701171875, + "k3_kl": 0.0439453125, + "kimi_kl": 0.083984375, + "learning_rate": 4.3419999999999996e-07, + "loss": 0.0018, + "ppl": 0.04296875, + "reward": 0.28021612763404846, + "reward_std": 0.09256833791732788, + "rewards/perpo_ocr_edit_distance_reward": 0.28021615743637085, "step": 658, "temperature": 0.9 }, { - "advantages": -3.2101360289971126e-06, - "completion_length": 285.0, - "delta_ref_entropy_loss": 0.087158203125, - "delta_ref_ppl": -0.0968017578125, - "entropy_loss": -0.07171630859375, - "epoch": 0.2636, - "grad_norm": 2.4188271498726497, - "k1_kl": 0.0968017578125, - "k3_kl": 0.068084716796875, - "kimi_kl": 0.1849365234375, - "learning_rate": 3.6820000000000003e-07, - "loss": 0.0027, - "ppl": 0.037078857421875, - "reward": 0.8780794739723206, - "reward_std": 0.09146136901108548, - "rewards/perpo_ocr_edit_distance_reward": 0.8780794739723206, + "advantages": -1.7702579498291016e-05, + "completion_length": 988.0, + "delta_ref_entropy_loss": 0.05419921875, + "delta_ref_ppl": -0.046142578125, + "entropy_loss": -0.036376953125, + "epoch": 0.1318, + "grad_norm": 0.9187185826905455, + "k1_kl": 0.046142578125, + "k3_kl": 0.0235595703125, + "kimi_kl": 0.062255859375, + "learning_rate": 4.3409999999999995e-07, + "loss": 0.001, + "ppl": 0.0205078125, + "reward": 0.9936157464981079, + "reward_std": 0.0023039928637444973, + "rewards/perpo_ocr_edit_distance_reward": 0.9936157464981079, "step": 659, "temperature": 0.9 }, { - "advantages": -2.2428377633332275e-05, - "completion_length": 923.5, - "delta_ref_entropy_loss": 0.06787109375, - "delta_ref_ppl": -0.063751220703125, - "entropy_loss": -0.0772705078125, - "epoch": 0.264, - "grad_norm": 1.3085105120365066, - "k1_kl": 0.063751220703125, - "k3_kl": 0.043853759765625, - "kimi_kl": 0.0956268310546875, - "learning_rate": 3.6799999999999996e-07, - "loss": 0.0018, - "ppl": 0.043121337890625, - "reward": 0.9867627620697021, - "reward_std": 0.000899178150575608, - "rewards/perpo_ocr_edit_distance_reward": 0.9867628216743469, + "advantages": -4.529953366727568e-05, + "completion_length": 495.0, + "delta_ref_entropy_loss": 0.0791015625, + "delta_ref_ppl": -0.08349609375, + "entropy_loss": -0.033203125, + "epoch": 0.132, + "grad_norm": 0.6338259700649124, + "k1_kl": 0.083984375, + "k3_kl": 0.048828125, + "kimi_kl": 0.1240234375, + "learning_rate": 4.34e-07, + "loss": 0.002, + "ppl": 0.01495361328125, + "reward": 0.9954990744590759, + "reward_std": 0.0015913223614916205, + "rewards/perpo_ocr_edit_distance_reward": 0.9954991936683655, "step": 660, "temperature": 0.9 }, { - "advantages": -0.00032832792931003496, - "completion_length": 195.5, - "delta_ref_entropy_loss": 0.08056640625, - "delta_ref_ppl": -0.0465087890625, - "entropy_loss": -0.06689453125, - "epoch": 0.2644, - "grad_norm": 0.5399417354832396, - "k1_kl": 0.04638671875, - "k3_kl": 0.02423095703125, - "kimi_kl": 0.03411865234375, - "learning_rate": 3.678e-07, - "loss": 0.0013, - "ppl": 0.037109375, - "reward": 0.9810470044612885, - "reward_std": 0.0006524411728605628, - "rewards/perpo_ocr_edit_distance_reward": 0.9810470938682556, + "advantages": 3.218651045244769e-06, + "completion_length": 1080.0, + "delta_ref_entropy_loss": 0.03466796875, + "delta_ref_ppl": -0.031982421875, + "entropy_loss": -0.033203125, + "epoch": 0.1322, + "grad_norm": 0.532162414786071, + "k1_kl": 0.03173828125, + "k3_kl": 0.017333984375, + "kimi_kl": 0.041748046875, + "learning_rate": 4.339e-07, + "loss": 0.0007, + "ppl": 0.01531982421875, + "reward": 0.9929215908050537, + "reward_std": 0.002547280164435506, + "rewards/perpo_ocr_edit_distance_reward": 0.9929215908050537, "step": 661, "temperature": 0.9 }, { - "advantages": -1.9788742591231312e-05, - "completion_length": 637.0, - "delta_ref_entropy_loss": 0.0489501953125, - "delta_ref_ppl": -0.0360107421875, - "entropy_loss": -0.052490234375, - "epoch": 0.2648, - "grad_norm": 1.0443252235805385, - "k1_kl": 0.0360107421875, - "k3_kl": 0.0194549560546875, - "kimi_kl": 0.044281005859375, - "learning_rate": 3.676e-07, - "loss": 0.0008, - "ppl": 0.0321807861328125, - "reward": 0.9101710617542267, - "reward_std": 0.0340163862274494, - "rewards/perpo_ocr_edit_distance_reward": 0.9101711213588715, + "advantages": -4.76837158203125e-07, + "completion_length": 1545.0, + "delta_ref_entropy_loss": 0.0162353515625, + "delta_ref_ppl": -0.01348876953125, + "entropy_loss": -0.032470703125, + "epoch": 0.1324, + "grad_norm": 0.9795277178995426, + "k1_kl": 0.01348876953125, + "k3_kl": 0.00714111328125, + "kimi_kl": 0.01409912109375, + "learning_rate": 4.338e-07, + "loss": 0.0003, + "ppl": 0.01470947265625, + "reward": 0.9621453881263733, + "reward_std": 0.017486877739429474, + "rewards/perpo_ocr_edit_distance_reward": 0.9621454477310181, "step": 662, "temperature": 0.9 }, { - "advantages": -4.182968950772192e-05, - "completion_length": 814.0, - "delta_ref_entropy_loss": 0.03204345703125, - "delta_ref_ppl": -0.024169921875, - "entropy_loss": -0.03155517578125, - "epoch": 0.2652, - "grad_norm": 0.38270601719676056, - "k1_kl": 0.024200439453125, - "k3_kl": 0.0131683349609375, - "kimi_kl": 0.03179931640625, - "learning_rate": 3.6739999999999997e-07, + "advantages": -1.1891127542185131e-05, + "completion_length": 737.0, + "delta_ref_entropy_loss": 0.025634765625, + "delta_ref_ppl": -0.0233154296875, + "entropy_loss": -0.030517578125, + "epoch": 0.1326, + "grad_norm": 0.8474890939654813, + "k1_kl": 0.0233154296875, + "k3_kl": 0.0140380859375, + "kimi_kl": 0.0390625, + "learning_rate": 4.3369999999999997e-07, "loss": 0.0006, - "ppl": 0.015380859375, - "reward": 0.9980464577674866, - "reward_std": 0.00027888633485417813, - "rewards/perpo_ocr_edit_distance_reward": 0.9980465173721313, + "ppl": 0.0140380859375, + "reward": 0.9743342399597168, + "reward_std": 0.0020471597090363503, + "rewards/perpo_ocr_edit_distance_reward": 0.9743342399597168, "step": 663, "temperature": 0.9 }, { - "advantages": -7.935933354019653e-06, - "completion_length": 426.0, - "delta_ref_entropy_loss": 0.0419921875, - "delta_ref_ppl": -0.0384521484375, - "entropy_loss": -0.0338134765625, - "epoch": 0.2656, - "grad_norm": 0.7110616515319653, - "k1_kl": 0.03857421875, - "k3_kl": 0.02349853515625, - "kimi_kl": 0.0537109375, - "learning_rate": 3.672e-07, - "loss": 0.0009, - "ppl": 0.0169677734375, - "reward": 0.9998363852500916, - "reward_std": 0.00021786909201182425, - "rewards/perpo_ocr_edit_distance_reward": 0.9998363852500916, + "advantages": -4.165513382758945e-05, + "completion_length": 679.0, + "delta_ref_entropy_loss": 0.049072265625, + "delta_ref_ppl": -0.07470703125, + "entropy_loss": -0.02783203125, + "epoch": 0.1328, + "grad_norm": 0.9198588572971048, + "k1_kl": 0.07470703125, + "k3_kl": 0.04736328125, + "kimi_kl": 0.1220703125, + "learning_rate": 4.3359999999999997e-07, + "loss": 0.0019, + "ppl": 0.0152587890625, + "reward": 0.9951621890068054, + "reward_std": 0.0015354609349742532, + "rewards/perpo_ocr_edit_distance_reward": 0.9951621890068054, "step": 664, "temperature": 0.9 }, { - "advantages": -8.855547747543824e-06, - "completion_length": 835.0, - "delta_ref_entropy_loss": 0.08251953125, - "delta_ref_ppl": -0.0498046875, - "entropy_loss": -0.15673828125, - "epoch": 0.266, - "grad_norm": 2.9184104688740806, - "k1_kl": 0.049560546875, - "k3_kl": 0.02862548828125, - "kimi_kl": 0.0877685546875, - "learning_rate": 3.67e-07, - "loss": 0.0012, - "ppl": 0.090087890625, - "reward": 0.6846733540296555, - "reward_std": 0.048272678162902594, - "rewards/perpo_ocr_edit_distance_reward": 0.6846734285354614, + "advantages": -3.5302982723806053e-05, + "completion_length": 591.0, + "delta_ref_entropy_loss": 0.045166015625, + "delta_ref_ppl": -0.062255859375, + "entropy_loss": -0.0224609375, + "epoch": 0.133, + "grad_norm": 0.507960851051726, + "k1_kl": 0.0625, + "k3_kl": 0.040771484375, + "kimi_kl": 0.1298828125, + "learning_rate": 4.3349999999999996e-07, + "loss": 0.0017, + "ppl": 0.0126953125, + "reward": 0.9784871339797974, + "reward_std": 0.0011060431133955717, + "rewards/perpo_ocr_edit_distance_reward": 0.9784871339797974, "step": 665, "temperature": 0.9 }, { - "advantages": -3.2476018532179296e-05, - "completion_length": 425.0, - "delta_ref_entropy_loss": 0.024749755859375, - "delta_ref_ppl": -0.0137939453125, - "entropy_loss": -0.009979248046875, - "epoch": 0.2664, - "grad_norm": 0.3123653181595479, - "k1_kl": 0.0137939453125, - "k3_kl": 0.0084228515625, - "kimi_kl": 0.019989013671875, - "learning_rate": 3.668e-07, - "loss": 0.0004, - "ppl": 0.00409698486328125, - "reward": 0.9975044429302216, - "reward_std": 0.0004745324549730867, - "rewards/perpo_ocr_edit_distance_reward": 0.997504472732544, + "advantages": -2.5225537569895096e-07, + "completion_length": 828.0, + "delta_ref_entropy_loss": 0.053955078125, + "delta_ref_ppl": -0.05517578125, + "entropy_loss": -0.1591796875, + "epoch": 0.1332, + "grad_norm": 1.8354613881758775, + "k1_kl": 0.05517578125, + "k3_kl": 0.034423828125, + "kimi_kl": 0.078125, + "learning_rate": 4.334e-07, + "loss": 0.0014, + "ppl": 0.111328125, + "reward": 0.7081744074821472, + "reward_std": 0.09743836522102356, + "rewards/perpo_ocr_edit_distance_reward": 0.708174467086792, "step": 666, "temperature": 0.9 }, { - "advantages": -0.00011355536662449595, - "completion_length": 1165.5, - "delta_ref_entropy_loss": 0.03704833984375, - "delta_ref_ppl": -0.04180908203125, - "entropy_loss": -0.04638671875, - "epoch": 0.2668, - "grad_norm": 1.3241532107991931, - "k1_kl": 0.04193115234375, - "k3_kl": 0.028778076171875, - "kimi_kl": 0.0677490234375, - "learning_rate": 3.6659999999999996e-07, - "loss": 0.0013, - "ppl": 0.024169921875, - "reward": 0.9943176805973053, - "reward_std": 0.0014088428288232535, - "rewards/perpo_ocr_edit_distance_reward": 0.9943177402019501, + "advantages": -2.0069735910510644e-05, + "completion_length": 1252.0, + "delta_ref_entropy_loss": 0.00823974609375, + "delta_ref_ppl": -0.01470947265625, + "entropy_loss": -0.0035552978515625, + "epoch": 0.1334, + "grad_norm": 0.26721400367468623, + "k1_kl": 0.01470947265625, + "k3_kl": 0.01055908203125, + "kimi_kl": 0.031005859375, + "learning_rate": 4.333e-07, + "loss": 0.0004, + "ppl": 0.0010833740234375, + "reward": 0.9909837245941162, + "reward_std": 0.0007496681064367294, + "rewards/perpo_ocr_edit_distance_reward": 0.990983784198761, "step": 667, "temperature": 0.9 }, { - "advantages": -1.59059254656313e-05, - "completion_length": 664.0, - "delta_ref_entropy_loss": 0.03955078125, - "delta_ref_ppl": -0.02825927734375, - "entropy_loss": -0.035400390625, - "epoch": 0.2672, - "grad_norm": 0.621877117597352, - "k1_kl": 0.028228759765625, - "k3_kl": 0.014129638671875, - "kimi_kl": 0.031982421875, - "learning_rate": 3.664e-07, + "advantages": -3.2680378353688866e-05, + "completion_length": 1300.0, + "delta_ref_entropy_loss": 0.0250244140625, + "delta_ref_ppl": -0.0220947265625, + "entropy_loss": -0.04248046875, + "epoch": 0.1336, + "grad_norm": 0.7996609241830672, + "k1_kl": 0.0220947265625, + "k3_kl": 0.01348876953125, + "kimi_kl": 0.0294189453125, + "learning_rate": 4.3319999999999994e-07, "loss": 0.0006, - "ppl": 0.017059326171875, - "reward": 0.9968865811824799, - "reward_std": 0.002017560153035447, - "rewards/perpo_ocr_edit_distance_reward": 0.9968866407871246, + "ppl": 0.02099609375, + "reward": 0.9857359528541565, + "reward_std": 0.0022449353709816933, + "rewards/perpo_ocr_edit_distance_reward": 0.9857360124588013, "step": 668, "temperature": 0.9 }, { - "advantages": -0.00011094127799538, - "completion_length": 549.0, - "delta_ref_entropy_loss": 0.072113037109375, - "delta_ref_ppl": -0.0450439453125, - "entropy_loss": -0.110198974609375, - "epoch": 0.2676, - "grad_norm": 1.0955653985283273, - "k1_kl": 0.0447998046875, - "k3_kl": 0.0270843505859375, - "kimi_kl": 0.0616455078125, - "learning_rate": 3.662e-07, - "loss": 0.0012, - "ppl": 0.057373046875, - "reward": 0.9150987565517426, - "reward_std": 0.015054839801450726, - "rewards/perpo_ocr_edit_distance_reward": 0.9150988459587097, + "advantages": -1.0388238251834991e-06, + "completion_length": 799.0, + "delta_ref_entropy_loss": 0.0634765625, + "delta_ref_ppl": -0.045654296875, + "entropy_loss": -0.042236328125, + "epoch": 0.1338, + "grad_norm": 0.9247839779590168, + "k1_kl": 0.045654296875, + "k3_kl": 0.022705078125, + "kimi_kl": 0.05322265625, + "learning_rate": 4.331e-07, + "loss": 0.0009, + "ppl": 0.01544189453125, + "reward": 0.932252824306488, + "reward_std": 0.1065063625574112, + "rewards/perpo_ocr_edit_distance_reward": 0.9322529435157776, "step": 669, "temperature": 0.9 }, { - "advantages": 4.3511395233508665e-06, - "completion_length": 748.0, - "delta_ref_entropy_loss": 0.0479736328125, - "delta_ref_ppl": -0.02874755859375, - "entropy_loss": -0.05535888671875, - "epoch": 0.268, - "grad_norm": 1.175467943556355, - "k1_kl": 0.0286865234375, - "k3_kl": 0.0140380859375, - "kimi_kl": 0.0316162109375, - "learning_rate": 3.6599999999999997e-07, - "loss": 0.0006, - "ppl": 0.03033447265625, - "reward": 0.936529815196991, - "reward_std": 0.0021445921156555414, - "rewards/perpo_ocr_edit_distance_reward": 0.936529815196991, + "advantages": -6.083931566536194e-06, + "completion_length": 296.0, + "delta_ref_entropy_loss": 0.054931640625, + "delta_ref_ppl": -0.068359375, + "entropy_loss": -0.053466796875, + "epoch": 0.134, + "grad_norm": 1.7762769126612328, + "k1_kl": 0.068359375, + "k3_kl": 0.04296875, + "kimi_kl": 0.111328125, + "learning_rate": 4.3299999999999997e-07, + "loss": 0.0017, + "ppl": 0.02685546875, + "reward": 0.9756847620010376, + "reward_std": 0.002705216407775879, + "rewards/perpo_ocr_edit_distance_reward": 0.9756848812103271, "step": 670, "temperature": 0.9 }, { - "advantages": -0.00030858176160108997, - "completion_length": 554.5, - "delta_ref_entropy_loss": 0.0230712890625, - "delta_ref_ppl": -0.01397705078125, - "entropy_loss": -0.0166015625, - "epoch": 0.2684, - "grad_norm": 0.6810434134075452, - "k1_kl": 0.014007568359375, - "k3_kl": 0.0073394775390625, - "kimi_kl": 0.0149688720703125, - "learning_rate": 3.658e-07, - "loss": 0.0006, - "ppl": 0.008331298828125, - "reward": 0.9972226619720459, - "reward_std": 0.0005553451483137906, - "rewards/perpo_ocr_edit_distance_reward": 0.9972227513790131, + "advantages": -3.916876778475853e-07, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.043701171875, + "delta_ref_ppl": -0.037109375, + "entropy_loss": -0.11083984375, + "epoch": 0.1342, + "grad_norm": 6.1845524517617765, + "k1_kl": 0.037109375, + "k3_kl": 0.036865234375, + "kimi_kl": 0.04638671875, + "learning_rate": 4.3289999999999997e-07, + "loss": 0.0015, + "ppl": 0.064453125, + "reward": 0.8209600448608398, + "reward_std": 0.20120495557785034, + "rewards/perpo_ocr_edit_distance_reward": 0.8209601044654846, "step": 671, "temperature": 0.9 }, { - "advantages": -0.0001380358444293961, - "completion_length": 167.5, - "delta_ref_entropy_loss": 0.07763671875, - "delta_ref_ppl": -0.04052734375, - "entropy_loss": -0.02362060546875, - "epoch": 0.2688, - "grad_norm": 0.3751360043361522, - "k1_kl": 0.04052734375, - "k3_kl": 0.0173187255859375, - "kimi_kl": 0.027587890625, - "learning_rate": 3.6559999999999994e-07, - "loss": 0.0008, - "ppl": 0.0087432861328125, - "reward": 0.9871076047420502, - "reward_std": 0.00016589960432611406, - "rewards/perpo_ocr_edit_distance_reward": 0.987107664346695, + "advantages": -0.00016306128236465156, + "completion_length": 821.0, + "delta_ref_entropy_loss": 0.0400390625, + "delta_ref_ppl": -0.0478515625, + "entropy_loss": -0.052734375, + "epoch": 0.1344, + "grad_norm": 0.6121489937170358, + "k1_kl": 0.048095703125, + "k3_kl": 0.0267333984375, + "kimi_kl": 0.0703125, + "learning_rate": 4.328e-07, + "loss": 0.0012, + "ppl": 0.0245361328125, + "reward": 0.9754189848899841, + "reward_std": 0.0005266974330879748, + "rewards/perpo_ocr_edit_distance_reward": 0.9754191040992737, "step": 672, "temperature": 0.9 }, { - "advantages": -5.381448204389017e-06, - "completion_length": 316.5, - "delta_ref_entropy_loss": 0.118896484375, - "delta_ref_ppl": -0.0947265625, - "entropy_loss": -0.1806640625, - "epoch": 0.2692, - "grad_norm": 2.132682028760516, - "k1_kl": 0.094970703125, - "k3_kl": 0.0518798828125, - "kimi_kl": 0.103759765625, - "learning_rate": 3.654e-07, - "loss": 0.0021, - "ppl": 0.103759765625, - "reward": 0.6856733709573746, - "reward_std": 0.006630591116845608, - "rewards/perpo_ocr_edit_distance_reward": 0.685673400759697, + "advantages": -1.3623919414840202e-07, + "completion_length": 165.0, + "delta_ref_entropy_loss": 0.10595703125, + "delta_ref_ppl": -0.1904296875, + "entropy_loss": -0.2216796875, + "epoch": 0.1346, + "grad_norm": 2.8955279307631523, + "k1_kl": 0.1904296875, + "k3_kl": 0.1416015625, + "kimi_kl": 0.55078125, + "learning_rate": 4.3269999999999995e-07, + "loss": 0.0056, + "ppl": 0.0849609375, + "reward": 0.15851666033267975, + "reward_std": 0.08478587120771408, + "rewards/perpo_ocr_edit_distance_reward": 0.15851667523384094, "step": 673, "temperature": 0.9 }, { - "advantages": -0.00011617371819738764, - "completion_length": 684.0, - "delta_ref_entropy_loss": 0.059326171875, - "delta_ref_ppl": -0.02777099609375, - "entropy_loss": -0.052642822265625, - "epoch": 0.2696, - "grad_norm": 0.597173840871872, - "k1_kl": 0.027801513671875, - "k3_kl": 0.0118408203125, - "kimi_kl": 0.0262451171875, - "learning_rate": 3.652e-07, - "loss": 0.0006, - "ppl": 0.0261993408203125, - "reward": 0.9349887371063232, - "reward_std": 0.0024027874678722583, - "rewards/perpo_ocr_edit_distance_reward": 0.934988796710968, + "advantages": -1.78303052962292e-05, + "completion_length": 842.0, + "delta_ref_entropy_loss": 0.028076171875, + "delta_ref_ppl": -0.038818359375, + "entropy_loss": -0.02392578125, + "epoch": 0.1348, + "grad_norm": 0.5758619487673315, + "k1_kl": 0.03857421875, + "k3_kl": 0.0247802734375, + "kimi_kl": 0.0615234375, + "learning_rate": 4.3259999999999994e-07, + "loss": 0.001, + "ppl": 0.0115966796875, + "reward": 0.9934657216072083, + "reward_std": 0.0013328349450603127, + "rewards/perpo_ocr_edit_distance_reward": 0.9934657216072083, "step": 674, "temperature": 0.9 }, { - "advantages": -1.1418547273933655e-05, - "completion_length": 315.0, - "delta_ref_entropy_loss": 0.0821533203125, - "delta_ref_ppl": -0.0577392578125, - "entropy_loss": -0.0919189453125, - "epoch": 0.27, - "grad_norm": 1.3178643264135776, - "k1_kl": 0.0577392578125, - "k3_kl": 0.0274658203125, - "kimi_kl": 0.0478515625, - "learning_rate": 3.65e-07, - "loss": 0.0011, - "ppl": 0.05224609375, - "reward": 0.9129939675331116, - "reward_std": 0.0038799813482910395, - "rewards/perpo_ocr_edit_distance_reward": 0.9129940867424011, + "advantages": -9.98037212411873e-05, + "completion_length": 193.0, + "delta_ref_entropy_loss": 0.07958984375, + "delta_ref_ppl": -0.1484375, + "entropy_loss": -0.044921875, + "epoch": 0.135, + "grad_norm": 0.9688778316365196, + "k1_kl": 0.1494140625, + "k3_kl": 0.1015625, + "kimi_kl": 0.27734375, + "learning_rate": 4.325e-07, + "loss": 0.0042, + "ppl": 0.015625, + "reward": 0.987532913684845, + "reward_std": 0.0009237625054083765, + "rewards/perpo_ocr_edit_distance_reward": 0.9875330924987793, "step": 675, "temperature": 0.9 }, { - "advantages": -2.9870442631363403e-05, - "completion_length": 528.0, - "delta_ref_entropy_loss": 0.0477294921875, - "delta_ref_ppl": -0.02685546875, - "entropy_loss": -0.031494140625, - "epoch": 0.2704, - "grad_norm": 0.4584668638163916, - "k1_kl": 0.02685546875, - "k3_kl": 0.0140838623046875, - "kimi_kl": 0.045318603515625, - "learning_rate": 3.648e-07, - "loss": 0.0006, - "ppl": 0.0159149169921875, - "reward": 0.9603922665119171, - "reward_std": 0.0025751724606379867, - "rewards/perpo_ocr_edit_distance_reward": 0.9603923261165619, + "advantages": -7.280281806743005e-06, + "completion_length": 583.0, + "delta_ref_entropy_loss": 0.12060546875, + "delta_ref_ppl": -0.09814453125, + "entropy_loss": -0.09375, + "epoch": 0.1352, + "grad_norm": 1.7049121899255486, + "k1_kl": 0.09814453125, + "k3_kl": 0.05029296875, + "kimi_kl": 0.11865234375, + "learning_rate": 4.324e-07, + "loss": 0.002, + "ppl": 0.048583984375, + "reward": 0.9621188044548035, + "reward_std": 0.010420351289212704, + "rewards/perpo_ocr_edit_distance_reward": 0.9621188640594482, "step": 676, "temperature": 0.9 }, { - "advantages": -4.759005241794512e-05, - "completion_length": 478.5, - "delta_ref_entropy_loss": 0.02899169921875, - "delta_ref_ppl": -0.02166748046875, - "entropy_loss": -0.0187835693359375, - "epoch": 0.2708, - "grad_norm": 0.6502403888449848, - "k1_kl": 0.02166748046875, - "k3_kl": 0.011810302734375, - "kimi_kl": 0.027587890625, - "learning_rate": 3.6459999999999997e-07, - "loss": 0.0005, - "ppl": 0.008289337158203125, - "reward": 0.9987517893314362, - "reward_std": 0.0003972191771026701, - "rewards/perpo_ocr_edit_distance_reward": 0.9987518191337585, - "step": 677, - "temperature": 0.9 - }, + "advantages": -3.576278970740532e-07, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.05224609375, + "delta_ref_ppl": -0.061767578125, + "entropy_loss": -0.04736328125, + "epoch": 0.1354, + "grad_norm": 0.5649403036152028, + "k1_kl": 0.061767578125, + "k3_kl": 0.03662109375, + "kimi_kl": 0.080078125, + "learning_rate": 4.3229999999999997e-07, + "loss": 0.0015, + "ppl": 0.022705078125, + "reward": 0.9043760299682617, + "reward_std": 0.21295084059238434, + "rewards/perpo_ocr_edit_distance_reward": 0.9043760895729065, + "step": 677, + "temperature": 0.9 + }, { - "advantages": -8.514949740856537e-07, - "completion_length": 369.5, - "delta_ref_entropy_loss": 0.10546875, - "delta_ref_ppl": -0.0802001953125, - "entropy_loss": -0.19091796875, - "epoch": 0.2712, - "grad_norm": 1.2018249460943957, - "k1_kl": 0.0804443359375, - "k3_kl": 0.0452880859375, - "kimi_kl": 0.09912109375, - "learning_rate": 3.644e-07, - "loss": 0.0018, - "ppl": 0.09814453125, - "reward": 0.7092234492301941, - "reward_std": 0.0886546466499567, - "rewards/perpo_ocr_edit_distance_reward": 0.7092234790325165, + "advantages": -8.702278137207031e-06, + "completion_length": 673.0, + "delta_ref_entropy_loss": 0.07666015625, + "delta_ref_ppl": -0.10205078125, + "entropy_loss": -0.07275390625, + "epoch": 0.1356, + "grad_norm": 1.0500355769764531, + "k1_kl": 0.10205078125, + "k3_kl": 0.06494140625, + "kimi_kl": 0.224609375, + "learning_rate": 4.3219999999999997e-07, + "loss": 0.0026, + "ppl": 0.037353515625, + "reward": 0.9463402032852173, + "reward_std": 0.005771820433437824, + "rewards/perpo_ocr_edit_distance_reward": 0.9463402628898621, "step": 678, "temperature": 0.9 }, { - "advantages": -1.9750425053643994e-05, - "completion_length": 814.0, - "delta_ref_entropy_loss": 0.058837890625, - "delta_ref_ppl": -0.0477294921875, - "entropy_loss": -0.074462890625, - "epoch": 0.2716, - "grad_norm": 1.2194234268521216, - "k1_kl": 0.0477294921875, - "k3_kl": 0.028228759765625, - "kimi_kl": 0.07080078125, - "learning_rate": 3.642e-07, - "loss": 0.0011, - "ppl": 0.03643798828125, - "reward": 0.9710698425769806, - "reward_std": 0.0008976560493465513, - "rewards/perpo_ocr_edit_distance_reward": 0.9710699319839478, + "advantages": -0.0005960464477539062, + "completion_length": 561.0, + "delta_ref_entropy_loss": 0.05224609375, + "delta_ref_ppl": -0.05224609375, + "entropy_loss": -0.0120849609375, + "epoch": 0.1358, + "grad_norm": 0.017412377664092888, + "k1_kl": 0.052490234375, + "k3_kl": 0.02978515625, + "kimi_kl": 0.10107421875, + "learning_rate": 4.3209999999999996e-07, + "loss": 0.0018, + "ppl": 0.003387451171875, + "reward": 0.9952983856201172, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.995298445224762, "step": 679, "temperature": 0.9 }, { - "advantages": -3.909745544206089e-05, - "completion_length": 972.5, - "delta_ref_entropy_loss": 0.0565185546875, - "delta_ref_ppl": -0.03515625, - "entropy_loss": -0.0487060546875, - "epoch": 0.272, - "grad_norm": 0.8691340538972572, - "k1_kl": 0.03509521484375, - "k3_kl": 0.0186767578125, - "kimi_kl": 0.03948974609375, - "learning_rate": 3.64e-07, - "loss": 0.0008, - "ppl": 0.0264892578125, - "reward": 0.9321056306362152, - "reward_std": 0.005570474255364388, - "rewards/perpo_ocr_edit_distance_reward": 0.9321057200431824, + "advantages": -0.00016344446339644492, + "completion_length": 576.0, + "delta_ref_entropy_loss": 0.0546875, + "delta_ref_ppl": -0.04833984375, + "entropy_loss": -0.0177001953125, + "epoch": 0.136, + "grad_norm": 0.30977232347388445, + "k1_kl": 0.04833984375, + "k3_kl": 0.02490234375, + "kimi_kl": 0.0625, + "learning_rate": 4.3199999999999995e-07, + "loss": 0.0012, + "ppl": 0.004913330078125, + "reward": 0.995693027973175, + "reward_std": 0.00016049259284045547, + "rewards/perpo_ocr_edit_distance_reward": 0.9956930875778198, "step": 680, "temperature": 0.9 }, { - "advantages": -0.0004470348358154297, - "completion_length": 598.5, - "delta_ref_entropy_loss": 0.022796630859375, - "delta_ref_ppl": -0.020477294921875, - "entropy_loss": -0.0223388671875, - "epoch": 0.2724, - "grad_norm": 0.024802853419884818, - "k1_kl": 0.020477294921875, - "k3_kl": 0.01116943359375, - "kimi_kl": 0.024444580078125, - "learning_rate": 3.638e-07, - "loss": 0.0009, - "ppl": 0.0083770751953125, - "reward": 0.6437660902738571, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.6437661349773407, + "advantages": -6.722552643623203e-05, + "completion_length": 369.0, + "delta_ref_entropy_loss": 0.05517578125, + "delta_ref_ppl": -0.091796875, + "entropy_loss": -0.0244140625, + "epoch": 0.1362, + "grad_norm": 0.5282920901776146, + "k1_kl": 0.091796875, + "k3_kl": 0.057373046875, + "kimi_kl": 0.162109375, + "learning_rate": 4.319e-07, + "loss": 0.0024, + "ppl": 0.00946044921875, + "reward": 0.9898382425308228, + "reward_std": 0.0005333416629582644, + "rewards/perpo_ocr_edit_distance_reward": 0.9898383021354675, "step": 681, "temperature": 0.9 }, { - "advantages": -0.0003033535822396516, - "completion_length": 505.5, - "delta_ref_entropy_loss": 0.04180908203125, - "delta_ref_ppl": -0.024658203125, - "entropy_loss": -0.0330963134765625, - "epoch": 0.2728, - "grad_norm": 0.7406771475022609, - "k1_kl": 0.02459716796875, - "k3_kl": 0.01519775390625, - "kimi_kl": 0.0369873046875, - "learning_rate": 3.6359999999999995e-07, - "loss": 0.0009, - "ppl": 0.01892852783203125, - "reward": 0.9967912137508392, - "reward_std": 0.0003498285950627178, - "rewards/perpo_ocr_edit_distance_reward": 0.9967912435531616, + "advantages": -0.0001145601345342584, + "completion_length": 731.0, + "delta_ref_entropy_loss": 0.041015625, + "delta_ref_ppl": -0.068359375, + "entropy_loss": -0.0284423828125, + "epoch": 0.1364, + "grad_norm": 0.6865753408744035, + "k1_kl": 0.068359375, + "k3_kl": 0.04638671875, + "kimi_kl": 0.1337890625, + "learning_rate": 4.318e-07, + "loss": 0.002, + "ppl": 0.0133056640625, + "reward": 0.9957996010780334, + "reward_std": 0.0005689185345545411, + "rewards/perpo_ocr_edit_distance_reward": 0.995799720287323, "step": 682, "temperature": 0.9 }, { - "advantages": -5.597940798907075e-05, - "completion_length": 687.5, - "delta_ref_entropy_loss": 0.02490234375, - "delta_ref_ppl": -0.01849365234375, - "entropy_loss": -0.01959228515625, - "epoch": 0.2732, - "grad_norm": 0.4099235479073845, - "k1_kl": 0.01849365234375, - "k3_kl": 0.010833740234375, - "kimi_kl": 0.02276611328125, - "learning_rate": 3.634e-07, - "loss": 0.0005, - "ppl": 0.0091552734375, - "reward": 0.9977173507213593, - "reward_std": 0.0004304722242522985, - "rewards/perpo_ocr_edit_distance_reward": 0.9977173805236816, + "advantages": 0.0, + "completion_length": 717.0, + "delta_ref_entropy_loss": 0.05859375, + "delta_ref_ppl": -0.046875, + "entropy_loss": -0.03662109375, + "epoch": 0.1366, + "grad_norm": 0.5686023982487541, + "k1_kl": 0.046875, + "k3_kl": 0.0234375, + "kimi_kl": 0.054443359375, + "learning_rate": 4.3169999999999993e-07, + "loss": 0.0009, + "ppl": 0.0145263671875, + "reward": 0.9888812303543091, + "reward_std": 0.0005346604739315808, + "rewards/perpo_ocr_edit_distance_reward": 0.9888812899589539, "step": 683, "temperature": 0.9 }, { - "advantages": -0.00029948779524602287, - "completion_length": 494.0, - "delta_ref_entropy_loss": 0.066650390625, - "delta_ref_ppl": -0.03753662109375, - "entropy_loss": -0.09521484375, - "epoch": 0.2736, - "grad_norm": 1.9025757654929207, - "k1_kl": 0.03759765625, - "k3_kl": 0.0165252685546875, - "kimi_kl": 0.028106689453125, - "learning_rate": 3.632e-07, - "loss": 0.001, - "ppl": 0.048702239990234375, - "reward": 0.9565906524658203, - "reward_std": 0.005761896260082722, - "rewards/perpo_ocr_edit_distance_reward": 0.9565906822681427, + "advantages": -6.7268101702211425e-06, + "completion_length": 758.0, + "delta_ref_entropy_loss": 0.06298828125, + "delta_ref_ppl": -0.05615234375, + "entropy_loss": -0.045166015625, + "epoch": 0.1368, + "grad_norm": 1.0429324714619306, + "k1_kl": 0.05615234375, + "k3_kl": 0.0301513671875, + "kimi_kl": 0.0703125, + "learning_rate": 4.316e-07, + "loss": 0.0012, + "ppl": 0.021240234375, + "reward": 0.981957733631134, + "reward_std": 0.0024366583675146103, + "rewards/perpo_ocr_edit_distance_reward": 0.9819577932357788, "step": 684, "temperature": 0.9 }, { - "advantages": -0.00010700311395339668, - "completion_length": 714.5, - "delta_ref_entropy_loss": 0.02740478515625, - "delta_ref_ppl": -0.014739990234375, - "entropy_loss": -0.020355224609375, - "epoch": 0.274, - "grad_norm": 0.4316838304570907, - "k1_kl": 0.01470947265625, - "k3_kl": 0.0059967041015625, - "kimi_kl": 0.010986328125, - "learning_rate": 3.6299999999999995e-07, - "loss": 0.0003, - "ppl": 0.0086669921875, - "reward": 0.9962607324123383, - "reward_std": 0.00038226446486078203, - "rewards/perpo_ocr_edit_distance_reward": 0.9962607622146606, + "advantages": 2.1406583982752636e-05, + "completion_length": 464.0, + "delta_ref_entropy_loss": 0.109375, + "delta_ref_ppl": -0.07568359375, + "entropy_loss": -0.07763671875, + "epoch": 0.137, + "grad_norm": 0.9175595453173463, + "k1_kl": 0.07568359375, + "k3_kl": 0.03759765625, + "kimi_kl": 0.08642578125, + "learning_rate": 4.3149999999999997e-07, + "loss": 0.0015, + "ppl": 0.043701171875, + "reward": 0.9655110836029053, + "reward_std": 0.0006941571482457221, + "rewards/perpo_ocr_edit_distance_reward": 0.9655110836029053, "step": 685, "temperature": 0.9 }, { - "advantages": -2.970014429592993e-05, - "completion_length": 454.5, - "delta_ref_entropy_loss": 0.0247802734375, - "delta_ref_ppl": -0.015472412109375, - "entropy_loss": -0.017333984375, - "epoch": 0.2744, - "grad_norm": 0.27117789262862857, - "k1_kl": 0.015533447265625, - "k3_kl": 0.0090484619140625, - "kimi_kl": 0.023345947265625, - "learning_rate": 3.628e-07, - "loss": 0.0004, - "ppl": 0.0075836181640625, - "reward": 0.9804844856262207, - "reward_std": 0.0003800044651143253, - "rewards/perpo_ocr_edit_distance_reward": 0.9804845154285431, + "advantages": -0.00010550022852839902, + "completion_length": 552.0, + "delta_ref_entropy_loss": 0.056396484375, + "delta_ref_ppl": -0.05712890625, + "entropy_loss": -0.0189208984375, + "epoch": 0.1372, + "grad_norm": 0.5084231518758661, + "k1_kl": 0.05712890625, + "k3_kl": 0.03173828125, + "kimi_kl": 0.095703125, + "learning_rate": 4.314e-07, + "loss": 0.0014, + "ppl": 0.007781982421875, + "reward": 0.996691882610321, + "reward_std": 0.0005457174847833812, + "rewards/perpo_ocr_edit_distance_reward": 0.9966919422149658, "step": 686, "temperature": 0.9 }, { - "advantages": -0.00042188167572021484, - "completion_length": 384.5, - "delta_ref_entropy_loss": 0.03399658203125, - "delta_ref_ppl": -0.02593994140625, - "entropy_loss": -0.02093505859375, - "epoch": 0.2748, - "grad_norm": 0.21553485353822263, - "k1_kl": 0.02593994140625, - "k3_kl": 0.0141448974609375, - "kimi_kl": 0.0364532470703125, - "learning_rate": 3.626e-07, - "loss": 0.001, - "ppl": 0.008148193359375, - "reward": 0.9955846667289734, - "reward_std": 0.00017341886996291578, - "rewards/perpo_ocr_edit_distance_reward": 0.9955847561359406, + "advantages": -2.3935523131513037e-05, + "completion_length": 234.0, + "delta_ref_entropy_loss": 0.04736328125, + "delta_ref_ppl": -0.171875, + "entropy_loss": -0.0693359375, + "epoch": 0.1374, + "grad_norm": 2.069996726132625, + "k1_kl": 0.1728515625, + "k3_kl": 0.130859375, + "kimi_kl": 0.451171875, + "learning_rate": 4.313e-07, + "loss": 0.0053, + "ppl": 0.031494140625, + "reward": 0.974233865737915, + "reward_std": 0.0016793982358649373, + "rewards/perpo_ocr_edit_distance_reward": 0.9742339253425598, "step": 687, "temperature": 0.9 }, { - "advantages": -3.176076279487461e-05, - "completion_length": 491.0, - "delta_ref_entropy_loss": 0.01361083984375, - "delta_ref_ppl": -0.0577392578125, - "entropy_loss": -0.023651123046875, - "epoch": 0.2752, - "grad_norm": 0.25401330515097625, - "k1_kl": 0.0577392578125, - "k3_kl": 0.0379638671875, - "kimi_kl": 0.090087890625, - "learning_rate": 3.6239999999999996e-07, - "loss": 0.0016, - "ppl": 0.0104827880859375, - "reward": 0.9995082020759583, - "reward_std": 0.00028514681616798043, - "rewards/perpo_ocr_edit_distance_reward": 0.999508261680603, + "advantages": 6.011554432916455e-06, + "completion_length": 1534.0, + "delta_ref_entropy_loss": 0.03173828125, + "delta_ref_ppl": -0.0247802734375, + "entropy_loss": -0.03076171875, + "epoch": 0.1376, + "grad_norm": 0.744865765855893, + "k1_kl": 0.0247802734375, + "k3_kl": 0.0145263671875, + "kimi_kl": 0.0296630859375, + "learning_rate": 4.312e-07, + "loss": 0.0006, + "ppl": 0.0128173828125, + "reward": 0.9250033497810364, + "reward_std": 0.0027333947364240885, + "rewards/perpo_ocr_edit_distance_reward": 0.9250033497810364, "step": 688, "temperature": 0.9 }, { - "advantages": 9.18763021218183e-06, - "completion_length": 678.0, - "delta_ref_entropy_loss": 0.063720703125, - "delta_ref_ppl": -0.029541015625, - "entropy_loss": -0.0611572265625, - "epoch": 0.2756, - "grad_norm": 0.6041620262141596, - "k1_kl": 0.0296630859375, - "k3_kl": 0.012451171875, - "kimi_kl": 0.025177001953125, - "learning_rate": 3.622e-07, - "loss": 0.0005, - "ppl": 0.03125, - "reward": 0.9856033027172089, - "reward_std": 0.001389801676850766, - "rewards/perpo_ocr_edit_distance_reward": 0.9856033027172089, + "advantages": -5.005513230571523e-05, + "completion_length": 696.0, + "delta_ref_entropy_loss": 0.0556640625, + "delta_ref_ppl": -0.0595703125, + "entropy_loss": -0.032958984375, + "epoch": 0.1378, + "grad_norm": 0.4618670605815481, + "k1_kl": 0.059814453125, + "k3_kl": 0.03271484375, + "kimi_kl": 0.0703125, + "learning_rate": 4.311e-07, + "loss": 0.0014, + "ppl": 0.01806640625, + "reward": 0.9880324006080627, + "reward_std": 0.0014314781874418259, + "rewards/perpo_ocr_edit_distance_reward": 0.9880325198173523, "step": 689, "temperature": 0.9 }, { - "advantages": -3.5975662740383996e-05, - "completion_length": 597.5, - "delta_ref_entropy_loss": 0.070556640625, - "delta_ref_ppl": -0.05517578125, - "entropy_loss": -0.0545654296875, - "epoch": 0.276, - "grad_norm": 3.0245829431390345, - "k1_kl": 0.054931640625, - "k3_kl": 0.0262451171875, - "kimi_kl": 0.042266845703125, - "learning_rate": 3.62e-07, - "loss": 0.0011, - "ppl": 0.0302734375, - "reward": 0.9519793689250946, - "reward_std": 0.0018100012675859034, - "rewards/perpo_ocr_edit_distance_reward": 0.9519794881343842, + "advantages": -3.262928657932207e-05, + "completion_length": 491.0, + "delta_ref_entropy_loss": 0.11328125, + "delta_ref_ppl": -0.0791015625, + "entropy_loss": -0.08447265625, + "epoch": 0.138, + "grad_norm": 1.2394153825983414, + "k1_kl": 0.0791015625, + "k3_kl": 0.040771484375, + "kimi_kl": 0.109375, + "learning_rate": 4.31e-07, + "loss": 0.0017, + "ppl": 0.047119140625, + "reward": 0.8472722768783569, + "reward_std": 0.001204548287205398, + "rewards/perpo_ocr_edit_distance_reward": 0.8472723364830017, "step": 690, "temperature": 0.9 }, { - "advantages": -6.871777077321894e-05, - "completion_length": 557.0, - "delta_ref_entropy_loss": 0.02093505859375, - "delta_ref_ppl": -0.0247344970703125, - "entropy_loss": -0.013916015625, - "epoch": 0.2764, - "grad_norm": 0.8054413390353545, - "k1_kl": 0.0247344970703125, - "k3_kl": 0.0155181884765625, - "kimi_kl": 0.032012939453125, - "learning_rate": 3.6179999999999997e-07, - "loss": 0.0007, - "ppl": 0.0069122314453125, - "reward": 0.9983832836151123, - "reward_std": 0.000523807480931282, - "rewards/perpo_ocr_edit_distance_reward": 0.9983833432197571, + "advantages": -4.300049567973474e-06, + "completion_length": 35.0, + "delta_ref_entropy_loss": 0.2099609375, + "delta_ref_ppl": -0.71875, + "entropy_loss": -0.2021484375, + "epoch": 0.1382, + "grad_norm": 6.833692170483994, + "k1_kl": 0.71875, + "k3_kl": 0.57421875, + "kimi_kl": 1.9140625, + "learning_rate": 4.309e-07, + "loss": 0.0229, + "ppl": 0.09521484375, + "reward": 0.5957522988319397, + "reward_std": 0.005803518462926149, + "rewards/perpo_ocr_edit_distance_reward": 0.5957523584365845, "step": 691, "temperature": 0.9 }, { - "advantages": -9.609972039470449e-05, - "completion_length": 817.0, - "delta_ref_entropy_loss": 0.021728515625, - "delta_ref_ppl": -0.0360107421875, - "entropy_loss": -0.0306396484375, - "epoch": 0.2768, - "grad_norm": 0.21944627525714136, - "k1_kl": 0.036102294921875, - "k3_kl": 0.025543212890625, - "kimi_kl": 0.08990478515625, - "learning_rate": 3.6159999999999996e-07, - "loss": 0.0011, - "ppl": 0.0140380859375, - "reward": 0.9985882341861725, - "reward_std": 0.0001936581829795614, - "rewards/perpo_ocr_edit_distance_reward": 0.9985883235931396, + "advantages": -1.9686563973664306e-05, + "completion_length": 353.0, + "delta_ref_entropy_loss": 0.130859375, + "delta_ref_ppl": -0.09765625, + "entropy_loss": -0.095703125, + "epoch": 0.1384, + "grad_norm": 1.1529078511134392, + "k1_kl": 0.09765625, + "k3_kl": 0.058349609375, + "kimi_kl": 0.13671875, + "learning_rate": 4.308e-07, + "loss": 0.0024, + "ppl": 0.048828125, + "reward": 0.965785026550293, + "reward_std": 0.0007657886017113924, + "rewards/perpo_ocr_edit_distance_reward": 0.9657850861549377, "step": 692, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 778.0, - "delta_ref_entropy_loss": 0.017181396484375, - "delta_ref_ppl": -0.0071258544921875, - "entropy_loss": -0.013397216796875, - "epoch": 0.2772, - "grad_norm": 0.19588855260820287, - "k1_kl": 0.0071258544921875, - "k3_kl": 0.003265380859375, - "kimi_kl": 0.0058441162109375, - "learning_rate": 3.614e-07, - "loss": 0.0004, - "ppl": 0.0052947998046875, - "reward": 0.9990808367729187, - "reward_std": 0.00012923662143293768, - "rewards/perpo_ocr_edit_distance_reward": 0.9990808963775635, + "advantages": 6.079674221837195e-06, + "completion_length": 82.0, + "delta_ref_entropy_loss": 0.041015625, + "delta_ref_ppl": -0.3046875, + "entropy_loss": -0.062255859375, + "epoch": 0.1386, + "grad_norm": 3.0955254448140317, + "k1_kl": 0.3046875, + "k3_kl": 0.236328125, + "kimi_kl": 0.78515625, + "learning_rate": 4.307e-07, + "loss": 0.0095, + "ppl": 0.01556396484375, + "reward": 0.9632652401924133, + "reward_std": 0.0026997686363756657, + "rewards/perpo_ocr_edit_distance_reward": 0.9632652997970581, "step": 693, "temperature": 0.9 }, { - "advantages": -6.301062853708572e-07, - "completion_length": 474.0, - "delta_ref_entropy_loss": 0.0601806640625, - "delta_ref_ppl": -0.034454345703125, - "entropy_loss": -0.08343505859375, - "epoch": 0.2776, - "grad_norm": 1.4263408577435315, - "k1_kl": 0.034454345703125, - "k3_kl": 0.015392303466796875, - "kimi_kl": 0.02605438232421875, - "learning_rate": 3.612e-07, - "loss": 0.0006, - "ppl": 0.04205322265625, - "reward": 0.8601981997489929, - "reward_std": 0.013578386977314949, - "rewards/perpo_ocr_edit_distance_reward": 0.8601981997489929, + "advantages": -7.595334864163306e-06, + "completion_length": 430.0, + "delta_ref_entropy_loss": 0.1455078125, + "delta_ref_ppl": -0.1279296875, + "entropy_loss": -0.1484375, + "epoch": 0.1388, + "grad_norm": 1.6317253905054265, + "k1_kl": 0.126953125, + "k3_kl": 0.0751953125, + "kimi_kl": 0.1640625, + "learning_rate": 4.3059999999999995e-07, + "loss": 0.003, + "ppl": 0.0849609375, + "reward": 0.9479995965957642, + "reward_std": 0.0021398121025413275, + "rewards/perpo_ocr_edit_distance_reward": 0.9479996562004089, "step": 694, "temperature": 0.9 }, { - "advantages": -0.00017596994985069614, - "completion_length": 553.5, - "delta_ref_entropy_loss": 0.02703857421875, - "delta_ref_ppl": -0.013946533203125, - "entropy_loss": -0.02117919921875, - "epoch": 0.278, - "grad_norm": 0.45644584278889183, - "k1_kl": 0.014007568359375, - "k3_kl": 0.006683349609375, - "kimi_kl": 0.01251220703125, - "learning_rate": 3.6099999999999996e-07, - "loss": 0.0004, - "ppl": 0.00951385498046875, - "reward": 0.9885147511959076, - "reward_std": 0.0013197078296798281, - "rewards/perpo_ocr_edit_distance_reward": 0.9885148406028748, + "advantages": -5.722046353184851e-06, + "completion_length": 822.0, + "delta_ref_entropy_loss": 0.025390625, + "delta_ref_ppl": -0.0281982421875, + "entropy_loss": -0.0245361328125, + "epoch": 0.139, + "grad_norm": 0.6499599285083973, + "k1_kl": 0.0283203125, + "k3_kl": 0.017578125, + "kimi_kl": 0.0478515625, + "learning_rate": 4.305e-07, + "loss": 0.0007, + "ppl": 0.01312255859375, + "reward": 0.9005204439163208, + "reward_std": 0.004353792872279882, + "rewards/perpo_ocr_edit_distance_reward": 0.9005205631256104, "step": 695, "temperature": 0.9 }, { - "advantages": -2.9853413641589555e-05, - "completion_length": 794.0, - "delta_ref_entropy_loss": 0.063232421875, - "delta_ref_ppl": -0.0379638671875, - "entropy_loss": -0.072998046875, - "epoch": 0.2784, - "grad_norm": 1.641782679265268, - "k1_kl": 0.037841796875, - "k3_kl": 0.01995849609375, - "kimi_kl": 0.0511474609375, - "learning_rate": 3.608e-07, - "loss": 0.0008, - "ppl": 0.03857421875, - "reward": 0.9572584629058838, - "reward_std": 0.004874969978118315, - "rewards/perpo_ocr_edit_distance_reward": 0.9572585225105286, + "advantages": -1.2295587112021167e-05, + "completion_length": 566.0, + "delta_ref_entropy_loss": 0.0888671875, + "delta_ref_ppl": -0.05517578125, + "entropy_loss": -0.049072265625, + "epoch": 0.1392, + "grad_norm": 0.9602558213806616, + "k1_kl": 0.054931640625, + "k3_kl": 0.0303955078125, + "kimi_kl": 0.0654296875, + "learning_rate": 4.304e-07, + "loss": 0.0012, + "ppl": 0.028564453125, + "reward": 0.9783052802085876, + "reward_std": 0.008209814317524433, + "rewards/perpo_ocr_edit_distance_reward": 0.9783053398132324, "step": 696, "temperature": 0.9 }, { - "advantages": -2.7826854875456775e-05, - "completion_length": 581.5, - "delta_ref_entropy_loss": 0.0474853515625, - "delta_ref_ppl": -0.0357666015625, - "entropy_loss": -0.04705810546875, - "epoch": 0.2788, - "grad_norm": 0.5784029217649527, - "k1_kl": 0.0357666015625, - "k3_kl": 0.0191650390625, - "kimi_kl": 0.054443359375, - "learning_rate": 3.6059999999999993e-07, - "loss": 0.0008, - "ppl": 0.021148681640625, - "reward": 0.9859402477741241, - "reward_std": 0.004678681638324633, - "rewards/perpo_ocr_edit_distance_reward": 0.9859403371810913, + "advantages": -0.00041197880636900663, + "completion_length": 569.0, + "delta_ref_entropy_loss": 0.039794921875, + "delta_ref_ppl": -0.043212890625, + "entropy_loss": -0.0198974609375, + "epoch": 0.1394, + "grad_norm": 0.7804517327225291, + "k1_kl": 0.043212890625, + "k3_kl": 0.0279541015625, + "kimi_kl": 0.1103515625, + "learning_rate": 4.303e-07, + "loss": 0.0015, + "ppl": 0.006378173828125, + "reward": 0.9987854957580566, + "reward_std": 0.0001893525040941313, + "rewards/perpo_ocr_edit_distance_reward": 0.9987856149673462, "step": 697, "temperature": 0.9 }, { - "advantages": -4.8620361667417455e-05, - "completion_length": 719.0, - "delta_ref_entropy_loss": 0.0184326171875, - "delta_ref_ppl": -0.009613037109375, - "entropy_loss": -0.02130126953125, - "epoch": 0.2792, - "grad_norm": 0.5032316736238153, - "k1_kl": 0.009613037109375, - "k3_kl": 0.004547119140625, - "kimi_kl": 0.0085601806640625, - "learning_rate": 3.6039999999999997e-07, - "loss": 0.0002, - "ppl": 0.010040283203125, - "reward": 0.9988557398319244, - "reward_std": 0.0010314087267033756, - "rewards/perpo_ocr_edit_distance_reward": 0.9988557696342468, + "advantages": 1.7029899268550253e-08, + "completion_length": 1436.0, + "delta_ref_entropy_loss": 0.0242919921875, + "delta_ref_ppl": -0.023681640625, + "entropy_loss": -0.030517578125, + "epoch": 0.1396, + "grad_norm": 0.4718583726535211, + "k1_kl": 0.023681640625, + "k3_kl": 0.01416015625, + "kimi_kl": 0.031982421875, + "learning_rate": 4.3020000000000003e-07, + "loss": 0.0006, + "ppl": 0.01544189453125, + "reward": 0.9904683828353882, + "reward_std": 0.0034487976226955652, + "rewards/perpo_ocr_edit_distance_reward": 0.9904683828353882, "step": 698, "temperature": 0.9 }, { - "advantages": -0.00029758044649952353, - "completion_length": 756.5, - "delta_ref_entropy_loss": 0.0341796875, - "delta_ref_ppl": -0.024169921875, - "entropy_loss": -0.039886474609375, - "epoch": 0.2796, - "grad_norm": 1.3036238259748654, - "k1_kl": 0.024200439453125, - "k3_kl": 0.01371002197265625, - "kimi_kl": 0.0249176025390625, - "learning_rate": 3.602e-07, - "loss": 0.0008, - "ppl": 0.020660400390625, - "reward": 0.9440478086471558, - "reward_std": 0.019498707726597786, - "rewards/perpo_ocr_edit_distance_reward": 0.9440478086471558, + "advantages": 9.366444686520481e-08, + "completion_length": 184.0, + "delta_ref_entropy_loss": 0.205078125, + "delta_ref_ppl": -0.392578125, + "entropy_loss": -0.310546875, + "epoch": 0.1398, + "grad_norm": 7.815746928112047, + "k1_kl": 0.390625, + "k3_kl": 0.296875, + "kimi_kl": 1.1015625, + "learning_rate": 4.3009999999999997e-07, + "loss": 0.0119, + "ppl": 0.12353515625, + "reward": 0.5369823575019836, + "reward_std": 0.15103913843631744, + "rewards/perpo_ocr_edit_distance_reward": 0.5369823575019836, "step": 699, "temperature": 0.9 }, { - "advantages": -0.0001106177078327164, - "completion_length": 687.5, - "delta_ref_entropy_loss": 0.019012451171875, - "delta_ref_ppl": -0.017791748046875, - "entropy_loss": -0.017578125, - "epoch": 0.28, - "grad_norm": 0.17922688349646532, - "k1_kl": 0.017822265625, - "k3_kl": 0.01239013671875, - "kimi_kl": 0.03533935546875, - "learning_rate": 3.6e-07, - "loss": 0.0006, - "ppl": 0.00794219970703125, - "reward": 0.9989729821681976, - "reward_std": 8.471130422549322e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9989730417728424, + "advantages": 9.707042408990674e-06, + "completion_length": 1099.0, + "delta_ref_entropy_loss": 0.0693359375, + "delta_ref_ppl": -0.060791015625, + "entropy_loss": -0.04052734375, + "epoch": 0.14, + "grad_norm": 0.8180780902244352, + "k1_kl": 0.060546875, + "k3_kl": 0.0294189453125, + "kimi_kl": 0.06396484375, + "learning_rate": 4.2999999999999996e-07, + "loss": 0.0012, + "ppl": 0.0189208984375, + "reward": 0.9893510341644287, + "reward_std": 0.0007761811139062047, + "rewards/perpo_ocr_edit_distance_reward": 0.9893510341644287, "step": 700, "temperature": 0.9 }, { - "advantages": 2.3458687792299315e-06, - "completion_length": 758.5, - "delta_ref_entropy_loss": 0.03753662109375, - "delta_ref_ppl": -0.0355224609375, - "entropy_loss": -0.04736328125, - "epoch": 0.2804, - "grad_norm": 0.7439089446322094, - "k1_kl": 0.03564453125, - "k3_kl": 0.0234375, - "kimi_kl": 0.060302734375, - "learning_rate": 3.598e-07, - "loss": 0.0009, - "ppl": 0.026611328125, - "reward": 0.9832414090633392, - "reward_std": 0.001963629329111427, - "rewards/perpo_ocr_edit_distance_reward": 0.9832414090633392, + "advantages": -0.00019310202333144844, + "completion_length": 544.0, + "delta_ref_entropy_loss": 0.05029296875, + "delta_ref_ppl": -0.047607421875, + "entropy_loss": -0.0247802734375, + "epoch": 0.1402, + "grad_norm": 0.6364062573398247, + "k1_kl": 0.047607421875, + "k3_kl": 0.02587890625, + "kimi_kl": 0.0615234375, + "learning_rate": 4.299e-07, + "loss": 0.0012, + "ppl": 0.0084228515625, + "reward": 0.9933541417121887, + "reward_std": 0.0002527753822505474, + "rewards/perpo_ocr_edit_distance_reward": 0.993354320526123, "step": 701, "temperature": 0.9 }, { - "advantages": -5.756531755451988e-05, - "completion_length": 639.5, - "delta_ref_entropy_loss": 0.0223388671875, - "delta_ref_ppl": -0.02082061767578125, - "entropy_loss": -0.050506591796875, - "epoch": 0.2808, - "grad_norm": 1.2226588224717831, - "k1_kl": 0.02094268798828125, - "k3_kl": 0.013782501220703125, - "kimi_kl": 0.035602569580078125, - "learning_rate": 3.5959999999999996e-07, - "loss": 0.0006, - "ppl": 0.0236663818359375, - "reward": 0.7093799710273743, - "reward_std": 0.08253888941544574, - "rewards/perpo_ocr_edit_distance_reward": 0.7093800157308578, + "advantages": -5.615183545160107e-05, + "completion_length": 1136.0, + "delta_ref_entropy_loss": 0.0498046875, + "delta_ref_ppl": -0.056396484375, + "entropy_loss": -0.039306640625, + "epoch": 0.1404, + "grad_norm": 5.3075685242781026, + "k1_kl": 0.056396484375, + "k3_kl": 0.06982421875, + "kimi_kl": 0.09912109375, + "learning_rate": 4.298e-07, + "loss": 0.0028, + "ppl": 0.0235595703125, + "reward": 0.9910520911216736, + "reward_std": 0.001112962025217712, + "rewards/perpo_ocr_edit_distance_reward": 0.9910522103309631, "step": 702, "temperature": 0.9 }, { - "advantages": -7.57575089664897e-05, - "completion_length": 1145.5, - "delta_ref_entropy_loss": 0.018798828125, - "delta_ref_ppl": -0.0092620849609375, - "entropy_loss": -0.016387939453125, - "epoch": 0.2812, - "grad_norm": 11.515880898198414, - "k1_kl": 0.0092620849609375, - "k3_kl": 0.06964111328125, - "kimi_kl": 0.009307861328125, - "learning_rate": 3.594e-07, - "loss": 0.0029, - "ppl": 0.007965087890625, - "reward": 0.9958787560462952, - "reward_std": 0.0011185424082214013, - "rewards/perpo_ocr_edit_distance_reward": 0.9958788156509399, + "advantages": -5.46659748579259e-06, + "completion_length": 1257.0, + "delta_ref_entropy_loss": 0.11962890625, + "delta_ref_ppl": -0.07470703125, + "entropy_loss": -0.1796875, + "epoch": 0.1406, + "grad_norm": 1.6910014837479603, + "k1_kl": 0.07470703125, + "k3_kl": 0.0419921875, + "kimi_kl": 0.10498046875, + "learning_rate": 4.297e-07, + "loss": 0.0017, + "ppl": 0.10400390625, + "reward": 0.8802365660667419, + "reward_std": 0.013926868326961994, + "rewards/perpo_ocr_edit_distance_reward": 0.8802366852760315, "step": 703, "temperature": 0.9 }, { - "advantages": -0.0003241470894863596, - "completion_length": 524.0, - "delta_ref_entropy_loss": 0.02880859375, - "delta_ref_ppl": -0.03070068359375, - "entropy_loss": -0.0249176025390625, - "epoch": 0.2816, - "grad_norm": 0.4246500258286958, - "k1_kl": 0.03070068359375, - "k3_kl": 0.018829345703125, - "kimi_kl": 0.0526123046875, - "learning_rate": 3.592e-07, - "loss": 0.0011, - "ppl": 0.011943817138671875, - "reward": 0.9975383281707764, - "reward_std": 0.0005203615291975439, - "rewards/perpo_ocr_edit_distance_reward": 0.9975384175777435, + "advantages": -9.468623829889111e-06, + "completion_length": 948.0, + "delta_ref_entropy_loss": 0.0198974609375, + "delta_ref_ppl": -0.029296875, + "entropy_loss": -0.03173828125, + "epoch": 0.1408, + "grad_norm": 1.463729692894438, + "k1_kl": 0.029296875, + "k3_kl": 0.01806640625, + "kimi_kl": 0.0458984375, + "learning_rate": 4.296e-07, + "loss": 0.0007, + "ppl": 0.0146484375, + "reward": 0.9874089360237122, + "reward_std": 0.0017049235757440329, + "rewards/perpo_ocr_edit_distance_reward": 0.9874089360237122, "step": 704, "temperature": 0.9 }, { - "advantages": -0.0002981424331665039, - "completion_length": 1242.0, - "delta_ref_entropy_loss": 0.02783203125, - "delta_ref_ppl": -0.014373779296875, - "entropy_loss": -0.03265380859375, - "epoch": 0.282, - "grad_norm": 0.4742149144638652, - "k1_kl": 0.014373779296875, - "k3_kl": 0.00699615478515625, - "kimi_kl": 0.01306915283203125, - "learning_rate": 3.5899999999999997e-07, - "loss": 0.0006, - "ppl": 0.015106201171875, - "reward": 0.9523080289363861, - "reward_std": 0.10194966942071915, - "rewards/perpo_ocr_edit_distance_reward": 0.9523081481456757, + "advantages": -9.194442827720195e-05, + "completion_length": 390.0, + "delta_ref_entropy_loss": 0.10400390625, + "delta_ref_ppl": -0.08056640625, + "entropy_loss": -0.052490234375, + "epoch": 0.141, + "grad_norm": 0.9477454344512908, + "k1_kl": 0.0810546875, + "k3_kl": 0.0458984375, + "kimi_kl": 0.115234375, + "learning_rate": 4.295e-07, + "loss": 0.0019, + "ppl": 0.0213623046875, + "reward": 0.9859600067138672, + "reward_std": 0.0008262729970738292, + "rewards/perpo_ocr_edit_distance_reward": 0.985960066318512, "step": 705, "temperature": 0.9 }, { - "advantages": -1.4083726455282886e-05, - "completion_length": 414.0, - "delta_ref_entropy_loss": 0.049072265625, - "delta_ref_ppl": -0.03704833984375, - "entropy_loss": -0.03155517578125, - "epoch": 0.2824, - "grad_norm": 0.4136275552793774, - "k1_kl": 0.03704833984375, - "k3_kl": 0.019287109375, - "kimi_kl": 0.044677734375, - "learning_rate": 3.588e-07, - "loss": 0.0008, - "ppl": 0.016387939453125, - "reward": 0.9889766573905945, - "reward_std": 0.0017650802619755268, - "rewards/perpo_ocr_edit_distance_reward": 0.9889767169952393, + "advantages": -1.0839530659723096e-05, + "completion_length": 939.0, + "delta_ref_entropy_loss": 0.06298828125, + "delta_ref_ppl": -0.06640625, + "entropy_loss": -0.061279296875, + "epoch": 0.1412, + "grad_norm": 0.9545507437507388, + "k1_kl": 0.06591796875, + "k3_kl": 0.03662109375, + "kimi_kl": 0.09716796875, + "learning_rate": 4.2939999999999997e-07, + "loss": 0.0015, + "ppl": 0.0296630859375, + "reward": 0.3146783113479614, + "reward_std": 0.0014717074809595942, + "rewards/perpo_ocr_edit_distance_reward": 0.3146783411502838, "step": 706, "temperature": 0.9 }, { - "advantages": -1.01327898960335e-06, - "completion_length": 531.0, - "delta_ref_entropy_loss": 0.0689697265625, - "delta_ref_ppl": -0.0499267578125, - "entropy_loss": -0.1617431640625, - "epoch": 0.2828, - "grad_norm": 2.7533408315796133, - "k1_kl": 0.05029296875, - "k3_kl": 0.02850341796875, - "kimi_kl": 0.067138671875, - "learning_rate": 3.5859999999999994e-07, - "loss": 0.0011, - "ppl": 0.0859375, - "reward": 0.8195511102676392, - "reward_std": 0.040952566312626004, - "rewards/perpo_ocr_edit_distance_reward": 0.8195511400699615, + "advantages": -6.897109415149316e-05, + "completion_length": 348.0, + "delta_ref_entropy_loss": 0.035400390625, + "delta_ref_ppl": -0.054931640625, + "entropy_loss": -0.026123046875, + "epoch": 0.1414, + "grad_norm": 0.9257290594836314, + "k1_kl": 0.0546875, + "k3_kl": 0.035888671875, + "kimi_kl": 0.099609375, + "learning_rate": 4.293e-07, + "loss": 0.0015, + "ppl": 0.0081787109375, + "reward": 0.959579586982727, + "reward_std": 0.0010112940799444914, + "rewards/perpo_ocr_edit_distance_reward": 0.9595796465873718, "step": 707, "temperature": 0.9 }, { - "advantages": -0.00030000720721545804, - "completion_length": 328.0, - "delta_ref_entropy_loss": 0.03582763671875, - "delta_ref_ppl": -0.021331787109375, - "entropy_loss": -0.01666259765625, - "epoch": 0.2832, - "grad_norm": 0.5413416224890205, - "k1_kl": 0.021392822265625, - "k3_kl": 0.0106201171875, - "kimi_kl": 0.0235595703125, - "learning_rate": 3.584e-07, - "loss": 0.0007, - "ppl": 0.0084381103515625, - "reward": 0.9962492287158966, - "reward_std": 0.0010287570767104626, - "rewards/perpo_ocr_edit_distance_reward": 0.996249258518219, + "advantages": -1.1989049198746216e-05, + "completion_length": 424.0, + "delta_ref_entropy_loss": 0.06787109375, + "delta_ref_ppl": -0.06201171875, + "entropy_loss": -0.057861328125, + "epoch": 0.1416, + "grad_norm": 0.7682592000155115, + "k1_kl": 0.062255859375, + "k3_kl": 0.033447265625, + "kimi_kl": 0.08154296875, + "learning_rate": 4.292e-07, + "loss": 0.0013, + "ppl": 0.026123046875, + "reward": 0.946064829826355, + "reward_std": 0.009853594936430454, + "rewards/perpo_ocr_edit_distance_reward": 0.9460650086402893, "step": 708, "temperature": 0.9 }, { - "advantages": -3.4604755796863174e-05, - "completion_length": 478.0, - "delta_ref_entropy_loss": 0.040771484375, - "delta_ref_ppl": -0.02685546875, - "entropy_loss": -0.0374755859375, - "epoch": 0.2836, - "grad_norm": 0.7017999829842008, - "k1_kl": 0.02679443359375, - "k3_kl": 0.01458740234375, - "kimi_kl": 0.030029296875, - "learning_rate": 3.582e-07, - "loss": 0.0006, - "ppl": 0.01800537109375, - "reward": 0.9872755706310272, - "reward_std": 0.002222639450337738, - "rewards/perpo_ocr_edit_distance_reward": 0.9872755706310272, + "advantages": -1.8732889373040962e-07, + "completion_length": 309.0, + "delta_ref_entropy_loss": 0.11962890625, + "delta_ref_ppl": -0.1220703125, + "entropy_loss": -0.0771484375, + "epoch": 0.1418, + "grad_norm": 0.7207956469435645, + "k1_kl": 0.1220703125, + "k3_kl": 0.07373046875, + "kimi_kl": 0.314453125, + "learning_rate": 4.2909999999999994e-07, + "loss": 0.0029, + "ppl": 0.029541015625, + "reward": 0.8897264003753662, + "reward_std": 0.25814077258110046, + "rewards/perpo_ocr_edit_distance_reward": 0.8897265195846558, "step": 709, "temperature": 0.9 }, { - "advantages": -6.130763722467236e-06, - "completion_length": 1036.5, - "delta_ref_entropy_loss": 0.0479736328125, - "delta_ref_ppl": -0.027099609375, - "entropy_loss": -0.08197021484375, - "epoch": 0.284, - "grad_norm": 0.7965528720541628, - "k1_kl": 0.0269775390625, - "k3_kl": 0.0133819580078125, - "kimi_kl": 0.023895263671875, - "learning_rate": 3.5799999999999995e-07, - "loss": 0.0005, - "ppl": 0.044403076171875, - "reward": 0.953557550907135, - "reward_std": 0.0013390418607741594, - "rewards/perpo_ocr_edit_distance_reward": 0.9535575807094574, + "advantages": -6.471361757576233e-07, + "completion_length": 1458.0, + "delta_ref_entropy_loss": 0.043212890625, + "delta_ref_ppl": -0.04248046875, + "entropy_loss": -0.055419921875, + "epoch": 0.142, + "grad_norm": 1.847314645744958, + "k1_kl": 0.042724609375, + "k3_kl": 0.03759765625, + "kimi_kl": 0.0791015625, + "learning_rate": 4.29e-07, + "loss": 0.0015, + "ppl": 0.0322265625, + "reward": 0.8676186203956604, + "reward_std": 0.09554863721132278, + "rewards/perpo_ocr_edit_distance_reward": 0.8676186800003052, "step": 710, "temperature": 0.9 }, { - "advantages": -9.583575547367218e-06, - "completion_length": 393.0, - "delta_ref_entropy_loss": 0.0799560546875, - "delta_ref_ppl": -0.07958984375, - "entropy_loss": -0.111328125, - "epoch": 0.2844, - "grad_norm": 5.710841373830624, - "k1_kl": 0.079833984375, - "k3_kl": 0.03985595703125, - "kimi_kl": 0.1024169921875, - "learning_rate": 3.578e-07, - "loss": 0.0016, - "ppl": 0.0699462890625, - "reward": 0.97173011302948, - "reward_std": 0.010115651995874941, - "rewards/perpo_ocr_edit_distance_reward": 0.9717301726341248, + "advantages": -1.3623919414840202e-07, + "completion_length": 217.0, + "delta_ref_entropy_loss": 0.1298828125, + "delta_ref_ppl": -0.1669921875, + "entropy_loss": -0.3046875, + "epoch": 0.1422, + "grad_norm": 5.0534629494526255, + "k1_kl": 0.1669921875, + "k3_kl": 0.1064453125, + "kimi_kl": 0.2578125, + "learning_rate": 4.289e-07, + "loss": 0.0043, + "ppl": 0.1376953125, + "reward": 0.288442462682724, + "reward_std": 0.23495766520500183, + "rewards/perpo_ocr_edit_distance_reward": 0.2884424924850464, "step": 711, "temperature": 0.9 }, { - "advantages": -1.348342266282998e-05, - "completion_length": 1130.0, - "delta_ref_entropy_loss": 0.04400634765625, - "delta_ref_ppl": -0.02154541015625, - "entropy_loss": -0.06884765625, - "epoch": 0.2848, - "grad_norm": 2.223857562129857, - "k1_kl": 0.0216064453125, - "k3_kl": 0.010833740234375, - "kimi_kl": 0.02008056640625, - "learning_rate": 3.5759999999999997e-07, - "loss": 0.0004, - "ppl": 0.03704833984375, - "reward": 0.982477992773056, - "reward_std": 0.004985877021681517, - "rewards/perpo_ocr_edit_distance_reward": 0.9824780523777008, + "advantages": 1.4151846698950976e-05, + "completion_length": 589.0, + "delta_ref_entropy_loss": 0.05859375, + "delta_ref_ppl": -0.05322265625, + "entropy_loss": -0.01708984375, + "epoch": 0.1424, + "grad_norm": 0.5247101627571195, + "k1_kl": 0.053466796875, + "k3_kl": 0.0281982421875, + "kimi_kl": 0.06640625, + "learning_rate": 4.288e-07, + "loss": 0.0011, + "ppl": 0.00634765625, + "reward": 0.9740689396858215, + "reward_std": 0.0011016594944521785, + "rewards/perpo_ocr_edit_distance_reward": 0.9740689992904663, "step": 712, "temperature": 0.9 }, { - "advantages": -2.0946775691754738e-06, - "completion_length": 870.5, - "delta_ref_entropy_loss": 0.02728271484375, - "delta_ref_ppl": -0.022216796875, - "entropy_loss": -0.06005859375, - "epoch": 0.2852, - "grad_norm": 1.7086173645840594, - "k1_kl": 0.02227783203125, - "k3_kl": 0.012725830078125, - "kimi_kl": 0.034637451171875, - "learning_rate": 3.5739999999999996e-07, - "loss": 0.0005, - "ppl": 0.0296630859375, - "reward": 0.866034984588623, - "reward_std": 0.05655797338113189, - "rewards/perpo_ocr_edit_distance_reward": 0.8660350143909454, + "advantages": -6.075416604289785e-05, + "completion_length": 806.0, + "delta_ref_entropy_loss": 0.08349609375, + "delta_ref_ppl": -0.05517578125, + "entropy_loss": -0.0250244140625, + "epoch": 0.1426, + "grad_norm": 0.4428641892291969, + "k1_kl": 0.05517578125, + "k3_kl": 0.0264892578125, + "kimi_kl": 0.05615234375, + "learning_rate": 4.287e-07, + "loss": 0.0011, + "ppl": 0.00726318359375, + "reward": 0.9903243780136108, + "reward_std": 0.0010213166242465377, + "rewards/perpo_ocr_edit_distance_reward": 0.9903244972229004, "step": 713, "temperature": 0.9 }, { - "advantages": -0.00034040212995023467, - "completion_length": 524.5, - "delta_ref_entropy_loss": 0.0228271484375, - "delta_ref_ppl": -0.015777587890625, - "entropy_loss": -0.0156097412109375, - "epoch": 0.2856, - "grad_norm": 0.2029330215697986, - "k1_kl": 0.0157470703125, - "k3_kl": 0.0073699951171875, - "kimi_kl": 0.015869140625, - "learning_rate": 3.572e-07, - "loss": 0.0006, - "ppl": 0.005802154541015625, - "reward": 0.9988729655742645, - "reward_std": 0.00015091717068571597, - "rewards/perpo_ocr_edit_distance_reward": 0.9988730251789093, + "advantages": -4.511220322456211e-05, + "completion_length": 51.0, + "delta_ref_entropy_loss": 0.058349609375, + "delta_ref_ppl": -0.55078125, + "entropy_loss": -0.06640625, + "epoch": 0.1428, + "grad_norm": 4.849398790553952, + "k1_kl": 0.5546875, + "k3_kl": 0.451171875, + "kimi_kl": 1.8515625, + "learning_rate": 4.2859999999999996e-07, + "loss": 0.0181, + "ppl": 0.0211181640625, + "reward": 0.9686211943626404, + "reward_std": 0.0023547846358269453, + "rewards/perpo_ocr_edit_distance_reward": 0.9686213135719299, "step": 714, "temperature": 0.9 }, { - "advantages": -0.0002733767014433397, - "completion_length": 811.0, - "delta_ref_entropy_loss": 0.02490234375, - "delta_ref_ppl": -0.014678955078125, - "entropy_loss": -0.018310546875, - "epoch": 0.286, - "grad_norm": 0.20468234484299042, - "k1_kl": 0.014678955078125, - "k3_kl": 0.00689697265625, - "kimi_kl": 0.01287841796875, - "learning_rate": 3.57e-07, - "loss": 0.0005, - "ppl": 0.00775146484375, - "reward": 0.9988729655742645, - "reward_std": 0.00020913375192321837, - "rewards/perpo_ocr_edit_distance_reward": 0.9988730251789093, + "advantages": -2.55448497910038e-07, + "completion_length": 376.0, + "delta_ref_entropy_loss": 0.140625, + "delta_ref_ppl": -0.119140625, + "entropy_loss": -0.103515625, + "epoch": 0.143, + "grad_norm": 0.7436852956964674, + "k1_kl": 0.119140625, + "k3_kl": 0.0595703125, + "kimi_kl": 0.10693359375, + "learning_rate": 4.2849999999999995e-07, + "loss": 0.0024, + "ppl": 0.054931640625, + "reward": 0.7425494194030762, + "reward_std": 0.2300177961587906, + "rewards/perpo_ocr_edit_distance_reward": 0.7425495386123657, "step": 715, "temperature": 0.9 }, { - "advantages": -0.00018715433543547988, - "completion_length": 655.5, - "delta_ref_entropy_loss": 0.03369140625, - "delta_ref_ppl": -0.024688720703125, - "entropy_loss": -0.0177001953125, - "epoch": 0.2864, - "grad_norm": 0.12091055214108534, - "k1_kl": 0.024658203125, - "k3_kl": 0.0131378173828125, - "kimi_kl": 0.0335235595703125, - "learning_rate": 3.5679999999999997e-07, - "loss": 0.0007, - "ppl": 0.007415771484375, - "reward": 0.60175921022892, - "reward_std": 7.511231524404138e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.6017592400312424, + "advantages": -5.207743015489541e-05, + "completion_length": 224.0, + "delta_ref_entropy_loss": 0.07373046875, + "delta_ref_ppl": -0.1416015625, + "entropy_loss": -0.0269775390625, + "epoch": 0.1432, + "grad_norm": 0.8878293699783536, + "k1_kl": 0.1416015625, + "k3_kl": 0.099609375, + "kimi_kl": 0.3359375, + "learning_rate": 4.284e-07, + "loss": 0.004, + "ppl": 0.0086669921875, + "reward": 0.9681650996208191, + "reward_std": 0.0008810647996142507, + "rewards/perpo_ocr_edit_distance_reward": 0.9681651592254639, "step": 716, "temperature": 0.9 }, { - "advantages": -0.0002984149115548007, - "completion_length": 371.0, - "delta_ref_entropy_loss": 0.05010986328125, - "delta_ref_ppl": -0.039794921875, - "entropy_loss": -0.081695556640625, - "epoch": 0.2868, - "grad_norm": 0.6786742963410856, - "k1_kl": 0.03961181640625, - "k3_kl": 0.02301025390625, - "kimi_kl": 0.0513916015625, - "learning_rate": 3.5659999999999995e-07, - "loss": 0.0012, - "ppl": 0.04620361328125, - "reward": 0.9672737717628479, - "reward_std": 0.05919831991195679, - "rewards/perpo_ocr_edit_distance_reward": 0.9672738313674927, + "advantages": 0.0, + "completion_length": 43.0, + "delta_ref_entropy_loss": 0.1494140625, + "delta_ref_ppl": -0.38671875, + "entropy_loss": -0.1826171875, + "epoch": 0.1434, + "grad_norm": 8.702062137368253, + "k1_kl": 0.38671875, + "k3_kl": 0.29296875, + "kimi_kl": 1.0625, + "learning_rate": 4.283e-07, + "loss": 0.0117, + "ppl": 0.1376953125, + "reward": 0.9295774102210999, + "reward_std": 0.011499977670609951, + "rewards/perpo_ocr_edit_distance_reward": 0.9295774698257446, "step": 717, "temperature": 0.9 }, { - "advantages": -0.0003420795728743542, - "completion_length": 505.0, - "delta_ref_entropy_loss": 0.02008056640625, - "delta_ref_ppl": -0.0081939697265625, - "entropy_loss": -0.02587890625, - "epoch": 0.2872, - "grad_norm": 0.802468966470615, - "k1_kl": 0.0081939697265625, - "k3_kl": 0.003936767578125, - "kimi_kl": 0.0053863525390625, - "learning_rate": 3.564e-07, - "loss": 0.0005, - "ppl": 0.0125732421875, - "reward": 0.9981991946697235, - "reward_std": 0.0005299182375892997, - "rewards/perpo_ocr_edit_distance_reward": 0.9981992840766907, + "advantages": -3.970520992879756e-05, + "completion_length": 651.0, + "delta_ref_entropy_loss": 0.043212890625, + "delta_ref_ppl": -0.03955078125, + "entropy_loss": -0.0184326171875, + "epoch": 0.1436, + "grad_norm": 0.43904564347127706, + "k1_kl": 0.03955078125, + "k3_kl": 0.02001953125, + "kimi_kl": 0.044189453125, + "learning_rate": 4.282e-07, + "loss": 0.0008, + "ppl": 0.005615234375, + "reward": 0.9918166995048523, + "reward_std": 0.0005433154874481261, + "rewards/perpo_ocr_edit_distance_reward": 0.9918166995048523, "step": 718, "temperature": 0.9 }, { - "advantages": -3.4613270827321685e-06, - "completion_length": 902.0, - "delta_ref_entropy_loss": 0.03448486328125, - "delta_ref_ppl": -0.017303466796875, - "entropy_loss": -0.0537109375, - "epoch": 0.2876, - "grad_norm": 1.9199415522043224, - "k1_kl": 0.01727294921875, - "k3_kl": 0.0291748046875, - "kimi_kl": 0.02191162109375, - "learning_rate": 3.562e-07, - "loss": 0.0012, - "ppl": 0.02880859375, - "reward": 0.9534474313259125, - "reward_std": 0.024882866418920457, - "rewards/perpo_ocr_edit_distance_reward": 0.9534475207328796, + "advantages": 1.774515476427041e-05, + "completion_length": 579.0, + "delta_ref_entropy_loss": 0.0439453125, + "delta_ref_ppl": -0.05322265625, + "entropy_loss": -0.01416015625, + "epoch": 0.1438, + "grad_norm": 0.5089870202154038, + "k1_kl": 0.053466796875, + "k3_kl": 0.031982421875, + "kimi_kl": 0.07666015625, + "learning_rate": 4.281e-07, + "loss": 0.0013, + "ppl": 0.00433349609375, + "reward": 0.9910586476325989, + "reward_std": 0.00038018295890651643, + "rewards/perpo_ocr_edit_distance_reward": 0.9910587072372437, "step": 719, "temperature": 0.9 }, { - "advantages": -0.00015924658418953186, - "completion_length": 925.5, - "delta_ref_entropy_loss": 0.01763916015625, - "delta_ref_ppl": -0.01336669921875, - "entropy_loss": -0.0206298828125, - "epoch": 0.288, - "grad_norm": 0.3316879121856852, - "k1_kl": 0.013397216796875, - "k3_kl": 0.008392333984375, - "kimi_kl": 0.0206298828125, - "learning_rate": 3.5599999999999996e-07, - "loss": 0.0005, - "ppl": 0.00958251953125, - "reward": 0.9995567500591278, - "reward_std": 0.00018598866154206917, - "rewards/perpo_ocr_edit_distance_reward": 0.999556839466095, + "advantages": -4.478863502299646e-06, + "completion_length": 559.0, + "delta_ref_entropy_loss": 0.1533203125, + "delta_ref_ppl": -0.09814453125, + "entropy_loss": -0.2236328125, + "epoch": 0.144, + "grad_norm": 4.311395883715873, + "k1_kl": 0.09765625, + "k3_kl": 0.052001953125, + "kimi_kl": 0.1298828125, + "learning_rate": 4.2799999999999997e-07, + "loss": 0.0021, + "ppl": 0.11572265625, + "reward": 0.8832522630691528, + "reward_std": 0.009399499744176865, + "rewards/perpo_ocr_edit_distance_reward": 0.8832523226737976, "step": 720, "temperature": 0.9 }, { - "advantages": -1.2065684003914612e-05, - "completion_length": 755.0, - "delta_ref_entropy_loss": 0.0313568115234375, - "delta_ref_ppl": -0.025482177734375, - "entropy_loss": -0.124755859375, - "epoch": 0.2884, - "grad_norm": 0.848399557572912, - "k1_kl": 0.02532958984375, - "k3_kl": 0.01495361328125, - "kimi_kl": 0.032012939453125, - "learning_rate": 3.558e-07, - "loss": 0.0006, - "ppl": 0.05859375, - "reward": 0.8235155642032623, - "reward_std": 0.18120786093641073, - "rewards/perpo_ocr_edit_distance_reward": 0.8235155940055847, + "advantages": 3.121580448350869e-05, + "completion_length": 1110.0, + "delta_ref_entropy_loss": 0.0810546875, + "delta_ref_ppl": -0.051513671875, + "entropy_loss": -0.050537109375, + "epoch": 0.1442, + "grad_norm": 0.6959126454920382, + "k1_kl": 0.051513671875, + "k3_kl": 0.025390625, + "kimi_kl": 0.05126953125, + "learning_rate": 4.2789999999999996e-07, + "loss": 0.001, + "ppl": 0.023681640625, + "reward": 0.6021227240562439, + "reward_std": 0.001536587136797607, + "rewards/perpo_ocr_edit_distance_reward": 0.6021227240562439, "step": 721, "temperature": 0.9 }, { - "advantages": -1.164419330734745e-06, - "completion_length": 394.5, - "delta_ref_entropy_loss": 0.068603515625, - "delta_ref_ppl": -0.0416259765625, - "entropy_loss": -0.077392578125, - "epoch": 0.2888, - "grad_norm": 1.1876149678176673, - "k1_kl": 0.0418701171875, - "k3_kl": 0.023681640625, - "kimi_kl": 0.041015625, - "learning_rate": 3.5560000000000003e-07, - "loss": 0.0009, - "ppl": 0.044921875, - "reward": 0.9257476031780243, - "reward_std": 0.016820951364934444, - "rewards/perpo_ocr_edit_distance_reward": 0.9257476329803467, + "advantages": -2.3509775928687304e-05, + "completion_length": 201.0, + "delta_ref_entropy_loss": 0.08251953125, + "delta_ref_ppl": -0.11865234375, + "entropy_loss": -0.031982421875, + "epoch": 0.1444, + "grad_norm": 1.6291245447998601, + "k1_kl": 0.11865234375, + "k3_kl": 0.07861328125, + "kimi_kl": 0.205078125, + "learning_rate": 4.278e-07, + "loss": 0.0032, + "ppl": 0.0118408203125, + "reward": 0.7576243877410889, + "reward_std": 0.002072234870865941, + "rewards/perpo_ocr_edit_distance_reward": 0.7576244473457336, "step": 722, "temperature": 0.9 }, { - "advantages": -3.489426518399341e-05, - "completion_length": 661.5, - "delta_ref_entropy_loss": 0.0413818359375, - "delta_ref_ppl": -0.04620361328125, - "entropy_loss": -0.048583984375, - "epoch": 0.2892, - "grad_norm": 0.5784311908434941, - "k1_kl": 0.04620361328125, - "k3_kl": 0.028076171875, - "kimi_kl": 0.0751953125, - "learning_rate": 3.5539999999999997e-07, - "loss": 0.0012, - "ppl": 0.024261474609375, - "reward": 0.9312344193458557, - "reward_std": 0.014241465803934261, - "rewards/perpo_ocr_edit_distance_reward": 0.9312344491481781, + "advantages": -9.449465323996264e-06, + "completion_length": 415.0, + "delta_ref_entropy_loss": 0.17578125, + "delta_ref_ppl": -0.1474609375, + "entropy_loss": -0.166015625, + "epoch": 0.1446, + "grad_norm": 2.0207923817079188, + "k1_kl": 0.146484375, + "k3_kl": 0.078125, + "kimi_kl": 0.1953125, + "learning_rate": 4.277e-07, + "loss": 0.0031, + "ppl": 0.0849609375, + "reward": 0.88582843542099, + "reward_std": 0.008001105859875679, + "rewards/perpo_ocr_edit_distance_reward": 0.8858284950256348, "step": 723, "temperature": 0.9 }, { - "advantages": -0.0002564447422628291, - "completion_length": 581.5, - "delta_ref_entropy_loss": 0.0230712890625, - "delta_ref_ppl": -0.0104522705078125, - "entropy_loss": -0.02435302734375, - "epoch": 0.2896, - "grad_norm": 544.0287314041199, - "k1_kl": 0.0103759765625, - "k3_kl": 0.3375701904296875, - "kimi_kl": 0.02130126953125, - "learning_rate": 3.552e-07, - "loss": 0.0138, - "ppl": 0.016815185546875, - "reward": 0.9982964098453522, - "reward_std": 0.0005952952051302418, - "rewards/perpo_ocr_edit_distance_reward": 0.9982965290546417, + "advantages": -1.196350422105752e-05, + "completion_length": 288.0, + "delta_ref_entropy_loss": 0.06787109375, + "delta_ref_ppl": -0.1357421875, + "entropy_loss": -0.034912109375, + "epoch": 0.1448, + "grad_norm": 1.180068931749267, + "k1_kl": 0.1357421875, + "k3_kl": 0.099609375, + "kimi_kl": 0.359375, + "learning_rate": 4.2759999999999994e-07, + "loss": 0.004, + "ppl": 0.01239013671875, + "reward": 0.9818680882453918, + "reward_std": 0.0013217319501563907, + "rewards/perpo_ocr_edit_distance_reward": 0.9818681478500366, "step": 724, "temperature": 0.9 }, { - "advantages": -3.9287977415369824e-05, - "completion_length": 244.0, - "delta_ref_entropy_loss": 0.05413818359375, - "delta_ref_ppl": -0.0845947265625, - "entropy_loss": -0.04705810546875, - "epoch": 0.29, - "grad_norm": 3.2253295903079593, - "k1_kl": 0.0845947265625, - "k3_kl": 0.05712890625, - "kimi_kl": 0.15771484375, - "learning_rate": 3.55e-07, - "loss": 0.0023, - "ppl": 0.020660400390625, - "reward": 0.974096953868866, - "reward_std": 0.0032447122503072023, - "rewards/perpo_ocr_edit_distance_reward": 0.9740971028804779, + "advantages": -2.0240035155438818e-05, + "completion_length": 737.0, + "delta_ref_entropy_loss": 0.04443359375, + "delta_ref_ppl": -0.052001953125, + "entropy_loss": -0.06298828125, + "epoch": 0.145, + "grad_norm": 1.4682302995646126, + "k1_kl": 0.052001953125, + "k3_kl": 0.031494140625, + "kimi_kl": 0.0849609375, + "learning_rate": 4.275e-07, + "loss": 0.0013, + "ppl": 0.040283203125, + "reward": 0.9817811250686646, + "reward_std": 0.0020033835899084806, + "rewards/perpo_ocr_edit_distance_reward": 0.9817811846733093, "step": 725, "temperature": 0.9 }, { - "advantages": -6.0100642258476e-05, - "completion_length": 511.5, - "delta_ref_entropy_loss": 0.064727783203125, - "delta_ref_ppl": -0.065185546875, - "entropy_loss": -0.04803466796875, - "epoch": 0.2904, - "grad_norm": 0.9202700093446594, - "k1_kl": 0.065185546875, - "k3_kl": 0.039764404296875, - "kimi_kl": 0.09710693359375, - "learning_rate": 3.548e-07, - "loss": 0.0017, - "ppl": 0.021026611328125, - "reward": 0.9960732758045197, - "reward_std": 0.0026353880239184946, - "rewards/perpo_ocr_edit_distance_reward": 0.9960733354091644, + "advantages": -0.0001259446144104004, + "completion_length": 584.0, + "delta_ref_entropy_loss": 0.07421875, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.036376953125, + "epoch": 0.1452, + "grad_norm": 0.9297379951752524, + "k1_kl": 0.0703125, + "k3_kl": 0.04296875, + "kimi_kl": 0.09716796875, + "learning_rate": 4.274e-07, + "loss": 0.0018, + "ppl": 0.01806640625, + "reward": 0.9900137186050415, + "reward_std": 0.0005085638840682805, + "rewards/perpo_ocr_edit_distance_reward": 0.990013837814331, "step": 726, "temperature": 0.9 }, { - "advantages": -3.7040032339064055e-06, - "completion_length": 525.0, - "delta_ref_entropy_loss": 0.0623779296875, - "delta_ref_ppl": -0.03680419921875, - "entropy_loss": -0.0396728515625, - "epoch": 0.2908, - "grad_norm": 0.7441219914136167, - "k1_kl": 0.0367431640625, - "k3_kl": 0.018768310546875, - "kimi_kl": 0.040283203125, - "learning_rate": 3.546e-07, - "loss": 0.0008, - "ppl": 0.018951416015625, - "reward": 0.9828972220420837, - "reward_std": 0.007547239074483514, - "rewards/perpo_ocr_edit_distance_reward": 0.9828972518444061, + "advantages": 3.738062787306262e-06, + "completion_length": 538.0, + "delta_ref_entropy_loss": 0.030029296875, + "delta_ref_ppl": -0.0283203125, + "entropy_loss": -0.01141357421875, + "epoch": 0.1454, + "grad_norm": 0.8011372498163559, + "k1_kl": 0.0284423828125, + "k3_kl": 0.0164794921875, + "kimi_kl": 0.04443359375, + "learning_rate": 4.2729999999999997e-07, + "loss": 0.0007, + "ppl": 0.0047607421875, + "reward": 0.9893871545791626, + "reward_std": 0.004447352606803179, + "rewards/perpo_ocr_edit_distance_reward": 0.9893871545791626, "step": 727, "temperature": 0.9 }, { - "advantages": -5.8685033309302526e-05, - "completion_length": 880.5, - "delta_ref_entropy_loss": 0.04681396484375, - "delta_ref_ppl": -0.032379150390625, - "entropy_loss": -0.0716552734375, - "epoch": 0.2912, - "grad_norm": 0.7936695398125031, - "k1_kl": 0.0323486328125, - "k3_kl": 0.01666259765625, - "kimi_kl": 0.031097412109375, - "learning_rate": 3.544e-07, - "loss": 0.0007, - "ppl": 0.03607177734375, - "reward": 0.9358969628810883, - "reward_std": 0.003111963756964542, - "rewards/perpo_ocr_edit_distance_reward": 0.9358969628810883, + "advantages": -7.021427154541016e-05, + "completion_length": 649.0, + "delta_ref_entropy_loss": 0.0673828125, + "delta_ref_ppl": -0.053955078125, + "entropy_loss": -0.0205078125, + "epoch": 0.1456, + "grad_norm": 0.616111485664427, + "k1_kl": 0.053955078125, + "k3_kl": 0.02685546875, + "kimi_kl": 0.06591796875, + "learning_rate": 4.272e-07, + "loss": 0.0011, + "ppl": 0.005462646484375, + "reward": 0.9961234927177429, + "reward_std": 0.0005064843571744859, + "rewards/perpo_ocr_edit_distance_reward": 0.9961234927177429, "step": 728, "temperature": 0.9 }, { - "advantages": -2.717546203712118e-05, - "completion_length": 282.5, - "delta_ref_entropy_loss": 0.04864501953125, - "delta_ref_ppl": -0.06243896484375, - "entropy_loss": -0.0989990234375, - "epoch": 0.2916, - "grad_norm": 2.2886499216704554, - "k1_kl": 0.06268310546875, - "k3_kl": 0.04052734375, - "kimi_kl": 0.1015625, - "learning_rate": 3.542e-07, - "loss": 0.0017, - "ppl": 0.060943603515625, - "reward": 0.7004406601190567, - "reward_std": 0.0025488517712801695, - "rewards/perpo_ocr_edit_distance_reward": 0.7004407197237015, + "advantages": -4.460130730876699e-05, + "completion_length": 515.0, + "delta_ref_entropy_loss": 0.06884765625, + "delta_ref_ppl": -0.09130859375, + "entropy_loss": -0.052734375, + "epoch": 0.1458, + "grad_norm": 1.4516524759316605, + "k1_kl": 0.09130859375, + "k3_kl": 0.052978515625, + "kimi_kl": 0.1484375, + "learning_rate": 4.2709999999999995e-07, + "loss": 0.0022, + "ppl": 0.0296630859375, + "reward": 0.9874005913734436, + "reward_std": 0.0012366523733362556, + "rewards/perpo_ocr_edit_distance_reward": 0.9874007105827332, "step": 729, "temperature": 0.9 }, { - "advantages": 1.3533448566249717e-06, - "completion_length": 939.0, - "delta_ref_entropy_loss": 0.094482421875, - "delta_ref_ppl": -0.04656982421875, - "entropy_loss": -0.091796875, - "epoch": 0.292, - "grad_norm": 1.9963768023546404, - "k1_kl": 0.046539306640625, - "k3_kl": 0.028564453125, - "kimi_kl": 0.04901123046875, - "learning_rate": 3.5399999999999997e-07, - "loss": 0.0011, - "ppl": 0.057373046875, - "reward": 0.5915485471487045, - "reward_std": 0.052261647884733975, - "rewards/perpo_ocr_edit_distance_reward": 0.5915485769510269, + "advantages": -1.709801836113911e-05, + "completion_length": 696.0, + "delta_ref_entropy_loss": 0.052001953125, + "delta_ref_ppl": -0.04296875, + "entropy_loss": -0.0250244140625, + "epoch": 0.146, + "grad_norm": 0.7323357710782998, + "k1_kl": 0.04296875, + "k3_kl": 0.0218505859375, + "kimi_kl": 0.04833984375, + "learning_rate": 4.2699999999999995e-07, + "loss": 0.0009, + "ppl": 0.00921630859375, + "reward": 0.995851457118988, + "reward_std": 0.003881769021973014, + "rewards/perpo_ocr_edit_distance_reward": 0.9958515763282776, "step": 730, "temperature": 0.9 }, { - "advantages": 6.215913401774742e-07, - "completion_length": 684.5, - "delta_ref_entropy_loss": 0.14697265625, - "delta_ref_ppl": -0.07275390625, - "entropy_loss": -0.33447265625, - "epoch": 0.2924, - "grad_norm": 2.2863530852486194, - "k1_kl": 0.0732421875, - "k3_kl": 0.039794921875, - "kimi_kl": 0.0615234375, - "learning_rate": 3.538e-07, - "loss": 0.0016, - "ppl": 0.18896484375, - "reward": 0.7239298820495605, - "reward_std": 0.04067272786051035, - "rewards/perpo_ocr_edit_distance_reward": 0.7239298820495605, + "advantages": -4.448209801921621e-05, + "completion_length": 738.0, + "delta_ref_entropy_loss": 0.09423828125, + "delta_ref_ppl": -0.0673828125, + "entropy_loss": -0.0390625, + "epoch": 0.1462, + "grad_norm": 1.2700277433606542, + "k1_kl": 0.06689453125, + "k3_kl": 0.035888671875, + "kimi_kl": 0.08056640625, + "learning_rate": 4.269e-07, + "loss": 0.0015, + "ppl": 0.0162353515625, + "reward": 0.9485512375831604, + "reward_std": 0.0010482837678864598, + "rewards/perpo_ocr_edit_distance_reward": 0.9485512375831604, "step": 731, "temperature": 0.9 }, { - "advantages": -3.0270645083874115e-06, - "completion_length": 470.0, - "delta_ref_entropy_loss": 0.071533203125, - "delta_ref_ppl": -0.0401611328125, - "entropy_loss": -0.102294921875, - "epoch": 0.2928, - "grad_norm": 1.5850279820460704, - "k1_kl": 0.040283203125, - "k3_kl": 0.01861572265625, - "kimi_kl": 0.03497314453125, - "learning_rate": 3.536e-07, - "loss": 0.0007, - "ppl": 0.052490234375, - "reward": 0.8586294054985046, - "reward_std": 0.0088583470787853, - "rewards/perpo_ocr_edit_distance_reward": 0.8586294651031494, + "advantages": -3.065381974920456e-07, + "completion_length": 60.0, + "delta_ref_entropy_loss": 0.11669921875, + "delta_ref_ppl": -0.734375, + "entropy_loss": -0.43359375, + "epoch": 0.1464, + "grad_norm": 21.492686324054972, + "k1_kl": 0.73828125, + "k3_kl": 0.53515625, + "kimi_kl": 1.5546875, + "learning_rate": 4.268e-07, + "loss": 0.0214, + "ppl": 0.134765625, + "reward": 0.2825404107570648, + "reward_std": 0.09104913473129272, + "rewards/perpo_ocr_edit_distance_reward": 0.2825404405593872, "step": 732, "temperature": 0.9 }, { - "advantages": -5.121742105984595e-06, - "completion_length": 81.0, - "delta_ref_entropy_loss": 0.03314208984375, - "delta_ref_ppl": -0.064697265625, - "entropy_loss": -0.0308837890625, - "epoch": 0.2932, - "grad_norm": 1.2564383934373808, - "k1_kl": 0.064697265625, - "k3_kl": 0.0384521484375, - "kimi_kl": 0.088623046875, - "learning_rate": 3.534e-07, - "loss": 0.0015, - "ppl": 0.01519775390625, - "reward": 0.9985714256763458, - "reward_std": 0.0024397477973252535, - "rewards/perpo_ocr_edit_distance_reward": 0.9985714554786682, + "advantages": -2.7247838829680404e-07, + "completion_length": 116.0, + "delta_ref_entropy_loss": 0.173828125, + "delta_ref_ppl": -0.27734375, + "entropy_loss": -0.033203125, + "epoch": 0.1466, + "grad_norm": 4.2579328864516865, + "k1_kl": 0.27734375, + "k3_kl": 0.19140625, + "kimi_kl": 0.5703125, + "learning_rate": 4.2670000000000003e-07, + "loss": 0.0077, + "ppl": 0.00787353515625, + "reward": 0.8894500732421875, + "reward_std": 0.1668979525566101, + "rewards/perpo_ocr_edit_distance_reward": 0.889450192451477, "step": 733, "temperature": 0.9 }, { - "advantages": -0.00030384744923139806, - "completion_length": 396.5, - "delta_ref_entropy_loss": 0.0255126953125, - "delta_ref_ppl": -0.015380859375, - "entropy_loss": -0.020263671875, - "epoch": 0.2936, - "grad_norm": 0.3130831446182461, - "k1_kl": 0.015380859375, - "k3_kl": 0.00713348388671875, - "kimi_kl": 0.01248931884765625, - "learning_rate": 3.532e-07, - "loss": 0.0006, - "ppl": 0.0076751708984375, - "reward": 0.9955143332481384, - "reward_std": 0.00031562341609969735, - "rewards/perpo_ocr_edit_distance_reward": 0.9955143630504608, + "advantages": -1.8732889373040962e-07, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.032470703125, + "delta_ref_ppl": -0.03466796875, + "entropy_loss": -0.0135498046875, + "epoch": 0.1468, + "grad_norm": 0.4444594978355379, + "k1_kl": 0.03466796875, + "k3_kl": 0.021240234375, + "kimi_kl": 0.0615234375, + "learning_rate": 4.2659999999999997e-07, + "loss": 0.0008, + "ppl": 0.0048828125, + "reward": 0.49331244826316833, + "reward_std": 0.06955040246248245, + "rewards/perpo_ocr_edit_distance_reward": 0.4933124780654907, "step": 734, "temperature": 0.9 }, { - "advantages": -9.884153405437246e-05, - "completion_length": 1167.0, - "delta_ref_entropy_loss": 0.021820068359375, - "delta_ref_ppl": -0.013458251953125, - "entropy_loss": -0.02093505859375, - "epoch": 0.294, - "grad_norm": 0.44712735539837084, - "k1_kl": 0.013458251953125, - "k3_kl": 0.0071563720703125, - "kimi_kl": 0.0133056640625, - "learning_rate": 3.5299999999999994e-07, - "loss": 0.0004, - "ppl": 0.011505126953125, - "reward": 0.9978047013282776, - "reward_std": 0.0012090186937712133, - "rewards/perpo_ocr_edit_distance_reward": 0.9978047609329224, + "advantages": 1.2431826235115295e-06, + "completion_length": 1410.0, + "delta_ref_entropy_loss": 0.16015625, + "delta_ref_ppl": -0.0869140625, + "entropy_loss": -0.3046875, + "epoch": 0.147, + "grad_norm": 4.070594526697229, + "k1_kl": 0.0869140625, + "k3_kl": 0.04931640625, + "kimi_kl": 0.07421875, + "learning_rate": 4.2649999999999996e-07, + "loss": 0.002, + "ppl": 0.1796875, + "reward": 0.7447027564048767, + "reward_std": 0.006717768497765064, + "rewards/perpo_ocr_edit_distance_reward": 0.7447027564048767, "step": 735, "temperature": 0.9 }, { - "advantages": -0.00019875595899065956, - "completion_length": 463.0, - "delta_ref_entropy_loss": 0.0198974609375, - "delta_ref_ppl": -0.0076141357421875, - "entropy_loss": -0.01568603515625, - "epoch": 0.2944, - "grad_norm": 0.2518066038286687, - "k1_kl": 0.0076141357421875, - "k3_kl": 0.00339508056640625, - "kimi_kl": 0.0047454833984375, - "learning_rate": 3.528e-07, - "loss": 0.0003, - "ppl": 0.006805419921875, - "reward": 0.9986412227153778, - "reward_std": 0.00027362476248526946, - "rewards/perpo_ocr_edit_distance_reward": 0.998641312122345, + "advantages": -2.0452909666346386e-05, + "completion_length": 650.0, + "delta_ref_entropy_loss": 0.057373046875, + "delta_ref_ppl": -0.05859375, + "entropy_loss": -0.022705078125, + "epoch": 0.1472, + "grad_norm": 0.614625662993521, + "k1_kl": 0.05859375, + "k3_kl": 0.033203125, + "kimi_kl": 0.083984375, + "learning_rate": 4.264e-07, + "loss": 0.0013, + "ppl": 0.01055908203125, + "reward": 0.9876614809036255, + "reward_std": 0.0011500208638608456, + "rewards/perpo_ocr_edit_distance_reward": 0.9876614809036255, "step": 736, "temperature": 0.9 }, { - "advantages": -0.00010889768918787013, - "completion_length": 379.0, - "delta_ref_entropy_loss": 0.036865234375, - "delta_ref_ppl": -0.03228759765625, - "entropy_loss": -0.04205322265625, - "epoch": 0.2948, - "grad_norm": 0.7106140910974026, - "k1_kl": 0.0322265625, - "k3_kl": 0.023101806640625, - "kimi_kl": 0.08428955078125, - "learning_rate": 3.526e-07, - "loss": 0.001, - "ppl": 0.0269775390625, - "reward": 0.9934866726398468, - "reward_std": 0.000838190178910736, - "rewards/perpo_ocr_edit_distance_reward": 0.9934867024421692, + "advantages": -3.646952973213047e-05, + "completion_length": 498.0, + "delta_ref_entropy_loss": 0.0654296875, + "delta_ref_ppl": -0.060791015625, + "entropy_loss": -0.04345703125, + "epoch": 0.1474, + "grad_norm": 0.842238576694645, + "k1_kl": 0.060791015625, + "k3_kl": 0.033935546875, + "kimi_kl": 0.08740234375, + "learning_rate": 4.263e-07, + "loss": 0.0014, + "ppl": 0.0186767578125, + "reward": 0.9662661552429199, + "reward_std": 0.0010671776253730059, + "rewards/perpo_ocr_edit_distance_reward": 0.9662662148475647, "step": 737, "temperature": 0.9 }, { - "advantages": -1.6672271215156798e-05, - "completion_length": 882.0, - "delta_ref_entropy_loss": 0.026031494140625, - "delta_ref_ppl": -0.0206756591796875, - "entropy_loss": -0.025360107421875, - "epoch": 0.2952, - "grad_norm": 0.9294828632392184, - "k1_kl": 0.0207061767578125, - "k3_kl": 0.0112762451171875, - "kimi_kl": 0.0257568359375, - "learning_rate": 3.5239999999999995e-07, - "loss": 0.0005, - "ppl": 0.0174713134765625, - "reward": 0.8990601599216461, - "reward_std": 0.02666456816950813, - "rewards/perpo_ocr_edit_distance_reward": 0.8990602493286133, + "advantages": -5.960464477539063e-08, + "completion_length": 337.0, + "delta_ref_entropy_loss": 0.12890625, + "delta_ref_ppl": -0.08984375, + "entropy_loss": -0.333984375, + "epoch": 0.1476, + "grad_norm": 2.520574480220106, + "k1_kl": 0.08984375, + "k3_kl": 0.0458984375, + "kimi_kl": 0.07666015625, + "learning_rate": 4.262e-07, + "loss": 0.0018, + "ppl": 0.185546875, + "reward": 0.3711623549461365, + "reward_std": 0.10917157679796219, + "rewards/perpo_ocr_edit_distance_reward": 0.37116238474845886, "step": 738, "temperature": 0.9 }, { - "advantages": -9.70704263636435e-07, - "completion_length": 455.5, - "delta_ref_entropy_loss": 0.00238037109375, - "delta_ref_ppl": -0.05615234375, - "entropy_loss": -0.116729736328125, - "epoch": 0.2956, - "grad_norm": 1.30755397996983, - "k1_kl": 0.056182861328125, - "k3_kl": 0.0392913818359375, - "kimi_kl": 0.10223388671875, - "learning_rate": 3.522e-07, - "loss": 0.0016, - "ppl": 0.057159423828125, - "reward": 0.4286460280418396, - "reward_std": 0.024115562438964844, - "rewards/perpo_ocr_edit_distance_reward": 0.4286460876464844, + "advantages": -2.3216010959004052e-05, + "completion_length": 824.0, + "delta_ref_entropy_loss": 0.04150390625, + "delta_ref_ppl": -0.051025390625, + "entropy_loss": -0.034423828125, + "epoch": 0.1478, + "grad_norm": 0.8782872825781551, + "k1_kl": 0.051025390625, + "k3_kl": 0.034912109375, + "kimi_kl": 0.10205078125, + "learning_rate": 4.261e-07, + "loss": 0.0014, + "ppl": 0.0185546875, + "reward": 0.9185863137245178, + "reward_std": 0.0010006575612351298, + "rewards/perpo_ocr_edit_distance_reward": 0.9185863733291626, "step": 739, "temperature": 0.9 }, { - "advantages": -0.0002853018895621062, - "completion_length": 669.5, - "delta_ref_entropy_loss": 0.02471923828125, - "delta_ref_ppl": -0.026092529296875, - "entropy_loss": -0.015228271484375, - "epoch": 0.296, - "grad_norm": 0.20211082868184227, - "k1_kl": 0.026153564453125, - "k3_kl": 0.01763916015625, - "kimi_kl": 0.06158447265625, - "learning_rate": 3.52e-07, - "loss": 0.001, - "ppl": 0.0075836181640625, - "reward": 0.998042106628418, - "reward_std": 0.00011749690747819841, - "rewards/perpo_ocr_edit_distance_reward": 0.9980421364307404, + "advantages": -1.6859600009411224e-06, + "completion_length": 277.0, + "delta_ref_entropy_loss": 0.0859375, + "delta_ref_ppl": -0.138671875, + "entropy_loss": -0.13671875, + "epoch": 0.148, + "grad_norm": 2.5122645084083532, + "k1_kl": 0.138671875, + "k3_kl": 0.09130859375, + "kimi_kl": 0.267578125, + "learning_rate": 4.26e-07, + "loss": 0.0037, + "ppl": 0.0673828125, + "reward": 0.42662176489830017, + "reward_std": 0.017493782564997673, + "rewards/perpo_ocr_edit_distance_reward": 0.42662179470062256, "step": 740, "temperature": 0.9 }, { - "advantages": -8.784234751146869e-05, - "completion_length": 638.5, - "delta_ref_entropy_loss": 0.06329345703125, - "delta_ref_ppl": -0.03717041015625, - "entropy_loss": -0.1175537109375, - "epoch": 0.2964, - "grad_norm": 1.3603656208828796, - "k1_kl": 0.03704833984375, - "k3_kl": 0.0179443359375, - "kimi_kl": 0.03515625, - "learning_rate": 3.5179999999999996e-07, - "loss": 0.0008, - "ppl": 0.06005859375, - "reward": 0.8911766707897186, - "reward_std": 0.007472915414837189, - "rewards/perpo_ocr_edit_distance_reward": 0.8911767303943634, + "advantages": -2.111707544827368e-06, + "completion_length": 741.0, + "delta_ref_entropy_loss": 0.0498046875, + "delta_ref_ppl": -0.048583984375, + "entropy_loss": -0.03857421875, + "epoch": 0.1482, + "grad_norm": 0.6202284293371079, + "k1_kl": 0.048583984375, + "k3_kl": 0.030029296875, + "kimi_kl": 0.08935546875, + "learning_rate": 4.2589999999999997e-07, + "loss": 0.0012, + "ppl": 0.015869140625, + "reward": 0.7850807309150696, + "reward_std": 0.00788617879152298, + "rewards/perpo_ocr_edit_distance_reward": 0.7850806713104248, "step": 741, "temperature": 0.9 }, { - "advantages": -0.00029872144972387105, - "completion_length": 396.0, - "delta_ref_entropy_loss": 0.083984375, - "delta_ref_ppl": -0.0637054443359375, - "entropy_loss": -0.101654052734375, - "epoch": 0.2968, - "grad_norm": 1.6301593827139178, - "k1_kl": 0.063720703125, - "k3_kl": 0.03759765625, - "kimi_kl": 0.1083831787109375, - "learning_rate": 3.516e-07, - "loss": 0.0018, - "ppl": 0.052539825439453125, - "reward": 0.7015187442302704, - "reward_std": 0.011834761127829552, - "rewards/perpo_ocr_edit_distance_reward": 0.7015188485383987, + "advantages": -1.7029898913278885e-07, + "completion_length": 761.0, + "delta_ref_entropy_loss": 0.054443359375, + "delta_ref_ppl": -0.05810546875, + "entropy_loss": -0.0260009765625, + "epoch": 0.1484, + "grad_norm": 0.7216645587817555, + "k1_kl": 0.057861328125, + "k3_kl": 0.038818359375, + "kimi_kl": 0.10791015625, + "learning_rate": 4.258e-07, + "loss": 0.0016, + "ppl": 0.01068115234375, + "reward": 0.933542013168335, + "reward_std": 0.1103716641664505, + "rewards/perpo_ocr_edit_distance_reward": 0.9335420727729797, "step": 742, "temperature": 0.9 }, { - "advantages": -2.6490007712709485e-05, - "completion_length": 802.5, - "delta_ref_entropy_loss": 0.0216064453125, - "delta_ref_ppl": -0.013946533203125, - "entropy_loss": -0.05291748046875, - "epoch": 0.2972, - "grad_norm": 0.5811686807664984, - "k1_kl": 0.01397705078125, - "k3_kl": 0.0070953369140625, - "kimi_kl": 0.014251708984375, - "learning_rate": 3.514e-07, - "loss": 0.0003, - "ppl": 0.030181884765625, - "reward": 0.8989646136760712, - "reward_std": 0.018331704544834793, - "rewards/perpo_ocr_edit_distance_reward": 0.8989647328853607, + "advantages": -0.0005960464477539062, + "completion_length": 259.0, + "delta_ref_entropy_loss": 0.05078125, + "delta_ref_ppl": -0.11376953125, + "entropy_loss": -0.01458740234375, + "epoch": 0.1486, + "grad_norm": 0.04174962532475475, + "k1_kl": 0.11376953125, + "k3_kl": 0.0830078125, + "kimi_kl": 0.287109375, + "learning_rate": 4.257e-07, + "loss": 0.0039, + "ppl": 0.0034637451171875, + "reward": 0.993060827255249, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9930609464645386, "step": 743, "temperature": 0.9 }, { - "advantages": -4.705361165946442e-05, - "completion_length": 338.5, - "delta_ref_entropy_loss": 0.0474853515625, - "delta_ref_ppl": -0.10723876953125, - "entropy_loss": -0.4014892578125, - "epoch": 0.2976, - "grad_norm": 3.7410153600708047, - "k1_kl": 0.1072998046875, - "k3_kl": 0.0850830078125, - "kimi_kl": 0.40899658203125, - "learning_rate": 3.512e-07, - "loss": 0.0035, - "ppl": 0.193359375, - "reward": 0.6919625997543335, - "reward_std": 0.0346042135861353, - "rewards/perpo_ocr_edit_distance_reward": 0.6919625997543335, + "advantages": -0.0001430213451385498, + "completion_length": 387.0, + "delta_ref_entropy_loss": 0.06494140625, + "delta_ref_ppl": -0.087890625, + "entropy_loss": -0.0247802734375, + "epoch": 0.1488, + "grad_norm": 0.6985055038270446, + "k1_kl": 0.087890625, + "k3_kl": 0.056884765625, + "kimi_kl": 0.1552734375, + "learning_rate": 4.2559999999999995e-07, + "loss": 0.0024, + "ppl": 0.01171875, + "reward": 0.9903575778007507, + "reward_std": 0.0004953921888954937, + "rewards/perpo_ocr_edit_distance_reward": 0.9903576970100403, "step": 744, "temperature": 0.9 }, { - "advantages": -6.082228537707124e-05, - "completion_length": 587.0, - "delta_ref_entropy_loss": 0.02197265625, - "delta_ref_ppl": -0.0147705078125, - "entropy_loss": -0.020843505859375, - "epoch": 0.298, - "grad_norm": 0.41604033245697, - "k1_kl": 0.01470947265625, - "k3_kl": 0.008453369140625, - "kimi_kl": 0.021240234375, - "learning_rate": 3.5099999999999995e-07, - "loss": 0.0004, - "ppl": 0.01007080078125, - "reward": 0.9974896907806396, - "reward_std": 0.0007197126687970012, - "rewards/perpo_ocr_edit_distance_reward": 0.9974897503852844, + "advantages": -0.0001444476074539125, + "completion_length": 771.0, + "delta_ref_entropy_loss": 0.047607421875, + "delta_ref_ppl": -0.04443359375, + "entropy_loss": -0.0225830078125, + "epoch": 0.149, + "grad_norm": 1.4942210338999167, + "k1_kl": 0.04443359375, + "k3_kl": 0.0220947265625, + "kimi_kl": 0.049072265625, + "learning_rate": 4.255e-07, + "loss": 0.001, + "ppl": 0.01202392578125, + "reward": 0.9943008422851562, + "reward_std": 0.0006074001430533826, + "rewards/perpo_ocr_edit_distance_reward": 0.994300901889801, "step": 745, "temperature": 0.9 }, { - "advantages": -1.1499439551698742e-05, - "completion_length": 542.5, - "delta_ref_entropy_loss": 0.06182861328125, - "delta_ref_ppl": -0.0532684326171875, - "entropy_loss": -0.0478515625, - "epoch": 0.2984, - "grad_norm": 1.8552059763771998, - "k1_kl": 0.05328369140625, - "k3_kl": 0.0241851806640625, - "kimi_kl": 0.0408172607421875, - "learning_rate": 3.508e-07, - "loss": 0.001, - "ppl": 0.027679443359375, - "reward": 0.9831958711147308, - "reward_std": 0.0030113724060356617, - "rewards/perpo_ocr_edit_distance_reward": 0.983195960521698, + "advantages": -5.10896995820076e-08, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.037841796875, + "delta_ref_ppl": -0.05322265625, + "entropy_loss": -0.142578125, + "epoch": 0.1492, + "grad_norm": 1.4264645604074675, + "k1_kl": 0.053466796875, + "k3_kl": 0.04296875, + "kimi_kl": 0.10498046875, + "learning_rate": 4.254e-07, + "loss": 0.0017, + "ppl": 0.06884765625, + "reward": 0.40809497237205505, + "reward_std": 0.09575004130601883, + "rewards/perpo_ocr_edit_distance_reward": 0.40809497237205505, "step": 746, "temperature": 0.9 }, { - "advantages": -2.287115447074939e-05, - "completion_length": 1116.0, - "delta_ref_entropy_loss": 0.03619384765625, - "delta_ref_ppl": -0.017242431640625, - "entropy_loss": -0.1495361328125, - "epoch": 0.2988, - "grad_norm": 1.4942273915298403, - "k1_kl": 0.0172119140625, - "k3_kl": 0.0112152099609375, - "kimi_kl": 0.01593017578125, - "learning_rate": 3.5060000000000003e-07, - "loss": 0.0005, - "ppl": 0.0824432373046875, - "reward": 0.7920106053352356, - "reward_std": 0.02819405755144544, - "rewards/perpo_ocr_edit_distance_reward": 0.7920106649398804, + "advantages": -5.330358544597402e-05, + "completion_length": 1343.0, + "delta_ref_entropy_loss": 0.019287109375, + "delta_ref_ppl": -0.036376953125, + "entropy_loss": -0.0250244140625, + "epoch": 0.1494, + "grad_norm": 0.6625632072593747, + "k1_kl": 0.036376953125, + "k3_kl": 0.0262451171875, + "kimi_kl": 0.11083984375, + "learning_rate": 4.253e-07, + "loss": 0.0011, + "ppl": 0.01214599609375, + "reward": 0.9937667846679688, + "reward_std": 0.0008582985028624535, + "rewards/perpo_ocr_edit_distance_reward": 0.9937669038772583, "step": 747, "temperature": 0.9 }, { - "advantages": -2.9661827966265264e-05, - "completion_length": 771.5, - "delta_ref_entropy_loss": 0.03741455078125, - "delta_ref_ppl": -0.0301513671875, - "entropy_loss": -0.0535888671875, - "epoch": 0.2992, - "grad_norm": 9954.367834302582, - "k1_kl": 0.0301513671875, - "k3_kl": 10.26544189453125, - "kimi_kl": 0.0677490234375, - "learning_rate": 3.5039999999999996e-07, - "loss": 0.4109, - "ppl": 0.032135009765625, - "reward": 0.994050145149231, - "reward_std": 0.0008934635552577674, - "rewards/perpo_ocr_edit_distance_reward": 0.9940501749515533, + "advantages": 1.691920442681294e-05, + "completion_length": 533.0, + "delta_ref_entropy_loss": 0.0299072265625, + "delta_ref_ppl": -0.04931640625, + "entropy_loss": -0.0196533203125, + "epoch": 0.1496, + "grad_norm": 1.5841987456677895, + "k1_kl": 0.04931640625, + "k3_kl": 0.033935546875, + "kimi_kl": 0.09716796875, + "learning_rate": 4.252e-07, + "loss": 0.0013, + "ppl": 0.010009765625, + "reward": 0.9821851849555969, + "reward_std": 0.0009070779196918011, + "rewards/perpo_ocr_edit_distance_reward": 0.9821852445602417, "step": 748, "temperature": 0.9 }, { - "advantages": -2.4497511049048626e-05, - "completion_length": 567.0, - "delta_ref_entropy_loss": 0.035400390625, - "delta_ref_ppl": -0.0445556640625, - "entropy_loss": -0.03228759765625, - "epoch": 0.2996, - "grad_norm": 0.7719698375037548, - "k1_kl": 0.0445556640625, - "k3_kl": 0.028533935546875, - "kimi_kl": 0.08056640625, - "learning_rate": 3.502e-07, - "loss": 0.0012, - "ppl": 0.016815185546875, - "reward": 0.9951601326465607, - "reward_std": 0.0009748683369252831, - "rewards/perpo_ocr_edit_distance_reward": 0.9951602220535278, + "advantages": -4.419258857524255e-06, + "completion_length": 650.0, + "delta_ref_entropy_loss": 0.1416015625, + "delta_ref_ppl": -0.09423828125, + "entropy_loss": -0.171875, + "epoch": 0.1498, + "grad_norm": 1.88099024029384, + "k1_kl": 0.0947265625, + "k3_kl": 0.05859375, + "kimi_kl": 0.115234375, + "learning_rate": 4.2509999999999996e-07, + "loss": 0.0023, + "ppl": 0.09130859375, + "reward": 0.7853307723999023, + "reward_std": 0.0037532609421759844, + "rewards/perpo_ocr_edit_distance_reward": 0.7853308320045471, "step": 749, "temperature": 0.9 }, { - "advantages": -6.952966941753402e-05, - "completion_length": 513.5, - "delta_ref_entropy_loss": 0.019989013671875, - "delta_ref_ppl": -0.017730712890625, - "entropy_loss": -0.01953125, - "epoch": 0.3, - "grad_norm": 0.4683407674471898, - "k1_kl": 0.0177764892578125, - "k3_kl": 0.01019287109375, - "kimi_kl": 0.020294189453125, - "learning_rate": 3.5e-07, - "loss": 0.0005, - "ppl": 0.008544921875, - "reward": 0.9349695146083832, - "reward_std": 0.00013370647502597421, - "rewards/perpo_ocr_edit_distance_reward": 0.934969574213028, + "advantages": -0.0001539758377475664, + "completion_length": 747.0, + "delta_ref_entropy_loss": 0.0556640625, + "delta_ref_ppl": -0.056396484375, + "entropy_loss": -0.0169677734375, + "epoch": 0.15, + "grad_norm": 0.5004934276043272, + "k1_kl": 0.056396484375, + "k3_kl": 0.03173828125, + "kimi_kl": 0.08056640625, + "learning_rate": 4.2499999999999995e-07, + "loss": 0.0014, + "ppl": 0.00762939453125, + "reward": 0.9833580851554871, + "reward_std": 0.00023176598188001662, + "rewards/perpo_ocr_edit_distance_reward": 0.9833582043647766, "step": 750, "temperature": 0.9 }, { - "advantages": -3.798093257501023e-05, - "completion_length": 821.5, - "delta_ref_entropy_loss": 0.0179443359375, - "delta_ref_ppl": -0.02008056640625, - "entropy_loss": -0.01947021484375, - "epoch": 0.3004, - "grad_norm": 0.633055951440414, - "k1_kl": 0.02008056640625, - "k3_kl": 0.0127716064453125, - "kimi_kl": 0.04473876953125, - "learning_rate": 3.4979999999999997e-07, - "loss": 0.0005, - "ppl": 0.0082855224609375, - "reward": 0.9975495636463165, - "reward_std": 0.0008094083168543875, - "rewards/perpo_ocr_edit_distance_reward": 0.9975495934486389, + "advantages": -7.782664397382177e-06, + "completion_length": 836.0, + "delta_ref_entropy_loss": 0.0703125, + "delta_ref_ppl": -0.0732421875, + "entropy_loss": -0.06787109375, + "epoch": 0.1502, + "grad_norm": 1.0994486012982618, + "k1_kl": 0.0732421875, + "k3_kl": 0.047607421875, + "kimi_kl": 0.1337890625, + "learning_rate": 4.249e-07, + "loss": 0.0019, + "ppl": 0.0274658203125, + "reward": 0.8265166878700256, + "reward_std": 0.00646384758874774, + "rewards/perpo_ocr_edit_distance_reward": 0.8265167474746704, "step": 751, "temperature": 0.9 }, { - "advantages": -1.6008104921638733e-06, - "completion_length": 443.5, - "delta_ref_entropy_loss": 0.04638671875, - "delta_ref_ppl": -0.035888671875, - "entropy_loss": -0.042724609375, - "epoch": 0.3008, - "grad_norm": 0.7115372001331942, - "k1_kl": 0.0357666015625, - "k3_kl": 0.02001953125, - "kimi_kl": 0.0550537109375, - "learning_rate": 3.496e-07, - "loss": 0.0008, - "ppl": 0.021453857421875, - "reward": 0.9898380935192108, - "reward_std": 0.015853675082325935, - "rewards/perpo_ocr_edit_distance_reward": 0.9898381531238556, + "advantages": -3.065381974920456e-07, + "completion_length": 652.0, + "delta_ref_entropy_loss": 0.1767578125, + "delta_ref_ppl": -0.099609375, + "entropy_loss": -0.1845703125, + "epoch": 0.1504, + "grad_norm": 1.164299535181385, + "k1_kl": 0.10009765625, + "k3_kl": 0.052734375, + "kimi_kl": 0.09912109375, + "learning_rate": 4.248e-07, + "loss": 0.0021, + "ppl": 0.10107421875, + "reward": 0.5785494446754456, + "reward_std": 0.15452207624912262, + "rewards/perpo_ocr_edit_distance_reward": 0.5785495042800903, "step": 752, "temperature": 0.9 }, { - "advantages": -1.5271561778718024e-05, - "completion_length": 640.0, - "delta_ref_entropy_loss": 0.10882568359375, - "delta_ref_ppl": -0.0419769287109375, - "entropy_loss": -0.1815185546875, - "epoch": 0.3012, - "grad_norm": 2.1520032354985847, - "k1_kl": 0.0417327880859375, - "k3_kl": 0.017181396484375, - "kimi_kl": 0.021209716796875, - "learning_rate": 3.494e-07, - "loss": 0.0007, - "ppl": 0.10589599609375, - "reward": 0.9174700081348419, - "reward_std": 0.0036581841995939612, - "rewards/perpo_ocr_edit_distance_reward": 0.9174700677394867, + "advantages": 8.65118909132434e-06, + "completion_length": 505.0, + "delta_ref_entropy_loss": 0.11865234375, + "delta_ref_ppl": -0.08837890625, + "entropy_loss": -0.04833984375, + "epoch": 0.1506, + "grad_norm": 0.8248612654732499, + "k1_kl": 0.08837890625, + "k3_kl": 0.046875, + "kimi_kl": 0.130859375, + "learning_rate": 4.247e-07, + "loss": 0.0019, + "ppl": 0.0198974609375, + "reward": 0.9482401609420776, + "reward_std": 0.0008834992768242955, + "rewards/perpo_ocr_edit_distance_reward": 0.9482401609420776, "step": 753, "temperature": 0.9 }, { - "advantages": -6.220170871529263e-05, - "completion_length": 649.5, - "delta_ref_entropy_loss": 0.0518798828125, - "delta_ref_ppl": -0.0462646484375, - "entropy_loss": -0.0411376953125, - "epoch": 0.3016, - "grad_norm": 0.8508298701070518, - "k1_kl": 0.0462646484375, - "k3_kl": 0.0303955078125, - "kimi_kl": 0.0711669921875, - "learning_rate": 3.492e-07, - "loss": 0.0013, - "ppl": 0.02496337890625, - "reward": 0.9700488448143005, - "reward_std": 0.001234190131071955, - "rewards/perpo_ocr_edit_distance_reward": 0.9700489342212677, + "advantages": -3.0211042030714452e-05, + "completion_length": 372.0, + "delta_ref_entropy_loss": 0.08251953125, + "delta_ref_ppl": -0.091796875, + "entropy_loss": -0.033203125, + "epoch": 0.1508, + "grad_norm": 1.013590869930755, + "k1_kl": 0.091796875, + "k3_kl": 0.055419921875, + "kimi_kl": 0.1923828125, + "learning_rate": 4.246e-07, + "loss": 0.0023, + "ppl": 0.01336669921875, + "reward": 0.9790230393409729, + "reward_std": 0.0018712825840339065, + "rewards/perpo_ocr_edit_distance_reward": 0.9790231585502625, "step": 754, "temperature": 0.9 }, { - "advantages": -6.004742408549646e-05, - "completion_length": 585.0, - "delta_ref_entropy_loss": 0.044921875, - "delta_ref_ppl": -0.02093505859375, - "entropy_loss": -0.039306640625, - "epoch": 0.302, - "grad_norm": 0.441217046633801, - "k1_kl": 0.021026611328125, - "k3_kl": 0.0100250244140625, - "kimi_kl": 0.02008056640625, - "learning_rate": 3.4899999999999996e-07, - "loss": 0.0005, - "ppl": 0.0180511474609375, - "reward": 0.9570534527301788, - "reward_std": 0.0015940855082590133, - "rewards/perpo_ocr_edit_distance_reward": 0.9570535123348236, + "advantages": -1.736411104502622e-05, + "completion_length": 959.0, + "delta_ref_entropy_loss": 0.03759765625, + "delta_ref_ppl": -0.04443359375, + "entropy_loss": -0.103515625, + "epoch": 0.151, + "grad_norm": 5.843876549967348, + "k1_kl": 0.04443359375, + "k3_kl": 0.03369140625, + "kimi_kl": 0.10009765625, + "learning_rate": 4.2449999999999997e-07, + "loss": 0.0014, + "ppl": 0.0517578125, + "reward": 0.980358898639679, + "reward_std": 0.0023536481894552708, + "rewards/perpo_ocr_edit_distance_reward": 0.9803589582443237, "step": 755, "temperature": 0.9 }, { - "advantages": -9.213175417244202e-06, - "completion_length": 706.5, - "delta_ref_entropy_loss": 0.0635986328125, - "delta_ref_ppl": -0.045166015625, - "entropy_loss": -0.05609130859375, - "epoch": 0.3024, - "grad_norm": 0.758790075906776, - "k1_kl": 0.0450439453125, - "k3_kl": 0.02398681640625, - "kimi_kl": 0.0550537109375, - "learning_rate": 3.488e-07, - "loss": 0.001, - "ppl": 0.0281982421875, - "reward": 0.9582001566886902, - "reward_std": 0.003409476194065064, - "rewards/perpo_ocr_edit_distance_reward": 0.9582002460956573, + "advantages": -1.8947892385767773e-05, + "completion_length": 544.0, + "delta_ref_entropy_loss": 0.162109375, + "delta_ref_ppl": -0.1103515625, + "entropy_loss": -0.09765625, + "epoch": 0.1512, + "grad_norm": 2.063747590786327, + "k1_kl": 0.10986328125, + "k3_kl": 0.0634765625, + "kimi_kl": 0.1318359375, + "learning_rate": 4.2439999999999996e-07, + "loss": 0.0026, + "ppl": 0.053955078125, + "reward": 0.9560689926147461, + "reward_std": 0.001697793137282133, + "rewards/perpo_ocr_edit_distance_reward": 0.9560691118240356, "step": 756, "temperature": 0.9 }, { - "advantages": -1.9073487464993377e-06, - "completion_length": 750.5, - "delta_ref_entropy_loss": 0.07012939453125, - "delta_ref_ppl": -0.04638671875, - "entropy_loss": -0.1431884765625, - "epoch": 0.3028, - "grad_norm": 1.5542099543501042, - "k1_kl": 0.04638671875, - "k3_kl": 0.024688720703125, - "kimi_kl": 0.0546875, - "learning_rate": 3.486e-07, - "loss": 0.001, - "ppl": 0.077880859375, - "reward": 0.8010080754756927, - "reward_std": 0.06618406483903527, - "rewards/perpo_ocr_edit_distance_reward": 0.8010081350803375, + "advantages": -5.260535908746533e-05, + "completion_length": 557.0, + "delta_ref_entropy_loss": 0.07421875, + "delta_ref_ppl": -0.056396484375, + "entropy_loss": -0.0225830078125, + "epoch": 0.1514, + "grad_norm": 0.5964538298617861, + "k1_kl": 0.056396484375, + "k3_kl": 0.02978515625, + "kimi_kl": 0.07958984375, + "learning_rate": 4.243e-07, + "loss": 0.0012, + "ppl": 0.00946044921875, + "reward": 0.9937941431999207, + "reward_std": 0.0005472760531120002, + "rewards/perpo_ocr_edit_distance_reward": 0.9937942028045654, "step": 757, "temperature": 0.9 }, { - "advantages": -4.606374841387151e-05, - "completion_length": 659.0, - "delta_ref_entropy_loss": 0.04437255859375, - "delta_ref_ppl": -0.04461669921875, - "entropy_loss": -0.0557861328125, - "epoch": 0.3032, - "grad_norm": 0.8966484159302796, - "k1_kl": 0.04486083984375, - "k3_kl": 0.02850341796875, - "kimi_kl": 0.1067352294921875, - "learning_rate": 3.4839999999999997e-07, - "loss": 0.0012, - "ppl": 0.029052734375, - "reward": 0.9538333415985107, - "reward_std": 0.004320889216614887, - "rewards/perpo_ocr_edit_distance_reward": 0.9538334906101227, + "advantages": -9.952273103408515e-05, + "completion_length": 720.0, + "delta_ref_entropy_loss": 0.07470703125, + "delta_ref_ppl": -0.061767578125, + "entropy_loss": -0.022216796875, + "epoch": 0.1516, + "grad_norm": 0.5217353669107153, + "k1_kl": 0.061767578125, + "k3_kl": 0.031494140625, + "kimi_kl": 0.0751953125, + "learning_rate": 4.242e-07, + "loss": 0.0014, + "ppl": 0.01055908203125, + "reward": 0.9962120652198792, + "reward_std": 0.00041330078965984285, + "rewards/perpo_ocr_edit_distance_reward": 0.9962121844291687, "step": 758, "temperature": 0.9 }, { - "advantages": -4.2787622078321874e-05, - "completion_length": 763.5, - "delta_ref_entropy_loss": 0.0439453125, - "delta_ref_ppl": -0.01782989501953125, - "entropy_loss": -0.05792236328125, - "epoch": 0.3036, - "grad_norm": 0.7121477301782159, - "k1_kl": 0.0178375244140625, - "k3_kl": 0.008481025695800781, - "kimi_kl": 0.012434005737304688, - "learning_rate": 3.482e-07, - "loss": 0.0004, - "ppl": 0.028900146484375, - "reward": 0.9465698003768921, - "reward_std": 0.006706796881189803, - "rewards/perpo_ocr_edit_distance_reward": 0.9465698599815369, + "advantages": -0.0005960464477539062, + "completion_length": 122.0, + "delta_ref_entropy_loss": 0.107421875, + "delta_ref_ppl": -0.240234375, + "entropy_loss": -0.0242919921875, + "epoch": 0.1518, + "grad_norm": 0.022168321842520475, + "k1_kl": 0.240234375, + "k3_kl": 0.1689453125, + "kimi_kl": 0.57421875, + "learning_rate": 4.2409999999999994e-07, + "loss": 0.0074, + "ppl": 0.00360107421875, + "reward": 0.9841628074645996, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9841629266738892, "step": 759, "temperature": 0.9 }, { - "advantages": -4.283019478634742e-06, - "completion_length": 228.0, - "delta_ref_entropy_loss": 0.109130859375, - "delta_ref_ppl": -0.136962890625, - "entropy_loss": -0.28125, - "epoch": 0.304, - "grad_norm": 4.553437716344921, - "k1_kl": 0.136962890625, - "k3_kl": 0.09228515625, - "kimi_kl": 0.275634765625, - "learning_rate": 3.4799999999999994e-07, - "loss": 0.0037, - "ppl": 0.15673828125, - "reward": 0.7456239759922028, - "reward_std": 0.048118193401023746, - "rewards/perpo_ocr_edit_distance_reward": 0.7456240057945251, + "advantages": -2.3509775928687304e-05, + "completion_length": 174.0, + "delta_ref_entropy_loss": 0.0888671875, + "delta_ref_ppl": -0.1943359375, + "entropy_loss": -0.08056640625, + "epoch": 0.152, + "grad_norm": 2.824496626451655, + "k1_kl": 0.1943359375, + "k3_kl": 0.1337890625, + "kimi_kl": 0.34765625, + "learning_rate": 4.24e-07, + "loss": 0.0054, + "ppl": 0.0341796875, + "reward": 0.9441244602203369, + "reward_std": 0.0017102425917983055, + "rewards/perpo_ocr_edit_distance_reward": 0.9441244602203369, "step": 760, "temperature": 0.9 }, { - "advantages": -4.020759297418408e-05, - "completion_length": 628.5, - "delta_ref_entropy_loss": 0.03759765625, - "delta_ref_ppl": -0.0191650390625, - "entropy_loss": -0.02642822265625, - "epoch": 0.3044, - "grad_norm": 1.3507433717429451, - "k1_kl": 0.0191650390625, - "k3_kl": 0.009002685546875, - "kimi_kl": 0.016693115234375, - "learning_rate": 3.478e-07, - "loss": 0.0004, - "ppl": 0.013702392578125, - "reward": 0.9934330582618713, - "reward_std": 0.0013884006475564092, - "rewards/perpo_ocr_edit_distance_reward": 0.9934331178665161, + "advantages": -7.663455107831396e-06, + "completion_length": 530.0, + "delta_ref_entropy_loss": 0.052734375, + "delta_ref_ppl": -0.0986328125, + "entropy_loss": -0.05224609375, + "epoch": 0.1522, + "grad_norm": 1.4850967002228241, + "k1_kl": 0.0986328125, + "k3_kl": 0.06689453125, + "kimi_kl": 0.1865234375, + "learning_rate": 4.239e-07, + "loss": 0.0027, + "ppl": 0.0264892578125, + "reward": 0.9788986444473267, + "reward_std": 0.004331559408456087, + "rewards/perpo_ocr_edit_distance_reward": 0.9788987040519714, "step": 761, "temperature": 0.9 }, { - "advantages": -5.4525480678080385e-05, - "completion_length": 597.5, - "delta_ref_entropy_loss": 0.02813720703125, - "delta_ref_ppl": -0.0179443359375, - "entropy_loss": -0.03533935546875, - "epoch": 0.3048, - "grad_norm": 0.921040318681228, - "k1_kl": 0.017913818359375, - "k3_kl": 0.0090179443359375, - "kimi_kl": 0.016082763671875, - "learning_rate": 3.476e-07, - "loss": 0.0004, - "ppl": 0.0165252685546875, - "reward": 0.955992579460144, - "reward_std": 0.097225675242953, - "rewards/perpo_ocr_edit_distance_reward": 0.9559926092624664, + "advantages": -3.2356808787881164e-06, + "completion_length": 546.0, + "delta_ref_entropy_loss": 0.080078125, + "delta_ref_ppl": -0.0634765625, + "entropy_loss": -0.02734375, + "epoch": 0.1524, + "grad_norm": 0.807038711787461, + "k1_kl": 0.06396484375, + "k3_kl": 0.033203125, + "kimi_kl": 0.0791015625, + "learning_rate": 4.2379999999999997e-07, + "loss": 0.0013, + "ppl": 0.00946044921875, + "reward": 0.9881249666213989, + "reward_std": 0.005141222849488258, + "rewards/perpo_ocr_edit_distance_reward": 0.9881250262260437, "step": 762, "temperature": 0.9 }, { - "advantages": -1.196350422105752e-05, - "completion_length": 314.0, - "delta_ref_entropy_loss": 0.065673828125, - "delta_ref_ppl": -0.0538330078125, - "entropy_loss": -0.06182861328125, - "epoch": 0.3052, - "grad_norm": 1.113173465873657, - "k1_kl": 0.0538330078125, - "k3_kl": 0.031494140625, - "kimi_kl": 0.07196044921875, - "learning_rate": 3.4739999999999995e-07, - "loss": 0.0013, - "ppl": 0.034912109375, - "reward": 0.9853874444961548, - "reward_std": 0.001973915088456124, - "rewards/perpo_ocr_edit_distance_reward": 0.9853874444961548, + "advantages": 0.0, + "completion_length": 35.0, + "delta_ref_entropy_loss": 0.17578125, + "delta_ref_ppl": -0.6796875, + "entropy_loss": -0.1220703125, + "epoch": 0.1526, + "grad_norm": 0.27678912849870685, + "k1_kl": 0.6796875, + "k3_kl": 0.53515625, + "kimi_kl": 1.84375, + "learning_rate": 4.237e-07, + "loss": 0.0213, + "ppl": 0.040283203125, + "reward": 0.6084336638450623, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.608433723449707, "step": 763, "temperature": 0.9 }, { - "advantages": -2.418245685475995e-06, - "completion_length": 579.0, - "delta_ref_entropy_loss": 0.0567626953125, - "delta_ref_ppl": -0.025970458984375, - "entropy_loss": -0.063720703125, - "epoch": 0.3056, - "grad_norm": 0.7399100393086214, - "k1_kl": 0.025970458984375, - "k3_kl": 0.0124359130859375, - "kimi_kl": 0.018951416015625, - "learning_rate": 3.472e-07, - "loss": 0.0005, - "ppl": 0.033447265625, - "reward": 0.9653053879737854, - "reward_std": 0.003469737246632576, - "rewards/perpo_ocr_edit_distance_reward": 0.9653054475784302, + "advantages": 1.9993101886939257e-05, + "completion_length": 785.0, + "delta_ref_entropy_loss": 0.0810546875, + "delta_ref_ppl": -0.057861328125, + "entropy_loss": -0.052978515625, + "epoch": 0.1528, + "grad_norm": 0.7993667489175111, + "k1_kl": 0.0576171875, + "k3_kl": 0.0306396484375, + "kimi_kl": 0.060546875, + "learning_rate": 4.2359999999999995e-07, + "loss": 0.0012, + "ppl": 0.0230712890625, + "reward": 0.9864711761474609, + "reward_std": 0.0007515996694564819, + "rewards/perpo_ocr_edit_distance_reward": 0.9864712357521057, "step": 764, "temperature": 0.9 }, { - "advantages": -3.659299636638025e-05, - "completion_length": 341.0, - "delta_ref_entropy_loss": 0.0482177734375, - "delta_ref_ppl": -0.03839111328125, - "entropy_loss": -0.04632568359375, - "epoch": 0.306, - "grad_norm": 0.9200511180364768, - "k1_kl": 0.0384521484375, - "k3_kl": 0.0216064453125, - "kimi_kl": 0.043701171875, - "learning_rate": 3.4699999999999997e-07, - "loss": 0.0009, - "ppl": 0.02197265625, - "reward": 0.7934320569038391, - "reward_std": 0.001142819644883275, - "rewards/perpo_ocr_edit_distance_reward": 0.7934321165084839, + "advantages": -0.0005960464477539062, + "completion_length": 322.0, + "delta_ref_entropy_loss": 0.08984375, + "delta_ref_ppl": -0.0771484375, + "entropy_loss": -0.020751953125, + "epoch": 0.153, + "grad_norm": 0.018524003102224282, + "k1_kl": 0.0771484375, + "k3_kl": 0.041259765625, + "kimi_kl": 0.10791015625, + "learning_rate": 4.2349999999999995e-07, + "loss": 0.0022, + "ppl": 0.004180908203125, + "reward": 0.9864864349365234, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.986486554145813, "step": 765, "temperature": 0.9 }, { - "advantages": -1.7000096192987257e-05, - "completion_length": 712.5, - "delta_ref_entropy_loss": 0.069580078125, - "delta_ref_ppl": -0.0361328125, - "entropy_loss": -0.0811767578125, - "epoch": 0.3064, - "grad_norm": 1.1734167545159373, - "k1_kl": 0.0361328125, - "k3_kl": 0.016632080078125, - "kimi_kl": 0.034912109375, - "learning_rate": 3.4679999999999996e-07, - "loss": 0.0007, - "ppl": 0.0445556640625, - "reward": 0.9551546573638916, - "reward_std": 0.013033025083132088, - "rewards/perpo_ocr_edit_distance_reward": 0.9551547169685364, + "advantages": 1.5088490727066528e-05, + "completion_length": 883.0, + "delta_ref_entropy_loss": 0.029296875, + "delta_ref_ppl": -0.044921875, + "entropy_loss": -0.03662109375, + "epoch": 0.1532, + "grad_norm": 0.49673561329099114, + "k1_kl": 0.044921875, + "k3_kl": 0.027099609375, + "kimi_kl": 0.0625, + "learning_rate": 4.234e-07, + "loss": 0.0011, + "ppl": 0.016845703125, + "reward": 0.983483612537384, + "reward_std": 0.002157347509637475, + "rewards/perpo_ocr_edit_distance_reward": 0.983483612537384, "step": 766, "temperature": 0.9 }, { - "advantages": -1.4368977645062841e-05, - "completion_length": 622.0, - "delta_ref_entropy_loss": 0.02166748046875, - "delta_ref_ppl": -0.011474609375, - "entropy_loss": -0.017120361328125, - "epoch": 0.3068, - "grad_norm": 0.44097976503704395, - "k1_kl": 0.011444091796875, - "k3_kl": 0.0063629150390625, - "kimi_kl": 0.0153656005859375, - "learning_rate": 3.466e-07, - "loss": 0.0003, - "ppl": 0.0091400146484375, - "reward": 0.9974852204322815, - "reward_std": 0.0007687183970119804, - "rewards/perpo_ocr_edit_distance_reward": 0.9974852800369263, + "advantages": -2.9359545806073584e-05, + "completion_length": 707.0, + "delta_ref_entropy_loss": 0.052001953125, + "delta_ref_ppl": -0.039306640625, + "entropy_loss": -0.0234375, + "epoch": 0.1534, + "grad_norm": 0.674428780424137, + "k1_kl": 0.039306640625, + "k3_kl": 0.02294921875, + "kimi_kl": 0.051025390625, + "learning_rate": 4.233e-07, + "loss": 0.0009, + "ppl": 0.01422119140625, + "reward": 0.9953629970550537, + "reward_std": 0.0010596453212201595, + "rewards/perpo_ocr_edit_distance_reward": 0.9953630566596985, "step": 767, "temperature": 0.9 }, { - "advantages": -8.919409765439923e-06, - "completion_length": 408.0, - "delta_ref_entropy_loss": 0.12060546875, - "delta_ref_ppl": -0.072021484375, - "entropy_loss": -0.18212890625, - "epoch": 0.3072, - "grad_norm": 1.7735623809502326, - "k1_kl": 0.072021484375, - "k3_kl": 0.0426025390625, - "kimi_kl": 0.067626953125, - "learning_rate": 3.464e-07, - "loss": 0.0017, - "ppl": 0.10009765625, - "reward": 0.8253172934055328, - "reward_std": 0.006426011328585446, - "rewards/perpo_ocr_edit_distance_reward": 0.8253173530101776, + "advantages": -1.992498255276587e-06, + "completion_length": 768.0, + "delta_ref_entropy_loss": 0.052734375, + "delta_ref_ppl": -0.0380859375, + "entropy_loss": -0.03564453125, + "epoch": 0.1536, + "grad_norm": 0.7416002337307059, + "k1_kl": 0.0380859375, + "k3_kl": 0.02001953125, + "kimi_kl": 0.038818359375, + "learning_rate": 4.232e-07, + "loss": 0.0008, + "ppl": 0.014892578125, + "reward": 0.979602575302124, + "reward_std": 0.004206902347505093, + "rewards/perpo_ocr_edit_distance_reward": 0.9796026349067688, "step": 768, "temperature": 0.9 }, { - "advantages": -3.516248511914455e-05, - "completion_length": 606.5, - "delta_ref_entropy_loss": 0.03326416015625, - "delta_ref_ppl": -0.0347900390625, - "entropy_loss": -0.03173828125, - "epoch": 0.3076, - "grad_norm": 0.6658470542711773, - "k1_kl": 0.03485107421875, - "k3_kl": 0.02142333984375, - "kimi_kl": 0.060546875, - "learning_rate": 3.462e-07, - "loss": 0.0009, - "ppl": 0.01519775390625, - "reward": 0.9862005114555359, - "reward_std": 0.005836794909555465, - "rewards/perpo_ocr_edit_distance_reward": 0.9862005412578583, + "advantages": -5.10896995820076e-08, + "completion_length": 219.0, + "delta_ref_entropy_loss": 0.09228515625, + "delta_ref_ppl": -0.1337890625, + "entropy_loss": -0.1162109375, + "epoch": 0.1538, + "grad_norm": 4.457012039869597, + "k1_kl": 0.1337890625, + "k3_kl": 0.0908203125, + "kimi_kl": 0.283203125, + "learning_rate": 4.2309999999999997e-07, + "loss": 0.0036, + "ppl": 0.061279296875, + "reward": 0.8820422291755676, + "reward_std": 0.2754378616809845, + "rewards/perpo_ocr_edit_distance_reward": 0.8820422291755676, "step": 769, "temperature": 0.9 }, { - "advantages": -2.017404449361493e-05, - "completion_length": 655.0, - "delta_ref_entropy_loss": 0.03741455078125, - "delta_ref_ppl": -0.02227783203125, - "entropy_loss": -0.05877685546875, - "epoch": 0.308, - "grad_norm": 0.7480404717129416, - "k1_kl": 0.02227783203125, - "k3_kl": 0.0126800537109375, - "kimi_kl": 0.02764892578125, - "learning_rate": 3.4599999999999995e-07, - "loss": 0.0005, - "ppl": 0.029388427734375, - "reward": 0.9939487874507904, - "reward_std": 0.0013876160082872957, - "rewards/perpo_ocr_edit_distance_reward": 0.9939487874507904, + "advantages": -3.294433918199502e-05, + "completion_length": 1236.0, + "delta_ref_entropy_loss": 0.072265625, + "delta_ref_ppl": -0.056884765625, + "entropy_loss": -0.06640625, + "epoch": 0.154, + "grad_norm": 2.5749954970669258, + "k1_kl": 0.05712890625, + "k3_kl": 0.0306396484375, + "kimi_kl": 0.05615234375, + "learning_rate": 4.2299999999999996e-07, + "loss": 0.0013, + "ppl": 0.0400390625, + "reward": 0.990517258644104, + "reward_std": 0.0017088382737711072, + "rewards/perpo_ocr_edit_distance_reward": 0.9905173778533936, "step": 770, "temperature": 0.9 }, { - "advantages": -1.1324882507324219e-06, - "completion_length": 358.5, - "delta_ref_entropy_loss": 0.017303466796875, - "delta_ref_ppl": -0.0515899658203125, - "entropy_loss": -0.040435791015625, - "epoch": 0.3084, - "grad_norm": 3.455382295516646, - "k1_kl": 0.0515899658203125, - "k3_kl": 0.03780364990234375, - "kimi_kl": 0.13167190551757812, - "learning_rate": 3.458e-07, - "loss": 0.0015, - "ppl": 0.0173797607421875, - "reward": 0.9918211102485657, - "reward_std": 0.005587208084762096, - "rewards/perpo_ocr_edit_distance_reward": 0.9918211698532104, + "advantages": -3.276552524766885e-05, + "completion_length": 1379.0, + "delta_ref_entropy_loss": 0.050048828125, + "delta_ref_ppl": -0.0439453125, + "entropy_loss": -0.06787109375, + "epoch": 0.1542, + "grad_norm": 1.0727209383211107, + "k1_kl": 0.0439453125, + "k3_kl": 0.0250244140625, + "kimi_kl": 0.0556640625, + "learning_rate": 4.2289999999999996e-07, + "loss": 0.001, + "ppl": 0.040283203125, + "reward": 0.9779393672943115, + "reward_std": 0.0009389087790623307, + "rewards/perpo_ocr_edit_distance_reward": 0.9779394268989563, "step": 771, "temperature": 0.9 }, { - "advantages": -1.1912414265680127e-05, - "completion_length": 840.0, - "delta_ref_entropy_loss": 0.05169677734375, - "delta_ref_ppl": -0.030059814453125, - "entropy_loss": -0.087646484375, - "epoch": 0.3088, - "grad_norm": 0.8431180148799183, - "k1_kl": 0.03009033203125, - "k3_kl": 0.01458740234375, - "kimi_kl": 0.02386474609375, - "learning_rate": 3.456e-07, - "loss": 0.0006, - "ppl": 0.047393798828125, - "reward": 0.9441504776477814, - "reward_std": 0.006438019569031894, - "rewards/perpo_ocr_edit_distance_reward": 0.9441505074501038, + "advantages": -5.458082796394592e-06, + "completion_length": 415.0, + "delta_ref_entropy_loss": 0.2412109375, + "delta_ref_ppl": -0.12890625, + "entropy_loss": -0.2470703125, + "epoch": 0.1544, + "grad_norm": 2.8083421665916193, + "k1_kl": 0.12890625, + "k3_kl": 0.0712890625, + "kimi_kl": 0.1279296875, + "learning_rate": 4.228e-07, + "loss": 0.0029, + "ppl": 0.138671875, + "reward": 0.7718127965927124, + "reward_std": 0.0045871734619140625, + "rewards/perpo_ocr_edit_distance_reward": 0.7718128561973572, "step": 772, "temperature": 0.9 }, { - "advantages": -1.3087477213957754e-05, - "completion_length": 643.5, - "delta_ref_entropy_loss": 0.0751953125, - "delta_ref_ppl": -0.0535888671875, - "entropy_loss": -0.1439208984375, - "epoch": 0.3092, - "grad_norm": 1.9349940485271446, - "k1_kl": 0.0537109375, - "k3_kl": 0.035400390625, - "kimi_kl": 0.095703125, - "learning_rate": 3.4539999999999996e-07, - "loss": 0.0014, - "ppl": 0.08197021484375, - "reward": 0.8463322222232819, - "reward_std": 0.006159664422739297, - "rewards/perpo_ocr_edit_distance_reward": 0.8463322520256042, + "advantages": -3.4059798537100505e-08, + "completion_length": 1484.0, + "delta_ref_entropy_loss": 0.06640625, + "delta_ref_ppl": -0.05322265625, + "entropy_loss": -0.16015625, + "epoch": 0.1546, + "grad_norm": 2.498114060369663, + "k1_kl": 0.053466796875, + "k3_kl": 0.050048828125, + "kimi_kl": 0.07666015625, + "learning_rate": 4.227e-07, + "loss": 0.002, + "ppl": 0.08642578125, + "reward": 0.5825161933898926, + "reward_std": 0.21384386718273163, + "rewards/perpo_ocr_edit_distance_reward": 0.5825161933898926, "step": 773, "temperature": 0.9 }, { - "advantages": -0.00014212302630767226, - "completion_length": 851.0, - "delta_ref_entropy_loss": 0.03765869140625, - "delta_ref_ppl": -0.02117919921875, - "entropy_loss": -0.04327392578125, - "epoch": 0.3096, - "grad_norm": 0.6102709405001968, - "k1_kl": 0.02117919921875, - "k3_kl": 0.010650634765625, - "kimi_kl": 0.02398681640625, - "learning_rate": 3.452e-07, - "loss": 0.0006, - "ppl": 0.02294921875, - "reward": 0.9851189255714417, - "reward_std": 0.0006232670712051913, - "rewards/perpo_ocr_edit_distance_reward": 0.9851190149784088, + "advantages": -6.037099228706211e-05, + "completion_length": 474.0, + "delta_ref_entropy_loss": 0.10009765625, + "delta_ref_ppl": -0.1240234375, + "entropy_loss": -0.03466796875, + "epoch": 0.1548, + "grad_norm": 0.9332448070576306, + "k1_kl": 0.12353515625, + "k3_kl": 0.07421875, + "kimi_kl": 0.2119140625, + "learning_rate": 4.2259999999999993e-07, + "loss": 0.003, + "ppl": 0.01513671875, + "reward": 0.9812923669815063, + "reward_std": 0.0011690047103911638, + "rewards/perpo_ocr_edit_distance_reward": 0.9812924861907959, "step": 774, "temperature": 0.9 }, { - "advantages": -6.267428761930205e-05, - "completion_length": 477.0, - "delta_ref_entropy_loss": 0.0654296875, - "delta_ref_ppl": -0.04888916015625, - "entropy_loss": -0.04010009765625, - "epoch": 0.31, - "grad_norm": 1.3814745166545987, - "k1_kl": 0.049072265625, - "k3_kl": 0.027008056640625, - "kimi_kl": 0.0750732421875, - "learning_rate": 3.45e-07, - "loss": 0.0011, - "ppl": 0.02093505859375, - "reward": 0.9905208349227905, - "reward_std": 0.0012764403218170628, - "rewards/perpo_ocr_edit_distance_reward": 0.9905208945274353, + "advantages": -1.4935221770429052e-05, + "completion_length": 552.0, + "delta_ref_entropy_loss": 0.046630859375, + "delta_ref_ppl": -0.05419921875, + "entropy_loss": -0.01611328125, + "epoch": 0.155, + "grad_norm": 0.6187617482899317, + "k1_kl": 0.05419921875, + "k3_kl": 0.031982421875, + "kimi_kl": 0.09521484375, + "learning_rate": 4.225e-07, + "loss": 0.0013, + "ppl": 0.00677490234375, + "reward": 0.9936630129814148, + "reward_std": 0.00046985066728666425, + "rewards/perpo_ocr_edit_distance_reward": 0.9936630129814148, "step": 775, "temperature": 0.9 }, { - "advantages": -2.276046006954857e-05, - "completion_length": 641.5, - "delta_ref_entropy_loss": 0.0369873046875, - "delta_ref_ppl": -0.0234375, - "entropy_loss": -0.02972412109375, - "epoch": 0.3104, - "grad_norm": 0.54724891842755, - "k1_kl": 0.0234375, - "k3_kl": 0.0120849609375, - "kimi_kl": 0.0330810546875, - "learning_rate": 3.4479999999999996e-07, - "loss": 0.0005, - "ppl": 0.013336181640625, - "reward": 0.9466366171836853, - "reward_std": 0.0017568565672263503, - "rewards/perpo_ocr_edit_distance_reward": 0.9466366469860077, + "advantages": -3.300394382677041e-05, + "completion_length": 429.0, + "delta_ref_entropy_loss": 0.134765625, + "delta_ref_ppl": -0.09423828125, + "entropy_loss": -0.09326171875, + "epoch": 0.1552, + "grad_norm": 1.3185034457876716, + "k1_kl": 0.09423828125, + "k3_kl": 0.050537109375, + "kimi_kl": 0.1220703125, + "learning_rate": 4.2239999999999997e-07, + "loss": 0.0021, + "ppl": 0.049072265625, + "reward": 0.8487809300422668, + "reward_std": 0.002224789233878255, + "rewards/perpo_ocr_edit_distance_reward": 0.8487810492515564, "step": 776, "temperature": 0.9 }, { - "advantages": -6.377697445714148e-06, - "completion_length": 370.0, - "delta_ref_entropy_loss": 0.077392578125, - "delta_ref_ppl": -0.05810546875, - "entropy_loss": -0.081787109375, - "epoch": 0.3108, - "grad_norm": 0.6837755282334198, - "k1_kl": 0.0582275390625, - "k3_kl": 0.031005859375, - "kimi_kl": 0.05792236328125, - "learning_rate": 3.446e-07, - "loss": 0.0012, - "ppl": 0.052490234375, - "reward": 0.9208467304706573, - "reward_std": 0.003286803839728236, - "rewards/perpo_ocr_edit_distance_reward": 0.9208468198776245, + "advantages": 7.66345493730114e-08, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.04248046875, + "delta_ref_ppl": -0.068359375, + "entropy_loss": -0.1767578125, + "epoch": 0.1554, + "grad_norm": 6.010761807432861, + "k1_kl": 0.068359375, + "k3_kl": 0.04833984375, + "kimi_kl": 0.12060546875, + "learning_rate": 4.2229999999999996e-07, + "loss": 0.0019, + "ppl": 0.0849609375, + "reward": 0.33271121978759766, + "reward_std": 0.3130019009113312, + "rewards/perpo_ocr_edit_distance_reward": 0.33271121978759766, "step": 777, "temperature": 0.9 }, { - "advantages": -4.243850889906753e-05, - "completion_length": 452.5, - "delta_ref_entropy_loss": 0.0390625, - "delta_ref_ppl": -0.0394287109375, - "entropy_loss": -0.06103515625, - "epoch": 0.3112, - "grad_norm": 2.4131970994885683, - "k1_kl": 0.03924560546875, - "k3_kl": 0.025146484375, - "kimi_kl": 0.0645751953125, - "learning_rate": 3.444e-07, - "loss": 0.0011, - "ppl": 0.03289794921875, - "reward": 0.9971324503421783, - "reward_std": 0.001341443188721314, - "rewards/perpo_ocr_edit_distance_reward": 0.9971325397491455, + "advantages": -3.405979782655777e-07, + "completion_length": 407.0, + "delta_ref_entropy_loss": 0.07568359375, + "delta_ref_ppl": -0.0869140625, + "entropy_loss": -0.142578125, + "epoch": 0.1556, + "grad_norm": 4.925195653488127, + "k1_kl": 0.0869140625, + "k3_kl": 0.058837890625, + "kimi_kl": 0.1611328125, + "learning_rate": 4.222e-07, + "loss": 0.0024, + "ppl": 0.080078125, + "reward": 0.9276726245880127, + "reward_std": 0.117629274725914, + "rewards/perpo_ocr_edit_distance_reward": 0.9276726841926575, "step": 778, "temperature": 0.9 }, { - "advantages": -1.27724248955019e-07, - "completion_length": 95.0, - "delta_ref_entropy_loss": 0.07421875, - "delta_ref_ppl": -0.158203125, - "entropy_loss": -0.10076904296875, - "epoch": 0.3116, - "grad_norm": 5.821575336772714, - "k1_kl": 0.15771484375, - "k3_kl": 0.1142578125, - "kimi_kl": 0.3701171875, - "learning_rate": 3.4419999999999997e-07, - "loss": 0.0046, - "ppl": 0.051605224609375, - "reward": 0.8117313086986542, - "reward_std": 0.09419988095760345, - "rewards/perpo_ocr_edit_distance_reward": 0.8117313385009766, + "advantages": -7.263252336997539e-05, + "completion_length": 414.0, + "delta_ref_entropy_loss": 0.057861328125, + "delta_ref_ppl": -0.045166015625, + "entropy_loss": -0.026611328125, + "epoch": 0.1558, + "grad_norm": 0.4007484178778142, + "k1_kl": 0.045166015625, + "k3_kl": 0.02392578125, + "kimi_kl": 0.056396484375, + "learning_rate": 4.2209999999999995e-07, + "loss": 0.001, + "ppl": 0.01611328125, + "reward": 0.9933943748474121, + "reward_std": 0.0003688979195430875, + "rewards/perpo_ocr_edit_distance_reward": 0.9933944940567017, "step": 779, "temperature": 0.9 }, { - "advantages": -8.769546730036382e-05, - "completion_length": 430.0, - "delta_ref_entropy_loss": 0.044189453125, - "delta_ref_ppl": -0.027313232421875, - "entropy_loss": -0.038848876953125, - "epoch": 0.312, - "grad_norm": 0.8945980717263853, - "k1_kl": 0.027313232421875, - "k3_kl": 0.01387786865234375, - "kimi_kl": 0.034271240234375, - "learning_rate": 3.4399999999999996e-07, - "loss": 0.0006, - "ppl": 0.02117156982421875, - "reward": 0.9868879616260529, - "reward_std": 0.0008406109700445086, - "rewards/perpo_ocr_edit_distance_reward": 0.9868880808353424, + "advantages": -0.0005960464477539062, + "completion_length": 476.0, + "delta_ref_entropy_loss": 0.0595703125, + "delta_ref_ppl": -0.04931640625, + "entropy_loss": -0.01422119140625, + "epoch": 0.156, + "grad_norm": 0.016953057939385837, + "k1_kl": 0.049072265625, + "k3_kl": 0.0263671875, + "kimi_kl": 0.0654296875, + "learning_rate": 4.2199999999999994e-07, + "loss": 0.0017, + "ppl": 0.00299072265625, + "reward": 0.9535398483276367, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9535399079322815, "step": 780, "temperature": 0.9 }, { - "advantages": -2.4267605365224654e-06, - "completion_length": 751.0, - "delta_ref_entropy_loss": 0.052001953125, - "delta_ref_ppl": -0.0411376953125, - "entropy_loss": -0.062255859375, - "epoch": 0.3124, - "grad_norm": 3.4876841472642943, - "k1_kl": 0.0411376953125, - "k3_kl": 0.0328369140625, - "kimi_kl": 0.08642578125, - "learning_rate": 3.438e-07, - "loss": 0.0013, - "ppl": 0.04107666015625, - "reward": 0.9642714262008667, - "reward_std": 0.007018005191639531, - "rewards/perpo_ocr_edit_distance_reward": 0.9642715156078339, + "advantages": -3.315721551189199e-05, + "completion_length": 158.0, + "delta_ref_entropy_loss": 0.052490234375, + "delta_ref_ppl": -0.1513671875, + "entropy_loss": -0.01953125, + "epoch": 0.1562, + "grad_norm": 1.627369925461531, + "k1_kl": 0.150390625, + "k3_kl": 0.119140625, + "kimi_kl": 0.419921875, + "learning_rate": 4.219e-07, + "loss": 0.0048, + "ppl": 0.00775146484375, + "reward": 0.9816390872001648, + "reward_std": 0.0011848531430587173, + "rewards/perpo_ocr_edit_distance_reward": 0.9816390872001648, "step": 781, "temperature": 0.9 }, { - "advantages": -0.0005960464477539062, - "completion_length": 279.5, - "delta_ref_entropy_loss": 0.0455322265625, - "delta_ref_ppl": -0.1708984375, - "entropy_loss": -0.04669189453125, - "epoch": 0.3128, - "grad_norm": 0.22759229419864302, - "k1_kl": 0.1708984375, - "k3_kl": 0.139556884765625, - "kimi_kl": 0.765625, - "learning_rate": 3.436e-07, - "loss": 0.0062, - "ppl": 0.0311279296875, - "reward": 0.9713644683361053, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9713645279407501, + "advantages": -2.1798270608996972e-05, + "completion_length": 686.0, + "delta_ref_entropy_loss": 0.064453125, + "delta_ref_ppl": -0.0712890625, + "entropy_loss": -0.037353515625, + "epoch": 0.1564, + "grad_norm": 0.7566864393222797, + "k1_kl": 0.0712890625, + "k3_kl": 0.04150390625, + "kimi_kl": 0.0966796875, + "learning_rate": 4.218e-07, + "loss": 0.0017, + "ppl": 0.021484375, + "reward": 0.9915087223052979, + "reward_std": 0.001853050896897912, + "rewards/perpo_ocr_edit_distance_reward": 0.9915087223052979, "step": 782, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 394.5, - "delta_ref_entropy_loss": 0.0362548828125, - "delta_ref_ppl": -0.0380859375, - "entropy_loss": -0.0303955078125, - "epoch": 0.3132, - "grad_norm": 0.041125691609266685, - "k1_kl": 0.0380859375, - "k3_kl": 0.0240478515625, - "kimi_kl": 0.0635986328125, - "learning_rate": 3.4339999999999996e-07, - "loss": 0.001, - "ppl": 0.01544189453125, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": -4.373277988634072e-05, + "completion_length": 515.0, + "delta_ref_entropy_loss": 0.058349609375, + "delta_ref_ppl": -0.049072265625, + "entropy_loss": -0.031494140625, + "epoch": 0.1566, + "grad_norm": 0.6563621805428295, + "k1_kl": 0.049072265625, + "k3_kl": 0.0260009765625, + "kimi_kl": 0.06982421875, + "learning_rate": 4.217e-07, + "loss": 0.0011, + "ppl": 0.01300048828125, + "reward": 0.9954259395599365, + "reward_std": 0.0006792612839490175, + "rewards/perpo_ocr_edit_distance_reward": 0.9954259395599365, "step": 783, "temperature": 0.9 }, { - "advantages": -1.585696554684546e-05, - "completion_length": 436.0, - "delta_ref_entropy_loss": 0.0364990234375, - "delta_ref_ppl": -0.0205078125, - "entropy_loss": -0.025177001953125, - "epoch": 0.3136, - "grad_norm": 0.5105407777935208, - "k1_kl": 0.020538330078125, - "k3_kl": 0.00927734375, - "kimi_kl": 0.018157958984375, - "learning_rate": 3.432e-07, - "loss": 0.0004, - "ppl": 0.01110076904296875, - "reward": 0.9981322586536407, - "reward_std": 0.0007555070333182812, - "rewards/perpo_ocr_edit_distance_reward": 0.9981322884559631, + "advantages": -3.789152469835244e-05, + "completion_length": 692.0, + "delta_ref_entropy_loss": 0.051513671875, + "delta_ref_ppl": -0.072265625, + "entropy_loss": -0.0196533203125, + "epoch": 0.1568, + "grad_norm": 0.3581956116723224, + "k1_kl": 0.072265625, + "k3_kl": 0.044189453125, + "kimi_kl": 0.154296875, + "learning_rate": 4.2159999999999996e-07, + "loss": 0.0018, + "ppl": 0.006072998046875, + "reward": 0.9946249723434448, + "reward_std": 0.00034934631548821926, + "rewards/perpo_ocr_edit_distance_reward": 0.9946249723434448, "step": 784, "temperature": 0.9 }, { - "advantages": -2.8092947104596533e-05, - "completion_length": 586.0, - "delta_ref_entropy_loss": 0.071533203125, - "delta_ref_ppl": -0.03314208984375, - "entropy_loss": -0.097900390625, - "epoch": 0.314, - "grad_norm": 1.408301281211268, - "k1_kl": 0.03326416015625, - "k3_kl": 0.015411376953125, - "kimi_kl": 0.0242919921875, - "learning_rate": 3.43e-07, - "loss": 0.0006, - "ppl": 0.0567626953125, - "reward": 0.7686764597892761, - "reward_std": 0.004767358215758577, - "rewards/perpo_ocr_edit_distance_reward": 0.7686765193939209, + "advantages": -0.0005960464477539062, + "completion_length": 300.0, + "delta_ref_entropy_loss": 0.0654296875, + "delta_ref_ppl": -0.07958984375, + "entropy_loss": -0.019287109375, + "epoch": 0.157, + "grad_norm": 0.010238655016596336, + "k1_kl": 0.07958984375, + "k3_kl": 0.049560546875, + "kimi_kl": 0.16015625, + "learning_rate": 4.2149999999999996e-07, + "loss": 0.0026, + "ppl": 0.004638671875, + "reward": 0.9844036102294922, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9844037294387817, "step": 785, "temperature": 0.9 }, { - "advantages": -2.2551843358087353e-05, - "completion_length": 469.0, - "delta_ref_entropy_loss": 0.056396484375, - "delta_ref_ppl": -0.12469482421875, - "entropy_loss": -0.12799072265625, - "epoch": 0.3144, - "grad_norm": 1.8553894537896702, - "k1_kl": 0.12420654296875, - "k3_kl": 0.09423828125, - "kimi_kl": 0.4095458984375, - "learning_rate": 3.4279999999999997e-07, - "loss": 0.0038, - "ppl": 0.0726776123046875, - "reward": 0.9305191338062286, - "reward_std": 0.01542601473920513, - "rewards/perpo_ocr_edit_distance_reward": 0.9305191338062286, + "advantages": 0.0, + "completion_length": 929.0, + "delta_ref_entropy_loss": 0.11181640625, + "delta_ref_ppl": -0.0615234375, + "entropy_loss": -0.1123046875, + "epoch": 0.1572, + "grad_norm": 1.5322767261905441, + "k1_kl": 0.0615234375, + "k3_kl": 0.028076171875, + "kimi_kl": 0.0498046875, + "learning_rate": 4.214e-07, + "loss": 0.0011, + "ppl": 0.054931640625, + "reward": 0.795596182346344, + "reward_std": 0.043831124901771545, + "rewards/perpo_ocr_edit_distance_reward": 0.795596182346344, "step": 786, "temperature": 0.9 }, { - "advantages": -0.00032134992761712056, - "completion_length": 365.0, - "delta_ref_entropy_loss": 0.0335693359375, - "delta_ref_ppl": -0.03094482421875, - "entropy_loss": -0.03857421875, - "epoch": 0.3148, - "grad_norm": 0.3739958807779021, - "k1_kl": 0.03094482421875, - "k3_kl": 0.019195556640625, - "kimi_kl": 0.043212890625, - "learning_rate": 3.426e-07, - "loss": 0.0011, - "ppl": 0.018798828125, - "reward": 0.9971530139446259, - "reward_std": 0.0002237300795968622, - "rewards/perpo_ocr_edit_distance_reward": 0.9971530437469482, + "advantages": -8.425542910117656e-05, + "completion_length": 584.0, + "delta_ref_entropy_loss": 0.04248046875, + "delta_ref_ppl": -0.03564453125, + "entropy_loss": -0.0167236328125, + "epoch": 0.1574, + "grad_norm": 0.7841998106606702, + "k1_kl": 0.03564453125, + "k3_kl": 0.01806640625, + "kimi_kl": 0.0439453125, + "learning_rate": 4.213e-07, + "loss": 0.0008, + "ppl": 0.007080078125, + "reward": 0.989984393119812, + "reward_std": 0.0005064141587354243, + "rewards/perpo_ocr_edit_distance_reward": 0.9899845123291016, "step": 787, "temperature": 0.9 }, { - "advantages": -4.8773631419862795e-05, - "completion_length": 436.0, - "delta_ref_entropy_loss": 0.04345703125, - "delta_ref_ppl": -0.0408935546875, - "entropy_loss": -0.041259765625, - "epoch": 0.3152, - "grad_norm": 0.934385094847352, - "k1_kl": 0.041015625, - "k3_kl": 0.02679443359375, - "kimi_kl": 0.074462890625, - "learning_rate": 3.4239999999999994e-07, - "loss": 0.0011, - "ppl": 0.02197265625, - "reward": 0.9904975891113281, - "reward_std": 0.009332134417491034, - "rewards/perpo_ocr_edit_distance_reward": 0.9904976189136505, + "advantages": -1.7029899268550253e-08, + "completion_length": 191.0, + "delta_ref_entropy_loss": 0.1650390625, + "delta_ref_ppl": -0.1572265625, + "entropy_loss": -0.138671875, + "epoch": 0.1576, + "grad_norm": 3.514717333610522, + "k1_kl": 0.158203125, + "k3_kl": 0.09326171875, + "kimi_kl": 0.26171875, + "learning_rate": 4.212e-07, + "loss": 0.0037, + "ppl": 0.061279296875, + "reward": 0.4277070462703705, + "reward_std": 0.040267977863550186, + "rewards/perpo_ocr_edit_distance_reward": 0.4277070164680481, "step": 788, "temperature": 0.9 }, { - "advantages": 8.059400244064818e-06, - "completion_length": 909.0, - "delta_ref_entropy_loss": 0.01885986328125, - "delta_ref_ppl": -0.01190185546875, - "entropy_loss": -0.02630615234375, - "epoch": 0.3156, - "grad_norm": 2.278828882141232, - "k1_kl": 0.011871337890625, - "k3_kl": 0.015472412109375, - "kimi_kl": 0.019256591796875, - "learning_rate": 3.422e-07, - "loss": 0.0006, - "ppl": 0.01580810546875, - "reward": 0.9386149942874908, - "reward_std": 0.0641345304902643, - "rewards/perpo_ocr_edit_distance_reward": 0.9386150240898132, + "advantages": -6.130763949840912e-07, + "completion_length": 392.0, + "delta_ref_entropy_loss": 0.10888671875, + "delta_ref_ppl": -0.1435546875, + "entropy_loss": -0.1396484375, + "epoch": 0.1578, + "grad_norm": 2.1351517228023535, + "k1_kl": 0.1435546875, + "k3_kl": 0.08642578125, + "kimi_kl": 0.1796875, + "learning_rate": 4.211e-07, + "loss": 0.0035, + "ppl": 0.06884765625, + "reward": 0.9223529696464539, + "reward_std": 0.0565863698720932, + "rewards/perpo_ocr_edit_distance_reward": 0.9223529696464539, "step": 789, "temperature": 0.9 }, { - "advantages": -2.6353769158049545e-06, - "completion_length": 659.5, - "delta_ref_entropy_loss": 0.0867919921875, - "delta_ref_ppl": -0.047760009765625, - "entropy_loss": -0.076904296875, - "epoch": 0.316, - "grad_norm": 1.4880681609718904, - "k1_kl": 0.04754638671875, - "k3_kl": 0.0218658447265625, - "kimi_kl": 0.041900634765625, - "learning_rate": 3.42e-07, - "loss": 0.0009, - "ppl": 0.040618896484375, - "reward": 0.9000988304615021, - "reward_std": 0.008318326435983181, - "rewards/perpo_ocr_edit_distance_reward": 0.9000988602638245, + "advantages": -1.663608259150351e-06, + "completion_length": 1134.0, + "delta_ref_entropy_loss": 0.0771484375, + "delta_ref_ppl": -0.058837890625, + "entropy_loss": -0.0791015625, + "epoch": 0.158, + "grad_norm": 2.073884400556421, + "k1_kl": 0.05908203125, + "k3_kl": 0.033447265625, + "kimi_kl": 0.0791015625, + "learning_rate": 4.2099999999999997e-07, + "loss": 0.0013, + "ppl": 0.0458984375, + "reward": 0.9004380702972412, + "reward_std": 0.015422608703374863, + "rewards/perpo_ocr_edit_distance_reward": 0.900438129901886, "step": 790, "temperature": 0.9 }, { - "advantages": -1.3879367770641693e-06, - "completion_length": 298.0, - "delta_ref_entropy_loss": 0.0482177734375, - "delta_ref_ppl": -0.054931640625, - "entropy_loss": -0.04217529296875, - "epoch": 0.3164, - "grad_norm": 1.1452227851123693, - "k1_kl": 0.05523681640625, - "k3_kl": 0.03387451171875, - "kimi_kl": 0.1014404296875, - "learning_rate": 3.418e-07, - "loss": 0.0014, - "ppl": 0.01800537109375, - "reward": 0.8935833275318146, - "reward_std": 0.0076850466430187225, - "rewards/perpo_ocr_edit_distance_reward": 0.8935833275318146, + "advantages": -8.002349932212383e-05, + "completion_length": 675.0, + "delta_ref_entropy_loss": 0.07275390625, + "delta_ref_ppl": -0.05419921875, + "entropy_loss": -0.03173828125, + "epoch": 0.1582, + "grad_norm": 0.6892510584782321, + "k1_kl": 0.05419921875, + "k3_kl": 0.026123046875, + "kimi_kl": 0.0615234375, + "learning_rate": 4.2089999999999996e-07, + "loss": 0.0011, + "ppl": 0.01385498046875, + "reward": 0.9599236845970154, + "reward_std": 0.000538370746653527, + "rewards/perpo_ocr_edit_distance_reward": 0.9599238038063049, "step": 791, "temperature": 0.9 }, { - "advantages": -2.3011651137494482e-05, - "completion_length": 317.0, - "delta_ref_entropy_loss": 0.0552978515625, - "delta_ref_ppl": -0.045654296875, - "entropy_loss": -0.044342041015625, - "epoch": 0.3168, - "grad_norm": 0.7682939462045079, - "k1_kl": 0.045654296875, - "k3_kl": 0.02691650390625, - "kimi_kl": 0.072021484375, - "learning_rate": 3.416e-07, - "loss": 0.0011, - "ppl": 0.022796630859375, - "reward": 0.7485902309417725, - "reward_std": 0.0004123573307879269, - "rewards/perpo_ocr_edit_distance_reward": 0.7485902458429337, + "advantages": -0.00010800362360896543, + "completion_length": 616.0, + "delta_ref_entropy_loss": 0.05908203125, + "delta_ref_ppl": -0.0654296875, + "entropy_loss": -0.0198974609375, + "epoch": 0.1584, + "grad_norm": 1.8161201307413193, + "k1_kl": 0.0654296875, + "k3_kl": 0.05908203125, + "kimi_kl": 0.1259765625, + "learning_rate": 4.208e-07, + "loss": 0.0025, + "ppl": 0.01123046875, + "reward": 0.9886487126350403, + "reward_std": 0.00045184302143752575, + "rewards/perpo_ocr_edit_distance_reward": 0.9886487722396851, "step": 792, "temperature": 0.9 }, { - "advantages": -2.3160662294685608e-06, - "completion_length": 299.0, - "delta_ref_entropy_loss": 0.09234619140625, - "delta_ref_ppl": -0.07342529296875, - "entropy_loss": -0.10467529296875, - "epoch": 0.3172, - "grad_norm": 1.1183110254442015, - "k1_kl": 0.07342529296875, - "k3_kl": 0.040496826171875, - "kimi_kl": 0.10601806640625, - "learning_rate": 3.4139999999999997e-07, - "loss": 0.0016, - "ppl": 0.057159423828125, - "reward": 0.9078116714954376, - "reward_std": 0.0036397711373865604, - "rewards/perpo_ocr_edit_distance_reward": 0.9078117311000824, + "advantages": -2.7886460429726867e-06, + "completion_length": 819.0, + "delta_ref_entropy_loss": 0.1220703125, + "delta_ref_ppl": -0.07275390625, + "entropy_loss": -0.1474609375, + "epoch": 0.1586, + "grad_norm": 1.6236503289507183, + "k1_kl": 0.07275390625, + "k3_kl": 0.032470703125, + "kimi_kl": 0.05908203125, + "learning_rate": 4.207e-07, + "loss": 0.0013, + "ppl": 0.078125, + "reward": 0.9137169718742371, + "reward_std": 0.009079577401280403, + "rewards/perpo_ocr_edit_distance_reward": 0.9137170314788818, "step": 793, "temperature": 0.9 }, { - "advantages": -5.287783824314829e-05, - "completion_length": 533.5, - "delta_ref_entropy_loss": 0.036865234375, - "delta_ref_ppl": -0.03558349609375, - "entropy_loss": -0.0394287109375, - "epoch": 0.3176, - "grad_norm": 0.8130489527362166, - "k1_kl": 0.03570556640625, - "k3_kl": 0.023193359375, - "kimi_kl": 0.06494140625, - "learning_rate": 3.412e-07, - "loss": 0.001, - "ppl": 0.01934814453125, - "reward": 0.9948058128356934, - "reward_std": 0.0010722557490225881, - "rewards/perpo_ocr_edit_distance_reward": 0.9948058724403381, + "advantages": -0.00012105704081477597, + "completion_length": 489.0, + "delta_ref_entropy_loss": 0.07861328125, + "delta_ref_ppl": -0.060791015625, + "entropy_loss": -0.01708984375, + "epoch": 0.1588, + "grad_norm": 0.3840364229446942, + "k1_kl": 0.060546875, + "k3_kl": 0.030029296875, + "kimi_kl": 0.08349609375, + "learning_rate": 4.2059999999999994e-07, + "loss": 0.0013, + "ppl": 0.00750732421875, + "reward": 0.9969061017036438, + "reward_std": 0.00025166498380713165, + "rewards/perpo_ocr_edit_distance_reward": 0.9969061613082886, "step": 794, "temperature": 0.9 }, { - "advantages": -8.58306884765625e-06, - "completion_length": 310.0, - "delta_ref_entropy_loss": 0.1416015625, - "delta_ref_ppl": -0.1920166015625, - "entropy_loss": -0.100830078125, - "epoch": 0.318, - "grad_norm": 0.4329042088024429, - "k1_kl": 0.1920166015625, - "k3_kl": 0.1441650390625, - "kimi_kl": 0.5526123046875, - "learning_rate": 3.41e-07, - "loss": 0.0058, - "ppl": 0.05169677734375, - "reward": 0.9985481798648834, - "reward_std": 0.00044574131607078016, - "rewards/perpo_ocr_edit_distance_reward": 0.9985482096672058, + "advantages": -1.1248248483752832e-05, + "completion_length": 631.0, + "delta_ref_entropy_loss": 0.1875, + "delta_ref_ppl": -0.1044921875, + "entropy_loss": -0.2265625, + "epoch": 0.159, + "grad_norm": 2.384007856264862, + "k1_kl": 0.1044921875, + "k3_kl": 0.0615234375, + "kimi_kl": 0.11865234375, + "learning_rate": 4.205e-07, + "loss": 0.0025, + "ppl": 0.12109375, + "reward": 0.8905383944511414, + "reward_std": 0.007472456432878971, + "rewards/perpo_ocr_edit_distance_reward": 0.8905385136604309, "step": 795, "temperature": 0.9 }, { - "advantages": -6.967782974243164e-05, - "completion_length": 518.0, - "delta_ref_entropy_loss": 0.029327392578125, - "delta_ref_ppl": -0.0531005859375, - "entropy_loss": -0.027374267578125, - "epoch": 0.3184, - "grad_norm": 0.42131523433907614, - "k1_kl": 0.0531005859375, - "k3_kl": 0.039764404296875, - "kimi_kl": 0.150390625, - "learning_rate": 3.408e-07, - "loss": 0.0017, - "ppl": 0.010284423828125, - "reward": 0.9996259212493896, - "reward_std": 0.0002555323007982224, - "rewards/perpo_ocr_edit_distance_reward": 0.9996259808540344, - "step": 796, - "temperature": 0.9 - }, - { - "advantages": -4.166790529325226e-05, - "completion_length": 1026.5, - "delta_ref_entropy_loss": 0.02484130859375, - "delta_ref_ppl": -0.022491455078125, - "entropy_loss": -0.0247802734375, - "epoch": 0.3188, - "grad_norm": 12.598760814824805, - "k1_kl": 0.0225830078125, - "k3_kl": 0.038909912109375, - "kimi_kl": 0.04327392578125, - "learning_rate": 3.406e-07, - "loss": 0.0016, - "ppl": 0.013824462890625, - "reward": 0.938540130853653, - "reward_std": 0.012629727840248961, - "rewards/perpo_ocr_edit_distance_reward": 0.9385402202606201, + "advantages": -2.6736940981209045e-06, + "completion_length": 526.0, + "delta_ref_entropy_loss": 0.1396484375, + "delta_ref_ppl": -0.099609375, + "entropy_loss": -0.130859375, + "epoch": 0.1592, + "grad_norm": 5.302582010393888, + "k1_kl": 0.099609375, + "k3_kl": 0.05615234375, + "kimi_kl": 0.1484375, + "learning_rate": 4.204e-07, + "loss": 0.0023, + "ppl": 0.05615234375, + "reward": 0.9406940340995789, + "reward_std": 0.009498147293925285, + "rewards/perpo_ocr_edit_distance_reward": 0.9406941533088684, + "step": 796, + "temperature": 0.9 + }, + { + "advantages": -2.6498522856854834e-05, + "completion_length": 764.0, + "delta_ref_entropy_loss": 0.0341796875, + "delta_ref_ppl": -0.03759765625, + "entropy_loss": -0.0224609375, + "epoch": 0.1594, + "grad_norm": 0.7759152249438518, + "k1_kl": 0.03759765625, + "k3_kl": 0.0218505859375, + "kimi_kl": 0.046630859375, + "learning_rate": 4.2029999999999997e-07, + "loss": 0.0009, + "ppl": 0.0111083984375, + "reward": 0.9964243769645691, + "reward_std": 0.0011847623391076922, + "rewards/perpo_ocr_edit_distance_reward": 0.9964244365692139, "step": 797, "temperature": 0.9 }, { - "advantages": -4.979116897629865e-05, - "completion_length": 752.5, - "delta_ref_entropy_loss": 0.0948486328125, - "delta_ref_ppl": -0.0723876953125, - "entropy_loss": -0.121826171875, - "epoch": 0.3192, - "grad_norm": 1.199177174071988, - "k1_kl": 0.0721435546875, - "k3_kl": 0.0421142578125, - "kimi_kl": 0.10986328125, - "learning_rate": 3.4039999999999995e-07, - "loss": 0.0017, - "ppl": 0.072265625, - "reward": 0.9281732439994812, - "reward_std": 0.003085833915974945, - "rewards/perpo_ocr_edit_distance_reward": 0.9281733632087708, + "advantages": 0.0, + "completion_length": 404.0, + "delta_ref_entropy_loss": 0.111328125, + "delta_ref_ppl": -0.0830078125, + "entropy_loss": -0.046142578125, + "epoch": 0.1596, + "grad_norm": 0.046876313485267815, + "k1_kl": 0.0830078125, + "k3_kl": 0.044921875, + "kimi_kl": 0.1142578125, + "learning_rate": 4.202e-07, + "loss": 0.0018, + "ppl": 0.0196533203125, + "reward": 0.9483436346054077, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9483436346054077, "step": 798, "temperature": 0.9 }, { - "advantages": -0.00030815601348876953, - "completion_length": 386.0, - "delta_ref_entropy_loss": 0.03955078125, - "delta_ref_ppl": -0.03472900390625, - "entropy_loss": -0.019744873046875, - "epoch": 0.3196, - "grad_norm": 0.6008697150288083, - "k1_kl": 0.03466796875, - "k3_kl": 0.02142333984375, - "kimi_kl": 0.0665283203125, - "learning_rate": 3.402e-07, - "loss": 0.0012, - "ppl": 0.00833892822265625, - "reward": 0.9935556352138519, - "reward_std": 0.0007901951321400702, - "rewards/perpo_ocr_edit_distance_reward": 0.9935556948184967, + "advantages": 5.34483406227082e-05, + "completion_length": 272.0, + "delta_ref_entropy_loss": 0.10400390625, + "delta_ref_ppl": -0.08935546875, + "entropy_loss": -0.04443359375, + "epoch": 0.1598, + "grad_norm": 0.8488819458843805, + "k1_kl": 0.0888671875, + "k3_kl": 0.05078125, + "kimi_kl": 0.1494140625, + "learning_rate": 4.2009999999999996e-07, + "loss": 0.002, + "ppl": 0.014404296875, + "reward": 0.6768125891685486, + "reward_std": 0.0005372219020500779, + "rewards/perpo_ocr_edit_distance_reward": 0.6768125891685486, "step": 799, "temperature": 0.9 }, { - "advantages": -2.952133036160376e-05, - "completion_length": 471.0, - "delta_ref_entropy_loss": 0.03155517578125, - "delta_ref_ppl": -0.027587890625, - "entropy_loss": -0.01837158203125, - "epoch": 0.32, - "grad_norm": 0.29180309304515084, - "k1_kl": 0.027587890625, - "k3_kl": 0.016754150390625, - "kimi_kl": 0.04107666015625, - "learning_rate": 3.4000000000000003e-07, - "loss": 0.0007, - "ppl": 0.0091400146484375, - "reward": 0.9932643175125122, - "reward_std": 0.0006710457964800298, - "rewards/perpo_ocr_edit_distance_reward": 0.993264377117157, + "advantages": -2.3509775928687304e-05, + "completion_length": 424.0, + "delta_ref_entropy_loss": 0.0673828125, + "delta_ref_ppl": -0.09130859375, + "entropy_loss": -0.036865234375, + "epoch": 0.16, + "grad_norm": 1.0838171649099273, + "k1_kl": 0.09130859375, + "k3_kl": 0.060791015625, + "kimi_kl": 0.18359375, + "learning_rate": 4.1999999999999995e-07, + "loss": 0.0025, + "ppl": 0.018310546875, + "reward": 0.9650743007659912, + "reward_std": 0.002072311704978347, + "rewards/perpo_ocr_edit_distance_reward": 0.9650744199752808, "step": 800, "temperature": 0.9 }, { - "advantages": -1.711504955892451e-05, - "completion_length": 299.5, - "delta_ref_entropy_loss": 0.0477294921875, - "delta_ref_ppl": -0.070556640625, - "entropy_loss": -0.048828125, - "epoch": 0.3204, - "grad_norm": 0.73240111329276, - "k1_kl": 0.070556640625, - "k3_kl": 0.047119140625, - "kimi_kl": 0.117431640625, - "learning_rate": 3.3979999999999996e-07, - "loss": 0.0019, - "ppl": 0.0247802734375, - "reward": 0.9988527297973633, - "reward_std": 0.0006960561731830239, - "rewards/perpo_ocr_edit_distance_reward": 0.9988527595996857, + "advantages": -1.1580331147342804e-06, + "completion_length": 693.0, + "delta_ref_entropy_loss": 0.038818359375, + "delta_ref_ppl": -0.040771484375, + "entropy_loss": -0.0125732421875, + "epoch": 0.1602, + "grad_norm": 1.2440460843614378, + "k1_kl": 0.040771484375, + "k3_kl": 0.0234375, + "kimi_kl": 0.0693359375, + "learning_rate": 4.199e-07, + "loss": 0.0009, + "ppl": 0.0059814453125, + "reward": 0.943435549736023, + "reward_std": 0.0667196735739708, + "rewards/perpo_ocr_edit_distance_reward": 0.9434356689453125, "step": 801, "temperature": 0.9 }, { - "advantages": -3.2356808787881164e-07, - "completion_length": 330.5, - "delta_ref_entropy_loss": 0.040283203125, - "delta_ref_ppl": -0.064697265625, - "entropy_loss": -0.029815673828125, - "epoch": 0.3208, - "grad_norm": 0.4829898654536539, - "k1_kl": 0.064697265625, - "k3_kl": 0.04620361328125, - "kimi_kl": 0.15234375, - "learning_rate": 3.396e-07, - "loss": 0.0018, - "ppl": 0.012676239013671875, - "reward": 0.9251333773136139, - "reward_std": 0.01323766354471445, - "rewards/perpo_ocr_edit_distance_reward": 0.9251333773136139, + "advantages": -4.8739570047473535e-05, + "completion_length": 572.0, + "delta_ref_entropy_loss": 0.080078125, + "delta_ref_ppl": -0.0634765625, + "entropy_loss": -0.033935546875, + "epoch": 0.1604, + "grad_norm": 1.0849961677146533, + "k1_kl": 0.0634765625, + "k3_kl": 0.03173828125, + "kimi_kl": 0.07470703125, + "learning_rate": 4.198e-07, + "loss": 0.0013, + "ppl": 0.0166015625, + "reward": 0.9840584993362427, + "reward_std": 0.00042416603537276387, + "rewards/perpo_ocr_edit_distance_reward": 0.9840585589408875, "step": 802, "temperature": 0.9 }, { - "advantages": -2.869538093364099e-06, - "completion_length": 509.5, - "delta_ref_entropy_loss": 0.05078125, - "delta_ref_ppl": -0.03277587890625, - "entropy_loss": -0.0391845703125, - "epoch": 0.3212, - "grad_norm": 0.9145176212078032, - "k1_kl": 0.0325927734375, - "k3_kl": 0.02069091796875, - "kimi_kl": 0.054931640625, - "learning_rate": 3.394e-07, - "loss": 0.0008, - "ppl": 0.020721435546875, - "reward": 0.9774698615074158, - "reward_std": 0.012772734044119716, - "rewards/perpo_ocr_edit_distance_reward": 0.9774699211120605, + "advantages": -7.748603820800781e-06, + "completion_length": 798.0, + "delta_ref_entropy_loss": 0.05810546875, + "delta_ref_ppl": -0.04345703125, + "entropy_loss": -0.038818359375, + "epoch": 0.1606, + "grad_norm": 0.6446120899363498, + "k1_kl": 0.043212890625, + "k3_kl": 0.022216796875, + "kimi_kl": 0.0576171875, + "learning_rate": 4.197e-07, + "loss": 0.0009, + "ppl": 0.015380859375, + "reward": 0.9193780422210693, + "reward_std": 0.007603465113788843, + "rewards/perpo_ocr_edit_distance_reward": 0.9193781614303589, "step": 803, "temperature": 0.9 }, { - "advantages": -2.021023340148531e-05, - "completion_length": 510.5, - "delta_ref_entropy_loss": 0.0489501953125, - "delta_ref_ppl": -0.04248046875, - "entropy_loss": -0.0562744140625, - "epoch": 0.3216, - "grad_norm": 0.9547729649051718, - "k1_kl": 0.04248046875, - "k3_kl": 0.02520751953125, - "kimi_kl": 0.0511474609375, - "learning_rate": 3.3919999999999997e-07, - "loss": 0.001, - "ppl": 0.031005859375, - "reward": 0.7314193546772003, - "reward_std": 0.012968810013262555, - "rewards/perpo_ocr_edit_distance_reward": 0.7314193695783615, + "advantages": -7.867813110351562e-06, + "completion_length": 514.0, + "delta_ref_entropy_loss": 0.03515625, + "delta_ref_ppl": -0.0458984375, + "entropy_loss": -0.01177978515625, + "epoch": 0.1608, + "grad_norm": 0.6072919823399773, + "k1_kl": 0.0458984375, + "k3_kl": 0.0296630859375, + "kimi_kl": 0.07373046875, + "learning_rate": 4.1959999999999997e-07, + "loss": 0.0012, + "ppl": 0.00408935546875, + "reward": 0.9684178829193115, + "reward_std": 0.006392087787389755, + "rewards/perpo_ocr_edit_distance_reward": 0.9684180021286011, "step": 804, "temperature": 0.9 }, { - "advantages": -6.602917608233838e-05, - "completion_length": 1136.5, - "delta_ref_entropy_loss": 0.02069091796875, - "delta_ref_ppl": -0.02691650390625, - "entropy_loss": -0.027587890625, - "epoch": 0.322, - "grad_norm": 1.6059491175176308, - "k1_kl": 0.02685546875, - "k3_kl": 0.0189208984375, - "kimi_kl": 0.046630859375, - "learning_rate": 3.39e-07, - "loss": 0.0008, - "ppl": 0.014617919921875, - "reward": 0.986823558807373, - "reward_std": 0.0028383527242112905, - "rewards/perpo_ocr_edit_distance_reward": 0.9868236184120178, + "advantages": -6.982258469179214e-07, + "completion_length": 2042.0, + "delta_ref_entropy_loss": 0.0810546875, + "delta_ref_ppl": -0.058349609375, + "entropy_loss": -0.53515625, + "epoch": 0.161, + "grad_norm": 25508.11064326471, + "k1_kl": 0.058349609375, + "k3_kl": 82.0, + "kimi_kl": 0.1708984375, + "learning_rate": 4.1949999999999996e-07, + "loss": 3.2936, + "ppl": 0.404296875, + "reward": 0.6176445484161377, + "reward_std": 0.06122004985809326, + "rewards/perpo_ocr_edit_distance_reward": 0.6176446080207825, "step": 805, "temperature": 0.9 }, { - "advantages": 3.916876636367306e-07, - "completion_length": 740.5, - "delta_ref_entropy_loss": 0.0592041015625, - "delta_ref_ppl": -0.039794921875, - "entropy_loss": -0.14312744140625, - "epoch": 0.3224, - "grad_norm": 9.136957482609962, - "k1_kl": 0.0396728515625, - "k3_kl": 0.02783203125, - "kimi_kl": 0.0743408203125, - "learning_rate": 3.388e-07, - "loss": 0.0011, - "ppl": 0.0830078125, - "reward": 0.9151336252689362, - "reward_std": 0.11569307325407863, - "rewards/perpo_ocr_edit_distance_reward": 0.9151336848735809, + "advantages": -0.00017794968152884394, + "completion_length": 399.0, + "delta_ref_entropy_loss": 0.072265625, + "delta_ref_ppl": -0.0791015625, + "entropy_loss": -0.01806640625, + "epoch": 0.1612, + "grad_norm": 0.7823257936134655, + "k1_kl": 0.0791015625, + "k3_kl": 0.046875, + "kimi_kl": 0.1591796875, + "learning_rate": 4.1939999999999996e-07, + "loss": 0.0021, + "ppl": 0.007110595703125, + "reward": 0.8730311393737793, + "reward_std": 0.00023494078777730465, + "rewards/perpo_ocr_edit_distance_reward": 0.8730311989784241, "step": 806, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 617.0, - "delta_ref_entropy_loss": 0.0238037109375, - "delta_ref_ppl": -0.02911376953125, - "entropy_loss": -0.012786865234375, - "epoch": 0.3228, - "grad_norm": 0.018563628027892645, - "k1_kl": 0.02899169921875, - "k3_kl": 0.020782470703125, - "kimi_kl": 0.09075164794921875, - "learning_rate": 3.386e-07, - "loss": 0.0008, - "ppl": 0.005584716796875, - "reward": 0.9960866570472717, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9960866868495941, + "advantages": -4.1944640543079004e-05, + "completion_length": 103.0, + "delta_ref_entropy_loss": 0.04736328125, + "delta_ref_ppl": -0.19921875, + "entropy_loss": -0.0439453125, + "epoch": 0.1614, + "grad_norm": 2.971109477096801, + "k1_kl": 0.2001953125, + "k3_kl": 0.154296875, + "kimi_kl": 0.57421875, + "learning_rate": 4.193e-07, + "loss": 0.0062, + "ppl": 0.020751953125, + "reward": 0.8904573917388916, + "reward_std": 0.0019286618335172534, + "rewards/perpo_ocr_edit_distance_reward": 0.8904574513435364, "step": 807, "temperature": 0.9 }, { - "advantages": -0.0001030692073982209, - "completion_length": 522.0, - "delta_ref_entropy_loss": 0.0501708984375, - "delta_ref_ppl": -0.0284423828125, - "entropy_loss": -0.01959228515625, - "epoch": 0.3232, - "grad_norm": 0.3280900994504297, - "k1_kl": 0.02850341796875, - "k3_kl": 0.01348876953125, - "kimi_kl": 0.032196044921875, - "learning_rate": 3.3839999999999996e-07, - "loss": 0.0006, - "ppl": 0.0075531005859375, - "reward": 0.9946532845497131, - "reward_std": 0.00025979787460528314, - "rewards/perpo_ocr_edit_distance_reward": 0.9946533441543579, + "advantages": -8.65800102474168e-05, + "completion_length": 240.0, + "delta_ref_entropy_loss": 0.09375, + "delta_ref_ppl": -0.11083984375, + "entropy_loss": -0.04052734375, + "epoch": 0.1616, + "grad_norm": 1.6958799656878956, + "k1_kl": 0.11083984375, + "k3_kl": 0.0673828125, + "kimi_kl": 0.1728515625, + "learning_rate": 4.192e-07, + "loss": 0.0028, + "ppl": 0.025390625, + "reward": 0.9842352867126465, + "reward_std": 0.0007850959664210677, + "rewards/perpo_ocr_edit_distance_reward": 0.984235405921936, "step": 808, "temperature": 0.9 }, { - "advantages": -4.998275471734814e-06, - "completion_length": 825.0, - "delta_ref_entropy_loss": 0.03118896484375, - "delta_ref_ppl": -0.03173828125, - "entropy_loss": -0.0279541015625, - "epoch": 0.3236, - "grad_norm": 0.5599734748361459, - "k1_kl": 0.031768798828125, - "k3_kl": 0.0215301513671875, - "kimi_kl": 0.0895843505859375, - "learning_rate": 3.382e-07, - "loss": 0.0009, - "ppl": 0.01361083984375, - "reward": 0.9969905018806458, - "reward_std": 0.0017574463054188527, - "rewards/perpo_ocr_edit_distance_reward": 0.9969905018806458, + "advantages": 0.0, + "completion_length": 716.0, + "delta_ref_entropy_loss": 0.0673828125, + "delta_ref_ppl": -0.0830078125, + "entropy_loss": -0.0400390625, + "epoch": 0.1618, + "grad_norm": 3.262384751347607, + "k1_kl": 0.0830078125, + "k3_kl": 0.05419921875, + "kimi_kl": 0.1591796875, + "learning_rate": 4.1909999999999993e-07, + "loss": 0.0022, + "ppl": 0.0208740234375, + "reward": 0.9812668561935425, + "reward_std": 0.003171427408233285, + "rewards/perpo_ocr_edit_distance_reward": 0.9812668561935425, "step": 809, "temperature": 0.9 }, { - "advantages": 6.556511209510063e-07, - "completion_length": 414.5, - "delta_ref_entropy_loss": 0.054443359375, - "delta_ref_ppl": -0.0595703125, - "entropy_loss": -0.08740234375, - "epoch": 0.324, - "grad_norm": 2.5056307016009045, - "k1_kl": 0.0596923828125, - "k3_kl": 0.0345458984375, - "kimi_kl": 0.08984375, - "learning_rate": 3.38e-07, - "loss": 0.0014, - "ppl": 0.0445556640625, - "reward": 0.6392226368188858, - "reward_std": 0.0277191917411983, - "rewards/perpo_ocr_edit_distance_reward": 0.6392226815223694, + "advantages": -9.519713785266504e-06, + "completion_length": 701.0, + "delta_ref_entropy_loss": 0.06591796875, + "delta_ref_ppl": -0.0517578125, + "entropy_loss": -0.0751953125, + "epoch": 0.162, + "grad_norm": 2.7896772753664876, + "k1_kl": 0.0517578125, + "k3_kl": 0.03173828125, + "kimi_kl": 0.0703125, + "learning_rate": 4.19e-07, + "loss": 0.0013, + "ppl": 0.03857421875, + "reward": 0.9910843372344971, + "reward_std": 0.0016871015541255474, + "rewards/perpo_ocr_edit_distance_reward": 0.9910843968391418, "step": 810, "temperature": 0.9 }, { - "advantages": -5.2571298510883935e-05, - "completion_length": 759.5, - "delta_ref_entropy_loss": 0.0625, - "delta_ref_ppl": -0.03179931640625, - "entropy_loss": -0.059814453125, - "epoch": 0.3244, - "grad_norm": 3.574161759394232, - "k1_kl": 0.03179931640625, - "k3_kl": 0.0179443359375, - "kimi_kl": 0.0355224609375, - "learning_rate": 3.3779999999999997e-07, - "loss": 0.0008, - "ppl": 0.03582763671875, - "reward": 0.9729870855808258, - "reward_std": 0.0014838093484286219, - "rewards/perpo_ocr_edit_distance_reward": 0.9729871451854706, + "advantages": -7.144043047446758e-05, + "completion_length": 368.0, + "delta_ref_entropy_loss": 0.0830078125, + "delta_ref_ppl": -0.080078125, + "entropy_loss": -0.02783203125, + "epoch": 0.1622, + "grad_norm": 1.0357591952180007, + "k1_kl": 0.080078125, + "k3_kl": 0.0439453125, + "kimi_kl": 0.1064453125, + "learning_rate": 4.1889999999999997e-07, + "loss": 0.0018, + "ppl": 0.0076904296875, + "reward": 0.9956439137458801, + "reward_std": 0.0006151861161924899, + "rewards/perpo_ocr_edit_distance_reward": 0.9956439733505249, "step": 811, "temperature": 0.9 }, { - "advantages": -7.909536361694336e-05, - "completion_length": 802.0, - "delta_ref_entropy_loss": 0.060546875, - "delta_ref_ppl": -0.04180908203125, - "entropy_loss": -0.08233642578125, - "epoch": 0.3248, - "grad_norm": 0.7689094930388248, - "k1_kl": 0.04168701171875, - "k3_kl": 0.021240234375, - "kimi_kl": 0.0484619140625, - "learning_rate": 3.376e-07, - "loss": 0.0009, - "ppl": 0.049102783203125, - "reward": 0.9069635570049286, - "reward_std": 0.0007822522893548012, - "rewards/perpo_ocr_edit_distance_reward": 0.9069636166095734, + "advantages": -1.2014594176434912e-05, + "completion_length": 575.0, + "delta_ref_entropy_loss": 0.193359375, + "delta_ref_ppl": -0.10693359375, + "entropy_loss": -0.2177734375, + "epoch": 0.1624, + "grad_norm": 3.2398129138765537, + "k1_kl": 0.107421875, + "k3_kl": 0.053955078125, + "kimi_kl": 0.08837890625, + "learning_rate": 4.1879999999999996e-07, + "loss": 0.0022, + "ppl": 0.11865234375, + "reward": 0.653526782989502, + "reward_std": 0.006994398310780525, + "rewards/perpo_ocr_edit_distance_reward": 0.6535269021987915, "step": 812, "temperature": 0.9 }, { - "advantages": -6.305320027877315e-06, - "completion_length": 507.5, - "delta_ref_entropy_loss": 0.0225830078125, - "delta_ref_ppl": -0.03265380859375, - "entropy_loss": -0.02142333984375, - "epoch": 0.3252, - "grad_norm": 0.8920680529971825, - "k1_kl": 0.03271484375, - "k3_kl": 0.02325439453125, - "kimi_kl": 0.067626953125, - "learning_rate": 3.3739999999999994e-07, - "loss": 0.0009, - "ppl": 0.01141357421875, - "reward": 0.9910511374473572, - "reward_std": 0.014994880853919312, - "rewards/perpo_ocr_edit_distance_reward": 0.9910511672496796, + "advantages": -7.115943299140781e-05, + "completion_length": 321.0, + "delta_ref_entropy_loss": 0.09423828125, + "delta_ref_ppl": -0.0966796875, + "entropy_loss": -0.025146484375, + "epoch": 0.1626, + "grad_norm": 0.5629934498644853, + "k1_kl": 0.09716796875, + "k3_kl": 0.054443359375, + "kimi_kl": 0.1416015625, + "learning_rate": 4.187e-07, + "loss": 0.0023, + "ppl": 0.0079345703125, + "reward": 0.9818207621574402, + "reward_std": 0.0003787218302022666, + "rewards/perpo_ocr_edit_distance_reward": 0.981820821762085, "step": 813, "temperature": 0.9 }, { - "advantages": -7.268999618759153e-05, - "completion_length": 568.0, - "delta_ref_entropy_loss": 0.04150390625, - "delta_ref_ppl": -0.085205078125, - "entropy_loss": -0.0712890625, - "epoch": 0.3256, - "grad_norm": 2.0124711706814025, - "k1_kl": 0.085693359375, - "k3_kl": 0.063079833984375, - "kimi_kl": 0.26513671875, - "learning_rate": 3.372e-07, - "loss": 0.0026, - "ppl": 0.03802490234375, - "reward": 0.9800465703010559, - "reward_std": 0.04240935941925272, - "rewards/perpo_ocr_edit_distance_reward": 0.9800466597080231, + "advantages": -6.386212135112146e-06, + "completion_length": 1252.0, + "delta_ref_entropy_loss": 0.0184326171875, + "delta_ref_ppl": -0.0211181640625, + "entropy_loss": -0.031005859375, + "epoch": 0.1628, + "grad_norm": 0.6595459327765102, + "k1_kl": 0.0211181640625, + "k3_kl": 0.0140380859375, + "kimi_kl": 0.03564453125, + "learning_rate": 4.186e-07, + "loss": 0.0006, + "ppl": 0.015869140625, + "reward": 0.9927265048027039, + "reward_std": 0.0012346161529421806, + "rewards/perpo_ocr_edit_distance_reward": 0.9927265048027039, "step": 814, "temperature": 0.9 }, { - "advantages": -4.023313522338867e-05, - "completion_length": 429.5, - "delta_ref_entropy_loss": 0.065673828125, - "delta_ref_ppl": -0.0701904296875, - "entropy_loss": -0.02783203125, - "epoch": 0.326, - "grad_norm": 0.32602220025535433, - "k1_kl": 0.0699462890625, - "k3_kl": 0.042510986328125, - "kimi_kl": 0.111572265625, - "learning_rate": 3.37e-07, - "loss": 0.0017, - "ppl": 0.01026153564453125, - "reward": 0.9984513521194458, - "reward_std": 0.0002674342249520123, - "rewards/perpo_ocr_edit_distance_reward": 0.9984513819217682, + "advantages": -0.00014334065781440586, + "completion_length": 527.0, + "delta_ref_entropy_loss": 0.1328125, + "delta_ref_ppl": -0.07861328125, + "entropy_loss": -0.049072265625, + "epoch": 0.163, + "grad_norm": 0.6321781171345409, + "k1_kl": 0.07861328125, + "k3_kl": 0.033203125, + "kimi_kl": 0.08935546875, + "learning_rate": 4.1849999999999994e-07, + "loss": 0.0015, + "ppl": 0.0220947265625, + "reward": 0.969143807888031, + "reward_std": 0.0004346006317064166, + "rewards/perpo_ocr_edit_distance_reward": 0.9691439270973206, "step": 815, "temperature": 0.9 }, { - "advantages": -1.6506230167578906e-05, - "completion_length": 435.0, - "delta_ref_entropy_loss": 0.06658935546875, - "delta_ref_ppl": -0.2484130859375, - "entropy_loss": -0.042236328125, - "epoch": 0.3264, - "grad_norm": 11.509120575527371, - "k1_kl": 0.2474365234375, - "k3_kl": 0.19207763671875, - "kimi_kl": 0.7822265625, - "learning_rate": 3.368e-07, - "loss": 0.0077, - "ppl": 0.0221405029296875, - "reward": 0.9991106390953064, - "reward_std": 0.0022801135783083737, - "rewards/perpo_ocr_edit_distance_reward": 0.9991106986999512, + "advantages": -1.8392290712654358e-06, + "completion_length": 195.0, + "delta_ref_entropy_loss": 0.10986328125, + "delta_ref_ppl": -0.111328125, + "entropy_loss": -0.06884765625, + "epoch": 0.1632, + "grad_norm": 2.5267378457332565, + "k1_kl": 0.111328125, + "k3_kl": 0.06298828125, + "kimi_kl": 0.1318359375, + "learning_rate": 4.184e-07, + "loss": 0.0025, + "ppl": 0.028564453125, + "reward": 0.3659087121486664, + "reward_std": 0.009185044094920158, + "rewards/perpo_ocr_edit_distance_reward": 0.3659087121486664, "step": 816, "temperature": 0.9 }, { - "advantages": -1.2061425877618603e-05, - "completion_length": 285.0, - "delta_ref_entropy_loss": 0.0904541015625, - "delta_ref_ppl": -0.0963134765625, - "entropy_loss": -0.045654296875, - "epoch": 0.3268, - "grad_norm": 0.43050088952123067, - "k1_kl": 0.0963134765625, - "k3_kl": 0.0572509765625, - "kimi_kl": 0.133544921875, - "learning_rate": 3.366e-07, - "loss": 0.0023, - "ppl": 0.01690673828125, - "reward": 0.9953419268131256, - "reward_std": 0.0004793921543750912, - "rewards/perpo_ocr_edit_distance_reward": 0.9953419268131256, + "advantages": -7.373946573352441e-05, + "completion_length": 631.0, + "delta_ref_entropy_loss": 0.07861328125, + "delta_ref_ppl": -0.080078125, + "entropy_loss": -0.047119140625, + "epoch": 0.1634, + "grad_norm": 0.5632720080154042, + "k1_kl": 0.080078125, + "k3_kl": 0.050048828125, + "kimi_kl": 0.12060546875, + "learning_rate": 4.183e-07, + "loss": 0.0021, + "ppl": 0.0203857421875, + "reward": 0.7716841101646423, + "reward_std": 0.0011705723591148853, + "rewards/perpo_ocr_edit_distance_reward": 0.7716842889785767, "step": 817, "temperature": 0.9 }, { - "advantages": -3.901975765074894e-05, - "completion_length": 475.5, - "delta_ref_entropy_loss": 0.0660400390625, - "delta_ref_ppl": -0.0645751953125, - "entropy_loss": -0.06201171875, - "epoch": 0.3272, - "grad_norm": 1.437511406264891, - "k1_kl": 0.0645751953125, - "k3_kl": 0.04150390625, - "kimi_kl": 0.140869140625, - "learning_rate": 3.3639999999999997e-07, - "loss": 0.0017, - "ppl": 0.029052734375, - "reward": 0.9347876906394958, - "reward_std": 0.006053523102309555, - "rewards/perpo_ocr_edit_distance_reward": 0.934787780046463, + "advantages": 4.564013124763733e-06, + "completion_length": 1025.0, + "delta_ref_entropy_loss": 0.09326171875, + "delta_ref_ppl": -0.06298828125, + "entropy_loss": -0.09326171875, + "epoch": 0.1636, + "grad_norm": 1.2856109573432328, + "k1_kl": 0.06298828125, + "k3_kl": 0.03564453125, + "kimi_kl": 0.0625, + "learning_rate": 4.1819999999999997e-07, + "loss": 0.0014, + "ppl": 0.052978515625, + "reward": 0.7820333242416382, + "reward_std": 0.001768397050909698, + "rewards/perpo_ocr_edit_distance_reward": 0.7820333242416382, "step": 818, "temperature": 0.9 }, { - "advantages": -6.412182665371802e-05, - "completion_length": 804.0, - "delta_ref_entropy_loss": 0.04937744140625, - "delta_ref_ppl": -0.035247802734375, - "entropy_loss": -0.05218505859375, - "epoch": 0.3276, - "grad_norm": 1.1832398710638619, - "k1_kl": 0.035125732421875, - "k3_kl": 0.025299072265625, - "kimi_kl": 0.057586669921875, - "learning_rate": 3.362e-07, - "loss": 0.0011, - "ppl": 0.027801513671875, - "reward": 0.9864364862442017, - "reward_std": 0.0015237906482070684, - "rewards/perpo_ocr_edit_distance_reward": 0.9864366054534912, + "advantages": -2.4199487597797997e-05, + "completion_length": 428.0, + "delta_ref_entropy_loss": 0.09765625, + "delta_ref_ppl": -0.07080078125, + "entropy_loss": -0.052734375, + "epoch": 0.1638, + "grad_norm": 0.8234535331394088, + "k1_kl": 0.07080078125, + "k3_kl": 0.034423828125, + "kimi_kl": 0.080078125, + "learning_rate": 4.181e-07, + "loss": 0.0014, + "ppl": 0.0247802734375, + "reward": 0.9867379665374756, + "reward_std": 0.002363434061408043, + "rewards/perpo_ocr_edit_distance_reward": 0.9867380857467651, "step": 819, "temperature": 0.9 }, { - "advantages": -6.275943559330699e-05, - "completion_length": 612.0, - "delta_ref_entropy_loss": 0.039794921875, - "delta_ref_ppl": -0.060791015625, - "entropy_loss": -0.04693603515625, - "epoch": 0.328, - "grad_norm": 1.2810697608658907, - "k1_kl": 0.0606689453125, - "k3_kl": 0.0438232421875, - "kimi_kl": 0.156005859375, - "learning_rate": 3.36e-07, - "loss": 0.0018, - "ppl": 0.02527618408203125, - "reward": 0.8453610241413116, - "reward_std": 0.016156324432813562, - "rewards/perpo_ocr_edit_distance_reward": 0.8453611135482788, + "advantages": -3.9287977415369824e-05, + "completion_length": 150.0, + "delta_ref_entropy_loss": 0.05810546875, + "delta_ref_ppl": -0.2041015625, + "entropy_loss": -0.0556640625, + "epoch": 0.164, + "grad_norm": 1.1503752140461423, + "k1_kl": 0.2041015625, + "k3_kl": 0.1552734375, + "kimi_kl": 0.48828125, + "learning_rate": 4.1799999999999996e-07, + "loss": 0.0062, + "ppl": 0.018798828125, + "reward": 0.9475492835044861, + "reward_std": 0.0014163604937493801, + "rewards/perpo_ocr_edit_distance_reward": 0.9475493431091309, "step": 820, "temperature": 0.9 }, { - "advantages": -4.029273986816406e-05, - "completion_length": 427.5, - "delta_ref_entropy_loss": 0.0255126953125, - "delta_ref_ppl": -0.013885498046875, - "entropy_loss": -0.010009765625, - "epoch": 0.3284, - "grad_norm": 0.24224434517591698, - "k1_kl": 0.01385498046875, - "k3_kl": 0.0067291259765625, - "kimi_kl": 0.012786865234375, - "learning_rate": 3.358e-07, - "loss": 0.0003, - "ppl": 0.00347137451171875, - "reward": 0.9997818470001221, - "reward_std": 0.00037256733048707247, - "rewards/perpo_ocr_edit_distance_reward": 0.9997818768024445, + "advantages": -0.00013207537995185703, + "completion_length": 437.0, + "delta_ref_entropy_loss": 0.058349609375, + "delta_ref_ppl": -0.06201171875, + "entropy_loss": -0.02197265625, + "epoch": 0.1642, + "grad_norm": 0.7687332193019425, + "k1_kl": 0.062255859375, + "k3_kl": 0.0361328125, + "kimi_kl": 0.08935546875, + "learning_rate": 4.1789999999999995e-07, + "loss": 0.0016, + "ppl": 0.00927734375, + "reward": 0.9939595460891724, + "reward_std": 0.00041583346319384873, + "rewards/perpo_ocr_edit_distance_reward": 0.9939596056938171, "step": 821, "temperature": 0.9 }, { - "advantages": 3.7197558413026854e-05, - "completion_length": 433.0, - "delta_ref_entropy_loss": 0.02679443359375, - "delta_ref_ppl": -0.01617431640625, - "entropy_loss": -0.010955810546875, - "epoch": 0.3288, - "grad_norm": 0.12146326279285566, - "k1_kl": 0.01611328125, - "k3_kl": 0.00777435302734375, - "kimi_kl": 0.0150146484375, - "learning_rate": 3.356e-07, - "loss": 0.0003, - "ppl": 0.00310516357421875, - "reward": 0.8791090250015259, - "reward_std": 6.447638588724658e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.8791090250015259, + "advantages": -2.322026739420835e-05, + "completion_length": 444.0, + "delta_ref_entropy_loss": 0.042724609375, + "delta_ref_ppl": -0.0517578125, + "entropy_loss": -0.01544189453125, + "epoch": 0.1644, + "grad_norm": 0.9598610585187091, + "k1_kl": 0.0517578125, + "k3_kl": 0.0308837890625, + "kimi_kl": 0.0908203125, + "learning_rate": 4.178e-07, + "loss": 0.0013, + "ppl": 0.005584716796875, + "reward": 0.9756758213043213, + "reward_std": 0.0010003630304709077, + "rewards/perpo_ocr_edit_distance_reward": 0.9756758809089661, "step": 822, "temperature": 0.9 }, { - "advantages": -3.583090801839717e-05, - "completion_length": 664.0, - "delta_ref_entropy_loss": 0.05975341796875, - "delta_ref_ppl": -0.0333251953125, - "entropy_loss": -0.03118896484375, - "epoch": 0.3292, - "grad_norm": 0.44074484917535856, - "k1_kl": 0.033294677734375, - "k3_kl": 0.0126953125, - "kimi_kl": 0.01904296875, - "learning_rate": 3.3539999999999995e-07, - "loss": 0.0005, - "ppl": 0.01409912109375, - "reward": 0.9990267753601074, - "reward_std": 0.0004253692168276757, - "rewards/perpo_ocr_edit_distance_reward": 0.9990268051624298, + "advantages": -1.0507447768759448e-05, + "completion_length": 591.0, + "delta_ref_entropy_loss": 0.044677734375, + "delta_ref_ppl": -0.047119140625, + "entropy_loss": -0.0177001953125, + "epoch": 0.1646, + "grad_norm": 0.6893093757777536, + "k1_kl": 0.046875, + "k3_kl": 0.0262451171875, + "kimi_kl": 0.08056640625, + "learning_rate": 4.177e-07, + "loss": 0.0011, + "ppl": 0.006072998046875, + "reward": 0.9662048816680908, + "reward_std": 0.004758388735353947, + "rewards/perpo_ocr_edit_distance_reward": 0.9662048816680908, "step": 823, "temperature": 0.9 }, { - "advantages": -1.0371208453818781e-05, - "completion_length": 917.0, - "delta_ref_entropy_loss": 0.0322265625, - "delta_ref_ppl": -0.02520751953125, - "entropy_loss": -0.03369140625, - "epoch": 0.3296, - "grad_norm": 0.6869731813017673, - "k1_kl": 0.02520751953125, - "k3_kl": 0.014617919921875, - "kimi_kl": 0.037841796875, - "learning_rate": 3.352e-07, - "loss": 0.0006, - "ppl": 0.01727294921875, - "reward": 0.9664739370346069, - "reward_std": 0.027563054114580154, - "rewards/perpo_ocr_edit_distance_reward": 0.9664739668369293, + "advantages": -8.581366273574531e-05, + "completion_length": 193.0, + "delta_ref_entropy_loss": 0.0751953125, + "delta_ref_ppl": -0.12255859375, + "entropy_loss": -0.0390625, + "epoch": 0.1648, + "grad_norm": 0.9666038649760752, + "k1_kl": 0.12255859375, + "k3_kl": 0.0869140625, + "kimi_kl": 0.3125, + "learning_rate": 4.1760000000000003e-07, + "loss": 0.0036, + "ppl": 0.015869140625, + "reward": 0.9882408976554871, + "reward_std": 0.0005945739685557783, + "rewards/perpo_ocr_edit_distance_reward": 0.9882409572601318, "step": 824, "temperature": 0.9 }, { - "advantages": -1.3445105781162425e-05, - "completion_length": 632.5, - "delta_ref_entropy_loss": 0.034423828125, - "delta_ref_ppl": -0.03118896484375, - "entropy_loss": -0.060821533203125, - "epoch": 0.33, - "grad_norm": 1.3259631236098388, - "k1_kl": 0.0311279296875, - "k3_kl": 0.0240478515625, - "kimi_kl": 0.0987548828125, - "learning_rate": 3.35e-07, - "loss": 0.001, - "ppl": 0.035430908203125, - "reward": 0.8038070499897003, - "reward_std": 0.000773285559262149, - "rewards/perpo_ocr_edit_distance_reward": 0.8038070797920227, + "advantages": -0.00013758456043433398, + "completion_length": 724.0, + "delta_ref_entropy_loss": 0.040771484375, + "delta_ref_ppl": -0.04541015625, + "entropy_loss": -0.0120849609375, + "epoch": 0.165, + "grad_norm": 0.6522130925444405, + "k1_kl": 0.04541015625, + "k3_kl": 0.0245361328125, + "kimi_kl": 0.0712890625, + "learning_rate": 4.1749999999999997e-07, + "loss": 0.0011, + "ppl": 0.004669189453125, + "reward": 0.9991379380226135, + "reward_std": 0.00033323236857540905, + "rewards/perpo_ocr_edit_distance_reward": 0.9991380572319031, "step": 825, "temperature": 0.9 }, { - "advantages": -7.982765168890182e-06, - "completion_length": 600.0, - "delta_ref_entropy_loss": 0.0421142578125, - "delta_ref_ppl": -0.05419921875, - "entropy_loss": -0.025970458984375, - "epoch": 0.3304, - "grad_norm": 0.645381832785729, - "k1_kl": 0.0540771484375, - "k3_kl": 0.03570556640625, - "kimi_kl": 0.089111328125, - "learning_rate": 3.3479999999999995e-07, - "loss": 0.0014, - "ppl": 0.014129638671875, - "reward": 0.9928012490272522, - "reward_std": 0.004077033081557602, - "rewards/perpo_ocr_edit_distance_reward": 0.9928012788295746, + "advantages": -1.1324882507324219e-06, + "completion_length": 1286.0, + "delta_ref_entropy_loss": 0.1328125, + "delta_ref_ppl": -0.1015625, + "entropy_loss": -0.2392578125, + "epoch": 0.1652, + "grad_norm": 10.214824702965323, + "k1_kl": 0.1015625, + "k3_kl": 0.076171875, + "kimi_kl": 0.1328125, + "learning_rate": 4.1739999999999997e-07, + "loss": 0.0031, + "ppl": 0.1416015625, + "reward": 0.8685354590415955, + "reward_std": 0.03779080882668495, + "rewards/perpo_ocr_edit_distance_reward": 0.868535578250885, "step": 826, "temperature": 0.9 }, { - "advantages": -0.0002137550328029647, - "completion_length": 798.0, - "delta_ref_entropy_loss": 0.075927734375, - "delta_ref_ppl": -0.08135986328125, - "entropy_loss": -0.09521484375, - "epoch": 0.3308, - "grad_norm": 23.358997363240764, - "k1_kl": 0.08135986328125, - "k3_kl": 0.07208251953125, - "kimi_kl": 0.1544189453125, - "learning_rate": 3.346e-07, - "loss": 0.0031, - "ppl": 0.05621337890625, - "reward": 0.9720622897148132, - "reward_std": 0.019199739683244843, - "rewards/perpo_ocr_edit_distance_reward": 0.972062349319458, + "advantages": -1.3504710295819677e-05, + "completion_length": 577.0, + "delta_ref_entropy_loss": 0.0908203125, + "delta_ref_ppl": -0.08056640625, + "entropy_loss": -0.0556640625, + "epoch": 0.1654, + "grad_norm": 0.9434797674267472, + "k1_kl": 0.08056640625, + "k3_kl": 0.046630859375, + "kimi_kl": 0.1416015625, + "learning_rate": 4.173e-07, + "loss": 0.0019, + "ppl": 0.0311279296875, + "reward": 0.9762660264968872, + "reward_std": 0.0017924333224073052, + "rewards/perpo_ocr_edit_distance_reward": 0.976266086101532, "step": 827, "temperature": 0.9 }, { - "advantages": -1.1410032811909332e-06, - "completion_length": 469.0, - "delta_ref_entropy_loss": 0.08465576171875, - "delta_ref_ppl": -0.06866455078125, - "entropy_loss": -0.1634521484375, - "epoch": 0.3312, - "grad_norm": 1.7133958700889083, - "k1_kl": 0.06866455078125, - "k3_kl": 0.044921875, - "kimi_kl": 0.1484375, - "learning_rate": 3.344e-07, - "loss": 0.0018, - "ppl": 0.090576171875, - "reward": 0.7542570531368256, - "reward_std": 0.03959278576076031, - "rewards/perpo_ocr_edit_distance_reward": 0.7542570531368256, + "advantages": -1.7029899268550253e-08, + "completion_length": 629.0, + "delta_ref_entropy_loss": 0.09716796875, + "delta_ref_ppl": -0.06884765625, + "entropy_loss": -0.072265625, + "epoch": 0.1656, + "grad_norm": 0.9901812943731562, + "k1_kl": 0.06884765625, + "k3_kl": 0.033447265625, + "kimi_kl": 0.0751953125, + "learning_rate": 4.172e-07, + "loss": 0.0013, + "ppl": 0.03466796875, + "reward": 0.9796727299690247, + "reward_std": 0.0008474335190840065, + "rewards/perpo_ocr_edit_distance_reward": 0.9796727299690247, "step": 828, "temperature": 0.9 }, { - "advantages": -2.3488488295697607e-05, - "completion_length": 600.5, - "delta_ref_entropy_loss": 0.07080078125, - "delta_ref_ppl": -0.0418701171875, - "entropy_loss": -0.085174560546875, - "epoch": 0.3316, - "grad_norm": 2.714240113900934, - "k1_kl": 0.0419921875, - "k3_kl": 0.02783203125, - "kimi_kl": 0.10888671875, - "learning_rate": 3.3419999999999996e-07, - "loss": 0.0011, - "ppl": 0.0445709228515625, - "reward": 0.921534925699234, - "reward_std": 0.01491697080200538, - "rewards/perpo_ocr_edit_distance_reward": 0.9215349555015564, + "advantages": -8.504305878886953e-05, + "completion_length": 972.0, + "delta_ref_entropy_loss": 0.042724609375, + "delta_ref_ppl": -0.03759765625, + "entropy_loss": -0.01324462890625, + "epoch": 0.1658, + "grad_norm": 1.7305728425574862, + "k1_kl": 0.03759765625, + "k3_kl": 0.019287109375, + "kimi_kl": 0.0498046875, + "learning_rate": 4.171e-07, + "loss": 0.0009, + "ppl": 0.0047607421875, + "reward": 0.9965366721153259, + "reward_std": 0.0006007830379530787, + "rewards/perpo_ocr_edit_distance_reward": 0.9965368509292603, "step": 829, "temperature": 0.9 }, { - "advantages": -1.824753690016223e-05, - "completion_length": 510.0, - "delta_ref_entropy_loss": 0.0452880859375, - "delta_ref_ppl": -0.0400390625, - "entropy_loss": -0.033447265625, - "epoch": 0.332, - "grad_norm": 0.4220901665593556, - "k1_kl": 0.0400390625, - "k3_kl": 0.02587890625, - "kimi_kl": 0.0821533203125, - "learning_rate": 3.34e-07, + "advantages": -5.589213105849922e-05, + "completion_length": 815.0, + "delta_ref_entropy_loss": 0.08154296875, + "delta_ref_ppl": -0.054443359375, + "entropy_loss": -0.06640625, + "epoch": 0.166, + "grad_norm": 0.7011271744015168, + "k1_kl": 0.0546875, + "k3_kl": 0.0269775390625, + "kimi_kl": 0.060546875, + "learning_rate": 4.17e-07, "loss": 0.0011, - "ppl": 0.01727294921875, - "reward": 0.990522176027298, - "reward_std": 0.0008368608541786671, - "rewards/perpo_ocr_edit_distance_reward": 0.990522176027298, + "ppl": 0.031982421875, + "reward": 0.9565283060073853, + "reward_std": 0.0011188009520992637, + "rewards/perpo_ocr_edit_distance_reward": 0.9565284252166748, "step": 830, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 205.5, - "delta_ref_entropy_loss": 0.0418701171875, - "delta_ref_ppl": -0.0804443359375, - "entropy_loss": -0.013458251953125, - "epoch": 0.3324, - "grad_norm": 0.01416584648015841, - "k1_kl": 0.080322265625, - "k3_kl": 0.06097412109375, - "kimi_kl": 0.2900390625, - "learning_rate": 3.338e-07, - "loss": 0.0027, - "ppl": 0.002681732177734375, - "reward": 0.914332777261734, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9143328070640564, + "advantages": -9.901183511829004e-05, + "completion_length": 554.0, + "delta_ref_entropy_loss": 0.043212890625, + "delta_ref_ppl": -0.072265625, + "entropy_loss": -0.033203125, + "epoch": 0.1662, + "grad_norm": 1.2441871649848033, + "k1_kl": 0.072265625, + "k3_kl": 0.04443359375, + "kimi_kl": 0.1259765625, + "learning_rate": 4.169e-07, + "loss": 0.0019, + "ppl": 0.0157470703125, + "reward": 0.9977156519889832, + "reward_std": 0.0005878978408873081, + "rewards/perpo_ocr_edit_distance_reward": 0.9977157711982727, "step": 831, "temperature": 0.9 }, { - "advantages": -2.5808812551986193e-05, - "completion_length": 144.5, - "delta_ref_entropy_loss": 0.02520751953125, - "delta_ref_ppl": -0.111572265625, - "entropy_loss": -0.036376953125, - "epoch": 0.3328, - "grad_norm": 1.8038062483008082, - "k1_kl": 0.111083984375, - "k3_kl": 0.0914306640625, - "kimi_kl": 0.524658203125, - "learning_rate": 3.3359999999999997e-07, - "loss": 0.0037, - "ppl": 0.02154541015625, - "reward": 0.99177086353302, - "reward_std": 0.007617065974045545, - "rewards/perpo_ocr_edit_distance_reward": 0.9917709231376648, + "advantages": -7.152557941481064e-07, + "completion_length": 480.0, + "delta_ref_entropy_loss": 0.1044921875, + "delta_ref_ppl": -0.08837890625, + "entropy_loss": -0.042236328125, + "epoch": 0.1664, + "grad_norm": 2.6636497761620728, + "k1_kl": 0.087890625, + "k3_kl": 0.0517578125, + "kimi_kl": 0.107421875, + "learning_rate": 4.1679999999999997e-07, + "loss": 0.0021, + "ppl": 0.0211181640625, + "reward": 0.5699917674064636, + "reward_std": 0.06761110574007034, + "rewards/perpo_ocr_edit_distance_reward": 0.5699917674064636, "step": 832, "temperature": 0.9 }, { - "advantages": -5.53471727471333e-06, - "completion_length": 175.0, - "delta_ref_entropy_loss": 0.064453125, - "delta_ref_ppl": -0.106201171875, - "entropy_loss": -0.0393829345703125, - "epoch": 0.3332, - "grad_norm": 2.0778715097960223, - "k1_kl": 0.1060791015625, - "k3_kl": 0.0767822265625, - "kimi_kl": 0.270263671875, - "learning_rate": 3.3339999999999995e-07, - "loss": 0.0031, - "ppl": 0.021501541137695312, - "reward": 0.9974337220191956, - "reward_std": 0.001876596244983375, - "rewards/perpo_ocr_edit_distance_reward": 0.9974337220191956, + "advantages": -1.4994826415204443e-05, + "completion_length": 534.0, + "delta_ref_entropy_loss": 0.05908203125, + "delta_ref_ppl": -0.0517578125, + "entropy_loss": -0.0247802734375, + "epoch": 0.1666, + "grad_norm": 1.2250520919611532, + "k1_kl": 0.0517578125, + "k3_kl": 0.0274658203125, + "kimi_kl": 0.06787109375, + "learning_rate": 4.167e-07, + "loss": 0.0011, + "ppl": 0.0098876953125, + "reward": 0.9945213198661804, + "reward_std": 0.00046783790457993746, + "rewards/perpo_ocr_edit_distance_reward": 0.9945213198661804, "step": 833, "temperature": 0.9 }, { - "advantages": 1.2261527899681823e-06, - "completion_length": 449.0, - "delta_ref_entropy_loss": 0.06591796875, - "delta_ref_ppl": -0.04833984375, - "entropy_loss": -0.048553466796875, - "epoch": 0.3336, - "grad_norm": 0.6437902812534674, - "k1_kl": 0.04833984375, - "k3_kl": 0.0294189453125, - "kimi_kl": 0.0810546875, - "learning_rate": 3.332e-07, - "loss": 0.0012, - "ppl": 0.0237274169921875, - "reward": 0.9844893515110016, - "reward_std": 0.0016779254656285048, - "rewards/perpo_ocr_edit_distance_reward": 0.984489381313324, + "advantages": -2.660070276760962e-05, + "completion_length": 202.0, + "delta_ref_entropy_loss": 0.1416015625, + "delta_ref_ppl": -0.1728515625, + "entropy_loss": -0.08056640625, + "epoch": 0.1668, + "grad_norm": 2.315322027487522, + "k1_kl": 0.1728515625, + "k3_kl": 0.10205078125, + "kimi_kl": 0.265625, + "learning_rate": 4.166e-07, + "loss": 0.0041, + "ppl": 0.041259765625, + "reward": 0.9758750796318054, + "reward_std": 0.0011799721978604794, + "rewards/perpo_ocr_edit_distance_reward": 0.9758751392364502, "step": 834, "temperature": 0.9 }, { - "advantages": -7.846525477361865e-06, - "completion_length": 316.5, - "delta_ref_entropy_loss": 0.06005859375, - "delta_ref_ppl": -0.0455322265625, - "entropy_loss": -0.046142578125, - "epoch": 0.334, - "grad_norm": 1.3927704843656874, - "k1_kl": 0.0455322265625, - "k3_kl": 0.02471923828125, - "kimi_kl": 0.048828125, - "learning_rate": 3.33e-07, - "loss": 0.001, - "ppl": 0.02349853515625, - "reward": 0.9991874694824219, - "reward_std": 0.0011120679118903354, - "rewards/perpo_ocr_edit_distance_reward": 0.9991874992847443, + "advantages": -0.00013899803161621094, + "completion_length": 800.0, + "delta_ref_entropy_loss": 0.07177734375, + "delta_ref_ppl": -0.051025390625, + "entropy_loss": -0.04931640625, + "epoch": 0.167, + "grad_norm": 2.438101932674661, + "k1_kl": 0.05126953125, + "k3_kl": 0.02587890625, + "kimi_kl": 0.0673828125, + "learning_rate": 4.1649999999999995e-07, + "loss": 0.0012, + "ppl": 0.02587890625, + "reward": 0.9903298020362854, + "reward_std": 0.00032892549643293023, + "rewards/perpo_ocr_edit_distance_reward": 0.9903298616409302, "step": 835, "temperature": 0.9 }, { - "advantages": -7.501670296505836e-05, - "completion_length": 465.5, - "delta_ref_entropy_loss": 0.03466796875, - "delta_ref_ppl": -0.05963134765625, - "entropy_loss": -0.0450439453125, - "epoch": 0.3344, - "grad_norm": 0.8634705096268397, - "k1_kl": 0.05963134765625, - "k3_kl": 0.04547119140625, - "kimi_kl": 0.1749267578125, - "learning_rate": 3.3279999999999996e-07, - "loss": 0.0019, - "ppl": 0.024261474609375, - "reward": 0.5980640724301338, - "reward_std": 0.016599789261817932, - "rewards/perpo_ocr_edit_distance_reward": 0.5980641171336174, + "advantages": -1.890318890218623e-05, + "completion_length": 487.0, + "delta_ref_entropy_loss": 0.08544921875, + "delta_ref_ppl": -0.07666015625, + "entropy_loss": -0.042724609375, + "epoch": 0.1672, + "grad_norm": 1.091330816632884, + "k1_kl": 0.07666015625, + "k3_kl": 0.044189453125, + "kimi_kl": 0.11328125, + "learning_rate": 4.164e-07, + "loss": 0.0018, + "ppl": 0.021484375, + "reward": 0.9725647568702698, + "reward_std": 0.0016998442588374019, + "rewards/perpo_ocr_edit_distance_reward": 0.9725648164749146, "step": 836, "temperature": 0.9 }, { - "advantages": -1.319817215517105e-06, - "completion_length": 371.0, - "delta_ref_entropy_loss": 0.05950927734375, - "delta_ref_ppl": -0.02349853515625, - "entropy_loss": -0.07196044921875, - "epoch": 0.3348, - "grad_norm": 0.9045212365362078, - "k1_kl": 0.02362060546875, - "k3_kl": 0.0096588134765625, - "kimi_kl": 0.015380859375, - "learning_rate": 3.326e-07, - "loss": 0.0004, - "ppl": 0.03784942626953125, - "reward": 0.9363577365875244, - "reward_std": 0.011209916323423386, - "rewards/perpo_ocr_edit_distance_reward": 0.9363577663898468, + "advantages": -6.6331458583590575e-06, + "completion_length": 131.0, + "delta_ref_entropy_loss": 0.09375, + "delta_ref_ppl": -0.1318359375, + "entropy_loss": -0.050048828125, + "epoch": 0.1674, + "grad_norm": 4.20066328439212, + "k1_kl": 0.1318359375, + "k3_kl": 0.08935546875, + "kimi_kl": 0.2373046875, + "learning_rate": 4.163e-07, + "loss": 0.0036, + "ppl": 0.025634765625, + "reward": 0.9734565615653992, + "reward_std": 0.003756707301363349, + "rewards/perpo_ocr_edit_distance_reward": 0.973456621170044, "step": 837, "temperature": 0.9 }, { - "advantages": -3.0836890289265284e-05, - "completion_length": 651.0, - "delta_ref_entropy_loss": 0.0728759765625, - "delta_ref_ppl": -0.0584716796875, - "entropy_loss": -0.169189453125, - "epoch": 0.3352, - "grad_norm": 1.0212883451844241, - "k1_kl": 0.05859375, - "k3_kl": 0.03521728515625, - "kimi_kl": 0.096923828125, - "learning_rate": 3.3239999999999993e-07, - "loss": 0.0014, - "ppl": 0.102294921875, - "reward": 0.8761681616306305, - "reward_std": 0.11907135813089553, - "rewards/perpo_ocr_edit_distance_reward": 0.87616828083992, + "advantages": -3.065381974920456e-07, + "completion_length": 1158.0, + "delta_ref_entropy_loss": 0.04345703125, + "delta_ref_ppl": -0.034423828125, + "entropy_loss": -0.02490234375, + "epoch": 0.1676, + "grad_norm": 0.7295106197091332, + "k1_kl": 0.034423828125, + "k3_kl": 0.0184326171875, + "kimi_kl": 0.038818359375, + "learning_rate": 4.162e-07, + "loss": 0.0007, + "ppl": 0.0108642578125, + "reward": 0.9676852822303772, + "reward_std": 0.05796800181269646, + "rewards/perpo_ocr_edit_distance_reward": 0.967685341835022, "step": 838, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 676.5, - "delta_ref_entropy_loss": 0.05328369140625, - "delta_ref_ppl": -0.03765869140625, - "entropy_loss": -0.0430755615234375, - "epoch": 0.3356, - "grad_norm": 0.6567225693149331, - "k1_kl": 0.037811279296875, - "k3_kl": 0.022705078125, - "kimi_kl": 0.07061767578125, - "learning_rate": 3.3219999999999997e-07, - "loss": 0.0009, - "ppl": 0.025960922241210938, - "reward": 0.9841703176498413, - "reward_std": 0.0006748611340299249, - "rewards/perpo_ocr_edit_distance_reward": 0.9841703176498413, + "advantages": -6.270408630371094e-05, + "completion_length": 620.0, + "delta_ref_entropy_loss": 0.06591796875, + "delta_ref_ppl": -0.059326171875, + "entropy_loss": -0.03173828125, + "epoch": 0.1678, + "grad_norm": 0.7011624816781421, + "k1_kl": 0.05908203125, + "k3_kl": 0.033203125, + "kimi_kl": 0.08447265625, + "learning_rate": 4.1610000000000003e-07, + "loss": 0.0014, + "ppl": 0.01470947265625, + "reward": 0.9861039519309998, + "reward_std": 0.0005788315320387483, + "rewards/perpo_ocr_edit_distance_reward": 0.9861040711402893, "step": 839, "temperature": 0.9 }, { - "advantages": -4.07227471441729e-05, - "completion_length": 760.5, - "delta_ref_entropy_loss": 0.0589599609375, - "delta_ref_ppl": -0.0447998046875, - "entropy_loss": -0.0533447265625, - "epoch": 0.336, - "grad_norm": 0.851303014147578, - "k1_kl": 0.0447998046875, - "k3_kl": 0.02874755859375, - "kimi_kl": 0.060302734375, - "learning_rate": 3.32e-07, - "loss": 0.0012, - "ppl": 0.026763916015625, - "reward": 0.9948038458824158, - "reward_std": 0.0016357994172722101, - "rewards/perpo_ocr_edit_distance_reward": 0.9948039650917053, + "advantages": -2.6191984943579882e-05, + "completion_length": 829.0, + "delta_ref_entropy_loss": 0.0771484375, + "delta_ref_ppl": -0.049072265625, + "entropy_loss": -0.06884765625, + "epoch": 0.168, + "grad_norm": 0.7753610126325113, + "k1_kl": 0.049072265625, + "k3_kl": 0.0238037109375, + "kimi_kl": 0.055908203125, + "learning_rate": 4.1599999999999997e-07, + "loss": 0.001, + "ppl": 0.031005859375, + "reward": 0.9715173244476318, + "reward_std": 0.001200436963699758, + "rewards/perpo_ocr_edit_distance_reward": 0.9715174436569214, "step": 840, "temperature": 0.9 }, { - "advantages": 1.819644785427954e-05, - "completion_length": 817.0, - "delta_ref_entropy_loss": 0.02239990234375, - "delta_ref_ppl": -0.016265869140625, - "entropy_loss": -0.01422119140625, - "epoch": 0.3364, - "grad_norm": 0.15334579163531648, - "k1_kl": 0.016204833984375, - "k3_kl": 0.00909423828125, - "kimi_kl": 0.026397705078125, - "learning_rate": 3.318e-07, - "loss": 0.0003, - "ppl": 0.0058746337890625, - "reward": 0.9992474913597107, - "reward_std": 0.0003010229265782982, - "rewards/perpo_ocr_edit_distance_reward": 0.9992474317550659, + "advantages": 0.0, + "completion_length": 440.0, + "delta_ref_entropy_loss": 0.04296875, + "delta_ref_ppl": -0.056640625, + "entropy_loss": -0.01153564453125, + "epoch": 0.1682, + "grad_norm": 3.103443850869175, + "k1_kl": 0.056640625, + "k3_kl": 0.033935546875, + "kimi_kl": 0.095703125, + "learning_rate": 4.1589999999999996e-07, + "loss": 0.0014, + "ppl": 0.0033721923828125, + "reward": 0.9952357411384583, + "reward_std": 0.0015963662881404161, + "rewards/perpo_ocr_edit_distance_reward": 0.9952357411384583, "step": 841, "temperature": 0.9 }, { - "advantages": -4.040343628730625e-05, - "completion_length": 767.5, - "delta_ref_entropy_loss": 0.0533447265625, - "delta_ref_ppl": -0.0472412109375, - "entropy_loss": -0.03179931640625, - "epoch": 0.3368, - "grad_norm": 0.21801379117884792, - "k1_kl": 0.0474853515625, - "k3_kl": 0.0263671875, - "kimi_kl": 0.0562744140625, - "learning_rate": 3.316e-07, - "loss": 0.0011, - "ppl": 0.015350341796875, - "reward": 0.9976922869682312, - "reward_std": 0.0003715589118655771, - "rewards/perpo_ocr_edit_distance_reward": 0.997692346572876, + "advantages": -0.0005960464477539062, + "completion_length": 492.0, + "delta_ref_entropy_loss": 0.0634765625, + "delta_ref_ppl": -0.056884765625, + "entropy_loss": -0.009765625, + "epoch": 0.1684, + "grad_norm": 0.004309621717615587, + "k1_kl": 0.056884765625, + "k3_kl": 0.031494140625, + "kimi_kl": 0.10498046875, + "learning_rate": 4.158e-07, + "loss": 0.0019, + "ppl": 0.00177001953125, + "reward": 0.9940059185028076, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9940060377120972, "step": 842, "temperature": 0.9 }, { - "advantages": 1.3926200153946411e-05, - "completion_length": 276.0, - "delta_ref_entropy_loss": 0.0550537109375, - "delta_ref_ppl": -0.0462646484375, - "entropy_loss": -0.02374267578125, - "epoch": 0.3372, - "grad_norm": 0.4632465772444311, - "k1_kl": 0.0462646484375, - "k3_kl": 0.02825927734375, - "kimi_kl": 0.07568359375, - "learning_rate": 3.3139999999999996e-07, - "loss": 0.0011, - "ppl": 0.01190185546875, - "reward": 0.9992109835147858, - "reward_std": 0.0002556208928581327, - "rewards/perpo_ocr_edit_distance_reward": 0.9992109835147858, + "advantages": -6.164823389553931e-06, + "completion_length": 25.0, + "delta_ref_entropy_loss": 0.1630859375, + "delta_ref_ppl": -0.52734375, + "entropy_loss": -0.193359375, + "epoch": 0.1686, + "grad_norm": 19.217038234126193, + "k1_kl": 0.52734375, + "k3_kl": 0.451171875, + "kimi_kl": 1.828125, + "learning_rate": 4.157e-07, + "loss": 0.018, + "ppl": 0.1396484375, + "reward": 0.8812198638916016, + "reward_std": 0.010965186171233654, + "rewards/perpo_ocr_edit_distance_reward": 0.8812199831008911, "step": 843, "temperature": 0.9 }, { - "advantages": -6.807702391142811e-06, - "completion_length": 510.5, - "delta_ref_entropy_loss": 0.0380859375, - "delta_ref_ppl": -0.041015625, - "entropy_loss": -0.033660888671875, - "epoch": 0.3376, - "grad_norm": 0.6178010592035681, - "k1_kl": 0.041015625, - "k3_kl": 0.02471923828125, - "kimi_kl": 0.0675048828125, - "learning_rate": 3.312e-07, - "loss": 0.001, - "ppl": 0.017791748046875, - "reward": 0.6307866871356964, - "reward_std": 0.01345315961225424, - "rewards/perpo_ocr_edit_distance_reward": 0.6307866871356964, + "advantages": -2.779279566311743e-05, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.042236328125, + "delta_ref_ppl": -0.0274658203125, + "entropy_loss": -0.08251953125, + "epoch": 0.1688, + "grad_norm": 20384.451819087215, + "k1_kl": 0.0274658203125, + "k3_kl": 26.0, + "kimi_kl": 0.083984375, + "learning_rate": 4.156e-07, + "loss": 1.0382, + "ppl": 0.06005859375, + "reward": 0.8794304728507996, + "reward_std": 0.001735835918225348, + "rewards/perpo_ocr_edit_distance_reward": 0.8794305324554443, "step": 844, "temperature": 0.9 }, { - "advantages": -6.939683771634009e-06, - "completion_length": 1071.5, - "delta_ref_entropy_loss": 0.0396728515625, - "delta_ref_ppl": -0.02099609375, - "entropy_loss": -0.060394287109375, - "epoch": 0.338, - "grad_norm": 4.085526224397175, - "k1_kl": 0.02099609375, - "k3_kl": 0.073822021484375, - "kimi_kl": 0.02471923828125, - "learning_rate": 3.31e-07, - "loss": 0.0029, - "ppl": 0.0441741943359375, - "reward": 0.986573189496994, - "reward_std": 0.0011753315338864923, - "rewards/perpo_ocr_edit_distance_reward": 0.9865732192993164, + "advantages": -2.043587983280304e-06, + "completion_length": 629.0, + "delta_ref_entropy_loss": 0.1484375, + "delta_ref_ppl": -0.11669921875, + "entropy_loss": -0.18359375, + "epoch": 0.169, + "grad_norm": 2.125435791651497, + "k1_kl": 0.11669921875, + "k3_kl": 0.0615234375, + "kimi_kl": 0.1533203125, + "learning_rate": 4.155e-07, + "loss": 0.0025, + "ppl": 0.09521484375, + "reward": 0.8456838726997375, + "reward_std": 0.020796652883291245, + "rewards/perpo_ocr_edit_distance_reward": 0.8456839323043823, "step": 845, "temperature": 0.9 }, { - "advantages": -0.00011666757950479223, - "completion_length": 925.5, - "delta_ref_entropy_loss": 0.0357666015625, - "delta_ref_ppl": -0.0213623046875, - "entropy_loss": -0.0572509765625, - "epoch": 0.3384, - "grad_norm": 0.624360792681555, - "k1_kl": 0.021484375, - "k3_kl": 0.0113525390625, - "kimi_kl": 0.018707275390625, - "learning_rate": 3.3079999999999997e-07, - "loss": 0.0006, - "ppl": 0.029541015625, - "reward": 0.9770602881908417, - "reward_std": 0.0036713564913952723, - "rewards/perpo_ocr_edit_distance_reward": 0.9770603775978088, + "advantages": -3.1726704037282616e-05, + "completion_length": 460.0, + "delta_ref_entropy_loss": 0.1455078125, + "delta_ref_ppl": -0.10205078125, + "entropy_loss": -0.1328125, + "epoch": 0.1692, + "grad_norm": 7.110659943767766, + "k1_kl": 0.10205078125, + "k3_kl": 0.054931640625, + "kimi_kl": 0.107421875, + "learning_rate": 4.154e-07, + "loss": 0.0022, + "ppl": 0.0771484375, + "reward": 0.8312981128692627, + "reward_std": 0.0025838101282715797, + "rewards/perpo_ocr_edit_distance_reward": 0.8312981724739075, "step": 846, "temperature": 0.9 }, { - "advantages": -8.593287020630669e-05, - "completion_length": 663.5, - "delta_ref_entropy_loss": 0.04473876953125, - "delta_ref_ppl": -0.0372314453125, - "entropy_loss": -0.0382080078125, - "epoch": 0.3388, - "grad_norm": 0.7546863872647157, - "k1_kl": 0.0372314453125, - "k3_kl": 0.029296875, - "kimi_kl": 0.0784912109375, - "learning_rate": 3.306e-07, - "loss": 0.0013, - "ppl": 0.02166748046875, - "reward": 0.9884186685085297, - "reward_std": 0.000749227314372547, - "rewards/perpo_ocr_edit_distance_reward": 0.9884187579154968, + "advantages": -0.00014343432849273086, + "completion_length": 856.0, + "delta_ref_entropy_loss": 0.07763671875, + "delta_ref_ppl": -0.07421875, + "entropy_loss": -0.046875, + "epoch": 0.1694, + "grad_norm": 0.5926064906270161, + "k1_kl": 0.07373046875, + "k3_kl": 0.039794921875, + "kimi_kl": 0.1015625, + "learning_rate": 4.1529999999999997e-07, + "loss": 0.0017, + "ppl": 0.0218505859375, + "reward": 0.9078760743141174, + "reward_std": 0.0004341786552686244, + "rewards/perpo_ocr_edit_distance_reward": 0.9078761339187622, "step": 847, "temperature": 0.9 }, { - "advantages": -6.152050929131292e-06, - "completion_length": 464.0, - "delta_ref_entropy_loss": 0.03350830078125, - "delta_ref_ppl": -0.022216796875, - "entropy_loss": -0.0155029296875, - "epoch": 0.3392, - "grad_norm": 0.5987393832960983, - "k1_kl": 0.022216796875, - "k3_kl": 0.012054443359375, - "kimi_kl": 0.03558349609375, - "learning_rate": 3.304e-07, - "loss": 0.0005, - "ppl": 0.0066375732421875, - "reward": 0.9949902594089508, - "reward_std": 0.0062015746952965856, - "rewards/perpo_ocr_edit_distance_reward": 0.9949903190135956, + "advantages": -2.9802324661432067e-06, + "completion_length": 1064.0, + "delta_ref_entropy_loss": 0.07177734375, + "delta_ref_ppl": -0.058349609375, + "entropy_loss": -0.02783203125, + "epoch": 0.1696, + "grad_norm": 1.0423518329488348, + "k1_kl": 0.05859375, + "k3_kl": 0.02734375, + "kimi_kl": 0.05615234375, + "learning_rate": 4.152e-07, + "loss": 0.0011, + "ppl": 0.0107421875, + "reward": 0.9876111745834351, + "reward_std": 0.005607233848422766, + "rewards/perpo_ocr_edit_distance_reward": 0.9876111745834351, "step": 848, "temperature": 0.9 }, { - "advantages": 4.225543889901928e-06, - "completion_length": 506.5, - "delta_ref_entropy_loss": 0.05499267578125, - "delta_ref_ppl": -0.063720703125, - "entropy_loss": -0.1103515625, - "epoch": 0.3396, - "grad_norm": 1.0420406170175907, - "k1_kl": 0.0638427734375, + "advantages": -3.8402424252126366e-05, + "completion_length": 376.0, + "delta_ref_entropy_loss": 0.048583984375, + "delta_ref_ppl": -0.0693359375, + "entropy_loss": -0.017822265625, + "epoch": 0.1698, + "grad_norm": 0.8151984897728558, + "k1_kl": 0.06982421875, "k3_kl": 0.044677734375, - "kimi_kl": 0.199951171875, - "learning_rate": 3.302e-07, + "kimi_kl": 0.12255859375, + "learning_rate": 4.151e-07, "loss": 0.0018, - "ppl": 0.0537109375, - "reward": 0.8668201267719269, - "reward_std": 0.11348837072728202, - "rewards/perpo_ocr_edit_distance_reward": 0.8668201267719269, + "ppl": 0.0059814453125, + "reward": 0.9768319129943848, + "reward_std": 0.0005647233338095248, + "rewards/perpo_ocr_edit_distance_reward": 0.9768319725990295, "step": 849, "temperature": 0.9 }, { - "advantages": -9.020525011393943e-07, - "completion_length": 615.5, - "delta_ref_entropy_loss": 0.084716796875, - "delta_ref_ppl": -0.0556640625, - "entropy_loss": -0.0950927734375, - "epoch": 0.34, - "grad_norm": 1.6554451776714363, - "k1_kl": 0.0556640625, - "k3_kl": 0.02862548828125, - "kimi_kl": 0.066650390625, - "learning_rate": 3.3e-07, - "loss": 0.0011, - "ppl": 0.055999755859375, - "reward": 0.9545819461345673, - "reward_std": 0.0107968709198758, - "rewards/perpo_ocr_edit_distance_reward": 0.954582005739212, + "advantages": -1.6765936379670165e-05, + "completion_length": 601.0, + "delta_ref_entropy_loss": 0.109375, + "delta_ref_ppl": -0.1083984375, + "entropy_loss": -0.0966796875, + "epoch": 0.17, + "grad_norm": 1.764354765236412, + "k1_kl": 0.1083984375, + "k3_kl": 0.06640625, + "kimi_kl": 0.173828125, + "learning_rate": 4.1499999999999994e-07, + "loss": 0.0027, + "ppl": 0.05126953125, + "reward": 0.8398189544677734, + "reward_std": 0.004989509470760822, + "rewards/perpo_ocr_edit_distance_reward": 0.8398191332817078, "step": 850, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 63.0, - "delta_ref_entropy_loss": 0.05010986328125, - "delta_ref_ppl": -0.043212890625, - "entropy_loss": -0.01739501953125, - "epoch": 0.3404, - "grad_norm": 0.028042241391237264, - "k1_kl": 0.043212890625, - "k3_kl": 0.023773193359375, - "kimi_kl": 0.054840087890625, - "learning_rate": 3.2979999999999995e-07, - "loss": 0.001, - "ppl": 0.00258636474609375, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": -1.1784690286731347e-05, + "completion_length": 380.0, + "delta_ref_entropy_loss": 0.06396484375, + "delta_ref_ppl": -0.07177734375, + "entropy_loss": -0.043212890625, + "epoch": 0.1702, + "grad_norm": 0.9986604773933725, + "k1_kl": 0.072265625, + "k3_kl": 0.041015625, + "kimi_kl": 0.09326171875, + "learning_rate": 4.149e-07, + "loss": 0.0017, + "ppl": 0.0194091796875, + "reward": 0.9875690340995789, + "reward_std": 0.002071817172691226, + "rewards/perpo_ocr_edit_distance_reward": 0.9875690937042236, "step": 851, "temperature": 0.9 }, { - "advantages": 1.3687781574844848e-05, - "completion_length": 857.5, - "delta_ref_entropy_loss": 0.029998779296875, - "delta_ref_ppl": -0.018463134765625, - "entropy_loss": -0.00927734375, - "epoch": 0.3408, - "grad_norm": 0.1009513606018183, - "k1_kl": 0.0185089111328125, - "k3_kl": 0.01081085205078125, - "kimi_kl": 0.040740966796875, - "learning_rate": 3.296e-07, - "loss": 0.0004, - "ppl": 0.002681732177734375, - "reward": 0.9999153017997742, - "reward_std": 0.00010563300020294264, - "rewards/perpo_ocr_edit_distance_reward": 0.9999153017997742, + "advantages": -8.471523324260488e-05, + "completion_length": 711.0, + "delta_ref_entropy_loss": 0.035888671875, + "delta_ref_ppl": -0.034423828125, + "entropy_loss": -0.0216064453125, + "epoch": 0.1704, + "grad_norm": 0.44715482429195175, + "k1_kl": 0.03466796875, + "k3_kl": 0.0208740234375, + "kimi_kl": 0.0634765625, + "learning_rate": 4.148e-07, + "loss": 0.0009, + "ppl": 0.0111083984375, + "reward": 0.9801359176635742, + "reward_std": 0.0006035997648723423, + "rewards/perpo_ocr_edit_distance_reward": 0.980135977268219, "step": 852, "temperature": 0.9 }, { - "advantages": -3.982441921834834e-05, - "completion_length": 186.0, - "delta_ref_entropy_loss": 0.076904296875, - "delta_ref_ppl": -0.0426025390625, - "entropy_loss": -0.021881103515625, - "epoch": 0.3412, - "grad_norm": 0.5386037812070843, - "k1_kl": 0.042724609375, - "k3_kl": 0.021728515625, - "kimi_kl": 0.0528564453125, - "learning_rate": 3.294e-07, - "loss": 0.0009, - "ppl": 0.007640838623046875, - "reward": 0.9998774826526642, - "reward_std": 0.00032414571614935994, - "rewards/perpo_ocr_edit_distance_reward": 0.9998775124549866, + "advantages": -4.3051586544606835e-05, + "completion_length": 821.0, + "delta_ref_entropy_loss": 0.1337890625, + "delta_ref_ppl": -0.08984375, + "entropy_loss": -0.109375, + "epoch": 0.1706, + "grad_norm": 1.913504884157255, + "k1_kl": 0.08984375, + "k3_kl": 0.051025390625, + "kimi_kl": 0.099609375, + "learning_rate": 4.147e-07, + "loss": 0.0021, + "ppl": 0.058837890625, + "reward": 0.8871725797653198, + "reward_std": 0.002471604384481907, + "rewards/perpo_ocr_edit_distance_reward": 0.8871727585792542, "step": 853, "temperature": 0.9 }, { - "advantages": -1.2772424042850616e-06, - "completion_length": 357.5, - "delta_ref_entropy_loss": 0.050537109375, - "delta_ref_ppl": -0.0343017578125, - "entropy_loss": -0.02874755859375, - "epoch": 0.3416, - "grad_norm": 0.9623844008207497, - "k1_kl": 0.03424072265625, - "k3_kl": 0.017425537109375, - "kimi_kl": 0.033172607421875, - "learning_rate": 3.2919999999999996e-07, - "loss": 0.0007, - "ppl": 0.0146942138671875, - "reward": 0.992975503206253, - "reward_std": 0.010006869211792946, - "rewards/perpo_ocr_edit_distance_reward": 0.9929755330085754, + "advantages": 0.0, + "completion_length": 526.0, + "delta_ref_entropy_loss": 0.0927734375, + "delta_ref_ppl": -0.07861328125, + "entropy_loss": -0.05126953125, + "epoch": 0.1708, + "grad_norm": 0.8365502957373978, + "k1_kl": 0.07861328125, + "k3_kl": 0.04345703125, + "kimi_kl": 0.09619140625, + "learning_rate": 4.146e-07, + "loss": 0.0017, + "ppl": 0.02685546875, + "reward": 0.8432705998420715, + "reward_std": 0.0006162696518003941, + "rewards/perpo_ocr_edit_distance_reward": 0.8432705998420715, "step": 854, "temperature": 0.9 }, { - "advantages": -4.0096896555041894e-05, - "completion_length": 415.5, - "delta_ref_entropy_loss": 0.10009765625, - "delta_ref_ppl": -0.082763671875, - "entropy_loss": -0.03070068359375, - "epoch": 0.342, - "grad_norm": 0.21856704876215194, - "k1_kl": 0.082763671875, - "k3_kl": 0.05078125, - "kimi_kl": 0.1334228515625, - "learning_rate": 3.29e-07, - "loss": 0.0021, - "ppl": 0.0082550048828125, - "reward": 0.9038470685482025, - "reward_std": 0.00026857107877731323, - "rewards/perpo_ocr_edit_distance_reward": 0.9038471281528473, + "advantages": -4.319633808336221e-05, + "completion_length": 465.0, + "delta_ref_entropy_loss": 0.0634765625, + "delta_ref_ppl": -0.055908203125, + "entropy_loss": -0.01287841796875, + "epoch": 0.171, + "grad_norm": 0.7735885689459586, + "k1_kl": 0.055908203125, + "k3_kl": 0.03076171875, + "kimi_kl": 0.08056640625, + "learning_rate": 4.1449999999999996e-07, + "loss": 0.0013, + "ppl": 0.0033721923828125, + "reward": 0.9941734671592712, + "reward_std": 0.0004914047312922776, + "rewards/perpo_ocr_edit_distance_reward": 0.994173526763916, "step": 855, "temperature": 0.9 }, { - "advantages": -4.0735518268775195e-05, - "completion_length": 384.5, - "delta_ref_entropy_loss": 0.0264892578125, - "delta_ref_ppl": -0.02191162109375, - "entropy_loss": -0.015838623046875, - "epoch": 0.3424, - "grad_norm": 0.3912193833451745, - "k1_kl": 0.02191162109375, - "k3_kl": 0.01385498046875, - "kimi_kl": 0.03961181640625, - "learning_rate": 3.288e-07, - "loss": 0.0006, - "ppl": 0.008941650390625, - "reward": 0.9991195797920227, - "reward_std": 0.00026357738533988595, - "rewards/perpo_ocr_edit_distance_reward": 0.9991196095943451, + "advantages": -2.276045961480122e-05, + "completion_length": 522.0, + "delta_ref_entropy_loss": 0.11962890625, + "delta_ref_ppl": -0.0810546875, + "entropy_loss": -0.051025390625, + "epoch": 0.1712, + "grad_norm": 0.9466752157876629, + "k1_kl": 0.0810546875, + "k3_kl": 0.0419921875, + "kimi_kl": 0.0810546875, + "learning_rate": 4.1439999999999995e-07, + "loss": 0.0017, + "ppl": 0.0235595703125, + "reward": 0.9364997744560242, + "reward_std": 0.0013980374205857515, + "rewards/perpo_ocr_edit_distance_reward": 0.9364997744560242, "step": 856, "temperature": 0.9 }, { - "advantages": -0.00030142068885652407, - "completion_length": 239.0, - "delta_ref_entropy_loss": 0.0592041015625, - "delta_ref_ppl": -0.060546875, - "entropy_loss": -0.0444183349609375, - "epoch": 0.3428, - "grad_norm": 1.2793008561938366, - "k1_kl": 0.060546875, - "k3_kl": 0.03765869140625, - "kimi_kl": 0.11376953125, - "learning_rate": 3.2859999999999996e-07, - "loss": 0.0018, - "ppl": 0.021457672119140625, - "reward": 0.9930997788906097, - "reward_std": 0.004979232791811228, - "rewards/perpo_ocr_edit_distance_reward": 0.9930998682975769, + "advantages": -5.10896995820076e-08, + "completion_length": 100.0, + "delta_ref_entropy_loss": 0.248046875, + "delta_ref_ppl": -0.369140625, + "entropy_loss": -0.30859375, + "epoch": 0.1714, + "grad_norm": 4.6565988047158955, + "k1_kl": 0.369140625, + "k3_kl": 0.2578125, + "kimi_kl": 1.109375, + "learning_rate": 4.143e-07, + "loss": 0.0103, + "ppl": 0.1455078125, + "reward": 0.37395915389060974, + "reward_std": 0.10811702162027359, + "rewards/perpo_ocr_edit_distance_reward": 0.37395915389060974, "step": 857, "temperature": 0.9 }, { - "advantages": -2.924063301179558e-05, - "completion_length": 424.5, - "delta_ref_entropy_loss": 0.03656005859375, - "delta_ref_ppl": -0.025726318359375, - "entropy_loss": -0.021392822265625, - "epoch": 0.3432, - "grad_norm": 0.47454987983375435, - "k1_kl": 0.025726318359375, - "k3_kl": 0.013946533203125, - "kimi_kl": 0.03961181640625, - "learning_rate": 3.284e-07, - "loss": 0.0006, - "ppl": 0.011077880859375, - "reward": 0.9995824098587036, - "reward_std": 0.0002411646710243076, - "rewards/perpo_ocr_edit_distance_reward": 0.9995824098587036, + "advantages": -0.0005960464477539062, + "completion_length": 224.0, + "delta_ref_entropy_loss": 0.060546875, + "delta_ref_ppl": -0.12109375, + "entropy_loss": -0.0162353515625, + "epoch": 0.1716, + "grad_norm": 0.01057555696553318, + "k1_kl": 0.12109375, + "k3_kl": 0.0791015625, + "kimi_kl": 0.2236328125, + "learning_rate": 4.142e-07, + "loss": 0.0038, + "ppl": 0.002716064453125, + "reward": 0.9892578125, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9892578721046448, "step": 858, "temperature": 0.9 }, { - "advantages": -2.1287374352141342e-07, - "completion_length": 295.5, - "delta_ref_entropy_loss": 0.04058837890625, - "delta_ref_ppl": -0.071044921875, - "entropy_loss": -0.022705078125, - "epoch": 0.3436, - "grad_norm": 0.7241094841818759, - "k1_kl": 0.07098388671875, - "k3_kl": 0.053924560546875, - "kimi_kl": 0.2564697265625, - "learning_rate": 3.282e-07, - "loss": 0.0022, - "ppl": 0.00724029541015625, - "reward": 0.9024685025215149, - "reward_std": 0.03087868168950081, - "rewards/perpo_ocr_edit_distance_reward": 0.9024685025215149, + "advantages": -0.0005960464477539062, + "completion_length": 558.0, + "delta_ref_entropy_loss": 0.0576171875, + "delta_ref_ppl": -0.051513671875, + "entropy_loss": -0.01177978515625, + "epoch": 0.1718, + "grad_norm": 0.007165112732898288, + "k1_kl": 0.051513671875, + "k3_kl": 0.0262451171875, + "kimi_kl": 0.06396484375, + "learning_rate": 4.141e-07, + "loss": 0.0016, + "ppl": 0.0022735595703125, + "reward": 0.99837726354599, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9983773231506348, "step": 859, "temperature": 0.9 }, { - "advantages": -2.7247838829680404e-07, - "completion_length": 403.0, - "delta_ref_entropy_loss": 0.068603515625, - "delta_ref_ppl": -0.063720703125, - "entropy_loss": -0.049560546875, - "epoch": 0.344, - "grad_norm": 0.7540453486932207, - "k1_kl": 0.0634765625, - "k3_kl": 0.03961181640625, - "kimi_kl": 0.104736328125, - "learning_rate": 3.28e-07, - "loss": 0.0016, - "ppl": 0.019256591796875, - "reward": 0.7571711838245392, - "reward_std": 0.03762511536478996, - "rewards/perpo_ocr_edit_distance_reward": 0.7571712136268616, + "advantages": 3.7465779314516112e-06, + "completion_length": 571.0, + "delta_ref_entropy_loss": 0.11328125, + "delta_ref_ppl": -0.1015625, + "entropy_loss": -0.08154296875, + "epoch": 0.172, + "grad_norm": 1.0907719793297335, + "k1_kl": 0.1015625, + "k3_kl": 0.05810546875, + "kimi_kl": 0.1259765625, + "learning_rate": 4.14e-07, + "loss": 0.0023, + "ppl": 0.043701171875, + "reward": 0.9375112056732178, + "reward_std": 0.002184599172323942, + "rewards/perpo_ocr_edit_distance_reward": 0.9375112056732178, "step": 860, "temperature": 0.9 }, { - "advantages": -0.00010214533904218115, - "completion_length": 920.0, - "delta_ref_entropy_loss": 0.041015625, - "delta_ref_ppl": -0.0281982421875, - "entropy_loss": -0.037109375, - "epoch": 0.3444, - "grad_norm": 0.567699799904958, - "k1_kl": 0.028076171875, - "k3_kl": 0.016448974609375, - "kimi_kl": 0.0411376953125, - "learning_rate": 3.2779999999999996e-07, - "loss": 0.0008, - "ppl": 0.0178375244140625, - "reward": 0.9948900640010834, - "reward_std": 0.0005255699506960809, - "rewards/perpo_ocr_edit_distance_reward": 0.9948901534080505, + "advantages": -5.606242848443799e-05, + "completion_length": 344.0, + "delta_ref_entropy_loss": 0.068359375, + "delta_ref_ppl": -0.09619140625, + "entropy_loss": -0.039794921875, + "epoch": 0.1722, + "grad_norm": 1.0548284027562393, + "k1_kl": 0.09619140625, + "k3_kl": 0.061279296875, + "kimi_kl": 0.2041015625, + "learning_rate": 4.1389999999999997e-07, + "loss": 0.0025, + "ppl": 0.01953125, + "reward": 0.9887691140174866, + "reward_std": 0.001114975311793387, + "rewards/perpo_ocr_edit_distance_reward": 0.9887691736221313, "step": 861, "temperature": 0.9 }, { - "advantages": -6.400687743735034e-05, - "completion_length": 856.0, - "delta_ref_entropy_loss": 0.02008056640625, - "delta_ref_ppl": -0.017669677734375, - "entropy_loss": -0.0323486328125, - "epoch": 0.3448, - "grad_norm": 0.2775000405018541, - "k1_kl": 0.017669677734375, - "k3_kl": 0.0110321044921875, - "kimi_kl": 0.0296630859375, - "learning_rate": 3.276e-07, - "loss": 0.0005, - "ppl": 0.015777587890625, - "reward": 0.9962174296379089, - "reward_std": 0.0006944460910744965, - "rewards/perpo_ocr_edit_distance_reward": 0.9962175190448761, + "advantages": -4.6674696932313964e-05, + "completion_length": 839.0, + "delta_ref_entropy_loss": 0.0220947265625, + "delta_ref_ppl": -0.0279541015625, + "entropy_loss": -0.0201416015625, + "epoch": 0.1724, + "grad_norm": 0.3863202067903682, + "k1_kl": 0.028076171875, + "k3_kl": 0.0185546875, + "kimi_kl": 0.06201171875, + "learning_rate": 4.1379999999999996e-07, + "loss": 0.0008, + "ppl": 0.00885009765625, + "reward": 0.9844095706939697, + "reward_std": 0.0002647044893819839, + "rewards/perpo_ocr_edit_distance_reward": 0.984409511089325, "step": 862, "temperature": 0.9 }, { - "advantages": -0.00015119570161914453, - "completion_length": 364.0, - "delta_ref_entropy_loss": 0.0439453125, - "delta_ref_ppl": -0.0406494140625, - "entropy_loss": -0.03424072265625, - "epoch": 0.3452, - "grad_norm": 0.7934352566077503, - "k1_kl": 0.040771484375, - "k3_kl": 0.0244140625, - "kimi_kl": 0.0584716796875, - "learning_rate": 3.2740000000000003e-07, - "loss": 0.0011, - "ppl": 0.01739501953125, - "reward": 0.9976000785827637, - "reward_std": 0.0004176519432803616, - "rewards/perpo_ocr_edit_distance_reward": 0.9976001381874084, + "advantages": 2.7392592528485693e-05, + "completion_length": 437.0, + "delta_ref_entropy_loss": 0.08447265625, + "delta_ref_ppl": -0.0732421875, + "entropy_loss": -0.031494140625, + "epoch": 0.1726, + "grad_norm": 0.6831254043485748, + "k1_kl": 0.07373046875, + "k3_kl": 0.04296875, + "kimi_kl": 0.10791015625, + "learning_rate": 4.137e-07, + "loss": 0.0017, + "ppl": 0.01092529296875, + "reward": 0.9834768772125244, + "reward_std": 0.0005221008905209601, + "rewards/perpo_ocr_edit_distance_reward": 0.9834769368171692, "step": 863, "temperature": 0.9 }, { - "advantages": -0.00030633384085376747, - "completion_length": 629.5, - "delta_ref_entropy_loss": 0.048583984375, - "delta_ref_ppl": -0.03375244140625, - "entropy_loss": -0.024810791015625, - "epoch": 0.3456, - "grad_norm": 0.4658388799395541, - "k1_kl": 0.03375244140625, - "k3_kl": 0.01690673828125, - "kimi_kl": 0.042724609375, - "learning_rate": 3.2719999999999997e-07, - "loss": 0.001, - "ppl": 0.0122222900390625, - "reward": 0.990217536687851, - "reward_std": 0.00032636670948704705, - "rewards/perpo_ocr_edit_distance_reward": 0.9902176260948181, + "advantages": 0.0, + "completion_length": 594.0, + "delta_ref_entropy_loss": 0.05712890625, + "delta_ref_ppl": -0.06640625, + "entropy_loss": -0.025634765625, + "epoch": 0.1728, + "grad_norm": 2.1720238524411872, + "k1_kl": 0.06591796875, + "k3_kl": 0.037841796875, + "kimi_kl": 0.12255859375, + "learning_rate": 4.136e-07, + "loss": 0.0015, + "ppl": 0.01055908203125, + "reward": 0.9894831776618958, + "reward_std": 0.0011901074321940541, + "rewards/perpo_ocr_edit_distance_reward": 0.9894832372665405, "step": 864, "temperature": 0.9 }, { - "advantages": -0.0001917502813739702, - "completion_length": 1162.5, - "delta_ref_entropy_loss": 0.021942138671875, - "delta_ref_ppl": -0.023040771484375, - "entropy_loss": -0.0196533203125, - "epoch": 0.346, - "grad_norm": 0.3040540988744775, - "k1_kl": 0.023040771484375, - "k3_kl": 0.015350341796875, - "kimi_kl": 0.065521240234375, - "learning_rate": 3.27e-07, - "loss": 0.0008, - "ppl": 0.0082855224609375, - "reward": 0.9520220756530762, - "reward_std": 0.00041411987331230193, - "rewards/perpo_ocr_edit_distance_reward": 0.9520221948623657, + "advantages": -1.7174654203699902e-05, + "completion_length": 549.0, + "delta_ref_entropy_loss": 0.07568359375, + "delta_ref_ppl": -0.057861328125, + "entropy_loss": -0.0419921875, + "epoch": 0.173, + "grad_norm": 0.9939933125931623, + "k1_kl": 0.05810546875, + "k3_kl": 0.0284423828125, + "kimi_kl": 0.060546875, + "learning_rate": 4.1349999999999994e-07, + "loss": 0.0012, + "ppl": 0.02197265625, + "reward": 0.9638286828994751, + "reward_std": 0.0008920531836338341, + "rewards/perpo_ocr_edit_distance_reward": 0.9638286828994751, "step": 865, "temperature": 0.9 }, { - "advantages": -5.174748002900742e-05, - "completion_length": 892.5, - "delta_ref_entropy_loss": 0.0247802734375, - "delta_ref_ppl": -0.01922607421875, - "entropy_loss": -0.01617431640625, - "epoch": 0.3464, - "grad_norm": 0.25204511063368296, - "k1_kl": 0.01922607421875, - "k3_kl": 0.0113677978515625, - "kimi_kl": 0.03021240234375, - "learning_rate": 3.268e-07, - "loss": 0.0005, - "ppl": 0.006744384765625, - "reward": 0.9983397424221039, - "reward_std": 0.00020769655384356156, - "rewards/perpo_ocr_edit_distance_reward": 0.9983397424221039, + "advantages": -4.9837999540613964e-05, + "completion_length": 762.0, + "delta_ref_entropy_loss": 0.0556640625, + "delta_ref_ppl": -0.06103515625, + "entropy_loss": -0.0361328125, + "epoch": 0.1732, + "grad_norm": 0.7416610174121324, + "k1_kl": 0.06103515625, + "k3_kl": 0.03369140625, + "kimi_kl": 0.07421875, + "learning_rate": 4.134e-07, + "loss": 0.0014, + "ppl": 0.0194091796875, + "reward": 0.9763979315757751, + "reward_std": 0.0009250258444808424, + "rewards/perpo_ocr_edit_distance_reward": 0.9763979911804199, "step": 866, "temperature": 0.9 }, { - "advantages": -2.5570392892859672e-05, - "completion_length": 318.0, - "delta_ref_entropy_loss": 0.089599609375, - "delta_ref_ppl": -0.0693359375, - "entropy_loss": -0.052001953125, - "epoch": 0.3468, - "grad_norm": 1.1792252050791023, - "k1_kl": 0.069580078125, - "k3_kl": 0.04150390625, - "kimi_kl": 0.127685546875, - "learning_rate": 3.2659999999999997e-07, - "loss": 0.0017, - "ppl": 0.02459716796875, - "reward": 0.8061354756355286, - "reward_std": 0.017941783706191927, - "rewards/perpo_ocr_edit_distance_reward": 0.8061355650424957, + "advantages": -6.505421424662927e-06, + "completion_length": 748.0, + "delta_ref_entropy_loss": 0.06005859375, + "delta_ref_ppl": -0.04931640625, + "entropy_loss": -0.0203857421875, + "epoch": 0.1734, + "grad_norm": 1.030210693199552, + "k1_kl": 0.04931640625, + "k3_kl": 0.0252685546875, + "kimi_kl": 0.0478515625, + "learning_rate": 4.133e-07, + "loss": 0.001, + "ppl": 0.0089111328125, + "reward": 0.9911960959434509, + "reward_std": 0.005139498971402645, + "rewards/perpo_ocr_edit_distance_reward": 0.9911962151527405, "step": 867, "temperature": 0.9 }, { - "advantages": -0.0003305588470539078, - "completion_length": 403.0, - "delta_ref_entropy_loss": 0.0416259765625, - "delta_ref_ppl": -0.03558349609375, - "entropy_loss": -0.01776123046875, - "epoch": 0.3472, - "grad_norm": 0.6193471572709072, - "k1_kl": 0.03558349609375, - "k3_kl": 0.021728515625, - "kimi_kl": 0.06689453125, - "learning_rate": 3.264e-07, - "loss": 0.0012, - "ppl": 0.0055694580078125, - "reward": 0.9947327673435211, - "reward_std": 0.0006044126930646598, - "rewards/perpo_ocr_edit_distance_reward": 0.9947328567504883, + "advantages": -1.711504955892451e-05, + "completion_length": 516.0, + "delta_ref_entropy_loss": 0.095703125, + "delta_ref_ppl": -0.078125, + "entropy_loss": -0.0615234375, + "epoch": 0.1736, + "grad_norm": 1.195623779748496, + "k1_kl": 0.07861328125, + "k3_kl": 0.04296875, + "kimi_kl": 0.10693359375, + "learning_rate": 4.1319999999999997e-07, + "loss": 0.0017, + "ppl": 0.034912109375, + "reward": 0.9783583283424377, + "reward_std": 0.002882914850488305, + "rewards/perpo_ocr_edit_distance_reward": 0.9783584475517273, "step": 868, "temperature": 0.9 }, { - "advantages": -0.00018678393939808302, - "completion_length": 717.5, - "delta_ref_entropy_loss": 0.03558349609375, - "delta_ref_ppl": -0.05419921875, - "entropy_loss": -0.0272216796875, - "epoch": 0.3476, - "grad_norm": 0.43494610066230277, - "k1_kl": 0.05419921875, - "k3_kl": 0.035186767578125, - "kimi_kl": 0.1005859375, - "learning_rate": 3.262e-07, - "loss": 0.0016, - "ppl": 0.0119476318359375, - "reward": 0.9782581925392151, - "reward_std": 0.009367394573928323, - "rewards/perpo_ocr_edit_distance_reward": 0.9782583117485046, + "advantages": -2.615792618598789e-05, + "completion_length": 1788.0, + "delta_ref_entropy_loss": 0.10302734375, + "delta_ref_ppl": -0.056640625, + "entropy_loss": -0.1396484375, + "epoch": 0.1738, + "grad_norm": 2.65919143702467, + "k1_kl": 0.056396484375, + "k3_kl": 0.037841796875, + "kimi_kl": 0.05126953125, + "learning_rate": 4.131e-07, + "loss": 0.0015, + "ppl": 0.080078125, + "reward": 0.9426112174987793, + "reward_std": 0.003152716439217329, + "rewards/perpo_ocr_edit_distance_reward": 0.9426113367080688, "step": 869, "temperature": 0.9 }, { - "advantages": -2.7341503880506934e-05, - "completion_length": 782.0, - "delta_ref_entropy_loss": 0.0323486328125, - "delta_ref_ppl": -0.020660400390625, - "entropy_loss": -0.0286865234375, - "epoch": 0.348, - "grad_norm": 0.6390597300298461, - "k1_kl": 0.020660400390625, - "k3_kl": 0.010009765625, - "kimi_kl": 0.01934814453125, - "learning_rate": 3.26e-07, - "loss": 0.0004, - "ppl": 0.01556396484375, - "reward": 0.9785632193088531, - "reward_std": 0.023476049187593162, - "rewards/perpo_ocr_edit_distance_reward": 0.9785632789134979, + "advantages": -2.8031214242218994e-05, + "completion_length": 599.0, + "delta_ref_entropy_loss": 0.052978515625, + "delta_ref_ppl": -0.042236328125, + "entropy_loss": -0.0107421875, + "epoch": 0.174, + "grad_norm": 0.42098659879961814, + "k1_kl": 0.042236328125, + "k3_kl": 0.0201416015625, + "kimi_kl": 0.045654296875, + "learning_rate": 4.1299999999999995e-07, + "loss": 0.0008, + "ppl": 0.0031890869140625, + "reward": 0.9979304075241089, + "reward_std": 0.00020377668261062354, + "rewards/perpo_ocr_edit_distance_reward": 0.9979304075241089, "step": 870, "temperature": 0.9 }, { - "advantages": -4.80136723126634e-05, - "completion_length": 712.0, - "delta_ref_entropy_loss": 0.03057861328125, - "delta_ref_ppl": -0.02435302734375, - "entropy_loss": -0.0238037109375, - "epoch": 0.3484, - "grad_norm": 0.49409107171075517, - "k1_kl": 0.02435302734375, - "k3_kl": 0.016693115234375, - "kimi_kl": 0.040283203125, - "learning_rate": 3.2579999999999997e-07, - "loss": 0.0007, - "ppl": 0.01251220703125, - "reward": 0.9978234767913818, - "reward_std": 0.000641564023680985, - "rewards/perpo_ocr_edit_distance_reward": 0.9978235363960266, + "advantages": -8.566039468860254e-06, + "completion_length": 805.0, + "delta_ref_entropy_loss": 0.091796875, + "delta_ref_ppl": -0.07421875, + "entropy_loss": -0.10498046875, + "epoch": 0.1742, + "grad_norm": 1.4956407430885108, + "k1_kl": 0.07421875, + "k3_kl": 0.040771484375, + "kimi_kl": 0.09033203125, + "learning_rate": 4.1289999999999995e-07, + "loss": 0.0016, + "ppl": 0.0615234375, + "reward": 0.9286600947380066, + "reward_std": 0.007842295803129673, + "rewards/perpo_ocr_edit_distance_reward": 0.9286601543426514, "step": 871, "temperature": 0.9 }, { - "advantages": -0.000121601997648213, - "completion_length": 742.0, - "delta_ref_entropy_loss": 0.0355224609375, - "delta_ref_ppl": -0.019775390625, - "entropy_loss": -0.01800537109375, - "epoch": 0.3488, - "grad_norm": 0.4829254171977127, - "k1_kl": 0.019775390625, - "k3_kl": 0.01053619384765625, - "kimi_kl": 0.0425872802734375, - "learning_rate": 3.256e-07, - "loss": 0.0005, - "ppl": 0.0069580078125, - "reward": 0.8956846296787262, - "reward_std": 0.010173684684559703, - "rewards/perpo_ocr_edit_distance_reward": 0.895684689283371, + "advantages": -9.400504495715722e-05, + "completion_length": 893.0, + "delta_ref_entropy_loss": 0.0277099609375, + "delta_ref_ppl": -0.033203125, + "entropy_loss": -0.0234375, + "epoch": 0.1744, + "grad_norm": 0.31842461704427666, + "k1_kl": 0.033203125, + "k3_kl": 0.020751953125, + "kimi_kl": 0.060791015625, + "learning_rate": 4.128e-07, + "loss": 0.0009, + "ppl": 0.0115966796875, + "reward": 0.9937306642532349, + "reward_std": 0.0005341126234270632, + "rewards/perpo_ocr_edit_distance_reward": 0.9937306642532349, "step": 872, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 203.5, - "delta_ref_entropy_loss": 0.073486328125, - "delta_ref_ppl": -0.16162109375, - "entropy_loss": -0.02374267578125, - "epoch": 0.3492, - "grad_norm": 0.029282269236057198, - "k1_kl": 0.16162109375, - "k3_kl": 0.12939453125, - "kimi_kl": 0.73828125, - "learning_rate": 3.254e-07, - "loss": 0.0055, - "ppl": 0.007415771484375, - "reward": 0.997778981924057, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9977790415287018, + "advantages": -7.152557941481064e-07, + "completion_length": 13.0, + "delta_ref_entropy_loss": 0.19140625, + "delta_ref_ppl": -1.375, + "entropy_loss": -0.296875, + "epoch": 0.1746, + "grad_norm": 14.231733471136362, + "k1_kl": 1.375, + "k3_kl": 1.0234375, + "kimi_kl": 3.234375, + "learning_rate": 4.127e-07, + "loss": 0.041, + "ppl": 0.1201171875, + "reward": 0.08436131477355957, + "reward_std": 0.0013591941678896546, + "rewards/perpo_ocr_edit_distance_reward": 0.08436132222414017, "step": 873, "temperature": 0.9 }, { - "advantages": -0.0001285501966776792, - "completion_length": 961.5, - "delta_ref_entropy_loss": 0.035614013671875, - "delta_ref_ppl": -0.0189361572265625, - "entropy_loss": -0.021240234375, - "epoch": 0.3496, - "grad_norm": 0.35632042382060175, - "k1_kl": 0.0189361572265625, - "k3_kl": 0.01020050048828125, - "kimi_kl": 0.01859283447265625, - "learning_rate": 3.252e-07, - "loss": 0.0005, - "ppl": 0.0095672607421875, - "reward": 0.9970263838768005, - "reward_std": 0.0010593628830974922, - "rewards/perpo_ocr_edit_distance_reward": 0.9970265030860901, - "step": 874, - "temperature": 0.9 - }, - { - "advantages": -2.5783267119550146e-05, - "completion_length": 634.5, - "delta_ref_entropy_loss": 0.040283203125, - "delta_ref_ppl": -0.02764892578125, - "entropy_loss": -0.0214385986328125, - "epoch": 0.35, - "grad_norm": 0.2758262606780317, - "k1_kl": 0.02764892578125, - "k3_kl": 0.013946533203125, - "kimi_kl": 0.0318603515625, - "learning_rate": 3.25e-07, - "loss": 0.0006, - "ppl": 0.009571075439453125, - "reward": 0.9867056012153625, - "reward_std": 0.0004455179441720247, - "rewards/perpo_ocr_edit_distance_reward": 0.9867056608200073, + "advantages": -1.0192395166086499e-05, + "completion_length": 737.0, + "delta_ref_entropy_loss": 0.031494140625, + "delta_ref_ppl": -0.036865234375, + "entropy_loss": -0.018798828125, + "epoch": 0.1748, + "grad_norm": 0.4757545092422261, + "k1_kl": 0.036865234375, + "k3_kl": 0.0218505859375, + "kimi_kl": 0.057861328125, + "learning_rate": 4.1260000000000003e-07, + "loss": 0.0009, + "ppl": 0.00714111328125, + "reward": 0.9850612282752991, + "reward_std": 0.0032421916257590055, + "rewards/perpo_ocr_edit_distance_reward": 0.9850612878799438, + "step": 874, + "temperature": 0.9 + }, + { + "advantages": -0.0002532346115913242, + "completion_length": 567.0, + "delta_ref_entropy_loss": 0.07568359375, + "delta_ref_ppl": -0.07763671875, + "entropy_loss": -0.01806640625, + "epoch": 0.175, + "grad_norm": 0.47315418884407917, + "k1_kl": 0.07763671875, + "k3_kl": 0.043701171875, + "kimi_kl": 0.1396484375, + "learning_rate": 4.1249999999999997e-07, + "loss": 0.002, + "ppl": 0.00823974609375, + "reward": 0.9978880286216736, + "reward_std": 0.00020262974430806935, + "rewards/perpo_ocr_edit_distance_reward": 0.9978880882263184, "step": 875, "temperature": 0.9 }, { - "advantages": -0.0002829262184604886, - "completion_length": 335.0, - "delta_ref_entropy_loss": 0.0574951171875, - "delta_ref_ppl": -0.0361328125, - "entropy_loss": -0.036865234375, - "epoch": 0.3504, - "grad_norm": 0.5911743365239887, - "k1_kl": 0.0360107421875, - "k3_kl": 0.0162353515625, - "kimi_kl": 0.026611328125, - "learning_rate": 3.2479999999999994e-07, - "loss": 0.0009, - "ppl": 0.0177154541015625, - "reward": 0.9876319468021393, - "reward_std": 0.0005138209671713412, - "rewards/perpo_ocr_edit_distance_reward": 0.9876320064067841, + "advantages": -2.188342023146106e-06, + "completion_length": 385.0, + "delta_ref_entropy_loss": 0.15625, + "delta_ref_ppl": -0.091796875, + "entropy_loss": -0.07568359375, + "epoch": 0.1752, + "grad_norm": 2.3453878297548325, + "k1_kl": 0.091796875, + "k3_kl": 0.048828125, + "kimi_kl": 0.0908203125, + "learning_rate": 4.1239999999999996e-07, + "loss": 0.002, + "ppl": 0.0390625, + "reward": 0.4563511610031128, + "reward_std": 0.009631606750190258, + "rewards/perpo_ocr_edit_distance_reward": 0.4563511908054352, "step": 876, "temperature": 0.9 }, { - "advantages": 1.8881901269196533e-06, - "completion_length": 486.5, - "delta_ref_entropy_loss": 0.0380859375, - "delta_ref_ppl": -0.0418701171875, - "entropy_loss": -0.0667724609375, - "epoch": 0.3508, - "grad_norm": 0.7100102562240382, - "k1_kl": 0.041839599609375, - "k3_kl": 0.02728271484375, - "kimi_kl": 0.093017578125, - "learning_rate": 3.246e-07, - "loss": 0.0011, - "ppl": 0.037750244140625, - "reward": 0.9588445723056793, - "reward_std": 0.014015875291079283, - "rewards/perpo_ocr_edit_distance_reward": 0.9588446319103241, + "advantages": 5.211149073147681e-06, + "completion_length": 431.0, + "delta_ref_entropy_loss": 0.034423828125, + "delta_ref_ppl": -0.047607421875, + "entropy_loss": -0.0211181640625, + "epoch": 0.1754, + "grad_norm": 0.5107005514503231, + "k1_kl": 0.047607421875, + "k3_kl": 0.0311279296875, + "kimi_kl": 0.09814453125, + "learning_rate": 4.123e-07, + "loss": 0.0012, + "ppl": 0.00811767578125, + "reward": 0.9956477284431458, + "reward_std": 0.003159934189170599, + "rewards/perpo_ocr_edit_distance_reward": 0.9956477284431458, "step": 877, "temperature": 0.9 }, { - "advantages": -3.229294816264883e-05, - "completion_length": 442.0, - "delta_ref_entropy_loss": 0.02056884765625, - "delta_ref_ppl": -0.01300048828125, - "entropy_loss": -0.01300048828125, - "epoch": 0.3512, - "grad_norm": 0.29686761434558107, - "k1_kl": 0.01300048828125, - "k3_kl": 0.0062408447265625, - "kimi_kl": 0.0111846923828125, - "learning_rate": 3.244e-07, - "loss": 0.0003, - "ppl": 0.006439208984375, - "reward": 0.9989240765571594, - "reward_std": 0.0006091179093346, - "rewards/perpo_ocr_edit_distance_reward": 0.9989241063594818, + "advantages": -1.0456357813382056e-05, + "completion_length": 379.0, + "delta_ref_entropy_loss": 0.203125, + "delta_ref_ppl": -0.1689453125, + "entropy_loss": -0.203125, + "epoch": 0.1756, + "grad_norm": 2.3230764221566242, + "k1_kl": 0.1689453125, + "k3_kl": 0.09521484375, + "kimi_kl": 0.2119140625, + "learning_rate": 4.122e-07, + "loss": 0.0038, + "ppl": 0.10888671875, + "reward": 0.8780409097671509, + "reward_std": 0.0031489278189837933, + "rewards/perpo_ocr_edit_distance_reward": 0.8780409693717957, "step": 878, "temperature": 0.9 }, { - "advantages": -0.00010347366651330958, - "completion_length": 553.0, - "delta_ref_entropy_loss": 0.0780029296875, - "delta_ref_ppl": -0.04876708984375, - "entropy_loss": -0.03826904296875, - "epoch": 0.3516, - "grad_norm": 1.203767303439815, - "k1_kl": 0.04876708984375, - "k3_kl": 0.028167724609375, - "kimi_kl": 0.0999755859375, - "learning_rate": 3.2419999999999995e-07, - "loss": 0.0012, - "ppl": 0.018951416015625, - "reward": 0.8829725086688995, - "reward_std": 0.0013546484005928505, - "rewards/perpo_ocr_edit_distance_reward": 0.8829725682735443, + "advantages": -2.309254341525957e-05, + "completion_length": 658.0, + "delta_ref_entropy_loss": 0.1044921875, + "delta_ref_ppl": -0.07373046875, + "entropy_loss": -0.0654296875, + "epoch": 0.1758, + "grad_norm": 2.75540372826821, + "k1_kl": 0.0732421875, + "k3_kl": 0.038330078125, + "kimi_kl": 0.08203125, + "learning_rate": 4.121e-07, + "loss": 0.0016, + "ppl": 0.0283203125, + "reward": 0.9263353943824768, + "reward_std": 0.002846892224624753, + "rewards/perpo_ocr_edit_distance_reward": 0.9263355135917664, "step": 879, "temperature": 0.9 }, { - "advantages": -5.3469627005142684e-05, - "completion_length": 373.0, - "delta_ref_entropy_loss": 0.0614013671875, - "delta_ref_ppl": -0.0582275390625, - "entropy_loss": -0.02581787109375, - "epoch": 0.352, - "grad_norm": 1.4279764241509871, - "k1_kl": 0.05859375, - "k3_kl": 0.03515625, - "kimi_kl": 0.0986328125, - "learning_rate": 3.24e-07, - "loss": 0.0015, - "ppl": 0.01007080078125, - "reward": 0.9590295851230621, - "reward_std": 0.007142123213270679, - "rewards/perpo_ocr_edit_distance_reward": 0.9590296745300293, + "advantages": 4.9727306759450585e-05, + "completion_length": 525.0, + "delta_ref_entropy_loss": 0.07861328125, + "delta_ref_ppl": -0.0634765625, + "entropy_loss": -0.02490234375, + "epoch": 0.176, + "grad_norm": 0.464820683900682, + "k1_kl": 0.0634765625, + "k3_kl": 0.032958984375, + "kimi_kl": 0.0908203125, + "learning_rate": 4.12e-07, + "loss": 0.0013, + "ppl": 0.00946044921875, + "reward": 0.9943879246711731, + "reward_std": 0.00024265727552119642, + "rewards/perpo_ocr_edit_distance_reward": 0.9943879842758179, "step": 880, "temperature": 0.9 }, { - "advantages": -0.00013498324005922768, - "completion_length": 874.0, - "delta_ref_entropy_loss": 0.061767578125, - "delta_ref_ppl": -0.0426025390625, - "entropy_loss": -0.04998779296875, - "epoch": 0.3524, - "grad_norm": 0.6802287184083022, - "k1_kl": 0.0426025390625, - "k3_kl": 0.023895263671875, - "kimi_kl": 0.073486328125, - "learning_rate": 3.238e-07, - "loss": 0.0011, - "ppl": 0.025115966796875, - "reward": 0.9807490706443787, - "reward_std": 0.0004096119628229644, - "rewards/perpo_ocr_edit_distance_reward": 0.9807491302490234, + "advantages": 0.0, + "completion_length": 305.0, + "delta_ref_entropy_loss": 0.076171875, + "delta_ref_ppl": -0.087890625, + "entropy_loss": -0.03173828125, + "epoch": 0.1762, + "grad_norm": 0.9551090992817364, + "k1_kl": 0.087890625, + "k3_kl": 0.054931640625, + "kimi_kl": 0.16015625, + "learning_rate": 4.119e-07, + "loss": 0.0022, + "ppl": 0.01422119140625, + "reward": 0.9921639561653137, + "reward_std": 0.0011960860574617982, + "rewards/perpo_ocr_edit_distance_reward": 0.9921639561653137, "step": 881, "temperature": 0.9 }, { - "advantages": -1.60506795054971e-06, - "completion_length": 265.0, - "delta_ref_entropy_loss": 0.079833984375, - "delta_ref_ppl": -0.12841796875, - "entropy_loss": -0.05767822265625, - "epoch": 0.3528, - "grad_norm": 0.8218079522514093, - "k1_kl": 0.1279296875, - "k3_kl": 0.094482421875, - "kimi_kl": 0.36669921875, - "learning_rate": 3.2359999999999996e-07, - "loss": 0.0038, - "ppl": 0.0299530029296875, - "reward": 0.959381103515625, - "reward_std": 0.014489906840026379, - "rewards/perpo_ocr_edit_distance_reward": 0.9593811333179474, + "advantages": -1.6178404393940582e-07, + "completion_length": 1167.0, + "delta_ref_entropy_loss": 0.06005859375, + "delta_ref_ppl": -0.055419921875, + "entropy_loss": -0.064453125, + "epoch": 0.1764, + "grad_norm": 2.4049112648719464, + "k1_kl": 0.055419921875, + "k3_kl": 0.0341796875, + "kimi_kl": 0.0693359375, + "learning_rate": 4.1179999999999997e-07, + "loss": 0.0014, + "ppl": 0.037109375, + "reward": 0.8680568933486938, + "reward_std": 0.04851736128330231, + "rewards/perpo_ocr_edit_distance_reward": 0.8680568933486938, "step": 882, "temperature": 0.9 }, { - "advantages": -1.2253012755536474e-05, - "completion_length": 611.0, - "delta_ref_entropy_loss": 0.056732177734375, - "delta_ref_ppl": -0.10675048828125, - "entropy_loss": -0.02410888671875, - "epoch": 0.3532, - "grad_norm": 2.1206371671203503, - "k1_kl": 0.107269287109375, - "k3_kl": 0.085693359375, - "kimi_kl": 0.4403076171875, - "learning_rate": 3.234e-07, - "loss": 0.0034, - "ppl": 0.01434326171875, - "reward": 0.9980246424674988, - "reward_std": 0.002927527646534145, - "rewards/perpo_ocr_edit_distance_reward": 0.9980247318744659, + "advantages": -0.0002106343163177371, + "completion_length": 504.0, + "delta_ref_entropy_loss": 0.09716796875, + "delta_ref_ppl": -0.078125, + "entropy_loss": -0.0216064453125, + "epoch": 0.1766, + "grad_norm": 0.49265304297686435, + "k1_kl": 0.078125, + "k3_kl": 0.03857421875, + "kimi_kl": 0.0986328125, + "learning_rate": 4.117e-07, + "loss": 0.0018, + "ppl": 0.00640869140625, + "reward": 0.9384894371032715, + "reward_std": 0.00034464074997231364, + "rewards/perpo_ocr_edit_distance_reward": 0.938489556312561, "step": 883, "temperature": 0.9 }, { - "advantages": -4.053116754221264e-06, - "completion_length": 886.5, - "delta_ref_entropy_loss": 0.04156494140625, - "delta_ref_ppl": -0.02984619140625, - "entropy_loss": -0.0367431640625, - "epoch": 0.3536, - "grad_norm": 0.7188416354997229, - "k1_kl": 0.02984619140625, - "k3_kl": 0.0169677734375, - "kimi_kl": 0.051513671875, - "learning_rate": 3.232e-07, - "loss": 0.0007, - "ppl": 0.0186767578125, - "reward": 0.9756519794464111, - "reward_std": 0.0009330932225566357, - "rewards/perpo_ocr_edit_distance_reward": 0.9756519794464111, + "advantages": -2.869538093364099e-06, + "completion_length": 511.0, + "delta_ref_entropy_loss": 0.091796875, + "delta_ref_ppl": -0.0810546875, + "entropy_loss": -0.060546875, + "epoch": 0.1768, + "grad_norm": 1.7106090994435572, + "k1_kl": 0.08056640625, + "k3_kl": 0.046875, + "kimi_kl": 0.11767578125, + "learning_rate": 4.116e-07, + "loss": 0.0019, + "ppl": 0.0299072265625, + "reward": 0.9503514766693115, + "reward_std": 0.011801573447883129, + "rewards/perpo_ocr_edit_distance_reward": 0.9503515362739563, "step": 884, "temperature": 0.9 }, { - "advantages": 1.7029899268550253e-08, - "completion_length": 250.0, - "delta_ref_entropy_loss": 0.03643798828125, - "delta_ref_ppl": -0.0631103515625, - "entropy_loss": -0.329925537109375, - "epoch": 0.354, - "grad_norm": 2.0536424978987853, - "k1_kl": 0.063232421875, - "k3_kl": 0.04376220703125, - "kimi_kl": 0.114501953125, - "learning_rate": 3.23e-07, - "loss": 0.0018, - "ppl": 0.18912506103515625, - "reward": 0.6608975976705551, - "reward_std": 0.049118805676698685, - "rewards/perpo_ocr_edit_distance_reward": 0.6608975827693939, + "advantages": -2.3245813281391747e-05, + "completion_length": 661.0, + "delta_ref_entropy_loss": 0.15625, + "delta_ref_ppl": -0.109375, + "entropy_loss": -0.126953125, + "epoch": 0.177, + "grad_norm": 1.3625722282584343, + "k1_kl": 0.10986328125, + "k3_kl": 0.05615234375, + "kimi_kl": 0.130859375, + "learning_rate": 4.1149999999999995e-07, + "loss": 0.0023, + "ppl": 0.07080078125, + "reward": 0.8953301310539246, + "reward_std": 0.0017306413501501083, + "rewards/perpo_ocr_edit_distance_reward": 0.8953302502632141, "step": 885, "temperature": 0.9 }, { - "advantages": -8.600098908573273e-07, - "completion_length": 306.0, - "delta_ref_entropy_loss": 0.0758056640625, - "delta_ref_ppl": -0.08233642578125, - "entropy_loss": -0.0865478515625, - "epoch": 0.3544, - "grad_norm": 1.5551932798153014, - "k1_kl": 0.0823974609375, - "k3_kl": 0.04937744140625, - "kimi_kl": 0.1488037109375, - "learning_rate": 3.2279999999999995e-07, - "loss": 0.002, - "ppl": 0.043060302734375, - "reward": 0.9705739319324493, - "reward_std": 0.013534352998249233, - "rewards/perpo_ocr_edit_distance_reward": 0.9705739319324493, + "advantages": -3.916876721632434e-06, + "completion_length": 464.0, + "delta_ref_entropy_loss": 0.040283203125, + "delta_ref_ppl": -0.06640625, + "entropy_loss": -0.0220947265625, + "epoch": 0.1772, + "grad_norm": 1.4451549539424708, + "k1_kl": 0.06640625, + "k3_kl": 0.045166015625, + "kimi_kl": 0.138671875, + "learning_rate": 4.114e-07, + "loss": 0.0018, + "ppl": 0.00946044921875, + "reward": 0.9171717166900635, + "reward_std": 0.015182364732027054, + "rewards/perpo_ocr_edit_distance_reward": 0.9171717762947083, "step": 886, "temperature": 0.9 }, { - "advantages": -1.4292343280430941e-05, - "completion_length": 432.5, - "delta_ref_entropy_loss": 0.0400390625, - "delta_ref_ppl": -0.0284423828125, - "entropy_loss": -0.04815673828125, - "epoch": 0.3548, - "grad_norm": 1.471896963699842, - "k1_kl": 0.0284423828125, - "k3_kl": 0.0155181884765625, - "kimi_kl": 0.065216064453125, - "learning_rate": 3.226e-07, - "loss": 0.0006, - "ppl": 0.02239990234375, - "reward": 0.9630453288555145, - "reward_std": 0.010958394850604236, - "rewards/perpo_ocr_edit_distance_reward": 0.9630454182624817, + "advantages": -2.5033950805664062e-06, + "completion_length": 191.0, + "delta_ref_entropy_loss": 0.11376953125, + "delta_ref_ppl": -0.1884765625, + "entropy_loss": -0.053466796875, + "epoch": 0.1774, + "grad_norm": 2.993979861981374, + "k1_kl": 0.1884765625, + "k3_kl": 0.130859375, + "kimi_kl": 0.37890625, + "learning_rate": 4.113e-07, + "loss": 0.0052, + "ppl": 0.02197265625, + "reward": 0.9500580430030823, + "reward_std": 0.010092451237142086, + "rewards/perpo_ocr_edit_distance_reward": 0.950058102607727, "step": 887, "temperature": 0.9 }, { - "advantages": -1.368352369013337e-05, - "completion_length": 886.5, - "delta_ref_entropy_loss": 0.04156494140625, - "delta_ref_ppl": -0.0667724609375, - "entropy_loss": -0.0401611328125, - "epoch": 0.3552, - "grad_norm": 0.5742792005184123, - "k1_kl": 0.0667724609375, - "k3_kl": 0.0482177734375, - "kimi_kl": 0.1533203125, - "learning_rate": 3.2240000000000003e-07, - "loss": 0.0019, - "ppl": 0.02044677734375, - "reward": 0.9153978526592255, - "reward_std": 0.01009285420877859, - "rewards/perpo_ocr_edit_distance_reward": 0.9153979122638702, + "advantages": -0.0005960464477539062, + "completion_length": 110.0, + "delta_ref_entropy_loss": 0.046142578125, + "delta_ref_ppl": -0.1435546875, + "entropy_loss": -0.027587890625, + "epoch": 0.1776, + "grad_norm": 0.052003178022644723, + "k1_kl": 0.1435546875, + "k3_kl": 0.1083984375, + "kimi_kl": 0.4140625, + "learning_rate": 4.112e-07, + "loss": 0.0049, + "ppl": 0.00665283203125, + "reward": 0.9738956093788147, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9738956689834595, "step": 888, "temperature": 0.9 }, { - "advantages": -1.055853772413684e-05, - "completion_length": 303.5, - "delta_ref_entropy_loss": 0.0517578125, - "delta_ref_ppl": -0.0433349609375, - "entropy_loss": -0.0341796875, - "epoch": 0.3556, - "grad_norm": 0.7860847049968146, - "k1_kl": 0.04327392578125, - "k3_kl": 0.02587890625, - "kimi_kl": 0.091796875, - "learning_rate": 3.2219999999999996e-07, - "loss": 0.001, - "ppl": 0.0161895751953125, - "reward": 0.9954327344894409, - "reward_std": 0.0005540905985981226, - "rewards/perpo_ocr_edit_distance_reward": 0.9954327642917633, + "advantages": 3.916876778475853e-07, + "completion_length": 672.0, + "delta_ref_entropy_loss": 0.1298828125, + "delta_ref_ppl": -0.09130859375, + "entropy_loss": -0.3203125, + "epoch": 0.1778, + "grad_norm": 4.045705121624376, + "k1_kl": 0.0908203125, + "k3_kl": 0.05029296875, + "kimi_kl": 0.08837890625, + "learning_rate": 4.111e-07, + "loss": 0.002, + "ppl": 0.193359375, + "reward": 0.8448290228843689, + "reward_std": 0.021838104352355003, + "rewards/perpo_ocr_edit_distance_reward": 0.8448290228843689, "step": 889, "temperature": 0.9 }, { - "advantages": -3.135204315185547e-05, - "completion_length": 243.0, - "delta_ref_entropy_loss": 0.07574462890625, - "delta_ref_ppl": -0.204864501953125, - "entropy_loss": -0.06414794921875, - "epoch": 0.356, - "grad_norm": 0.28783744533700584, - "k1_kl": 0.203857421875, - "k3_kl": 0.1653594970703125, - "kimi_kl": 0.90142822265625, - "learning_rate": 3.22e-07, - "loss": 0.0067, - "ppl": 0.03302001953125, - "reward": 0.999941885471344, - "reward_std": 0.00015371403424069285, - "rewards/perpo_ocr_edit_distance_reward": 0.9999419152736664, + "advantages": -1.862645149230957e-05, + "completion_length": 6.0, + "delta_ref_entropy_loss": -0.173828125, + "delta_ref_ppl": -2.09375, + "entropy_loss": -0.4609375, + "epoch": 0.178, + "grad_norm": 1.0086182497426839, + "k1_kl": 2.09375, + "k3_kl": 1.75, + "kimi_kl": 8.8125, + "learning_rate": 4.1099999999999996e-07, + "loss": 0.0698, + "ppl": 0.07080078125, + "reward": 0.027363182976841927, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.027363186702132225, "step": 890, "temperature": 0.9 }, { - "advantages": -0.00011596297346283535, - "completion_length": 230.5, - "delta_ref_entropy_loss": -0.00048828125, - "delta_ref_ppl": -0.1048583984375, - "entropy_loss": -0.079742431640625, - "epoch": 0.3564, - "grad_norm": 3.66656182620314, - "k1_kl": 0.1048583984375, - "k3_kl": 0.0740966796875, - "kimi_kl": 0.20751953125, - "learning_rate": 3.218e-07, - "loss": 0.0031, - "ppl": 0.0286102294921875, - "reward": 0.6812592595815659, - "reward_std": 0.018034002190688625, - "rewards/perpo_ocr_edit_distance_reward": 0.6812593191862106, + "advantages": -9.770053293323144e-05, + "completion_length": 636.0, + "delta_ref_entropy_loss": 0.058837890625, + "delta_ref_ppl": -0.05419921875, + "entropy_loss": -0.021240234375, + "epoch": 0.1782, + "grad_norm": 0.6634467770290722, + "k1_kl": 0.05419921875, + "k3_kl": 0.0308837890625, + "kimi_kl": 0.0849609375, + "learning_rate": 4.1089999999999995e-07, + "loss": 0.0013, + "ppl": 0.00982666015625, + "reward": 0.9800872802734375, + "reward_std": 0.000858684245031327, + "rewards/perpo_ocr_edit_distance_reward": 0.9800873398780823, "step": 891, "temperature": 0.9 }, { - "advantages": -6.29723099336843e-05, - "completion_length": 1103.0, - "delta_ref_entropy_loss": 0.035430908203125, - "delta_ref_ppl": -0.03411865234375, - "entropy_loss": -0.0458984375, - "epoch": 0.3568, - "grad_norm": 1.019686608801705, - "k1_kl": 0.03411865234375, - "k3_kl": 0.02288818359375, - "kimi_kl": 0.0635986328125, - "learning_rate": 3.2159999999999997e-07, - "loss": 0.001, - "ppl": 0.0278472900390625, - "reward": 0.9836963713169098, - "reward_std": 0.0015094919290277176, - "rewards/perpo_ocr_edit_distance_reward": 0.9836964011192322, + "advantages": -6.0626439335464966e-06, + "completion_length": 652.0, + "delta_ref_entropy_loss": 0.1669921875, + "delta_ref_ppl": -0.11962890625, + "entropy_loss": -0.1806640625, + "epoch": 0.1784, + "grad_norm": 2.2643850037849407, + "k1_kl": 0.11962890625, + "k3_kl": 0.06103515625, + "kimi_kl": 0.1357421875, + "learning_rate": 4.108e-07, + "loss": 0.0025, + "ppl": 0.10205078125, + "reward": 0.7332439422607422, + "reward_std": 0.005514896474778652, + "rewards/perpo_ocr_edit_distance_reward": 0.733244001865387, "step": 892, "temperature": 0.9 }, { - "advantages": -1.8856356291507836e-05, - "completion_length": 841.0, - "delta_ref_entropy_loss": 0.0924072265625, - "delta_ref_ppl": -0.0699462890625, - "entropy_loss": -0.1075439453125, - "epoch": 0.3572, - "grad_norm": 0.8625356840931738, - "k1_kl": 0.070068359375, - "k3_kl": 0.0404052734375, - "kimi_kl": 0.09442138671875, - "learning_rate": 3.214e-07, - "loss": 0.0016, - "ppl": 0.0577392578125, - "reward": 0.9363212883472443, - "reward_std": 0.0028400871669873595, - "rewards/perpo_ocr_edit_distance_reward": 0.936321347951889, + "advantages": -2.0648752979468554e-05, + "completion_length": 569.0, + "delta_ref_entropy_loss": 0.11083984375, + "delta_ref_ppl": -0.1142578125, + "entropy_loss": -0.10888671875, + "epoch": 0.1786, + "grad_norm": 1.359060040847767, + "k1_kl": 0.11376953125, + "k3_kl": 0.06396484375, + "kimi_kl": 0.1357421875, + "learning_rate": 4.107e-07, + "loss": 0.0026, + "ppl": 0.06396484375, + "reward": 0.856113076210022, + "reward_std": 0.002786272903904319, + "rewards/perpo_ocr_edit_distance_reward": 0.8561131954193115, "step": 893, "temperature": 0.9 }, { - "advantages": -1.6987323760986328e-06, - "completion_length": 441.0, - "delta_ref_entropy_loss": 0.0684814453125, - "delta_ref_ppl": -0.059814453125, - "entropy_loss": -0.0535888671875, - "epoch": 0.3576, - "grad_norm": 0.8371457187459805, - "k1_kl": 0.059814453125, - "k3_kl": 0.03765869140625, - "kimi_kl": 0.1331787109375, - "learning_rate": 3.212e-07, - "loss": 0.0015, - "ppl": 0.02862548828125, - "reward": 0.8982428908348083, - "reward_std": 0.005678532645106316, - "rewards/perpo_ocr_edit_distance_reward": 0.8982429802417755, + "advantages": -5.3048138397571165e-06, + "completion_length": 665.0, + "delta_ref_entropy_loss": 0.1142578125, + "delta_ref_ppl": -0.09228515625, + "entropy_loss": -0.12255859375, + "epoch": 0.1788, + "grad_norm": 1.6520602736676913, + "k1_kl": 0.09228515625, + "k3_kl": 0.050537109375, + "kimi_kl": 0.10302734375, + "learning_rate": 4.106e-07, + "loss": 0.002, + "ppl": 0.06640625, + "reward": 0.9238521456718445, + "reward_std": 0.00793343037366867, + "rewards/perpo_ocr_edit_distance_reward": 0.9238522052764893, "step": 894, "temperature": 0.9 }, { - "advantages": -7.16022136657557e-05, - "completion_length": 1174.0, - "delta_ref_entropy_loss": 0.0347900390625, - "delta_ref_ppl": -0.026092529296875, - "entropy_loss": -0.03717041015625, - "epoch": 0.358, - "grad_norm": 1.8327709069395128, - "k1_kl": 0.026123046875, - "k3_kl": 0.016998291015625, - "kimi_kl": 0.04827880859375, - "learning_rate": 3.21e-07, - "loss": 0.0008, - "ppl": 0.0196533203125, - "reward": 0.9679253995418549, - "reward_std": 0.001833323563914746, - "rewards/perpo_ocr_edit_distance_reward": 0.9679254591464996, + "advantages": -5.551747108256677e-06, + "completion_length": 1999.0, + "delta_ref_entropy_loss": 0.07568359375, + "delta_ref_ppl": -0.041748046875, + "entropy_loss": -0.076171875, + "epoch": 0.179, + "grad_norm": 2430.2755190331095, + "k1_kl": 0.041748046875, + "k3_kl": 9.75, + "kimi_kl": 0.06494140625, + "learning_rate": 4.105e-07, + "loss": 0.3907, + "ppl": 0.048828125, + "reward": 0.9601005911827087, + "reward_std": 0.0029597121756523848, + "rewards/perpo_ocr_edit_distance_reward": 0.9601006507873535, "step": 895, "temperature": 0.9 }, { - "advantages": -2.3416112782115306e-07, - "completion_length": 1135.5, - "delta_ref_entropy_loss": 0.0450439453125, - "delta_ref_ppl": -0.032470703125, - "entropy_loss": -0.0343017578125, - "epoch": 0.3584, - "grad_norm": 0.9097608789840534, - "k1_kl": 0.0325927734375, - "k3_kl": 0.0185546875, - "kimi_kl": 0.0361328125, - "learning_rate": 3.2079999999999996e-07, - "loss": 0.0007, - "ppl": 0.015350341796875, - "reward": 0.9437803030014038, - "reward_std": 0.08950867733801715, - "rewards/perpo_ocr_edit_distance_reward": 0.9437803626060486, + "advantages": -4.427774001669604e-06, + "completion_length": 512.0, + "delta_ref_entropy_loss": 0.1064453125, + "delta_ref_ppl": -0.1279296875, + "entropy_loss": -0.1044921875, + "epoch": 0.1792, + "grad_norm": 2.015806649929184, + "k1_kl": 0.1279296875, + "k3_kl": 0.0810546875, + "kimi_kl": 0.205078125, + "learning_rate": 4.1039999999999997e-07, + "loss": 0.0032, + "ppl": 0.061279296875, + "reward": 0.9543060064315796, + "reward_std": 0.02475249394774437, + "rewards/perpo_ocr_edit_distance_reward": 0.9543061256408691, "step": 896, "temperature": 0.9 }, { - "advantages": -0.00010206018669123296, - "completion_length": 1091.5, - "delta_ref_entropy_loss": 0.0394287109375, - "delta_ref_ppl": -0.03900146484375, - "entropy_loss": -0.02685546875, - "epoch": 0.3588, - "grad_norm": 0.278013934035457, - "k1_kl": 0.03900146484375, - "k3_kl": 0.024566650390625, - "kimi_kl": 0.0699462890625, - "learning_rate": 3.206e-07, - "loss": 0.0011, - "ppl": 0.011688232421875, - "reward": 0.9980524480342865, - "reward_std": 0.00047162336704786867, - "rewards/perpo_ocr_edit_distance_reward": 0.998052567243576, + "advantages": -7.799693776178174e-06, + "completion_length": 465.0, + "delta_ref_entropy_loss": 0.0274658203125, + "delta_ref_ppl": -0.0517578125, + "entropy_loss": -0.040771484375, + "epoch": 0.1794, + "grad_norm": 1.0224562358794922, + "k1_kl": 0.051513671875, + "k3_kl": 0.038330078125, + "kimi_kl": 0.08447265625, + "learning_rate": 4.1029999999999996e-07, + "loss": 0.0015, + "ppl": 0.017822265625, + "reward": 0.865821123123169, + "reward_std": 0.00971404928714037, + "rewards/perpo_ocr_edit_distance_reward": 0.8658211827278137, "step": 897, "temperature": 0.9 }, { - "advantages": -0.0003094886033068178, - "completion_length": 992.0, - "delta_ref_entropy_loss": 0.02386474609375, - "delta_ref_ppl": -0.017852783203125, - "entropy_loss": -0.02813720703125, - "epoch": 0.3592, - "grad_norm": 0.577228150926008, - "k1_kl": 0.017852783203125, - "k3_kl": 0.0111846923828125, - "kimi_kl": 0.0401611328125, - "learning_rate": 3.204e-07, - "loss": 0.0008, - "ppl": 0.013153076171875, - "reward": 0.9971480667591095, - "reward_std": 0.0008778701303526759, - "rewards/perpo_ocr_edit_distance_reward": 0.9971481263637543, + "advantages": -1.2193408110761084e-05, + "completion_length": 1133.0, + "delta_ref_entropy_loss": 0.08349609375, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.06591796875, + "epoch": 0.1796, + "grad_norm": 1.3334094771950293, + "k1_kl": 0.06591796875, + "k3_kl": 0.036865234375, + "kimi_kl": 0.08154296875, + "learning_rate": 4.102e-07, + "loss": 0.0015, + "ppl": 0.03662109375, + "reward": 0.7312761545181274, + "reward_std": 0.0019976734183728695, + "rewards/perpo_ocr_edit_distance_reward": 0.7312761545181274, "step": 898, "temperature": 0.9 }, { - "advantages": -2.4514539745723596e-05, - "completion_length": 396.0, - "delta_ref_entropy_loss": 0.047607421875, - "delta_ref_ppl": -0.04034423828125, - "entropy_loss": -0.0234375, - "epoch": 0.3596, - "grad_norm": 0.6334429459615482, - "k1_kl": 0.04034423828125, - "k3_kl": 0.02398681640625, - "kimi_kl": 0.0709228515625, - "learning_rate": 3.2019999999999997e-07, - "loss": 0.001, - "ppl": 0.01220703125, - "reward": 0.9951838552951813, - "reward_std": 0.0010646430309861898, - "rewards/perpo_ocr_edit_distance_reward": 0.995183914899826, + "advantages": -6.811959707420101e-08, + "completion_length": 23.0, + "delta_ref_entropy_loss": 0.19921875, + "delta_ref_ppl": -0.6796875, + "entropy_loss": -0.255859375, + "epoch": 0.1798, + "grad_norm": 12.96526217070299, + "k1_kl": 0.6796875, + "k3_kl": 0.5078125, + "kimi_kl": 1.7578125, + "learning_rate": 4.101e-07, + "loss": 0.0203, + "ppl": 0.1376953125, + "reward": 0.4047618806362152, + "reward_std": 0.13214990496635437, + "rewards/perpo_ocr_edit_distance_reward": 0.4047619104385376, "step": 899, "temperature": 0.9 }, { - "advantages": -3.377029190687608e-05, - "completion_length": 518.5, - "delta_ref_entropy_loss": 0.0595703125, - "delta_ref_ppl": -0.03485107421875, - "entropy_loss": -0.049896240234375, - "epoch": 0.36, - "grad_norm": 0.8340120167269178, - "k1_kl": 0.03485107421875, - "k3_kl": 0.01800537109375, - "kimi_kl": 0.03558349609375, - "learning_rate": 3.2e-07, - "loss": 0.0008, - "ppl": 0.0292205810546875, - "reward": 0.9522643983364105, - "reward_std": 0.0015735630877315998, - "rewards/perpo_ocr_edit_distance_reward": 0.9522644281387329, + "advantages": -3.296988506917842e-05, + "completion_length": 248.0, + "delta_ref_entropy_loss": 0.0498046875, + "delta_ref_ppl": -0.0947265625, + "entropy_loss": -0.0269775390625, + "epoch": 0.18, + "grad_norm": 0.9110053299747377, + "k1_kl": 0.0947265625, + "k3_kl": 0.06640625, + "kimi_kl": 0.1826171875, + "learning_rate": 4.0999999999999994e-07, + "loss": 0.0027, + "ppl": 0.0133056640625, + "reward": 0.9966342449188232, + "reward_std": 0.0009328614687547088, + "rewards/perpo_ocr_edit_distance_reward": 0.996634304523468, "step": 900, "temperature": 0.9 }, { - "advantages": -3.0142920763864822e-06, - "completion_length": 430.0, - "delta_ref_entropy_loss": 0.0250244140625, - "delta_ref_ppl": -0.019073486328125, - "entropy_loss": -0.017578125, - "epoch": 0.3604, - "grad_norm": 0.6884884753737965, - "k1_kl": 0.019134521484375, - "k3_kl": 0.009063720703125, - "kimi_kl": 0.01629638671875, - "learning_rate": 3.1979999999999994e-07, - "loss": 0.0004, - "ppl": 0.010162353515625, - "reward": 0.9923690557479858, - "reward_std": 0.00580500258365646, - "rewards/perpo_ocr_edit_distance_reward": 0.9923690557479858, + "advantages": -6.26679029664956e-05, + "completion_length": 364.0, + "delta_ref_entropy_loss": 0.0869140625, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.0213623046875, + "epoch": 0.1802, + "grad_norm": 0.9970820111755117, + "k1_kl": 0.0703125, + "k3_kl": 0.033935546875, + "kimi_kl": 0.07861328125, + "learning_rate": 4.099e-07, + "loss": 0.0014, + "ppl": 0.0118408203125, + "reward": 0.7708291411399841, + "reward_std": 0.0013949096901342273, + "rewards/perpo_ocr_edit_distance_reward": 0.7708292603492737, "step": 901, "temperature": 0.9 }, { - "advantages": -0.00010438476510898909, - "completion_length": 826.5, - "delta_ref_entropy_loss": 0.025177001953125, - "delta_ref_ppl": -0.0433349609375, - "entropy_loss": -0.02667236328125, - "epoch": 0.3608, - "grad_norm": 2.0813091777941413, - "k1_kl": 0.0433349609375, - "k3_kl": 0.03045654296875, - "kimi_kl": 0.0892333984375, - "learning_rate": 3.196e-07, - "loss": 0.0013, - "ppl": 0.013153076171875, - "reward": 0.9968116581439972, - "reward_std": 0.0037852657405892387, - "rewards/perpo_ocr_edit_distance_reward": 0.9968117773532867, + "advantages": -8.233956032199785e-05, + "completion_length": 134.0, + "delta_ref_entropy_loss": 0.138671875, + "delta_ref_ppl": -0.1689453125, + "entropy_loss": -0.052001953125, + "epoch": 0.1804, + "grad_norm": 1.5762519312006766, + "k1_kl": 0.1689453125, + "k3_kl": 0.11279296875, + "kimi_kl": 0.34375, + "learning_rate": 4.098e-07, + "loss": 0.0046, + "ppl": 0.0240478515625, + "reward": 0.5396414399147034, + "reward_std": 0.0006240195361897349, + "rewards/perpo_ocr_edit_distance_reward": 0.5396414995193481, "step": 902, "temperature": 0.9 }, { - "advantages": -1.5058687949931482e-05, - "completion_length": 872.0, - "delta_ref_entropy_loss": 0.0526123046875, - "delta_ref_ppl": -0.0340576171875, - "entropy_loss": -0.0380859375, - "epoch": 0.3612, - "grad_norm": 0.5471585040263606, - "k1_kl": 0.0341796875, - "k3_kl": 0.0186767578125, - "kimi_kl": 0.0416259765625, - "learning_rate": 3.194e-07, - "loss": 0.0008, - "ppl": 0.01806640625, - "reward": 0.8587788343429565, - "reward_std": 0.0027949815848842263, - "rewards/perpo_ocr_edit_distance_reward": 0.8587788939476013, + "advantages": -1.6748905181884766e-05, + "completion_length": 812.0, + "delta_ref_entropy_loss": 0.058837890625, + "delta_ref_ppl": -0.078125, + "entropy_loss": -0.037353515625, + "epoch": 0.1806, + "grad_norm": 0.7778206265112082, + "k1_kl": 0.078125, + "k3_kl": 0.04736328125, + "kimi_kl": 0.125, + "learning_rate": 4.0969999999999997e-07, + "loss": 0.0019, + "ppl": 0.016357421875, + "reward": 0.979128897190094, + "reward_std": 0.002441353164613247, + "rewards/perpo_ocr_edit_distance_reward": 0.9791289567947388, "step": 903, "temperature": 0.9 }, { - "advantages": -8.496855116391089e-05, - "completion_length": 695.5, - "delta_ref_entropy_loss": 0.0423583984375, - "delta_ref_ppl": -0.03094482421875, - "entropy_loss": -0.0257568359375, - "epoch": 0.3616, - "grad_norm": 0.4677206549528673, - "k1_kl": 0.031005859375, - "k3_kl": 0.016937255859375, - "kimi_kl": 0.048095703125, - "learning_rate": 3.1919999999999995e-07, - "loss": 0.0008, - "ppl": 0.015411376953125, - "reward": 0.9948232173919678, - "reward_std": 0.0004918614722555503, - "rewards/perpo_ocr_edit_distance_reward": 0.9948232471942902, + "advantages": -3.405979782655777e-07, + "completion_length": 100.0, + "delta_ref_entropy_loss": 0.12451171875, + "delta_ref_ppl": -0.302734375, + "entropy_loss": -0.2734375, + "epoch": 0.1808, + "grad_norm": 5.294611459770845, + "k1_kl": 0.302734375, + "k3_kl": 0.216796875, + "kimi_kl": 0.65234375, + "learning_rate": 4.096e-07, + "loss": 0.0087, + "ppl": 0.1142578125, + "reward": 0.3912394344806671, + "reward_std": 0.10280521214008331, + "rewards/perpo_ocr_edit_distance_reward": 0.3912394642829895, "step": 904, "temperature": 0.9 }, { - "advantages": 1.8856355836760486e-05, - "completion_length": 619.5, - "delta_ref_entropy_loss": 0.064697265625, - "delta_ref_ppl": -0.04315185546875, - "entropy_loss": -0.04998779296875, - "epoch": 0.362, - "grad_norm": 0.8203540949292866, - "k1_kl": 0.043182373046875, - "k3_kl": 0.024505615234375, - "kimi_kl": 0.045318603515625, - "learning_rate": 3.19e-07, - "loss": 0.001, - "ppl": 0.02911376953125, - "reward": 0.9923915863037109, - "reward_std": 0.001668834185693413, - "rewards/perpo_ocr_edit_distance_reward": 0.9923915565013885, + "advantages": 3.1505312563240295e-06, + "completion_length": 355.0, + "delta_ref_entropy_loss": 0.115234375, + "delta_ref_ppl": -0.107421875, + "entropy_loss": -0.06787109375, + "epoch": 0.181, + "grad_norm": 1.503860506507045, + "k1_kl": 0.10693359375, + "k3_kl": 0.05908203125, + "kimi_kl": 0.14453125, + "learning_rate": 4.0949999999999995e-07, + "loss": 0.0024, + "ppl": 0.037841796875, + "reward": 0.935289740562439, + "reward_std": 0.0026065383572131395, + "rewards/perpo_ocr_edit_distance_reward": 0.9352896809577942, "step": 905, "temperature": 0.9 }, { - "advantages": -1.3832535842084326e-05, - "completion_length": 1436.5, - "delta_ref_entropy_loss": 0.02642822265625, - "delta_ref_ppl": -0.0189208984375, - "entropy_loss": -0.086944580078125, - "epoch": 0.3624, - "grad_norm": 17.20638815236126, - "k1_kl": 0.018768310546875, - "k3_kl": 0.0340118408203125, - "kimi_kl": 0.02685546875, - "learning_rate": 3.1879999999999997e-07, - "loss": 0.0014, - "ppl": 0.061859130859375, - "reward": 0.8803199231624603, - "reward_std": 0.0013355882838368416, - "rewards/perpo_ocr_edit_distance_reward": 0.8803199529647827, + "advantages": -7.3143419285770506e-06, + "completion_length": 252.0, + "delta_ref_entropy_loss": 0.16015625, + "delta_ref_ppl": -0.11669921875, + "entropy_loss": -0.1162109375, + "epoch": 0.1812, + "grad_norm": 2.1308771672551097, + "k1_kl": 0.11669921875, + "k3_kl": 0.06494140625, + "kimi_kl": 0.1728515625, + "learning_rate": 4.0939999999999995e-07, + "loss": 0.0026, + "ppl": 0.051025390625, + "reward": 0.9194759130477905, + "reward_std": 0.002232522936537862, + "rewards/perpo_ocr_edit_distance_reward": 0.9194759130477905, "step": 906, "temperature": 0.9 }, { - "advantages": -0.00029799342155456543, - "completion_length": 378.5, - "delta_ref_entropy_loss": 0.05328369140625, - "delta_ref_ppl": -0.27276611328125, - "entropy_loss": -0.126922607421875, - "epoch": 0.3628, - "grad_norm": 12.101685315958958, - "k1_kl": 0.270751953125, - "k3_kl": 0.182861328125, - "kimi_kl": 0.420166015625, - "learning_rate": 3.1859999999999995e-07, - "loss": 0.0076, - "ppl": 0.055023193359375, - "reward": 0.5906615629792213, - "reward_std": 0.0304380115121603, - "rewards/perpo_ocr_edit_distance_reward": 0.5906615927815437, + "advantages": -1.5480178262805566e-05, + "completion_length": 350.0, + "delta_ref_entropy_loss": 0.0341796875, + "delta_ref_ppl": -0.07421875, + "entropy_loss": -0.0167236328125, + "epoch": 0.1814, + "grad_norm": 1.3318279548233043, + "k1_kl": 0.07470703125, + "k3_kl": 0.053466796875, + "kimi_kl": 0.197265625, + "learning_rate": 4.093e-07, + "loss": 0.0021, + "ppl": 0.01025390625, + "reward": 0.9917542338371277, + "reward_std": 0.001550647197291255, + "rewards/perpo_ocr_edit_distance_reward": 0.9917542934417725, "step": 907, "temperature": 0.9 }, { - "advantages": -1.5884638287388952e-05, - "completion_length": 342.0, - "delta_ref_entropy_loss": 0.05517578125, - "delta_ref_ppl": -0.05926513671875, - "entropy_loss": -0.06475830078125, - "epoch": 0.3632, - "grad_norm": 1.3879031381462947, - "k1_kl": 0.05902099609375, - "k3_kl": 0.037811279296875, - "kimi_kl": 0.08514404296875, - "learning_rate": 3.184e-07, - "loss": 0.0015, - "ppl": 0.037139892578125, - "reward": 0.9897593855857849, - "reward_std": 0.0020492508774623275, - "rewards/perpo_ocr_edit_distance_reward": 0.9897594451904297, + "advantages": -7.356916285061743e-06, + "completion_length": 679.0, + "delta_ref_entropy_loss": 0.0732421875, + "delta_ref_ppl": -0.07470703125, + "entropy_loss": -0.03076171875, + "epoch": 0.1816, + "grad_norm": 0.9819480118930728, + "k1_kl": 0.07470703125, + "k3_kl": 0.04150390625, + "kimi_kl": 0.146484375, + "learning_rate": 4.092e-07, + "loss": 0.0017, + "ppl": 0.01025390625, + "reward": 0.9460640549659729, + "reward_std": 0.005663772113621235, + "rewards/perpo_ocr_edit_distance_reward": 0.9460641741752625, "step": 908, "temperature": 0.9 }, { - "advantages": -6.7949297672953435e-06, - "completion_length": 852.5, - "delta_ref_entropy_loss": 0.0521240234375, - "delta_ref_ppl": -0.04669189453125, - "entropy_loss": -0.0623779296875, - "epoch": 0.3636, - "grad_norm": 1.8297180164923927, - "k1_kl": 0.0467529296875, - "k3_kl": 0.0328369140625, - "kimi_kl": 0.11993408203125, - "learning_rate": 3.182e-07, - "loss": 0.0013, - "ppl": 0.0360107421875, - "reward": 0.6868799328804016, - "reward_std": 0.06317071849480271, - "rewards/perpo_ocr_edit_distance_reward": 0.686879962682724, + "advantages": -1.965250339708291e-05, + "completion_length": 260.0, + "delta_ref_entropy_loss": 0.061279296875, + "delta_ref_ppl": -0.07861328125, + "entropy_loss": -0.02978515625, + "epoch": 0.1818, + "grad_norm": 1.5842261621213016, + "k1_kl": 0.07861328125, + "k3_kl": 0.053466796875, + "kimi_kl": 0.140625, + "learning_rate": 4.091e-07, + "loss": 0.0022, + "ppl": 0.0125732421875, + "reward": 0.9865604043006897, + "reward_std": 0.0020648364443331957, + "rewards/perpo_ocr_edit_distance_reward": 0.9865605235099792, "step": 909, "temperature": 0.9 }, { - "advantages": -7.647489019291243e-05, - "completion_length": 637.0, - "delta_ref_entropy_loss": 0.0430908203125, - "delta_ref_ppl": -0.02801513671875, - "entropy_loss": -0.02557373046875, - "epoch": 0.364, - "grad_norm": 0.45600064688289665, - "k1_kl": 0.02801513671875, - "k3_kl": 0.015106201171875, - "kimi_kl": 0.03533935546875, - "learning_rate": 3.18e-07, + "advantages": -3.2033240131568164e-05, + "completion_length": 666.0, + "delta_ref_entropy_loss": 0.019775390625, + "delta_ref_ppl": -0.0269775390625, + "entropy_loss": -0.01904296875, + "epoch": 0.182, + "grad_norm": 0.48162734575064253, + "k1_kl": 0.0269775390625, + "k3_kl": 0.017822265625, + "kimi_kl": 0.037353515625, + "learning_rate": 4.0899999999999997e-07, "loss": 0.0007, - "ppl": 0.01275634765625, - "reward": 0.997828483581543, - "reward_std": 0.0007595667848363519, - "rewards/perpo_ocr_edit_distance_reward": 0.9978285133838654, + "ppl": 0.0087890625, + "reward": 0.9915830492973328, + "reward_std": 0.00043143032235093415, + "rewards/perpo_ocr_edit_distance_reward": 0.9915831089019775, "step": 910, "temperature": 0.9 }, { - "advantages": -2.946172571682837e-06, - "completion_length": 642.0, - "delta_ref_entropy_loss": 0.02557373046875, - "delta_ref_ppl": -0.017486572265625, - "entropy_loss": -0.018524169921875, - "epoch": 0.3644, - "grad_norm": 0.6991649294406108, - "k1_kl": 0.01751708984375, - "k3_kl": 0.0108489990234375, - "kimi_kl": 0.0302734375, - "learning_rate": 3.178e-07, - "loss": 0.0004, - "ppl": 0.0070648193359375, - "reward": 0.9878344237804413, - "reward_std": 0.005816672768560238, - "rewards/perpo_ocr_edit_distance_reward": 0.9878344833850861, + "advantages": -5.231585237197578e-05, + "completion_length": 825.0, + "delta_ref_entropy_loss": 0.054443359375, + "delta_ref_ppl": -0.050048828125, + "entropy_loss": -0.023681640625, + "epoch": 0.1822, + "grad_norm": 0.8269047158888251, + "k1_kl": 0.050048828125, + "k3_kl": 0.025390625, + "kimi_kl": 0.05615234375, + "learning_rate": 4.0889999999999996e-07, + "loss": 0.0011, + "ppl": 0.0108642578125, + "reward": 0.997688889503479, + "reward_std": 0.0005511845229193568, + "rewards/perpo_ocr_edit_distance_reward": 0.9976890087127686, "step": 911, "temperature": 0.9 }, { - "advantages": -4.598072791850427e-06, - "completion_length": 560.5, - "delta_ref_entropy_loss": 0.03570556640625, - "delta_ref_ppl": -0.018768310546875, - "entropy_loss": -0.028594970703125, - "epoch": 0.3648, - "grad_norm": 0.7298193959479327, - "k1_kl": 0.018798828125, - "k3_kl": 0.009185791015625, - "kimi_kl": 0.0164794921875, - "learning_rate": 3.176e-07, - "loss": 0.0004, - "ppl": 0.0145263671875, - "reward": 0.9880095422267914, - "reward_std": 0.0008756567840464413, - "rewards/perpo_ocr_edit_distance_reward": 0.9880096018314362, + "advantages": -1.1069434435739822e-07, + "completion_length": 15.0, + "delta_ref_entropy_loss": 0.2021484375, + "delta_ref_ppl": -0.56640625, + "entropy_loss": -0.50390625, + "epoch": 0.1824, + "grad_norm": 15.128154985667123, + "k1_kl": 0.5625, + "k3_kl": 0.419921875, + "kimi_kl": 1.359375, + "learning_rate": 4.0879999999999995e-07, + "loss": 0.0168, + "ppl": 0.25390625, + "reward": 0.16472302377223969, + "reward_std": 0.07659941911697388, + "rewards/perpo_ocr_edit_distance_reward": 0.16472303867340088, "step": 912, "temperature": 0.9 }, { - "advantages": 1.671910331424442e-05, - "completion_length": 1408.5, - "delta_ref_entropy_loss": 0.016754150390625, - "delta_ref_ppl": -0.008331298828125, - "entropy_loss": -0.02691650390625, - "epoch": 0.3652, - "grad_norm": 0.24560163411156116, - "k1_kl": 0.00836181640625, - "k3_kl": 0.004364013671875, - "kimi_kl": 0.0068817138671875, - "learning_rate": 3.174e-07, - "loss": 0.0002, - "ppl": 0.01373291015625, - "reward": 0.9975097477436066, - "reward_std": 0.0004519943322520703, - "rewards/perpo_ocr_edit_distance_reward": 0.997509777545929, + "advantages": 0.0, + "completion_length": 593.0, + "delta_ref_entropy_loss": 0.03466796875, + "delta_ref_ppl": -0.03564453125, + "entropy_loss": -0.0133056640625, + "epoch": 0.1826, + "grad_norm": 0.009755157061529188, + "k1_kl": 0.03564453125, + "k3_kl": 0.0198974609375, + "kimi_kl": 0.05126953125, + "learning_rate": 4.087e-07, + "loss": 0.0008, + "ppl": 0.00439453125, + "reward": 0.994808554649353, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.994808554649353, "step": 913, "temperature": 0.9 }, { - "advantages": -5.15239607921103e-05, - "completion_length": 266.5, - "delta_ref_entropy_loss": 0.0697021484375, - "delta_ref_ppl": -0.0667724609375, - "entropy_loss": -0.0328369140625, - "epoch": 0.3656, - "grad_norm": 0.9607764760914967, - "k1_kl": 0.0670166015625, - "k3_kl": 0.04052734375, - "kimi_kl": 0.129150390625, - "learning_rate": 3.1719999999999996e-07, - "loss": 0.0017, - "ppl": 0.01751708984375, - "reward": 0.8959050476551056, - "reward_std": 0.0011277172598056495, - "rewards/perpo_ocr_edit_distance_reward": 0.8959051966667175, + "advantages": -0.00011683362390613183, + "completion_length": 169.0, + "delta_ref_entropy_loss": 0.05419921875, + "delta_ref_ppl": -0.111328125, + "entropy_loss": -0.0419921875, + "epoch": 0.1828, + "grad_norm": 1.6171670404267184, + "k1_kl": 0.111328125, + "k3_kl": 0.07861328125, + "kimi_kl": 0.263671875, + "learning_rate": 4.086e-07, + "loss": 0.0033, + "ppl": 0.01422119140625, + "reward": 0.9326215386390686, + "reward_std": 0.0006288193399086595, + "rewards/perpo_ocr_edit_distance_reward": 0.9326215982437134, "step": 914, "temperature": 0.9 }, { - "advantages": -0.0001068179130925273, - "completion_length": 770.0, - "delta_ref_entropy_loss": 0.025634765625, - "delta_ref_ppl": -0.01776123046875, - "entropy_loss": -0.01812744140625, - "epoch": 0.366, - "grad_norm": 0.7516269462538496, - "k1_kl": 0.01776123046875, - "k3_kl": 0.01068115234375, - "kimi_kl": 0.01934814453125, - "learning_rate": 3.17e-07, - "loss": 0.0005, - "ppl": 0.008819580078125, - "reward": 0.9962175190448761, - "reward_std": 0.003758824914257275, - "rewards/perpo_ocr_edit_distance_reward": 0.9962176382541656, + "advantages": 2.665179181349231e-06, + "completion_length": 672.0, + "delta_ref_entropy_loss": 0.1083984375, + "delta_ref_ppl": -0.07421875, + "entropy_loss": -0.158203125, + "epoch": 0.183, + "grad_norm": 1.6451526808374506, + "k1_kl": 0.07421875, + "k3_kl": 0.04150390625, + "kimi_kl": 0.08740234375, + "learning_rate": 4.0849999999999993e-07, + "loss": 0.0017, + "ppl": 0.080078125, + "reward": 0.8855665326118469, + "reward_std": 0.009514762088656425, + "rewards/perpo_ocr_edit_distance_reward": 0.8855664730072021, "step": 915, "temperature": 0.9 }, { - "advantages": -7.231321069411933e-05, - "completion_length": 676.0, - "delta_ref_entropy_loss": 0.017822265625, - "delta_ref_ppl": -0.02386474609375, - "entropy_loss": -0.010345458984375, - "epoch": 0.3664, - "grad_norm": 0.16737627297135926, - "k1_kl": 0.02392578125, - "k3_kl": 0.015625, - "kimi_kl": 0.040771484375, - "learning_rate": 3.1680000000000003e-07, - "loss": 0.0007, - "ppl": 0.0040740966796875, - "reward": 0.9987631440162659, - "reward_std": 0.00018552450637798756, - "rewards/perpo_ocr_edit_distance_reward": 0.998763233423233, + "advantages": 8.174351933121216e-07, + "completion_length": 572.0, + "delta_ref_entropy_loss": 0.1826171875, + "delta_ref_ppl": -0.11376953125, + "entropy_loss": -0.265625, + "epoch": 0.1832, + "grad_norm": 2.734142208811726, + "k1_kl": 0.11376953125, + "k3_kl": 0.0576171875, + "kimi_kl": 0.0966796875, + "learning_rate": 4.084e-07, + "loss": 0.0023, + "ppl": 0.162109375, + "reward": 0.41418853402137756, + "reward_std": 0.005005571525543928, + "rewards/perpo_ocr_edit_distance_reward": 0.41418853402137756, "step": 916, "temperature": 0.9 }, { - "advantages": -0.0001776293065596235, - "completion_length": 1557.5, - "delta_ref_entropy_loss": 0.0333251953125, - "delta_ref_ppl": -0.04443359375, - "entropy_loss": -0.04949951171875, - "epoch": 0.3668, - "grad_norm": 0.529510333567855, - "k1_kl": 0.0443115234375, - "k3_kl": 0.02978515625, - "kimi_kl": 0.095703125, - "learning_rate": 3.1659999999999996e-07, - "loss": 0.0014, - "ppl": 0.0182647705078125, - "reward": 0.7967961728572845, - "reward_std": 0.0980435349047184, - "rewards/perpo_ocr_edit_distance_reward": 0.7967962324619293, + "advantages": -1.043932843458606e-05, + "completion_length": 636.0, + "delta_ref_entropy_loss": 0.05029296875, + "delta_ref_ppl": -0.06103515625, + "entropy_loss": -0.024169921875, + "epoch": 0.1834, + "grad_norm": 0.5234779250559964, + "k1_kl": 0.06103515625, + "k3_kl": 0.03759765625, + "kimi_kl": 0.107421875, + "learning_rate": 4.0829999999999997e-07, + "loss": 0.0015, + "ppl": 0.01068115234375, + "reward": 0.8216798305511475, + "reward_std": 0.0007162849069572985, + "rewards/perpo_ocr_edit_distance_reward": 0.8216798305511475, "step": 917, "temperature": 0.9 }, { - "advantages": -2.052102932914579e-06, - "completion_length": 865.5, - "delta_ref_entropy_loss": 0.0516357421875, - "delta_ref_ppl": -0.03692626953125, - "entropy_loss": -0.0341796875, - "epoch": 0.3672, - "grad_norm": 0.6030798666078949, - "k1_kl": 0.03704833984375, - "k3_kl": 0.020233154296875, - "kimi_kl": 0.049560546875, - "learning_rate": 3.164e-07, - "loss": 0.0008, - "ppl": 0.014801025390625, - "reward": 0.9345507621765137, - "reward_std": 0.0032018728088587523, - "rewards/perpo_ocr_edit_distance_reward": 0.9345507621765137, - "step": 918, - "temperature": 0.9 - }, - { - "advantages": -6.221873627509922e-05, - "completion_length": 461.0, + "advantages": 0.0, + "completion_length": 1448.0, "delta_ref_entropy_loss": 0.040771484375, - "delta_ref_ppl": -0.050537109375, - "entropy_loss": -0.017608642578125, - "epoch": 0.3676, - "grad_norm": 0.5277566740439262, - "k1_kl": 0.05047607421875, - "k3_kl": 0.0352783203125, - "kimi_kl": 0.1365966796875, - "learning_rate": 3.162e-07, - "loss": 0.0015, - "ppl": 0.008392333984375, - "reward": 0.9989630281925201, - "reward_std": 0.0004397291486384347, - "rewards/perpo_ocr_edit_distance_reward": 0.9989630579948425, + "delta_ref_ppl": -0.0380859375, + "entropy_loss": -0.04345703125, + "epoch": 0.1836, + "grad_norm": 0.7358746288791984, + "k1_kl": 0.038330078125, + "k3_kl": 0.0257568359375, + "kimi_kl": 0.06005859375, + "learning_rate": 4.0819999999999996e-07, + "loss": 0.001, + "ppl": 0.023193359375, + "reward": 0.9943212270736694, + "reward_std": 0.0010251685744151473, + "rewards/perpo_ocr_edit_distance_reward": 0.9943211674690247, + "step": 918, + "temperature": 0.9 + }, + { + "advantages": -1.4994826415204443e-05, + "completion_length": 629.0, + "delta_ref_entropy_loss": 0.2373046875, + "delta_ref_ppl": -0.1328125, + "entropy_loss": -0.23046875, + "epoch": 0.1838, + "grad_norm": 2.6229626497520346, + "k1_kl": 0.1328125, + "k3_kl": 0.0693359375, + "kimi_kl": 0.142578125, + "learning_rate": 4.081e-07, + "loss": 0.0028, + "ppl": 0.1298828125, + "reward": 0.8147951364517212, + "reward_std": 0.0033093371894210577, + "rewards/perpo_ocr_edit_distance_reward": 0.8147951364517212, "step": 919, "temperature": 0.9 }, { - "advantages": -0.00029809985842632614, - "completion_length": 271.0, - "delta_ref_entropy_loss": 0.0560302734375, - "delta_ref_ppl": -0.07891845703125, - "entropy_loss": -0.0203857421875, - "epoch": 0.368, - "grad_norm": 0.5062901279876939, - "k1_kl": 0.07891845703125, - "k3_kl": 0.056732177734375, - "kimi_kl": 0.29522705078125, - "learning_rate": 3.1599999999999997e-07, - "loss": 0.0026, - "ppl": 0.011810302734375, - "reward": 0.9621673226356506, - "reward_std": 0.05629344657063484, - "rewards/perpo_ocr_edit_distance_reward": 0.9621673822402954, + "advantages": -7.672395440749824e-05, + "completion_length": 636.0, + "delta_ref_entropy_loss": 0.042724609375, + "delta_ref_ppl": -0.02734375, + "entropy_loss": -0.050537109375, + "epoch": 0.184, + "grad_norm": 0.22291994512994626, + "k1_kl": 0.02734375, + "k3_kl": 0.01495361328125, + "kimi_kl": 0.029296875, + "learning_rate": 4.0799999999999995e-07, + "loss": 0.0007, + "ppl": 0.0235595703125, + "reward": 0.14338816702365875, + "reward_std": 3.872546585625969e-05, + "rewards/perpo_ocr_edit_distance_reward": 0.14338818192481995, "step": 920, "temperature": 0.9 }, { - "advantages": -5.015305396227632e-05, - "completion_length": 689.5, - "delta_ref_entropy_loss": 0.072509765625, - "delta_ref_ppl": -0.04638671875, - "entropy_loss": -0.0902099609375, - "epoch": 0.3684, - "grad_norm": 0.7326719419125517, - "k1_kl": 0.0465087890625, - "k3_kl": 0.02593994140625, - "kimi_kl": 0.0859375, - "learning_rate": 3.158e-07, - "loss": 0.0011, - "ppl": 0.05364990234375, - "reward": 0.860982745885849, - "reward_std": 0.002860681910533458, - "rewards/perpo_ocr_edit_distance_reward": 0.8609828650951385, + "advantages": -6.0643469623755664e-05, + "completion_length": 1318.0, + "delta_ref_entropy_loss": 0.02490234375, + "delta_ref_ppl": -0.027587890625, + "entropy_loss": -0.047119140625, + "epoch": 0.1842, + "grad_norm": 1.099196981612744, + "k1_kl": 0.027587890625, + "k3_kl": 0.0184326171875, + "kimi_kl": 0.041015625, + "learning_rate": 4.0789999999999994e-07, + "loss": 0.0008, + "ppl": 0.0263671875, + "reward": 0.989680826663971, + "reward_std": 0.0011635569389909506, + "rewards/perpo_ocr_edit_distance_reward": 0.9896809458732605, "step": 921, "temperature": 0.9 }, { - "advantages": -1.701925543784455e-05, - "completion_length": 969.5, - "delta_ref_entropy_loss": 0.05029296875, - "delta_ref_ppl": -0.036865234375, - "entropy_loss": -0.0386962890625, - "epoch": 0.3688, - "grad_norm": 1.704259307140471, - "k1_kl": 0.03692626953125, - "k3_kl": 0.0377197265625, - "kimi_kl": 0.0621337890625, - "learning_rate": 3.156e-07, - "loss": 0.0015, - "ppl": 0.018646240234375, - "reward": 0.9560819268226624, - "reward_std": 0.00449920172104612, - "rewards/perpo_ocr_edit_distance_reward": 0.9560819864273071, + "advantages": -5.051068001193926e-05, + "completion_length": 1172.0, + "delta_ref_entropy_loss": 0.056884765625, + "delta_ref_ppl": -0.04150390625, + "entropy_loss": -0.05419921875, + "epoch": 0.1844, + "grad_norm": 1.207756206063324, + "k1_kl": 0.04150390625, + "k3_kl": 0.0303955078125, + "kimi_kl": 0.0498046875, + "learning_rate": 4.078e-07, + "loss": 0.0013, + "ppl": 0.0308837890625, + "reward": 0.9865738153457642, + "reward_std": 0.0015855402452871203, + "rewards/perpo_ocr_edit_distance_reward": 0.9865739345550537, "step": 922, "temperature": 0.9 }, { - "advantages": -3.2356808787881164e-07, - "completion_length": 666.0, - "delta_ref_entropy_loss": 0.0369873046875, - "delta_ref_ppl": -0.023193359375, - "entropy_loss": -0.028076171875, - "epoch": 0.3692, - "grad_norm": 0.8100654744067387, - "k1_kl": 0.02325439453125, - "k3_kl": 0.01141357421875, - "kimi_kl": 0.01983642578125, - "learning_rate": 3.154e-07, - "loss": 0.0005, - "ppl": 0.01446533203125, - "reward": 0.9544567465782166, - "reward_std": 0.01312936283648014, - "rewards/perpo_ocr_edit_distance_reward": 0.9544567465782166, + "advantages": -6.369182301568799e-06, + "completion_length": 1269.0, + "delta_ref_entropy_loss": 0.0791015625, + "delta_ref_ppl": -0.057373046875, + "entropy_loss": -0.03271484375, + "epoch": 0.1846, + "grad_norm": 1.1042201407486338, + "k1_kl": 0.057373046875, + "k3_kl": 0.026611328125, + "kimi_kl": 0.060546875, + "learning_rate": 4.077e-07, + "loss": 0.0011, + "ppl": 0.01544189453125, + "reward": 0.6756600737571716, + "reward_std": 0.009284310974180698, + "rewards/perpo_ocr_edit_distance_reward": 0.6756601929664612, "step": 923, "temperature": 0.9 }, { - "advantages": -1.7753669681042084e-06, - "completion_length": 208.5, - "delta_ref_entropy_loss": 0.063232421875, - "delta_ref_ppl": -0.0587158203125, - "entropy_loss": -0.020751953125, - "epoch": 0.3696, - "grad_norm": 0.6530289599007392, - "k1_kl": 0.0587158203125, - "k3_kl": 0.04180908203125, - "kimi_kl": 0.16064453125, - "learning_rate": 3.1519999999999996e-07, - "loss": 0.0017, - "ppl": 0.0103912353515625, - "reward": 0.9988554120063782, - "reward_std": 0.001149432617239654, - "rewards/perpo_ocr_edit_distance_reward": 0.998855471611023, + "advantages": -1.2295587112021167e-05, + "completion_length": 588.0, + "delta_ref_entropy_loss": 0.09375, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.1025390625, + "epoch": 0.1848, + "grad_norm": 1.9042790412588109, + "k1_kl": 0.06689453125, + "k3_kl": 0.03515625, + "kimi_kl": 0.08056640625, + "learning_rate": 4.076e-07, + "loss": 0.0014, + "ppl": 0.04638671875, + "reward": 0.6779849529266357, + "reward_std": 0.0033607452642172575, + "rewards/perpo_ocr_edit_distance_reward": 0.6779850721359253, "step": 924, "temperature": 0.9 }, { - "advantages": -3.261225629103137e-06, - "completion_length": 1112.5, - "delta_ref_entropy_loss": 0.02520751953125, - "delta_ref_ppl": -0.017364501953125, - "entropy_loss": -0.03363037109375, - "epoch": 0.37, - "grad_norm": 0.8433192273468664, - "k1_kl": 0.0173187255859375, - "k3_kl": 0.01812744140625, - "kimi_kl": 0.02392578125, - "learning_rate": 3.15e-07, - "loss": 0.0007, - "ppl": 0.02301025390625, - "reward": 0.9953672289848328, - "reward_std": 0.0019017258891835809, - "rewards/perpo_ocr_edit_distance_reward": 0.9953672885894775, + "advantages": -0.00010415486030979082, + "completion_length": 947.0, + "delta_ref_entropy_loss": 0.0498046875, + "delta_ref_ppl": -0.041259765625, + "entropy_loss": -0.0238037109375, + "epoch": 0.185, + "grad_norm": 0.33462673480963623, + "k1_kl": 0.041259765625, + "k3_kl": 0.0228271484375, + "kimi_kl": 0.054443359375, + "learning_rate": 4.0749999999999996e-07, + "loss": 0.001, + "ppl": 0.01007080078125, + "reward": 0.8549990057945251, + "reward_std": 0.0002270457916893065, + "rewards/perpo_ocr_edit_distance_reward": 0.8549990057945251, "step": 925, "temperature": 0.9 }, { - "advantages": -1.6135829639551957e-05, - "completion_length": 599.5, - "delta_ref_entropy_loss": 0.11328125, - "delta_ref_ppl": -0.048583984375, - "entropy_loss": -0.1085205078125, - "epoch": 0.3704, - "grad_norm": 0.9803316215095974, - "k1_kl": 0.04852294921875, - "k3_kl": 0.01959228515625, - "kimi_kl": 0.030975341796875, - "learning_rate": 3.148e-07, + "advantages": -2.2172929675434716e-05, + "completion_length": 800.0, + "delta_ref_entropy_loss": 0.0196533203125, + "delta_ref_ppl": -0.0308837890625, + "entropy_loss": -0.016845703125, + "epoch": 0.1852, + "grad_norm": 0.75461169002473, + "k1_kl": 0.031005859375, + "k3_kl": 0.020263671875, + "kimi_kl": 0.05810546875, + "learning_rate": 4.0739999999999996e-07, "loss": 0.0008, - "ppl": 0.058868408203125, - "reward": 0.9297740757465363, - "reward_std": 0.04780803783796728, - "rewards/perpo_ocr_edit_distance_reward": 0.9297741949558258, + "ppl": 0.007080078125, + "reward": 0.9929183721542358, + "reward_std": 0.004127796273678541, + "rewards/perpo_ocr_edit_distance_reward": 0.9929184913635254, "step": 926, "temperature": 0.9 }, { - "advantages": -8.44682980982725e-06, - "completion_length": 1623.5, - "delta_ref_entropy_loss": 0.0430908203125, - "delta_ref_ppl": -0.0443115234375, - "entropy_loss": -0.04730224609375, - "epoch": 0.3708, - "grad_norm": 1.2978491366832756, - "k1_kl": 0.04443359375, - "k3_kl": 0.02703857421875, - "kimi_kl": 0.07147216796875, - "learning_rate": 3.1459999999999997e-07, - "loss": 0.0011, - "ppl": 0.02313232421875, - "reward": 0.901899516582489, - "reward_std": 0.04297211137600243, - "rewards/perpo_ocr_edit_distance_reward": 0.9018996059894562, + "advantages": -1.1163098861288745e-05, + "completion_length": 349.0, + "delta_ref_entropy_loss": 0.06689453125, + "delta_ref_ppl": -0.058349609375, + "entropy_loss": -0.0299072265625, + "epoch": 0.1854, + "grad_norm": 1.2253513547431978, + "k1_kl": 0.058349609375, + "k3_kl": 0.0311279296875, + "kimi_kl": 0.07763671875, + "learning_rate": 4.073e-07, + "loss": 0.0013, + "ppl": 0.01263427734375, + "reward": 0.9907580018043518, + "reward_std": 0.001424776972271502, + "rewards/perpo_ocr_edit_distance_reward": 0.9907580614089966, "step": 927, "temperature": 0.9 }, { - "advantages": -4.208087989354681e-05, - "completion_length": 416.5, - "delta_ref_entropy_loss": 0.02947998046875, - "delta_ref_ppl": -0.0350341796875, - "entropy_loss": -0.02606201171875, - "epoch": 0.3712, - "grad_norm": 2.4875237435454176, - "k1_kl": 0.03515625, - "k3_kl": 0.035888671875, - "kimi_kl": 0.09808349609375, - "learning_rate": 3.144e-07, - "loss": 0.0015, - "ppl": 0.017059326171875, - "reward": 0.9968634843826294, - "reward_std": 0.0011932130728382617, - "rewards/perpo_ocr_edit_distance_reward": 0.9968635439872742, + "advantages": -8.623940811958164e-05, + "completion_length": 438.0, + "delta_ref_entropy_loss": 0.048828125, + "delta_ref_ppl": -0.043212890625, + "entropy_loss": -0.0220947265625, + "epoch": 0.1856, + "grad_norm": 0.884629323808472, + "k1_kl": 0.04345703125, + "k3_kl": 0.023681640625, + "kimi_kl": 0.0537109375, + "learning_rate": 4.072e-07, + "loss": 0.001, + "ppl": 0.00970458984375, + "reward": 0.9887698888778687, + "reward_std": 0.0009862050646916032, + "rewards/perpo_ocr_edit_distance_reward": 0.9887700080871582, "step": 928, "temperature": 0.9 }, { - "advantages": -5.9404546586705465e-05, - "completion_length": 972.0, - "delta_ref_entropy_loss": 0.02471923828125, - "delta_ref_ppl": -0.02154541015625, - "entropy_loss": -0.046875, - "epoch": 0.3716, - "grad_norm": 2.4885528596275077, - "k1_kl": 0.02154541015625, - "k3_kl": 0.0138397216796875, - "kimi_kl": 0.039947509765625, - "learning_rate": 3.1419999999999994e-07, - "loss": 0.0006, - "ppl": 0.03271484375, - "reward": 0.9378473460674286, - "reward_std": 0.07582300112699158, - "rewards/perpo_ocr_edit_distance_reward": 0.9378474354743958, + "advantages": -3.4059798537100505e-08, + "completion_length": 289.0, + "delta_ref_entropy_loss": 0.10009765625, + "delta_ref_ppl": -0.1552734375, + "entropy_loss": -0.25390625, + "epoch": 0.1858, + "grad_norm": 4.022723707647154, + "k1_kl": 0.1552734375, + "k3_kl": 0.09912109375, + "kimi_kl": 0.2431640625, + "learning_rate": 4.071e-07, + "loss": 0.004, + "ppl": 0.1259765625, + "reward": 0.426356703042984, + "reward_std": 0.13947297632694244, + "rewards/perpo_ocr_edit_distance_reward": 0.4263567328453064, "step": 929, "temperature": 0.9 }, { - "advantages": -2.2238920337258605e-05, - "completion_length": 937.0, - "delta_ref_entropy_loss": 0.0330810546875, - "delta_ref_ppl": -0.03143310546875, - "entropy_loss": -0.0306396484375, - "epoch": 0.372, - "grad_norm": 0.490865357136254, - "k1_kl": 0.03143310546875, - "k3_kl": 0.02044677734375, - "kimi_kl": 0.0970458984375, - "learning_rate": 3.14e-07, - "loss": 0.0008, - "ppl": 0.012969970703125, - "reward": 0.9624385237693787, - "reward_std": 0.0005953207437414676, - "rewards/perpo_ocr_edit_distance_reward": 0.962438553571701, + "advantages": -1.862645149230957e-05, + "completion_length": 296.0, + "delta_ref_entropy_loss": 0.060546875, + "delta_ref_ppl": -0.0791015625, + "entropy_loss": -0.0191650390625, + "epoch": 0.186, + "grad_norm": 0.007102486826857342, + "k1_kl": 0.07861328125, + "k3_kl": 0.054443359375, + "kimi_kl": 0.1845703125, + "learning_rate": 4.07e-07, + "loss": 0.0022, + "ppl": 0.003570556640625, + "reward": 0.022403258830308914, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.022403260692954063, "step": 930, "temperature": 0.9 }, { - "advantages": -2.341611207157257e-06, - "completion_length": 856.5, - "delta_ref_entropy_loss": 0.0284423828125, - "delta_ref_ppl": -0.0223388671875, - "entropy_loss": -0.038360595703125, - "epoch": 0.3724, - "grad_norm": 5.391722102509044, - "k1_kl": 0.022491455078125, - "k3_kl": 0.013671875, - "kimi_kl": 0.034423828125, - "learning_rate": 3.138e-07, - "loss": 0.0005, - "ppl": 0.019681930541992188, - "reward": 0.9634803831577301, - "reward_std": 0.008157515898346901, - "rewards/perpo_ocr_edit_distance_reward": 0.9634804129600525, + "advantages": -4.76837158203125e-07, + "completion_length": 586.0, + "delta_ref_entropy_loss": 0.185546875, + "delta_ref_ppl": -0.12890625, + "entropy_loss": -0.31640625, + "epoch": 0.1862, + "grad_norm": 2.072297930157608, + "k1_kl": 0.12890625, + "k3_kl": 0.07666015625, + "kimi_kl": 0.1455078125, + "learning_rate": 4.0689999999999997e-07, + "loss": 0.0031, + "ppl": 0.169921875, + "reward": 0.41071879863739014, + "reward_std": 0.053987693041563034, + "rewards/perpo_ocr_edit_distance_reward": 0.41071879863739014, "step": 931, "temperature": 0.9 }, { - "advantages": -2.5544849449943285e-05, - "completion_length": 450.5, - "delta_ref_entropy_loss": 0.0478515625, - "delta_ref_ppl": -0.03662109375, - "entropy_loss": -0.0390625, - "epoch": 0.3728, - "grad_norm": 1.1679405014506128, - "k1_kl": 0.0364990234375, - "k3_kl": 0.02252197265625, - "kimi_kl": 0.062744140625, - "learning_rate": 3.1359999999999995e-07, - "loss": 0.0009, - "ppl": 0.0223388671875, - "reward": 0.9394738078117371, - "reward_std": 0.013996845169458538, - "rewards/perpo_ocr_edit_distance_reward": 0.9394739270210266, + "advantages": -1.021793991640152e-07, + "completion_length": 766.0, + "delta_ref_entropy_loss": 0.09326171875, + "delta_ref_ppl": -0.08642578125, + "entropy_loss": -0.1103515625, + "epoch": 0.1864, + "grad_norm": 2.496837080674314, + "k1_kl": 0.0859375, + "k3_kl": 0.051513671875, + "kimi_kl": 0.09912109375, + "learning_rate": 4.0679999999999996e-07, + "loss": 0.0021, + "ppl": 0.05419921875, + "reward": 0.7561601996421814, + "reward_std": 0.23909786343574524, + "rewards/perpo_ocr_edit_distance_reward": 0.7561601996421814, "step": 932, "temperature": 0.9 }, { - "advantages": 2.3933394231789862e-05, - "completion_length": 568.0, - "delta_ref_entropy_loss": 0.05615234375, - "delta_ref_ppl": -0.0341796875, - "entropy_loss": -0.0582275390625, - "epoch": 0.3732, - "grad_norm": 1.143996798232374, - "k1_kl": 0.0340576171875, - "k3_kl": 0.01861572265625, - "kimi_kl": 0.040771484375, - "learning_rate": 3.134e-07, - "loss": 0.0007, - "ppl": 0.03277587890625, - "reward": 0.9766333401203156, - "reward_std": 0.0018767733417917043, - "rewards/perpo_ocr_edit_distance_reward": 0.9766333699226379, + "advantages": -4.231929779052734e-06, + "completion_length": 259.0, + "delta_ref_entropy_loss": 0.08984375, + "delta_ref_ppl": -0.1171875, + "entropy_loss": -0.0947265625, + "epoch": 0.1866, + "grad_norm": 2.187279638019227, + "k1_kl": 0.1171875, + "k3_kl": 0.076171875, + "kimi_kl": 0.1904296875, + "learning_rate": 4.067e-07, + "loss": 0.003, + "ppl": 0.050048828125, + "reward": 0.9727855920791626, + "reward_std": 0.007960467599332333, + "rewards/perpo_ocr_edit_distance_reward": 0.9727856516838074, "step": 933, "temperature": 0.9 }, { - "advantages": -4.576785499921243e-06, - "completion_length": 404.5, - "delta_ref_entropy_loss": 0.1279296875, - "delta_ref_ppl": -0.07269287109375, - "entropy_loss": -0.19512939453125, - "epoch": 0.3736, - "grad_norm": 1.9939071669726223, - "k1_kl": 0.07269287109375, - "k3_kl": 0.040985107421875, - "kimi_kl": 0.10064697265625, - "learning_rate": 3.1319999999999997e-07, - "loss": 0.0016, - "ppl": 0.1169891357421875, - "reward": 0.7166122049093246, - "reward_std": 0.011719272704795003, - "rewards/perpo_ocr_edit_distance_reward": 0.7166122645139694, + "advantages": -2.244540701212827e-05, + "completion_length": 659.0, + "delta_ref_entropy_loss": 0.056396484375, + "delta_ref_ppl": -0.04833984375, + "entropy_loss": -0.02001953125, + "epoch": 0.1868, + "grad_norm": 0.7094474226041434, + "k1_kl": 0.04833984375, + "k3_kl": 0.0247802734375, + "kimi_kl": 0.052490234375, + "learning_rate": 4.066e-07, + "loss": 0.001, + "ppl": 0.0069580078125, + "reward": 0.990976870059967, + "reward_std": 0.0010377209400758147, + "rewards/perpo_ocr_edit_distance_reward": 0.9909769892692566, "step": 934, "temperature": 0.9 }, { - "advantages": -0.00031177486744127236, - "completion_length": 518.0, - "delta_ref_entropy_loss": 0.0460205078125, - "delta_ref_ppl": -0.026123046875, - "entropy_loss": -0.02398681640625, - "epoch": 0.374, - "grad_norm": 0.22302784346295976, - "k1_kl": 0.026123046875, - "k3_kl": 0.01348876953125, - "kimi_kl": 0.032623291015625, - "learning_rate": 3.13e-07, - "loss": 0.0008, - "ppl": 0.0076446533203125, - "reward": 0.9771600365638733, - "reward_std": 0.0004144430859014392, - "rewards/perpo_ocr_edit_distance_reward": 0.9771600663661957, + "advantages": -2.55448497910038e-07, + "completion_length": 1275.0, + "delta_ref_entropy_loss": 0.08251953125, + "delta_ref_ppl": -0.07177734375, + "entropy_loss": -0.1376953125, + "epoch": 0.187, + "grad_norm": 2.6343776407941086, + "k1_kl": 0.07177734375, + "k3_kl": 0.043212890625, + "kimi_kl": 0.08935546875, + "learning_rate": 4.0649999999999994e-07, + "loss": 0.0017, + "ppl": 0.07861328125, + "reward": 0.793215811252594, + "reward_std": 0.0701938271522522, + "rewards/perpo_ocr_edit_distance_reward": 0.7932159304618835, "step": 935, "temperature": 0.9 }, { - "advantages": -3.3932074074982665e-06, - "completion_length": 543.0, - "delta_ref_entropy_loss": 0.046630859375, - "delta_ref_ppl": -0.04071044921875, - "entropy_loss": -0.031829833984375, - "epoch": 0.3744, - "grad_norm": 1.0538204825633062, - "k1_kl": 0.04071044921875, - "k3_kl": 0.024749755859375, - "kimi_kl": 0.07647705078125, - "learning_rate": 3.128e-07, - "loss": 0.001, - "ppl": 0.017375946044921875, - "reward": 0.9897556304931641, - "reward_std": 0.0043336208909749985, - "rewards/perpo_ocr_edit_distance_reward": 0.9897556602954865, + "advantages": -1.6689300537109375e-05, + "completion_length": 857.0, + "delta_ref_entropy_loss": 0.05029296875, + "delta_ref_ppl": -0.042236328125, + "entropy_loss": -0.01361083984375, + "epoch": 0.1872, + "grad_norm": 0.4811379979020994, + "k1_kl": 0.042236328125, + "k3_kl": 0.022705078125, + "kimi_kl": 0.058349609375, + "learning_rate": 4.064e-07, + "loss": 0.0009, + "ppl": 0.00482177734375, + "reward": 0.9906690120697021, + "reward_std": 0.00041078298818320036, + "rewards/perpo_ocr_edit_distance_reward": 0.9906690120697021, "step": 936, "temperature": 0.9 }, { - "advantages": -1.0341406323277624e-05, - "completion_length": 699.5, - "delta_ref_entropy_loss": 0.0574951171875, - "delta_ref_ppl": -0.07513427734375, - "entropy_loss": -0.03973388671875, - "epoch": 0.3748, - "grad_norm": 1.2523036012356745, - "k1_kl": 0.0751953125, - "k3_kl": 0.049560546875, - "kimi_kl": 0.15234375, - "learning_rate": 3.126e-07, - "loss": 0.002, - "ppl": 0.0211181640625, - "reward": 0.98982834815979, - "reward_std": 0.0077692021150141954, - "rewards/perpo_ocr_edit_distance_reward": 0.9898284673690796, + "advantages": -3.222057057428174e-05, + "completion_length": 582.0, + "delta_ref_entropy_loss": 0.04931640625, + "delta_ref_ppl": -0.047607421875, + "entropy_loss": -0.035400390625, + "epoch": 0.1874, + "grad_norm": 0.8855334983381188, + "k1_kl": 0.047607421875, + "k3_kl": 0.0274658203125, + "kimi_kl": 0.057861328125, + "learning_rate": 4.063e-07, + "loss": 0.0011, + "ppl": 0.020263671875, + "reward": 0.981775164604187, + "reward_std": 0.0006931954412721097, + "rewards/perpo_ocr_edit_distance_reward": 0.981775164604187, "step": 937, "temperature": 0.9 }, { - "advantages": -3.772122681766632e-06, - "completion_length": 556.0, - "delta_ref_entropy_loss": 0.02191162109375, - "delta_ref_ppl": -0.024932861328125, - "entropy_loss": -0.036834716796875, - "epoch": 0.3752, - "grad_norm": 0.7306467278867831, - "k1_kl": 0.024749755859375, - "k3_kl": 0.01678466796875, - "kimi_kl": 0.052703857421875, - "learning_rate": 3.124e-07, - "loss": 0.0007, - "ppl": 0.0208587646484375, - "reward": 0.9945054650306702, - "reward_std": 0.000512636557687074, - "rewards/perpo_ocr_edit_distance_reward": 0.9945054948329926, + "advantages": -6.212507287273183e-05, + "completion_length": 881.0, + "delta_ref_entropy_loss": 0.06201171875, + "delta_ref_ppl": -0.044677734375, + "entropy_loss": -0.02783203125, + "epoch": 0.1876, + "grad_norm": 0.8838397733990694, + "k1_kl": 0.044921875, + "k3_kl": 0.0220947265625, + "kimi_kl": 0.04638671875, + "learning_rate": 4.0619999999999997e-07, + "loss": 0.0009, + "ppl": 0.01446533203125, + "reward": 0.9941964149475098, + "reward_std": 0.0014075180515646935, + "rewards/perpo_ocr_edit_distance_reward": 0.9941965341567993, "step": 938, "temperature": 0.9 }, { - "advantages": -1.1116266932731378e-05, - "completion_length": 548.0, - "delta_ref_entropy_loss": 0.0595703125, - "delta_ref_ppl": -0.0433349609375, - "entropy_loss": -0.154296875, - "epoch": 0.3756, - "grad_norm": 2.3108144454911907, - "k1_kl": 0.043212890625, - "k3_kl": 0.0235595703125, - "kimi_kl": 0.0489501953125, - "learning_rate": 3.1219999999999995e-07, - "loss": 0.001, - "ppl": 0.0889892578125, - "reward": 0.9448840022087097, - "reward_std": 0.02343328041024506, - "rewards/perpo_ocr_edit_distance_reward": 0.9448841214179993, + "advantages": -2.6856150725507177e-05, + "completion_length": 285.0, + "delta_ref_entropy_loss": 0.1767578125, + "delta_ref_ppl": -0.1708984375, + "entropy_loss": -0.083984375, + "epoch": 0.1878, + "grad_norm": 2.265841255030018, + "k1_kl": 0.169921875, + "k3_kl": 0.109375, + "kimi_kl": 0.384765625, + "learning_rate": 4.061e-07, + "loss": 0.0044, + "ppl": 0.041259765625, + "reward": 0.9643020629882812, + "reward_std": 0.001485528307966888, + "rewards/perpo_ocr_edit_distance_reward": 0.964302122592926, "step": 939, "temperature": 0.9 }, { - "advantages": -1.3960260275780456e-05, - "completion_length": 1202.0, - "delta_ref_entropy_loss": 0.0377197265625, - "delta_ref_ppl": -0.0391845703125, - "entropy_loss": -0.0858154296875, - "epoch": 0.376, - "grad_norm": 51.80980644723674, - "k1_kl": 0.0391845703125, - "k3_kl": 0.08203125, - "kimi_kl": 0.0777587890625, - "learning_rate": 3.12e-07, - "loss": 0.0033, - "ppl": 0.05499267578125, - "reward": 0.9333036243915558, - "reward_std": 0.005522497580386698, - "rewards/perpo_ocr_edit_distance_reward": 0.933303713798523, + "advantages": -1.1358943083905615e-05, + "completion_length": 522.0, + "delta_ref_entropy_loss": 0.068359375, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.0224609375, + "epoch": 0.188, + "grad_norm": 1.4918040332392615, + "k1_kl": 0.06591796875, + "k3_kl": 0.03515625, + "kimi_kl": 0.1025390625, + "learning_rate": 4.06e-07, + "loss": 0.0014, + "ppl": 0.0086669921875, + "reward": 0.9872942566871643, + "reward_std": 0.005144371651113033, + "rewards/perpo_ocr_edit_distance_reward": 0.9872943162918091, "step": 940, "temperature": 0.9 }, { - "advantages": -1.7587628917681286e-05, - "completion_length": 1019.0, - "delta_ref_entropy_loss": 0.02166748046875, - "delta_ref_ppl": -0.0203857421875, - "entropy_loss": -0.02325439453125, - "epoch": 0.3764, - "grad_norm": 0.7335800970764639, - "k1_kl": 0.0203857421875, - "k3_kl": 0.011688232421875, - "kimi_kl": 0.026275634765625, - "learning_rate": 3.118e-07, - "loss": 0.0005, - "ppl": 0.01220703125, - "reward": 0.9979960918426514, - "reward_std": 0.0010629486641846597, - "rewards/perpo_ocr_edit_distance_reward": 0.9979961216449738, + "advantages": -2.887419395847246e-05, + "completion_length": 913.0, + "delta_ref_entropy_loss": 0.047607421875, + "delta_ref_ppl": -0.04052734375, + "entropy_loss": -0.047607421875, + "epoch": 0.1882, + "grad_norm": 1.322198374972445, + "k1_kl": 0.04052734375, + "k3_kl": 0.0228271484375, + "kimi_kl": 0.053955078125, + "learning_rate": 4.0589999999999995e-07, + "loss": 0.0009, + "ppl": 0.0233154296875, + "reward": 0.9747611880302429, + "reward_std": 0.0004894656594842672, + "rewards/perpo_ocr_edit_distance_reward": 0.9747611880302429, "step": 941, "temperature": 0.9 }, { - "advantages": -3.150531426854286e-07, - "completion_length": 849.0, - "delta_ref_entropy_loss": 0.059814453125, - "delta_ref_ppl": -0.04278564453125, - "entropy_loss": -0.1424560546875, - "epoch": 0.3768, - "grad_norm": 2.9507305479041466, - "k1_kl": 0.04302978515625, - "k3_kl": 0.029571533203125, - "kimi_kl": 0.0606689453125, - "learning_rate": 3.1159999999999996e-07, - "loss": 0.0012, - "ppl": 0.0720367431640625, - "reward": 0.8622352480888367, - "reward_std": 0.03386460617184639, - "rewards/perpo_ocr_edit_distance_reward": 0.8622352480888367, + "advantages": -1.106943500417401e-06, + "completion_length": 675.0, + "delta_ref_entropy_loss": 0.1904296875, + "delta_ref_ppl": -0.12255859375, + "entropy_loss": -0.1416015625, + "epoch": 0.1884, + "grad_norm": 3.0598801555723676, + "k1_kl": 0.123046875, + "k3_kl": 0.08154296875, + "kimi_kl": 0.1767578125, + "learning_rate": 4.058e-07, + "loss": 0.0033, + "ppl": 0.078125, + "reward": 0.7850560545921326, + "reward_std": 0.0765291303396225, + "rewards/perpo_ocr_edit_distance_reward": 0.7850561738014221, "step": 942, "temperature": 0.9 }, { - "advantages": -4.806688957614824e-05, - "completion_length": 469.0, - "delta_ref_entropy_loss": 0.017578125, - "delta_ref_ppl": -0.01578521728515625, - "entropy_loss": -0.01861572265625, - "epoch": 0.3772, - "grad_norm": 0.40172556565115025, - "k1_kl": 0.0157928466796875, - "k3_kl": 0.011060714721679688, - "kimi_kl": 0.04026031494140625, - "learning_rate": 3.114e-07, - "loss": 0.0005, - "ppl": 0.009124755859375, - "reward": 0.999619334936142, - "reward_std": 0.00026004164828918874, - "rewards/perpo_ocr_edit_distance_reward": 0.9996193647384644, + "advantages": -9.645734826335683e-05, + "completion_length": 546.0, + "delta_ref_entropy_loss": 0.05517578125, + "delta_ref_ppl": -0.07080078125, + "entropy_loss": -0.01953125, + "epoch": 0.1886, + "grad_norm": 0.40190468881198654, + "k1_kl": 0.07080078125, + "k3_kl": 0.043212890625, + "kimi_kl": 0.1396484375, + "learning_rate": 4.057e-07, + "loss": 0.0018, + "ppl": 0.01007080078125, + "reward": 0.9969918131828308, + "reward_std": 0.0004297163395676762, + "rewards/perpo_ocr_edit_distance_reward": 0.9969918727874756, "step": 943, "temperature": 0.9 }, { - "advantages": -8.377433550776914e-05, - "completion_length": 439.5, - "delta_ref_entropy_loss": 0.03656005859375, - "delta_ref_ppl": -0.042724609375, - "entropy_loss": -0.009002685546875, - "epoch": 0.3776, - "grad_norm": 0.09366784414579446, - "k1_kl": 0.042999267578125, - "k3_kl": 0.03173828125, - "kimi_kl": 0.13580322265625, - "learning_rate": 3.112e-07, - "loss": 0.0013, - "ppl": 0.00394439697265625, - "reward": 0.9881342053413391, - "reward_std": 0.00010246300371363759, - "rewards/perpo_ocr_edit_distance_reward": 0.9881342351436615, + "advantages": -1.691068973741494e-05, + "completion_length": 239.0, + "delta_ref_entropy_loss": 0.1376953125, + "delta_ref_ppl": -0.1630859375, + "entropy_loss": -0.0927734375, + "epoch": 0.1888, + "grad_norm": 2.0064798903671606, + "k1_kl": 0.1640625, + "k3_kl": 0.10546875, + "kimi_kl": 0.3828125, + "learning_rate": 4.056e-07, + "loss": 0.0042, + "ppl": 0.041748046875, + "reward": 0.3744603395462036, + "reward_std": 0.002417017240077257, + "rewards/perpo_ocr_edit_distance_reward": 0.374460369348526, "step": 944, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 466.5, - "delta_ref_entropy_loss": 0.04107666015625, - "delta_ref_ppl": -0.0487060546875, - "entropy_loss": -0.03045654296875, - "epoch": 0.378, - "grad_norm": 0.8604005813219896, - "k1_kl": 0.04876708984375, - "k3_kl": 0.031402587890625, - "kimi_kl": 0.088226318359375, - "learning_rate": 3.1099999999999997e-07, - "loss": 0.0013, - "ppl": 0.013641357421875, - "reward": 0.9187673330307007, - "reward_std": 0.0011176406114827842, - "rewards/perpo_ocr_edit_distance_reward": 0.9187672734260559, + "advantages": -5.66244170840946e-06, + "completion_length": 374.0, + "delta_ref_entropy_loss": 0.2490234375, + "delta_ref_ppl": -0.162109375, + "entropy_loss": -0.2158203125, + "epoch": 0.189, + "grad_norm": 2.520099677513084, + "k1_kl": 0.1611328125, + "k3_kl": 0.07861328125, + "kimi_kl": 0.158203125, + "learning_rate": 4.055e-07, + "loss": 0.0032, + "ppl": 0.1181640625, + "reward": 0.8151533007621765, + "reward_std": 0.004403210710734129, + "rewards/perpo_ocr_edit_distance_reward": 0.8151533007621765, "step": 945, "temperature": 0.9 }, { - "advantages": -4.661934781324817e-06, - "completion_length": 620.5, - "delta_ref_entropy_loss": 0.0296630859375, - "delta_ref_ppl": -0.02264404296875, - "entropy_loss": -0.0152435302734375, - "epoch": 0.3784, - "grad_norm": 0.46414696207739786, - "k1_kl": 0.02264404296875, - "k3_kl": 0.013458251953125, - "kimi_kl": 0.0445556640625, - "learning_rate": 3.108e-07, - "loss": 0.0005, - "ppl": 0.007305145263671875, - "reward": 0.9828147888183594, - "reward_std": 0.0031473995186388493, - "rewards/perpo_ocr_edit_distance_reward": 0.9828148186206818, + "advantages": -1.7370497289448394e-06, + "completion_length": 265.0, + "delta_ref_entropy_loss": 0.19921875, + "delta_ref_ppl": -0.1591796875, + "entropy_loss": -0.2197265625, + "epoch": 0.1892, + "grad_norm": 3.965705272280568, + "k1_kl": 0.1591796875, + "k3_kl": 0.09423828125, + "kimi_kl": 0.2890625, + "learning_rate": 4.0539999999999996e-07, + "loss": 0.0038, + "ppl": 0.10302734375, + "reward": 0.655068576335907, + "reward_std": 0.03442474827170372, + "rewards/perpo_ocr_edit_distance_reward": 0.6550686359405518, "step": 946, "temperature": 0.9 }, { - "advantages": -2.0410334855114343e-05, - "completion_length": 683.5, - "delta_ref_entropy_loss": 0.091796875, - "delta_ref_ppl": -0.05615234375, - "entropy_loss": -0.12451171875, - "epoch": 0.3788, - "grad_norm": 1.3173917704300309, - "k1_kl": 0.05615234375, - "k3_kl": 0.0306396484375, - "kimi_kl": 0.10986328125, - "learning_rate": 3.106e-07, - "loss": 0.0012, - "ppl": 0.066650390625, - "reward": 0.9261210560798645, - "reward_std": 0.002281857538037002, - "rewards/perpo_ocr_edit_distance_reward": 0.9261211156845093, + "advantages": -1.8392290712654358e-06, + "completion_length": 909.0, + "delta_ref_entropy_loss": 0.04736328125, + "delta_ref_ppl": -0.04248046875, + "entropy_loss": -0.032958984375, + "epoch": 0.1894, + "grad_norm": 1.2381687717700705, + "k1_kl": 0.042724609375, + "k3_kl": 0.025146484375, + "kimi_kl": 0.068359375, + "learning_rate": 4.0529999999999996e-07, + "loss": 0.001, + "ppl": 0.017822265625, + "reward": 0.7028462886810303, + "reward_std": 0.013719587586820126, + "rewards/perpo_ocr_edit_distance_reward": 0.7028462886810303, "step": 947, "temperature": 0.9 }, { - "advantages": -2.767358637356665e-06, - "completion_length": 694.0, - "delta_ref_entropy_loss": 0.0472412109375, - "delta_ref_ppl": -0.03857421875, - "entropy_loss": -0.06427001953125, - "epoch": 0.3792, - "grad_norm": 1.1497545248692556, - "k1_kl": 0.03857421875, - "k3_kl": 0.0234375, - "kimi_kl": 0.0794677734375, - "learning_rate": 3.104e-07, - "loss": 0.0009, - "ppl": 0.0344696044921875, - "reward": 0.9590466022491455, - "reward_std": 0.0068667554296553135, - "rewards/perpo_ocr_edit_distance_reward": 0.9590466618537903, + "advantages": -1.7774957086658105e-05, + "completion_length": 759.0, + "delta_ref_entropy_loss": 0.0771484375, + "delta_ref_ppl": -0.09423828125, + "entropy_loss": -0.09912109375, + "epoch": 0.1896, + "grad_norm": 6.054361250972544, + "k1_kl": 0.09423828125, + "k3_kl": 0.0634765625, + "kimi_kl": 0.1845703125, + "learning_rate": 4.052e-07, + "loss": 0.0026, + "ppl": 0.052978515625, + "reward": 0.9425972104072571, + "reward_std": 0.0022979220375418663, + "rewards/perpo_ocr_edit_distance_reward": 0.9425972700119019, "step": 948, "temperature": 0.9 }, { - "advantages": -0.00011791501947300276, - "completion_length": 686.5, - "delta_ref_entropy_loss": 0.032470703125, - "delta_ref_ppl": -0.019866943359375, - "entropy_loss": -0.020751953125, - "epoch": 0.3796, - "grad_norm": 0.5787466992776511, - "k1_kl": 0.019866943359375, - "k3_kl": 0.0109405517578125, - "kimi_kl": 0.019287109375, - "learning_rate": 3.1019999999999996e-07, - "loss": 0.0006, - "ppl": 0.00921630859375, - "reward": 0.9988990724086761, - "reward_std": 0.0019562252527975943, - "rewards/perpo_ocr_edit_distance_reward": 0.9988991320133209, + "advantages": -8.83851771504851e-06, + "completion_length": 702.0, + "delta_ref_entropy_loss": 0.07470703125, + "delta_ref_ppl": -0.0771484375, + "entropy_loss": -0.039306640625, + "epoch": 0.1898, + "grad_norm": 3.131192987043884, + "k1_kl": 0.07666015625, + "k3_kl": 0.046630859375, + "kimi_kl": 0.140625, + "learning_rate": 4.051e-07, + "loss": 0.0019, + "ppl": 0.0205078125, + "reward": 0.9862150549888611, + "reward_std": 0.001822611317038536, + "rewards/perpo_ocr_edit_distance_reward": 0.9862151145935059, "step": 949, "temperature": 0.9 }, { - "advantages": -3.161174981869408e-05, - "completion_length": 427.5, - "delta_ref_entropy_loss": 0.0416259765625, - "delta_ref_ppl": -0.03399658203125, - "entropy_loss": -0.04229736328125, - "epoch": 0.38, - "grad_norm": 0.9790034883351112, - "k1_kl": 0.0341796875, - "k3_kl": 0.020111083984375, - "kimi_kl": 0.043212890625, - "learning_rate": 3.1e-07, - "loss": 0.0008, - "ppl": 0.0244140625, - "reward": 0.9950050711631775, - "reward_std": 0.004095215699635446, - "rewards/perpo_ocr_edit_distance_reward": 0.9950051605701447, + "advantages": -2.895082786835701e-07, + "completion_length": 413.0, + "delta_ref_entropy_loss": 0.061767578125, + "delta_ref_ppl": -0.0791015625, + "entropy_loss": -0.01153564453125, + "epoch": 0.19, + "grad_norm": 1.7116706989891923, + "k1_kl": 0.0791015625, + "k3_kl": 0.0537109375, + "kimi_kl": 0.189453125, + "learning_rate": 4.05e-07, + "loss": 0.0021, + "ppl": 0.00408935546875, + "reward": 0.9634625911712646, + "reward_std": 0.029721371829509735, + "rewards/perpo_ocr_edit_distance_reward": 0.9634626507759094, "step": 950, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 664.0, - "delta_ref_entropy_loss": 0.02081298828125, - "delta_ref_ppl": -0.01934814453125, - "entropy_loss": -0.012908935546875, - "epoch": 0.3804, - "grad_norm": 0.2859060053299423, - "k1_kl": 0.0194091796875, - "k3_kl": 0.01214599609375, - "kimi_kl": 0.02978515625, - "learning_rate": 3.098e-07, - "loss": 0.0008, - "ppl": 0.0060577392578125, - "reward": 0.999129056930542, - "reward_std": 0.00030709875863976777, - "rewards/perpo_ocr_edit_distance_reward": 0.9991291165351868, + "advantages": 4.180840278422693e-06, + "completion_length": 956.0, + "delta_ref_entropy_loss": 0.158203125, + "delta_ref_ppl": -0.08154296875, + "entropy_loss": -0.2177734375, + "epoch": 0.1902, + "grad_norm": 510.53387174517815, + "k1_kl": 0.08203125, + "k3_kl": 0.81640625, + "kimi_kl": 0.08740234375, + "learning_rate": 4.049e-07, + "loss": 0.0326, + "ppl": 0.1220703125, + "reward": 0.9051522016525269, + "reward_std": 0.006018025800585747, + "rewards/perpo_ocr_edit_distance_reward": 0.9051522016525269, "step": 951, "temperature": 0.9 }, { - "advantages": -1.5991075997590087e-05, - "completion_length": 907.0, - "delta_ref_entropy_loss": 0.03497314453125, - "delta_ref_ppl": -0.016632080078125, - "entropy_loss": -0.030548095703125, - "epoch": 0.3808, - "grad_norm": 0.4494668911523769, - "k1_kl": 0.01654052734375, - "k3_kl": 0.008453369140625, - "kimi_kl": 0.01959228515625, - "learning_rate": 3.0959999999999997e-07, - "loss": 0.0004, - "ppl": 0.01442718505859375, - "reward": 0.9982772767543793, - "reward_std": 0.0010158090153709054, - "rewards/perpo_ocr_edit_distance_reward": 0.9982773065567017, + "advantages": 2.3330962903855834e-06, + "completion_length": 167.0, + "delta_ref_entropy_loss": 0.072265625, + "delta_ref_ppl": -0.109375, + "entropy_loss": -0.041259765625, + "epoch": 0.1904, + "grad_norm": 1.492804488356284, + "k1_kl": 0.10986328125, + "k3_kl": 0.0732421875, + "kimi_kl": 0.1982421875, + "learning_rate": 4.0479999999999997e-07, + "loss": 0.0029, + "ppl": 0.0211181640625, + "reward": 0.9570940732955933, + "reward_std": 0.010945841670036316, + "rewards/perpo_ocr_edit_distance_reward": 0.9570940732955933, "step": 952, "temperature": 0.9 }, { - "advantages": -5.038295681458749e-05, - "completion_length": 192.5, - "delta_ref_entropy_loss": 0.058837890625, - "delta_ref_ppl": -0.064453125, - "entropy_loss": -0.095672607421875, - "epoch": 0.3812, - "grad_norm": 4.645808159801794, - "k1_kl": 0.064453125, - "k3_kl": 0.0361328125, - "kimi_kl": 0.06884765625, - "learning_rate": 3.094e-07, - "loss": 0.0015, - "ppl": 0.0537872314453125, - "reward": 0.8009862899780273, - "reward_std": 0.10243212495697662, - "rewards/perpo_ocr_edit_distance_reward": 0.8009863793849945, + "advantages": 3.4059798537100505e-08, + "completion_length": 306.0, + "delta_ref_entropy_loss": 0.189453125, + "delta_ref_ppl": -0.14453125, + "entropy_loss": -0.1171875, + "epoch": 0.1906, + "grad_norm": 3.323208367188017, + "k1_kl": 0.14453125, + "k3_kl": 0.08056640625, + "kimi_kl": 0.17578125, + "learning_rate": 4.0469999999999996e-07, + "loss": 0.0032, + "ppl": 0.06640625, + "reward": 0.6838297247886658, + "reward_std": 0.16061493754386902, + "rewards/perpo_ocr_edit_distance_reward": 0.6838297247886658, "step": 953, "temperature": 0.9 }, { - "advantages": -1.9184180928277783e-05, - "completion_length": 411.5, - "delta_ref_entropy_loss": 0.01666259765625, - "delta_ref_ppl": -0.010650634765625, - "entropy_loss": -0.0073394775390625, - "epoch": 0.3816, - "grad_norm": 0.23027118903606525, - "k1_kl": 0.010650634765625, - "k3_kl": 0.0068511962890625, - "kimi_kl": 0.0124359130859375, - "learning_rate": 3.0919999999999994e-07, - "loss": 0.0003, - "ppl": 0.00341033935546875, - "reward": 0.999851256608963, - "reward_std": 0.0003935066924896091, - "rewards/perpo_ocr_edit_distance_reward": 0.9998512864112854, + "advantages": 0.0, + "completion_length": 172.0, + "delta_ref_entropy_loss": 0.054931640625, + "delta_ref_ppl": -0.14453125, + "entropy_loss": -0.0196533203125, + "epoch": 0.1908, + "grad_norm": 0.9698955330954293, + "k1_kl": 0.1435546875, + "k3_kl": 0.11181640625, + "kimi_kl": 0.380859375, + "learning_rate": 4.046e-07, + "loss": 0.0045, + "ppl": 0.007232666015625, + "reward": 0.9739181399345398, + "reward_std": 0.002024696674197912, + "rewards/perpo_ocr_edit_distance_reward": 0.9739181995391846, "step": 954, "temperature": 0.9 }, { - "advantages": 6.428788310586242e-07, - "completion_length": 309.0, - "delta_ref_entropy_loss": 0.0692138671875, - "delta_ref_ppl": -0.064208984375, - "entropy_loss": -0.068115234375, - "epoch": 0.382, - "grad_norm": 1.2142731058390885, - "k1_kl": 0.064208984375, - "k3_kl": 0.03857421875, - "kimi_kl": 0.114013671875, - "learning_rate": 3.09e-07, - "loss": 0.0015, - "ppl": 0.03369140625, - "reward": 0.9911467432975769, - "reward_std": 0.0025501588243059814, - "rewards/perpo_ocr_edit_distance_reward": 0.9911467432975769, + "advantages": -3.4059798537100505e-08, + "completion_length": 386.0, + "delta_ref_entropy_loss": 0.05126953125, + "delta_ref_ppl": -0.051513671875, + "entropy_loss": -0.0189208984375, + "epoch": 0.191, + "grad_norm": 1.0108962201588354, + "k1_kl": 0.051513671875, + "k3_kl": 0.031982421875, + "kimi_kl": 0.09814453125, + "learning_rate": 4.045e-07, + "loss": 0.0013, + "ppl": 0.006378173828125, + "reward": 0.9614845514297485, + "reward_std": 0.0020961607806384563, + "rewards/perpo_ocr_edit_distance_reward": 0.9614846110343933, "step": 955, "temperature": 0.9 }, { - "advantages": -3.232061953895027e-05, - "completion_length": 751.5, - "delta_ref_entropy_loss": 0.04052734375, - "delta_ref_ppl": -0.0318603515625, - "entropy_loss": -0.0269775390625, - "epoch": 0.3824, - "grad_norm": 0.8892034167229234, - "k1_kl": 0.031982421875, - "k3_kl": 0.0179443359375, - "kimi_kl": 0.0360107421875, - "learning_rate": 3.088e-07, - "loss": 0.0008, - "ppl": 0.015289306640625, - "reward": 0.99605792760849, - "reward_std": 0.0006839819980086759, - "rewards/perpo_ocr_edit_distance_reward": 0.99605792760849, + "advantages": -6.286587449721992e-05, + "completion_length": 451.0, + "delta_ref_entropy_loss": 0.034912109375, + "delta_ref_ppl": -0.060791015625, + "entropy_loss": -0.0150146484375, + "epoch": 0.1912, + "grad_norm": 0.667650082751411, + "k1_kl": 0.06103515625, + "k3_kl": 0.042236328125, + "kimi_kl": 0.171875, + "learning_rate": 4.0439999999999994e-07, + "loss": 0.0018, + "ppl": 0.005523681640625, + "reward": 0.9965477585792542, + "reward_std": 0.0016609420999884605, + "rewards/perpo_ocr_edit_distance_reward": 0.9965478777885437, "step": 956, "temperature": 0.9 }, { - "advantages": -4.306861592340283e-05, - "completion_length": 677.0, - "delta_ref_entropy_loss": 0.028961181640625, - "delta_ref_ppl": -0.0267333984375, - "entropy_loss": -0.020355224609375, - "epoch": 0.3828, - "grad_norm": 0.19986818635840062, - "k1_kl": 0.0267333984375, - "k3_kl": 0.014892578125, - "kimi_kl": 0.02813720703125, - "learning_rate": 3.086e-07, - "loss": 0.0006, - "ppl": 0.008350372314453125, - "reward": 0.9943269491195679, - "reward_std": 0.00034541491186246276, - "rewards/perpo_ocr_edit_distance_reward": 0.994327038526535, + "advantages": -2.1798271063744323e-06, + "completion_length": 610.0, + "delta_ref_entropy_loss": 0.15625, + "delta_ref_ppl": -0.09521484375, + "entropy_loss": -0.1689453125, + "epoch": 0.1914, + "grad_norm": 2.620233530424873, + "k1_kl": 0.0947265625, + "k3_kl": 0.0537109375, + "kimi_kl": 0.099609375, + "learning_rate": 4.043e-07, + "loss": 0.0021, + "ppl": 0.09033203125, + "reward": 0.8457728624343872, + "reward_std": 0.00767414178699255, + "rewards/perpo_ocr_edit_distance_reward": 0.845772922039032, "step": 957, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 440.0, - "delta_ref_entropy_loss": 0.02703857421875, - "delta_ref_ppl": -0.0147705078125, - "entropy_loss": -0.0103759765625, - "epoch": 0.3832, - "grad_norm": 0.009129631604208007, - "k1_kl": 0.014801025390625, - "k3_kl": 0.0074462890625, - "kimi_kl": 0.019012451171875, - "learning_rate": 3.084e-07, - "loss": 0.0003, - "ppl": 0.002788543701171875, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": -2.6728426746558398e-05, + "completion_length": 449.0, + "delta_ref_entropy_loss": 0.08642578125, + "delta_ref_ppl": -0.0791015625, + "entropy_loss": -0.04541015625, + "epoch": 0.1916, + "grad_norm": 0.8546554345976376, + "k1_kl": 0.0791015625, + "k3_kl": 0.045654296875, + "kimi_kl": 0.1572265625, + "learning_rate": 4.042e-07, + "loss": 0.0019, + "ppl": 0.0169677734375, + "reward": 0.9918754696846008, + "reward_std": 0.00149276084266603, + "rewards/perpo_ocr_edit_distance_reward": 0.9918754696846008, "step": 958, "temperature": 0.9 }, { - "advantages": -2.567257524788147e-06, - "completion_length": 699.0, - "delta_ref_entropy_loss": 0.06549072265625, - "delta_ref_ppl": -0.0594482421875, - "entropy_loss": -0.056884765625, - "epoch": 0.3836, - "grad_norm": 5.444655173737037, - "k1_kl": 0.0596923828125, - "k3_kl": 0.0328369140625, - "kimi_kl": 0.06658935546875, - "learning_rate": 3.0819999999999997e-07, - "loss": 0.0013, - "ppl": 0.02618408203125, - "reward": 0.9929260313510895, - "reward_std": 0.0022143858659546822, - "rewards/perpo_ocr_edit_distance_reward": 0.9929260909557343, + "advantages": -0.00015602793428115547, + "completion_length": 1036.0, + "delta_ref_entropy_loss": 0.0634765625, + "delta_ref_ppl": -0.07861328125, + "entropy_loss": -0.042236328125, + "epoch": 0.1918, + "grad_norm": 0.6412616134190794, + "k1_kl": 0.07861328125, + "k3_kl": 0.043701171875, + "kimi_kl": 0.0966796875, + "learning_rate": 4.0409999999999997e-07, + "loss": 0.0019, + "ppl": 0.01904296875, + "reward": 0.6760563850402832, + "reward_std": 0.000500345544423908, + "rewards/perpo_ocr_edit_distance_reward": 0.676056444644928, "step": 959, "temperature": 0.9 }, { - "advantages": -4.002132664027158e-05, - "completion_length": 1077.5, - "delta_ref_entropy_loss": 0.018524169921875, - "delta_ref_ppl": -0.018280029296875, - "entropy_loss": -0.024169921875, - "epoch": 0.384, - "grad_norm": 0.5275021303037046, - "k1_kl": 0.01824951171875, - "k3_kl": 0.0122222900390625, - "kimi_kl": 0.0398101806640625, - "learning_rate": 3.08e-07, - "loss": 0.0005, - "ppl": 0.012420654296875, - "reward": 0.9980548322200775, - "reward_std": 0.0008693408744875342, - "rewards/perpo_ocr_edit_distance_reward": 0.9980548918247223, + "advantages": -2.6566642645775573e-06, + "completion_length": 291.0, + "delta_ref_entropy_loss": 0.04541015625, + "delta_ref_ppl": -0.068359375, + "entropy_loss": -0.04150390625, + "epoch": 0.192, + "grad_norm": 1.7635625614802872, + "k1_kl": 0.06884765625, + "k3_kl": 0.0439453125, + "kimi_kl": 0.10693359375, + "learning_rate": 4.04e-07, + "loss": 0.0018, + "ppl": 0.0240478515625, + "reward": 0.9864345788955688, + "reward_std": 0.0031126802787184715, + "rewards/perpo_ocr_edit_distance_reward": 0.9864346385002136, "step": 960, "temperature": 0.9 }, { - "advantages": -0.00013529829448089004, - "completion_length": 623.0, - "delta_ref_entropy_loss": 0.041748046875, - "delta_ref_ppl": -0.03387451171875, - "entropy_loss": -0.05126953125, - "epoch": 0.3844, - "grad_norm": 2.516443634650943, - "k1_kl": 0.03387451171875, - "k3_kl": 0.02276611328125, - "kimi_kl": 0.0638427734375, - "learning_rate": 3.078e-07, - "loss": 0.001, - "ppl": 0.027618408203125, - "reward": 0.992254912853241, - "reward_std": 0.0005160593282198533, - "rewards/perpo_ocr_edit_distance_reward": 0.9922550022602081, + "advantages": -0.00013141121598891914, + "completion_length": 955.0, + "delta_ref_entropy_loss": 0.052978515625, + "delta_ref_ppl": -0.04541015625, + "entropy_loss": -0.0205078125, + "epoch": 0.1922, + "grad_norm": 3.964587216306195, + "k1_kl": 0.04541015625, + "k3_kl": 0.02490234375, + "kimi_kl": 0.06787109375, + "learning_rate": 4.0389999999999996e-07, + "loss": 0.0011, + "ppl": 0.00921630859375, + "reward": 0.9935857057571411, + "reward_std": 0.00028878243756480515, + "rewards/perpo_ocr_edit_distance_reward": 0.9935857653617859, "step": 961, "temperature": 0.9 }, { - "advantages": -8.343373337993398e-05, - "completion_length": 452.0, - "delta_ref_entropy_loss": 0.0267333984375, - "delta_ref_ppl": -0.013458251953125, - "entropy_loss": -0.015472412109375, - "epoch": 0.3848, - "grad_norm": 0.37536188152783784, - "k1_kl": 0.013427734375, - "k3_kl": 0.005645751953125, - "kimi_kl": 0.0091094970703125, - "learning_rate": 3.076e-07, - "loss": 0.0003, - "ppl": 0.007171630859375, - "reward": 0.9922555983066559, - "reward_std": 0.01978408700961154, - "rewards/perpo_ocr_edit_distance_reward": 0.9922556579113007, + "advantages": 1.355580025119707e-05, + "completion_length": 626.0, + "delta_ref_entropy_loss": 0.05810546875, + "delta_ref_ppl": -0.0595703125, + "entropy_loss": -0.0103759765625, + "epoch": 0.1924, + "grad_norm": 0.6904870688346765, + "k1_kl": 0.0595703125, + "k3_kl": 0.03515625, + "kimi_kl": 0.091796875, + "learning_rate": 4.0379999999999995e-07, + "loss": 0.0014, + "ppl": 0.00469970703125, + "reward": 0.9969414472579956, + "reward_std": 0.0005281754420138896, + "rewards/perpo_ocr_edit_distance_reward": 0.9969414472579956, "step": 962, "temperature": 0.9 }, { - "advantages": -5.643708664138103e-05, - "completion_length": 1336.0, - "delta_ref_entropy_loss": 0.015350341796875, - "delta_ref_ppl": -0.00836181640625, - "entropy_loss": -0.012237548828125, - "epoch": 0.3852, - "grad_norm": 0.18470649104799233, - "k1_kl": 0.00836181640625, - "k3_kl": 0.00377655029296875, - "kimi_kl": 0.0068359375, - "learning_rate": 3.074e-07, - "loss": 0.0002, - "ppl": 0.0048675537109375, - "reward": 0.9986439049243927, - "reward_std": 0.0009164507500827312, - "rewards/perpo_ocr_edit_distance_reward": 0.9986439347267151, + "advantages": -0.0005960464477539062, + "completion_length": 96.0, + "delta_ref_entropy_loss": 0.09130859375, + "delta_ref_ppl": -0.2451171875, + "entropy_loss": -0.0198974609375, + "epoch": 0.1926, + "grad_norm": 0.04078955850137679, + "k1_kl": 0.2451171875, + "k3_kl": 0.19140625, + "kimi_kl": 0.8125, + "learning_rate": 4.037e-07, + "loss": 0.0083, + "ppl": 0.0034637451171875, + "reward": 0.9937238693237305, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9937239289283752, "step": 963, "temperature": 0.9 }, { - "advantages": -4.464387950520177e-05, - "completion_length": 696.5, - "delta_ref_entropy_loss": 0.04150390625, - "delta_ref_ppl": -0.05120849609375, - "entropy_loss": -0.02655029296875, - "epoch": 0.3856, - "grad_norm": 1.4405019932299672, - "k1_kl": 0.0511474609375, - "k3_kl": 0.03546142578125, - "kimi_kl": 0.12628173828125, - "learning_rate": 3.0719999999999995e-07, - "loss": 0.0015, - "ppl": 0.01177978515625, - "reward": 0.9475600123405457, - "reward_std": 0.01285807733074762, - "rewards/perpo_ocr_edit_distance_reward": 0.9475600719451904, + "advantages": -0.0005960464477539062, + "completion_length": 95.0, + "delta_ref_entropy_loss": 0.11328125, + "delta_ref_ppl": -0.2294921875, + "entropy_loss": -0.025634765625, + "epoch": 0.1928, + "grad_norm": 0.02915933494125959, + "k1_kl": 0.228515625, + "k3_kl": 0.1640625, + "kimi_kl": 0.5859375, + "learning_rate": 4.036e-07, + "loss": 0.0072, + "ppl": 0.006317138671875, + "reward": 0.9848484396934509, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9848485589027405, "step": 964, "temperature": 0.9 }, { - "advantages": -4.0948393234430114e-05, - "completion_length": 661.5, - "delta_ref_entropy_loss": 0.065185546875, - "delta_ref_ppl": -0.0401611328125, - "entropy_loss": -0.068603515625, - "epoch": 0.386, - "grad_norm": 0.8770132894330945, - "k1_kl": 0.0401611328125, - "k3_kl": 0.02252197265625, - "kimi_kl": 0.0711669921875, - "learning_rate": 3.07e-07, - "loss": 0.0009, - "ppl": 0.034210205078125, - "reward": 0.9756314754486084, - "reward_std": 0.001915348184411414, - "rewards/perpo_ocr_edit_distance_reward": 0.9756315350532532, + "advantages": -7.6379101301427e-06, + "completion_length": 46.0, + "delta_ref_entropy_loss": 0.11865234375, + "delta_ref_ppl": -0.5390625, + "entropy_loss": -0.1982421875, + "epoch": 0.193, + "grad_norm": 7.783040298647777, + "k1_kl": 0.5390625, + "k3_kl": 0.419921875, + "kimi_kl": 1.6484375, + "learning_rate": 4.0350000000000003e-07, + "loss": 0.0168, + "ppl": 0.06591796875, + "reward": 0.5731497406959534, + "reward_std": 0.008818354457616806, + "rewards/perpo_ocr_edit_distance_reward": 0.5731498003005981, "step": 965, "temperature": 0.9 }, { - "advantages": -6.159714484965662e-05, - "completion_length": 497.0, - "delta_ref_entropy_loss": 0.044189453125, - "delta_ref_ppl": -0.06634521484375, - "entropy_loss": -0.04443359375, - "epoch": 0.3864, - "grad_norm": 1.0802742305823558, - "k1_kl": 0.06634521484375, - "k3_kl": 0.046905517578125, - "kimi_kl": 0.1580810546875, - "learning_rate": 3.068e-07, - "loss": 0.0019, - "ppl": 0.020050048828125, - "reward": 0.8156650960445404, - "reward_std": 0.001419051637640223, - "rewards/perpo_ocr_edit_distance_reward": 0.8156651556491852, + "advantages": 6.308726005954668e-05, + "completion_length": 1493.0, + "delta_ref_entropy_loss": 0.0228271484375, + "delta_ref_ppl": -0.03271484375, + "entropy_loss": -0.0517578125, + "epoch": 0.1932, + "grad_norm": 0.6041375257191144, + "k1_kl": 0.03271484375, + "k3_kl": 0.021728515625, + "kimi_kl": 0.052978515625, + "learning_rate": 4.0339999999999997e-07, + "loss": 0.0008, + "ppl": 0.0264892578125, + "reward": 0.9831398129463196, + "reward_std": 0.0003048814251087606, + "rewards/perpo_ocr_edit_distance_reward": 0.9831398129463196, "step": 966, "temperature": 0.9 }, { - "advantages": -0.0004202468117000535, - "completion_length": 492.5, - "delta_ref_entropy_loss": 0.0340576171875, - "delta_ref_ppl": -0.02801513671875, - "entropy_loss": -0.02587890625, - "epoch": 0.3868, - "grad_norm": 0.42582158640429507, - "k1_kl": 0.02801513671875, - "k3_kl": 0.017547607421875, - "kimi_kl": 0.0472412109375, - "learning_rate": 3.0659999999999995e-07, - "loss": 0.0011, - "ppl": 0.0123138427734375, - "reward": 0.997447669506073, - "reward_std": 0.00015901718870736659, - "rewards/perpo_ocr_edit_distance_reward": 0.9974477887153625, + "advantages": -2.7247839170740917e-05, + "completion_length": 910.0, + "delta_ref_entropy_loss": 0.03466796875, + "delta_ref_ppl": -0.03125, + "entropy_loss": -0.037353515625, + "epoch": 0.1934, + "grad_norm": 1.0871975393597901, + "k1_kl": 0.03125, + "k3_kl": 0.02001953125, + "kimi_kl": 0.053955078125, + "learning_rate": 4.0329999999999997e-07, + "loss": 0.0008, + "ppl": 0.01953125, + "reward": 0.9960826635360718, + "reward_std": 0.0005249575478956103, + "rewards/perpo_ocr_edit_distance_reward": 0.9960826635360718, "step": 967, "temperature": 0.9 }, { - "advantages": 1.0473388556420105e-06, - "completion_length": 1565.5, - "delta_ref_entropy_loss": 0.0269775390625, - "delta_ref_ppl": -0.01458740234375, - "entropy_loss": -0.05157470703125, - "epoch": 0.3872, - "grad_norm": 12.037690815015557, - "k1_kl": 0.0145416259765625, - "k3_kl": 0.020275115966796875, - "kimi_kl": 0.02266693115234375, - "learning_rate": 3.064e-07, - "loss": 0.0008, - "ppl": 0.0347900390625, - "reward": 0.9312343001365662, - "reward_std": 0.004005388356745243, - "rewards/perpo_ocr_edit_distance_reward": 0.9312343001365662, + "advantages": -7.714544153714087e-06, + "completion_length": 1156.0, + "delta_ref_entropy_loss": 0.1357421875, + "delta_ref_ppl": -0.08740234375, + "entropy_loss": -0.1455078125, + "epoch": 0.1936, + "grad_norm": 2.226006122494948, + "k1_kl": 0.08740234375, + "k3_kl": 0.044189453125, + "kimi_kl": 0.10546875, + "learning_rate": 4.032e-07, + "loss": 0.0018, + "ppl": 0.0810546875, + "reward": 0.8114335536956787, + "reward_std": 0.004313231445848942, + "rewards/perpo_ocr_edit_distance_reward": 0.8114336133003235, "step": 968, "temperature": 0.9 }, { - "advantages": -6.378974467224907e-05, - "completion_length": 662.0, - "delta_ref_entropy_loss": 0.02484130859375, - "delta_ref_ppl": -0.0218505859375, - "entropy_loss": -0.02337646484375, - "epoch": 0.3876, - "grad_norm": 0.8894592912135278, - "k1_kl": 0.0218658447265625, - "k3_kl": 0.0127105712890625, - "kimi_kl": 0.03334808349609375, - "learning_rate": 3.0620000000000003e-07, - "loss": 0.0006, - "ppl": 0.0105438232421875, - "reward": 0.986860990524292, - "reward_std": 0.001315788395004347, - "rewards/perpo_ocr_edit_distance_reward": 0.9868611097335815, + "advantages": -9.366444686520481e-08, + "completion_length": 712.0, + "delta_ref_entropy_loss": 0.2236328125, + "delta_ref_ppl": -0.138671875, + "entropy_loss": -0.2119140625, + "epoch": 0.1938, + "grad_norm": 34.71215885690683, + "k1_kl": 0.138671875, + "k3_kl": 0.09228515625, + "kimi_kl": 0.201171875, + "learning_rate": 4.031e-07, + "loss": 0.0037, + "ppl": 0.126953125, + "reward": 0.5306037664413452, + "reward_std": 0.228570356965065, + "rewards/perpo_ocr_edit_distance_reward": 0.5306037664413452, "step": 969, "temperature": 0.9 }, { - "advantages": -0.00010797807772178203, - "completion_length": 876.0, - "delta_ref_entropy_loss": 0.025146484375, - "delta_ref_ppl": -0.018768310546875, - "entropy_loss": -0.018798828125, - "epoch": 0.388, - "grad_norm": 0.7637834070162685, - "k1_kl": 0.01873779296875, - "k3_kl": 0.011688232421875, - "kimi_kl": 0.030059814453125, - "learning_rate": 3.0599999999999996e-07, - "loss": 0.0006, - "ppl": 0.009185791015625, - "reward": 0.9958246350288391, - "reward_std": 0.00045152840903028846, - "rewards/perpo_ocr_edit_distance_reward": 0.9958247244358063, + "advantages": -3.11647163471207e-05, + "completion_length": 749.0, + "delta_ref_entropy_loss": 0.033935546875, + "delta_ref_ppl": -0.04345703125, + "entropy_loss": -0.01708984375, + "epoch": 0.194, + "grad_norm": 0.6009541808561342, + "k1_kl": 0.04345703125, + "k3_kl": 0.0272216796875, + "kimi_kl": 0.07421875, + "learning_rate": 4.03e-07, + "loss": 0.0011, + "ppl": 0.006378173828125, + "reward": 0.9939053654670715, + "reward_std": 0.0009927296778187156, + "rewards/perpo_ocr_edit_distance_reward": 0.9939053654670715, "step": 970, "temperature": 0.9 }, { - "advantages": -2.3122345737647265e-05, - "completion_length": 615.5, - "delta_ref_entropy_loss": 0.0599365234375, - "delta_ref_ppl": -0.0379638671875, - "entropy_loss": -0.04986572265625, - "epoch": 0.3884, - "grad_norm": 0.6228592733860726, - "k1_kl": 0.037841796875, - "k3_kl": 0.022064208984375, - "kimi_kl": 0.071044921875, - "learning_rate": 3.058e-07, - "loss": 0.0009, - "ppl": 0.026031494140625, - "reward": 0.9725496470928192, - "reward_std": 0.000659901270410046, - "rewards/perpo_ocr_edit_distance_reward": 0.972549706697464, + "advantages": -3.4144948131142883e-06, + "completion_length": 2028.0, + "delta_ref_entropy_loss": 0.042236328125, + "delta_ref_ppl": -0.0311279296875, + "entropy_loss": -0.042724609375, + "epoch": 0.1942, + "grad_norm": 2.26141106373454, + "k1_kl": 0.0311279296875, + "k3_kl": 0.043701171875, + "kimi_kl": 0.04833984375, + "learning_rate": 4.029e-07, + "loss": 0.0017, + "ppl": 0.0272216796875, + "reward": 0.8592205047607422, + "reward_std": 0.019945070147514343, + "rewards/perpo_ocr_edit_distance_reward": 0.859220564365387, "step": 971, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 488.0, - "delta_ref_entropy_loss": 0.01971435546875, - "delta_ref_ppl": -0.015472412109375, - "entropy_loss": -0.01025390625, - "epoch": 0.3888, - "grad_norm": 0.0205170607086345, - "k1_kl": 0.01556396484375, - "k3_kl": 0.01016998291015625, - "kimi_kl": 0.0507049560546875, - "learning_rate": 3.056e-07, - "loss": 0.0007, - "ppl": 0.00445556640625, - "reward": 0.9996664226055145, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9996664822101593, + "advantages": -2.806527481880039e-05, + "completion_length": 231.0, + "delta_ref_entropy_loss": 0.09423828125, + "delta_ref_ppl": -0.13671875, + "entropy_loss": -0.0751953125, + "epoch": 0.1944, + "grad_norm": 1.8415673889453705, + "k1_kl": 0.13671875, + "k3_kl": 0.08984375, + "kimi_kl": 0.263671875, + "learning_rate": 4.028e-07, + "loss": 0.0036, + "ppl": 0.032470703125, + "reward": 0.9225512146949768, + "reward_std": 0.0026303024496883154, + "rewards/perpo_ocr_edit_distance_reward": 0.9225513339042664, "step": 972, "temperature": 0.9 }, { - "advantages": -9.272567331208847e-05, - "completion_length": 1111.5, - "delta_ref_entropy_loss": 0.019989013671875, - "delta_ref_ppl": -0.0247802734375, - "entropy_loss": -0.02587890625, - "epoch": 0.3892, - "grad_norm": 0.3200102846097915, - "k1_kl": 0.02484130859375, - "k3_kl": 0.0177001953125, - "kimi_kl": 0.0531005859375, - "learning_rate": 3.0539999999999997e-07, - "loss": 0.0008, - "ppl": 0.014892578125, - "reward": 0.9968531727790833, - "reward_std": 0.0006493481196230277, - "rewards/perpo_ocr_edit_distance_reward": 0.9968532621860504, + "advantages": -4.884174995822832e-05, + "completion_length": 817.0, + "delta_ref_entropy_loss": 0.0439453125, + "delta_ref_ppl": -0.05224609375, + "entropy_loss": -0.052734375, + "epoch": 0.1946, + "grad_norm": 1.1090783540373692, + "k1_kl": 0.052001953125, + "k3_kl": 0.030517578125, + "kimi_kl": 0.06689453125, + "learning_rate": 4.0269999999999997e-07, + "loss": 0.0013, + "ppl": 0.0301513671875, + "reward": 0.9748119115829468, + "reward_std": 0.0018175962613895535, + "rewards/perpo_ocr_edit_distance_reward": 0.9748120307922363, "step": 973, "temperature": 0.9 }, { - "advantages": 5.023820222049835e-07, - "completion_length": 485.5, - "delta_ref_entropy_loss": 0.0450439453125, - "delta_ref_ppl": -0.02960205078125, - "entropy_loss": -0.0384521484375, - "epoch": 0.3896, - "grad_norm": 0.6739430937142386, - "k1_kl": 0.029541015625, - "k3_kl": 0.0167236328125, - "kimi_kl": 0.04547119140625, - "learning_rate": 3.052e-07, - "loss": 0.0007, - "ppl": 0.0216064453125, - "reward": 0.2245209626853466, - "reward_std": 0.004365775079349987, - "rewards/perpo_ocr_edit_distance_reward": 0.2245209775865078, + "advantages": -2.393958311586175e-05, + "completion_length": 160.0, + "delta_ref_entropy_loss": 0.0947265625, + "delta_ref_ppl": -0.130859375, + "entropy_loss": -0.09423828125, + "epoch": 0.1948, + "grad_norm": 1.9634148505680928, + "k1_kl": 0.1318359375, + "k3_kl": 0.08837890625, + "kimi_kl": 0.2431640625, + "learning_rate": 4.026e-07, + "loss": 0.0036, + "ppl": 0.031005859375, + "reward": 0.9606624841690063, + "reward_std": 0.0023906887508928776, + "rewards/perpo_ocr_edit_distance_reward": 0.9606626033782959, "step": 974, "temperature": 0.9 }, { - "advantages": -3.2356808787881164e-07, - "completion_length": 1047.0, - "delta_ref_entropy_loss": 0.0450439453125, - "delta_ref_ppl": -0.02752685546875, - "entropy_loss": -0.050048828125, - "epoch": 0.39, - "grad_norm": 1.3923516417374824, - "k1_kl": 0.0274658203125, - "k3_kl": 0.01593017578125, - "kimi_kl": 0.03228759765625, - "learning_rate": 3.05e-07, - "loss": 0.0006, - "ppl": 0.027099609375, - "reward": 0.9332287907600403, - "reward_std": 0.010511243948712945, - "rewards/perpo_ocr_edit_distance_reward": 0.9332288205623627, + "advantages": -2.430166568956338e-05, + "completion_length": 486.0, + "delta_ref_entropy_loss": 0.034423828125, + "delta_ref_ppl": -0.06396484375, + "entropy_loss": -0.036865234375, + "epoch": 0.195, + "grad_norm": 1.7822892055924908, + "k1_kl": 0.064453125, + "k3_kl": 0.042236328125, + "kimi_kl": 0.111328125, + "learning_rate": 4.025e-07, + "loss": 0.0017, + "ppl": 0.0191650390625, + "reward": 0.9933202266693115, + "reward_std": 0.0023535063955932856, + "rewards/perpo_ocr_edit_distance_reward": 0.9933202266693115, "step": 975, "temperature": 0.9 }, { - "advantages": -1.8647739352672943e-06, - "completion_length": 1733.0, - "delta_ref_entropy_loss": 0.0616455078125, - "delta_ref_ppl": -0.042236328125, - "entropy_loss": -0.09912109375, - "epoch": 0.3904, - "grad_norm": 0.8037159083556076, - "k1_kl": 0.0423583984375, - "k3_kl": 0.02313232421875, - "kimi_kl": 0.07269287109375, - "learning_rate": 3.048e-07, - "loss": 0.0009, - "ppl": 0.0494384765625, - "reward": 0.8987597227096558, - "reward_std": 0.1032545049674809, - "rewards/perpo_ocr_edit_distance_reward": 0.8987597823143005, + "advantages": -8.514949740856537e-07, + "completion_length": 181.0, + "delta_ref_entropy_loss": 0.1201171875, + "delta_ref_ppl": -0.1708984375, + "entropy_loss": -0.11083984375, + "epoch": 0.1952, + "grad_norm": 2.8288401776648135, + "k1_kl": 0.1708984375, + "k3_kl": 0.11328125, + "kimi_kl": 0.38671875, + "learning_rate": 4.0239999999999995e-07, + "loss": 0.0045, + "ppl": 0.04345703125, + "reward": 0.8652451634407043, + "reward_std": 0.020203810185194016, + "rewards/perpo_ocr_edit_distance_reward": 0.8652452230453491, "step": 976, "temperature": 0.9 }, { - "advantages": -0.0003279745596955763, - "completion_length": 237.5, - "delta_ref_entropy_loss": 0.0565185546875, - "delta_ref_ppl": -0.0555419921875, - "entropy_loss": -0.02301025390625, - "epoch": 0.3908, - "grad_norm": 0.8427391607676448, - "k1_kl": 0.0555419921875, - "k3_kl": 0.03558349609375, - "kimi_kl": 0.111083984375, - "learning_rate": 3.0459999999999996e-07, - "loss": 0.0018, - "ppl": 0.0087432861328125, - "reward": 0.9895922243595123, - "reward_std": 0.0005898021627217531, - "rewards/perpo_ocr_edit_distance_reward": 0.9895923435688019, + "advantages": 2.4693355271665496e-07, + "completion_length": 7.0, + "delta_ref_entropy_loss": 0.14453125, + "delta_ref_ppl": -2.828125, + "entropy_loss": -0.48828125, + "epoch": 0.1954, + "grad_norm": 31.656479521001344, + "k1_kl": 2.828125, + "k3_kl": 2.5, + "kimi_kl": 11.1875, + "learning_rate": 4.023e-07, + "loss": 0.0996, + "ppl": 0.1845703125, + "reward": 0.03822937607765198, + "reward_std": 0.002216325607150793, + "rewards/perpo_ocr_edit_distance_reward": 0.03822937607765198, "step": 977, "temperature": 0.9 }, { - "advantages": -5.46659762079571e-06, - "completion_length": 786.0, - "delta_ref_entropy_loss": 0.103759765625, - "delta_ref_ppl": -0.076171875, - "entropy_loss": -0.11083984375, - "epoch": 0.3912, - "grad_norm": 1430.6861999648515, - "k1_kl": 0.075927734375, - "k3_kl": 4.687744140625, - "kimi_kl": 0.12890625, - "learning_rate": 3.044e-07, - "loss": 0.188, - "ppl": 0.0633544921875, - "reward": 0.8833534717559814, - "reward_std": 0.1210885135224089, - "rewards/perpo_ocr_edit_distance_reward": 0.8833535015583038, + "advantages": -6.326607490336755e-06, + "completion_length": 408.0, + "delta_ref_entropy_loss": 0.1708984375, + "delta_ref_ppl": -0.1533203125, + "entropy_loss": -0.1513671875, + "epoch": 0.1956, + "grad_norm": 2.516851688834601, + "k1_kl": 0.15234375, + "k3_kl": 0.09326171875, + "kimi_kl": 0.298828125, + "learning_rate": 4.022e-07, + "loss": 0.0037, + "ppl": 0.080078125, + "reward": 0.8029137849807739, + "reward_std": 0.00795720610767603, + "rewards/perpo_ocr_edit_distance_reward": 0.8029138445854187, "step": 978, "temperature": 0.9 }, { - "advantages": -7.578304916933121e-07, - "completion_length": 505.5, - "delta_ref_entropy_loss": 0.0379638671875, - "delta_ref_ppl": -0.03082275390625, - "entropy_loss": -0.01922607421875, - "epoch": 0.3916, - "grad_norm": 0.6144377722659317, - "k1_kl": 0.03094482421875, - "k3_kl": 0.019134521484375, - "kimi_kl": 0.08038330078125, - "learning_rate": 3.0420000000000004e-07, - "loss": 0.0008, - "ppl": 0.00865936279296875, - "reward": 0.9865180850028992, - "reward_std": 0.011211633682250977, - "rewards/perpo_ocr_edit_distance_reward": 0.9865180850028992, + "advantages": -0.0005960464477539062, + "completion_length": 442.0, + "delta_ref_entropy_loss": 0.048828125, + "delta_ref_ppl": -0.058837890625, + "entropy_loss": -0.0164794921875, + "epoch": 0.1958, + "grad_norm": 0.016425825960995465, + "k1_kl": 0.05859375, + "k3_kl": 0.033447265625, + "kimi_kl": 0.103515625, + "learning_rate": 4.021e-07, + "loss": 0.0019, + "ppl": 0.0023040771484375, + "reward": 0.9982134699821472, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.998213529586792, "step": 979, "temperature": 0.9 }, { - "advantages": -3.9322038730915665e-05, - "completion_length": 1145.5, - "delta_ref_entropy_loss": 0.025390625, - "delta_ref_ppl": -0.01495361328125, - "entropy_loss": -0.0439453125, - "epoch": 0.392, - "grad_norm": 0.5986094823973972, - "k1_kl": 0.01495361328125, - "k3_kl": 0.0083770751953125, - "kimi_kl": 0.0126953125, - "learning_rate": 3.0399999999999997e-07, - "loss": 0.0004, - "ppl": 0.020263671875, - "reward": 0.944611519575119, - "reward_std": 0.00940934925165493, - "rewards/perpo_ocr_edit_distance_reward": 0.9446115493774414, + "advantages": -3.549030952854082e-05, + "completion_length": 395.0, + "delta_ref_entropy_loss": 0.0732421875, + "delta_ref_ppl": -0.0849609375, + "entropy_loss": -0.03173828125, + "epoch": 0.196, + "grad_norm": 1.0001275376342287, + "k1_kl": 0.0849609375, + "k3_kl": 0.0556640625, + "kimi_kl": 0.16015625, + "learning_rate": 4.02e-07, + "loss": 0.0023, + "ppl": 0.015625, + "reward": 0.9869794249534607, + "reward_std": 0.002058698795735836, + "rewards/perpo_ocr_edit_distance_reward": 0.9869794845581055, "step": 980, "temperature": 0.9 }, { - "advantages": -1.2687275230405248e-06, - "completion_length": 408.0, - "delta_ref_entropy_loss": 0.0528564453125, - "delta_ref_ppl": -0.0555419921875, - "entropy_loss": -0.0557861328125, - "epoch": 0.3924, - "grad_norm": 1.5418486236009432, - "k1_kl": 0.0555419921875, - "k3_kl": 0.035888671875, - "kimi_kl": 0.1005859375, - "learning_rate": 3.038e-07, - "loss": 0.0014, - "ppl": 0.0252685546875, - "reward": 0.2841249331831932, - "reward_std": 0.12273251684382558, - "rewards/perpo_ocr_edit_distance_reward": 0.2841249704360962, + "advantages": -6.816217501182109e-05, + "completion_length": 817.0, + "delta_ref_entropy_loss": 0.041015625, + "delta_ref_ppl": -0.03857421875, + "entropy_loss": -0.0211181640625, + "epoch": 0.1962, + "grad_norm": 0.6917814998306134, + "k1_kl": 0.03857421875, + "k3_kl": 0.021728515625, + "kimi_kl": 0.06103515625, + "learning_rate": 4.0189999999999997e-07, + "loss": 0.0009, + "ppl": 0.00921630859375, + "reward": 0.9962077140808105, + "reward_std": 0.0005245510255917907, + "rewards/perpo_ocr_edit_distance_reward": 0.9962077140808105, "step": 981, "temperature": 0.9 }, { - "advantages": -7.634259554833989e-05, - "completion_length": 484.0, - "delta_ref_entropy_loss": 0.0350341796875, - "delta_ref_ppl": -0.0440673828125, - "entropy_loss": -0.03253173828125, - "epoch": 0.3928, - "grad_norm": 0.5122069764384644, - "k1_kl": 0.0440673828125, - "k3_kl": 0.0311279296875, - "kimi_kl": 0.09814453125, - "learning_rate": 3.036e-07, - "loss": 0.0013, - "ppl": 0.016693115234375, - "reward": 0.98377326130867, - "reward_std": 0.0007225992885651067, - "rewards/perpo_ocr_edit_distance_reward": 0.9837733209133148, + "advantages": -9.67298274190398e-06, + "completion_length": 698.0, + "delta_ref_entropy_loss": 0.16015625, + "delta_ref_ppl": -0.1484375, + "entropy_loss": -0.10107421875, + "epoch": 0.1964, + "grad_norm": 4.216701574882819, + "k1_kl": 0.1484375, + "k3_kl": 0.08544921875, + "kimi_kl": 0.28125, + "learning_rate": 4.0179999999999996e-07, + "loss": 0.0034, + "ppl": 0.058837890625, + "reward": 0.9124419093132019, + "reward_std": 0.003420510096475482, + "rewards/perpo_ocr_edit_distance_reward": 0.9124420285224915, "step": 982, "temperature": 0.9 }, { - "advantages": -2.2522041831507522e-06, - "completion_length": 330.0, - "delta_ref_entropy_loss": 0.09033203125, - "delta_ref_ppl": -0.231201171875, - "entropy_loss": -0.2109375, - "epoch": 0.3932, - "grad_norm": 6.215054655036141, - "k1_kl": 0.231201171875, - "k3_kl": 0.1573486328125, - "kimi_kl": 0.421875, - "learning_rate": 3.034e-07, - "loss": 0.0063, - "ppl": 0.1279296875, - "reward": 0.9416408538818359, - "reward_std": 0.018096239771693945, - "rewards/perpo_ocr_edit_distance_reward": 0.9416408836841583, + "advantages": -1.151221204054309e-05, + "completion_length": 575.0, + "delta_ref_entropy_loss": 0.037109375, + "delta_ref_ppl": -0.0458984375, + "entropy_loss": -0.022216796875, + "epoch": 0.1966, + "grad_norm": 0.5655720984610368, + "k1_kl": 0.046142578125, + "k3_kl": 0.027099609375, + "kimi_kl": 0.07421875, + "learning_rate": 4.017e-07, + "loss": 0.0011, + "ppl": 0.009765625, + "reward": 0.9972906708717346, + "reward_std": 0.0006394759984686971, + "rewards/perpo_ocr_edit_distance_reward": 0.9972906708717346, "step": 983, "temperature": 0.9 }, { - "advantages": -1.39560027605512e-05, - "completion_length": 587.5, - "delta_ref_entropy_loss": 0.107666015625, - "delta_ref_ppl": -0.059326171875, - "entropy_loss": -0.11993408203125, - "epoch": 0.3936, - "grad_norm": 1.237922349194652, - "k1_kl": 0.05908203125, - "k3_kl": 0.03179931640625, - "kimi_kl": 0.08367919921875, - "learning_rate": 3.032e-07, - "loss": 0.0013, - "ppl": 0.066619873046875, - "reward": 0.8885097801685333, - "reward_std": 0.01005987502867356, - "rewards/perpo_ocr_edit_distance_reward": 0.8885098099708557, + "advantages": -1.7029899268550253e-08, + "completion_length": 183.0, + "delta_ref_entropy_loss": 0.22265625, + "delta_ref_ppl": -0.21484375, + "entropy_loss": -0.328125, + "epoch": 0.1968, + "grad_norm": 5.012533535813254, + "k1_kl": 0.21484375, + "k3_kl": 0.1474609375, + "kimi_kl": 0.48046875, + "learning_rate": 4.016e-07, + "loss": 0.0059, + "ppl": 0.1728515625, + "reward": 0.2705489993095398, + "reward_std": 0.08898764103651047, + "rewards/perpo_ocr_edit_distance_reward": 0.2705490291118622, "step": 984, "temperature": 0.9 }, { - "advantages": -1.9477947034829413e-05, - "completion_length": 955.0, - "delta_ref_entropy_loss": 0.0252685546875, - "delta_ref_ppl": -0.019744873046875, - "entropy_loss": -0.0435791015625, - "epoch": 0.394, - "grad_norm": 1.1009430855710602, - "k1_kl": 0.019805908203125, - "k3_kl": 0.0156402587890625, - "kimi_kl": 0.028564453125, - "learning_rate": 3.03e-07, - "loss": 0.0006, - "ppl": 0.0226287841796875, - "reward": 0.9512437880039215, - "reward_std": 0.010434180789161474, - "rewards/perpo_ocr_edit_distance_reward": 0.9512438476085663, + "advantages": 5.10896995820076e-08, + "completion_length": 47.0, + "delta_ref_entropy_loss": -0.0908203125, + "delta_ref_ppl": -0.298828125, + "entropy_loss": -0.208984375, + "epoch": 0.197, + "grad_norm": 4.762123627683053, + "k1_kl": 0.30078125, + "k3_kl": 0.2431640625, + "kimi_kl": 0.87890625, + "learning_rate": 4.015e-07, + "loss": 0.0097, + "ppl": 0.0849609375, + "reward": 0.7106841802597046, + "reward_std": 0.3004266321659088, + "rewards/perpo_ocr_edit_distance_reward": 0.7106841206550598, "step": 985, "temperature": 0.9 }, { - "advantages": -0.0003732187469722703, - "completion_length": 519.5, - "delta_ref_entropy_loss": 0.023162841796875, - "delta_ref_ppl": -0.02825927734375, - "entropy_loss": -0.017608642578125, - "epoch": 0.3944, - "grad_norm": 0.24167524643163155, - "k1_kl": 0.02825927734375, - "k3_kl": 0.01971435546875, - "kimi_kl": 0.07843017578125, - "learning_rate": 3.028e-07, - "loss": 0.0012, - "ppl": 0.0086822509765625, - "reward": 0.9248827993869781, - "reward_std": 0.00017645867774263024, - "rewards/perpo_ocr_edit_distance_reward": 0.9248828291893005, + "advantages": -9.451593996345764e-07, + "completion_length": 841.0, + "delta_ref_entropy_loss": 0.1787109375, + "delta_ref_ppl": -0.0732421875, + "entropy_loss": -0.376953125, + "epoch": 0.1972, + "grad_norm": 2.9307450519207823, + "k1_kl": 0.07421875, + "k3_kl": 0.036376953125, + "kimi_kl": 0.054931640625, + "learning_rate": 4.014e-07, + "loss": 0.0015, + "ppl": 0.201171875, + "reward": 0.7235431671142578, + "reward_std": 0.04480733722448349, + "rewards/perpo_ocr_edit_distance_reward": 0.7235432863235474, "step": 986, "temperature": 0.9 }, { - "advantages": -0.0001883847414489992, - "completion_length": 655.5, - "delta_ref_entropy_loss": 0.04547119140625, - "delta_ref_ppl": -0.079803466796875, - "entropy_loss": -0.0435028076171875, - "epoch": 0.3948, - "grad_norm": 4.182734609842232, - "k1_kl": 0.0802764892578125, - "k3_kl": 0.061252593994140625, - "kimi_kl": 0.21456146240234375, - "learning_rate": 3.0259999999999997e-07, - "loss": 0.0026, - "ppl": 0.018777847290039062, - "reward": 0.8744347095489502, - "reward_std": 0.05673472314811079, - "rewards/perpo_ocr_edit_distance_reward": 0.8744347989559174, + "advantages": -9.911401321005542e-06, + "completion_length": 287.0, + "delta_ref_entropy_loss": 0.1162109375, + "delta_ref_ppl": -0.1279296875, + "entropy_loss": -0.038330078125, + "epoch": 0.1974, + "grad_norm": 1.47772633155227, + "k1_kl": 0.1279296875, + "k3_kl": 0.07763671875, + "kimi_kl": 0.2041015625, + "learning_rate": 4.013e-07, + "loss": 0.0031, + "ppl": 0.018310546875, + "reward": 0.9781088829040527, + "reward_std": 0.0033301892690360546, + "rewards/perpo_ocr_edit_distance_reward": 0.9781090021133423, "step": 987, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 337.5, - "delta_ref_entropy_loss": 0.04052734375, - "delta_ref_ppl": -0.03717041015625, - "entropy_loss": -0.014404296875, - "epoch": 0.3952, - "grad_norm": 0.017225291742211248, - "k1_kl": 0.03729248046875, - "k3_kl": 0.0240478515625, - "kimi_kl": 0.08770751953125, - "learning_rate": 3.024e-07, - "loss": 0.001, - "ppl": 0.00531005859375, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": -7.476977043552324e-05, + "completion_length": 785.0, + "delta_ref_entropy_loss": 0.05712890625, + "delta_ref_ppl": -0.0537109375, + "entropy_loss": -0.022216796875, + "epoch": 0.1976, + "grad_norm": 0.5944800522364906, + "k1_kl": 0.053955078125, + "k3_kl": 0.028076171875, + "kimi_kl": 0.068359375, + "learning_rate": 4.0119999999999997e-07, + "loss": 0.0012, + "ppl": 0.0084228515625, + "reward": 0.9850941896438599, + "reward_std": 0.0006971742259338498, + "rewards/perpo_ocr_edit_distance_reward": 0.9850942492485046, "step": 988, "temperature": 0.9 }, { - "advantages": -7.3079556841548765e-06, - "completion_length": 926.0, - "delta_ref_entropy_loss": 0.02227783203125, - "delta_ref_ppl": -0.017669677734375, - "entropy_loss": -0.0369873046875, - "epoch": 0.3956, - "grad_norm": 1.2242279440093833, - "k1_kl": 0.017669677734375, - "k3_kl": 0.00970458984375, - "kimi_kl": 0.0205078125, - "learning_rate": 3.022e-07, - "loss": 0.0004, - "ppl": 0.01727294921875, - "reward": 0.9683034718036652, - "reward_std": 0.0010722529841586947, - "rewards/perpo_ocr_edit_distance_reward": 0.9683035016059875, + "advantages": -0.0005960464477539062, + "completion_length": 198.0, + "delta_ref_entropy_loss": 0.057861328125, + "delta_ref_ppl": -0.11669921875, + "entropy_loss": -0.016845703125, + "epoch": 0.1978, + "grad_norm": 0.009154124584670537, + "k1_kl": 0.1162109375, + "k3_kl": 0.0869140625, + "kimi_kl": 0.328125, + "learning_rate": 4.011e-07, + "loss": 0.0041, + "ppl": 0.0032806396484375, + "reward": 0.9917525053024292, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9917526245117188, "step": 989, "temperature": 0.9 }, { - "advantages": -3.301245851616841e-05, - "completion_length": 608.5, - "delta_ref_entropy_loss": 0.0516357421875, - "delta_ref_ppl": -0.0352783203125, - "entropy_loss": -0.08740234375, - "epoch": 0.396, - "grad_norm": 1.0536019238341483, - "k1_kl": 0.03515625, - "k3_kl": 0.02032470703125, - "kimi_kl": 0.0419921875, - "learning_rate": 3.02e-07, - "loss": 0.0008, - "ppl": 0.0506591796875, - "reward": 0.9208069443702698, - "reward_std": 0.0031097056926228106, - "rewards/perpo_ocr_edit_distance_reward": 0.9208070635795593, + "advantages": -2.8661321266554296e-05, + "completion_length": 775.0, + "delta_ref_entropy_loss": 0.06689453125, + "delta_ref_ppl": -0.07275390625, + "entropy_loss": -0.046875, + "epoch": 0.198, + "grad_norm": 1.1829321530391177, + "k1_kl": 0.07275390625, + "k3_kl": 0.05029296875, + "kimi_kl": 0.11474609375, + "learning_rate": 4.01e-07, + "loss": 0.002, + "ppl": 0.0252685546875, + "reward": 0.9896025657653809, + "reward_std": 0.0022760401479899883, + "rewards/perpo_ocr_edit_distance_reward": 0.9896026849746704, "step": 990, "temperature": 0.9 }, { - "advantages": -0.0003307291444798466, - "completion_length": 301.5, - "delta_ref_entropy_loss": 0.05328369140625, - "delta_ref_ppl": -0.04156494140625, - "entropy_loss": -0.02447509765625, - "epoch": 0.3964, - "grad_norm": 0.3887150263407509, - "k1_kl": 0.041748046875, - "k3_kl": 0.025177001953125, - "kimi_kl": 0.06072998046875, - "learning_rate": 3.018e-07, - "loss": 0.0013, - "ppl": 0.0147705078125, - "reward": 0.9822722971439362, - "reward_std": 0.0004055634781252593, - "rewards/perpo_ocr_edit_distance_reward": 0.9822723865509033, + "advantages": -0.0005960464477539062, + "completion_length": 421.0, + "delta_ref_entropy_loss": 0.07373046875, + "delta_ref_ppl": -0.08740234375, + "entropy_loss": -0.02099609375, + "epoch": 0.1982, + "grad_norm": 0.021077105773791144, + "k1_kl": 0.08740234375, + "k3_kl": 0.053955078125, + "kimi_kl": 0.15625, + "learning_rate": 4.0089999999999994e-07, + "loss": 0.0028, + "ppl": 0.0072021484375, + "reward": 0.9894471764564514, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.989447295665741, "step": 991, "temperature": 0.9 }, { - "advantages": -0.00013398273222264834, - "completion_length": 630.0, - "delta_ref_entropy_loss": 0.0186767578125, - "delta_ref_ppl": -0.0091552734375, - "entropy_loss": -0.018218994140625, - "epoch": 0.3968, - "grad_norm": 0.3468041549984229, - "k1_kl": 0.0091552734375, - "k3_kl": 0.004547119140625, - "kimi_kl": 0.007843017578125, - "learning_rate": 3.0159999999999995e-07, - "loss": 0.0003, - "ppl": 0.0100250244140625, - "reward": 0.9997786283493042, - "reward_std": 0.000361438789695967, - "rewards/perpo_ocr_edit_distance_reward": 0.9997787177562714, + "advantages": -7.706029282417148e-05, + "completion_length": 917.0, + "delta_ref_entropy_loss": 0.044921875, + "delta_ref_ppl": -0.04638671875, + "entropy_loss": -0.0306396484375, + "epoch": 0.1984, + "grad_norm": 0.49811791522699056, + "k1_kl": 0.04638671875, + "k3_kl": 0.0283203125, + "kimi_kl": 0.09130859375, + "learning_rate": 4.008e-07, + "loss": 0.0012, + "ppl": 0.0125732421875, + "reward": 0.9060975313186646, + "reward_std": 0.001004863646812737, + "rewards/perpo_ocr_edit_distance_reward": 0.9060975909233093, "step": 992, "temperature": 0.9 }, { - "advantages": -7.214290872070706e-05, - "completion_length": 636.5, - "delta_ref_entropy_loss": 0.0966796875, - "delta_ref_ppl": -0.05792236328125, - "entropy_loss": -0.1270751953125, - "epoch": 0.3972, - "grad_norm": 1.2563759207981176, - "k1_kl": 0.0579833984375, - "k3_kl": 0.032440185546875, - "kimi_kl": 0.097198486328125, - "learning_rate": 3.014e-07, - "loss": 0.0014, - "ppl": 0.07403564453125, - "reward": 0.9483493864536285, - "reward_std": 0.001458217710023746, - "rewards/perpo_ocr_edit_distance_reward": 0.9483494460582733, + "advantages": -1.4109271432971582e-05, + "completion_length": 405.0, + "delta_ref_entropy_loss": 0.1591796875, + "delta_ref_ppl": -0.10888671875, + "entropy_loss": -0.095703125, + "epoch": 0.1986, + "grad_norm": 1.4305900224535633, + "k1_kl": 0.109375, + "k3_kl": 0.06494140625, + "kimi_kl": 0.185546875, + "learning_rate": 4.007e-07, + "loss": 0.0026, + "ppl": 0.05322265625, + "reward": 0.6412806510925293, + "reward_std": 0.001711223623715341, + "rewards/perpo_ocr_edit_distance_reward": 0.6412806510925293, "step": 993, "temperature": 0.9 }, { - "advantages": -9.656804104452021e-05, - "completion_length": 501.0, - "delta_ref_entropy_loss": 0.014739990234375, - "delta_ref_ppl": -0.0124359130859375, - "entropy_loss": -0.010772705078125, - "epoch": 0.3976, - "grad_norm": 0.27229846366137106, - "k1_kl": 0.0124359130859375, - "k3_kl": 0.0088653564453125, - "kimi_kl": 0.030517578125, - "learning_rate": 3.012e-07, - "loss": 0.0005, - "ppl": 0.0050201416015625, - "reward": 0.9997861385345459, - "reward_std": 0.00020554790535243228, - "rewards/perpo_ocr_edit_distance_reward": 0.9997861385345459, + "advantages": -6.442410813178867e-05, + "completion_length": 136.0, + "delta_ref_entropy_loss": 0.0361328125, + "delta_ref_ppl": -0.1767578125, + "entropy_loss": -0.0234375, + "epoch": 0.1988, + "grad_norm": 1.2397336634088854, + "k1_kl": 0.177734375, + "k3_kl": 0.138671875, + "kimi_kl": 0.57421875, + "learning_rate": 4.006e-07, + "loss": 0.0056, + "ppl": 0.0084228515625, + "reward": 0.9909544587135315, + "reward_std": 0.0008252485422417521, + "rewards/perpo_ocr_edit_distance_reward": 0.9909545183181763, "step": 994, "temperature": 0.9 }, { - "advantages": -3.8853713476783014e-05, - "completion_length": 964.0, - "delta_ref_entropy_loss": 0.063232421875, - "delta_ref_ppl": -0.04510498046875, - "entropy_loss": -0.09979248046875, - "epoch": 0.398, - "grad_norm": 2.0962832031192082, - "k1_kl": 0.04510498046875, - "k3_kl": 0.0251617431640625, - "kimi_kl": 0.06768798828125, - "learning_rate": 3.0099999999999996e-07, - "loss": 0.001, - "ppl": 0.0545196533203125, - "reward": 0.27094898372888565, - "reward_std": 0.0019351135706529021, - "rewards/perpo_ocr_edit_distance_reward": 0.27094900235533714, + "advantages": -3.598417606553994e-05, + "completion_length": 676.0, + "delta_ref_entropy_loss": 0.095703125, + "delta_ref_ppl": -0.07275390625, + "entropy_loss": -0.0311279296875, + "epoch": 0.199, + "grad_norm": 0.8957071726432465, + "k1_kl": 0.07275390625, + "k3_kl": 0.038330078125, + "kimi_kl": 0.1142578125, + "learning_rate": 4.005e-07, + "loss": 0.0016, + "ppl": 0.01226806640625, + "reward": 0.9926916360855103, + "reward_std": 0.0013199297245591879, + "rewards/perpo_ocr_edit_distance_reward": 0.992691695690155, "step": 995, "temperature": 0.9 }, { - "advantages": 3.684418697957881e-05, - "completion_length": 434.5, - "delta_ref_entropy_loss": 0.0333251953125, - "delta_ref_ppl": -0.04193115234375, - "entropy_loss": -0.020904541015625, - "epoch": 0.3984, - "grad_norm": 0.11962152043167827, - "k1_kl": 0.04193115234375, - "k3_kl": 0.02960205078125, - "kimi_kl": 0.11822509765625, - "learning_rate": 3.008e-07, - "loss": 0.0011, - "ppl": 0.0114288330078125, - "reward": 0.9997202754020691, - "reward_std": 0.00012334228085819632, - "rewards/perpo_ocr_edit_distance_reward": 0.9997202754020691, + "advantages": -0.00012880563735961914, + "completion_length": 269.0, + "delta_ref_entropy_loss": 0.06591796875, + "delta_ref_ppl": -0.119140625, + "entropy_loss": -0.0189208984375, + "epoch": 0.1992, + "grad_norm": 0.6155175649928426, + "k1_kl": 0.11962890625, + "k3_kl": 0.08837890625, + "kimi_kl": 0.3515625, + "learning_rate": 4.0039999999999996e-07, + "loss": 0.0037, + "ppl": 0.006011962890625, + "reward": 0.9855452179908752, + "reward_std": 0.0002966037718579173, + "rewards/perpo_ocr_edit_distance_reward": 0.9855453372001648, "step": 996, "temperature": 0.9 }, { - "advantages": -0.00010606221258058213, - "completion_length": 745.0, - "delta_ref_entropy_loss": 0.03192138671875, - "delta_ref_ppl": -0.02532958984375, - "entropy_loss": -0.02789306640625, - "epoch": 0.3988, - "grad_norm": 0.5030077962598919, - "k1_kl": 0.0252685546875, - "k3_kl": 0.016021728515625, - "kimi_kl": 0.04571533203125, - "learning_rate": 3.006e-07, - "loss": 0.0007, - "ppl": 0.013336181640625, - "reward": 0.9991788864135742, - "reward_std": 0.0005116616084706038, - "rewards/perpo_ocr_edit_distance_reward": 0.999178946018219, + "advantages": 1.5922955753921997e-06, + "completion_length": 504.0, + "delta_ref_entropy_loss": 0.04248046875, + "delta_ref_ppl": -0.037109375, + "entropy_loss": -0.0091552734375, + "epoch": 0.1994, + "grad_norm": 0.6000031340694322, + "k1_kl": 0.036865234375, + "k3_kl": 0.022216796875, + "kimi_kl": 0.058837890625, + "learning_rate": 4.0029999999999995e-07, + "loss": 0.0009, + "ppl": 0.0024871826171875, + "reward": 0.9339684247970581, + "reward_std": 0.02126733399927616, + "rewards/perpo_ocr_edit_distance_reward": 0.9339684247970581, "step": 997, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 618.0, - "delta_ref_entropy_loss": 0.05255126953125, - "delta_ref_ppl": -0.037353515625, - "entropy_loss": -0.048126220703125, - "epoch": 0.3992, - "grad_norm": 1.9120402936153142, - "k1_kl": 0.037353515625, - "k3_kl": 0.02215576171875, - "kimi_kl": 0.058349609375, - "learning_rate": 3.0039999999999996e-07, - "loss": 0.0012, - "ppl": 0.023101806640625, - "reward": 0.9102523922920227, - "reward_std": 0.053054749965667725, - "rewards/perpo_ocr_edit_distance_reward": 0.9102524220943451, + "advantages": -2.0265579223632812e-06, + "completion_length": 1534.0, + "delta_ref_entropy_loss": 0.05419921875, + "delta_ref_ppl": -0.04833984375, + "entropy_loss": -0.119140625, + "epoch": 0.1996, + "grad_norm": 3.737133269002052, + "k1_kl": 0.04833984375, + "k3_kl": 0.03515625, + "kimi_kl": 0.07373046875, + "learning_rate": 4.002e-07, + "loss": 0.0014, + "ppl": 0.064453125, + "reward": 0.9160547852516174, + "reward_std": 0.04182827100157738, + "rewards/perpo_ocr_edit_distance_reward": 0.916054904460907, "step": 998, "temperature": 0.9 }, { - "advantages": -1.597830305399839e-05, - "completion_length": 493.0, - "delta_ref_entropy_loss": 0.03570556640625, - "delta_ref_ppl": -0.03497314453125, - "entropy_loss": -0.03399658203125, - "epoch": 0.3996, - "grad_norm": 1.1455222123765332, - "k1_kl": 0.03509521484375, - "k3_kl": 0.02337646484375, - "kimi_kl": 0.0517578125, - "learning_rate": 3.002e-07, - "loss": 0.001, - "ppl": 0.01947021484375, - "reward": 0.9970746636390686, - "reward_std": 0.0009287814609706402, - "rewards/perpo_ocr_edit_distance_reward": 0.9970747530460358, + "advantages": -0.00010017838212661445, + "completion_length": 677.0, + "delta_ref_entropy_loss": 0.072265625, + "delta_ref_ppl": -0.055908203125, + "entropy_loss": -0.029296875, + "epoch": 0.1998, + "grad_norm": 0.7177996670837178, + "k1_kl": 0.05615234375, + "k3_kl": 0.03125, + "kimi_kl": 0.08349609375, + "learning_rate": 4.001e-07, + "loss": 0.0013, + "ppl": 0.013671875, + "reward": 0.9912909269332886, + "reward_std": 0.0008348734118044376, + "rewards/perpo_ocr_edit_distance_reward": 0.9912911057472229, "step": 999, "temperature": 0.9 }, { - "advantages": -2.3245812883487815e-06, - "completion_length": 441.0, - "delta_ref_entropy_loss": 0.0609130859375, - "delta_ref_ppl": -0.1123046875, - "entropy_loss": -0.196044921875, - "epoch": 0.4, - "grad_norm": 2.9261337516094876, - "k1_kl": 0.1123046875, - "k3_kl": 0.0697021484375, - "kimi_kl": 0.169921875, - "learning_rate": 3e-07, - "loss": 0.0028, - "ppl": 0.0794677734375, - "reward": 0.6619932800531387, - "reward_std": 0.03483077744022012, - "rewards/perpo_ocr_edit_distance_reward": 0.6619932949542999, - "step": 1000, + "advantages": -1.021793991640152e-07, + "completion_length": 1486.0, + "delta_ref_entropy_loss": 0.04833984375, + "delta_ref_ppl": -0.07421875, + "entropy_loss": -0.0751953125, + "epoch": 0.2, + "grad_norm": 4.668943976966839, + "k1_kl": 0.07421875, + "k3_kl": 0.052001953125, + "kimi_kl": 0.1650390625, + "learning_rate": 4e-07, + "loss": 0.0021, + "ppl": 0.037109375, + "reward": 0.7348268032073975, + "reward_std": 0.19828090071678162, + "rewards/perpo_ocr_edit_distance_reward": 0.7348268628120422, + "step": 1000, "temperature": 0.9 }, { - "advantages": 4.180840733170044e-06, - "completion_length": 331.0, - "delta_ref_entropy_loss": 0.032958984375, - "delta_ref_ppl": -0.0264892578125, - "entropy_loss": -0.04046630859375, - "epoch": 0.4004, - "grad_norm": 1.3465359783323285, - "k1_kl": 0.0263671875, - "k3_kl": 0.01422119140625, - "kimi_kl": 0.02520751953125, - "learning_rate": 2.9979999999999997e-07, - "loss": 0.0006, - "ppl": 0.02392578125, - "reward": 0.9949521422386169, - "reward_std": 0.0015603705105604604, - "rewards/perpo_ocr_edit_distance_reward": 0.9949522316455841, + "advantages": -0.00011161396105308086, + "completion_length": 531.0, + "delta_ref_entropy_loss": 0.04833984375, + "delta_ref_ppl": -0.0625, + "entropy_loss": -0.0216064453125, + "epoch": 0.2002, + "grad_norm": 0.5319567776781045, + "k1_kl": 0.0625, + "k3_kl": 0.040771484375, + "kimi_kl": 0.1328125, + "learning_rate": 3.999e-07, + "loss": 0.0017, + "ppl": 0.007110595703125, + "reward": 0.9898625612258911, + "reward_std": 0.00028143724193796515, + "rewards/perpo_ocr_edit_distance_reward": 0.9898625612258911, "step": 1001, "temperature": 0.9 }, { - "advantages": -3.7478549529623706e-05, - "completion_length": 771.0, - "delta_ref_entropy_loss": 0.040283203125, - "delta_ref_ppl": -0.027679443359375, - "entropy_loss": -0.027099609375, - "epoch": 0.4008, - "grad_norm": 0.4318292729064685, - "k1_kl": 0.02777099609375, - "k3_kl": 0.016326904296875, - "kimi_kl": 0.0406494140625, - "learning_rate": 2.9959999999999996e-07, - "loss": 0.0007, - "ppl": 0.013336181640625, - "reward": 0.9936330914497375, - "reward_std": 0.0006917903665453196, - "rewards/perpo_ocr_edit_distance_reward": 0.9936331510543823, + "advantages": 0.0, + "completion_length": 660.0, + "delta_ref_entropy_loss": 0.041259765625, + "delta_ref_ppl": -0.05126953125, + "entropy_loss": -0.022705078125, + "epoch": 0.2004, + "grad_norm": 0.5391513398893558, + "k1_kl": 0.05126953125, + "k3_kl": 0.036376953125, + "kimi_kl": 0.10400390625, + "learning_rate": 3.9979999999999997e-07, + "loss": 0.0015, + "ppl": 0.0113525390625, + "reward": 0.9836011528968811, + "reward_std": 0.0018859106348827481, + "rewards/perpo_ocr_edit_distance_reward": 0.9836012125015259, "step": 1002, "temperature": 0.9 }, { - "advantages": -1.3121537193683253e-05, - "completion_length": 307.5, - "delta_ref_entropy_loss": 0.110595703125, - "delta_ref_ppl": -0.105224609375, - "entropy_loss": -0.17529296875, - "epoch": 0.4012, - "grad_norm": 2.3733751346810075, - "k1_kl": 0.10498046875, - "k3_kl": 0.0679931640625, - "kimi_kl": 0.2197265625, - "learning_rate": 2.994e-07, - "loss": 0.0027, - "ppl": 0.087158203125, - "reward": 0.9136054217815399, - "reward_std": 0.023126912681618705, - "rewards/perpo_ocr_edit_distance_reward": 0.9136055111885071, + "advantages": 5.516835881280713e-05, + "completion_length": 438.0, + "delta_ref_entropy_loss": 0.07275390625, + "delta_ref_ppl": -0.087890625, + "entropy_loss": -0.0238037109375, + "epoch": 0.2006, + "grad_norm": 0.46655184593542365, + "k1_kl": 0.087890625, + "k3_kl": 0.056640625, + "kimi_kl": 0.169921875, + "learning_rate": 3.9969999999999996e-07, + "loss": 0.0022, + "ppl": 0.00982666015625, + "reward": 0.9921948909759521, + "reward_std": 0.00020858706557191908, + "rewards/perpo_ocr_edit_distance_reward": 0.9921948909759521, "step": 1003, "temperature": 0.9 }, { - "advantages": -5.226901703281328e-05, - "completion_length": 297.0, - "delta_ref_entropy_loss": 0.0142669677734375, - "delta_ref_ppl": -0.114044189453125, - "entropy_loss": -0.029632568359375, - "epoch": 0.4016, - "grad_norm": 0.25458039845763764, - "k1_kl": 0.114044189453125, - "k3_kl": 0.0912322998046875, - "kimi_kl": 0.2909088134765625, - "learning_rate": 2.9920000000000003e-07, - "loss": 0.0037, - "ppl": 0.0159454345703125, - "reward": 0.9995907247066498, - "reward_std": 7.218111568363383e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9995907545089722, + "advantages": -6.978426972636953e-05, + "completion_length": 476.0, + "delta_ref_entropy_loss": 0.0625, + "delta_ref_ppl": -0.056396484375, + "entropy_loss": -0.0179443359375, + "epoch": 0.2008, + "grad_norm": 1.396974600779381, + "k1_kl": 0.056396484375, + "k3_kl": 0.032470703125, + "kimi_kl": 0.0849609375, + "learning_rate": 3.996e-07, + "loss": 0.0014, + "ppl": 0.00830078125, + "reward": 0.9917527437210083, + "reward_std": 0.0009980321628972888, + "rewards/perpo_ocr_edit_distance_reward": 0.9917528033256531, "step": 1004, "temperature": 0.9 }, { - "advantages": -5.9344943110772874e-05, - "completion_length": 332.5, - "delta_ref_entropy_loss": 0.04437255859375, - "delta_ref_ppl": -0.0479736328125, - "entropy_loss": -0.040283203125, - "epoch": 0.402, - "grad_norm": 0.9099220164891738, - "k1_kl": 0.04803466796875, - "k3_kl": 0.030609130859375, - "kimi_kl": 0.12432861328125, - "learning_rate": 2.9899999999999996e-07, - "loss": 0.0013, - "ppl": 0.0224609375, - "reward": 0.9923337996006012, - "reward_std": 0.0010875624138861895, - "rewards/perpo_ocr_edit_distance_reward": 0.992333859205246, + "advantages": -1.7029899268550253e-08, + "completion_length": 714.0, + "delta_ref_entropy_loss": 0.06982421875, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.034423828125, + "epoch": 0.201, + "grad_norm": 1.082504986680021, + "k1_kl": 0.06591796875, + "k3_kl": 0.04150390625, + "kimi_kl": 0.134765625, + "learning_rate": 3.995e-07, + "loss": 0.0017, + "ppl": 0.014892578125, + "reward": 0.9458496570587158, + "reward_std": 0.00980261992663145, + "rewards/perpo_ocr_edit_distance_reward": 0.9458496570587158, "step": 1005, "temperature": 0.9 }, { - "advantages": -0.0003460560546955094, - "completion_length": 453.0, - "delta_ref_entropy_loss": 0.0513916015625, - "delta_ref_ppl": -0.04296875, - "entropy_loss": -0.032867431640625, - "epoch": 0.4024, - "grad_norm": 0.4645504004965047, - "k1_kl": 0.04296875, - "k3_kl": 0.02679443359375, - "kimi_kl": 0.1025390625, - "learning_rate": 2.988e-07, - "loss": 0.0014, - "ppl": 0.01671600341796875, - "reward": 0.9959001243114471, - "reward_std": 0.0002602420572657138, - "rewards/perpo_ocr_edit_distance_reward": 0.9959002435207367, + "advantages": -0.00010103839304065332, + "completion_length": 223.0, + "delta_ref_entropy_loss": 0.1484375, + "delta_ref_ppl": -0.158203125, + "entropy_loss": -0.0341796875, + "epoch": 0.2012, + "grad_norm": 0.7954212601661768, + "k1_kl": 0.158203125, + "k3_kl": 0.10205078125, + "kimi_kl": 0.36328125, + "learning_rate": 3.9939999999999994e-07, + "loss": 0.0042, + "ppl": 0.0147705078125, + "reward": 0.9552053213119507, + "reward_std": 0.0010800809832289815, + "rewards/perpo_ocr_edit_distance_reward": 0.9552054405212402, "step": 1006, "temperature": 0.9 }, { - "advantages": -0.00012924842121719848, - "completion_length": 415.0, - "delta_ref_entropy_loss": 0.0523681640625, - "delta_ref_ppl": -0.0406494140625, - "entropy_loss": -0.0518798828125, - "epoch": 0.4028, - "grad_norm": 0.6250588041484533, - "k1_kl": 0.0406494140625, - "k3_kl": 0.022705078125, - "kimi_kl": 0.0609130859375, - "learning_rate": 2.986e-07, - "loss": 0.001, - "ppl": 0.02532958984375, - "reward": 0.9736263453960419, - "reward_std": 0.0008009310840861872, - "rewards/perpo_ocr_edit_distance_reward": 0.973626434803009, + "advantages": -9.366444686520481e-08, + "completion_length": 145.0, + "delta_ref_entropy_loss": 0.1533203125, + "delta_ref_ppl": -0.30078125, + "entropy_loss": -0.318359375, + "epoch": 0.2014, + "grad_norm": 5.909047317375092, + "k1_kl": 0.30078125, + "k3_kl": 0.212890625, + "kimi_kl": 1.171875, + "learning_rate": 3.993e-07, + "loss": 0.0085, + "ppl": 0.146484375, + "reward": 0.2808465361595154, + "reward_std": 0.0841226577758789, + "rewards/perpo_ocr_edit_distance_reward": 0.28084656596183777, "step": 1007, "temperature": 0.9 }, { - "advantages": -0.00029829570226524993, - "completion_length": 290.0, - "delta_ref_entropy_loss": 0.02996826171875, - "delta_ref_ppl": -0.04302978515625, - "entropy_loss": -0.0299072265625, - "epoch": 0.4032, - "grad_norm": 0.6509014117672811, - "k1_kl": 0.043243408203125, - "k3_kl": 0.0321502685546875, - "kimi_kl": 0.159576416015625, - "learning_rate": 2.9839999999999997e-07, - "loss": 0.0016, - "ppl": 0.01336669921875, - "reward": 0.9271291196346283, - "reward_std": 0.0235308650881052, - "rewards/perpo_ocr_edit_distance_reward": 0.9271292090415955, + "advantages": -3.641843795776367e-05, + "completion_length": 525.0, + "delta_ref_entropy_loss": 0.099609375, + "delta_ref_ppl": -0.087890625, + "entropy_loss": -0.062255859375, + "epoch": 0.2016, + "grad_norm": 1.409121413796218, + "k1_kl": 0.087890625, + "k3_kl": 0.052734375, + "kimi_kl": 0.1474609375, + "learning_rate": 3.992e-07, + "loss": 0.0021, + "ppl": 0.0269775390625, + "reward": 0.9459669589996338, + "reward_std": 0.0015363424317911267, + "rewards/perpo_ocr_edit_distance_reward": 0.9459670186042786, "step": 1008, "temperature": 0.9 }, { - "advantages": -5.296298629886564e-06, - "completion_length": 894.5, - "delta_ref_entropy_loss": 0.0860595703125, - "delta_ref_ppl": -0.0537109375, - "entropy_loss": -0.17626953125, - "epoch": 0.4036, - "grad_norm": 1.988738702058808, - "k1_kl": 0.0538330078125, - "k3_kl": 0.0321044921875, - "kimi_kl": 0.0810546875, - "learning_rate": 2.982e-07, - "loss": 0.0013, - "ppl": 0.09375, - "reward": 0.5237862169742584, - "reward_std": 0.04954620869830251, - "rewards/perpo_ocr_edit_distance_reward": 0.5237862765789032, + "advantages": -0.00011681458272505552, + "completion_length": 441.0, + "delta_ref_entropy_loss": 0.046875, + "delta_ref_ppl": -0.08203125, + "entropy_loss": -0.029541015625, + "epoch": 0.2018, + "grad_norm": 0.46571584160922636, + "k1_kl": 0.08203125, + "k3_kl": 0.06005859375, + "kimi_kl": 0.2041015625, + "learning_rate": 3.9909999999999997e-07, + "loss": 0.0025, + "ppl": 0.01318359375, + "reward": 0.9934673309326172, + "reward_std": 0.000410311360610649, + "rewards/perpo_ocr_edit_distance_reward": 0.993467390537262, "step": 1009, "temperature": 0.9 }, { - "advantages": -9.366444828629028e-07, - "completion_length": 370.5, - "delta_ref_entropy_loss": 0.058837890625, - "delta_ref_ppl": -0.0650634765625, - "entropy_loss": -0.0560302734375, - "epoch": 0.404, - "grad_norm": 0.3390421321741597, - "k1_kl": 0.0650634765625, - "k3_kl": 0.041015625, - "kimi_kl": 0.1171875, - "learning_rate": 2.98e-07, - "loss": 0.0016, - "ppl": 0.03253173828125, - "reward": 0.9903165698051453, - "reward_std": 0.004445315338671207, - "rewards/perpo_ocr_edit_distance_reward": 0.9903165698051453, + "advantages": 4.619360254309868e-07, + "completion_length": 407.0, + "delta_ref_entropy_loss": 0.1259765625, + "delta_ref_ppl": -0.0908203125, + "entropy_loss": -0.044921875, + "epoch": 0.202, + "grad_norm": 0.5930150948890034, + "k1_kl": 0.09130859375, + "k3_kl": 0.050048828125, + "kimi_kl": 0.13671875, + "learning_rate": 3.99e-07, + "loss": 0.002, + "ppl": 0.01483154296875, + "reward": 0.689171552658081, + "reward_std": 0.017820019274950027, + "rewards/perpo_ocr_edit_distance_reward": 0.689171552658081, "step": 1010, "temperature": 0.9 }, { - "advantages": -5.877444073121296e-05, - "completion_length": 503.0, - "delta_ref_entropy_loss": 0.05206298828125, - "delta_ref_ppl": -0.027587890625, - "entropy_loss": -0.03961181640625, - "epoch": 0.4044, - "grad_norm": 1.0416088495531293, - "k1_kl": 0.027587890625, - "k3_kl": 0.01507568359375, - "kimi_kl": 0.0321044921875, - "learning_rate": 2.978e-07, - "loss": 0.0007, - "ppl": 0.023040771484375, - "reward": 0.9713790118694305, - "reward_std": 0.0006891115917824209, - "rewards/perpo_ocr_edit_distance_reward": 0.9713790416717529, + "advantages": 0.0, + "completion_length": 385.0, + "delta_ref_entropy_loss": 0.020751953125, + "delta_ref_ppl": -0.07177734375, + "entropy_loss": -0.02294921875, + "epoch": 0.2022, + "grad_norm": 0.02729076374645822, + "k1_kl": 0.0712890625, + "k3_kl": 0.05126953125, + "kimi_kl": 0.16796875, + "learning_rate": 3.9889999999999995e-07, + "loss": 0.002, + "ppl": 0.005706787109375, + "reward": 0.9872449040412903, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9872449040412903, "step": 1011, "temperature": 0.9 }, { - "advantages": -7.820981409167871e-06, - "completion_length": 888.0, - "delta_ref_entropy_loss": 0.0517578125, - "delta_ref_ppl": -0.0460205078125, - "entropy_loss": -0.0513916015625, - "epoch": 0.4048, - "grad_norm": 0.9275900141931516, - "k1_kl": 0.0458984375, - "k3_kl": 0.0301513671875, - "kimi_kl": 0.077392578125, - "learning_rate": 2.9759999999999996e-07, - "loss": 0.0012, - "ppl": 0.0291748046875, - "reward": 0.9848320484161377, - "reward_std": 0.017640579069848172, - "rewards/perpo_ocr_edit_distance_reward": 0.9848320782184601, + "advantages": -3.2407897379016504e-05, + "completion_length": 290.0, + "delta_ref_entropy_loss": 0.09326171875, + "delta_ref_ppl": -0.12255859375, + "entropy_loss": -0.0294189453125, + "epoch": 0.2024, + "grad_norm": 1.4905089746725002, + "k1_kl": 0.1220703125, + "k3_kl": 0.08203125, + "kimi_kl": 0.29296875, + "learning_rate": 3.9879999999999994e-07, + "loss": 0.0033, + "ppl": 0.00897216796875, + "reward": 0.9960004687309265, + "reward_std": 0.0006883495370857418, + "rewards/perpo_ocr_edit_distance_reward": 0.9960005283355713, "step": 1012, "temperature": 0.9 }, { - "advantages": -4.630003786587622e-06, - "completion_length": 380.5, - "delta_ref_entropy_loss": 0.0845947265625, - "delta_ref_ppl": -0.050689697265625, - "entropy_loss": -0.108551025390625, - "epoch": 0.4052, - "grad_norm": 1.23336935983184, - "k1_kl": 0.050933837890625, - "k3_kl": 0.0267333984375, - "kimi_kl": 0.0679931640625, - "learning_rate": 2.974e-07, - "loss": 0.0011, - "ppl": 0.05999755859375, - "reward": 0.9256851673126221, - "reward_std": 0.0022485239896923304, - "rewards/perpo_ocr_edit_distance_reward": 0.9256851673126221, + "advantages": -1.819644785427954e-05, + "completion_length": 1896.0, + "delta_ref_entropy_loss": 0.0185546875, + "delta_ref_ppl": -0.0181884765625, + "entropy_loss": -0.0302734375, + "epoch": 0.2026, + "grad_norm": 1.8020520825732025, + "k1_kl": 0.0181884765625, + "k3_kl": 0.013671875, + "kimi_kl": 0.0302734375, + "learning_rate": 3.987e-07, + "loss": 0.0006, + "ppl": 0.0189208984375, + "reward": 0.9820342063903809, + "reward_std": 0.005048053339123726, + "rewards/perpo_ocr_edit_distance_reward": 0.9820343255996704, "step": 1013, "temperature": 0.9 }, { - "advantages": -6.292547709563223e-06, - "completion_length": 394.5, - "delta_ref_entropy_loss": 0.102783203125, - "delta_ref_ppl": -0.068359375, - "entropy_loss": -0.1099853515625, - "epoch": 0.4056, - "grad_norm": 1.298341081345164, - "k1_kl": 0.068359375, - "k3_kl": 0.0421142578125, - "kimi_kl": 0.1318359375, - "learning_rate": 2.972e-07, - "loss": 0.0017, - "ppl": 0.06256103515625, - "reward": 0.899128258228302, - "reward_std": 0.0044898370979353786, - "rewards/perpo_ocr_edit_distance_reward": 0.8991282880306244, + "advantages": -0.00022489258844871074, + "completion_length": 523.0, + "delta_ref_entropy_loss": 0.055908203125, + "delta_ref_ppl": -0.06494140625, + "entropy_loss": -0.022705078125, + "epoch": 0.2028, + "grad_norm": 0.27707218775784237, + "k1_kl": 0.06494140625, + "k3_kl": 0.04296875, + "kimi_kl": 0.1591796875, + "learning_rate": 3.986e-07, + "loss": 0.0019, + "ppl": 0.005767822265625, + "reward": 0.9932283759117126, + "reward_std": 0.00020288962696213275, + "rewards/perpo_ocr_edit_distance_reward": 0.9932284355163574, "step": 1014, "temperature": 0.9 }, { - "advantages": -4.9327103624818847e-05, - "completion_length": 496.0, - "delta_ref_entropy_loss": 0.0623779296875, - "delta_ref_ppl": -0.06231689453125, - "entropy_loss": -0.0399169921875, - "epoch": 0.406, - "grad_norm": 5.279290724155878, - "k1_kl": 0.06201171875, - "k3_kl": 0.038330078125, - "kimi_kl": 0.09771728515625, - "learning_rate": 2.9699999999999997e-07, - "loss": 0.0016, - "ppl": 0.01898193359375, - "reward": 0.9731557965278625, - "reward_std": 0.00029523775447160006, - "rewards/perpo_ocr_edit_distance_reward": 0.9731558561325073, + "advantages": -7.924011879367754e-05, + "completion_length": 636.0, + "delta_ref_entropy_loss": 0.05126953125, + "delta_ref_ppl": -0.046875, + "entropy_loss": -0.0142822265625, + "epoch": 0.203, + "grad_norm": 0.7016853913122397, + "k1_kl": 0.046630859375, + "k3_kl": 0.0279541015625, + "kimi_kl": 0.07421875, + "learning_rate": 3.9850000000000003e-07, + "loss": 0.0012, + "ppl": 0.005035400390625, + "reward": 0.992982804775238, + "reward_std": 0.0003298170631751418, + "rewards/perpo_ocr_edit_distance_reward": 0.9929828643798828, "step": 1015, "temperature": 0.9 }, { - "advantages": -7.237706967089252e-08, - "completion_length": 117.0, - "delta_ref_entropy_loss": 0.0848388671875, - "delta_ref_ppl": -0.108642578125, - "entropy_loss": -0.0771484375, - "epoch": 0.4064, - "grad_norm": 1.8687910118862985, - "k1_kl": 0.108642578125, - "k3_kl": 0.072509765625, - "kimi_kl": 0.22265625, - "learning_rate": 2.968e-07, - "loss": 0.0029, - "ppl": 0.03900146484375, - "reward": 0.7188208550214767, - "reward_std": 0.035898152738809586, - "rewards/perpo_ocr_edit_distance_reward": 0.7188208699226379, + "advantages": -1.7029899268550253e-08, + "completion_length": 784.0, + "delta_ref_entropy_loss": 0.1396484375, + "delta_ref_ppl": -0.1328125, + "entropy_loss": -0.11083984375, + "epoch": 0.2032, + "grad_norm": 1.564852464021893, + "k1_kl": 0.1328125, + "k3_kl": 0.076171875, + "kimi_kl": 0.2119140625, + "learning_rate": 3.9839999999999997e-07, + "loss": 0.003, + "ppl": 0.058349609375, + "reward": 0.8584195971488953, + "reward_std": 0.002248358214274049, + "rewards/perpo_ocr_edit_distance_reward": 0.8584195971488953, "step": 1016, "temperature": 0.9 }, { - "advantages": -2.5589552024030127e-05, - "completion_length": 623.0, - "delta_ref_entropy_loss": 0.03619384765625, - "delta_ref_ppl": -0.0299072265625, - "entropy_loss": -0.0362548828125, - "epoch": 0.4068, - "grad_norm": 0.504641223369475, - "k1_kl": 0.0299072265625, - "k3_kl": 0.019287109375, - "kimi_kl": 0.07635498046875, - "learning_rate": 2.9659999999999994e-07, + "advantages": 4.512923226229759e-07, + "completion_length": 1839.0, + "delta_ref_entropy_loss": 0.029296875, + "delta_ref_ppl": -0.0302734375, + "entropy_loss": -0.091796875, + "epoch": 0.2034, + "grad_norm": 2.0282649509921935, + "k1_kl": 0.0301513671875, + "k3_kl": 0.0194091796875, + "kimi_kl": 0.0361328125, + "learning_rate": 3.9829999999999996e-07, "loss": 0.0008, - "ppl": 0.01904296875, - "reward": 0.9893068969249725, - "reward_std": 0.0012425024178810418, - "rewards/perpo_ocr_edit_distance_reward": 0.9893069863319397, + "ppl": 0.0498046875, + "reward": 0.7662466764450073, + "reward_std": 0.11321964114904404, + "rewards/perpo_ocr_edit_distance_reward": 0.7662466764450073, "step": 1017, "temperature": 0.9 }, { - "advantages": -7.5953353189106565e-06, - "completion_length": 645.5, - "delta_ref_entropy_loss": 0.0406494140625, - "delta_ref_ppl": -0.031982421875, - "entropy_loss": -0.0699462890625, - "epoch": 0.4072, - "grad_norm": 1.9670589934273233, - "k1_kl": 0.03179931640625, - "k3_kl": 0.01953125, - "kimi_kl": 0.044189453125, - "learning_rate": 2.964e-07, - "loss": 0.0008, - "ppl": 0.0443115234375, - "reward": 0.9903416633605957, - "reward_std": 0.002891315147280693, - "rewards/perpo_ocr_edit_distance_reward": 0.9903416633605957, + "advantages": -0.00015105521015357226, + "completion_length": 700.0, + "delta_ref_entropy_loss": 0.042236328125, + "delta_ref_ppl": -0.044189453125, + "entropy_loss": -0.025634765625, + "epoch": 0.2036, + "grad_norm": 0.49032690005347984, + "k1_kl": 0.044189453125, + "k3_kl": 0.0272216796875, + "kimi_kl": 0.07373046875, + "learning_rate": 3.982e-07, + "loss": 0.0012, + "ppl": 0.0096435546875, + "reward": 0.9848132729530334, + "reward_std": 0.0003509810194373131, + "rewards/perpo_ocr_edit_distance_reward": 0.9848133325576782, "step": 1018, "temperature": 0.9 }, { - "advantages": 7.876328254496912e-06, - "completion_length": 664.0, - "delta_ref_entropy_loss": 0.04241943359375, - "delta_ref_ppl": -0.0285797119140625, - "entropy_loss": -0.03924560546875, - "epoch": 0.4076, - "grad_norm": 0.504833270342447, - "k1_kl": 0.0285797119140625, - "k3_kl": 0.01561737060546875, - "kimi_kl": 0.051422119140625, - "learning_rate": 2.962e-07, - "loss": 0.0006, - "ppl": 0.018341064453125, - "reward": 0.985710620880127, - "reward_std": 0.0017218545763171278, - "rewards/perpo_ocr_edit_distance_reward": 0.9857106804847717, + "advantages": -1.9993101886939257e-05, + "completion_length": 821.0, + "delta_ref_entropy_loss": 0.035400390625, + "delta_ref_ppl": -0.0458984375, + "entropy_loss": -0.01434326171875, + "epoch": 0.2038, + "grad_norm": 0.7720937420405768, + "k1_kl": 0.0458984375, + "k3_kl": 0.0289306640625, + "kimi_kl": 0.0869140625, + "learning_rate": 3.981e-07, + "loss": 0.0012, + "ppl": 0.0050048828125, + "reward": 0.9967616200447083, + "reward_std": 0.0003258985816501081, + "rewards/perpo_ocr_edit_distance_reward": 0.996761679649353, "step": 1019, "temperature": 0.9 }, { - "advantages": -2.7418137733548065e-06, - "completion_length": 504.5, - "delta_ref_entropy_loss": 0.0697021484375, - "delta_ref_ppl": -0.04827880859375, - "entropy_loss": -0.0828857421875, - "epoch": 0.408, - "grad_norm": 1.1716297796380737, - "k1_kl": 0.0479736328125, - "k3_kl": 0.0284423828125, - "kimi_kl": 0.08612060546875, - "learning_rate": 2.9599999999999995e-07, - "loss": 0.0011, - "ppl": 0.0430908203125, - "reward": 0.9233888387680054, - "reward_std": 0.02163826208561659, - "rewards/perpo_ocr_edit_distance_reward": 0.9233889281749725, + "advantages": -0.0005960464477539062, + "completion_length": 393.0, + "delta_ref_entropy_loss": 0.07421875, + "delta_ref_ppl": -0.08837890625, + "entropy_loss": -0.0169677734375, + "epoch": 0.204, + "grad_norm": 0.013606002515432555, + "k1_kl": 0.08837890625, + "k3_kl": 0.051025390625, + "kimi_kl": 0.17578125, + "learning_rate": 3.98e-07, + "loss": 0.0026, + "ppl": 0.0045166015625, + "reward": 0.9927988648414612, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.992798924446106, "step": 1020, "temperature": 0.9 }, { - "advantages": -0.00017816255558500416, - "completion_length": 447.0, - "delta_ref_entropy_loss": 0.0697021484375, - "delta_ref_ppl": -0.0699462890625, - "entropy_loss": -0.0673828125, - "epoch": 0.4084, - "grad_norm": 2.6398512294154868, - "k1_kl": 0.0701904296875, - "k3_kl": 0.046478271484375, - "kimi_kl": 0.118896484375, - "learning_rate": 2.958e-07, - "loss": 0.002, - "ppl": 0.03912353515625, - "reward": 0.9970566630363464, - "reward_std": 0.0030912339061615057, - "rewards/perpo_ocr_edit_distance_reward": 0.9970567524433136, + "advantages": -3.3889500627992675e-06, + "completion_length": 354.0, + "delta_ref_entropy_loss": 0.1640625, + "delta_ref_ppl": -0.1376953125, + "entropy_loss": -0.1748046875, + "epoch": 0.2042, + "grad_norm": 2.0327636803150195, + "k1_kl": 0.138671875, + "k3_kl": 0.08349609375, + "kimi_kl": 0.232421875, + "learning_rate": 3.979e-07, + "loss": 0.0033, + "ppl": 0.09033203125, + "reward": 0.7890377044677734, + "reward_std": 0.00496014766395092, + "rewards/perpo_ocr_edit_distance_reward": 0.789037823677063, "step": 1021, "temperature": 0.9 }, { - "advantages": -6.245715404418206e-06, - "completion_length": 240.0, - "delta_ref_entropy_loss": 0.0635986328125, - "delta_ref_ppl": -0.135986328125, - "entropy_loss": -0.1162109375, - "epoch": 0.4088, - "grad_norm": 2.346086854336411, - "k1_kl": 0.13623046875, - "k3_kl": 0.1103515625, - "kimi_kl": 0.5205078125, - "learning_rate": 2.9559999999999997e-07, - "loss": 0.0044, - "ppl": 0.067626953125, - "reward": 0.6093652248382568, - "reward_std": 0.05040693935006857, - "rewards/perpo_ocr_edit_distance_reward": 0.6093652248382568, + "advantages": -2.8525080324470764e-06, + "completion_length": 420.0, + "delta_ref_entropy_loss": 0.13671875, + "delta_ref_ppl": -0.1220703125, + "entropy_loss": -0.0771484375, + "epoch": 0.2044, + "grad_norm": 2.238572614324734, + "k1_kl": 0.12255859375, + "k3_kl": 0.06787109375, + "kimi_kl": 0.1669921875, + "learning_rate": 3.978e-07, + "loss": 0.0027, + "ppl": 0.045166015625, + "reward": 0.9283069372177124, + "reward_std": 0.03264608606696129, + "rewards/perpo_ocr_edit_distance_reward": 0.928307056427002, "step": 1022, "temperature": 0.9 }, { - "advantages": -6.546293207065901e-05, - "completion_length": 821.0, - "delta_ref_entropy_loss": 0.03375244140625, - "delta_ref_ppl": -0.03857421875, - "entropy_loss": -0.0462646484375, - "epoch": 0.4092, - "grad_norm": 0.9822221116442609, - "k1_kl": 0.0386962890625, - "k3_kl": 0.02545166015625, - "kimi_kl": 0.0699462890625, - "learning_rate": 2.9539999999999996e-07, - "loss": 0.0011, - "ppl": 0.020904541015625, - "reward": 0.9933995008468628, - "reward_std": 0.0042255144799128175, - "rewards/perpo_ocr_edit_distance_reward": 0.9933995306491852, + "advantages": 1.7029899268550253e-08, + "completion_length": 533.0, + "delta_ref_entropy_loss": 0.0654296875, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.037353515625, + "epoch": 0.2046, + "grad_norm": 1.1010466691754677, + "k1_kl": 0.06689453125, + "k3_kl": 0.040771484375, + "kimi_kl": 0.11962890625, + "learning_rate": 3.9769999999999997e-07, + "loss": 0.0016, + "ppl": 0.0174560546875, + "reward": 0.9938159584999084, + "reward_std": 0.0017245277995243669, + "rewards/perpo_ocr_edit_distance_reward": 0.9938160181045532, "step": 1023, "temperature": 0.9 }, { - "advantages": -2.0786055756616406e-05, - "completion_length": 254.5, - "delta_ref_entropy_loss": 0.06982421875, - "delta_ref_ppl": -0.12371826171875, - "entropy_loss": -0.0406494140625, - "epoch": 0.4096, - "grad_norm": 0.526565577987607, - "k1_kl": 0.1236572265625, - "k3_kl": 0.092681884765625, - "kimi_kl": 0.322265625, - "learning_rate": 2.952e-07, - "loss": 0.0037, - "ppl": 0.0164794921875, - "reward": 0.9985785186290741, - "reward_std": 0.0006668506539426744, - "rewards/perpo_ocr_edit_distance_reward": 0.9985785782337189, + "advantages": -7.289648056030273e-05, + "completion_length": 482.0, + "delta_ref_entropy_loss": 0.0419921875, + "delta_ref_ppl": -0.0576171875, + "entropy_loss": -0.0107421875, + "epoch": 0.2048, + "grad_norm": 0.5030705830733149, + "k1_kl": 0.057861328125, + "k3_kl": 0.041259765625, + "kimi_kl": 0.189453125, + "learning_rate": 3.976e-07, + "loss": 0.0017, + "ppl": 0.0029296875, + "reward": 0.956064760684967, + "reward_std": 0.0007175251375883818, + "rewards/perpo_ocr_edit_distance_reward": 0.9560648798942566, "step": 1024, "temperature": 0.9 }, { - "advantages": 3.8321530155371875e-05, - "completion_length": 459.0, - "delta_ref_entropy_loss": 0.068603515625, - "delta_ref_ppl": -0.06591796875, - "entropy_loss": -0.0330810546875, - "epoch": 0.41, - "grad_norm": 0.18341590740369318, - "k1_kl": 0.06591796875, - "k3_kl": 0.042266845703125, - "kimi_kl": 0.12017822265625, - "learning_rate": 2.95e-07, - "loss": 0.0017, - "ppl": 0.018280029296875, - "reward": 0.9996671080589294, - "reward_std": 0.00011663533950923011, - "rewards/perpo_ocr_edit_distance_reward": 0.9996671080589294, + "advantages": -2.16364878724562e-05, + "completion_length": 389.0, + "delta_ref_entropy_loss": 0.061279296875, + "delta_ref_ppl": -0.07373046875, + "entropy_loss": -0.03564453125, + "epoch": 0.205, + "grad_norm": 1.1234814543277278, + "k1_kl": 0.07373046875, + "k3_kl": 0.052001953125, + "kimi_kl": 0.1396484375, + "learning_rate": 3.975e-07, + "loss": 0.0021, + "ppl": 0.01397705078125, + "reward": 0.8690277338027954, + "reward_std": 0.0030459475237876177, + "rewards/perpo_ocr_edit_distance_reward": 0.869027853012085, "step": 1025, "temperature": 0.9 }, { - "advantages": -0.00029861927032470703, - "completion_length": 462.5, - "delta_ref_entropy_loss": 0.0404052734375, - "delta_ref_ppl": -0.042724609375, - "entropy_loss": -0.03759765625, - "epoch": 0.4104, - "grad_norm": 0.9806403319693455, - "k1_kl": 0.042724609375, - "k3_kl": 0.0296630859375, - "kimi_kl": 0.09808349609375, - "learning_rate": 2.948e-07, - "loss": 0.0015, - "ppl": 0.021942138671875, - "reward": 0.9281362891197205, - "reward_std": 0.028434500098228455, - "rewards/perpo_ocr_edit_distance_reward": 0.92813640832901, + "advantages": -0.00022795371478423476, + "completion_length": 559.0, + "delta_ref_entropy_loss": 0.07861328125, + "delta_ref_ppl": -0.06640625, + "entropy_loss": -0.023193359375, + "epoch": 0.2052, + "grad_norm": 0.2563106222162385, + "k1_kl": 0.06640625, + "k3_kl": 0.0361328125, + "kimi_kl": 0.0947265625, + "learning_rate": 3.9739999999999995e-07, + "loss": 0.0017, + "ppl": 0.0062255859375, + "reward": 0.9828910827636719, + "reward_std": 0.00016148353461176157, + "rewards/perpo_ocr_edit_distance_reward": 0.9828911423683167, "step": 1026, "temperature": 0.9 }, { - "advantages": -9.655952567300119e-06, - "completion_length": 541.0, - "delta_ref_entropy_loss": 0.07275390625, - "delta_ref_ppl": -0.0543212890625, - "entropy_loss": -0.0859375, - "epoch": 0.4108, - "grad_norm": 10.52652502977859, - "k1_kl": 0.05419921875, - "k3_kl": 0.0355224609375, - "kimi_kl": 0.1298828125, - "learning_rate": 2.9459999999999995e-07, - "loss": 0.0014, - "ppl": 0.0482177734375, - "reward": 0.9179910719394684, - "reward_std": 0.002555926621425897, - "rewards/perpo_ocr_edit_distance_reward": 0.9179911315441132, + "advantages": -1.0831015970325097e-05, + "completion_length": 194.0, + "delta_ref_entropy_loss": 0.1435546875, + "delta_ref_ppl": -0.1591796875, + "entropy_loss": -0.044189453125, + "epoch": 0.2054, + "grad_norm": 1.1427165041092866, + "k1_kl": 0.1591796875, + "k3_kl": 0.1064453125, + "kimi_kl": 0.43359375, + "learning_rate": 3.973e-07, + "loss": 0.0043, + "ppl": 0.0185546875, + "reward": 0.9917476177215576, + "reward_std": 0.0006861609290353954, + "rewards/perpo_ocr_edit_distance_reward": 0.9917476773262024, "step": 1027, "temperature": 0.9 }, { - "advantages": -3.5166741554348846e-06, - "completion_length": 535.5, - "delta_ref_entropy_loss": 0.0615234375, - "delta_ref_ppl": -0.04638671875, - "entropy_loss": -0.062744140625, - "epoch": 0.4112, - "grad_norm": 1.1559606388872192, - "k1_kl": 0.04644775390625, - "k3_kl": 0.02850341796875, - "kimi_kl": 0.0709228515625, - "learning_rate": 2.944e-07, - "loss": 0.0011, - "ppl": 0.0327301025390625, - "reward": 0.9500490427017212, - "reward_std": 0.02495174016803503, - "rewards/perpo_ocr_edit_distance_reward": 0.9500491619110107, + "advantages": -8.475781214656308e-05, + "completion_length": 301.0, + "delta_ref_entropy_loss": 0.1083984375, + "delta_ref_ppl": -0.10986328125, + "entropy_loss": -0.02294921875, + "epoch": 0.2056, + "grad_norm": 0.7230804632216016, + "k1_kl": 0.1103515625, + "k3_kl": 0.0771484375, + "kimi_kl": 0.23046875, + "learning_rate": 3.972e-07, + "loss": 0.0032, + "ppl": 0.009521484375, + "reward": 0.9927979707717896, + "reward_std": 0.00040220090886577964, + "rewards/perpo_ocr_edit_distance_reward": 0.9927980899810791, "step": 1028, "temperature": 0.9 }, { - "advantages": -7.60150869609788e-05, - "completion_length": 473.5, - "delta_ref_entropy_loss": 0.11883544921875, - "delta_ref_ppl": -0.242401123046875, - "entropy_loss": -0.08734130859375, - "epoch": 0.4116, - "grad_norm": 0.20433242842713803, - "k1_kl": 0.242431640625, - "k3_kl": 0.1782073974609375, - "kimi_kl": 0.581146240234375, - "learning_rate": 2.9420000000000003e-07, - "loss": 0.0072, - "ppl": 0.0389404296875, - "reward": 0.9996822774410248, - "reward_std": 0.00011801698565250263, - "rewards/perpo_ocr_edit_distance_reward": 0.9996823370456696, + "advantages": -7.288796950888354e-06, + "completion_length": 994.0, + "delta_ref_entropy_loss": 0.0771484375, + "delta_ref_ppl": -0.05322265625, + "entropy_loss": -0.0341796875, + "epoch": 0.2058, + "grad_norm": 2.9054499540420466, + "k1_kl": 0.05322265625, + "k3_kl": 0.029296875, + "kimi_kl": 0.0712890625, + "learning_rate": 3.971e-07, + "loss": 0.0012, + "ppl": 0.01611328125, + "reward": 0.9874773621559143, + "reward_std": 0.005759583320468664, + "rewards/perpo_ocr_edit_distance_reward": 0.9874773621559143, "step": 1029, "temperature": 0.9 }, { - "advantages": -7.30167575966334e-05, - "completion_length": 481.0, - "delta_ref_entropy_loss": 0.07684326171875, - "delta_ref_ppl": -0.0623779296875, - "entropy_loss": -0.06781005859375, - "epoch": 0.412, - "grad_norm": 1.0352504431136536, - "k1_kl": 0.0623779296875, - "k3_kl": 0.03704833984375, - "kimi_kl": 0.101806640625, - "learning_rate": 2.9399999999999996e-07, - "loss": 0.0016, - "ppl": 0.03472900390625, - "reward": 0.9772048890590668, - "reward_std": 0.0011583173472899944, - "rewards/perpo_ocr_edit_distance_reward": 0.9772050380706787, + "advantages": -1.5224730304908007e-05, + "completion_length": 286.0, + "delta_ref_entropy_loss": 0.11572265625, + "delta_ref_ppl": -0.1357421875, + "entropy_loss": -0.037841796875, + "epoch": 0.206, + "grad_norm": 1.5391733277225945, + "k1_kl": 0.13671875, + "k3_kl": 0.09228515625, + "kimi_kl": 0.3359375, + "learning_rate": 3.97e-07, + "loss": 0.0037, + "ppl": 0.01806640625, + "reward": 0.9854745268821716, + "reward_std": 0.0032598664984107018, + "rewards/perpo_ocr_edit_distance_reward": 0.9854745864868164, "step": 1030, "temperature": 0.9 }, { - "advantages": -0.00014829210289235562, - "completion_length": 591.0, - "delta_ref_entropy_loss": 0.08721923828125, - "delta_ref_ppl": -0.05621337890625, - "entropy_loss": -0.1134033203125, - "epoch": 0.4124, - "grad_norm": 5.47103578977593, - "k1_kl": 0.05645751953125, - "k3_kl": 0.060394287109375, - "kimi_kl": 0.0885009765625, - "learning_rate": 2.938e-07, - "loss": 0.0026, - "ppl": 0.064605712890625, - "reward": 0.8860781192779541, - "reward_std": 0.011346216677338816, - "rewards/perpo_ocr_edit_distance_reward": 0.8860782384872437, + "advantages": -1.576968679728452e-05, + "completion_length": 357.0, + "delta_ref_entropy_loss": 0.0703125, + "delta_ref_ppl": -0.06201171875, + "entropy_loss": -0.01373291015625, + "epoch": 0.2062, + "grad_norm": 0.48507382832826185, + "k1_kl": 0.0615234375, + "k3_kl": 0.0322265625, + "kimi_kl": 0.07275390625, + "learning_rate": 3.9689999999999996e-07, + "loss": 0.0013, + "ppl": 0.0048828125, + "reward": 0.9975053071975708, + "reward_std": 0.0004400172911118716, + "rewards/perpo_ocr_edit_distance_reward": 0.9975054264068604, "step": 1031, "temperature": 0.9 }, { - "advantages": -6.029164592291636e-05, - "completion_length": 489.0, - "delta_ref_entropy_loss": 0.0872802734375, - "delta_ref_ppl": -0.051513671875, - "entropy_loss": -0.1064453125, - "epoch": 0.4128, - "grad_norm": 2.03974856536149, - "k1_kl": 0.0517578125, - "k3_kl": 0.0264892578125, - "kimi_kl": 0.060302734375, - "learning_rate": 2.9360000000000003e-07, - "loss": 0.0011, - "ppl": 0.05572509765625, - "reward": 0.9545745849609375, - "reward_std": 0.015731562176370062, - "rewards/perpo_ocr_edit_distance_reward": 0.9545746743679047, + "advantages": 0.0, + "completion_length": 34.0, + "delta_ref_entropy_loss": 0.201171875, + "delta_ref_ppl": -0.765625, + "entropy_loss": -0.03369140625, + "epoch": 0.2064, + "grad_norm": 0.02213115735415483, + "k1_kl": 0.76171875, + "k3_kl": 0.6171875, + "kimi_kl": 2.59375, + "learning_rate": 3.9679999999999995e-07, + "loss": 0.0247, + "ppl": 0.005645751953125, + "reward": 0.8295454382896423, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.8295454382896423, "step": 1032, "temperature": 0.9 }, { - "advantages": -6.377697388870729e-06, - "completion_length": 338.5, - "delta_ref_entropy_loss": 0.0631103515625, - "delta_ref_ppl": -0.07177734375, - "entropy_loss": -0.0526123046875, - "epoch": 0.4132, - "grad_norm": 1.2644909430133038, - "k1_kl": 0.072021484375, - "k3_kl": 0.046630859375, - "kimi_kl": 0.13623046875, - "learning_rate": 2.9339999999999997e-07, - "loss": 0.0019, - "ppl": 0.02825927734375, - "reward": 0.9571170210838318, - "reward_std": 0.022390968922991306, - "rewards/perpo_ocr_edit_distance_reward": 0.9571170508861542, + "advantages": -6.726810397594818e-07, + "completion_length": 947.0, + "delta_ref_entropy_loss": 0.09130859375, + "delta_ref_ppl": -0.0673828125, + "entropy_loss": -0.11376953125, + "epoch": 0.2066, + "grad_norm": 3.938046238312207, + "k1_kl": 0.0673828125, + "k3_kl": 0.03955078125, + "kimi_kl": 0.072265625, + "learning_rate": 3.967e-07, + "loss": 0.0016, + "ppl": 0.052734375, + "reward": 0.5576093792915344, + "reward_std": 0.06562750041484833, + "rewards/perpo_ocr_edit_distance_reward": 0.5576093792915344, "step": 1033, "temperature": 0.9 }, { - "advantages": 5.066394805908203e-07, - "completion_length": 234.0, - "delta_ref_entropy_loss": 0.0513916015625, - "delta_ref_ppl": -0.07623291015625, - "entropy_loss": -0.026458740234375, - "epoch": 0.4136, - "grad_norm": 0.9053648940217914, - "k1_kl": 0.0767822265625, - "k3_kl": 0.053436279296875, - "kimi_kl": 0.2828369140625, - "learning_rate": 2.932e-07, - "loss": 0.0021, - "ppl": 0.0123748779296875, - "reward": 0.8199935555458069, - "reward_std": 0.0009607465763110667, - "rewards/perpo_ocr_edit_distance_reward": 0.8199935853481293, + "advantages": -7.459095741069177e-06, + "completion_length": 90.0, + "delta_ref_entropy_loss": 0.04150390625, + "delta_ref_ppl": -0.25390625, + "entropy_loss": -0.0849609375, + "epoch": 0.2068, + "grad_norm": 4.249683510003986, + "k1_kl": 0.25390625, + "k3_kl": 0.197265625, + "kimi_kl": 0.78125, + "learning_rate": 3.966e-07, + "loss": 0.0079, + "ppl": 0.042724609375, + "reward": 0.8296974301338196, + "reward_std": 0.003313272725790739, + "rewards/perpo_ocr_edit_distance_reward": 0.8296974301338196, "step": 1034, "temperature": 0.9 }, { - "advantages": -1.8588134480523877e-05, - "completion_length": 706.0, - "delta_ref_entropy_loss": 0.0408935546875, - "delta_ref_ppl": -0.0582275390625, - "entropy_loss": -0.036865234375, - "epoch": 0.414, - "grad_norm": 0.309984343651253, - "k1_kl": 0.0582275390625, - "k3_kl": 0.04046630859375, - "kimi_kl": 0.153076171875, - "learning_rate": 2.93e-07, - "loss": 0.0016, - "ppl": 0.019683837890625, - "reward": 0.9921649098396301, - "reward_std": 0.0010949841234833002, - "rewards/perpo_ocr_edit_distance_reward": 0.9921649694442749, + "advantages": -1.8392290712654358e-06, + "completion_length": 1619.0, + "delta_ref_entropy_loss": 0.02587890625, + "delta_ref_ppl": -0.0322265625, + "entropy_loss": -0.045654296875, + "epoch": 0.207, + "grad_norm": 15.857141008834187, + "k1_kl": 0.0322265625, + "k3_kl": 0.02001953125, + "kimi_kl": 0.04931640625, + "learning_rate": 3.965e-07, + "loss": 0.0008, + "ppl": 0.0299072265625, + "reward": 0.9779382348060608, + "reward_std": 0.041740719228982925, + "rewards/perpo_ocr_edit_distance_reward": 0.9779384136199951, "step": 1035, "temperature": 0.9 }, { - "advantages": -8.766566043050261e-05, - "completion_length": 649.5, - "delta_ref_entropy_loss": 0.057861328125, - "delta_ref_ppl": -0.0458984375, - "entropy_loss": -0.046478271484375, - "epoch": 0.4144, - "grad_norm": 0.6573102346841732, - "k1_kl": 0.04595947265625, - "k3_kl": 0.02728271484375, - "kimi_kl": 0.0772705078125, - "learning_rate": 2.928e-07, - "loss": 0.0012, - "ppl": 0.0225067138671875, - "reward": 0.9463482201099396, - "reward_std": 0.0016218741293414496, - "rewards/perpo_ocr_edit_distance_reward": 0.9463482797145844, + "advantages": 0.0, + "completion_length": 36.0, + "delta_ref_entropy_loss": 0.30078125, + "delta_ref_ppl": -0.55078125, + "entropy_loss": -0.060302734375, + "epoch": 0.2072, + "grad_norm": 0.0899918669662859, + "k1_kl": 0.546875, + "k3_kl": 0.3828125, + "kimi_kl": 1.09375, + "learning_rate": 3.964e-07, + "loss": 0.0153, + "ppl": 0.01019287109375, + "reward": 0.8986486196517944, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.8986486196517944, "step": 1036, "temperature": 0.9 }, { - "advantages": -9.70704263636435e-07, - "completion_length": 991.0, - "delta_ref_entropy_loss": 0.0274658203125, - "delta_ref_ppl": -0.0316162109375, - "entropy_loss": -0.0302734375, - "epoch": 0.4148, - "grad_norm": 1.0990225027998408, - "k1_kl": 0.0316162109375, - "k3_kl": 0.0219268798828125, - "kimi_kl": 0.081512451171875, - "learning_rate": 2.926e-07, - "loss": 0.0009, - "ppl": 0.016082763671875, - "reward": 0.9863583743572235, - "reward_std": 0.01767222210764885, - "rewards/perpo_ocr_edit_distance_reward": 0.9863584041595459, + "advantages": -1.7029899268550253e-08, + "completion_length": 996.0, + "delta_ref_entropy_loss": 0.0595703125, + "delta_ref_ppl": -0.05029296875, + "entropy_loss": -0.03173828125, + "epoch": 0.2074, + "grad_norm": 0.36759985841847365, + "k1_kl": 0.05029296875, + "k3_kl": 0.0269775390625, + "kimi_kl": 0.06689453125, + "learning_rate": 3.9629999999999997e-07, + "loss": 0.0011, + "ppl": 0.01263427734375, + "reward": 0.9964096546173096, + "reward_std": 0.0003468406794127077, + "rewards/perpo_ocr_edit_distance_reward": 0.9964097142219543, "step": 1037, "temperature": 0.9 }, { - "advantages": 7.459095670014904e-06, - "completion_length": 181.0, - "delta_ref_entropy_loss": 0.10955810546875, - "delta_ref_ppl": -0.00970458984375, - "entropy_loss": -0.155517578125, - "epoch": 0.4152, - "grad_norm": 11.939961189387235, - "k1_kl": 0.009765625, - "k3_kl": 0.694580078125, - "kimi_kl": 0.330078125, - "learning_rate": 2.924e-07, - "loss": 0.0278, - "ppl": 0.14764404296875, - "reward": 0.8112952411174774, - "reward_std": 0.21639748354209587, - "rewards/perpo_ocr_edit_distance_reward": 0.8112952709197998, + "advantages": -5.601985321845859e-05, + "completion_length": 818.0, + "delta_ref_entropy_loss": 0.030517578125, + "delta_ref_ppl": -0.033935546875, + "entropy_loss": -0.0179443359375, + "epoch": 0.2076, + "grad_norm": 0.6282545774059817, + "k1_kl": 0.033935546875, + "k3_kl": 0.019775390625, + "kimi_kl": 0.050048828125, + "learning_rate": 3.9619999999999996e-07, + "loss": 0.0008, + "ppl": 0.0089111328125, + "reward": 0.9962009787559509, + "reward_std": 0.0008120395359583199, + "rewards/perpo_ocr_edit_distance_reward": 0.9962010383605957, "step": 1038, "temperature": 0.9 }, { - "advantages": -1.9754683762585046e-05, - "completion_length": 750.0, - "delta_ref_entropy_loss": 0.0360107421875, - "delta_ref_ppl": -0.0618896484375, - "entropy_loss": -0.05078125, - "epoch": 0.4156, - "grad_norm": 1.2863056378213824, - "k1_kl": 0.0618896484375, - "k3_kl": 0.0460205078125, - "kimi_kl": 0.16162109375, - "learning_rate": 2.922e-07, - "loss": 0.0019, - "ppl": 0.02520751953125, - "reward": 0.9897417724132538, - "reward_std": 0.003235402749851346, - "rewards/perpo_ocr_edit_distance_reward": 0.989741861820221, - "step": 1039, - "temperature": 0.9 - }, - { - "advantages": 2.6472977879166137e-05, - "completion_length": 726.0, - "delta_ref_entropy_loss": 0.03662109375, - "delta_ref_ppl": -0.035888671875, - "entropy_loss": -0.0323486328125, - "epoch": 0.416, - "grad_norm": 1.3054252933214243, - "k1_kl": 0.035888671875, - "k3_kl": 0.023193359375, - "kimi_kl": 0.065185546875, - "learning_rate": 2.9199999999999997e-07, - "loss": 0.0009, - "ppl": 0.016448974609375, - "reward": 0.9921243488788605, - "reward_std": 0.0014758456673007458, - "rewards/perpo_ocr_edit_distance_reward": 0.9921243786811829, + "advantages": -0.0005960464477539062, + "completion_length": 677.0, + "delta_ref_entropy_loss": 0.0771484375, + "delta_ref_ppl": -0.06494140625, + "entropy_loss": -0.0103759765625, + "epoch": 0.2078, + "grad_norm": 0.0036827049757117794, + "k1_kl": 0.06494140625, + "k3_kl": 0.038330078125, + "kimi_kl": 0.142578125, + "learning_rate": 3.961e-07, + "loss": 0.0021, + "ppl": 0.001861572265625, + "reward": 0.9834237694740295, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9834238290786743, + "step": 1039, + "temperature": 0.9 + }, + { + "advantages": -8.30207568469632e-07, + "completion_length": 1103.0, + "delta_ref_entropy_loss": 0.0306396484375, + "delta_ref_ppl": -0.042724609375, + "entropy_loss": -0.0625, + "epoch": 0.208, + "grad_norm": 1.0341572241850032, + "k1_kl": 0.04248046875, + "k3_kl": 0.0286865234375, + "kimi_kl": 0.076171875, + "learning_rate": 3.96e-07, + "loss": 0.0012, + "ppl": 0.03271484375, + "reward": 0.8502388000488281, + "reward_std": 0.07093478739261627, + "rewards/perpo_ocr_edit_distance_reward": 0.8502388596534729, "step": 1040, "temperature": 0.9 }, { - "advantages": -4.180840278422693e-05, - "completion_length": 662.0, - "delta_ref_entropy_loss": 0.01922607421875, - "delta_ref_ppl": -0.02642822265625, - "entropy_loss": -0.0177001953125, - "epoch": 0.4164, - "grad_norm": 0.6820723430829692, - "k1_kl": 0.02642822265625, - "k3_kl": 0.017364501953125, - "kimi_kl": 0.0540771484375, - "learning_rate": 2.918e-07, - "loss": 0.0007, - "ppl": 0.009735107421875, - "reward": 0.9935475885868073, - "reward_std": 0.0011207265342818573, - "rewards/perpo_ocr_edit_distance_reward": 0.9935475885868073, + "advantages": -2.2138868871479644e-07, + "completion_length": 471.0, + "delta_ref_entropy_loss": 0.130859375, + "delta_ref_ppl": -0.1298828125, + "entropy_loss": -0.2412109375, + "epoch": 0.2082, + "grad_norm": 2.7743715984082074, + "k1_kl": 0.12890625, + "k3_kl": 0.0859375, + "kimi_kl": 0.2734375, + "learning_rate": 3.9589999999999994e-07, + "loss": 0.0034, + "ppl": 0.12109375, + "reward": 0.7293691039085388, + "reward_std": 0.07567644864320755, + "rewards/perpo_ocr_edit_distance_reward": 0.7293691039085388, "step": 1041, "temperature": 0.9 }, { - "advantages": -4.4103179789090063e-05, - "completion_length": 511.0, - "delta_ref_entropy_loss": 0.026214599609375, - "delta_ref_ppl": -0.022735595703125, - "entropy_loss": -0.039306640625, - "epoch": 0.4168, - "grad_norm": 1.0763069482396805, - "k1_kl": 0.022857666015625, - "k3_kl": 0.013671875, - "kimi_kl": 0.032806396484375, - "learning_rate": 2.916e-07, - "loss": 0.0006, - "ppl": 0.022308349609375, - "reward": 0.9945041537284851, - "reward_std": 0.0008639208972454071, - "rewards/perpo_ocr_edit_distance_reward": 0.9945042133331299, + "advantages": 0.0, + "completion_length": 447.0, + "delta_ref_entropy_loss": 0.05908203125, + "delta_ref_ppl": -0.061279296875, + "entropy_loss": -0.01312255859375, + "epoch": 0.2084, + "grad_norm": 0.01586397488264622, + "k1_kl": 0.061279296875, + "k3_kl": 0.0400390625, + "kimi_kl": 0.134765625, + "learning_rate": 3.958e-07, + "loss": 0.0016, + "ppl": 0.005950927734375, + "reward": 0.992416501045227, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9924165606498718, "step": 1042, "temperature": 0.9 }, { - "advantages": -4.427773774295929e-07, - "completion_length": 582.5, - "delta_ref_entropy_loss": 0.0927734375, - "delta_ref_ppl": -0.110595703125, - "entropy_loss": -0.15478515625, - "epoch": 0.4172, - "grad_norm": 4.60050822362895, - "k1_kl": 0.110595703125, - "k3_kl": 0.0794677734375, - "kimi_kl": 0.270263671875, - "learning_rate": 2.914e-07, - "loss": 0.0032, - "ppl": 0.0869140625, - "reward": 0.6478443741798401, - "reward_std": 0.09887131489813328, - "rewards/perpo_ocr_edit_distance_reward": 0.6478444337844849, + "advantages": -0.0005960464477539062, + "completion_length": 97.0, + "delta_ref_entropy_loss": 0.1005859375, + "delta_ref_ppl": -0.2578125, + "entropy_loss": -0.018310546875, + "epoch": 0.2086, + "grad_norm": 0.015376387779989759, + "k1_kl": 0.2578125, + "k3_kl": 0.1962890625, + "kimi_kl": 0.77734375, + "learning_rate": 3.957e-07, + "loss": 0.0084, + "ppl": 0.002593994140625, + "reward": 0.9876033067703247, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9876033663749695, "step": 1043, "temperature": 0.9 }, { - "advantages": -1.7029899268550253e-08, - "completion_length": 613.0, - "delta_ref_entropy_loss": 0.0390625, - "delta_ref_ppl": -0.02789306640625, - "entropy_loss": -0.025360107421875, - "epoch": 0.4176, - "grad_norm": 0.274048324719953, - "k1_kl": 0.02789306640625, - "k3_kl": 0.016510009765625, - "kimi_kl": 0.0521240234375, - "learning_rate": 2.912e-07, - "loss": 0.0007, - "ppl": 0.01409912109375, - "reward": 0.9950443208217621, - "reward_std": 0.0020647156052291393, - "rewards/perpo_ocr_edit_distance_reward": 0.9950443208217621, + "advantages": -8.310590601467993e-06, + "completion_length": 1184.0, + "delta_ref_entropy_loss": 0.06298828125, + "delta_ref_ppl": -0.051513671875, + "entropy_loss": -0.037109375, + "epoch": 0.2088, + "grad_norm": 3.8986466953217147, + "k1_kl": 0.051513671875, + "k3_kl": 0.0267333984375, + "kimi_kl": 0.056396484375, + "learning_rate": 3.9559999999999997e-07, + "loss": 0.0011, + "ppl": 0.019287109375, + "reward": 0.9921896457672119, + "reward_std": 0.000920991413295269, + "rewards/perpo_ocr_edit_distance_reward": 0.9921897053718567, "step": 1044, "temperature": 0.9 }, { - "advantages": -2.1342721993278246e-05, - "completion_length": 509.5, - "delta_ref_entropy_loss": 0.0452880859375, - "delta_ref_ppl": -0.0318603515625, - "entropy_loss": -0.05364990234375, - "epoch": 0.418, - "grad_norm": 1.8854391634750236, - "k1_kl": 0.031982421875, - "k3_kl": 0.02081298828125, - "kimi_kl": 0.05377197265625, - "learning_rate": 2.9099999999999995e-07, - "loss": 0.0009, - "ppl": 0.029296875, - "reward": 0.9954390227794647, - "reward_std": 0.0014282560150604695, - "rewards/perpo_ocr_edit_distance_reward": 0.9954390823841095, + "advantages": -7.370540697593242e-05, + "completion_length": 599.0, + "delta_ref_entropy_loss": 0.08984375, + "delta_ref_ppl": -0.083984375, + "entropy_loss": -0.0311279296875, + "epoch": 0.209, + "grad_norm": 0.6796195078129804, + "k1_kl": 0.083984375, + "k3_kl": 0.044677734375, + "kimi_kl": 0.126953125, + "learning_rate": 3.955e-07, + "loss": 0.0019, + "ppl": 0.0159912109375, + "reward": 0.9655437469482422, + "reward_std": 0.0009398451657034457, + "rewards/perpo_ocr_edit_distance_reward": 0.9655438661575317, "step": 1045, "temperature": 0.9 }, { - "advantages": -1.1912414265680127e-05, - "completion_length": 367.5, - "delta_ref_entropy_loss": 0.037841796875, - "delta_ref_ppl": -0.0484619140625, - "entropy_loss": -0.017852783203125, - "epoch": 0.4184, - "grad_norm": 0.14442039274407267, - "k1_kl": 0.04833984375, - "k3_kl": 0.03253173828125, - "kimi_kl": 0.11181640625, - "learning_rate": 2.908e-07, - "loss": 0.0013, - "ppl": 0.007476806640625, - "reward": 0.9998117387294769, - "reward_std": 0.00012860984134022146, - "rewards/perpo_ocr_edit_distance_reward": 0.9998117387294769, + "advantages": -1.7361982827424072e-05, + "completion_length": 932.0, + "delta_ref_entropy_loss": 0.06982421875, + "delta_ref_ppl": -0.08154296875, + "entropy_loss": -0.07568359375, + "epoch": 0.2092, + "grad_norm": 2.6695568063639428, + "k1_kl": 0.08154296875, + "k3_kl": 0.047607421875, + "kimi_kl": 0.126953125, + "learning_rate": 3.9539999999999995e-07, + "loss": 0.0019, + "ppl": 0.0439453125, + "reward": 0.7948551177978516, + "reward_std": 0.0018595498986542225, + "rewards/perpo_ocr_edit_distance_reward": 0.7948551774024963, "step": 1046, "temperature": 0.9 }, { - "advantages": -1.3806990409648279e-05, - "completion_length": 660.5, - "delta_ref_entropy_loss": 0.06341552734375, - "delta_ref_ppl": -0.061279296875, - "entropy_loss": -0.07757568359375, - "epoch": 0.4188, - "grad_norm": 1.0219663896294207, - "k1_kl": 0.061279296875, - "k3_kl": 0.04302978515625, - "kimi_kl": 0.1119384765625, - "learning_rate": 2.906e-07, - "loss": 0.0017, - "ppl": 0.04449462890625, - "reward": 0.9696392118930817, - "reward_std": 0.002752375148702413, - "rewards/perpo_ocr_edit_distance_reward": 0.9696393311023712, + "advantages": -6.949475937290117e-05, + "completion_length": 439.0, + "delta_ref_entropy_loss": 0.07958984375, + "delta_ref_ppl": -0.076171875, + "entropy_loss": -0.0458984375, + "epoch": 0.2094, + "grad_norm": 1.1905851312190716, + "k1_kl": 0.076171875, + "k3_kl": 0.045654296875, + "kimi_kl": 0.1474609375, + "learning_rate": 3.9529999999999995e-07, + "loss": 0.0019, + "ppl": 0.022216796875, + "reward": 0.982483446598053, + "reward_std": 0.0007579560624435544, + "rewards/perpo_ocr_edit_distance_reward": 0.9824835062026978, "step": 1047, "temperature": 0.9 }, { - "advantages": -1.269579001927923e-05, - "completion_length": 799.5, - "delta_ref_entropy_loss": 0.03521728515625, - "delta_ref_ppl": -0.0242156982421875, - "entropy_loss": -0.05120849609375, - "epoch": 0.4192, - "grad_norm": 0.7313963785294999, - "k1_kl": 0.024200439453125, - "k3_kl": 0.0152587890625, - "kimi_kl": 0.036041259765625, - "learning_rate": 2.9039999999999995e-07, - "loss": 0.0006, - "ppl": 0.03033447265625, - "reward": 0.9597012102603912, - "reward_std": 0.008542855881387368, - "rewards/perpo_ocr_edit_distance_reward": 0.959701269865036, + "advantages": -5.79016568735824e-06, + "completion_length": 823.0, + "delta_ref_entropy_loss": 0.1025390625, + "delta_ref_ppl": -0.0751953125, + "entropy_loss": -0.037109375, + "epoch": 0.2096, + "grad_norm": 1.0950481661512788, + "k1_kl": 0.0751953125, + "k3_kl": 0.0400390625, + "kimi_kl": 0.1005859375, + "learning_rate": 3.952e-07, + "loss": 0.0016, + "ppl": 0.0172119140625, + "reward": 0.948798656463623, + "reward_std": 0.017486317083239555, + "rewards/perpo_ocr_edit_distance_reward": 0.9487987756729126, "step": 1048, "temperature": 0.9 }, { - "advantages": -1.6093255226223846e-06, - "completion_length": 429.5, - "delta_ref_entropy_loss": 0.04266357421875, - "delta_ref_ppl": -0.0440673828125, - "entropy_loss": -0.0421142578125, - "epoch": 0.4196, - "grad_norm": 1.5475867601577575, - "k1_kl": 0.04425048828125, - "k3_kl": 0.025634765625, - "kimi_kl": 0.06170654296875, - "learning_rate": 2.902e-07, - "loss": 0.001, - "ppl": 0.022003173828125, - "reward": 0.9898702502250671, - "reward_std": 0.017220642417669296, - "rewards/perpo_ocr_edit_distance_reward": 0.9898703098297119, + "advantages": 6.5394815464969724e-06, + "completion_length": 1445.0, + "delta_ref_entropy_loss": 0.030517578125, + "delta_ref_ppl": -0.03369140625, + "entropy_loss": -0.039794921875, + "epoch": 0.2098, + "grad_norm": 0.6483773179544685, + "k1_kl": 0.033447265625, + "k3_kl": 0.021240234375, + "kimi_kl": 0.048828125, + "learning_rate": 3.951e-07, + "loss": 0.0008, + "ppl": 0.018798828125, + "reward": 0.9939883351325989, + "reward_std": 0.0038013411685824394, + "rewards/perpo_ocr_edit_distance_reward": 0.9939883351325989, "step": 1049, "temperature": 0.9 }, { - "advantages": -5.251595393929165e-05, - "completion_length": 1112.0, - "delta_ref_entropy_loss": 0.017333984375, - "delta_ref_ppl": -0.011199951171875, - "entropy_loss": -0.02935791015625, - "epoch": 0.42, - "grad_norm": 0.45233276837725606, - "k1_kl": 0.01123046875, - "k3_kl": 0.00714111328125, - "kimi_kl": 0.01513671875, - "learning_rate": 2.9e-07, - "loss": 0.0003, - "ppl": 0.015289306640625, - "reward": 0.9978002607822418, - "reward_std": 0.00023827970289858058, - "rewards/perpo_ocr_edit_distance_reward": 0.9978002905845642, + "advantages": -1.2261527899681823e-06, + "completion_length": 1218.0, + "delta_ref_entropy_loss": 0.087890625, + "delta_ref_ppl": -0.05859375, + "entropy_loss": -0.0791015625, + "epoch": 0.21, + "grad_norm": 181.6580618287874, + "k1_kl": 0.058349609375, + "k3_kl": 0.1455078125, + "kimi_kl": 0.080078125, + "learning_rate": 3.95e-07, + "loss": 0.0058, + "ppl": 0.048583984375, + "reward": 0.9667728543281555, + "reward_std": 0.006861220579594374, + "rewards/perpo_ocr_edit_distance_reward": 0.9667729139328003, "step": 1050, "temperature": 0.9 }, { - "advantages": -2.906152303694398e-05, - "completion_length": 598.5, - "delta_ref_entropy_loss": 0.03155517578125, - "delta_ref_ppl": -0.01898193359375, - "entropy_loss": -0.0269775390625, - "epoch": 0.4204, - "grad_norm": 0.5387963403864082, - "k1_kl": 0.01898193359375, - "k3_kl": 0.0111846923828125, - "kimi_kl": 0.03082275390625, - "learning_rate": 2.898e-07, - "loss": 0.0005, - "ppl": 0.015228271484375, - "reward": 0.9990407228469849, - "reward_std": 0.0010129196743946522, - "rewards/perpo_ocr_edit_distance_reward": 0.9990407526493073, + "advantages": -1.6399793821619824e-05, + "completion_length": 626.0, + "delta_ref_entropy_loss": 0.032470703125, + "delta_ref_ppl": -0.041015625, + "entropy_loss": -0.0179443359375, + "epoch": 0.2102, + "grad_norm": 0.37639593146200884, + "k1_kl": 0.041015625, + "k3_kl": 0.025390625, + "kimi_kl": 0.0654296875, + "learning_rate": 3.9489999999999997e-07, + "loss": 0.001, + "ppl": 0.007232666015625, + "reward": 0.9927266240119934, + "reward_std": 0.0019736846443265676, + "rewards/perpo_ocr_edit_distance_reward": 0.9927266240119934, "step": 1051, "temperature": 0.9 }, { - "advantages": -2.1627972399684836e-06, - "completion_length": 282.0, - "delta_ref_entropy_loss": 0.0225830078125, - "delta_ref_ppl": -0.0382080078125, - "entropy_loss": -0.02764892578125, - "epoch": 0.4208, - "grad_norm": 2.650708781242906, - "k1_kl": 0.0382080078125, - "k3_kl": 0.0257568359375, - "kimi_kl": 0.055419921875, - "learning_rate": 2.896e-07, - "loss": 0.001, - "ppl": 0.01458740234375, - "reward": 0.9983400404453278, - "reward_std": 0.002170563442632556, - "rewards/perpo_ocr_edit_distance_reward": 0.9983400702476501, + "advantages": -8.516652451362461e-05, + "completion_length": 843.0, + "delta_ref_entropy_loss": 0.0537109375, + "delta_ref_ppl": -0.0400390625, + "entropy_loss": -0.0458984375, + "epoch": 0.2104, + "grad_norm": 1.0873447304449952, + "k1_kl": 0.040283203125, + "k3_kl": 0.0264892578125, + "kimi_kl": 0.05224609375, + "learning_rate": 3.9479999999999996e-07, + "loss": 0.0011, + "ppl": 0.0247802734375, + "reward": 0.9867292046546936, + "reward_std": 0.0007996479980647564, + "rewards/perpo_ocr_edit_distance_reward": 0.9867292046546936, "step": 1052, "temperature": 0.9 }, { - "advantages": -7.241964794957312e-06, - "completion_length": 363.0, - "delta_ref_entropy_loss": 0.045166015625, - "delta_ref_ppl": -0.0606689453125, - "entropy_loss": -0.044525146484375, - "epoch": 0.4212, - "grad_norm": 0.6334289272743991, - "k1_kl": 0.0609130859375, - "k3_kl": 0.0452880859375, - "kimi_kl": 0.18408203125, - "learning_rate": 2.894e-07, - "loss": 0.0018, - "ppl": 0.021240234375, - "reward": 0.8600784242153168, - "reward_std": 0.002594246529042721, - "rewards/perpo_ocr_edit_distance_reward": 0.8600784540176392, + "advantages": -0.00011809809075202793, + "completion_length": 408.0, + "delta_ref_entropy_loss": 0.053955078125, + "delta_ref_ppl": -0.07568359375, + "entropy_loss": -0.0140380859375, + "epoch": 0.2106, + "grad_norm": 0.631649739750444, + "k1_kl": 0.07568359375, + "k3_kl": 0.04931640625, + "kimi_kl": 0.1611328125, + "learning_rate": 3.9469999999999995e-07, + "loss": 0.0021, + "ppl": 0.00506591796875, + "reward": 0.9971703886985779, + "reward_std": 0.00040466233622282743, + "rewards/perpo_ocr_edit_distance_reward": 0.9971704483032227, "step": 1053, "temperature": 0.9 }, { - "advantages": -8.438315103376226e-05, - "completion_length": 343.0, - "delta_ref_entropy_loss": 0.125732421875, - "delta_ref_ppl": -0.09130859375, - "entropy_loss": -0.1578369140625, - "epoch": 0.4216, - "grad_norm": 1.1850093493112823, - "k1_kl": 0.091796875, - "k3_kl": 0.052978515625, - "kimi_kl": 0.15234375, - "learning_rate": 2.892e-07, - "loss": 0.0022, - "ppl": 0.09100341796875, - "reward": 0.8860470056533813, - "reward_std": 0.0016511373105458915, - "rewards/perpo_ocr_edit_distance_reward": 0.8860470652580261, + "advantages": -9.03436193766538e-06, + "completion_length": 741.0, + "delta_ref_entropy_loss": 0.0947265625, + "delta_ref_ppl": -0.08154296875, + "entropy_loss": -0.0625, + "epoch": 0.2108, + "grad_norm": 1.0153030715094264, + "k1_kl": 0.08154296875, + "k3_kl": 0.044921875, + "kimi_kl": 0.10009765625, + "learning_rate": 3.946e-07, + "loss": 0.0018, + "ppl": 0.0322265625, + "reward": 0.9742379784584045, + "reward_std": 0.005553736351430416, + "rewards/perpo_ocr_edit_distance_reward": 0.9742380976676941, "step": 1054, "temperature": 0.9 }, { - "advantages": -3.2867706636352523e-06, - "completion_length": 990.5, - "delta_ref_entropy_loss": 0.04931640625, - "delta_ref_ppl": -0.03167724609375, - "entropy_loss": -0.0565185546875, - "epoch": 0.422, - "grad_norm": 0.9210429885638054, - "k1_kl": 0.03167724609375, - "k3_kl": 0.01641845703125, - "kimi_kl": 0.0401611328125, - "learning_rate": 2.8899999999999995e-07, - "loss": 0.0007, - "ppl": 0.03082275390625, - "reward": 0.923059344291687, - "reward_std": 0.010210923384875059, - "rewards/perpo_ocr_edit_distance_reward": 0.9230593740940094, + "advantages": -8.148806955432519e-05, + "completion_length": 592.0, + "delta_ref_entropy_loss": 0.053466796875, + "delta_ref_ppl": -0.0625, + "entropy_loss": -0.01519775390625, + "epoch": 0.211, + "grad_norm": 0.31145113925555873, + "k1_kl": 0.0625, + "k3_kl": 0.036865234375, + "kimi_kl": 0.10009765625, + "learning_rate": 3.945e-07, + "loss": 0.0016, + "ppl": 0.00634765625, + "reward": 0.9963409900665283, + "reward_std": 0.0004224983276799321, + "rewards/perpo_ocr_edit_distance_reward": 0.9963410496711731, "step": 1055, "temperature": 0.9 }, { - "advantages": -3.916876778475853e-07, - "completion_length": 981.5, - "delta_ref_entropy_loss": 0.0400390625, - "delta_ref_ppl": -0.05224609375, - "entropy_loss": -0.0914306640625, - "epoch": 0.4224, - "grad_norm": 0.5212005668184724, - "k1_kl": 0.0523681640625, - "k3_kl": 0.037353515625, - "kimi_kl": 0.163330078125, - "learning_rate": 2.888e-07, - "loss": 0.0015, - "ppl": 0.0460205078125, - "reward": 0.8967156708240509, - "reward_std": 0.04817419499158859, - "rewards/perpo_ocr_edit_distance_reward": 0.8967157006263733, + "advantages": -7.663454653084045e-07, + "completion_length": 152.0, + "delta_ref_entropy_loss": 0.0810546875, + "delta_ref_ppl": -0.205078125, + "entropy_loss": -0.02685546875, + "epoch": 0.2112, + "grad_norm": 1.7225846048727163, + "k1_kl": 0.2060546875, + "k3_kl": 0.1591796875, + "kimi_kl": 0.63671875, + "learning_rate": 3.9439999999999993e-07, + "loss": 0.0064, + "ppl": 0.012451171875, + "reward": 0.9769017696380615, + "reward_std": 0.021438928321003914, + "rewards/perpo_ocr_edit_distance_reward": 0.9769018292427063, "step": 1056, "temperature": 0.9 }, { - "advantages": -4.4937645725440234e-05, - "completion_length": 528.5, - "delta_ref_entropy_loss": 0.04296875, - "delta_ref_ppl": -0.023590087890625, - "entropy_loss": -0.03857421875, - "epoch": 0.4228, - "grad_norm": 0.40590372809749575, - "k1_kl": 0.023590087890625, - "k3_kl": 0.0108795166015625, - "kimi_kl": 0.019622802734375, - "learning_rate": 2.8860000000000003e-07, - "loss": 0.0005, - "ppl": 0.017669677734375, - "reward": 0.999366283416748, - "reward_std": 0.0003289082960691303, - "rewards/perpo_ocr_edit_distance_reward": 0.9993663430213928, + "advantages": -2.0614692402887158e-05, + "completion_length": 606.0, + "delta_ref_entropy_loss": 0.053466796875, + "delta_ref_ppl": -0.08154296875, + "entropy_loss": -0.01611328125, + "epoch": 0.2114, + "grad_norm": 0.6660882475969144, + "k1_kl": 0.08154296875, + "k3_kl": 0.052001953125, + "kimi_kl": 0.166015625, + "learning_rate": 3.943e-07, + "loss": 0.0021, + "ppl": 0.005126953125, + "reward": 0.9952344298362732, + "reward_std": 0.0027902040164917707, + "rewards/perpo_ocr_edit_distance_reward": 0.995234489440918, "step": 1057, "temperature": 0.9 }, { - "advantages": -8.102826177491806e-05, - "completion_length": 959.5, - "delta_ref_entropy_loss": 0.0543212890625, - "delta_ref_ppl": -0.0745849609375, - "entropy_loss": -0.0374755859375, - "epoch": 0.4232, - "grad_norm": 2.602562630773769, - "k1_kl": 0.0745849609375, - "k3_kl": 0.0469970703125, - "kimi_kl": 0.12939453125, - "learning_rate": 2.8839999999999996e-07, - "loss": 0.002, - "ppl": 0.023193359375, - "reward": 0.9960523843765259, - "reward_std": 0.0009768139716470614, - "rewards/perpo_ocr_edit_distance_reward": 0.996052473783493, + "advantages": -1.7838819985627197e-05, + "completion_length": 1386.0, + "delta_ref_entropy_loss": 0.1064453125, + "delta_ref_ppl": -0.0732421875, + "entropy_loss": -0.11572265625, + "epoch": 0.2116, + "grad_norm": 4.536334157475349, + "k1_kl": 0.0732421875, + "k3_kl": 0.05419921875, + "kimi_kl": 0.0654296875, + "learning_rate": 3.9419999999999997e-07, + "loss": 0.0022, + "ppl": 0.068359375, + "reward": 0.9800671935081482, + "reward_std": 0.004195712972432375, + "rewards/perpo_ocr_edit_distance_reward": 0.980067253112793, "step": 1058, "temperature": 0.9 }, { - "advantages": -2.307551369540306e-06, - "completion_length": 883.5, - "delta_ref_entropy_loss": 0.03106689453125, - "delta_ref_ppl": -0.02880859375, - "entropy_loss": -0.0870361328125, - "epoch": 0.4236, - "grad_norm": 0.6929143903855134, - "k1_kl": 0.0286865234375, - "k3_kl": 0.016937255859375, - "kimi_kl": 0.041748046875, - "learning_rate": 2.882e-07, - "loss": 0.0007, - "ppl": 0.052001953125, - "reward": 0.8999696969985962, - "reward_std": 0.05301400413736701, - "rewards/perpo_ocr_edit_distance_reward": 0.899969756603241, + "advantages": -0.00014365572133101523, + "completion_length": 1244.0, + "delta_ref_entropy_loss": 0.06884765625, + "delta_ref_ppl": -0.06396484375, + "entropy_loss": -0.029052734375, + "epoch": 0.2118, + "grad_norm": 6.136376572223492, + "k1_kl": 0.06396484375, + "k3_kl": 0.04296875, + "kimi_kl": 0.09033203125, + "learning_rate": 3.9409999999999996e-07, + "loss": 0.0019, + "ppl": 0.01544189453125, + "reward": 0.9772591590881348, + "reward_std": 0.00043349951738491654, + "rewards/perpo_ocr_edit_distance_reward": 0.9772592186927795, "step": 1059, "temperature": 0.9 }, { - "advantages": -7.740089131402783e-06, - "completion_length": 527.0, - "delta_ref_entropy_loss": 0.0679931640625, - "delta_ref_ppl": -0.064208984375, - "entropy_loss": -0.0860595703125, - "epoch": 0.424, - "grad_norm": 1.157523664730949, - "k1_kl": 0.06396484375, - "k3_kl": 0.040924072265625, - "kimi_kl": 0.1060791015625, - "learning_rate": 2.88e-07, - "loss": 0.0016, - "ppl": 0.0511474609375, - "reward": 0.9759450256824493, - "reward_std": 0.0023038746730890125, - "rewards/perpo_ocr_edit_distance_reward": 0.9759451150894165, + "advantages": -5.15154442837229e-06, + "completion_length": 334.0, + "delta_ref_entropy_loss": 0.09716796875, + "delta_ref_ppl": -0.0986328125, + "entropy_loss": -0.05078125, + "epoch": 0.212, + "grad_norm": 1.9150014100444426, + "k1_kl": 0.0986328125, + "k3_kl": 0.05712890625, + "kimi_kl": 0.1494140625, + "learning_rate": 3.94e-07, + "loss": 0.0023, + "ppl": 0.020751953125, + "reward": 0.976962685585022, + "reward_std": 0.0015537587460130453, + "rewards/perpo_ocr_edit_distance_reward": 0.9769627451896667, "step": 1060, "temperature": 0.9 }, { - "advantages": -2.360344115004409e-05, - "completion_length": 664.0, - "delta_ref_entropy_loss": 0.032470703125, - "delta_ref_ppl": -0.053619384765625, - "entropy_loss": -0.03411865234375, - "epoch": 0.4244, - "grad_norm": 1.2306905562443093, - "k1_kl": 0.053619384765625, - "k3_kl": 0.04632568359375, - "kimi_kl": 0.0919189453125, - "learning_rate": 2.8779999999999997e-07, - "loss": 0.0019, - "ppl": 0.02569580078125, - "reward": 0.9978574216365814, - "reward_std": 0.00031058600870892406, - "rewards/perpo_ocr_edit_distance_reward": 0.9978574216365814, + "advantages": -1.0277543879055884e-05, + "completion_length": 857.0, + "delta_ref_entropy_loss": 0.061279296875, + "delta_ref_ppl": -0.05224609375, + "entropy_loss": -0.03564453125, + "epoch": 0.2122, + "grad_norm": 0.6522077425092805, + "k1_kl": 0.05224609375, + "k3_kl": 0.0274658203125, + "kimi_kl": 0.06787109375, + "learning_rate": 3.9389999999999995e-07, + "loss": 0.0011, + "ppl": 0.017333984375, + "reward": 0.981372594833374, + "reward_std": 0.002385428873822093, + "rewards/perpo_ocr_edit_distance_reward": 0.9813727140426636, "step": 1061, "temperature": 0.9 }, { - "advantages": -0.00020213638526911382, - "completion_length": 1088.5, - "delta_ref_entropy_loss": 0.017578125, - "delta_ref_ppl": -0.010101318359375, - "entropy_loss": -0.01220703125, - "epoch": 0.4248, - "grad_norm": 0.23611438789795447, - "k1_kl": 0.010101318359375, - "k3_kl": 0.0046844482421875, - "kimi_kl": 0.007720947265625, - "learning_rate": 2.876e-07, - "loss": 0.0004, - "ppl": 0.005523681640625, - "reward": 0.9978056252002716, - "reward_std": 0.0005058215974713676, - "rewards/perpo_ocr_edit_distance_reward": 0.9978056848049164, + "advantages": -1.9482204152154736e-05, + "completion_length": 373.0, + "delta_ref_entropy_loss": 0.125, + "delta_ref_ppl": -0.11083984375, + "entropy_loss": -0.09912109375, + "epoch": 0.2124, + "grad_norm": 1.665255535663707, + "k1_kl": 0.11083984375, + "k3_kl": 0.05810546875, + "kimi_kl": 0.1220703125, + "learning_rate": 3.9379999999999994e-07, + "loss": 0.0023, + "ppl": 0.03759765625, + "reward": 0.892188549041748, + "reward_std": 0.002958412514999509, + "rewards/perpo_ocr_edit_distance_reward": 0.892188549041748, "step": 1062, "temperature": 0.9 }, { - "advantages": -6.777048474759795e-05, - "completion_length": 492.0, - "delta_ref_entropy_loss": 0.0341796875, - "delta_ref_ppl": -0.034912109375, - "entropy_loss": -0.0308837890625, - "epoch": 0.4252, - "grad_norm": 0.6220577093482725, - "k1_kl": 0.03497314453125, - "k3_kl": 0.022552490234375, - "kimi_kl": 0.0615234375, - "learning_rate": 2.874e-07, - "loss": 0.001, - "ppl": 0.019805908203125, - "reward": 0.9874055683612823, - "reward_std": 0.0007672336214454845, - "rewards/perpo_ocr_edit_distance_reward": 0.9874056279659271, + "advantages": -5.023820222049835e-07, + "completion_length": 436.0, + "delta_ref_entropy_loss": 0.1376953125, + "delta_ref_ppl": -0.12890625, + "entropy_loss": -0.08984375, + "epoch": 0.2126, + "grad_norm": 1.1640943581629426, + "k1_kl": 0.12890625, + "k3_kl": 0.07275390625, + "kimi_kl": 0.1962890625, + "learning_rate": 3.937e-07, + "loss": 0.0029, + "ppl": 0.038330078125, + "reward": 0.8178948163986206, + "reward_std": 0.09945374727249146, + "rewards/perpo_ocr_edit_distance_reward": 0.8178948760032654, "step": 1063, "temperature": 0.9 }, { - "advantages": -1.4952251831346075e-05, - "completion_length": 498.0, - "delta_ref_entropy_loss": 0.05859375, - "delta_ref_ppl": -0.063232421875, - "entropy_loss": -0.099853515625, - "epoch": 0.4256, - "grad_norm": 2.065434119567507, - "k1_kl": 0.0634765625, - "k3_kl": 0.0400390625, - "kimi_kl": 0.0924072265625, - "learning_rate": 2.872e-07, - "loss": 0.0016, - "ppl": 0.05120849609375, - "reward": 0.944383054971695, - "reward_std": 0.006858535693027079, - "rewards/perpo_ocr_edit_distance_reward": 0.9443831741809845, + "advantages": -8.761882781982422e-06, + "completion_length": 694.0, + "delta_ref_entropy_loss": 0.08935546875, + "delta_ref_ppl": -0.057861328125, + "entropy_loss": -0.049560546875, + "epoch": 0.2128, + "grad_norm": 0.7753076296236454, + "k1_kl": 0.0576171875, + "k3_kl": 0.0252685546875, + "kimi_kl": 0.057861328125, + "learning_rate": 3.936e-07, + "loss": 0.001, + "ppl": 0.0203857421875, + "reward": 0.9668927788734436, + "reward_std": 0.0018433730583637953, + "rewards/perpo_ocr_edit_distance_reward": 0.9668928384780884, "step": 1064, "temperature": 0.9 }, { - "advantages": -2.5161676603602245e-05, - "completion_length": 270.0, - "delta_ref_entropy_loss": 0.03643798828125, - "delta_ref_ppl": -0.083343505859375, - "entropy_loss": -0.0341796875, - "epoch": 0.426, - "grad_norm": 0.19601754223700008, - "k1_kl": 0.083831787109375, - "k3_kl": 0.0632171630859375, - "kimi_kl": 0.2723655700683594, - "learning_rate": 2.8699999999999996e-07, - "loss": 0.0026, - "ppl": 0.015594482421875, - "reward": 0.9967776536941528, - "reward_std": 0.00020382126967888325, - "rewards/perpo_ocr_edit_distance_reward": 0.9967776834964752, + "advantages": -1.767703543009702e-05, + "completion_length": 491.0, + "delta_ref_entropy_loss": 0.061279296875, + "delta_ref_ppl": -0.0771484375, + "entropy_loss": -0.02734375, + "epoch": 0.213, + "grad_norm": 0.5481875024938324, + "k1_kl": 0.0771484375, + "k3_kl": 0.05029296875, + "kimi_kl": 0.1748046875, + "learning_rate": 3.935e-07, + "loss": 0.002, + "ppl": 0.010986328125, + "reward": 0.9971139430999756, + "reward_std": 0.00038177191163413227, + "rewards/perpo_ocr_edit_distance_reward": 0.9971140027046204, "step": 1065, "temperature": 0.9 }, { - "advantages": -6.556511493727157e-07, - "completion_length": 520.5, - "delta_ref_entropy_loss": 0.0533447265625, - "delta_ref_ppl": -0.04052734375, - "entropy_loss": -0.0660400390625, - "epoch": 0.4264, - "grad_norm": 0.7624106939144994, - "k1_kl": 0.0404052734375, - "k3_kl": 0.02130126953125, - "kimi_kl": 0.04547119140625, - "learning_rate": 2.868e-07, - "loss": 0.0009, - "ppl": 0.033843994140625, - "reward": 0.9839304387569427, - "reward_std": 0.022435931488871574, - "rewards/perpo_ocr_edit_distance_reward": 0.9839304387569427, + "advantages": -1.8988337160408264e-06, + "completion_length": 428.0, + "delta_ref_entropy_loss": 0.11865234375, + "delta_ref_ppl": -0.11962890625, + "entropy_loss": -0.13671875, + "epoch": 0.2132, + "grad_norm": 2.057124520450832, + "k1_kl": 0.11962890625, + "k3_kl": 0.0732421875, + "kimi_kl": 0.26171875, + "learning_rate": 3.934e-07, + "loss": 0.0029, + "ppl": 0.06787109375, + "reward": 0.6248671412467957, + "reward_std": 0.02229136787354946, + "rewards/perpo_ocr_edit_distance_reward": 0.6248672008514404, "step": 1066, "temperature": 0.9 }, { - "advantages": -6.811959565311554e-07, - "completion_length": 425.5, - "delta_ref_entropy_loss": 0.0279541015625, - "delta_ref_ppl": -0.01397705078125, - "entropy_loss": -0.02191162109375, - "epoch": 0.4268, - "grad_norm": 0.5007828665569384, - "k1_kl": 0.013916015625, - "k3_kl": 0.006072998046875, - "kimi_kl": 0.0091400146484375, - "learning_rate": 2.866e-07, - "loss": 0.0002, - "ppl": 0.01092529296875, - "reward": 0.9915308058261871, - "reward_std": 0.003064945572987199, - "rewards/perpo_ocr_edit_distance_reward": 0.9915308356285095, + "advantages": -0.00018898930284194648, + "completion_length": 350.0, + "delta_ref_entropy_loss": 0.1005859375, + "delta_ref_ppl": -0.12109375, + "entropy_loss": -0.0240478515625, + "epoch": 0.2134, + "grad_norm": 0.3880528606640293, + "k1_kl": 0.12158203125, + "k3_kl": 0.076171875, + "kimi_kl": 0.3125, + "learning_rate": 3.9329999999999995e-07, + "loss": 0.0032, + "ppl": 0.0098876953125, + "reward": 0.8238588571548462, + "reward_std": 0.00017033251060638577, + "rewards/perpo_ocr_edit_distance_reward": 0.823858916759491, "step": 1067, "temperature": 0.9 }, { - "advantages": 4.478863957046997e-06, - "completion_length": 170.0, - "delta_ref_entropy_loss": 0.087646484375, - "delta_ref_ppl": -0.2698974609375, - "entropy_loss": -0.06060791015625, - "epoch": 0.4272, - "grad_norm": 2.6094886549699097, - "k1_kl": 0.269775390625, - "k3_kl": 0.205810546875, - "kimi_kl": 0.78466796875, - "learning_rate": 2.8639999999999997e-07, - "loss": 0.0083, - "ppl": 0.039947509765625, - "reward": 0.6640619337558746, - "reward_std": 0.0022127551346784458, - "rewards/perpo_ocr_edit_distance_reward": 0.6640619486570358, + "advantages": -0.00023188762133941054, + "completion_length": 662.0, + "delta_ref_entropy_loss": 0.08837890625, + "delta_ref_ppl": -0.0654296875, + "entropy_loss": -0.019287109375, + "epoch": 0.2136, + "grad_norm": 0.7065667150651225, + "k1_kl": 0.0654296875, + "k3_kl": 0.032470703125, + "kimi_kl": 0.0732421875, + "learning_rate": 3.932e-07, + "loss": 0.0015, + "ppl": 0.006866455078125, + "reward": 0.9770102500915527, + "reward_std": 0.0001937465276569128, + "rewards/perpo_ocr_edit_distance_reward": 0.9770103693008423, "step": 1068, "temperature": 0.9 }, { - "advantages": -1.8732888358741207e-05, - "completion_length": 433.0, - "delta_ref_entropy_loss": 0.0462646484375, - "delta_ref_ppl": -0.03118896484375, - "entropy_loss": -0.0433349609375, - "epoch": 0.4276, - "grad_norm": 0.47544390529722547, - "k1_kl": 0.03118896484375, - "k3_kl": 0.020721435546875, - "kimi_kl": 0.05853271484375, - "learning_rate": 2.862e-07, - "loss": 0.0008, - "ppl": 0.021484375, - "reward": 0.9394363760948181, - "reward_std": 0.13927119871368632, - "rewards/perpo_ocr_edit_distance_reward": 0.9394364058971405, + "advantages": 6.267002845561365e-06, + "completion_length": 506.0, + "delta_ref_entropy_loss": 0.14453125, + "delta_ref_ppl": -0.07470703125, + "entropy_loss": -0.0849609375, + "epoch": 0.2138, + "grad_norm": 1.2459822083620693, + "k1_kl": 0.07470703125, + "k3_kl": 0.03173828125, + "kimi_kl": 0.06689453125, + "learning_rate": 3.931e-07, + "loss": 0.0013, + "ppl": 0.036376953125, + "reward": 0.29576918482780457, + "reward_std": 0.0026099584065377712, + "rewards/perpo_ocr_edit_distance_reward": 0.29576918482780457, "step": 1069, "temperature": 0.9 }, { - "advantages": -0.00014837299750070088, - "completion_length": 663.5, - "delta_ref_entropy_loss": 0.043212890625, - "delta_ref_ppl": -0.029541015625, - "entropy_loss": -0.0264892578125, - "epoch": 0.428, - "grad_norm": 0.3594106971214069, - "k1_kl": 0.029541015625, - "k3_kl": 0.015228271484375, - "kimi_kl": 0.0423583984375, - "learning_rate": 2.8599999999999994e-07, - "loss": 0.0008, - "ppl": 0.012939453125, - "reward": 0.9982617795467377, - "reward_std": 0.0003916900168405846, - "rewards/perpo_ocr_edit_distance_reward": 0.9982618689537048, + "advantages": -6.437302090489538e-06, + "completion_length": 746.0, + "delta_ref_entropy_loss": 0.07763671875, + "delta_ref_ppl": -0.07373046875, + "entropy_loss": -0.052978515625, + "epoch": 0.214, + "grad_norm": 1.3993093311286473, + "k1_kl": 0.0732421875, + "k3_kl": 0.041259765625, + "kimi_kl": 0.10302734375, + "learning_rate": 3.93e-07, + "loss": 0.0017, + "ppl": 0.0260009765625, + "reward": 0.9824998378753662, + "reward_std": 0.001221305108629167, + "rewards/perpo_ocr_edit_distance_reward": 0.982499897480011, "step": 1070, "temperature": 0.9 }, { - "advantages": -8.370195246243384e-06, - "completion_length": 657.0, - "delta_ref_entropy_loss": 0.07769775390625, - "delta_ref_ppl": -0.05712890625, - "entropy_loss": -0.097686767578125, - "epoch": 0.4284, - "grad_norm": 0.9592359918313719, - "k1_kl": 0.05712890625, - "k3_kl": 0.03057861328125, - "kimi_kl": 0.0653076171875, - "learning_rate": 2.858e-07, - "loss": 0.0012, - "ppl": 0.050933837890625, - "reward": 0.9569530189037323, - "reward_std": 0.0011002399696735665, - "rewards/perpo_ocr_edit_distance_reward": 0.9569530189037323, + "advantages": -0.0005960464477539062, + "completion_length": 122.0, + "delta_ref_entropy_loss": 0.0927734375, + "delta_ref_ppl": -0.2177734375, + "entropy_loss": -0.04541015625, + "epoch": 0.2142, + "grad_norm": 0.08692750502114016, + "k1_kl": 0.216796875, + "k3_kl": 0.1552734375, + "kimi_kl": 0.5859375, + "learning_rate": 3.9290000000000003e-07, + "loss": 0.0068, + "ppl": 0.018798828125, + "reward": 0.9920318722724915, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9920319318771362, "step": 1071, "temperature": 0.9 }, { - "advantages": -1.8579620316927503e-05, - "completion_length": 927.0, - "delta_ref_entropy_loss": 0.0426025390625, - "delta_ref_ppl": -0.02667236328125, - "entropy_loss": -0.0625, - "epoch": 0.4288, - "grad_norm": 3.640654306260831, - "k1_kl": 0.026580810546875, - "k3_kl": 0.0152130126953125, - "kimi_kl": 0.04071044921875, - "learning_rate": 2.856e-07, - "loss": 0.0006, - "ppl": 0.037841796875, - "reward": 0.9513713717460632, - "reward_std": 0.08836405473994091, - "rewards/perpo_ocr_edit_distance_reward": 0.9513714611530304, + "advantages": -2.7963094908045605e-05, + "completion_length": 401.0, + "delta_ref_entropy_loss": 0.07568359375, + "delta_ref_ppl": -0.10595703125, + "entropy_loss": -0.040771484375, + "epoch": 0.2144, + "grad_norm": 2.6860191429965723, + "k1_kl": 0.10595703125, + "k3_kl": 0.07080078125, + "kimi_kl": 0.25390625, + "learning_rate": 3.9279999999999997e-07, + "loss": 0.0029, + "ppl": 0.01611328125, + "reward": 0.9856911301612854, + "reward_std": 0.0017270561074838042, + "rewards/perpo_ocr_edit_distance_reward": 0.985691249370575, "step": 1072, "temperature": 0.9 }, { - "advantages": -0.00010387387237642542, - "completion_length": 476.0, - "delta_ref_entropy_loss": 0.04248046875, - "delta_ref_ppl": -0.03466796875, - "entropy_loss": -0.0450439453125, - "epoch": 0.4292, - "grad_norm": 0.5818153201686836, - "k1_kl": 0.0347900390625, - "k3_kl": 0.019287109375, - "kimi_kl": 0.0465087890625, - "learning_rate": 2.8539999999999995e-07, - "loss": 0.0009, - "ppl": 0.02178955078125, - "reward": 0.9824126362800598, - "reward_std": 0.0012901332811452448, - "rewards/perpo_ocr_edit_distance_reward": 0.982412725687027, + "advantages": 1.9226756194257177e-05, + "completion_length": 456.0, + "delta_ref_entropy_loss": 0.0732421875, + "delta_ref_ppl": -0.0673828125, + "entropy_loss": -0.034912109375, + "epoch": 0.2146, + "grad_norm": 0.4155769449061232, + "k1_kl": 0.0673828125, + "k3_kl": 0.03857421875, + "kimi_kl": 0.10205078125, + "learning_rate": 3.9269999999999996e-07, + "loss": 0.0015, + "ppl": 0.01239013671875, + "reward": 0.988440215587616, + "reward_std": 0.0012291044695302844, + "rewards/perpo_ocr_edit_distance_reward": 0.9884402751922607, "step": 1073, "temperature": 0.9 }, { - "advantages": -0.00011743605864467099, - "completion_length": 818.5, - "delta_ref_entropy_loss": 0.03466796875, - "delta_ref_ppl": -0.033355712890625, - "entropy_loss": -0.026611328125, - "epoch": 0.4296, - "grad_norm": 0.3226240425474267, - "k1_kl": 0.033233642578125, - "k3_kl": 0.0228729248046875, - "kimi_kl": 0.10919189453125, - "learning_rate": 2.852e-07, - "loss": 0.001, - "ppl": 0.0121612548828125, - "reward": 0.993113100528717, - "reward_std": 0.0004179970419500023, - "rewards/perpo_ocr_edit_distance_reward": 0.9931131601333618, + "advantages": -8.514949634275126e-09, + "completion_length": 723.0, + "delta_ref_entropy_loss": 0.09912109375, + "delta_ref_ppl": -0.07470703125, + "entropy_loss": -0.049560546875, + "epoch": 0.2148, + "grad_norm": 0.9316204807972032, + "k1_kl": 0.07470703125, + "k3_kl": 0.0361328125, + "kimi_kl": 0.07666015625, + "learning_rate": 3.926e-07, + "loss": 0.0014, + "ppl": 0.020263671875, + "reward": 0.9862473011016846, + "reward_std": 0.0009307046420872211, + "rewards/perpo_ocr_edit_distance_reward": 0.9862473607063293, "step": 1074, "temperature": 0.9 }, { - "advantages": 1.596553011040669e-05, - "completion_length": 631.0, - "delta_ref_entropy_loss": 0.0225830078125, - "delta_ref_ppl": -0.024658203125, - "entropy_loss": -0.0106201171875, - "epoch": 0.43, - "grad_norm": 0.07162700008423883, - "k1_kl": 0.02471923828125, - "k3_kl": 0.016845703125, - "kimi_kl": 0.052978515625, - "learning_rate": 2.8499999999999997e-07, - "loss": 0.0007, - "ppl": 0.0029144287109375, - "reward": 0.9998779892921448, - "reward_std": 8.332592551596463e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9998780190944672, + "advantages": -8.514949513482861e-06, + "completion_length": 102.0, + "delta_ref_entropy_loss": 0.142578125, + "delta_ref_ppl": -0.275390625, + "entropy_loss": -0.05908203125, + "epoch": 0.215, + "grad_norm": 2.1355254546367513, + "k1_kl": 0.275390625, + "k3_kl": 0.2041015625, + "kimi_kl": 0.70703125, + "learning_rate": 3.925e-07, + "loss": 0.0081, + "ppl": 0.01806640625, + "reward": 0.9783068299293518, + "reward_std": 0.006905393209308386, + "rewards/perpo_ocr_edit_distance_reward": 0.9783069491386414, "step": 1075, "temperature": 0.9 }, { - "advantages": -1.976532621483784e-05, - "completion_length": 614.0, - "delta_ref_entropy_loss": 0.01885986328125, - "delta_ref_ppl": -0.028076171875, - "entropy_loss": -0.020660400390625, - "epoch": 0.4304, - "grad_norm": 0.6922987662626395, - "k1_kl": 0.028076171875, - "k3_kl": 0.020477294921875, - "kimi_kl": 0.0716552734375, - "learning_rate": 2.848e-07, - "loss": 0.0008, - "ppl": 0.011322021484375, - "reward": 0.9988909363746643, - "reward_std": 0.0008116636308841407, - "rewards/perpo_ocr_edit_distance_reward": 0.9988909661769867, + "advantages": -2.588544703030493e-06, + "completion_length": 173.0, + "delta_ref_entropy_loss": 0.171875, + "delta_ref_ppl": -0.2177734375, + "entropy_loss": -0.0654296875, + "epoch": 0.2152, + "grad_norm": 1.9968842626253434, + "k1_kl": 0.216796875, + "k3_kl": 0.1552734375, + "kimi_kl": 0.6484375, + "learning_rate": 3.924e-07, + "loss": 0.0062, + "ppl": 0.0257568359375, + "reward": 0.9109848737716675, + "reward_std": 0.006469057407230139, + "rewards/perpo_ocr_edit_distance_reward": 0.9109848737716675, "step": 1076, "temperature": 0.9 }, { - "advantages": 7.842269042157568e-06, - "completion_length": 586.0, - "delta_ref_entropy_loss": 0.0391845703125, - "delta_ref_ppl": -0.016937255859375, - "entropy_loss": -0.0543212890625, - "epoch": 0.4308, - "grad_norm": 0.5302245798140969, - "k1_kl": 0.0169677734375, - "k3_kl": 0.0082855224609375, - "kimi_kl": 0.0184783935546875, - "learning_rate": 2.846e-07, - "loss": 0.0003, - "ppl": 0.0227203369140625, - "reward": 0.7964335978031158, - "reward_std": 0.0004938275087624788, - "rewards/perpo_ocr_edit_distance_reward": 0.7964335978031158, + "advantages": 0.0, + "completion_length": 93.0, + "delta_ref_entropy_loss": 0.166015625, + "delta_ref_ppl": -0.345703125, + "entropy_loss": -0.0400390625, + "epoch": 0.2154, + "grad_norm": 0.06599740971743684, + "k1_kl": 0.34375, + "k3_kl": 0.25, + "kimi_kl": 0.80078125, + "learning_rate": 3.923e-07, + "loss": 0.01, + "ppl": 0.01068115234375, + "reward": 0.8654292225837708, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.8654292225837708, "step": 1077, "temperature": 0.9 }, { - "advantages": -5.053622641959521e-05, - "completion_length": 846.0, - "delta_ref_entropy_loss": 0.0533447265625, - "delta_ref_ppl": -0.05010986328125, - "entropy_loss": -0.111785888671875, - "epoch": 0.4312, - "grad_norm": 4.062337081895057, - "k1_kl": 0.04986572265625, - "k3_kl": 0.03302001953125, - "kimi_kl": 0.091064453125, - "learning_rate": 2.844e-07, - "loss": 0.0014, - "ppl": 0.055938720703125, - "reward": 0.7752784490585327, - "reward_std": 0.16113464030786417, - "rewards/perpo_ocr_edit_distance_reward": 0.7752784788608551, + "advantages": -0.00014519691467285156, + "completion_length": 920.0, + "delta_ref_entropy_loss": 0.04052734375, + "delta_ref_ppl": -0.03515625, + "entropy_loss": -0.0184326171875, + "epoch": 0.2156, + "grad_norm": 0.5812078337755063, + "k1_kl": 0.034912109375, + "k3_kl": 0.0186767578125, + "kimi_kl": 0.043212890625, + "learning_rate": 3.922e-07, + "loss": 0.0009, + "ppl": 0.0086669921875, + "reward": 0.9874314069747925, + "reward_std": 0.0002519167319405824, + "rewards/perpo_ocr_edit_distance_reward": 0.9874314069747925, "step": 1078, "temperature": 0.9 }, { - "advantages": -0.0002966693468806625, - "completion_length": 482.0, - "delta_ref_entropy_loss": 0.0595703125, - "delta_ref_ppl": -0.0421142578125, - "entropy_loss": -0.064605712890625, - "epoch": 0.4316, - "grad_norm": 0.7875939660416562, - "k1_kl": 0.0421142578125, - "k3_kl": 0.0216217041015625, - "kimi_kl": 0.0420684814453125, - "learning_rate": 2.842e-07, - "loss": 0.0012, - "ppl": 0.0329132080078125, - "reward": 0.9812775254249573, - "reward_std": 0.007760579232126474, - "rewards/perpo_ocr_edit_distance_reward": 0.9812775552272797, + "advantages": -1.1920928955078125e-07, + "completion_length": 83.0, + "delta_ref_entropy_loss": 0.1669921875, + "delta_ref_ppl": -0.29296875, + "entropy_loss": -0.2060546875, + "epoch": 0.2158, + "grad_norm": 6.1529820851011765, + "k1_kl": 0.29296875, + "k3_kl": 0.2001953125, + "kimi_kl": 0.55859375, + "learning_rate": 3.9209999999999997e-07, + "loss": 0.008, + "ppl": 0.0966796875, + "reward": 0.47341465950012207, + "reward_std": 0.301076203584671, + "rewards/perpo_ocr_edit_distance_reward": 0.47341468930244446, "step": 1079, "temperature": 0.9 }, { - "advantages": -6.577798922080547e-05, - "completion_length": 433.5, - "delta_ref_entropy_loss": 0.02313232421875, - "delta_ref_ppl": -0.020477294921875, - "entropy_loss": -0.016693115234375, - "epoch": 0.432, - "grad_norm": 0.17841299337103012, - "k1_kl": 0.020477294921875, - "k3_kl": 0.0110321044921875, - "kimi_kl": 0.0211944580078125, - "learning_rate": 2.8399999999999995e-07, - "loss": 0.0005, - "ppl": 0.00655364990234375, - "reward": 0.999701052904129, - "reward_std": 0.00011182868911419064, - "rewards/perpo_ocr_edit_distance_reward": 0.999701052904129, + "advantages": 6.556510925292969e-05, + "completion_length": 819.0, + "delta_ref_entropy_loss": 0.0478515625, + "delta_ref_ppl": -0.048828125, + "entropy_loss": -0.026611328125, + "epoch": 0.216, + "grad_norm": 0.815339974648997, + "k1_kl": 0.048828125, + "k3_kl": 0.02587890625, + "kimi_kl": 0.051025390625, + "learning_rate": 3.92e-07, + "loss": 0.001, + "ppl": 0.0115966796875, + "reward": 0.9954341053962708, + "reward_std": 0.00028955316520296037, + "rewards/perpo_ocr_edit_distance_reward": 0.9954341053962708, "step": 1080, "temperature": 0.9 }, { - "advantages": -5.960464477539063e-08, - "completion_length": 348.0, - "delta_ref_entropy_loss": 0.04461669921875, - "delta_ref_ppl": -0.08251953125, - "entropy_loss": -0.0916748046875, - "epoch": 0.4324, - "grad_norm": 1.39490087239606, - "k1_kl": 0.08251953125, - "k3_kl": 0.06201171875, - "kimi_kl": 0.2744140625, - "learning_rate": 2.838e-07, - "loss": 0.0025, - "ppl": 0.0518341064453125, - "reward": 0.845453292131424, - "reward_std": 0.09903278946876526, - "rewards/perpo_ocr_edit_distance_reward": 0.8454533219337463, + "advantages": -2.946172571682837e-06, + "completion_length": 911.0, + "delta_ref_entropy_loss": 0.06201171875, + "delta_ref_ppl": -0.044921875, + "entropy_loss": -0.021728515625, + "epoch": 0.2162, + "grad_norm": 1.0561445897331958, + "k1_kl": 0.044921875, + "k3_kl": 0.0240478515625, + "kimi_kl": 0.055419921875, + "learning_rate": 3.919e-07, + "loss": 0.001, + "ppl": 0.00726318359375, + "reward": 0.9774559140205383, + "reward_std": 0.011501484550535679, + "rewards/perpo_ocr_edit_distance_reward": 0.9774559140205383, "step": 1081, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 528.5, - "delta_ref_entropy_loss": 0.02301025390625, - "delta_ref_ppl": -0.02325439453125, - "entropy_loss": -0.018096923828125, - "epoch": 0.4328, - "grad_norm": 0.019435778731277446, - "k1_kl": 0.023193359375, - "k3_kl": 0.014312744140625, - "kimi_kl": 0.0380859375, - "learning_rate": 2.836e-07, - "loss": 0.0006, - "ppl": 0.00757598876953125, - "reward": 0.9985455274581909, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9985455274581909, + "advantages": -2.2990363390817947e-07, + "completion_length": 1308.0, + "delta_ref_entropy_loss": 0.036376953125, + "delta_ref_ppl": -0.0299072265625, + "entropy_loss": -0.052490234375, + "epoch": 0.2164, + "grad_norm": 1.1134750622890988, + "k1_kl": 0.030029296875, + "k3_kl": 0.0218505859375, + "kimi_kl": 0.037109375, + "learning_rate": 3.9179999999999995e-07, + "loss": 0.0009, + "ppl": 0.0245361328125, + "reward": 0.5875757932662964, + "reward_std": 0.05574547499418259, + "rewards/perpo_ocr_edit_distance_reward": 0.5875758528709412, "step": 1082, "temperature": 0.9 }, { - "advantages": -0.0002675269315659534, - "completion_length": 984.5, - "delta_ref_entropy_loss": 0.026123046875, - "delta_ref_ppl": -0.0185546875, - "entropy_loss": -0.025848388671875, - "epoch": 0.4332, - "grad_norm": 0.2829569919482908, - "k1_kl": 0.0186767578125, - "k3_kl": 0.010711669921875, - "kimi_kl": 0.0267333984375, - "learning_rate": 2.8339999999999996e-07, - "loss": 0.0007, - "ppl": 0.01226806640625, - "reward": 0.9988362193107605, - "reward_std": 0.00015942346362862736, - "rewards/perpo_ocr_edit_distance_reward": 0.9988362491130829, + "advantages": -1.3181142094254028e-05, + "completion_length": 1199.0, + "delta_ref_entropy_loss": 0.0322265625, + "delta_ref_ppl": -0.0458984375, + "entropy_loss": -0.0291748046875, + "epoch": 0.2166, + "grad_norm": 0.5312897392762173, + "k1_kl": 0.0458984375, + "k3_kl": 0.029052734375, + "kimi_kl": 0.07373046875, + "learning_rate": 3.917e-07, + "loss": 0.0012, + "ppl": 0.0157470703125, + "reward": 0.9913556575775146, + "reward_std": 0.0018365941941738129, + "rewards/perpo_ocr_edit_distance_reward": 0.9913556575775146, "step": 1083, "temperature": 0.9 }, { - "advantages": -4.14678052607087e-06, - "completion_length": 684.0, - "delta_ref_entropy_loss": 0.0606689453125, - "delta_ref_ppl": -0.05078125, - "entropy_loss": -0.0670166015625, - "epoch": 0.4336, - "grad_norm": 1.064804443468628, - "k1_kl": 0.051025390625, - "k3_kl": 0.0294189453125, - "kimi_kl": 0.064697265625, - "learning_rate": 2.832e-07, - "loss": 0.0012, - "ppl": 0.0352783203125, - "reward": 0.9664715230464935, - "reward_std": 0.043539746198803186, - "rewards/perpo_ocr_edit_distance_reward": 0.9664716124534607, + "advantages": 0.0, + "completion_length": 47.0, + "delta_ref_entropy_loss": 0.171875, + "delta_ref_ppl": -0.640625, + "entropy_loss": -0.04150390625, + "epoch": 0.2168, + "grad_norm": 0.062373940128537066, + "k1_kl": 0.640625, + "k3_kl": 0.52734375, + "kimi_kl": 2.21875, + "learning_rate": 3.916e-07, + "loss": 0.021, + "ppl": 0.0087890625, + "reward": 0.9219858646392822, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9219858050346375, "step": 1084, "temperature": 0.9 }, { - "advantages": -8.004052460819366e-07, - "completion_length": 195.5, - "delta_ref_entropy_loss": 0.076904296875, - "delta_ref_ppl": -0.0528564453125, - "entropy_loss": -0.08868408203125, - "epoch": 0.434, - "grad_norm": 0.9098226385935001, - "k1_kl": 0.052978515625, - "k3_kl": 0.033203125, - "kimi_kl": 0.081298828125, - "learning_rate": 2.83e-07, - "loss": 0.0013, - "ppl": 0.047119140625, - "reward": 0.8061765134334564, - "reward_std": 0.023562202230095863, - "rewards/perpo_ocr_edit_distance_reward": 0.8061765432357788, + "advantages": -7.152557941481064e-07, + "completion_length": 2047.0, + "delta_ref_entropy_loss": 0.12060546875, + "delta_ref_ppl": -0.07470703125, + "entropy_loss": -0.26171875, + "epoch": 0.217, + "grad_norm": 8.621650727855025, + "k1_kl": 0.07470703125, + "k3_kl": 0.08349609375, + "kimi_kl": 0.09130859375, + "learning_rate": 3.915e-07, + "loss": 0.0033, + "ppl": 0.1787109375, + "reward": 0.7491343021392822, + "reward_std": 0.11340835690498352, + "rewards/perpo_ocr_edit_distance_reward": 0.749134361743927, "step": 1085, "temperature": 0.9 }, { - "advantages": -6.213358756212983e-05, - "completion_length": 1354.0, - "delta_ref_entropy_loss": 0.01611328125, - "delta_ref_ppl": -0.0084686279296875, - "entropy_loss": -0.022369384765625, - "epoch": 0.4344, - "grad_norm": 0.4794005193549197, - "k1_kl": 0.008453369140625, - "k3_kl": 0.003742218017578125, - "kimi_kl": 0.006622314453125, - "learning_rate": 2.8279999999999996e-07, - "loss": 0.0002, - "ppl": 0.0092315673828125, - "reward": 0.9740688502788544, - "reward_std": 0.0008916749502532184, - "rewards/perpo_ocr_edit_distance_reward": 0.9740688502788544, + "advantages": 0.0, + "completion_length": 37.0, + "delta_ref_entropy_loss": 0.2216796875, + "delta_ref_ppl": -0.376953125, + "entropy_loss": -0.033935546875, + "epoch": 0.2172, + "grad_norm": 0.09467329760666879, + "k1_kl": 0.37890625, + "k3_kl": 0.263671875, + "kimi_kl": 0.8984375, + "learning_rate": 3.914e-07, + "loss": 0.0105, + "ppl": 0.007171630859375, + "reward": 0.9398496150970459, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9398496150970459, "step": 1086, "temperature": 0.9 }, { - "advantages": -1.6429595689260168e-05, - "completion_length": 721.0, - "delta_ref_entropy_loss": 0.097412109375, - "delta_ref_ppl": -0.061767578125, - "entropy_loss": -0.1689453125, - "epoch": 0.4348, - "grad_norm": 2.9384527435973706, - "k1_kl": 0.061767578125, - "k3_kl": 0.03985595703125, - "kimi_kl": 0.0799560546875, - "learning_rate": 2.826e-07, - "loss": 0.0016, - "ppl": 0.102783203125, - "reward": 0.8971556425094604, - "reward_std": 0.002205016673542559, - "rewards/perpo_ocr_edit_distance_reward": 0.8971557021141052, + "advantages": 5.10896995820076e-07, + "completion_length": 1119.0, + "delta_ref_entropy_loss": 0.08837890625, + "delta_ref_ppl": -0.078125, + "entropy_loss": -0.1064453125, + "epoch": 0.2174, + "grad_norm": 3.8485904789322056, + "k1_kl": 0.07861328125, + "k3_kl": 0.04736328125, + "kimi_kl": 0.1123046875, + "learning_rate": 3.9129999999999996e-07, + "loss": 0.0019, + "ppl": 0.05859375, + "reward": 0.9395111799240112, + "reward_std": 0.01608450897037983, + "rewards/perpo_ocr_edit_distance_reward": 0.9395111799240112, "step": 1087, "temperature": 0.9 }, { - "advantages": -5.959613190498203e-05, - "completion_length": 646.0, - "delta_ref_entropy_loss": 0.0284423828125, - "delta_ref_ppl": -0.022247314453125, - "entropy_loss": -0.01560211181640625, - "epoch": 0.4352, - "grad_norm": 0.3870183051536338, - "k1_kl": 0.022247314453125, - "k3_kl": 0.0135650634765625, - "kimi_kl": 0.028472900390625, - "learning_rate": 2.824e-07, - "loss": 0.0006, - "ppl": 0.0091705322265625, - "reward": 0.9997861385345459, - "reward_std": 0.00020004576072096825, - "rewards/perpo_ocr_edit_distance_reward": 0.9997861683368683, + "advantages": -0.00011417696077842265, + "completion_length": 737.0, + "delta_ref_entropy_loss": 0.040771484375, + "delta_ref_ppl": -0.038330078125, + "entropy_loss": -0.0244140625, + "epoch": 0.2176, + "grad_norm": 0.3276272605196492, + "k1_kl": 0.038330078125, + "k3_kl": 0.02392578125, + "kimi_kl": 0.057373046875, + "learning_rate": 3.9119999999999996e-07, + "loss": 0.0011, + "ppl": 0.0086669921875, + "reward": 0.9966517686843872, + "reward_std": 0.00034738852991722524, + "rewards/perpo_ocr_edit_distance_reward": 0.996651828289032, "step": 1088, "temperature": 0.9 }, { - "advantages": -2.614089498820249e-05, - "completion_length": 399.0, - "delta_ref_entropy_loss": 0.03778076171875, - "delta_ref_ppl": -0.0401611328125, - "entropy_loss": -0.02105712890625, - "epoch": 0.4356, - "grad_norm": 0.4264326658030378, - "k1_kl": 0.040283203125, - "k3_kl": 0.02679443359375, - "kimi_kl": 0.080322265625, - "learning_rate": 2.8219999999999997e-07, - "loss": 0.0011, - "ppl": 0.011138916015625, - "reward": 0.9962822794914246, - "reward_std": 0.00043879239819943905, - "rewards/perpo_ocr_edit_distance_reward": 0.9962823390960693, + "advantages": -2.384185791015625e-07, + "completion_length": 752.0, + "delta_ref_entropy_loss": 0.1357421875, + "delta_ref_ppl": -0.09814453125, + "entropy_loss": -0.265625, + "epoch": 0.2178, + "grad_norm": 2.0061825013335137, + "k1_kl": 0.0986328125, + "k3_kl": 0.0673828125, + "kimi_kl": 0.1357421875, + "learning_rate": 3.911e-07, + "loss": 0.0027, + "ppl": 0.13671875, + "reward": 0.4474329948425293, + "reward_std": 0.1201334148645401, + "rewards/perpo_ocr_edit_distance_reward": 0.4474330544471741, "step": 1089, "temperature": 0.9 }, { - "advantages": -4.894393168797251e-05, - "completion_length": 554.5, - "delta_ref_entropy_loss": 0.0352783203125, - "delta_ref_ppl": -0.0302734375, - "entropy_loss": -0.03021240234375, - "epoch": 0.436, - "grad_norm": 0.35365460445315877, - "k1_kl": 0.0302734375, - "k3_kl": 0.01959228515625, - "kimi_kl": 0.05908203125, - "learning_rate": 2.8199999999999996e-07, - "loss": 0.0008, - "ppl": 0.015777587890625, - "reward": 0.997984766960144, - "reward_std": 0.0010193444031756371, - "rewards/perpo_ocr_edit_distance_reward": 0.9979848265647888, + "advantages": -1.6842570403241552e-05, + "completion_length": 1863.0, + "delta_ref_entropy_loss": 0.06591796875, + "delta_ref_ppl": -0.04638671875, + "entropy_loss": -0.07080078125, + "epoch": 0.218, + "grad_norm": 1.2515529206835334, + "k1_kl": 0.04638671875, + "k3_kl": 0.03515625, + "kimi_kl": 0.05517578125, + "learning_rate": 3.91e-07, + "loss": 0.0014, + "ppl": 0.0400390625, + "reward": 0.9780182838439941, + "reward_std": 0.003946912940591574, + "rewards/perpo_ocr_edit_distance_reward": 0.9780184030532837, "step": 1090, "temperature": 0.9 }, { - "advantages": -3.17990779876709e-05, - "completion_length": 482.5, - "delta_ref_entropy_loss": 0.041748046875, - "delta_ref_ppl": -0.04437255859375, - "entropy_loss": -0.0341796875, - "epoch": 0.4364, - "grad_norm": 0.38435383573402604, - "k1_kl": 0.04437255859375, - "k3_kl": 0.028167724609375, - "kimi_kl": 0.0806884765625, - "learning_rate": 2.818e-07, - "loss": 0.0012, - "ppl": 0.01824951171875, - "reward": 0.9970653057098389, - "reward_std": 0.0005525367450900376, - "rewards/perpo_ocr_edit_distance_reward": 0.9970653653144836, + "advantages": -0.00011483261187095195, + "completion_length": 350.0, + "delta_ref_entropy_loss": 0.08203125, + "delta_ref_ppl": -0.11669921875, + "entropy_loss": -0.0250244140625, + "epoch": 0.2182, + "grad_norm": 0.9511237431403183, + "k1_kl": 0.11669921875, + "k3_kl": 0.08447265625, + "kimi_kl": 0.310546875, + "learning_rate": 3.909e-07, + "loss": 0.0035, + "ppl": 0.01275634765625, + "reward": 0.7026576995849609, + "reward_std": 0.0005673733539879322, + "rewards/perpo_ocr_edit_distance_reward": 0.7026578783988953, "step": 1091, "temperature": 0.9 }, { - "advantages": -1.8485956445601914e-05, - "completion_length": 657.5, - "delta_ref_entropy_loss": 0.10626220703125, - "delta_ref_ppl": -0.054840087890625, - "entropy_loss": -0.14404296875, - "epoch": 0.4368, - "grad_norm": 1.2358061856562914, - "k1_kl": 0.054840087890625, - "k3_kl": 0.032989501953125, - "kimi_kl": 0.057952880859375, - "learning_rate": 2.816e-07, - "loss": 0.0013, - "ppl": 0.07745361328125, - "reward": 0.8501745164394379, - "reward_std": 0.008326587441843003, - "rewards/perpo_ocr_edit_distance_reward": 0.850174605846405, + "advantages": -8.685248758411035e-06, + "completion_length": 1491.0, + "delta_ref_entropy_loss": 0.01397705078125, + "delta_ref_ppl": -0.033935546875, + "entropy_loss": -0.0255126953125, + "epoch": 0.2184, + "grad_norm": 0.4439557228074404, + "k1_kl": 0.033935546875, + "k3_kl": 0.0247802734375, + "kimi_kl": 0.09228515625, + "learning_rate": 3.908e-07, + "loss": 0.001, + "ppl": 0.013916015625, + "reward": 0.9868019223213196, + "reward_std": 0.002842125715687871, + "rewards/perpo_ocr_edit_distance_reward": 0.9868019819259644, "step": 1092, "temperature": 0.9 }, { - "advantages": -3.193106010712654e-07, - "completion_length": 125.5, - "delta_ref_entropy_loss": 0.0836181640625, - "delta_ref_ppl": -0.202392578125, - "entropy_loss": -0.04638671875, - "epoch": 0.4372, - "grad_norm": 1.0363805667157056, - "k1_kl": 0.203369140625, - "k3_kl": 0.1552734375, - "kimi_kl": 0.66943359375, - "learning_rate": 2.8139999999999997e-07, - "loss": 0.0062, - "ppl": 0.02032470703125, - "reward": 0.9525982141494751, - "reward_std": 0.08064247667789459, - "rewards/perpo_ocr_edit_distance_reward": 0.9525982439517975, + "advantages": -2.3560865884064697e-05, + "completion_length": 879.0, + "delta_ref_entropy_loss": 0.109375, + "delta_ref_ppl": -0.08056640625, + "entropy_loss": -0.04052734375, + "epoch": 0.2186, + "grad_norm": 0.5878244729014691, + "k1_kl": 0.08056640625, + "k3_kl": 0.042724609375, + "kimi_kl": 0.10888671875, + "learning_rate": 3.9069999999999997e-07, + "loss": 0.0017, + "ppl": 0.017578125, + "reward": 0.972476601600647, + "reward_std": 0.001345275086350739, + "rewards/perpo_ocr_edit_distance_reward": 0.9724766612052917, "step": 1093, "temperature": 0.9 }, { - "advantages": -1.6910690192162292e-05, - "completion_length": 418.5, - "delta_ref_entropy_loss": 0.0614013671875, - "delta_ref_ppl": -0.0810546875, - "entropy_loss": -0.06494140625, - "epoch": 0.4376, - "grad_norm": 1.3133680642397658, - "k1_kl": 0.0810546875, - "k3_kl": 0.054443359375, - "kimi_kl": 0.19921875, - "learning_rate": 2.812e-07, - "loss": 0.0022, - "ppl": 0.03106689453125, - "reward": 0.9883656799793243, - "reward_std": 0.003001141478307545, - "rewards/perpo_ocr_edit_distance_reward": 0.9883657991886139, + "advantages": 0.0, + "completion_length": 250.0, + "delta_ref_entropy_loss": 0.0712890625, + "delta_ref_ppl": -0.0947265625, + "entropy_loss": -0.0130615234375, + "epoch": 0.2188, + "grad_norm": 0.008841958962035868, + "k1_kl": 0.09521484375, + "k3_kl": 0.0634765625, + "kimi_kl": 0.2021484375, + "learning_rate": 3.9059999999999996e-07, + "loss": 0.0025, + "ppl": 0.00225830078125, + "reward": 0.9929220676422119, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9929221272468567, "step": 1094, "temperature": 0.9 }, { - "advantages": -2.435275564494077e-05, - "completion_length": 765.0, - "delta_ref_entropy_loss": 0.01953125, - "delta_ref_ppl": -0.01861572265625, - "entropy_loss": -0.024017333984375, - "epoch": 0.438, - "grad_norm": 0.4307383475335131, - "k1_kl": 0.01861572265625, - "k3_kl": 0.011383056640625, - "kimi_kl": 0.029296875, - "learning_rate": 2.8100000000000004e-07, - "loss": 0.0005, - "ppl": 0.01092529296875, - "reward": 0.9847878813743591, - "reward_std": 0.0016546151600778103, - "rewards/perpo_ocr_edit_distance_reward": 0.9847879409790039, + "advantages": -1.2261527899681823e-06, + "completion_length": 730.0, + "delta_ref_entropy_loss": 0.055908203125, + "delta_ref_ppl": -0.05859375, + "entropy_loss": -0.0294189453125, + "epoch": 0.219, + "grad_norm": 0.974689019492367, + "k1_kl": 0.05859375, + "k3_kl": 0.038818359375, + "kimi_kl": 0.14453125, + "learning_rate": 3.905e-07, + "loss": 0.0016, + "ppl": 0.01318359375, + "reward": 0.982562780380249, + "reward_std": 0.007100271992385387, + "rewards/perpo_ocr_edit_distance_reward": 0.982562780380249, "step": 1095, "temperature": 0.9 }, { - "advantages": -2.2121839720057324e-05, - "completion_length": 615.5, - "delta_ref_entropy_loss": 0.03228759765625, - "delta_ref_ppl": -0.0255126953125, - "entropy_loss": -0.0281982421875, - "epoch": 0.4384, - "grad_norm": 0.6631359888365582, - "k1_kl": 0.02545166015625, - "k3_kl": 0.016387939453125, - "kimi_kl": 0.04998779296875, - "learning_rate": 2.8079999999999997e-07, - "loss": 0.0007, - "ppl": 0.014862060546875, - "reward": 0.9913780093193054, - "reward_std": 0.0002387301647104323, - "rewards/perpo_ocr_edit_distance_reward": 0.9913780093193054, + "advantages": -0.0005960464477539062, + "completion_length": 550.0, + "delta_ref_entropy_loss": 0.0380859375, + "delta_ref_ppl": -0.034423828125, + "entropy_loss": -0.01611328125, + "epoch": 0.2192, + "grad_norm": 0.013237128428976322, + "k1_kl": 0.03466796875, + "k3_kl": 0.0201416015625, + "kimi_kl": 0.0458984375, + "learning_rate": 3.904e-07, + "loss": 0.0014, + "ppl": 0.004302978515625, + "reward": 0.9707411527633667, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9707412719726562, "step": 1096, "temperature": 0.9 }, { - "advantages": -1.984409027500078e-05, - "completion_length": 445.0, - "delta_ref_entropy_loss": 0.06488037109375, - "delta_ref_ppl": -0.137451171875, - "entropy_loss": -0.07275390625, - "epoch": 0.4388, - "grad_norm": 5.20986036971549, - "k1_kl": 0.13739013671875, - "k3_kl": 0.09967041015625, - "kimi_kl": 0.269775390625, - "learning_rate": 2.806e-07, - "loss": 0.004, - "ppl": 0.04931640625, - "reward": 0.9812265336513519, - "reward_std": 0.0024984186165966094, - "rewards/perpo_ocr_edit_distance_reward": 0.9812265634536743, + "advantages": -0.00012493986287154257, + "completion_length": 616.0, + "delta_ref_entropy_loss": 0.058349609375, + "delta_ref_ppl": -0.0791015625, + "entropy_loss": -0.0155029296875, + "epoch": 0.2194, + "grad_norm": 0.7229817419270688, + "k1_kl": 0.07861328125, + "k3_kl": 0.04931640625, + "kimi_kl": 0.14453125, + "learning_rate": 3.9029999999999994e-07, + "loss": 0.0021, + "ppl": 0.005950927734375, + "reward": 0.9937693476676941, + "reward_std": 0.00024077876878436655, + "rewards/perpo_ocr_edit_distance_reward": 0.9937693476676941, "step": 1097, "temperature": 0.9 }, { - "advantages": -1.664672737433648e-05, - "completion_length": 611.0, - "delta_ref_entropy_loss": 0.057861328125, - "delta_ref_ppl": -0.04827880859375, - "entropy_loss": -0.063720703125, - "epoch": 0.4392, - "grad_norm": 0.860288472680571, - "k1_kl": 0.04833984375, - "k3_kl": 0.029876708984375, - "kimi_kl": 0.106201171875, - "learning_rate": 2.804e-07, - "loss": 0.0012, - "ppl": 0.03369140625, - "reward": 0.9632999897003174, - "reward_std": 0.02275031543103978, - "rewards/perpo_ocr_edit_distance_reward": 0.9633000493049622, + "advantages": -3.9275204471778125e-05, + "completion_length": 554.0, + "delta_ref_entropy_loss": 0.0703125, + "delta_ref_ppl": -0.07568359375, + "entropy_loss": -0.05029296875, + "epoch": 0.2196, + "grad_norm": 1.3030610630008936, + "k1_kl": 0.0751953125, + "k3_kl": 0.044189453125, + "kimi_kl": 0.1328125, + "learning_rate": 3.902e-07, + "loss": 0.0018, + "ppl": 0.0244140625, + "reward": 0.994178295135498, + "reward_std": 0.0025008143857121468, + "rewards/perpo_ocr_edit_distance_reward": 0.9941784143447876, "step": 1098, "temperature": 0.9 }, { - "advantages": -0.00012413944932632148, - "completion_length": 754.0, - "delta_ref_entropy_loss": 0.0194091796875, - "delta_ref_ppl": -0.02191162109375, - "entropy_loss": -0.01708984375, - "epoch": 0.4396, - "grad_norm": 0.3502692712012292, - "k1_kl": 0.02197265625, - "k3_kl": 0.013458251953125, - "kimi_kl": 0.03106689453125, - "learning_rate": 2.802e-07, - "loss": 0.0007, - "ppl": 0.0083770751953125, - "reward": 0.9989950656890869, - "reward_std": 0.00041521977254888043, - "rewards/perpo_ocr_edit_distance_reward": 0.9989951550960541, + "advantages": -9.128026249527466e-06, + "completion_length": 948.0, + "delta_ref_entropy_loss": 0.0712890625, + "delta_ref_ppl": -0.072265625, + "entropy_loss": -0.07666015625, + "epoch": 0.2198, + "grad_norm": 2.4553121861464597, + "k1_kl": 0.072265625, + "k3_kl": 0.042236328125, + "kimi_kl": 0.09033203125, + "learning_rate": 3.901e-07, + "loss": 0.0017, + "ppl": 0.05712890625, + "reward": 0.9766213893890381, + "reward_std": 0.0036382919643074274, + "rewards/perpo_ocr_edit_distance_reward": 0.9766214489936829, "step": 1099, "temperature": 0.9 }, { - "advantages": 4.810946570010799e-07, - "completion_length": 1454.5, - "delta_ref_entropy_loss": 0.02935791015625, - "delta_ref_ppl": -0.02130126953125, - "entropy_loss": -0.109619140625, - "epoch": 0.44, - "grad_norm": 47.53417429521547, - "k1_kl": 0.02142333984375, - "k3_kl": 0.0948486328125, - "kimi_kl": 0.03485107421875, - "learning_rate": 2.8e-07, - "loss": 0.0038, - "ppl": 0.07049560546875, - "reward": 0.9268530607223511, - "reward_std": 0.004388625900901388, - "rewards/perpo_ocr_edit_distance_reward": 0.9268530607223511, + "advantages": -0.0001259616547031328, + "completion_length": 462.0, + "delta_ref_entropy_loss": 0.027587890625, + "delta_ref_ppl": -0.03662109375, + "entropy_loss": -0.01275634765625, + "epoch": 0.22, + "grad_norm": 0.5866401269030587, + "k1_kl": 0.03662109375, + "k3_kl": 0.024169921875, + "kimi_kl": 0.07958984375, + "learning_rate": 3.8999999999999997e-07, + "loss": 0.0011, + "ppl": 0.00469970703125, + "reward": 0.9978991746902466, + "reward_std": 0.00037324678851291537, + "rewards/perpo_ocr_edit_distance_reward": 0.9978992342948914, "step": 1100, "temperature": 0.9 }, { - "advantages": -0.00010580250818748027, - "completion_length": 649.0, - "delta_ref_entropy_loss": 0.046875, - "delta_ref_ppl": -0.0430908203125, - "entropy_loss": -0.05126953125, - "epoch": 0.4404, - "grad_norm": 1.0739229056773292, - "k1_kl": 0.0433349609375, - "k3_kl": 0.0267333984375, - "kimi_kl": 0.0882568359375, - "learning_rate": 2.798e-07, - "loss": 0.0012, - "ppl": 0.02618408203125, - "reward": 0.9844757318496704, - "reward_std": 0.0004617693048203364, - "rewards/perpo_ocr_edit_distance_reward": 0.9844757616519928, + "advantages": -1.4168876077746972e-05, + "completion_length": 519.0, + "delta_ref_entropy_loss": 0.140625, + "delta_ref_ppl": -0.109375, + "entropy_loss": -0.0966796875, + "epoch": 0.2202, + "grad_norm": 2.0186135953303643, + "k1_kl": 0.109375, + "k3_kl": 0.05908203125, + "kimi_kl": 0.154296875, + "learning_rate": 3.899e-07, + "loss": 0.0024, + "ppl": 0.052734375, + "reward": 0.9547618627548218, + "reward_std": 0.005312852095812559, + "rewards/perpo_ocr_edit_distance_reward": 0.9547619819641113, "step": 1101, "temperature": 0.9 }, { - "advantages": -0.00032135844412550796, - "completion_length": 660.0, - "delta_ref_entropy_loss": 0.04144287109375, - "delta_ref_ppl": -0.04510498046875, - "entropy_loss": -0.025146484375, - "epoch": 0.4408, - "grad_norm": 0.9041002886156817, - "k1_kl": 0.04534912109375, - "k3_kl": 0.028350830078125, - "kimi_kl": 0.0863037109375, - "learning_rate": 2.796e-07, - "loss": 0.0015, - "ppl": 0.01116943359375, - "reward": 0.9264042675495148, - "reward_std": 0.0003149319381918758, - "rewards/perpo_ocr_edit_distance_reward": 0.9264043569564819, + "advantages": 1.8596649169921875e-05, + "completion_length": 815.0, + "delta_ref_entropy_loss": 0.060546875, + "delta_ref_ppl": -0.0498046875, + "entropy_loss": -0.0220947265625, + "epoch": 0.2204, + "grad_norm": 0.7000201848575317, + "k1_kl": 0.0498046875, + "k3_kl": 0.0244140625, + "kimi_kl": 0.0654296875, + "learning_rate": 3.8979999999999996e-07, + "loss": 0.001, + "ppl": 0.00933837890625, + "reward": 0.6625394821166992, + "reward_std": 0.0003585220838431269, + "rewards/perpo_ocr_edit_distance_reward": 0.6625394821166992, "step": 1102, "temperature": 0.9 }, { - "advantages": -2.5476729206275195e-05, - "completion_length": 544.0, - "delta_ref_entropy_loss": 0.05120849609375, - "delta_ref_ppl": -0.0396728515625, - "entropy_loss": -0.049774169921875, - "epoch": 0.4412, - "grad_norm": 0.6674334713013346, - "k1_kl": 0.03955078125, - "k3_kl": 0.01922607421875, - "kimi_kl": 0.04052734375, - "learning_rate": 2.794e-07, - "loss": 0.0008, - "ppl": 0.0238037109375, - "reward": 0.9925636947154999, - "reward_std": 0.0006583079230040312, - "rewards/perpo_ocr_edit_distance_reward": 0.9925637245178223, + "advantages": -9.380919800605625e-05, + "completion_length": 870.0, + "delta_ref_entropy_loss": 0.05810546875, + "delta_ref_ppl": -0.056640625, + "entropy_loss": -0.0296630859375, + "epoch": 0.2206, + "grad_norm": 0.4579104798602294, + "k1_kl": 0.056640625, + "k3_kl": 0.03662109375, + "kimi_kl": 0.1259765625, + "learning_rate": 3.8969999999999995e-07, + "loss": 0.0016, + "ppl": 0.013671875, + "reward": 0.9485339522361755, + "reward_std": 0.0008984280866570771, + "rewards/perpo_ocr_edit_distance_reward": 0.9485340118408203, "step": 1103, "temperature": 0.9 }, { - "advantages": -1.5005470231699292e-05, - "completion_length": 596.0, - "delta_ref_entropy_loss": 0.02655029296875, - "delta_ref_ppl": -0.0284423828125, - "entropy_loss": -0.0323486328125, - "epoch": 0.4416, - "grad_norm": 0.43875240308182106, - "k1_kl": 0.0284423828125, - "k3_kl": 0.0201416015625, - "kimi_kl": 0.0521240234375, - "learning_rate": 2.792e-07, - "loss": 0.0008, - "ppl": 0.01605224609375, - "reward": 0.9880558848381042, - "reward_std": 0.0007488047704100609, - "rewards/perpo_ocr_edit_distance_reward": 0.9880559146404266, + "advantages": -9.681497431301977e-06, + "completion_length": 1046.0, + "delta_ref_entropy_loss": 0.08544921875, + "delta_ref_ppl": -0.06396484375, + "entropy_loss": -0.07080078125, + "epoch": 0.2208, + "grad_norm": 0.9016057283530371, + "k1_kl": 0.06396484375, + "k3_kl": 0.033447265625, + "kimi_kl": 0.07861328125, + "learning_rate": 3.896e-07, + "loss": 0.0013, + "ppl": 0.035888671875, + "reward": 0.955689013004303, + "reward_std": 0.005177667830139399, + "rewards/perpo_ocr_edit_distance_reward": 0.9556890726089478, "step": 1104, "temperature": 0.9 }, { - "advantages": -0.00012857148476541624, - "completion_length": 1391.5, - "delta_ref_entropy_loss": 0.011810302734375, - "delta_ref_ppl": -0.008697509765625, - "entropy_loss": -0.02392578125, - "epoch": 0.442, - "grad_norm": 0.3389096931850168, - "k1_kl": 0.0086822509765625, - "k3_kl": 0.00536346435546875, - "kimi_kl": 0.01203155517578125, - "learning_rate": 2.79e-07, - "loss": 0.0003, - "ppl": 0.0120849609375, - "reward": 0.9846571981906891, - "reward_std": 0.0032050320005510002, - "rewards/perpo_ocr_edit_distance_reward": 0.9846572875976562, + "advantages": -1.7404556274414062e-05, + "completion_length": 974.0, + "delta_ref_entropy_loss": 0.10009765625, + "delta_ref_ppl": -0.08642578125, + "entropy_loss": -0.10107421875, + "epoch": 0.221, + "grad_norm": 1.112815574141896, + "k1_kl": 0.0859375, + "k3_kl": 0.0498046875, + "kimi_kl": 0.11328125, + "learning_rate": 3.895e-07, + "loss": 0.002, + "ppl": 0.05419921875, + "reward": 0.9397664666175842, + "reward_std": 0.001856097369454801, + "rewards/perpo_ocr_edit_distance_reward": 0.939766526222229, "step": 1105, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 445.0, - "delta_ref_entropy_loss": 0.02606201171875, - "delta_ref_ppl": -0.02911376953125, - "entropy_loss": -0.0196533203125, - "epoch": 0.4424, - "grad_norm": 0.02008874301896849, - "k1_kl": 0.029296875, - "k3_kl": 0.0177001953125, - "kimi_kl": 0.0491943359375, - "learning_rate": 2.788e-07, + "advantages": -3.048352027690271e-06, + "completion_length": 716.0, + "delta_ref_entropy_loss": 0.02587890625, + "delta_ref_ppl": -0.03125, + "entropy_loss": -0.0703125, + "epoch": 0.2212, + "grad_norm": 1.3295912426040821, + "k1_kl": 0.03125, + "k3_kl": 0.0184326171875, + "kimi_kl": 0.040771484375, + "learning_rate": 3.8940000000000003e-07, "loss": 0.0007, - "ppl": 0.0088958740234375, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "ppl": 0.040771484375, + "reward": 0.8396212458610535, + "reward_std": 0.030548183247447014, + "rewards/perpo_ocr_edit_distance_reward": 0.8396213054656982, "step": 1106, "temperature": 0.9 }, { - "advantages": -0.0003298861665825825, - "completion_length": 438.0, - "delta_ref_entropy_loss": 0.0498046875, - "delta_ref_ppl": -0.0665283203125, - "entropy_loss": -0.01953125, - "epoch": 0.4428, - "grad_norm": 0.6890069983415477, - "k1_kl": 0.06640625, - "k3_kl": 0.04669189453125, - "kimi_kl": 0.173095703125, - "learning_rate": 2.786e-07, - "loss": 0.0022, - "ppl": 0.0078277587890625, - "reward": 0.9991082847118378, - "reward_std": 0.00035088969161733985, - "rewards/perpo_ocr_edit_distance_reward": 0.9991083443164825, + "advantages": -2.1002122593927197e-05, + "completion_length": 1369.0, + "delta_ref_entropy_loss": 0.062255859375, + "delta_ref_ppl": -0.043701171875, + "entropy_loss": -0.03466796875, + "epoch": 0.2214, + "grad_norm": 1.0286075626604543, + "k1_kl": 0.043701171875, + "k3_kl": 0.0235595703125, + "kimi_kl": 0.06103515625, + "learning_rate": 3.8929999999999997e-07, + "loss": 0.001, + "ppl": 0.01544189453125, + "reward": 0.9620859622955322, + "reward_std": 0.0011155146639794111, + "rewards/perpo_ocr_edit_distance_reward": 0.9620859026908875, "step": 1107, "temperature": 0.9 }, { - "advantages": -6.699136315546639e-05, - "completion_length": 1287.5, - "delta_ref_entropy_loss": 0.032501220703125, - "delta_ref_ppl": -0.020782470703125, - "entropy_loss": -0.04638671875, - "epoch": 0.4432, - "grad_norm": 1.658721471490001, - "k1_kl": 0.020782470703125, - "k3_kl": 0.013641357421875, - "kimi_kl": 0.041656494140625, - "learning_rate": 2.7839999999999995e-07, - "loss": 0.0006, - "ppl": 0.02655029296875, - "reward": 0.9859067499637604, - "reward_std": 0.0010076358157675713, - "rewards/perpo_ocr_edit_distance_reward": 0.9859068095684052, + "advantages": -0.0005960464477539062, + "completion_length": 61.0, + "delta_ref_entropy_loss": 0.10302734375, + "delta_ref_ppl": -0.5078125, + "entropy_loss": -0.047119140625, + "epoch": 0.2216, + "grad_norm": 0.07959264293391417, + "k1_kl": 0.5078125, + "k3_kl": 0.421875, + "kimi_kl": 1.8828125, + "learning_rate": 3.8919999999999996e-07, + "loss": 0.0174, + "ppl": 0.0146484375, + "reward": 0.9634145498275757, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9634146690368652, "step": 1108, "temperature": 0.9 }, { - "advantages": -6.10521892667748e-05, - "completion_length": 557.5, - "delta_ref_entropy_loss": 0.0126953125, - "delta_ref_ppl": -0.014373779296875, - "entropy_loss": -0.01776123046875, - "epoch": 0.4436, - "grad_norm": 0.23450084372837202, - "k1_kl": 0.014434814453125, - "k3_kl": 0.00970458984375, - "kimi_kl": 0.023681640625, - "learning_rate": 2.782e-07, - "loss": 0.0004, - "ppl": 0.010040283203125, - "reward": 0.9992210865020752, - "reward_std": 0.0002638506412040442, - "rewards/perpo_ocr_edit_distance_reward": 0.99922114610672, + "advantages": -2.0223005776642822e-05, + "completion_length": 850.0, + "delta_ref_entropy_loss": 0.0576171875, + "delta_ref_ppl": -0.064453125, + "entropy_loss": -0.02099609375, + "epoch": 0.2218, + "grad_norm": 0.34666511770037955, + "k1_kl": 0.064453125, + "k3_kl": 0.03564453125, + "kimi_kl": 0.1015625, + "learning_rate": 3.891e-07, + "loss": 0.0014, + "ppl": 0.007781982421875, + "reward": 0.9844704270362854, + "reward_std": 0.0003205750253982842, + "rewards/perpo_ocr_edit_distance_reward": 0.9844704270362854, "step": 1109, "temperature": 0.9 }, { - "advantages": -0.00017645104526309296, - "completion_length": 1151.0, - "delta_ref_entropy_loss": 0.0191650390625, - "delta_ref_ppl": -0.0113525390625, - "entropy_loss": -0.015869140625, - "epoch": 0.444, - "grad_norm": 0.2746749406070507, - "k1_kl": 0.011383056640625, - "k3_kl": 0.0059661865234375, - "kimi_kl": 0.011444091796875, - "learning_rate": 2.7800000000000003e-07, - "loss": 0.0004, - "ppl": 0.008544921875, - "reward": 0.9994066059589386, - "reward_std": 0.0003849916538456455, - "rewards/perpo_ocr_edit_distance_reward": 0.9994067251682281, + "advantages": -3.1897001463221386e-05, + "completion_length": 819.0, + "delta_ref_entropy_loss": 0.0576171875, + "delta_ref_ppl": -0.0537109375, + "entropy_loss": -0.0294189453125, + "epoch": 0.222, + "grad_norm": 0.29617332002289665, + "k1_kl": 0.0537109375, + "k3_kl": 0.03173828125, + "kimi_kl": 0.0966796875, + "learning_rate": 3.89e-07, + "loss": 0.0013, + "ppl": 0.01348876953125, + "reward": 0.9934278130531311, + "reward_std": 0.00016708896146155894, + "rewards/perpo_ocr_edit_distance_reward": 0.9934278726577759, "step": 1110, "temperature": 0.9 }, { - "advantages": -1.278945404692422e-05, - "completion_length": 573.0, - "delta_ref_entropy_loss": 0.08099365234375, - "delta_ref_ppl": -0.045257568359375, - "entropy_loss": -0.08563232421875, - "epoch": 0.4444, - "grad_norm": 1.2025886961381147, - "k1_kl": 0.045257568359375, - "k3_kl": 0.0239105224609375, - "kimi_kl": 0.05242919921875, - "learning_rate": 2.7779999999999996e-07, - "loss": 0.001, - "ppl": 0.045196533203125, - "reward": 0.8007816076278687, - "reward_std": 0.009265757034881972, - "rewards/perpo_ocr_edit_distance_reward": 0.8007816672325134, + "advantages": -2.213886909885332e-05, + "completion_length": 137.0, + "delta_ref_entropy_loss": 0.0810546875, + "delta_ref_ppl": -0.1767578125, + "entropy_loss": -0.036865234375, + "epoch": 0.2222, + "grad_norm": 2.0987293574742036, + "k1_kl": 0.1767578125, + "k3_kl": 0.1328125, + "kimi_kl": 0.474609375, + "learning_rate": 3.889e-07, + "loss": 0.0053, + "ppl": 0.02294921875, + "reward": 0.9697179794311523, + "reward_std": 0.002594772493466735, + "rewards/perpo_ocr_edit_distance_reward": 0.9697180390357971, "step": 1111, "temperature": 0.9 }, { - "advantages": -2.7460712317406433e-05, - "completion_length": 407.0, - "delta_ref_entropy_loss": 0.0341796875, - "delta_ref_ppl": -0.0362548828125, - "entropy_loss": -0.0316162109375, - "epoch": 0.4448, - "grad_norm": 1.0564981910117839, - "k1_kl": 0.0362548828125, - "k3_kl": 0.02301025390625, - "kimi_kl": 0.0537109375, - "learning_rate": 2.776e-07, - "loss": 0.0009, - "ppl": 0.01654052734375, - "reward": 0.9970452189445496, - "reward_std": 0.007632598833879456, - "rewards/perpo_ocr_edit_distance_reward": 0.9970452785491943, + "advantages": -0.0001417739113094285, + "completion_length": 374.0, + "delta_ref_entropy_loss": 0.1455078125, + "delta_ref_ppl": -0.08544921875, + "entropy_loss": -0.04736328125, + "epoch": 0.2224, + "grad_norm": 0.5068185453713044, + "k1_kl": 0.08544921875, + "k3_kl": 0.048095703125, + "kimi_kl": 0.1630859375, + "learning_rate": 3.888e-07, + "loss": 0.0021, + "ppl": 0.0150146484375, + "reward": 0.8355072140693665, + "reward_std": 0.0004405357176437974, + "rewards/perpo_ocr_edit_distance_reward": 0.8355072736740112, "step": 1112, "temperature": 0.9 }, { - "advantages": -1.274049463972915e-05, - "completion_length": 584.5, - "delta_ref_entropy_loss": 0.029327392578125, - "delta_ref_ppl": -0.021759033203125, - "entropy_loss": -0.02197265625, - "epoch": 0.4452, - "grad_norm": 0.3527990705353534, - "k1_kl": 0.02178955078125, - "k3_kl": 0.0134429931640625, - "kimi_kl": 0.0333251953125, - "learning_rate": 2.774e-07, - "loss": 0.0006, - "ppl": 0.01123046875, - "reward": 0.9981106817722321, - "reward_std": 0.0003833516238955781, - "rewards/perpo_ocr_edit_distance_reward": 0.9981107413768768, + "advantages": -0.00012442044680938125, + "completion_length": 241.0, + "delta_ref_entropy_loss": 0.058837890625, + "delta_ref_ppl": -0.12451171875, + "entropy_loss": -0.016357421875, + "epoch": 0.2226, + "grad_norm": 0.4734260517725797, + "k1_kl": 0.1259765625, + "k3_kl": 0.09326171875, + "kimi_kl": 0.357421875, + "learning_rate": 3.887e-07, + "loss": 0.0039, + "ppl": 0.004241943359375, + "reward": 0.9429717063903809, + "reward_std": 0.0003791035560425371, + "rewards/perpo_ocr_edit_distance_reward": 0.9429718255996704, "step": 1113, "temperature": 0.9 }, { - "advantages": -4.7406980229425244e-05, - "completion_length": 748.0, - "delta_ref_entropy_loss": 0.03326416015625, - "delta_ref_ppl": -0.0211944580078125, - "entropy_loss": -0.0477294921875, - "epoch": 0.4456, - "grad_norm": 0.6528399184319342, - "k1_kl": 0.0211639404296875, - "k3_kl": 0.013458251953125, - "kimi_kl": 0.023101806640625, - "learning_rate": 2.7719999999999997e-07, - "loss": 0.0006, - "ppl": 0.029693603515625, - "reward": 0.9957868456840515, - "reward_std": 0.0011724442010745406, - "rewards/perpo_ocr_edit_distance_reward": 0.9957869350910187, + "advantages": -1.370906943520822e-06, + "completion_length": 392.0, + "delta_ref_entropy_loss": 0.1923828125, + "delta_ref_ppl": -0.1591796875, + "entropy_loss": -0.15234375, + "epoch": 0.2228, + "grad_norm": 2.611466124316173, + "k1_kl": 0.16015625, + "k3_kl": 0.09228515625, + "kimi_kl": 0.21484375, + "learning_rate": 3.8859999999999997e-07, + "loss": 0.0037, + "ppl": 0.0791015625, + "reward": 0.6607394218444824, + "reward_std": 0.018467368558049202, + "rewards/perpo_ocr_edit_distance_reward": 0.6607394218444824, "step": 1114, "temperature": 0.9 }, { - "advantages": 1.2201923141219595e-05, - "completion_length": 461.0, - "delta_ref_entropy_loss": 0.052001953125, - "delta_ref_ppl": -0.0587158203125, - "entropy_loss": -0.0799560546875, - "epoch": 0.446, - "grad_norm": 1.0786804919571638, - "k1_kl": 0.0584716796875, - "k3_kl": 0.03582763671875, - "kimi_kl": 0.08251953125, - "learning_rate": 2.77e-07, + "advantages": -2.6736940981209045e-06, + "completion_length": 1110.0, + "delta_ref_entropy_loss": 0.1103515625, + "delta_ref_ppl": -0.06640625, + "entropy_loss": -0.2001953125, + "epoch": 0.223, + "grad_norm": 1.904104161142268, + "k1_kl": 0.06591796875, + "k3_kl": 0.03466796875, + "kimi_kl": 0.07080078125, + "learning_rate": 3.885e-07, "loss": 0.0014, - "ppl": 0.045013427734375, - "reward": 0.86773282289505, - "reward_std": 0.02150560452719219, - "rewards/perpo_ocr_edit_distance_reward": 0.8677328824996948, + "ppl": 0.1015625, + "reward": 0.7815317511558533, + "reward_std": 0.0030738934874534607, + "rewards/perpo_ocr_edit_distance_reward": 0.7815317511558533, "step": 1115, "temperature": 0.9 }, { - "advantages": -0.0002981083733715195, - "completion_length": 462.0, - "delta_ref_entropy_loss": 0.0919189453125, - "delta_ref_ppl": -0.0614013671875, - "entropy_loss": -0.14202880859375, - "epoch": 0.4464, - "grad_norm": 1.09279168067906, - "k1_kl": 0.0614013671875, - "k3_kl": 0.034423828125, - "kimi_kl": 0.100341796875, - "learning_rate": 2.768e-07, - "loss": 0.0017, - "ppl": 0.080413818359375, - "reward": 0.9292263686656952, - "reward_std": 0.11380941420793533, - "rewards/perpo_ocr_edit_distance_reward": 0.9292263984680176, + "advantages": 1.2380736734485254e-05, + "completion_length": 397.0, + "delta_ref_entropy_loss": 0.07080078125, + "delta_ref_ppl": -0.08544921875, + "entropy_loss": -0.0177001953125, + "epoch": 0.2232, + "grad_norm": 1.4149817077918152, + "k1_kl": 0.0859375, + "k3_kl": 0.05322265625, + "kimi_kl": 0.1474609375, + "learning_rate": 3.884e-07, + "loss": 0.0021, + "ppl": 0.005645751953125, + "reward": 0.9917038679122925, + "reward_std": 0.0012763390550389886, + "rewards/perpo_ocr_edit_distance_reward": 0.9917038679122925, "step": 1116, "temperature": 0.9 }, { - "advantages": -0.00011390447926373781, - "completion_length": 665.5, - "delta_ref_entropy_loss": 0.05859375, - "delta_ref_ppl": -0.06591796875, - "entropy_loss": -0.11151123046875, - "epoch": 0.4468, - "grad_norm": 1.356230106899699, - "k1_kl": 0.06585693359375, - "k3_kl": 0.043212890625, - "kimi_kl": 0.1461181640625, - "learning_rate": 2.766e-07, - "loss": 0.0018, - "ppl": 0.05615234375, - "reward": 0.9750402867794037, - "reward_std": 0.006594200323888799, - "rewards/perpo_ocr_edit_distance_reward": 0.9750402569770813, + "advantages": -0.0005960464477539062, + "completion_length": 611.0, + "delta_ref_entropy_loss": 0.041748046875, + "delta_ref_ppl": -0.024658203125, + "entropy_loss": -0.011474609375, + "epoch": 0.2234, + "grad_norm": 0.003053641854496423, + "k1_kl": 0.0245361328125, + "k3_kl": 0.00946044921875, + "kimi_kl": 0.0166015625, + "learning_rate": 3.8829999999999995e-07, + "loss": 0.001, + "ppl": 0.00201416015625, + "reward": 0.9965047240257263, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9965047836303711, "step": 1117, "temperature": 0.9 }, { - "advantages": -0.0003049969677704212, - "completion_length": 537.5, - "delta_ref_entropy_loss": 0.026031494140625, - "delta_ref_ppl": -0.0262451171875, - "entropy_loss": -0.029541015625, - "epoch": 0.4472, - "grad_norm": 0.365692751215637, - "k1_kl": 0.0262451171875, - "k3_kl": 0.0172119140625, - "kimi_kl": 0.0518798828125, - "learning_rate": 2.7639999999999996e-07, - "loss": 0.001, - "ppl": 0.013824462890625, - "reward": 0.9921322166919708, - "reward_std": 0.0033092789817601442, - "rewards/perpo_ocr_edit_distance_reward": 0.9921323657035828, + "advantages": -3.237383862142451e-05, + "completion_length": 276.0, + "delta_ref_entropy_loss": 0.1240234375, + "delta_ref_ppl": -0.19921875, + "entropy_loss": -0.04736328125, + "epoch": 0.2236, + "grad_norm": 1.3614933757119358, + "k1_kl": 0.19921875, + "k3_kl": 0.140625, + "kimi_kl": 0.50390625, + "learning_rate": 3.882e-07, + "loss": 0.0056, + "ppl": 0.022705078125, + "reward": 0.9806980490684509, + "reward_std": 0.002005249261856079, + "rewards/perpo_ocr_edit_distance_reward": 0.9806981682777405, "step": 1118, "temperature": 0.9 }, { - "advantages": -1.9401312442823837e-05, - "completion_length": 1079.0, - "delta_ref_entropy_loss": 0.016571044921875, - "delta_ref_ppl": -0.00839996337890625, - "entropy_loss": -0.01153564453125, - "epoch": 0.4476, - "grad_norm": 0.6553367416032055, - "k1_kl": 0.0084228515625, - "k3_kl": 0.010406494140625, - "kimi_kl": 0.0110931396484375, - "learning_rate": 2.762e-07, - "loss": 0.0004, - "ppl": 0.0075225830078125, - "reward": 0.9717587530612946, - "reward_std": 0.018497686309274286, - "rewards/perpo_ocr_edit_distance_reward": 0.9717588126659393, + "advantages": -2.145767393813003e-05, + "completion_length": 806.0, + "delta_ref_entropy_loss": 0.061279296875, + "delta_ref_ppl": -0.060302734375, + "entropy_loss": -0.030517578125, + "epoch": 0.2238, + "grad_norm": 1.2658243279253687, + "k1_kl": 0.06005859375, + "k3_kl": 0.03662109375, + "kimi_kl": 0.1181640625, + "learning_rate": 3.881e-07, + "loss": 0.0015, + "ppl": 0.0140380859375, + "reward": 0.9826328158378601, + "reward_std": 0.0006947169895283878, + "rewards/perpo_ocr_edit_distance_reward": 0.9826328754425049, "step": 1119, "temperature": 0.9 }, { - "advantages": -3.784895073355443e-06, - "completion_length": 992.0, - "delta_ref_entropy_loss": 0.079345703125, - "delta_ref_ppl": -0.0535888671875, - "entropy_loss": -0.1328125, - "epoch": 0.448, - "grad_norm": 1.6241083624633796, - "k1_kl": 0.0535888671875, - "k3_kl": 0.03076171875, - "kimi_kl": 0.0557861328125, - "learning_rate": 2.7600000000000004e-07, - "loss": 0.0012, - "ppl": 0.0750732421875, - "reward": 0.9403431117534637, - "reward_std": 0.0046168866829248145, - "rewards/perpo_ocr_edit_distance_reward": 0.9403431713581085, + "advantages": 0.0, + "completion_length": 624.0, + "delta_ref_entropy_loss": 0.056396484375, + "delta_ref_ppl": -0.045654296875, + "entropy_loss": -0.0198974609375, + "epoch": 0.224, + "grad_norm": 0.4667735172093663, + "k1_kl": 0.0458984375, + "k3_kl": 0.021484375, + "kimi_kl": 0.0517578125, + "learning_rate": 3.88e-07, + "loss": 0.0009, + "ppl": 0.007598876953125, + "reward": 0.9969800114631653, + "reward_std": 0.0006164525984786451, + "rewards/perpo_ocr_edit_distance_reward": 0.9969800114631653, "step": 1120, "temperature": 0.9 }, { - "advantages": -1.183152244266239e-05, - "completion_length": 516.0, - "delta_ref_entropy_loss": 0.0557861328125, - "delta_ref_ppl": -0.03912353515625, - "entropy_loss": -0.06365966796875, - "epoch": 0.4484, - "grad_norm": 0.8030767299836032, - "k1_kl": 0.03912353515625, - "k3_kl": 0.02313232421875, - "kimi_kl": 0.0625, - "learning_rate": 2.7579999999999997e-07, - "loss": 0.0009, - "ppl": 0.0343017578125, - "reward": 0.9742502570152283, - "reward_std": 0.005536233773455024, - "rewards/perpo_ocr_edit_distance_reward": 0.9742503464221954, + "advantages": -2.5595938495825976e-05, + "completion_length": 657.0, + "delta_ref_entropy_loss": 0.1064453125, + "delta_ref_ppl": -0.11767578125, + "entropy_loss": -0.0791015625, + "epoch": 0.2242, + "grad_norm": 1.9797148383626226, + "k1_kl": 0.11767578125, + "k3_kl": 0.0712890625, + "kimi_kl": 0.1513671875, + "learning_rate": 3.879e-07, + "loss": 0.0029, + "ppl": 0.043701171875, + "reward": 0.8726832866668701, + "reward_std": 0.0018961310852319002, + "rewards/perpo_ocr_edit_distance_reward": 0.8726834058761597, "step": 1121, "temperature": 0.9 }, { - "advantages": -3.185016885254299e-05, - "completion_length": 610.0, - "delta_ref_entropy_loss": 0.05657958984375, - "delta_ref_ppl": -0.044677734375, - "entropy_loss": -0.09423828125, - "epoch": 0.4488, - "grad_norm": 1.1754288285484216, - "k1_kl": 0.04443359375, - "k3_kl": 0.02825927734375, - "kimi_kl": 0.115478515625, - "learning_rate": 2.756e-07, - "loss": 0.0012, - "ppl": 0.0621337890625, - "reward": 0.9618372023105621, - "reward_std": 0.0032970444881357253, - "rewards/perpo_ocr_edit_distance_reward": 0.9618372619152069, + "advantages": 6.811959707420101e-08, + "completion_length": 944.0, + "delta_ref_entropy_loss": 0.07275390625, + "delta_ref_ppl": -0.048583984375, + "entropy_loss": -0.043701171875, + "epoch": 0.2244, + "grad_norm": 6.034313028021474, + "k1_kl": 0.048828125, + "k3_kl": 0.021728515625, + "kimi_kl": 0.052734375, + "learning_rate": 3.8779999999999997e-07, + "loss": 0.0009, + "ppl": 0.021484375, + "reward": 0.9121884703636169, + "reward_std": 0.16942837834358215, + "rewards/perpo_ocr_edit_distance_reward": 0.9121884107589722, "step": 1122, "temperature": 0.9 }, { - "advantages": -1.9890921976184472e-05, - "completion_length": 230.0, - "delta_ref_entropy_loss": 0.0819091796875, - "delta_ref_ppl": -0.1307373046875, - "entropy_loss": -0.04107666015625, - "epoch": 0.4492, - "grad_norm": 1.5115610079613946, - "k1_kl": 0.1307373046875, - "k3_kl": 0.095458984375, - "kimi_kl": 0.5224609375, - "learning_rate": 2.754e-07, - "loss": 0.0038, - "ppl": 0.02288818359375, - "reward": 0.9960124492645264, - "reward_std": 0.0005921811680309474, - "rewards/perpo_ocr_edit_distance_reward": 0.9960125088691711, + "advantages": -6.811959707420101e-08, + "completion_length": 201.0, + "delta_ref_entropy_loss": 0.0888671875, + "delta_ref_ppl": -0.1552734375, + "entropy_loss": -0.10302734375, + "epoch": 0.2246, + "grad_norm": 2.0058240056446057, + "k1_kl": 0.1552734375, + "k3_kl": 0.11572265625, + "kimi_kl": 0.408203125, + "learning_rate": 3.8769999999999996e-07, + "loss": 0.0046, + "ppl": 0.035400390625, + "reward": 0.2942764163017273, + "reward_std": 0.18549343943595886, + "rewards/perpo_ocr_edit_distance_reward": 0.2942764163017273, "step": 1123, "temperature": 0.9 }, { - "advantages": -8.361680556845386e-06, - "completion_length": 622.0, - "delta_ref_entropy_loss": 0.06805419921875, - "delta_ref_ppl": -0.0413818359375, - "entropy_loss": -0.06134033203125, - "epoch": 0.4496, - "grad_norm": 0.998327785666682, - "k1_kl": 0.04132080078125, - "k3_kl": 0.0244140625, - "kimi_kl": 0.0565185546875, - "learning_rate": 2.752e-07, - "loss": 0.001, - "ppl": 0.037353515625, - "reward": 0.9327667951583862, - "reward_std": 0.0013988154532853514, - "rewards/perpo_ocr_edit_distance_reward": 0.9327668249607086, + "advantages": -4.171473847236484e-05, + "completion_length": 1495.0, + "delta_ref_entropy_loss": 0.00927734375, + "delta_ref_ppl": -0.02001953125, + "entropy_loss": -0.01055908203125, + "epoch": 0.2248, + "grad_norm": 0.3687043788957627, + "k1_kl": 0.02001953125, + "k3_kl": 0.01513671875, + "kimi_kl": 0.061767578125, + "learning_rate": 3.876e-07, + "loss": 0.0006, + "ppl": 0.0057373046875, + "reward": 0.784988522529602, + "reward_std": 0.0009203527006320655, + "rewards/perpo_ocr_edit_distance_reward": 0.7849886417388916, "step": 1124, "temperature": 0.9 }, { - "advantages": -0.00016648854580125771, - "completion_length": 577.0, - "delta_ref_entropy_loss": 0.0423583984375, - "delta_ref_ppl": -0.0447998046875, - "entropy_loss": -0.0294189453125, - "epoch": 0.45, - "grad_norm": 0.8680354665960376, - "k1_kl": 0.044921875, - "k3_kl": 0.02838134765625, - "kimi_kl": 0.083740234375, - "learning_rate": 2.75e-07, - "loss": 0.0013, - "ppl": 0.01564788818359375, - "reward": 0.9978980720043182, - "reward_std": 0.0005506710549525451, - "rewards/perpo_ocr_edit_distance_reward": 0.997898131608963, + "advantages": -2.1100046069477685e-05, + "completion_length": 793.0, + "delta_ref_entropy_loss": 0.0308837890625, + "delta_ref_ppl": -0.053955078125, + "entropy_loss": -0.0341796875, + "epoch": 0.225, + "grad_norm": 0.7529891769082931, + "k1_kl": 0.0537109375, + "k3_kl": 0.037353515625, + "kimi_kl": 0.0986328125, + "learning_rate": 3.875e-07, + "loss": 0.0015, + "ppl": 0.0166015625, + "reward": 0.993841826915741, + "reward_std": 0.0031264915596693754, + "rewards/perpo_ocr_edit_distance_reward": 0.993841826915741, "step": 1125, "temperature": 0.9 }, { - "advantages": -1.2057169442414306e-05, - "completion_length": 513.5, - "delta_ref_entropy_loss": 0.06396484375, - "delta_ref_ppl": -0.0526123046875, - "entropy_loss": -0.0750732421875, - "epoch": 0.4504, - "grad_norm": 0.9944713942989797, - "k1_kl": 0.0526123046875, - "k3_kl": 0.0362548828125, - "kimi_kl": 0.09716796875, - "learning_rate": 2.748e-07, - "loss": 0.0015, - "ppl": 0.04522705078125, - "reward": 0.9739783704280853, - "reward_std": 0.0015025374304968864, - "rewards/perpo_ocr_edit_distance_reward": 0.9739783704280853, + "advantages": -6.905624468345195e-05, + "completion_length": 159.0, + "delta_ref_entropy_loss": 0.10791015625, + "delta_ref_ppl": -0.1728515625, + "entropy_loss": -0.04150390625, + "epoch": 0.2252, + "grad_norm": 1.9652958038741797, + "k1_kl": 0.1728515625, + "k3_kl": 0.115234375, + "kimi_kl": 0.310546875, + "learning_rate": 3.874e-07, + "loss": 0.0047, + "ppl": 0.01513671875, + "reward": 0.9677801132202148, + "reward_std": 0.0008864374249242246, + "rewards/perpo_ocr_edit_distance_reward": 0.9677802324295044, "step": 1126, "temperature": 0.9 }, { - "advantages": -4.278549204173032e-05, - "completion_length": 778.5, - "delta_ref_entropy_loss": 0.0489501953125, - "delta_ref_ppl": -0.06951904296875, - "entropy_loss": -0.0625, - "epoch": 0.4508, - "grad_norm": 0.9801540123658279, - "k1_kl": 0.06939697265625, - "k3_kl": 0.045501708984375, - "kimi_kl": 0.17791748046875, - "learning_rate": 2.746e-07, - "loss": 0.0019, - "ppl": 0.03106689453125, - "reward": 0.9926692545413971, - "reward_std": 0.001648678706260398, - "rewards/perpo_ocr_edit_distance_reward": 0.9926693737506866, + "advantages": -1.7029899268550253e-08, + "completion_length": 1141.0, + "delta_ref_entropy_loss": 0.0595703125, + "delta_ref_ppl": -0.052734375, + "entropy_loss": -0.034423828125, + "epoch": 0.2254, + "grad_norm": 0.9834955093998392, + "k1_kl": 0.052734375, + "k3_kl": 0.03466796875, + "kimi_kl": 0.058837890625, + "learning_rate": 3.873e-07, + "loss": 0.0014, + "ppl": 0.016845703125, + "reward": 0.9945529103279114, + "reward_std": 0.000885635381564498, + "rewards/perpo_ocr_edit_distance_reward": 0.9945529103279114, "step": 1127, "temperature": 0.9 }, { - "advantages": -4.112720489501953e-06, - "completion_length": 610.5, - "delta_ref_entropy_loss": 0.0299072265625, - "delta_ref_ppl": -0.020782470703125, - "entropy_loss": -0.02142333984375, - "epoch": 0.4512, - "grad_norm": 0.2693505952990663, - "k1_kl": 0.020782470703125, - "k3_kl": 0.010528564453125, - "kimi_kl": 0.0250244140625, - "learning_rate": 2.7439999999999997e-07, - "loss": 0.0004, - "ppl": 0.0099334716796875, - "reward": 0.9987381100654602, - "reward_std": 0.00046831791405566037, - "rewards/perpo_ocr_edit_distance_reward": 0.9987381100654602, + "advantages": -0.0002932037750724703, + "completion_length": 617.0, + "delta_ref_entropy_loss": 0.0458984375, + "delta_ref_ppl": -0.04248046875, + "entropy_loss": -0.017333984375, + "epoch": 0.2256, + "grad_norm": 0.23563449625749958, + "k1_kl": 0.042236328125, + "k3_kl": 0.025146484375, + "kimi_kl": 0.068359375, + "learning_rate": 3.8719999999999997e-07, + "loss": 0.0013, + "ppl": 0.005401611328125, + "reward": 0.986994743347168, + "reward_std": 0.0001323264150414616, + "rewards/perpo_ocr_edit_distance_reward": 0.9869948625564575, "step": 1128, "temperature": 0.9 }, { - "advantages": -7.450580596923828e-05, - "completion_length": 216.0, - "delta_ref_entropy_loss": 0.16912841796875, - "delta_ref_ppl": -0.77191162109375, - "entropy_loss": -0.096923828125, - "epoch": 0.4516, - "grad_norm": 0.2553976357067772, - "k1_kl": 0.77581787109375, - "k3_kl": 0.6241912841796875, - "kimi_kl": 2.411712646484375, - "learning_rate": 2.742e-07, - "loss": 0.025, - "ppl": 0.05303955078125, - "reward": 0.5764705911278725, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.5764705985784531, + "advantages": -9.602309000911191e-05, + "completion_length": 1284.0, + "delta_ref_entropy_loss": 0.0615234375, + "delta_ref_ppl": -0.05810546875, + "entropy_loss": -0.03125, + "epoch": 0.2258, + "grad_norm": 0.841905652521651, + "k1_kl": 0.05810546875, + "k3_kl": 0.046875, + "kimi_kl": 0.0771484375, + "learning_rate": 3.8709999999999997e-07, + "loss": 0.002, + "ppl": 0.0167236328125, + "reward": 0.7825562953948975, + "reward_std": 0.0006980302277952433, + "rewards/perpo_ocr_edit_distance_reward": 0.782556414604187, "step": 1129, "temperature": 0.9 }, { - "advantages": -8.514949456639442e-08, - "completion_length": 319.5, - "delta_ref_entropy_loss": 0.0390625, - "delta_ref_ppl": -0.0274658203125, - "entropy_loss": -0.049346923828125, - "epoch": 0.452, - "grad_norm": 1.7564132316487318, - "k1_kl": 0.0274658203125, - "k3_kl": 0.014129638671875, - "kimi_kl": 0.03436279296875, - "learning_rate": 2.74e-07, - "loss": 0.0006, - "ppl": 0.02370452880859375, - "reward": 0.8825235962867737, - "reward_std": 0.1902281641960144, - "rewards/perpo_ocr_edit_distance_reward": 0.8825235962867737, + "advantages": -0.0001214998192153871, + "completion_length": 506.0, + "delta_ref_entropy_loss": 0.0732421875, + "delta_ref_ppl": -0.080078125, + "entropy_loss": -0.020751953125, + "epoch": 0.226, + "grad_norm": 0.38637374017752774, + "k1_kl": 0.080078125, + "k3_kl": 0.04833984375, + "kimi_kl": 0.1376953125, + "learning_rate": 3.87e-07, + "loss": 0.0021, + "ppl": 0.00567626953125, + "reward": 0.9962514042854309, + "reward_std": 0.00018031761283054948, + "rewards/perpo_ocr_edit_distance_reward": 0.9962514042854309, "step": 1130, "temperature": 0.9 }, { - "advantages": -2.156411028408911e-05, - "completion_length": 292.5, - "delta_ref_entropy_loss": 0.048828125, - "delta_ref_ppl": -0.0433349609375, - "entropy_loss": -0.02392578125, - "epoch": 0.4524, - "grad_norm": 0.31966795556930183, - "k1_kl": 0.043212890625, - "k3_kl": 0.02325439453125, - "kimi_kl": 0.05224609375, - "learning_rate": 2.738e-07, - "loss": 0.001, - "ppl": 0.009124755859375, - "reward": 0.9998696148395538, - "reward_std": 0.0003448587958700955, - "rewards/perpo_ocr_edit_distance_reward": 0.9998696744441986, + "advantages": -0.00011437280045356601, + "completion_length": 788.0, + "delta_ref_entropy_loss": 0.0419921875, + "delta_ref_ppl": -0.0400390625, + "entropy_loss": -0.0184326171875, + "epoch": 0.2262, + "grad_norm": 0.36776945272550343, + "k1_kl": 0.0400390625, + "k3_kl": 0.024169921875, + "kimi_kl": 0.068359375, + "learning_rate": 3.869e-07, + "loss": 0.0011, + "ppl": 0.0068359375, + "reward": 0.9965541362762451, + "reward_std": 0.000346731161698699, + "rewards/perpo_ocr_edit_distance_reward": 0.9965542554855347, "step": 1131, "temperature": 0.9 }, { - "advantages": -0.00033075469036703, - "completion_length": 445.0, - "delta_ref_entropy_loss": 0.0455322265625, - "delta_ref_ppl": -0.04168701171875, - "entropy_loss": -0.04364013671875, - "epoch": 0.4528, - "grad_norm": 0.460115603123097, - "k1_kl": 0.0416259765625, - "k3_kl": 0.024505615234375, - "kimi_kl": 0.066650390625, - "learning_rate": 2.736e-07, - "loss": 0.0013, - "ppl": 0.02020263671875, - "reward": 0.834794670343399, - "reward_std": 0.000535472936462611, - "rewards/perpo_ocr_edit_distance_reward": 0.8347947597503662, + "advantages": -2.2615706257056445e-05, + "completion_length": 646.0, + "delta_ref_entropy_loss": 0.0625, + "delta_ref_ppl": -0.0615234375, + "entropy_loss": -0.0294189453125, + "epoch": 0.2264, + "grad_norm": 0.536405141044973, + "k1_kl": 0.0615234375, + "k3_kl": 0.035888671875, + "kimi_kl": 0.08544921875, + "learning_rate": 3.8679999999999994e-07, + "loss": 0.0015, + "ppl": 0.00885009765625, + "reward": 0.9957097768783569, + "reward_std": 0.0006527756922878325, + "rewards/perpo_ocr_edit_distance_reward": 0.9957097172737122, "step": 1132, "temperature": 0.9 }, { - "advantages": -1.0388239060077353e-06, - "completion_length": 597.0, - "delta_ref_entropy_loss": 0.0751953125, - "delta_ref_ppl": -0.067626953125, - "entropy_loss": -0.0849609375, - "epoch": 0.4532, - "grad_norm": 0.9844330253112394, - "k1_kl": 0.067626953125, - "k3_kl": 0.0418701171875, - "kimi_kl": 0.1103515625, - "learning_rate": 2.7339999999999995e-07, - "loss": 0.0017, - "ppl": 0.04266357421875, - "reward": 0.7424896359443665, - "reward_std": 0.004159849908319302, - "rewards/perpo_ocr_edit_distance_reward": 0.7424896359443665, + "advantages": 2.5544848085701233e-06, + "completion_length": 210.0, + "delta_ref_entropy_loss": 0.0888671875, + "delta_ref_ppl": -0.150390625, + "entropy_loss": -0.05029296875, + "epoch": 0.2266, + "grad_norm": 2.4216967002745644, + "k1_kl": 0.150390625, + "k3_kl": 0.11328125, + "kimi_kl": 0.4140625, + "learning_rate": 3.867e-07, + "loss": 0.0045, + "ppl": 0.023681640625, + "reward": 0.7986577153205872, + "reward_std": 0.003212856827303767, + "rewards/perpo_ocr_edit_distance_reward": 0.7986577153205872, "step": 1133, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 419.0, - "delta_ref_entropy_loss": 0.0269775390625, - "delta_ref_ppl": -0.03753662109375, - "entropy_loss": -0.012451171875, - "epoch": 0.4536, - "grad_norm": 0.01322015570760386, - "k1_kl": 0.03759765625, - "k3_kl": 0.02166748046875, - "kimi_kl": 0.0496826171875, - "learning_rate": 2.732e-07, - "loss": 0.0009, - "ppl": 0.00494384765625, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": -1.9448145394562744e-05, + "completion_length": 582.0, + "delta_ref_entropy_loss": 0.10302734375, + "delta_ref_ppl": -0.11181640625, + "entropy_loss": -0.042724609375, + "epoch": 0.2268, + "grad_norm": 3.3280120606043737, + "k1_kl": 0.11181640625, + "k3_kl": 0.06494140625, + "kimi_kl": 0.177734375, + "learning_rate": 3.866e-07, + "loss": 0.0026, + "ppl": 0.019287109375, + "reward": 0.9657583236694336, + "reward_std": 0.0025277237873524427, + "rewards/perpo_ocr_edit_distance_reward": 0.9657583832740784, "step": 1134, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 355.5, - "delta_ref_entropy_loss": 0.0479736328125, - "delta_ref_ppl": -0.045867919921875, - "entropy_loss": -0.05035400390625, - "epoch": 0.454, - "grad_norm": 0.8953660718441705, - "k1_kl": 0.0456390380859375, - "k3_kl": 0.02677154541015625, - "kimi_kl": 0.07064056396484375, - "learning_rate": 2.73e-07, - "loss": 0.0011, - "ppl": 0.023590087890625, - "reward": 0.9958847761154175, - "reward_std": 0.0005600120639428496, - "rewards/perpo_ocr_edit_distance_reward": 0.9958847761154175, + "advantages": -2.741813841566909e-05, + "completion_length": 875.0, + "delta_ref_entropy_loss": 0.05810546875, + "delta_ref_ppl": -0.08056640625, + "entropy_loss": -0.0306396484375, + "epoch": 0.227, + "grad_norm": 1.8062784630743287, + "k1_kl": 0.080078125, + "k3_kl": 0.052490234375, + "kimi_kl": 0.1845703125, + "learning_rate": 3.8649999999999997e-07, + "loss": 0.0021, + "ppl": 0.0146484375, + "reward": 0.9081233739852905, + "reward_std": 0.003940066322684288, + "rewards/perpo_ocr_edit_distance_reward": 0.9081234931945801, "step": 1135, "temperature": 0.9 }, { - "advantages": -2.7469227802612295e-05, - "completion_length": 585.5, - "delta_ref_entropy_loss": 0.0343017578125, - "delta_ref_ppl": -0.029541015625, - "entropy_loss": -0.032958984375, - "epoch": 0.4544, - "grad_norm": 1.058662367820884, - "k1_kl": 0.02960205078125, - "k3_kl": 0.020050048828125, - "kimi_kl": 0.0684814453125, - "learning_rate": 2.7279999999999995e-07, - "loss": 0.0008, - "ppl": 0.01544189453125, - "reward": 0.9924264550209045, - "reward_std": 0.0028877301956526935, - "rewards/perpo_ocr_edit_distance_reward": 0.9924264848232269, + "advantages": -8.97475729288999e-06, + "completion_length": 122.0, + "delta_ref_entropy_loss": 0.043212890625, + "delta_ref_ppl": -0.1953125, + "entropy_loss": -0.04345703125, + "epoch": 0.2272, + "grad_norm": 1.5973288527178824, + "k1_kl": 0.1953125, + "k3_kl": 0.1455078125, + "kimi_kl": 0.59375, + "learning_rate": 3.864e-07, + "loss": 0.0058, + "ppl": 0.020751953125, + "reward": 0.9801409244537354, + "reward_std": 0.0008474478963762522, + "rewards/perpo_ocr_edit_distance_reward": 0.9801409244537354, "step": 1136, "temperature": 0.9 }, { - "advantages": -2.537880754971411e-05, - "completion_length": 473.0, - "delta_ref_entropy_loss": 0.02838134765625, - "delta_ref_ppl": -0.0299072265625, - "entropy_loss": -0.03887939453125, - "epoch": 0.4548, - "grad_norm": 5.512012831035565, - "k1_kl": 0.0299072265625, - "k3_kl": 0.094970703125, - "kimi_kl": 0.0673828125, - "learning_rate": 2.726e-07, - "loss": 0.0038, - "ppl": 0.02593994140625, - "reward": 0.9451557099819183, - "reward_std": 0.017806610907427967, - "rewards/perpo_ocr_edit_distance_reward": 0.9451557993888855, + "advantages": -0.00020834378665313125, + "completion_length": 526.0, + "delta_ref_entropy_loss": 0.03564453125, + "delta_ref_ppl": -0.054931640625, + "entropy_loss": -0.0084228515625, + "epoch": 0.2274, + "grad_norm": 0.38718589148874366, + "k1_kl": 0.054931640625, + "k3_kl": 0.032958984375, + "kimi_kl": 0.0859375, + "learning_rate": 3.8629999999999996e-07, + "loss": 0.0015, + "ppl": 0.001953125, + "reward": 0.9971161484718323, + "reward_std": 0.00018608490063343197, + "rewards/perpo_ocr_edit_distance_reward": 0.997116208076477, "step": 1137, "temperature": 0.9 }, { - "advantages": -0.00015309879381675273, - "completion_length": 259.0, - "delta_ref_entropy_loss": 0.048828125, - "delta_ref_ppl": -0.1253662109375, - "entropy_loss": -0.016265869140625, - "epoch": 0.4552, - "grad_norm": 0.2490129329811376, - "k1_kl": 0.1253662109375, - "k3_kl": 0.099853515625, - "kimi_kl": 0.588134765625, - "learning_rate": 2.724e-07, - "loss": 0.0041, - "ppl": 0.00478363037109375, - "reward": 0.9999663531780243, - "reward_std": 8.90524170245044e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9999663829803467, + "advantages": -4.7998772060964257e-05, + "completion_length": 695.0, + "delta_ref_entropy_loss": 0.04150390625, + "delta_ref_ppl": -0.049560546875, + "entropy_loss": -0.0135498046875, + "epoch": 0.2276, + "grad_norm": 0.36997806121520194, + "k1_kl": 0.049560546875, + "k3_kl": 0.0303955078125, + "kimi_kl": 0.095703125, + "learning_rate": 3.8619999999999995e-07, + "loss": 0.0013, + "ppl": 0.005889892578125, + "reward": 0.9837381839752197, + "reward_std": 0.00043217267375439405, + "rewards/perpo_ocr_edit_distance_reward": 0.9837381839752197, "step": 1138, "temperature": 0.9 }, { - "advantages": -6.811959565311554e-07, - "completion_length": 369.0, - "delta_ref_entropy_loss": 0.03411865234375, - "delta_ref_ppl": -0.049560546875, - "entropy_loss": -0.0729522705078125, - "epoch": 0.4556, - "grad_norm": 1.4921488679722006, - "k1_kl": 0.04931640625, - "k3_kl": 0.036163330078125, - "kimi_kl": 0.1041259765625, - "learning_rate": 2.7219999999999996e-07, - "loss": 0.0014, - "ppl": 0.0357513427734375, - "reward": 0.9056309163570404, - "reward_std": 0.028107423335313797, - "rewards/perpo_ocr_edit_distance_reward": 0.9056309461593628, + "advantages": -2.0308154489612207e-05, + "completion_length": 43.0, + "delta_ref_entropy_loss": 0.06103515625, + "delta_ref_ppl": -0.4140625, + "entropy_loss": -0.0791015625, + "epoch": 0.2278, + "grad_norm": 7.8998141400141675, + "k1_kl": 0.4140625, + "k3_kl": 0.310546875, + "kimi_kl": 1.234375, + "learning_rate": 3.861e-07, + "loss": 0.0125, + "ppl": 0.0303955078125, + "reward": 0.9685713648796082, + "reward_std": 0.003253032686188817, + "rewards/perpo_ocr_edit_distance_reward": 0.9685714840888977, "step": 1139, "temperature": 0.9 }, { - "advantages": -0.00011532009011716582, - "completion_length": 840.5, - "delta_ref_entropy_loss": 0.03729248046875, - "delta_ref_ppl": -0.02703857421875, - "entropy_loss": -0.02935791015625, - "epoch": 0.456, - "grad_norm": 0.38450458733949217, - "k1_kl": 0.027069091796875, - "k3_kl": 0.01409912109375, - "kimi_kl": 0.04290771484375, - "learning_rate": 2.72e-07, - "loss": 0.0007, - "ppl": 0.014434814453125, - "reward": 0.99725142121315, - "reward_std": 0.0004284189344616607, - "rewards/perpo_ocr_edit_distance_reward": 0.9972514808177948, + "advantages": -5.892345143365674e-06, + "completion_length": 491.0, + "delta_ref_entropy_loss": 0.11181640625, + "delta_ref_ppl": -0.08935546875, + "entropy_loss": -0.07666015625, + "epoch": 0.228, + "grad_norm": 1.239347987177975, + "k1_kl": 0.08935546875, + "k3_kl": 0.05615234375, + "kimi_kl": 0.2080078125, + "learning_rate": 3.86e-07, + "loss": 0.0023, + "ppl": 0.04052734375, + "reward": 0.9649603366851807, + "reward_std": 0.0027974462136626244, + "rewards/perpo_ocr_edit_distance_reward": 0.9649603962898254, "step": 1140, "temperature": 0.9 }, { - "advantages": -2.1287374085687816e-09, - "completion_length": 565.5, - "delta_ref_entropy_loss": 0.021881103515625, - "delta_ref_ppl": -0.017791748046875, - "entropy_loss": -0.0125579833984375, - "epoch": 0.4564, - "grad_norm": 0.4303191671122862, - "k1_kl": 0.017791748046875, - "k3_kl": 0.01032257080078125, - "kimi_kl": 0.03116607666015625, - "learning_rate": 2.718e-07, - "loss": 0.0004, - "ppl": 0.005535125732421875, - "reward": 0.999926894903183, - "reward_std": 0.00012479958240874112, - "rewards/perpo_ocr_edit_distance_reward": 0.9999269247055054, + "advantages": 6.624630623264238e-05, + "completion_length": 1032.0, + "delta_ref_entropy_loss": 0.036376953125, + "delta_ref_ppl": -0.035888671875, + "entropy_loss": -0.020751953125, + "epoch": 0.2282, + "grad_norm": 0.34788805259364863, + "k1_kl": 0.03564453125, + "k3_kl": 0.0194091796875, + "kimi_kl": 0.05224609375, + "learning_rate": 3.859e-07, + "loss": 0.0007, + "ppl": 0.00830078125, + "reward": 0.9949263334274292, + "reward_std": 0.00041434820741415024, + "rewards/perpo_ocr_edit_distance_reward": 0.9949263334274292, "step": 1141, "temperature": 0.9 }, { - "advantages": -4.2613066852936754e-05, - "completion_length": 160.0, - "delta_ref_entropy_loss": 0.037933349609375, - "delta_ref_ppl": -0.050048828125, - "entropy_loss": -0.0189208984375, - "epoch": 0.4568, - "grad_norm": 1.2050149744742784, - "k1_kl": 0.050048828125, - "k3_kl": 0.0321044921875, - "kimi_kl": 0.0823974609375, - "learning_rate": 2.7159999999999997e-07, - "loss": 0.0013, - "ppl": 0.009735107421875, - "reward": 0.9906762540340424, - "reward_std": 0.0024185969086829573, - "rewards/perpo_ocr_edit_distance_reward": 0.9906763136386871, + "advantages": -5.824225354444934e-06, + "completion_length": 584.0, + "delta_ref_entropy_loss": 0.1259765625, + "delta_ref_ppl": -0.10693359375, + "entropy_loss": -0.1416015625, + "epoch": 0.2284, + "grad_norm": 2.3678302297749916, + "k1_kl": 0.107421875, + "k3_kl": 0.061279296875, + "kimi_kl": 0.1572265625, + "learning_rate": 3.858e-07, + "loss": 0.0025, + "ppl": 0.07763671875, + "reward": 0.6224442720413208, + "reward_std": 0.004270756617188454, + "rewards/perpo_ocr_edit_distance_reward": 0.6224443316459656, "step": 1142, "temperature": 0.9 }, { - "advantages": -4.257474728319721e-08, - "completion_length": 626.0, - "delta_ref_entropy_loss": 0.05303955078125, - "delta_ref_ppl": -0.06201171875, - "entropy_loss": -0.1109619140625, - "epoch": 0.4572, - "grad_norm": 2.1405817880525415, - "k1_kl": 0.0621337890625, - "k3_kl": 0.041259765625, - "kimi_kl": 0.130126953125, - "learning_rate": 2.7139999999999996e-07, - "loss": 0.0017, - "ppl": 0.0608978271484375, - "reward": 0.79778653383255, - "reward_std": 0.1216098815202713, - "rewards/perpo_ocr_edit_distance_reward": 0.7977865636348724, + "advantages": -1.0149819900107104e-05, + "completion_length": 255.0, + "delta_ref_entropy_loss": 0.1025390625, + "delta_ref_ppl": -0.1748046875, + "entropy_loss": -0.059814453125, + "epoch": 0.2286, + "grad_norm": 1.3930910205898124, + "k1_kl": 0.17578125, + "k3_kl": 0.125, + "kimi_kl": 0.462890625, + "learning_rate": 3.8569999999999997e-07, + "loss": 0.005, + "ppl": 0.0308837890625, + "reward": 0.9784566164016724, + "reward_std": 0.0032592695206403732, + "rewards/perpo_ocr_edit_distance_reward": 0.9784566760063171, "step": 1143, "temperature": 0.9 }, { - "advantages": -3.147125335090095e-05, - "completion_length": 879.0, - "delta_ref_entropy_loss": 0.03076171875, - "delta_ref_ppl": -0.0316162109375, - "entropy_loss": -0.02960205078125, - "epoch": 0.4576, - "grad_norm": 0.477447887971599, - "k1_kl": 0.03173828125, - "k3_kl": 0.0206298828125, - "kimi_kl": 0.066162109375, - "learning_rate": 2.712e-07, - "loss": 0.0009, - "ppl": 0.014892578125, - "reward": 0.978567361831665, - "reward_std": 0.0015916048432700336, - "rewards/perpo_ocr_edit_distance_reward": 0.9785674512386322, + "advantages": -5.27926886206842e-07, + "completion_length": 1698.0, + "delta_ref_entropy_loss": 0.04150390625, + "delta_ref_ppl": -0.03466796875, + "entropy_loss": -0.041748046875, + "epoch": 0.2288, + "grad_norm": 6.066625431482784, + "k1_kl": 0.03466796875, + "k3_kl": 0.0361328125, + "kimi_kl": 0.06201171875, + "learning_rate": 3.8559999999999996e-07, + "loss": 0.0014, + "ppl": 0.0234375, + "reward": 0.9769504070281982, + "reward_std": 0.01597083918750286, + "rewards/perpo_ocr_edit_distance_reward": 0.976950466632843, "step": 1144, "temperature": 0.9 }, { - "advantages": -4.8620360757922754e-05, - "completion_length": 538.5, - "delta_ref_entropy_loss": 0.0201416015625, - "delta_ref_ppl": -0.015838623046875, - "entropy_loss": -0.014556884765625, - "epoch": 0.458, - "grad_norm": 0.16359795439286032, - "k1_kl": 0.015777587890625, - "k3_kl": 0.0094146728515625, - "kimi_kl": 0.02593994140625, - "learning_rate": 2.7100000000000003e-07, - "loss": 0.0004, - "ppl": 0.0064544677734375, - "reward": 0.9996151626110077, - "reward_std": 0.00034410087391734123, - "rewards/perpo_ocr_edit_distance_reward": 0.9996152222156525, + "advantages": -6.410479545593262e-05, + "completion_length": 406.0, + "delta_ref_entropy_loss": 0.10205078125, + "delta_ref_ppl": -0.11572265625, + "entropy_loss": -0.0224609375, + "epoch": 0.229, + "grad_norm": 1.1641382076868318, + "k1_kl": 0.11572265625, + "k3_kl": 0.0693359375, + "kimi_kl": 0.1962890625, + "learning_rate": 3.855e-07, + "loss": 0.0028, + "ppl": 0.00897216796875, + "reward": 0.9958384037017822, + "reward_std": 0.0009626062237657607, + "rewards/perpo_ocr_edit_distance_reward": 0.9958384037017822, "step": 1145, "temperature": 0.9 }, { - "advantages": -9.634665616431448e-05, - "completion_length": 513.5, - "delta_ref_entropy_loss": 0.025390625, - "delta_ref_ppl": -0.03204345703125, - "entropy_loss": -0.017974853515625, - "epoch": 0.4584, - "grad_norm": 0.4562428843128516, - "k1_kl": 0.03216552734375, - "k3_kl": 0.02264404296875, - "kimi_kl": 0.09527587890625, - "learning_rate": 2.7079999999999996e-07, - "loss": 0.001, - "ppl": 0.0067138671875, - "reward": 0.9512292742729187, - "reward_std": 0.007636574489879422, - "rewards/perpo_ocr_edit_distance_reward": 0.9512293338775635, + "advantages": -3.8283214962575585e-05, + "completion_length": 574.0, + "delta_ref_entropy_loss": 0.095703125, + "delta_ref_ppl": -0.0966796875, + "entropy_loss": -0.04248046875, + "epoch": 0.2292, + "grad_norm": 1.1387381584617273, + "k1_kl": 0.0966796875, + "k3_kl": 0.06298828125, + "kimi_kl": 0.232421875, + "learning_rate": 3.854e-07, + "loss": 0.0026, + "ppl": 0.01904296875, + "reward": 0.6675841808319092, + "reward_std": 0.001901598065160215, + "rewards/perpo_ocr_edit_distance_reward": 0.6675841808319092, "step": 1146, "temperature": 0.9 }, { - "advantages": -1.2346677209507106e-05, - "completion_length": 687.5, - "delta_ref_entropy_loss": 0.1484375, - "delta_ref_ppl": -0.0859375, - "entropy_loss": -0.197265625, - "epoch": 0.4588, - "grad_norm": 1.6169128799418147, - "k1_kl": 0.0859375, - "k3_kl": 0.0430908203125, - "kimi_kl": 0.10302734375, - "learning_rate": 2.706e-07, - "loss": 0.0017, - "ppl": 0.110595703125, - "reward": 0.8458763062953949, - "reward_std": 0.017795626306906343, - "rewards/perpo_ocr_edit_distance_reward": 0.8458763957023621, + "advantages": -2.346720066270791e-05, + "completion_length": 1035.0, + "delta_ref_entropy_loss": 0.04248046875, + "delta_ref_ppl": -0.03466796875, + "entropy_loss": -0.01068115234375, + "epoch": 0.2294, + "grad_norm": 0.3065144705279431, + "k1_kl": 0.03466796875, + "k3_kl": 0.0224609375, + "kimi_kl": 0.0869140625, + "learning_rate": 3.8529999999999994e-07, + "loss": 0.0009, + "ppl": 0.0035552978515625, + "reward": 0.8597784042358398, + "reward_std": 0.0024386735167354345, + "rewards/perpo_ocr_edit_distance_reward": 0.8597784638404846, "step": 1147, "temperature": 0.9 }, { - "advantages": -3.0057772164582275e-05, - "completion_length": 603.0, - "delta_ref_entropy_loss": 0.0263671875, - "delta_ref_ppl": -0.01129150390625, - "entropy_loss": -0.016754150390625, - "epoch": 0.4592, - "grad_norm": 0.20493425911148774, - "k1_kl": 0.01129150390625, - "k3_kl": 0.00434112548828125, - "kimi_kl": 0.007354736328125, - "learning_rate": 2.704e-07, - "loss": 0.0002, - "ppl": 0.0077056884765625, - "reward": 0.9976320862770081, - "reward_std": 0.0004459170449990779, - "rewards/perpo_ocr_edit_distance_reward": 0.9976320862770081, + "advantages": -8.106231689453125e-05, + "completion_length": 730.0, + "delta_ref_entropy_loss": 0.045166015625, + "delta_ref_ppl": -0.037109375, + "entropy_loss": -0.031982421875, + "epoch": 0.2296, + "grad_norm": 0.445036031673499, + "k1_kl": 0.037109375, + "k3_kl": 0.0201416015625, + "kimi_kl": 0.052734375, + "learning_rate": 3.852e-07, + "loss": 0.0009, + "ppl": 0.01251220703125, + "reward": 0.9928121566772461, + "reward_std": 0.000635325035545975, + "rewards/perpo_ocr_edit_distance_reward": 0.9928122758865356, "step": 1148, "temperature": 0.9 }, { - "advantages": -5.108969673983665e-07, - "completion_length": 571.5, - "delta_ref_entropy_loss": 0.05511474609375, - "delta_ref_ppl": -0.21826171875, - "entropy_loss": -0.14599609375, - "epoch": 0.4596, - "grad_norm": 4.853342550083219, - "k1_kl": 0.2181396484375, - "k3_kl": 0.1676025390625, - "kimi_kl": 0.645263671875, - "learning_rate": 2.7019999999999997e-07, - "loss": 0.0067, - "ppl": 0.08056640625, - "reward": 0.3950134441256523, - "reward_std": 0.026387169491499662, - "rewards/perpo_ocr_edit_distance_reward": 0.3950134739279747, + "advantages": 2.55448497910038e-08, + "completion_length": 55.0, + "delta_ref_entropy_loss": 0.11376953125, + "delta_ref_ppl": -0.5703125, + "entropy_loss": -0.3203125, + "epoch": 0.2298, + "grad_norm": 7.598558841409085, + "k1_kl": 0.57421875, + "k3_kl": 0.40234375, + "kimi_kl": 1.609375, + "learning_rate": 3.851e-07, + "loss": 0.0161, + "ppl": 0.0869140625, + "reward": 0.3262786269187927, + "reward_std": 0.027164055034518242, + "rewards/perpo_ocr_edit_distance_reward": 0.3262786567211151, "step": 1149, "temperature": 0.9 }, { - "advantages": -0.00020389685855093376, - "completion_length": 374.5, - "delta_ref_entropy_loss": 0.03143310546875, - "delta_ref_ppl": -0.03383636474609375, - "entropy_loss": -0.06610107421875, - "epoch": 0.46, - "grad_norm": 1.9737524959859105, - "k1_kl": 0.03385162353515625, - "k3_kl": 0.024202346801757812, - "kimi_kl": 0.056682586669921875, - "learning_rate": 2.7e-07, - "loss": 0.0012, - "ppl": 0.03466796875, - "reward": 0.6649200022220612, - "reward_std": 0.024268849396321457, - "rewards/perpo_ocr_edit_distance_reward": 0.6649200767278671, + "advantages": -8.544752199668437e-05, + "completion_length": 670.0, + "delta_ref_entropy_loss": 0.035400390625, + "delta_ref_ppl": -0.042236328125, + "entropy_loss": -0.0216064453125, + "epoch": 0.23, + "grad_norm": 0.6141188066033408, + "k1_kl": 0.042236328125, + "k3_kl": 0.025390625, + "kimi_kl": 0.07373046875, + "learning_rate": 3.8499999999999997e-07, + "loss": 0.0011, + "ppl": 0.00897216796875, + "reward": 0.9987204074859619, + "reward_std": 0.0005974302184768021, + "rewards/perpo_ocr_edit_distance_reward": 0.9987205266952515, "step": 1150, "temperature": 0.9 }, { - "advantages": -0.0002106981773977168, - "completion_length": 465.5, - "delta_ref_entropy_loss": 0.029541015625, - "delta_ref_ppl": -0.02716064453125, - "entropy_loss": -0.023681640625, - "epoch": 0.4604, - "grad_norm": 1.0309752327276656, - "k1_kl": 0.02716064453125, - "k3_kl": 0.0155029296875, - "kimi_kl": 0.0369873046875, - "learning_rate": 2.698e-07, - "loss": 0.0008, - "ppl": 0.01226806640625, - "reward": 0.9982469081878662, - "reward_std": 0.0007123248287825845, - "rewards/perpo_ocr_edit_distance_reward": 0.9982469975948334, + "advantages": -4.087175966560608e-07, + "completion_length": 1426.0, + "delta_ref_entropy_loss": 0.054931640625, + "delta_ref_ppl": -0.044921875, + "entropy_loss": -0.08349609375, + "epoch": 0.2302, + "grad_norm": 3.3198351621007998, + "k1_kl": 0.044921875, + "k3_kl": 0.0439453125, + "kimi_kl": 0.05126953125, + "learning_rate": 3.849e-07, + "loss": 0.0018, + "ppl": 0.0458984375, + "reward": 0.8917400240898132, + "reward_std": 0.2359282225370407, + "rewards/perpo_ocr_edit_distance_reward": 0.8917401432991028, "step": 1151, "temperature": 0.9 }, { - "advantages": -5.132385922479443e-06, - "completion_length": 559.5, - "delta_ref_entropy_loss": 0.027099609375, - "delta_ref_ppl": -0.01568603515625, - "entropy_loss": -0.02392578125, - "epoch": 0.4608, - "grad_norm": 0.556120639478596, - "k1_kl": 0.01568603515625, - "k3_kl": 0.00838470458984375, - "kimi_kl": 0.013336181640625, - "learning_rate": 2.696e-07, - "loss": 0.0003, - "ppl": 0.0121917724609375, - "reward": 0.9964958131313324, - "reward_std": 0.0011970058549195528, - "rewards/perpo_ocr_edit_distance_reward": 0.9964958131313324, + "advantages": -5.630084706353955e-05, + "completion_length": 811.0, + "delta_ref_entropy_loss": 0.0673828125, + "delta_ref_ppl": -0.0517578125, + "entropy_loss": -0.0130615234375, + "epoch": 0.2304, + "grad_norm": 0.4131737918203671, + "k1_kl": 0.0517578125, + "k3_kl": 0.0220947265625, + "kimi_kl": 0.044189453125, + "learning_rate": 3.8479999999999995e-07, + "loss": 0.0009, + "ppl": 0.004974365234375, + "reward": 0.9939929246902466, + "reward_std": 0.0003537604643497616, + "rewards/perpo_ocr_edit_distance_reward": 0.9939929842948914, "step": 1152, "temperature": 0.9 }, { - "advantages": -2.55448497910038e-08, - "completion_length": 513.0, - "delta_ref_entropy_loss": 0.077392578125, - "delta_ref_ppl": -0.052978515625, - "entropy_loss": -0.0516357421875, - "epoch": 0.4612, - "grad_norm": 14.861182278298202, - "k1_kl": 0.0531005859375, - "k3_kl": 0.05194091796875, - "kimi_kl": 0.13720703125, - "learning_rate": 2.6939999999999996e-07, - "loss": 0.0021, - "ppl": 0.028564453125, - "reward": 0.5373661667108536, - "reward_std": 0.22706104069948196, - "rewards/perpo_ocr_edit_distance_reward": 0.5373661816120148, + "advantages": -1.5037400771689136e-05, + "completion_length": 199.0, + "delta_ref_entropy_loss": 0.1494140625, + "delta_ref_ppl": -0.1787109375, + "entropy_loss": -0.03271484375, + "epoch": 0.2306, + "grad_norm": 3.059883404023978, + "k1_kl": 0.1787109375, + "k3_kl": 0.12255859375, + "kimi_kl": 0.5, + "learning_rate": 3.8469999999999994e-07, + "loss": 0.0049, + "ppl": 0.01470947265625, + "reward": 0.2373218536376953, + "reward_std": 0.00131542282178998, + "rewards/perpo_ocr_edit_distance_reward": 0.2373218685388565, "step": 1153, "temperature": 0.9 }, { "advantages": 0.0, - "completion_length": 289.0, - "delta_ref_entropy_loss": 0.04022216796875, - "delta_ref_ppl": -0.05029296875, - "entropy_loss": -0.0218505859375, - "epoch": 0.4616, - "grad_norm": 0.036456932670295185, - "k1_kl": 0.0504150390625, - "k3_kl": 0.03765869140625, - "kimi_kl": 0.143310546875, - "learning_rate": 2.692e-07, - "loss": 0.0015, - "ppl": 0.0101165771484375, - "reward": 1.0, + "completion_length": 115.0, + "delta_ref_entropy_loss": 0.07080078125, + "delta_ref_ppl": -0.212890625, + "entropy_loss": -0.0291748046875, + "epoch": 0.2308, + "grad_norm": 0.036136065110183524, + "k1_kl": 0.212890625, + "k3_kl": 0.15625, + "kimi_kl": 0.5703125, + "learning_rate": 3.846e-07, + "loss": 0.0063, + "ppl": 0.005096435546875, + "reward": 0.9587156176567078, "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9587156176567078, "step": 1154, "temperature": 0.9 }, { - "advantages": -0.0003057973726754426, - "completion_length": 488.0, - "delta_ref_entropy_loss": 0.02850341796875, - "delta_ref_ppl": -0.0345458984375, - "entropy_loss": -0.016326904296875, - "epoch": 0.462, - "grad_norm": 0.25557044582948146, - "k1_kl": 0.03460693359375, - "k3_kl": 0.023681640625, - "kimi_kl": 0.07330322265625, - "learning_rate": 2.69e-07, - "loss": 0.0013, - "ppl": 0.006317138671875, - "reward": 0.9993613362312317, - "reward_std": 0.0004981511738151312, - "rewards/perpo_ocr_edit_distance_reward": 0.9993613958358765, + "advantages": -1.1444092706369702e-05, + "completion_length": 1767.0, + "delta_ref_entropy_loss": 0.0289306640625, + "delta_ref_ppl": -0.0274658203125, + "entropy_loss": -0.048828125, + "epoch": 0.231, + "grad_norm": 3492.8165836611906, + "k1_kl": 0.0274658203125, + "k3_kl": 0.69921875, + "kimi_kl": 0.05126953125, + "learning_rate": 3.845e-07, + "loss": 0.0279, + "ppl": 0.029296875, + "reward": 0.9447411894798279, + "reward_std": 0.008842220529913902, + "rewards/perpo_ocr_edit_distance_reward": 0.9447412490844727, "step": 1155, "temperature": 0.9 }, { - "advantages": -8.821487881505163e-06, - "completion_length": 669.0, - "delta_ref_entropy_loss": 0.064697265625, - "delta_ref_ppl": -0.053466796875, - "entropy_loss": -0.07891845703125, - "epoch": 0.4624, - "grad_norm": 2.318040084434739, - "k1_kl": 0.0533447265625, - "k3_kl": 0.03271484375, - "kimi_kl": 0.075439453125, - "learning_rate": 2.6879999999999997e-07, - "loss": 0.0013, - "ppl": 0.043243408203125, - "reward": 0.9327494502067566, - "reward_std": 0.0049245195696130395, - "rewards/perpo_ocr_edit_distance_reward": 0.9327495396137238, + "advantages": -0.00011658669245662168, + "completion_length": 431.0, + "delta_ref_entropy_loss": 0.07763671875, + "delta_ref_ppl": -0.0654296875, + "entropy_loss": -0.0228271484375, + "epoch": 0.2312, + "grad_norm": 0.5221345637064387, + "k1_kl": 0.0654296875, + "k3_kl": 0.037109375, + "kimi_kl": 0.0908203125, + "learning_rate": 3.8440000000000003e-07, + "loss": 0.0016, + "ppl": 0.006256103515625, + "reward": 0.9963468909263611, + "reward_std": 0.00041127417352981865, + "rewards/perpo_ocr_edit_distance_reward": 0.9963470101356506, "step": 1156, "temperature": 0.9 }, { - "advantages": -6.99162483215332e-05, - "completion_length": 346.5, - "delta_ref_entropy_loss": 0.02679443359375, - "delta_ref_ppl": -0.02618408203125, - "entropy_loss": -0.02947998046875, - "epoch": 0.4628, - "grad_norm": 0.5508080728033953, - "k1_kl": 0.02618408203125, - "k3_kl": 0.012237548828125, - "kimi_kl": 0.0257568359375, - "learning_rate": 2.686e-07, - "loss": 0.0006, - "ppl": 0.013885498046875, - "reward": 0.997298002243042, - "reward_std": 7.180095417425036e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9972980618476868, + "advantages": -3.154788828396704e-06, + "completion_length": 395.0, + "delta_ref_entropy_loss": 0.1337890625, + "delta_ref_ppl": -0.1201171875, + "entropy_loss": -0.06103515625, + "epoch": 0.2314, + "grad_norm": 1.4724169489689165, + "k1_kl": 0.1201171875, + "k3_kl": 0.0693359375, + "kimi_kl": 0.193359375, + "learning_rate": 3.8429999999999997e-07, + "loss": 0.0028, + "ppl": 0.027099609375, + "reward": 0.8947169780731201, + "reward_std": 0.007971587590873241, + "rewards/perpo_ocr_edit_distance_reward": 0.8947170376777649, "step": 1157, "temperature": 0.9 }, { - "advantages": -6.125228992459597e-05, - "completion_length": 1152.5, - "delta_ref_entropy_loss": 0.02020263671875, - "delta_ref_ppl": -0.01824951171875, - "entropy_loss": -0.021209716796875, - "epoch": 0.4632, - "grad_norm": 0.7374265270341194, - "k1_kl": 0.01824951171875, - "k3_kl": 0.01222991943359375, - "kimi_kl": 0.0295867919921875, - "learning_rate": 2.684e-07, - "loss": 0.0006, - "ppl": 0.0111083984375, - "reward": 0.9979400932788849, - "reward_std": 0.0009092901309486479, - "rewards/perpo_ocr_edit_distance_reward": 0.9979402422904968, + "advantages": -6.590571047127014e-06, + "completion_length": 564.0, + "delta_ref_entropy_loss": 0.1630859375, + "delta_ref_ppl": -0.103515625, + "entropy_loss": -0.1337890625, + "epoch": 0.2316, + "grad_norm": 2.4216755341199128, + "k1_kl": 0.10400390625, + "k3_kl": 0.0654296875, + "kimi_kl": 0.1806640625, + "learning_rate": 3.8419999999999996e-07, + "loss": 0.0026, + "ppl": 0.078125, + "reward": 0.6500230431556702, + "reward_std": 0.011552856303751469, + "rewards/perpo_ocr_edit_distance_reward": 0.6500231027603149, "step": 1158, "temperature": 0.9 }, { - "advantages": -4.4362886910676025e-05, - "completion_length": 261.5, - "delta_ref_entropy_loss": 0.0418701171875, - "delta_ref_ppl": -0.0975341796875, - "entropy_loss": -0.031280517578125, - "epoch": 0.4636, - "grad_norm": 2.526241052363613, - "k1_kl": 0.0975341796875, - "k3_kl": 0.075531005859375, - "kimi_kl": 0.2415771484375, - "learning_rate": 2.682e-07, - "loss": 0.0031, - "ppl": 0.0164947509765625, - "reward": 0.9993632733821869, - "reward_std": 0.001182843028800562, - "rewards/perpo_ocr_edit_distance_reward": 0.9993633031845093, + "advantages": 2.7409623726271093e-05, + "completion_length": 456.0, + "delta_ref_entropy_loss": 0.11279296875, + "delta_ref_ppl": -0.10009765625, + "entropy_loss": -0.072265625, + "epoch": 0.2318, + "grad_norm": 1.2373892864579321, + "k1_kl": 0.1005859375, + "k3_kl": 0.060302734375, + "kimi_kl": 0.150390625, + "learning_rate": 3.841e-07, + "loss": 0.0024, + "ppl": 0.035400390625, + "reward": 0.7604823112487793, + "reward_std": 0.0011428477009758353, + "rewards/perpo_ocr_edit_distance_reward": 0.7604822516441345, "step": 1159, "temperature": 0.9 }, { - "advantages": -2.8014184863422997e-05, - "completion_length": 436.5, - "delta_ref_entropy_loss": 0.055419921875, - "delta_ref_ppl": -0.041015625, - "entropy_loss": -0.0380859375, - "epoch": 0.464, - "grad_norm": 0.7162722248973792, - "k1_kl": 0.0411376953125, - "k3_kl": 0.02374267578125, - "kimi_kl": 0.066162109375, - "learning_rate": 2.68e-07, - "loss": 0.001, - "ppl": 0.0162353515625, - "reward": 0.9923892319202423, - "reward_std": 0.0011270969698671252, - "rewards/perpo_ocr_edit_distance_reward": 0.9923892617225647, + "advantages": -8.123261977743823e-06, + "completion_length": 455.0, + "delta_ref_entropy_loss": 0.0712890625, + "delta_ref_ppl": -0.07373046875, + "entropy_loss": -0.03662109375, + "epoch": 0.232, + "grad_norm": 1.0613009055678144, + "k1_kl": 0.07373046875, + "k3_kl": 0.045166015625, + "kimi_kl": 0.12353515625, + "learning_rate": 3.84e-07, + "loss": 0.0018, + "ppl": 0.01287841796875, + "reward": 0.9414561986923218, + "reward_std": 0.008260825648903847, + "rewards/perpo_ocr_edit_distance_reward": 0.9414563179016113, "step": 1160, "temperature": 0.9 }, { - "advantages": -3.569892624000204e-05, - "completion_length": 621.5, - "delta_ref_entropy_loss": 0.04852294921875, - "delta_ref_ppl": -0.0357666015625, - "entropy_loss": -0.0628662109375, - "epoch": 0.4644, - "grad_norm": 0.786459140092416, - "k1_kl": 0.03570556640625, - "k3_kl": 0.020111083984375, - "kimi_kl": 0.0482177734375, - "learning_rate": 2.6779999999999995e-07, - "loss": 0.0008, - "ppl": 0.035369873046875, - "reward": 0.9748233258724213, - "reward_std": 0.001039416150888428, - "rewards/perpo_ocr_edit_distance_reward": 0.9748233556747437, + "advantages": -2.0980836779926904e-05, + "completion_length": 848.0, + "delta_ref_entropy_loss": 0.051513671875, + "delta_ref_ppl": -0.038330078125, + "entropy_loss": -0.0281982421875, + "epoch": 0.2322, + "grad_norm": 0.6848007948063211, + "k1_kl": 0.038330078125, + "k3_kl": 0.0181884765625, + "kimi_kl": 0.038330078125, + "learning_rate": 3.839e-07, + "loss": 0.0007, + "ppl": 0.011474609375, + "reward": 0.9827477335929871, + "reward_std": 0.0027396054938435555, + "rewards/perpo_ocr_edit_distance_reward": 0.9827477931976318, "step": 1161, "temperature": 0.9 }, { - "advantages": 2.660921836650232e-06, - "completion_length": 490.5, - "delta_ref_entropy_loss": 0.0263671875, - "delta_ref_ppl": -0.026123046875, - "entropy_loss": -0.025390625, - "epoch": 0.4648, - "grad_norm": 0.6024814880792698, - "k1_kl": 0.026123046875, - "k3_kl": 0.01519775390625, - "kimi_kl": 0.03271484375, - "learning_rate": 2.676e-07, - "loss": 0.0006, - "ppl": 0.01220703125, - "reward": 0.9974384307861328, - "reward_std": 0.0010655124788172543, - "rewards/perpo_ocr_edit_distance_reward": 0.9974384605884552, + "advantages": -6.588640826521441e-05, + "completion_length": 353.0, + "delta_ref_entropy_loss": 0.1220703125, + "delta_ref_ppl": -0.1767578125, + "entropy_loss": -0.060302734375, + "epoch": 0.2324, + "grad_norm": 1.075792767425318, + "k1_kl": 0.1767578125, + "k3_kl": 0.11572265625, + "kimi_kl": 0.4296875, + "learning_rate": 3.838e-07, + "loss": 0.0047, + "ppl": 0.034423828125, + "reward": 0.6861978769302368, + "reward_std": 0.001063151634298265, + "rewards/perpo_ocr_edit_distance_reward": 0.6861979961395264, "step": 1162, "temperature": 0.9 }, { - "advantages": -4.918234844808467e-05, - "completion_length": 850.5, - "delta_ref_entropy_loss": 0.04803466796875, - "delta_ref_ppl": -0.04461669921875, - "entropy_loss": -0.03021240234375, - "epoch": 0.4652, - "grad_norm": 0.46699490142157674, - "k1_kl": 0.04461669921875, - "k3_kl": 0.02825927734375, - "kimi_kl": 0.106201171875, - "learning_rate": 2.674e-07, - "loss": 0.0012, - "ppl": 0.01361083984375, - "reward": 0.9983169734477997, - "reward_std": 0.0005967705219518393, - "rewards/perpo_ocr_edit_distance_reward": 0.9983170330524445, + "advantages": -8.514949634275126e-09, + "completion_length": 1119.0, + "delta_ref_entropy_loss": 0.08349609375, + "delta_ref_ppl": -0.09033203125, + "entropy_loss": -0.0927734375, + "epoch": 0.2326, + "grad_norm": 1.377667377068054, + "k1_kl": 0.09033203125, + "k3_kl": 0.056884765625, + "kimi_kl": 0.1953125, + "learning_rate": 3.837e-07, + "loss": 0.0023, + "ppl": 0.050048828125, + "reward": 0.7949165105819702, + "reward_std": 0.026682347059249878, + "rewards/perpo_ocr_edit_distance_reward": 0.7949165105819702, "step": 1163, "temperature": 0.9 }, { - "advantages": -9.074381978280144e-05, - "completion_length": 438.5, - "delta_ref_entropy_loss": 0.0556640625, - "delta_ref_ppl": -0.050506591796875, - "entropy_loss": -0.07989501953125, - "epoch": 0.4656, - "grad_norm": 0.8311875035686811, - "k1_kl": 0.050262451171875, - "k3_kl": 0.0308990478515625, - "kimi_kl": 0.118133544921875, - "learning_rate": 2.6719999999999996e-07, - "loss": 0.0013, - "ppl": 0.0460205078125, - "reward": 0.9448866844177246, - "reward_std": 0.0020753414428327233, - "rewards/perpo_ocr_edit_distance_reward": 0.9448867738246918, + "advantages": -3.2356808787881164e-07, + "completion_length": 350.0, + "delta_ref_entropy_loss": 0.1376953125, + "delta_ref_ppl": -0.11474609375, + "entropy_loss": -0.10498046875, + "epoch": 0.2328, + "grad_norm": 1.7488108441300307, + "k1_kl": 0.11474609375, + "k3_kl": 0.06787109375, + "kimi_kl": 0.1884765625, + "learning_rate": 3.8359999999999997e-07, + "loss": 0.0027, + "ppl": 0.050048828125, + "reward": 0.6784108877182007, + "reward_std": 0.07903177291154861, + "rewards/perpo_ocr_edit_distance_reward": 0.6784108877182007, "step": 1164, "temperature": 0.9 }, { - "advantages": -4.5299530029296875e-06, - "completion_length": 311.0, - "delta_ref_entropy_loss": 0.08624267578125, - "delta_ref_ppl": -0.11273193359375, - "entropy_loss": -0.073974609375, - "epoch": 0.466, - "grad_norm": 1.5199734989641944, - "k1_kl": 0.11273193359375, - "k3_kl": 0.0738525390625, - "kimi_kl": 0.24945068359375, - "learning_rate": 2.67e-07, - "loss": 0.003, - "ppl": 0.03619384765625, - "reward": 0.8721500635147095, - "reward_std": 0.001830969238653779, - "rewards/perpo_ocr_edit_distance_reward": 0.8721500933170319, + "advantages": -1.5326910215662792e-05, + "completion_length": 491.0, + "delta_ref_entropy_loss": 0.11962890625, + "delta_ref_ppl": -0.0791015625, + "entropy_loss": -0.0751953125, + "epoch": 0.233, + "grad_norm": 1.1269812804259214, + "k1_kl": 0.0791015625, + "k3_kl": 0.04052734375, + "kimi_kl": 0.10107421875, + "learning_rate": 3.835e-07, + "loss": 0.0016, + "ppl": 0.03466796875, + "reward": 0.9613306522369385, + "reward_std": 0.001009743195027113, + "rewards/perpo_ocr_edit_distance_reward": 0.9613305926322937, "step": 1165, "temperature": 0.9 }, { - "advantages": -1.6561577467655297e-05, - "completion_length": 595.5, - "delta_ref_entropy_loss": 0.03192138671875, - "delta_ref_ppl": -0.064208984375, - "entropy_loss": -0.02880859375, - "epoch": 0.4664, - "grad_norm": 11.743826109904413, - "k1_kl": 0.064453125, - "k3_kl": 0.05419921875, - "kimi_kl": 0.188232421875, - "learning_rate": 2.668e-07, - "loss": 0.0022, - "ppl": 0.015869140625, - "reward": 0.9975068867206573, - "reward_std": 0.0024191323900595307, - "rewards/perpo_ocr_edit_distance_reward": 0.9975068867206573, + "advantages": -3.552436828613281e-05, + "completion_length": 872.0, + "delta_ref_entropy_loss": 0.10205078125, + "delta_ref_ppl": -0.078125, + "entropy_loss": -0.062255859375, + "epoch": 0.2332, + "grad_norm": 0.8242212833665149, + "k1_kl": 0.078125, + "k3_kl": 0.043212890625, + "kimi_kl": 0.11962890625, + "learning_rate": 3.834e-07, + "loss": 0.0018, + "ppl": 0.0341796875, + "reward": 0.942253589630127, + "reward_std": 0.0018183104693889618, + "rewards/perpo_ocr_edit_distance_reward": 0.942253589630127, "step": 1166, "temperature": 0.9 }, { - "advantages": -0.00015731369785498828, - "completion_length": 533.5, - "delta_ref_entropy_loss": 0.02557373046875, - "delta_ref_ppl": -0.0284423828125, - "entropy_loss": -0.01849365234375, - "epoch": 0.4668, - "grad_norm": 0.3618533895543311, - "k1_kl": 0.02838134765625, - "k3_kl": 0.0184326171875, - "kimi_kl": 0.0513916015625, - "learning_rate": 2.6659999999999997e-07, - "loss": 0.0009, - "ppl": 0.010009765625, - "reward": 0.9998279213905334, - "reward_std": 0.00034039103775285184, - "rewards/perpo_ocr_edit_distance_reward": 0.999828040599823, + "advantages": -0.0002645254135131836, + "completion_length": 709.0, + "delta_ref_entropy_loss": 0.059814453125, + "delta_ref_ppl": -0.041748046875, + "entropy_loss": -0.008056640625, + "epoch": 0.2334, + "grad_norm": 0.8360220631400651, + "k1_kl": 0.041748046875, + "k3_kl": 0.01904296875, + "kimi_kl": 0.037841796875, + "learning_rate": 3.8329999999999994e-07, + "loss": 0.001, + "ppl": 0.0024871826171875, + "reward": 0.9986526966094971, + "reward_std": 0.00015751595492474735, + "rewards/perpo_ocr_edit_distance_reward": 0.9986527562141418, "step": 1167, "temperature": 0.9 }, { - "advantages": -0.00010324802343575357, - "completion_length": 491.0, - "delta_ref_entropy_loss": 0.080078125, - "delta_ref_ppl": -0.06536865234375, - "entropy_loss": -0.148956298828125, - "epoch": 0.4672, - "grad_norm": 2.2147597861077664, - "k1_kl": 0.06512451171875, - "k3_kl": 0.041015625, - "kimi_kl": 0.141357421875, - "learning_rate": 2.664e-07, - "loss": 0.0017, - "ppl": 0.083526611328125, - "reward": 0.8877607882022858, - "reward_std": 0.014204302511643618, - "rewards/perpo_ocr_edit_distance_reward": 0.8877608478069305, + "advantages": -9.877342108666198e-07, + "completion_length": 809.0, + "delta_ref_entropy_loss": 0.08837890625, + "delta_ref_ppl": -0.0947265625, + "entropy_loss": -0.07177734375, + "epoch": 0.2336, + "grad_norm": 3.5400135721007437, + "k1_kl": 0.0947265625, + "k3_kl": 0.06298828125, + "kimi_kl": 0.1640625, + "learning_rate": 3.832e-07, + "loss": 0.0025, + "ppl": 0.0322265625, + "reward": 0.8455192446708679, + "reward_std": 0.05093811824917793, + "rewards/perpo_ocr_edit_distance_reward": 0.8455193638801575, "step": 1168, "temperature": 0.9 }, { - "advantages": -4.2085139284608886e-05, - "completion_length": 619.5, - "delta_ref_entropy_loss": 0.11767578125, - "delta_ref_ppl": -0.078125, - "entropy_loss": -0.1717529296875, - "epoch": 0.4676, - "grad_norm": 1.4506584584192739, - "k1_kl": 0.07763671875, - "k3_kl": 0.041748046875, - "kimi_kl": 0.0849609375, - "learning_rate": 2.662e-07, - "loss": 0.0017, - "ppl": 0.088134765625, - "reward": 0.8860189616680145, - "reward_std": 0.0027623003406915814, - "rewards/perpo_ocr_edit_distance_reward": 0.8860190212726593, + "advantages": 1.8732889657258056e-06, + "completion_length": 657.0, + "delta_ref_entropy_loss": 0.2099609375, + "delta_ref_ppl": -0.119140625, + "entropy_loss": -0.26953125, + "epoch": 0.2338, + "grad_norm": 2.5372696949669953, + "k1_kl": 0.11865234375, + "k3_kl": 0.05908203125, + "kimi_kl": 0.11767578125, + "learning_rate": 3.831e-07, + "loss": 0.0024, + "ppl": 0.154296875, + "reward": 0.8314369916915894, + "reward_std": 0.004464928992092609, + "rewards/perpo_ocr_edit_distance_reward": 0.8314369916915894, "step": 1169, "temperature": 0.9 }, { - "advantages": -4.394991265144199e-05, - "completion_length": 804.0, - "delta_ref_entropy_loss": 0.027587890625, - "delta_ref_ppl": -0.023681640625, - "entropy_loss": -0.0274658203125, - "epoch": 0.468, - "grad_norm": 0.6498495839963914, - "k1_kl": 0.023712158203125, - "k3_kl": 0.0145263671875, - "kimi_kl": 0.0316162109375, - "learning_rate": 2.66e-07, - "loss": 0.0006, - "ppl": 0.01141357421875, - "reward": 0.9990324079990387, - "reward_std": 0.0002703900681808591, - "rewards/perpo_ocr_edit_distance_reward": 0.9990324974060059, + "advantages": -2.1968569399177795e-06, + "completion_length": 313.0, + "delta_ref_entropy_loss": 0.1328125, + "delta_ref_ppl": -0.1328125, + "entropy_loss": -0.1083984375, + "epoch": 0.234, + "grad_norm": 1.8108906298538965, + "k1_kl": 0.1318359375, + "k3_kl": 0.08447265625, + "kimi_kl": 0.2412109375, + "learning_rate": 3.83e-07, + "loss": 0.0034, + "ppl": 0.053955078125, + "reward": 0.9774061441421509, + "reward_std": 0.007652854081243277, + "rewards/perpo_ocr_edit_distance_reward": 0.9774062037467957, "step": 1170, "temperature": 0.9 }, { - "advantages": -3.475163885013899e-05, - "completion_length": 688.5, - "delta_ref_entropy_loss": 0.027099609375, - "delta_ref_ppl": -0.02850341796875, - "entropy_loss": -0.013702392578125, - "epoch": 0.4684, - "grad_norm": 0.21691948557621088, - "k1_kl": 0.02850341796875, - "k3_kl": 0.0185546875, - "kimi_kl": 0.0721435546875, - "learning_rate": 2.6579999999999996e-07, - "loss": 0.0008, - "ppl": 0.0048370361328125, - "reward": 0.9984340965747833, - "reward_std": 0.00028828734502894804, - "rewards/perpo_ocr_edit_distance_reward": 0.9984341263771057, + "advantages": -9.92843160929624e-06, + "completion_length": 1306.0, + "delta_ref_entropy_loss": 0.07568359375, + "delta_ref_ppl": -0.0654296875, + "entropy_loss": -0.04541015625, + "epoch": 0.2342, + "grad_norm": 0.6899611602692682, + "k1_kl": 0.0654296875, + "k3_kl": 0.032470703125, + "kimi_kl": 0.06591796875, + "learning_rate": 3.829e-07, + "loss": 0.0013, + "ppl": 0.022705078125, + "reward": 0.9876828789710999, + "reward_std": 0.0007578629883937538, + "rewards/perpo_ocr_edit_distance_reward": 0.9876829385757446, "step": 1171, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 150.0, - "delta_ref_entropy_loss": 0.0458984375, - "delta_ref_ppl": -0.0640869140625, - "entropy_loss": -0.02374267578125, - "epoch": 0.4688, - "grad_norm": 0.142473878416187, - "k1_kl": 0.06396484375, - "k3_kl": 0.0469970703125, - "kimi_kl": 0.110107421875, - "learning_rate": 2.656e-07, - "loss": 0.0019, - "ppl": 0.00759124755859375, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": -5.493845674209297e-05, + "completion_length": 1135.0, + "delta_ref_entropy_loss": 0.07373046875, + "delta_ref_ppl": -0.059814453125, + "entropy_loss": -0.03466796875, + "epoch": 0.2344, + "grad_norm": 7.658325232599346, + "k1_kl": 0.059814453125, + "k3_kl": 0.032470703125, + "kimi_kl": 0.0751953125, + "learning_rate": 3.8279999999999996e-07, + "loss": 0.0014, + "ppl": 0.01806640625, + "reward": 0.9937052726745605, + "reward_std": 0.0006748751038685441, + "rewards/perpo_ocr_edit_distance_reward": 0.9937052726745605, "step": 1172, "temperature": 0.9 }, { - "advantages": -8.072172363426944e-06, - "completion_length": 519.0, - "delta_ref_entropy_loss": 0.03692626953125, - "delta_ref_ppl": -0.02777099609375, - "entropy_loss": -0.063629150390625, - "epoch": 0.4692, - "grad_norm": 1.2634182756062862, - "k1_kl": 0.0277099609375, - "k3_kl": 0.017852783203125, - "kimi_kl": 0.05322265625, - "learning_rate": 2.6540000000000003e-07, - "loss": 0.0007, - "ppl": 0.029754638671875, - "reward": 0.981039434671402, - "reward_std": 0.03857786816661246, - "rewards/perpo_ocr_edit_distance_reward": 0.9810394942760468, + "advantages": -2.8865679269074462e-05, + "completion_length": 582.0, + "delta_ref_entropy_loss": 0.1435546875, + "delta_ref_ppl": -0.1435546875, + "entropy_loss": -0.12890625, + "epoch": 0.2346, + "grad_norm": 1.5033693768096759, + "k1_kl": 0.1435546875, + "k3_kl": 0.076171875, + "kimi_kl": 0.16015625, + "learning_rate": 3.8269999999999995e-07, + "loss": 0.0031, + "ppl": 0.06396484375, + "reward": 0.8670756816864014, + "reward_std": 0.0007851279224269092, + "rewards/perpo_ocr_edit_distance_reward": 0.8670758008956909, "step": 1173, "temperature": 0.9 }, { - "advantages": -5.343131306290161e-06, - "completion_length": 475.5, - "delta_ref_entropy_loss": 0.049560546875, - "delta_ref_ppl": -0.06622314453125, - "entropy_loss": -0.052490234375, - "epoch": 0.4696, - "grad_norm": 1.7655776581302394, - "k1_kl": 0.06646728515625, - "k3_kl": 0.049072265625, - "kimi_kl": 0.236724853515625, - "learning_rate": 2.6519999999999997e-07, - "loss": 0.002, - "ppl": 0.02655029296875, - "reward": 0.9571213126182556, - "reward_std": 0.0016066234675236046, - "rewards/perpo_ocr_edit_distance_reward": 0.957121342420578, + "advantages": -2.912112677222467e-06, + "completion_length": 371.0, + "delta_ref_entropy_loss": 0.1171875, + "delta_ref_ppl": -0.12353515625, + "entropy_loss": -0.04638671875, + "epoch": 0.2348, + "grad_norm": 1.4653790469949994, + "k1_kl": 0.12353515625, + "k3_kl": 0.08349609375, + "kimi_kl": 0.2890625, + "learning_rate": 3.826e-07, + "loss": 0.0033, + "ppl": 0.01806640625, + "reward": 0.9023399949073792, + "reward_std": 0.017406683415174484, + "rewards/perpo_ocr_edit_distance_reward": 0.9023400545120239, "step": 1174, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 355.5, - "delta_ref_entropy_loss": 0.039306640625, - "delta_ref_ppl": -0.07330322265625, - "entropy_loss": -0.02490234375, - "epoch": 0.47, - "grad_norm": 0.028808661914010703, - "k1_kl": 0.07330322265625, - "k3_kl": 0.052093505859375, - "kimi_kl": 0.19140625, - "learning_rate": 2.65e-07, - "loss": 0.0024, - "ppl": 0.0108642578125, - "reward": 0.9941666424274445, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9941667020320892, + "advantages": -5.558984776143916e-05, + "completion_length": 554.0, + "delta_ref_entropy_loss": 0.0625, + "delta_ref_ppl": -0.053466796875, + "entropy_loss": -0.034912109375, + "epoch": 0.235, + "grad_norm": 0.7091792247447652, + "k1_kl": 0.053466796875, + "k3_kl": 0.0299072265625, + "kimi_kl": 0.1015625, + "learning_rate": 3.825e-07, + "loss": 0.0012, + "ppl": 0.0135498046875, + "reward": 0.7254102826118469, + "reward_std": 0.00066589709604159, + "rewards/perpo_ocr_edit_distance_reward": 0.7254102826118469, "step": 1175, "temperature": 0.9 }, { - "advantages": -5.347814112610649e-05, - "completion_length": 1372.5, - "delta_ref_entropy_loss": 0.020660400390625, - "delta_ref_ppl": -0.0113525390625, - "entropy_loss": -0.0206298828125, - "epoch": 0.4704, - "grad_norm": 0.3555237693795484, - "k1_kl": 0.01129150390625, - "k3_kl": 0.0059967041015625, - "kimi_kl": 0.016815185546875, - "learning_rate": 2.648e-07, - "loss": 0.0003, - "ppl": 0.0079345703125, - "reward": 0.9990788400173187, - "reward_std": 0.0005479315150296316, - "rewards/perpo_ocr_edit_distance_reward": 0.9990788698196411, + "advantages": -3.200982609996572e-05, + "completion_length": 633.0, + "delta_ref_entropy_loss": 0.091796875, + "delta_ref_ppl": -0.0751953125, + "entropy_loss": -0.04296875, + "epoch": 0.2352, + "grad_norm": 0.7229875718715911, + "k1_kl": 0.0751953125, + "k3_kl": 0.045166015625, + "kimi_kl": 0.1376953125, + "learning_rate": 3.824e-07, + "loss": 0.0018, + "ppl": 0.0185546875, + "reward": 0.9850215911865234, + "reward_std": 0.0014959251275286078, + "rewards/perpo_ocr_edit_distance_reward": 0.9850216507911682, "step": 1176, "temperature": 0.9 }, { - "advantages": -6.083931566536194e-06, - "completion_length": 211.5, - "delta_ref_entropy_loss": 0.0621337890625, - "delta_ref_ppl": -0.03729248046875, - "entropy_loss": -0.03375244140625, - "epoch": 0.4708, - "grad_norm": 0.23749122832766154, - "k1_kl": 0.03729248046875, - "k3_kl": 0.01690673828125, - "kimi_kl": 0.03192138671875, - "learning_rate": 2.646e-07, - "loss": 0.0007, - "ppl": 0.01690673828125, - "reward": 0.9996533095836639, - "reward_std": 0.0003000297292601317, - "rewards/perpo_ocr_edit_distance_reward": 0.9996533691883087, + "advantages": -8.560930291423574e-05, + "completion_length": 281.0, + "delta_ref_entropy_loss": 0.111328125, + "delta_ref_ppl": -0.119140625, + "entropy_loss": -0.0220947265625, + "epoch": 0.2354, + "grad_norm": 0.5498968687002682, + "k1_kl": 0.119140625, + "k3_kl": 0.08203125, + "kimi_kl": 0.2431640625, + "learning_rate": 3.823e-07, + "loss": 0.0034, + "ppl": 0.00653076171875, + "reward": 0.8706696629524231, + "reward_std": 9.890797809930518e-05, + "rewards/perpo_ocr_edit_distance_reward": 0.8706697225570679, "step": 1177, "temperature": 0.9 }, { - "advantages": 1.678083663136931e-05, - "completion_length": 485.5, - "delta_ref_entropy_loss": 0.03118896484375, - "delta_ref_ppl": -0.022705078125, - "entropy_loss": -0.03363037109375, - "epoch": 0.4712, - "grad_norm": 0.5256334510831837, - "k1_kl": 0.02276611328125, - "k3_kl": 0.013671875, - "kimi_kl": 0.034912109375, - "learning_rate": 2.644e-07, - "loss": 0.0005, - "ppl": 0.016845703125, - "reward": 0.9941663146018982, - "reward_std": 0.0015778256638441235, - "rewards/perpo_ocr_edit_distance_reward": 0.9941663444042206, + "advantages": -1.3828278497385327e-05, + "completion_length": 441.0, + "delta_ref_entropy_loss": 0.1845703125, + "delta_ref_ppl": -0.1533203125, + "entropy_loss": -0.2099609375, + "epoch": 0.2356, + "grad_norm": 2.3247200041321494, + "k1_kl": 0.1533203125, + "k3_kl": 0.0888671875, + "kimi_kl": 0.21484375, + "learning_rate": 3.8219999999999997e-07, + "loss": 0.0036, + "ppl": 0.12109375, + "reward": 0.8309847712516785, + "reward_std": 0.0029813451692461967, + "rewards/perpo_ocr_edit_distance_reward": 0.8309847712516785, "step": 1178, "temperature": 0.9 }, { - "advantages": -9.494169080426218e-05, - "completion_length": 720.5, - "delta_ref_entropy_loss": 0.02154541015625, - "delta_ref_ppl": -0.0128173828125, - "entropy_loss": -0.01904296875, - "epoch": 0.4716, - "grad_norm": 0.30720969544942617, - "k1_kl": 0.012786865234375, - "k3_kl": 0.0073089599609375, - "kimi_kl": 0.01458740234375, - "learning_rate": 2.642e-07, - "loss": 0.0004, - "ppl": 0.00982666015625, - "reward": 0.999714732170105, - "reward_std": 0.0002455883368384093, - "rewards/perpo_ocr_edit_distance_reward": 0.9997147917747498, + "advantages": -1.927784614963457e-05, + "completion_length": 670.0, + "delta_ref_entropy_loss": 0.05029296875, + "delta_ref_ppl": -0.060791015625, + "entropy_loss": -0.0184326171875, + "epoch": 0.2358, + "grad_norm": 0.40958323851014367, + "k1_kl": 0.060791015625, + "k3_kl": 0.034423828125, + "kimi_kl": 0.09765625, + "learning_rate": 3.8209999999999996e-07, + "loss": 0.0014, + "ppl": 0.0052490234375, + "reward": 0.9934219121932983, + "reward_std": 0.0025506948586553335, + "rewards/perpo_ocr_edit_distance_reward": 0.9934219121932983, "step": 1179, "temperature": 0.9 }, { - "advantages": -1.1801720575022046e-05, - "completion_length": 381.5, - "delta_ref_entropy_loss": 0.0255126953125, - "delta_ref_ppl": -0.08270263671875, - "entropy_loss": -0.0308837890625, - "epoch": 0.472, - "grad_norm": 0.41633329679001646, - "k1_kl": 0.08270263671875, - "k3_kl": 0.067962646484375, - "kimi_kl": 0.2808837890625, - "learning_rate": 2.64e-07, - "loss": 0.0027, - "ppl": 0.010772705078125, - "reward": 0.9990211725234985, - "reward_std": 0.0004916402394883335, - "rewards/perpo_ocr_edit_distance_reward": 0.9990212023258209, + "advantages": -1.6553061868762597e-05, + "completion_length": 1230.0, + "delta_ref_entropy_loss": 0.03271484375, + "delta_ref_ppl": -0.03515625, + "entropy_loss": -0.0272216796875, + "epoch": 0.236, + "grad_norm": 6911839.127786289, + "k1_kl": 0.03515625, + "k3_kl": 4544.0, + "kimi_kl": 0.142578125, + "learning_rate": 3.82e-07, + "loss": 181.2062, + "ppl": 0.0189208984375, + "reward": 0.9422683715820312, + "reward_std": 0.002983098616823554, + "rewards/perpo_ocr_edit_distance_reward": 0.9422684907913208, "step": 1180, "temperature": 0.9 }, { - "advantages": -3.695488203447894e-06, - "completion_length": 283.0, - "delta_ref_entropy_loss": 0.060546875, - "delta_ref_ppl": -0.060546875, - "entropy_loss": -0.0494384765625, - "epoch": 0.4724, - "grad_norm": 0.8445772714651358, - "k1_kl": 0.060546875, - "k3_kl": 0.03631591796875, - "kimi_kl": 0.115966796875, - "learning_rate": 2.6379999999999997e-07, - "loss": 0.0015, - "ppl": 0.02154541015625, - "reward": 0.9642451703548431, - "reward_std": 0.005484008230268955, - "rewards/perpo_ocr_edit_distance_reward": 0.9642452299594879, + "advantages": -4.604884816217236e-05, + "completion_length": 443.0, + "delta_ref_entropy_loss": 0.0947265625, + "delta_ref_ppl": -0.111328125, + "entropy_loss": -0.044921875, + "epoch": 0.2362, + "grad_norm": 2.125891377711643, + "k1_kl": 0.111328125, + "k3_kl": 0.064453125, + "kimi_kl": 0.169921875, + "learning_rate": 3.819e-07, + "loss": 0.0026, + "ppl": 0.0198974609375, + "reward": 0.9637317061424255, + "reward_std": 0.0010091913864016533, + "rewards/perpo_ocr_edit_distance_reward": 0.9637317657470703, "step": 1181, "temperature": 0.9 }, { - "advantages": -9.558031024425873e-07, - "completion_length": 1386.5, - "delta_ref_entropy_loss": 0.0136260986328125, - "delta_ref_ppl": -0.0184326171875, - "entropy_loss": -0.04638671875, - "epoch": 0.4728, - "grad_norm": 0.7083359031289149, - "k1_kl": 0.01837158203125, - "k3_kl": 0.0114288330078125, - "kimi_kl": 0.028228759765625, - "learning_rate": 2.636e-07, - "loss": 0.0005, - "ppl": 0.02606201171875, - "reward": 0.8351002931594849, - "reward_std": 0.03590642474591732, - "rewards/perpo_ocr_edit_distance_reward": 0.8351003527641296, + "advantages": -1.3215201761340722e-05, + "completion_length": 721.0, + "delta_ref_entropy_loss": 0.177734375, + "delta_ref_ppl": -0.10302734375, + "entropy_loss": -0.1767578125, + "epoch": 0.2364, + "grad_norm": 4.0338742492773, + "k1_kl": 0.1025390625, + "k3_kl": 0.049560546875, + "kimi_kl": 0.091796875, + "learning_rate": 3.8179999999999994e-07, + "loss": 0.002, + "ppl": 0.09716796875, + "reward": 0.9264165163040161, + "reward_std": 0.0031251634936779737, + "rewards/perpo_ocr_edit_distance_reward": 0.9264165759086609, "step": 1182, "temperature": 0.9 }, { - "advantages": -0.00037298032839316875, - "completion_length": 657.0, - "delta_ref_entropy_loss": 0.02069091796875, - "delta_ref_ppl": -0.0081634521484375, - "entropy_loss": -0.01416015625, - "epoch": 0.4732, - "grad_norm": 0.16294355624062518, - "k1_kl": 0.0081634521484375, - "k3_kl": 0.002712249755859375, - "kimi_kl": 0.00433349609375, - "learning_rate": 2.634e-07, - "loss": 0.0005, - "ppl": 0.00467681884765625, - "reward": 0.9989593625068665, - "reward_std": 0.00023397964832838625, - "rewards/perpo_ocr_edit_distance_reward": 0.9989594519138336, + "advantages": -0.0005960464477539062, + "completion_length": 496.0, + "delta_ref_entropy_loss": 0.054931640625, + "delta_ref_ppl": -0.09375, + "entropy_loss": -0.00860595703125, + "epoch": 0.2366, + "grad_norm": 0.003078808419618408, + "k1_kl": 0.09375, + "k3_kl": 0.06591796875, + "kimi_kl": 0.294921875, + "learning_rate": 3.817e-07, + "loss": 0.0032, + "ppl": 0.00135040283203125, + "reward": 0.9962025880813599, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9962025880813599, "step": 1183, "temperature": 0.9 }, { - "advantages": -1.2482915735745337e-05, - "completion_length": 684.5, - "delta_ref_entropy_loss": 0.02685546875, - "delta_ref_ppl": -0.0345458984375, - "entropy_loss": -0.0185546875, - "epoch": 0.4736, - "grad_norm": 0.38567769547915554, - "k1_kl": 0.034423828125, - "k3_kl": 0.022613525390625, - "kimi_kl": 0.069091796875, - "learning_rate": 2.632e-07, - "loss": 0.0009, - "ppl": 0.007354736328125, - "reward": 0.9997231662273407, - "reward_std": 0.0006318363011814654, - "rewards/perpo_ocr_edit_distance_reward": 0.9997231960296631, + "advantages": -1.2891633559775073e-05, + "completion_length": 360.0, + "delta_ref_entropy_loss": 0.07470703125, + "delta_ref_ppl": -0.1259765625, + "entropy_loss": -0.087890625, + "epoch": 0.2368, + "grad_norm": 1.3298185367954556, + "k1_kl": 0.1259765625, + "k3_kl": 0.0830078125, + "kimi_kl": 0.25, + "learning_rate": 3.816e-07, + "loss": 0.0033, + "ppl": 0.04736328125, + "reward": 0.972531795501709, + "reward_std": 0.0018826414598152041, + "rewards/perpo_ocr_edit_distance_reward": 0.9725319147109985, "step": 1184, "temperature": 0.9 }, { - "advantages": -1.1154584171890747e-06, - "completion_length": 197.0, - "delta_ref_entropy_loss": 0.0987548828125, - "delta_ref_ppl": -0.091796875, - "entropy_loss": -0.05072021484375, - "epoch": 0.474, - "grad_norm": 1.3215114871285747, - "k1_kl": 0.0919189453125, - "k3_kl": 0.0582275390625, - "kimi_kl": 0.224609375, - "learning_rate": 2.63e-07, - "loss": 0.0023, - "ppl": 0.02459716796875, - "reward": 0.9289491176605225, - "reward_std": 0.0017618679557926953, - "rewards/perpo_ocr_edit_distance_reward": 0.9289491772651672, + "advantages": -0.00011212485696887597, + "completion_length": 773.0, + "delta_ref_entropy_loss": 0.049560546875, + "delta_ref_ppl": -0.0625, + "entropy_loss": -0.0177001953125, + "epoch": 0.237, + "grad_norm": 0.5262782654264866, + "k1_kl": 0.0625, + "k3_kl": 0.03759765625, + "kimi_kl": 0.126953125, + "learning_rate": 3.8149999999999997e-07, + "loss": 0.0016, + "ppl": 0.007568359375, + "reward": 0.9875232577323914, + "reward_std": 0.00027969435905106366, + "rewards/perpo_ocr_edit_distance_reward": 0.9875233173370361, "step": 1185, "temperature": 0.9 }, { - "advantages": -7.704007293796167e-05, - "completion_length": 530.0, - "delta_ref_entropy_loss": 0.051025390625, - "delta_ref_ppl": -0.08831787109375, - "entropy_loss": -0.05859375, - "epoch": 0.4744, - "grad_norm": 1.2823221374835652, - "k1_kl": 0.0883026123046875, - "k3_kl": 0.06154632568359375, - "kimi_kl": 0.2200775146484375, - "learning_rate": 2.6279999999999994e-07, - "loss": 0.0025, - "ppl": 0.0321044921875, - "reward": 0.9785364866256714, - "reward_std": 0.0006354372962960042, - "rewards/perpo_ocr_edit_distance_reward": 0.9785365462303162, + "advantages": 0.0, + "completion_length": 359.0, + "delta_ref_entropy_loss": 0.091796875, + "delta_ref_ppl": -0.09814453125, + "entropy_loss": -0.027587890625, + "epoch": 0.2372, + "grad_norm": 0.9484922735824748, + "k1_kl": 0.09814453125, + "k3_kl": 0.06005859375, + "kimi_kl": 0.18359375, + "learning_rate": 3.814e-07, + "loss": 0.0024, + "ppl": 0.00994873046875, + "reward": 0.980793833732605, + "reward_std": 0.0007392519037239254, + "rewards/perpo_ocr_edit_distance_reward": 0.980793833732605, "step": 1186, "temperature": 0.9 }, { - "advantages": -4.841600457439199e-05, - "completion_length": 689.5, - "delta_ref_entropy_loss": 0.0693359375, - "delta_ref_ppl": -0.0474853515625, - "entropy_loss": -0.050537109375, - "epoch": 0.4748, - "grad_norm": 0.712331901152069, - "k1_kl": 0.0474853515625, - "k3_kl": 0.0277099609375, - "kimi_kl": 0.11376953125, - "learning_rate": 2.626e-07, - "loss": 0.0012, - "ppl": 0.02386474609375, - "reward": 0.9622895121574402, - "reward_std": 0.0017850931762950495, - "rewards/perpo_ocr_edit_distance_reward": 0.9622895419597626, + "advantages": -1.1716570952557959e-05, + "completion_length": 1130.0, + "delta_ref_entropy_loss": 0.0308837890625, + "delta_ref_ppl": -0.03564453125, + "entropy_loss": -0.046630859375, + "epoch": 0.2374, + "grad_norm": 0.5207952669815024, + "k1_kl": 0.035400390625, + "k3_kl": 0.0223388671875, + "kimi_kl": 0.064453125, + "learning_rate": 3.8129999999999995e-07, + "loss": 0.0009, + "ppl": 0.0245361328125, + "reward": 0.9823232293128967, + "reward_std": 0.003542862832546234, + "rewards/perpo_ocr_edit_distance_reward": 0.9823232889175415, "step": 1187, "temperature": 0.9 }, { - "advantages": -4.045452442369424e-05, - "completion_length": 729.5, - "delta_ref_entropy_loss": 0.052978515625, - "delta_ref_ppl": -0.0333251953125, - "entropy_loss": -0.0552978515625, - "epoch": 0.4752, - "grad_norm": 1.207089458945002, - "k1_kl": 0.033355712890625, - "k3_kl": 0.017608642578125, - "kimi_kl": 0.0424041748046875, - "learning_rate": 2.624e-07, - "loss": 0.0007, - "ppl": 0.028106689453125, - "reward": 0.9346781075000763, - "reward_std": 0.002190675266319886, - "rewards/perpo_ocr_edit_distance_reward": 0.9346781671047211, + "advantages": -2.2556101612281054e-05, + "completion_length": 614.0, + "delta_ref_entropy_loss": 0.080078125, + "delta_ref_ppl": -0.08056640625, + "entropy_loss": -0.040771484375, + "epoch": 0.2376, + "grad_norm": 0.5720909575945128, + "k1_kl": 0.08056640625, + "k3_kl": 0.046875, + "kimi_kl": 0.154296875, + "learning_rate": 3.8119999999999995e-07, + "loss": 0.0019, + "ppl": 0.016357421875, + "reward": 0.9913210272789001, + "reward_std": 0.000655467389151454, + "rewards/perpo_ocr_edit_distance_reward": 0.9913209676742554, "step": 1188, "temperature": 0.9 }, { - "advantages": -5.971534119453281e-05, - "completion_length": 593.5, - "delta_ref_entropy_loss": 0.04052734375, - "delta_ref_ppl": -0.033935546875, - "entropy_loss": -0.066650390625, - "epoch": 0.4756, - "grad_norm": 0.717858011498369, - "k1_kl": 0.033935546875, - "k3_kl": 0.0205078125, - "kimi_kl": 0.0423583984375, - "learning_rate": 2.6219999999999995e-07, - "loss": 0.0009, - "ppl": 0.03753662109375, - "reward": 0.9867285788059235, - "reward_std": 0.005623511315206997, - "rewards/perpo_ocr_edit_distance_reward": 0.9867286384105682, + "advantages": -5.960464477539063e-08, + "completion_length": 25.0, + "delta_ref_entropy_loss": 0.62890625, + "delta_ref_ppl": -1.046875, + "entropy_loss": -0.49609375, + "epoch": 0.2378, + "grad_norm": 26.32217209047408, + "k1_kl": 1.046875, + "k3_kl": 0.72265625, + "kimi_kl": 2.625, + "learning_rate": 3.811e-07, + "loss": 0.0289, + "ppl": 0.2451171875, + "reward": 0.16247139871120453, + "reward_std": 0.03613339737057686, + "rewards/perpo_ocr_edit_distance_reward": 0.16247139871120453, "step": 1189, "temperature": 0.9 }, { - "advantages": -6.470084190368652e-05, - "completion_length": 468.0, - "delta_ref_entropy_loss": 0.03515625, - "delta_ref_ppl": -0.03790283203125, - "entropy_loss": -0.028564453125, - "epoch": 0.476, - "grad_norm": 0.3867642498901916, - "k1_kl": 0.03790283203125, - "k3_kl": 0.02655029296875, - "kimi_kl": 0.08203125, - "learning_rate": 2.62e-07, - "loss": 0.0011, - "ppl": 0.01134490966796875, - "reward": 0.999891072511673, - "reward_std": 0.00028818378632422537, - "rewards/perpo_ocr_edit_distance_reward": 0.9998911023139954, + "advantages": -8.174351933121216e-07, + "completion_length": 493.0, + "delta_ref_entropy_loss": 0.0986328125, + "delta_ref_ppl": -0.0947265625, + "entropy_loss": -0.04931640625, + "epoch": 0.238, + "grad_norm": 1.2198211855992556, + "k1_kl": 0.0947265625, + "k3_kl": 0.05615234375, + "kimi_kl": 0.1318359375, + "learning_rate": 3.81e-07, + "loss": 0.0023, + "ppl": 0.0257568359375, + "reward": 0.9637351036071777, + "reward_std": 0.03051801584661007, + "rewards/perpo_ocr_edit_distance_reward": 0.9637352228164673, "step": 1190, "temperature": 0.9 }, { - "advantages": 7.56979034122196e-06, - "completion_length": 401.0, - "delta_ref_entropy_loss": 0.0694580078125, - "delta_ref_ppl": -0.05645751953125, - "entropy_loss": -0.063018798828125, - "epoch": 0.4764, - "grad_norm": 1.3817592901369502, - "k1_kl": 0.05621337890625, - "k3_kl": 0.03570556640625, - "kimi_kl": 0.15069580078125, - "learning_rate": 2.618e-07, - "loss": 0.0014, - "ppl": 0.031612396240234375, - "reward": 0.9571270048618317, - "reward_std": 0.0010747681371867657, - "rewards/perpo_ocr_edit_distance_reward": 0.9571269750595093, + "advantages": -0.00010643769201124087, + "completion_length": 674.0, + "delta_ref_entropy_loss": 0.07421875, + "delta_ref_ppl": -0.068359375, + "entropy_loss": -0.0179443359375, + "epoch": 0.2382, + "grad_norm": 0.5379941199438966, + "k1_kl": 0.068359375, + "k3_kl": 0.0400390625, + "kimi_kl": 0.1396484375, + "learning_rate": 3.809e-07, + "loss": 0.0017, + "ppl": 0.006866455078125, + "reward": 0.9972110986709595, + "reward_std": 0.0004600297543220222, + "rewards/perpo_ocr_edit_distance_reward": 0.997211217880249, "step": 1191, "temperature": 0.9 }, { - "advantages": -7.356916285061743e-06, - "completion_length": 490.5, - "delta_ref_entropy_loss": 0.0433349609375, - "delta_ref_ppl": -0.02740478515625, - "entropy_loss": -0.017547607421875, - "epoch": 0.4768, - "grad_norm": 0.23713330681007558, - "k1_kl": 0.02734375, - "k3_kl": 0.01300048828125, - "kimi_kl": 0.0396728515625, - "learning_rate": 2.616e-07, - "loss": 0.0005, - "ppl": 0.00457763671875, - "reward": 0.9998207092285156, - "reward_std": 0.00023929105373099446, - "rewards/perpo_ocr_edit_distance_reward": 0.999820739030838, + "advantages": -8.096014062175527e-05, + "completion_length": 723.0, + "delta_ref_entropy_loss": 0.04296875, + "delta_ref_ppl": -0.040283203125, + "entropy_loss": -0.0286865234375, + "epoch": 0.2384, + "grad_norm": 0.41141704804300616, + "k1_kl": 0.040283203125, + "k3_kl": 0.021728515625, + "kimi_kl": 0.047607421875, + "learning_rate": 3.808e-07, + "loss": 0.001, + "ppl": 0.0113525390625, + "reward": 0.9877452850341797, + "reward_std": 0.0003206697874702513, + "rewards/perpo_ocr_edit_distance_reward": 0.9877453446388245, "step": 1192, "temperature": 0.9 }, { - "advantages": -2.8141909382384256e-05, - "completion_length": 1103.5, - "delta_ref_entropy_loss": 0.0272216796875, - "delta_ref_ppl": -0.02728271484375, - "entropy_loss": -0.02313232421875, - "epoch": 0.4772, - "grad_norm": 5.025173488070186, - "k1_kl": 0.02728271484375, - "k3_kl": 0.01959228515625, - "kimi_kl": 0.09375, - "learning_rate": 2.614e-07, - "loss": 0.0008, - "ppl": 0.01251220703125, - "reward": 0.9836342036724091, - "reward_std": 0.019751005340367556, - "rewards/perpo_ocr_edit_distance_reward": 0.9836342632770538, + "advantages": -0.00016110284195747226, + "completion_length": 213.0, + "delta_ref_entropy_loss": 0.052978515625, + "delta_ref_ppl": -0.07421875, + "entropy_loss": -0.028564453125, + "epoch": 0.2386, + "grad_norm": 1.2001043264168794, + "k1_kl": 0.07373046875, + "k3_kl": 0.04638671875, + "kimi_kl": 0.1435546875, + "learning_rate": 3.8069999999999996e-07, + "loss": 0.002, + "ppl": 0.01336669921875, + "reward": 0.9896339178085327, + "reward_std": 0.00042854095227085054, + "rewards/perpo_ocr_edit_distance_reward": 0.9896340370178223, "step": 1193, "temperature": 0.9 }, { - "advantages": -8.749110804728844e-07, - "completion_length": 1595.0, - "delta_ref_entropy_loss": 0.0733642578125, - "delta_ref_ppl": -0.07763671875, - "entropy_loss": -0.1474609375, - "epoch": 0.4776, - "grad_norm": 1.5754220283130371, - "k1_kl": 0.077392578125, - "k3_kl": 0.051025390625, - "kimi_kl": 0.1416015625, - "learning_rate": 2.612e-07, - "loss": 0.002, - "ppl": 0.081298828125, - "reward": 0.7315780520439148, - "reward_std": 0.22959761321544647, - "rewards/perpo_ocr_edit_distance_reward": 0.7315781116485596, + "advantages": -3.405979782655777e-07, + "completion_length": 328.0, + "delta_ref_entropy_loss": 0.09326171875, + "delta_ref_ppl": -0.12158203125, + "entropy_loss": -0.171875, + "epoch": 0.2388, + "grad_norm": 3.557537311250508, + "k1_kl": 0.1220703125, + "k3_kl": 0.08544921875, + "kimi_kl": 0.2041015625, + "learning_rate": 3.8059999999999995e-07, + "loss": 0.0034, + "ppl": 0.09521484375, + "reward": 0.609530508518219, + "reward_std": 0.1627698391675949, + "rewards/perpo_ocr_edit_distance_reward": 0.6095305681228638, "step": 1194, "temperature": 0.9 }, { - "advantages": -2.0878655959677417e-05, - "completion_length": 926.0, - "delta_ref_entropy_loss": 0.0426025390625, - "delta_ref_ppl": -0.0467529296875, - "entropy_loss": -0.0526123046875, - "epoch": 0.478, - "grad_norm": 0.6704774943952053, - "k1_kl": 0.0467529296875, - "k3_kl": 0.03125, - "kimi_kl": 0.0845947265625, - "learning_rate": 2.61e-07, - "loss": 0.0013, - "ppl": 0.02825927734375, - "reward": 0.9704372584819794, - "reward_std": 0.0032604497391730547, - "rewards/perpo_ocr_edit_distance_reward": 0.9704373478889465, + "advantages": -3.7465778746081924e-07, + "completion_length": 553.0, + "delta_ref_entropy_loss": 0.1005859375, + "delta_ref_ppl": -0.07958984375, + "entropy_loss": -0.1494140625, + "epoch": 0.239, + "grad_norm": 1.1494170864425937, + "k1_kl": 0.07958984375, + "k3_kl": 0.043701171875, + "kimi_kl": 0.09423828125, + "learning_rate": 3.805e-07, + "loss": 0.0018, + "ppl": 0.07568359375, + "reward": 0.8183074593544006, + "reward_std": 0.22979852557182312, + "rewards/perpo_ocr_edit_distance_reward": 0.8183075189590454, "step": 1195, "temperature": 0.9 }, { - "advantages": -0.00034631150265340693, - "completion_length": 393.0, - "delta_ref_entropy_loss": 0.055419921875, - "delta_ref_ppl": -0.04595947265625, - "entropy_loss": -0.03887939453125, - "epoch": 0.4784, - "grad_norm": 0.5943386027062723, - "k1_kl": 0.04595947265625, - "k3_kl": 0.027618408203125, - "kimi_kl": 0.079833984375, - "learning_rate": 2.6079999999999995e-07, - "loss": 0.0015, - "ppl": 0.02008056640625, - "reward": 0.99622842669487, - "reward_std": 0.00047892561997286975, - "rewards/perpo_ocr_edit_distance_reward": 0.9962284862995148, + "advantages": -1.9950526620959863e-05, + "completion_length": 814.0, + "delta_ref_entropy_loss": 0.091796875, + "delta_ref_ppl": -0.0791015625, + "entropy_loss": -0.07275390625, + "epoch": 0.2392, + "grad_norm": 0.9629646958129812, + "k1_kl": 0.07958984375, + "k3_kl": 0.0400390625, + "kimi_kl": 0.0908203125, + "learning_rate": 3.804e-07, + "loss": 0.0016, + "ppl": 0.037109375, + "reward": 0.8251926302909851, + "reward_std": 0.0016072433209046721, + "rewards/perpo_ocr_edit_distance_reward": 0.8251926898956299, "step": 1196, "temperature": 0.9 }, { - "advantages": -7.782664397382177e-06, - "completion_length": 413.5, - "delta_ref_entropy_loss": 0.047607421875, - "delta_ref_ppl": -0.11492919921875, - "entropy_loss": -0.04254150390625, - "epoch": 0.4788, - "grad_norm": 2.4180189989316174, - "k1_kl": 0.11492919921875, - "k3_kl": 0.086151123046875, - "kimi_kl": 0.43084716796875, - "learning_rate": 2.606e-07, - "loss": 0.0035, - "ppl": 0.019012451171875, - "reward": 0.9935028254985809, - "reward_std": 0.0004973312607035041, - "rewards/perpo_ocr_edit_distance_reward": 0.9935028851032257, + "advantages": -8.056845399551094e-05, + "completion_length": 372.0, + "delta_ref_entropy_loss": 0.1005859375, + "delta_ref_ppl": -0.11865234375, + "entropy_loss": -0.044921875, + "epoch": 0.2394, + "grad_norm": 1.5396203557430983, + "k1_kl": 0.1181640625, + "k3_kl": 0.08203125, + "kimi_kl": 0.296875, + "learning_rate": 3.803e-07, + "loss": 0.0033, + "ppl": 0.0218505859375, + "reward": 0.8161624073982239, + "reward_std": 0.0007457638857886195, + "rewards/perpo_ocr_edit_distance_reward": 0.8161624670028687, "step": 1197, "temperature": 0.9 }, { - "advantages": -1.0984284926962573e-05, - "completion_length": 578.5, - "delta_ref_entropy_loss": 0.02972412109375, - "delta_ref_ppl": -0.03271484375, - "entropy_loss": -0.02349853515625, - "epoch": 0.4792, - "grad_norm": 0.59716736146761, - "k1_kl": 0.03271484375, - "k3_kl": 0.01947784423828125, - "kimi_kl": 0.044586181640625, - "learning_rate": 2.6040000000000003e-07, - "loss": 0.0008, - "ppl": 0.01474761962890625, - "reward": 0.9927219450473785, - "reward_std": 0.00033753516618162394, - "rewards/perpo_ocr_edit_distance_reward": 0.9927219450473785, + "advantages": -3.964560528402217e-05, + "completion_length": 897.0, + "delta_ref_entropy_loss": 0.05712890625, + "delta_ref_ppl": -0.043701171875, + "entropy_loss": -0.04833984375, + "epoch": 0.2396, + "grad_norm": 1.1813420674677153, + "k1_kl": 0.0439453125, + "k3_kl": 0.023193359375, + "kimi_kl": 0.052978515625, + "learning_rate": 3.802e-07, + "loss": 0.001, + "ppl": 0.0262451171875, + "reward": 0.9774435758590698, + "reward_std": 0.0020489522721618414, + "rewards/perpo_ocr_edit_distance_reward": 0.9774436950683594, "step": 1198, "temperature": 0.9 }, { - "advantages": -9.10248127183877e-06, - "completion_length": 238.5, - "delta_ref_entropy_loss": 0.05267333984375, - "delta_ref_ppl": -0.088134765625, - "entropy_loss": -0.025146484375, - "epoch": 0.4796, - "grad_norm": 0.44562536287529114, - "k1_kl": 0.08819580078125, - "k3_kl": 0.06695556640625, - "kimi_kl": 0.32110595703125, - "learning_rate": 2.6019999999999996e-07, - "loss": 0.0027, - "ppl": 0.011932373046875, - "reward": 0.9998523890972137, - "reward_std": 0.000184073272976093, - "rewards/perpo_ocr_edit_distance_reward": 0.9998524188995361, + "advantages": 5.10896995820076e-07, + "completion_length": 602.0, + "delta_ref_entropy_loss": 0.06689453125, + "delta_ref_ppl": -0.0654296875, + "entropy_loss": -0.17578125, + "epoch": 0.2398, + "grad_norm": 1.6540528931446832, + "k1_kl": 0.0654296875, + "k3_kl": 0.042236328125, + "kimi_kl": 0.125, + "learning_rate": 3.8009999999999997e-07, + "loss": 0.0017, + "ppl": 0.123046875, + "reward": 0.788868248462677, + "reward_std": 0.0326714813709259, + "rewards/perpo_ocr_edit_distance_reward": 0.788868248462677, "step": 1199, "temperature": 0.9 }, { - "advantages": -3.739765975296905e-05, - "completion_length": 837.5, - "delta_ref_entropy_loss": 0.02294921875, - "delta_ref_ppl": -0.02117919921875, - "entropy_loss": -0.02581787109375, - "epoch": 0.48, - "grad_norm": 0.6050421656388898, - "k1_kl": 0.021148681640625, - "k3_kl": 0.013336181640625, - "kimi_kl": 0.03436279296875, - "learning_rate": 2.6e-07, - "loss": 0.0006, - "ppl": 0.01416015625, - "reward": 0.9911119043827057, - "reward_std": 0.003443464229349047, - "rewards/perpo_ocr_edit_distance_reward": 0.9911119341850281, - "step": 1200, - "temperature": 0.9 - }, + "advantages": -0.0005960464477539062, + "completion_length": 220.0, + "delta_ref_entropy_loss": 0.047119140625, + "delta_ref_ppl": -0.1484375, + "entropy_loss": -0.01116943359375, + "epoch": 0.24, + "grad_norm": 0.009868996091359961, + "k1_kl": 0.1484375, + "k3_kl": 0.1181640625, + "kimi_kl": 0.51953125, + "learning_rate": 3.7999999999999996e-07, + "loss": 0.0053, + "ppl": 0.00189971923828125, + "reward": 0.9795321822166443, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9795322418212891, + "step": 1200, + "temperature": 0.9 + }, { - "advantages": -0.00015257938503054902, - "completion_length": 912.5, - "delta_ref_entropy_loss": 0.02197265625, - "delta_ref_ppl": -0.0150909423828125, - "entropy_loss": -0.0135498046875, - "epoch": 0.4804, - "grad_norm": 0.3357038878912539, - "k1_kl": 0.01507568359375, - "k3_kl": 0.00872039794921875, - "kimi_kl": 0.02556610107421875, - "learning_rate": 2.598e-07, - "loss": 0.0005, - "ppl": 0.0054779052734375, - "reward": 0.9990570545196533, - "reward_std": 0.0002190121085732244, - "rewards/perpo_ocr_edit_distance_reward": 0.9990571439266205, + "advantages": -5.08683078805916e-05, + "completion_length": 538.0, + "delta_ref_entropy_loss": 0.061279296875, + "delta_ref_ppl": -0.0556640625, + "entropy_loss": -0.020263671875, + "epoch": 0.2402, + "grad_norm": 0.7255769517242302, + "k1_kl": 0.0556640625, + "k3_kl": 0.035888671875, + "kimi_kl": 0.10888671875, + "learning_rate": 3.799e-07, + "loss": 0.0015, + "ppl": 0.0103759765625, + "reward": 0.993685781955719, + "reward_std": 0.0005695189465768635, + "rewards/perpo_ocr_edit_distance_reward": 0.9936859011650085, "step": 1201, "temperature": 0.9 }, { - "advantages": -1.9584383892379265e-07, - "completion_length": 430.0, - "delta_ref_entropy_loss": 0.02435302734375, - "delta_ref_ppl": -0.018951416015625, - "entropy_loss": -0.018798828125, - "epoch": 0.4808, - "grad_norm": 0.8366323789968747, - "k1_kl": 0.018951416015625, - "k3_kl": 0.0099334716796875, - "kimi_kl": 0.019927978515625, - "learning_rate": 2.5959999999999997e-07, - "loss": 0.0004, - "ppl": 0.0068511962890625, - "reward": 0.9726578891277313, - "reward_std": 0.07215353101491928, - "rewards/perpo_ocr_edit_distance_reward": 0.9726579487323761, + "advantages": -2.087865686917212e-05, + "completion_length": 690.0, + "delta_ref_entropy_loss": 0.0703125, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.0257568359375, + "epoch": 0.2404, + "grad_norm": 0.4832632243029886, + "k1_kl": 0.0703125, + "k3_kl": 0.042724609375, + "kimi_kl": 0.11767578125, + "learning_rate": 3.798e-07, + "loss": 0.0017, + "ppl": 0.01239013671875, + "reward": 0.9942922592163086, + "reward_std": 0.0007162857218645513, + "rewards/perpo_ocr_edit_distance_reward": 0.9942923188209534, "step": 1202, "temperature": 0.9 }, { - "advantages": 1.5041658770087452e-05, - "completion_length": 964.5, - "delta_ref_entropy_loss": 0.0255126953125, - "delta_ref_ppl": -0.02362060546875, - "entropy_loss": -0.0421142578125, - "epoch": 0.4812, - "grad_norm": 0.6960428644199821, - "k1_kl": 0.0235595703125, - "k3_kl": 0.015289306640625, - "kimi_kl": 0.03424072265625, - "learning_rate": 2.594e-07, - "loss": 0.0006, - "ppl": 0.0233154296875, - "reward": 0.9502859115600586, - "reward_std": 0.021158111310796812, - "rewards/perpo_ocr_edit_distance_reward": 0.950285941362381, + "advantages": -0.0005960464477539062, + "completion_length": 220.0, + "delta_ref_entropy_loss": 0.1787109375, + "delta_ref_ppl": -0.1796875, + "entropy_loss": -0.04931640625, + "epoch": 0.2406, + "grad_norm": 0.05957738127125028, + "k1_kl": 0.1796875, + "k3_kl": 0.1142578125, + "kimi_kl": 0.41796875, + "learning_rate": 3.7969999999999994e-07, + "loss": 0.0052, + "ppl": 0.01361083984375, + "reward": 0.84375, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.8437500596046448, "step": 1203, "temperature": 0.9 }, { - "advantages": -6.192071396071697e-05, - "completion_length": 1002.0, - "delta_ref_entropy_loss": 0.02777099609375, - "delta_ref_ppl": -0.0174560546875, - "entropy_loss": -0.02325439453125, - "epoch": 0.4816, - "grad_norm": 2.5020978386737007, - "k1_kl": 0.01751708984375, - "k3_kl": 0.028839111328125, - "kimi_kl": 0.03515625, - "learning_rate": 2.592e-07, - "loss": 0.0012, - "ppl": 0.01385498046875, - "reward": 0.9983530938625336, - "reward_std": 0.0006419543642550707, - "rewards/perpo_ocr_edit_distance_reward": 0.998353123664856, + "advantages": -2.8763499358319677e-05, + "completion_length": 518.0, + "delta_ref_entropy_loss": 0.1611328125, + "delta_ref_ppl": -0.10205078125, + "entropy_loss": -0.11328125, + "epoch": 0.2408, + "grad_norm": 1.17092223681478, + "k1_kl": 0.1015625, + "k3_kl": 0.052490234375, + "kimi_kl": 0.11181640625, + "learning_rate": 3.796e-07, + "loss": 0.0021, + "ppl": 0.057861328125, + "reward": 0.9394412636756897, + "reward_std": 0.0013797438004985452, + "rewards/perpo_ocr_edit_distance_reward": 0.9394413232803345, "step": 1204, "temperature": 0.9 }, { - "advantages": -0.00011186089352577255, - "completion_length": 747.0, - "delta_ref_entropy_loss": 0.02276611328125, - "delta_ref_ppl": -0.014373779296875, - "entropy_loss": -0.0235595703125, - "epoch": 0.482, - "grad_norm": 1.1294155867070585, - "k1_kl": 0.014373779296875, - "k3_kl": 0.009490966796875, - "kimi_kl": 0.020721435546875, - "learning_rate": 2.59e-07, - "loss": 0.0005, - "ppl": 0.01055908203125, - "reward": 0.9790776968002319, - "reward_std": 0.02408692213066388, - "rewards/perpo_ocr_edit_distance_reward": 0.9790777862071991, + "advantages": 0.0, + "completion_length": 608.0, + "delta_ref_entropy_loss": 0.0595703125, + "delta_ref_ppl": -0.041259765625, + "entropy_loss": -0.0179443359375, + "epoch": 0.241, + "grad_norm": 0.014042948385425197, + "k1_kl": 0.041259765625, + "k3_kl": 0.019775390625, + "kimi_kl": 0.052734375, + "learning_rate": 3.795e-07, + "loss": 0.0008, + "ppl": 0.006195068359375, + "reward": 0.991525411605835, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.991525411605835, "step": 1205, "temperature": 0.9 }, { - "advantages": -2.186638994317036e-05, - "completion_length": 706.5, - "delta_ref_entropy_loss": 0.04608154296875, - "delta_ref_ppl": -0.041259765625, - "entropy_loss": -0.05419921875, - "epoch": 0.4824, - "grad_norm": 0.7165887076913966, - "k1_kl": 0.0413818359375, - "k3_kl": 0.02838134765625, - "kimi_kl": 0.09033203125, - "learning_rate": 2.5879999999999996e-07, - "loss": 0.0012, - "ppl": 0.03094482421875, - "reward": 0.9702989757061005, - "reward_std": 0.0005338570335879922, - "rewards/perpo_ocr_edit_distance_reward": 0.9702990651130676, + "advantages": -3.409385681152344e-05, + "completion_length": 850.0, + "delta_ref_entropy_loss": 0.07470703125, + "delta_ref_ppl": -0.052734375, + "entropy_loss": -0.0263671875, + "epoch": 0.2412, + "grad_norm": 0.48696757144477537, + "k1_kl": 0.052734375, + "k3_kl": 0.0255126953125, + "kimi_kl": 0.047607421875, + "learning_rate": 3.794e-07, + "loss": 0.0011, + "ppl": 0.0111083984375, + "reward": 0.9701648354530334, + "reward_std": 0.00039977047708816826, + "rewards/perpo_ocr_edit_distance_reward": 0.9701648950576782, "step": 1206, "temperature": 0.9 }, { - "advantages": 4.257344698999077e-08, - "completion_length": 625.5, - "delta_ref_entropy_loss": 0.03961181640625, - "delta_ref_ppl": -0.047607421875, - "entropy_loss": -0.019775390625, - "epoch": 0.4828, - "grad_norm": 0.687301798411015, - "k1_kl": 0.0477294921875, - "k3_kl": 0.0318603515625, - "kimi_kl": 0.10888671875, - "learning_rate": 2.586e-07, - "loss": 0.0013, - "ppl": 0.0074615478515625, - "reward": 0.9721301198005676, - "reward_std": 0.0006638183658651542, - "rewards/perpo_ocr_edit_distance_reward": 0.97213014960289, + "advantages": -5.418913860921748e-05, + "completion_length": 196.0, + "delta_ref_entropy_loss": 0.0859375, + "delta_ref_ppl": -0.185546875, + "entropy_loss": -0.03857421875, + "epoch": 0.2414, + "grad_norm": 1.5074814799142529, + "k1_kl": 0.185546875, + "k3_kl": 0.1337890625, + "kimi_kl": 0.439453125, + "learning_rate": 3.793e-07, + "loss": 0.0054, + "ppl": 0.0174560546875, + "reward": 0.9455025792121887, + "reward_std": 0.002099779900163412, + "rewards/perpo_ocr_edit_distance_reward": 0.945502758026123, "step": 1207, "temperature": 0.9 }, { - "advantages": -3.0568669444619445e-06, - "completion_length": 531.5, - "delta_ref_entropy_loss": 0.03192138671875, - "delta_ref_ppl": -0.025482177734375, - "entropy_loss": -0.01885986328125, - "epoch": 0.4832, - "grad_norm": 0.4450664636288109, - "k1_kl": 0.025482177734375, - "k3_kl": 0.01497650146484375, - "kimi_kl": 0.03668212890625, - "learning_rate": 2.584e-07, - "loss": 0.0006, - "ppl": 0.00807952880859375, - "reward": 0.9663741290569305, - "reward_std": 0.0062355659902095795, - "rewards/perpo_ocr_edit_distance_reward": 0.9663741886615753, + "advantages": -4.039491977891885e-05, + "completion_length": 605.0, + "delta_ref_entropy_loss": 0.048583984375, + "delta_ref_ppl": -0.057373046875, + "entropy_loss": -0.0294189453125, + "epoch": 0.2416, + "grad_norm": 0.6979734390606708, + "k1_kl": 0.057373046875, + "k3_kl": 0.037109375, + "kimi_kl": 0.125, + "learning_rate": 3.7919999999999995e-07, + "loss": 0.0015, + "ppl": 0.0107421875, + "reward": 0.9881755113601685, + "reward_std": 0.0015869111521169543, + "rewards/perpo_ocr_edit_distance_reward": 0.9881755709648132, "step": 1208, "temperature": 0.9 }, { - "advantages": -3.150531233586662e-05, - "completion_length": 433.0, - "delta_ref_entropy_loss": 0.0947265625, - "delta_ref_ppl": -0.07861328125, - "entropy_loss": -0.1029052734375, - "epoch": 0.4836, - "grad_norm": 1.4139433783103028, - "k1_kl": 0.07861328125, - "k3_kl": 0.04638671875, - "kimi_kl": 0.171142578125, - "learning_rate": 2.5819999999999997e-07, - "loss": 0.0019, - "ppl": 0.05938720703125, - "reward": 0.7193536907434464, - "reward_std": 0.014751168782822788, - "rewards/perpo_ocr_edit_distance_reward": 0.7193537950515747, + "advantages": -5.770155621576123e-05, + "completion_length": 592.0, + "delta_ref_entropy_loss": 0.038330078125, + "delta_ref_ppl": -0.06884765625, + "entropy_loss": -0.0185546875, + "epoch": 0.2418, + "grad_norm": 1.7792829888038664, + "k1_kl": 0.0693359375, + "k3_kl": 0.05078125, + "kimi_kl": 0.166015625, + "learning_rate": 3.791e-07, + "loss": 0.0021, + "ppl": 0.010498046875, + "reward": 0.9977659583091736, + "reward_std": 0.0010803703917190433, + "rewards/perpo_ocr_edit_distance_reward": 0.9977660179138184, "step": 1209, "temperature": 0.9 }, { - "advantages": -8.766566679696552e-05, - "completion_length": 886.0, - "delta_ref_entropy_loss": 0.02197265625, - "delta_ref_ppl": -0.01824951171875, - "entropy_loss": -0.025421142578125, - "epoch": 0.484, - "grad_norm": 0.4495449511339752, - "k1_kl": 0.01824951171875, - "k3_kl": 0.010101318359375, - "kimi_kl": 0.02630615234375, - "learning_rate": 2.58e-07, - "loss": 0.0005, - "ppl": 0.0126495361328125, - "reward": 0.9991689324378967, - "reward_std": 0.0005179554282221943, - "rewards/perpo_ocr_edit_distance_reward": 0.9991689920425415, + "advantages": -1.7523765563964844e-05, + "completion_length": 622.0, + "delta_ref_entropy_loss": 0.11572265625, + "delta_ref_ppl": -0.07275390625, + "entropy_loss": -0.08935546875, + "epoch": 0.242, + "grad_norm": 1.441250644337685, + "k1_kl": 0.07275390625, + "k3_kl": 0.03564453125, + "kimi_kl": 0.087890625, + "learning_rate": 3.79e-07, + "loss": 0.0014, + "ppl": 0.04345703125, + "reward": 0.8091024160385132, + "reward_std": 0.004273668862879276, + "rewards/perpo_ocr_edit_distance_reward": 0.809102475643158, "step": 1210, "temperature": 0.9 }, { - "advantages": -0.00035943304101238027, - "completion_length": 422.0, - "delta_ref_entropy_loss": 0.0269775390625, - "delta_ref_ppl": -0.05218505859375, - "entropy_loss": -0.0224609375, - "epoch": 0.4844, - "grad_norm": 0.5177302365382612, - "k1_kl": 0.05242919921875, - "k3_kl": 0.044586181640625, - "kimi_kl": 0.24267578125, - "learning_rate": 2.5779999999999994e-07, - "loss": 0.0021, - "ppl": 0.009735107421875, - "reward": 0.996823251247406, - "reward_std": 0.00033133625402115285, - "rewards/perpo_ocr_edit_distance_reward": 0.9968233406543732, + "advantages": -3.62396240234375e-05, + "completion_length": 1109.0, + "delta_ref_entropy_loss": 0.039794921875, + "delta_ref_ppl": -0.03564453125, + "entropy_loss": -0.03759765625, + "epoch": 0.2422, + "grad_norm": 0.5961857719798752, + "k1_kl": 0.03564453125, + "k3_kl": 0.0216064453125, + "kimi_kl": 0.04833984375, + "learning_rate": 3.789e-07, + "loss": 0.0009, + "ppl": 0.0186767578125, + "reward": 0.9911670088768005, + "reward_std": 0.0015441562281921506, + "rewards/perpo_ocr_edit_distance_reward": 0.9911671280860901, "step": 1211, "temperature": 0.9 }, { - "advantages": -2.7758735086536035e-05, - "completion_length": 327.0, - "delta_ref_entropy_loss": 0.04541015625, - "delta_ref_ppl": -0.064544677734375, - "entropy_loss": -0.0238037109375, - "epoch": 0.4848, - "grad_norm": 0.2978604314183488, - "k1_kl": 0.064544677734375, - "k3_kl": 0.0433807373046875, - "kimi_kl": 0.14910888671875, - "learning_rate": 2.576e-07, - "loss": 0.0018, - "ppl": 0.0096588134765625, - "reward": 0.999000072479248, - "reward_std": 0.0007936738547869027, - "rewards/perpo_ocr_edit_distance_reward": 0.9990001022815704, + "advantages": -0.00010616439249133691, + "completion_length": 467.0, + "delta_ref_entropy_loss": 0.09130859375, + "delta_ref_ppl": -0.119140625, + "entropy_loss": -0.03369140625, + "epoch": 0.2424, + "grad_norm": 0.5053836391698578, + "k1_kl": 0.119140625, + "k3_kl": 0.078125, + "kimi_kl": 0.310546875, + "learning_rate": 3.7880000000000003e-07, + "loss": 0.0032, + "ppl": 0.01129150390625, + "reward": 0.9846093058586121, + "reward_std": 0.0003812535433098674, + "rewards/perpo_ocr_edit_distance_reward": 0.9846093654632568, "step": 1212, "temperature": 0.9 }, { - "advantages": -0.00019892838099622168, - "completion_length": 690.5, - "delta_ref_entropy_loss": 0.015380859375, - "delta_ref_ppl": -0.01033782958984375, - "entropy_loss": -0.015869140625, - "epoch": 0.4852, - "grad_norm": 0.5269446897128781, - "k1_kl": 0.01035308837890625, - "k3_kl": 0.006839752197265625, - "kimi_kl": 0.020000457763671875, - "learning_rate": 2.574e-07, - "loss": 0.0005, - "ppl": 0.0070953369140625, - "reward": 0.9994217157363892, - "reward_std": 0.00022762786829844117, - "rewards/perpo_ocr_edit_distance_reward": 0.9994218051433563, + "advantages": -5.3661213314626366e-05, + "completion_length": 721.0, + "delta_ref_entropy_loss": 0.09033203125, + "delta_ref_ppl": -0.059814453125, + "entropy_loss": -0.025634765625, + "epoch": 0.2426, + "grad_norm": 0.3880913864628606, + "k1_kl": 0.06005859375, + "k3_kl": 0.027099609375, + "kimi_kl": 0.07568359375, + "learning_rate": 3.7869999999999997e-07, + "loss": 0.0011, + "ppl": 0.0076904296875, + "reward": 0.9705710411071777, + "reward_std": 0.0008518315735273063, + "rewards/perpo_ocr_edit_distance_reward": 0.9705711007118225, "step": 1213, "temperature": 0.9 }, { - "advantages": 3.0253615477704443e-05, - "completion_length": 493.5, - "delta_ref_entropy_loss": 0.02142333984375, - "delta_ref_ppl": -0.011444091796875, - "entropy_loss": -0.012969970703125, - "epoch": 0.4856, - "grad_norm": 0.270023025219482, - "k1_kl": 0.011474609375, - "k3_kl": 0.00567626953125, - "kimi_kl": 0.012359619140625, - "learning_rate": 2.5719999999999995e-07, - "loss": 0.0002, - "ppl": 0.00379180908203125, - "reward": 0.9998144209384918, - "reward_std": 0.0002313993318239227, - "rewards/perpo_ocr_edit_distance_reward": 0.9998144507408142, + "advantages": -2.901894913520664e-05, + "completion_length": 919.0, + "delta_ref_entropy_loss": 0.041259765625, + "delta_ref_ppl": -0.0302734375, + "entropy_loss": -0.037353515625, + "epoch": 0.2428, + "grad_norm": 0.5885088841729355, + "k1_kl": 0.0301513671875, + "k3_kl": 0.0167236328125, + "kimi_kl": 0.0390625, + "learning_rate": 3.7859999999999996e-07, + "loss": 0.0007, + "ppl": 0.016357421875, + "reward": 0.9836965203285217, + "reward_std": 0.002246408024802804, + "rewards/perpo_ocr_edit_distance_reward": 0.9836965799331665, "step": 1214, "temperature": 0.9 }, { - "advantages": -1.6037906920729483e-05, - "completion_length": 422.5, - "delta_ref_entropy_loss": 0.073974609375, - "delta_ref_ppl": -0.04583740234375, - "entropy_loss": -0.1328125, - "epoch": 0.486, - "grad_norm": 1.899599803995467, - "k1_kl": 0.0457763671875, - "k3_kl": 0.023681640625, - "kimi_kl": 0.04150390625, - "learning_rate": 2.57e-07, - "loss": 0.001, - "ppl": 0.063232421875, - "reward": 0.43261294066905975, - "reward_std": 0.05893260185257532, - "rewards/perpo_ocr_edit_distance_reward": 0.43261297047138214, + "advantages": -4.3749812903115526e-05, + "completion_length": 528.0, + "delta_ref_entropy_loss": 0.11328125, + "delta_ref_ppl": -0.09375, + "entropy_loss": -0.0712890625, + "epoch": 0.243, + "grad_norm": 1.0430517715633283, + "k1_kl": 0.09375, + "k3_kl": 0.04931640625, + "kimi_kl": 0.12255859375, + "learning_rate": 3.785e-07, + "loss": 0.002, + "ppl": 0.0341796875, + "reward": 0.9450623989105225, + "reward_std": 0.0012628367403522134, + "rewards/perpo_ocr_edit_distance_reward": 0.945062518119812, "step": 1215, "temperature": 0.9 }, { - "advantages": -3.2484534131071996e-05, - "completion_length": 1173.5, - "delta_ref_entropy_loss": 0.02020263671875, - "delta_ref_ppl": -0.0146484375, - "entropy_loss": -0.0186767578125, - "epoch": 0.4864, - "grad_norm": 0.43734792013832796, - "k1_kl": 0.0146942138671875, - "k3_kl": 0.0088958740234375, - "kimi_kl": 0.02447509765625, - "learning_rate": 2.5679999999999997e-07, - "loss": 0.0004, - "ppl": 0.0093994140625, - "reward": 0.9975847899913788, - "reward_std": 0.0004992937610950321, - "rewards/perpo_ocr_edit_distance_reward": 0.997584879398346, + "advantages": 1.4909676792740356e-05, + "completion_length": 488.0, + "delta_ref_entropy_loss": 0.068359375, + "delta_ref_ppl": -0.10302734375, + "entropy_loss": -0.025634765625, + "epoch": 0.2432, + "grad_norm": 0.522116645670093, + "k1_kl": 0.10302734375, + "k3_kl": 0.0673828125, + "kimi_kl": 0.2451171875, + "learning_rate": 3.784e-07, + "loss": 0.0027, + "ppl": 0.01141357421875, + "reward": 0.9884883165359497, + "reward_std": 0.00047072608140297234, + "rewards/perpo_ocr_edit_distance_reward": 0.9884883761405945, "step": 1216, "temperature": 0.9 }, { - "advantages": -2.3841859729145654e-05, - "completion_length": 433.5, - "delta_ref_entropy_loss": 0.04052734375, - "delta_ref_ppl": -0.03253173828125, - "entropy_loss": -0.0281982421875, - "epoch": 0.4868, - "grad_norm": 0.2736650217026748, - "k1_kl": 0.032470703125, - "k3_kl": 0.0187835693359375, - "kimi_kl": 0.054901123046875, - "learning_rate": 2.566e-07, - "loss": 0.0008, - "ppl": 0.011474609375, - "reward": 0.9844862520694733, - "reward_std": 0.00021791583276353776, - "rewards/perpo_ocr_edit_distance_reward": 0.9844862520694733, + "advantages": -4.7700748837087303e-05, + "completion_length": 1468.0, + "delta_ref_entropy_loss": 0.0299072265625, + "delta_ref_ppl": -0.0296630859375, + "entropy_loss": -0.0830078125, + "epoch": 0.2434, + "grad_norm": 2.4242714982831246, + "k1_kl": 0.02978515625, + "k3_kl": 0.0223388671875, + "kimi_kl": 0.04296875, + "learning_rate": 3.783e-07, + "loss": 0.0009, + "ppl": 0.0498046875, + "reward": 0.9841014742851257, + "reward_std": 0.0009709295118227601, + "rewards/perpo_ocr_edit_distance_reward": 0.9841015338897705, "step": 1217, "temperature": 0.9 }, { - "advantages": -1.743861685099546e-05, - "completion_length": 780.5, - "delta_ref_entropy_loss": 0.0345458984375, - "delta_ref_ppl": -0.039794921875, - "entropy_loss": -0.0662841796875, - "epoch": 0.4872, - "grad_norm": 1.8653254570062359, - "k1_kl": 0.0396728515625, - "k3_kl": 0.02581787109375, - "kimi_kl": 0.0787353515625, - "learning_rate": 2.564e-07, - "loss": 0.0011, - "ppl": 0.0345458984375, - "reward": 0.8128836452960968, - "reward_std": 0.1444085444090888, - "rewards/perpo_ocr_edit_distance_reward": 0.8128837049007416, + "advantages": 1.1235476449655835e-05, + "completion_length": 1558.0, + "delta_ref_entropy_loss": 0.05126953125, + "delta_ref_ppl": -0.048583984375, + "entropy_loss": -0.12890625, + "epoch": 0.2436, + "grad_norm": 9.785407175437607, + "k1_kl": 0.048583984375, + "k3_kl": 0.0311279296875, + "kimi_kl": 0.055419921875, + "learning_rate": 3.782e-07, + "loss": 0.0012, + "ppl": 0.07470703125, + "reward": 0.9286917448043823, + "reward_std": 0.004447154700756073, + "rewards/perpo_ocr_edit_distance_reward": 0.9286917448043823, "step": 1218, "temperature": 0.9 }, { - "advantages": -1.7029897207976319e-06, - "completion_length": 567.5, - "delta_ref_entropy_loss": 0.0831298828125, - "delta_ref_ppl": -0.04168701171875, - "entropy_loss": -0.0611572265625, - "epoch": 0.4876, - "grad_norm": 0.7835724381940701, - "k1_kl": 0.04168701171875, - "k3_kl": 0.02099609375, - "kimi_kl": 0.0345458984375, - "learning_rate": 2.562e-07, - "loss": 0.0008, - "ppl": 0.03369140625, - "reward": 0.8570675253868103, - "reward_std": 0.0029247350757941604, - "rewards/perpo_ocr_edit_distance_reward": 0.8570675551891327, + "advantages": -2.7247838829680404e-07, + "completion_length": 1288.0, + "delta_ref_entropy_loss": 0.09033203125, + "delta_ref_ppl": -0.038818359375, + "entropy_loss": -0.169921875, + "epoch": 0.2438, + "grad_norm": 2584516.140739003, + "k1_kl": 0.038818359375, + "k3_kl": 61184.0, + "kimi_kl": 0.1474609375, + "learning_rate": 3.781e-07, + "loss": 2449.9402, + "ppl": 0.1005859375, + "reward": 0.8081893920898438, + "reward_std": 0.22156743705272675, + "rewards/perpo_ocr_edit_distance_reward": 0.8081894516944885, "step": 1219, "temperature": 0.9 }, { - "advantages": -0.0003105319092355785, - "completion_length": 481.5, - "delta_ref_entropy_loss": 0.0213623046875, - "delta_ref_ppl": -0.01336669921875, - "entropy_loss": -0.018768310546875, - "epoch": 0.488, - "grad_norm": 0.2758558643364747, - "k1_kl": 0.013336181640625, - "k3_kl": 0.0067901611328125, - "kimi_kl": 0.012786865234375, - "learning_rate": 2.56e-07, - "loss": 0.0006, - "ppl": 0.006927490234375, - "reward": 0.9731662571430206, - "reward_std": 0.0013116165064275265, - "rewards/perpo_ocr_edit_distance_reward": 0.9731664061546326, + "advantages": -0.00010106393892783672, + "completion_length": 599.0, + "delta_ref_entropy_loss": 0.053955078125, + "delta_ref_ppl": -0.046142578125, + "entropy_loss": -0.014892578125, + "epoch": 0.244, + "grad_norm": 0.36936555530935644, + "k1_kl": 0.046142578125, + "k3_kl": 0.0230712890625, + "kimi_kl": 0.05908203125, + "learning_rate": 3.7799999999999997e-07, + "loss": 0.001, + "ppl": 0.00677490234375, + "reward": 0.9962621927261353, + "reward_std": 0.00023703857732471079, + "rewards/perpo_ocr_edit_distance_reward": 0.9962623119354248, "step": 1220, "temperature": 0.9 }, { - "advantages": -0.0001386489257129142, - "completion_length": 272.5, - "delta_ref_entropy_loss": 0.0400390625, - "delta_ref_ppl": -0.091552734375, - "entropy_loss": -0.02484130859375, - "epoch": 0.4884, - "grad_norm": 1.0097980671148448, - "k1_kl": 0.091552734375, - "k3_kl": 0.0718994140625, - "kimi_kl": 0.37646484375, - "learning_rate": 2.558e-07, - "loss": 0.003, - "ppl": 0.008056640625, - "reward": 0.8556366562843323, - "reward_std": 0.00039097166154533625, - "rewards/perpo_ocr_edit_distance_reward": 0.8556367456912994, + "advantages": -6.726810397594818e-07, + "completion_length": 623.0, + "delta_ref_entropy_loss": 0.0322265625, + "delta_ref_ppl": -0.046630859375, + "entropy_loss": -0.0228271484375, + "epoch": 0.2442, + "grad_norm": 0.8084405953029049, + "k1_kl": 0.046630859375, + "k3_kl": 0.031982421875, + "kimi_kl": 0.0908203125, + "learning_rate": 3.779e-07, + "loss": 0.0013, + "ppl": 0.010009765625, + "reward": 0.9596273303031921, + "reward_std": 0.012295195832848549, + "rewards/perpo_ocr_edit_distance_reward": 0.9596273303031921, "step": 1221, "temperature": 0.9 }, { - "advantages": -9.682561877610851e-05, - "completion_length": 769.5, - "delta_ref_entropy_loss": 0.0201416015625, - "delta_ref_ppl": -0.02783203125, - "entropy_loss": -0.1192626953125, - "epoch": 0.4888, - "grad_norm": 0.9486054499635735, - "k1_kl": 0.027984619140625, - "k3_kl": 0.0211181640625, - "kimi_kl": 0.0445556640625, - "learning_rate": 2.556e-07, - "loss": 0.0009, - "ppl": 0.076324462890625, - "reward": 0.845055490732193, - "reward_std": 0.08531016424240079, - "rewards/perpo_ocr_edit_distance_reward": 0.8450555205345154, + "advantages": -4.938671054333099e-07, + "completion_length": 385.0, + "delta_ref_entropy_loss": 0.1240234375, + "delta_ref_ppl": -0.12255859375, + "entropy_loss": -0.03466796875, + "epoch": 0.2444, + "grad_norm": 1.020923177758475, + "k1_kl": 0.123046875, + "k3_kl": 0.06640625, + "kimi_kl": 0.169921875, + "learning_rate": 3.778e-07, + "loss": 0.0027, + "ppl": 0.0146484375, + "reward": 0.9378591775894165, + "reward_std": 0.15515314042568207, + "rewards/perpo_ocr_edit_distance_reward": 0.937859296798706, "step": 1222, "temperature": 0.9 }, { - "advantages": -8.510692623531213e-06, - "completion_length": 465.5, - "delta_ref_entropy_loss": 0.0770263671875, - "delta_ref_ppl": -0.1448974609375, - "entropy_loss": -0.040283203125, - "epoch": 0.4892, - "grad_norm": 2.679495710093818, - "k1_kl": 0.1448974609375, - "k3_kl": 0.098785400390625, - "kimi_kl": 0.282318115234375, - "learning_rate": 2.554e-07, - "loss": 0.004, - "ppl": 0.021331787109375, - "reward": 0.980562299489975, - "reward_std": 0.011514120269566774, - "rewards/perpo_ocr_edit_distance_reward": 0.9805623292922974, + "advantages": -0.00013597097131423652, + "completion_length": 368.0, + "delta_ref_entropy_loss": 0.050048828125, + "delta_ref_ppl": -0.0654296875, + "entropy_loss": -0.0198974609375, + "epoch": 0.2446, + "grad_norm": 0.5885399338783008, + "k1_kl": 0.0654296875, + "k3_kl": 0.044189453125, + "kimi_kl": 0.150390625, + "learning_rate": 3.7769999999999995e-07, + "loss": 0.0019, + "ppl": 0.00732421875, + "reward": 0.9970278739929199, + "reward_std": 0.00033836282091215253, + "rewards/perpo_ocr_edit_distance_reward": 0.9970279932022095, "step": 1223, "temperature": 0.9 }, { - "advantages": -9.234462595486548e-06, - "completion_length": 433.5, - "delta_ref_entropy_loss": 0.061279296875, + "advantages": -0.00013623919221572578, + "completion_length": 1104.0, + "delta_ref_entropy_loss": 0.06591796875, "delta_ref_ppl": -0.050537109375, - "entropy_loss": -0.040771484375, - "epoch": 0.4896, - "grad_norm": 0.48544540930348323, - "k1_kl": 0.0504150390625, - "k3_kl": 0.02960205078125, - "kimi_kl": 0.087646484375, - "learning_rate": 2.5519999999999996e-07, + "entropy_loss": -0.0296630859375, + "epoch": 0.2448, + "grad_norm": 0.530613958621193, + "k1_kl": 0.050537109375, + "k3_kl": 0.0255126953125, + "kimi_kl": 0.062255859375, + "learning_rate": 3.776e-07, "loss": 0.0012, - "ppl": 0.01776123046875, - "reward": 0.9599877297878265, - "reward_std": 0.0024817066732794046, - "rewards/perpo_ocr_edit_distance_reward": 0.9599878191947937, + "ppl": 0.0118408203125, + "reward": 0.9831113219261169, + "reward_std": 0.00046243442920967937, + "rewards/perpo_ocr_edit_distance_reward": 0.9831113815307617, "step": 1224, "temperature": 0.9 }, { - "advantages": -5.94237062578884e-05, - "completion_length": 850.0, - "delta_ref_entropy_loss": 0.01568603515625, - "delta_ref_ppl": -0.0174560546875, - "entropy_loss": -0.023193359375, - "epoch": 0.49, - "grad_norm": 3.440154841453137, - "k1_kl": 0.01744842529296875, - "k3_kl": 0.0269775390625, - "kimi_kl": 0.0390167236328125, - "learning_rate": 2.55e-07, - "loss": 0.0011, - "ppl": 0.011138916015625, - "reward": 0.8980870544910431, - "reward_std": 0.16605687004630454, - "rewards/perpo_ocr_edit_distance_reward": 0.8980871140956879, + "advantages": -2.5544848085701233e-06, + "completion_length": 534.0, + "delta_ref_entropy_loss": 0.1640625, + "delta_ref_ppl": -0.09521484375, + "entropy_loss": -0.09521484375, + "epoch": 0.245, + "grad_norm": 1.3803345548978927, + "k1_kl": 0.09521484375, + "k3_kl": 0.04931640625, + "kimi_kl": 0.1298828125, + "learning_rate": 3.775e-07, + "loss": 0.002, + "ppl": 0.042236328125, + "reward": 0.8986547589302063, + "reward_std": 0.013164565898478031, + "rewards/perpo_ocr_edit_distance_reward": 0.8986548185348511, "step": 1225, "temperature": 0.9 }, { - "advantages": -1.1495181883702799e-05, - "completion_length": 767.5, - "delta_ref_entropy_loss": 0.04296875, - "delta_ref_ppl": -0.03759765625, - "entropy_loss": -0.0762939453125, - "epoch": 0.4904, - "grad_norm": 0.8483588251129469, - "k1_kl": 0.0377197265625, - "k3_kl": 0.0318603515625, - "kimi_kl": 0.06982421875, - "learning_rate": 2.5480000000000003e-07, - "loss": 0.0013, - "ppl": 0.041259765625, - "reward": 0.9160571098327637, - "reward_std": 0.08826605585636571, - "rewards/perpo_ocr_edit_distance_reward": 0.9160570800304413, + "advantages": 1.088210592570249e-05, + "completion_length": 132.0, + "delta_ref_entropy_loss": 0.039306640625, + "delta_ref_ppl": -0.2158203125, + "entropy_loss": -0.0220947265625, + "epoch": 0.2452, + "grad_norm": 1.708022961750625, + "k1_kl": 0.2158203125, + "k3_kl": 0.169921875, + "kimi_kl": 0.73046875, + "learning_rate": 3.774e-07, + "loss": 0.0068, + "ppl": 0.0086669921875, + "reward": 0.9847357273101807, + "reward_std": 0.001464441535063088, + "rewards/perpo_ocr_edit_distance_reward": 0.9847357869148254, "step": 1226, "temperature": 0.9 }, { - "advantages": -2.307551312696887e-06, - "completion_length": 513.5, - "delta_ref_entropy_loss": 0.03253173828125, - "delta_ref_ppl": -0.0340576171875, - "entropy_loss": -0.03131103515625, - "epoch": 0.4908, - "grad_norm": 0.47751704896367175, - "k1_kl": 0.0341796875, - "k3_kl": 0.021759033203125, - "kimi_kl": 0.0565185546875, - "learning_rate": 2.5459999999999996e-07, - "loss": 0.0009, - "ppl": 0.01507568359375, - "reward": 0.9036045372486115, - "reward_std": 0.007334825582802296, - "rewards/perpo_ocr_edit_distance_reward": 0.9036045670509338, + "advantages": -2.641337414388545e-05, + "completion_length": 271.0, + "delta_ref_entropy_loss": 0.0791015625, + "delta_ref_ppl": -0.103515625, + "entropy_loss": -0.0303955078125, + "epoch": 0.2454, + "grad_norm": 1.0676540927889018, + "k1_kl": 0.103515625, + "k3_kl": 0.07080078125, + "kimi_kl": 0.259765625, + "learning_rate": 3.773e-07, + "loss": 0.0029, + "ppl": 0.00970458984375, + "reward": 0.9751222133636475, + "reward_std": 0.0015113505069166422, + "rewards/perpo_ocr_edit_distance_reward": 0.975122332572937, "step": 1227, "temperature": 0.9 }, { - "advantages": -1.7574856201463263e-05, - "completion_length": 677.5, - "delta_ref_entropy_loss": 0.06494140625, - "delta_ref_ppl": -0.0421142578125, - "entropy_loss": -0.05517578125, - "epoch": 0.4912, - "grad_norm": 1.0891921895664118, - "k1_kl": 0.042236328125, - "k3_kl": 0.0281982421875, - "kimi_kl": 0.0947265625, - "learning_rate": 2.544e-07, - "loss": 0.0011, - "ppl": 0.0311126708984375, - "reward": 0.9857894778251648, - "reward_std": 0.0026703497860580683, - "rewards/perpo_ocr_edit_distance_reward": 0.9857895374298096, + "advantages": -4.9352649512002245e-05, + "completion_length": 598.0, + "delta_ref_entropy_loss": 0.0966796875, + "delta_ref_ppl": -0.09619140625, + "entropy_loss": -0.026611328125, + "epoch": 0.2456, + "grad_norm": 0.5063525946483133, + "k1_kl": 0.09619140625, + "k3_kl": 0.057861328125, + "kimi_kl": 0.1884765625, + "learning_rate": 3.7719999999999996e-07, + "loss": 0.0024, + "ppl": 0.01123046875, + "reward": 0.988297164440155, + "reward_std": 0.00041769695235416293, + "rewards/perpo_ocr_edit_distance_reward": 0.9882972240447998, "step": 1228, "temperature": 0.9 }, { - "advantages": -0.00010188988562731538, - "completion_length": 488.0, - "delta_ref_entropy_loss": 0.05035400390625, - "delta_ref_ppl": -0.0372314453125, - "entropy_loss": -0.03656005859375, - "epoch": 0.4916, - "grad_norm": 0.42486641347910187, - "k1_kl": 0.0372314453125, - "k3_kl": 0.0225830078125, - "kimi_kl": 0.12103271484375, - "learning_rate": 2.542e-07, - "loss": 0.001, - "ppl": 0.0158843994140625, - "reward": 0.9828769862651825, - "reward_std": 0.0007829726964700967, - "rewards/perpo_ocr_edit_distance_reward": 0.9828770458698273, + "advantages": -0.0005960464477539062, + "completion_length": 42.0, + "delta_ref_entropy_loss": 0.19921875, + "delta_ref_ppl": -0.546875, + "entropy_loss": -0.04248046875, + "epoch": 0.2458, + "grad_norm": 0.031011376422896322, + "k1_kl": 0.55078125, + "k3_kl": 0.421875, + "kimi_kl": 1.6171875, + "learning_rate": 3.7709999999999996e-07, + "loss": 0.0176, + "ppl": 0.007476806640625, + "reward": 0.9160838723182678, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9160839915275574, "step": 1229, "temperature": 0.9 }, { - "advantages": -7.019937174845836e-05, - "completion_length": 807.5, - "delta_ref_entropy_loss": 0.030029296875, - "delta_ref_ppl": -0.027862548828125, - "entropy_loss": -0.0260009765625, - "epoch": 0.492, - "grad_norm": 0.7107076095259591, - "k1_kl": 0.02783203125, - "k3_kl": 0.01763916015625, - "kimi_kl": 0.04058837890625, - "learning_rate": 2.5399999999999997e-07, - "loss": 0.0008, - "ppl": 0.01318359375, - "reward": 0.9970032572746277, - "reward_std": 0.0009765081631485373, - "rewards/perpo_ocr_edit_distance_reward": 0.9970032870769501, + "advantages": -2.3382051949738525e-05, + "completion_length": 444.0, + "delta_ref_entropy_loss": 0.07861328125, + "delta_ref_ppl": -0.091796875, + "entropy_loss": -0.0263671875, + "epoch": 0.246, + "grad_norm": 0.7949673756292807, + "k1_kl": 0.091796875, + "k3_kl": 0.05810546875, + "kimi_kl": 0.1748046875, + "learning_rate": 3.77e-07, + "loss": 0.0023, + "ppl": 0.0093994140625, + "reward": 0.9912382960319519, + "reward_std": 0.0017195778200402856, + "rewards/perpo_ocr_edit_distance_reward": 0.9912383556365967, "step": 1230, "temperature": 0.9 }, { - "advantages": -8.84022074387758e-05, - "completion_length": 540.0, - "delta_ref_entropy_loss": 0.043212890625, - "delta_ref_ppl": -0.0703125, - "entropy_loss": -0.03369140625, - "epoch": 0.4924, - "grad_norm": 0.7677054844357681, - "k1_kl": 0.070556640625, - "k3_kl": 0.0506591796875, - "kimi_kl": 0.1591796875, - "learning_rate": 2.538e-07, - "loss": 0.0021, - "ppl": 0.015411376953125, - "reward": 0.9744448959827423, - "reward_std": 0.004159635092946701, - "rewards/perpo_ocr_edit_distance_reward": 0.9744449555873871, + "advantages": -5.21966421729303e-06, + "completion_length": 1459.0, + "delta_ref_entropy_loss": 0.11376953125, + "delta_ref_ppl": -0.07666015625, + "entropy_loss": -0.1943359375, + "epoch": 0.2462, + "grad_norm": 2.2332462146619037, + "k1_kl": 0.07666015625, + "k3_kl": 0.047119140625, + "kimi_kl": 0.078125, + "learning_rate": 3.769e-07, + "loss": 0.0019, + "ppl": 0.109375, + "reward": 0.9043189883232117, + "reward_std": 0.008079434745013714, + "rewards/perpo_ocr_edit_distance_reward": 0.9043189883232117, "step": 1231, "temperature": 0.9 }, { - "advantages": -6.978001238167053e-06, - "completion_length": 1018.5, - "delta_ref_entropy_loss": 0.14208984375, - "delta_ref_ppl": -0.10888671875, - "entropy_loss": -0.56689453125, - "epoch": 0.4928, - "grad_norm": 37.88521342334898, - "k1_kl": 0.10986328125, - "k3_kl": 0.79052734375, - "kimi_kl": 0.235595703125, - "learning_rate": 2.536e-07, - "loss": 0.0318, - "ppl": 0.46923828125, - "reward": 0.850172370672226, - "reward_std": 0.010115463752299547, - "rewards/perpo_ocr_edit_distance_reward": 0.8501724302768707, + "advantages": -7.95125961303711e-05, + "completion_length": 1082.0, + "delta_ref_entropy_loss": 0.07421875, + "delta_ref_ppl": -0.06103515625, + "entropy_loss": -0.0625, + "epoch": 0.2464, + "grad_norm": 0.7821730511026352, + "k1_kl": 0.06103515625, + "k3_kl": 0.0283203125, + "kimi_kl": 0.06103515625, + "learning_rate": 3.768e-07, + "loss": 0.0012, + "ppl": 0.02978515625, + "reward": 0.9926018118858337, + "reward_std": 0.000863697670865804, + "rewards/perpo_ocr_edit_distance_reward": 0.9926019310951233, "step": 1232, "temperature": 0.9 }, { - "advantages": -5.361437979445327e-05, - "completion_length": 486.5, - "delta_ref_entropy_loss": 0.02923583984375, - "delta_ref_ppl": -0.0234375, - "entropy_loss": -0.0240478515625, - "epoch": 0.4932, - "grad_norm": 0.3875547608271041, - "k1_kl": 0.0234375, - "k3_kl": 0.013641357421875, - "kimi_kl": 0.0341796875, - "learning_rate": 2.534e-07, - "loss": 0.0006, - "ppl": 0.01336669921875, - "reward": 0.9781894087791443, - "reward_std": 0.0007500917126890272, - "rewards/perpo_ocr_edit_distance_reward": 0.9781894981861115, + "advantages": -3.6967652704333887e-05, + "completion_length": 891.0, + "delta_ref_entropy_loss": 0.0966796875, + "delta_ref_ppl": -0.072265625, + "entropy_loss": -0.04443359375, + "epoch": 0.2466, + "grad_norm": 0.7755842946553664, + "k1_kl": 0.072265625, + "k3_kl": 0.0390625, + "kimi_kl": 0.11767578125, + "learning_rate": 3.767e-07, + "loss": 0.0016, + "ppl": 0.0218505859375, + "reward": 0.9505772590637207, + "reward_std": 0.0015122528420761228, + "rewards/perpo_ocr_edit_distance_reward": 0.9505772590637207, "step": 1233, "temperature": 0.9 }, { - "advantages": -0.00011946474114665762, - "completion_length": 540.0, - "delta_ref_entropy_loss": 0.052734375, - "delta_ref_ppl": -0.030670166015625, - "entropy_loss": -0.02978515625, - "epoch": 0.4936, - "grad_norm": 0.5390562566192211, - "k1_kl": 0.030670166015625, - "k3_kl": 0.0143280029296875, - "kimi_kl": 0.0385894775390625, - "learning_rate": 2.5319999999999996e-07, - "loss": 0.0007, - "ppl": 0.013458251953125, - "reward": 0.9963458180427551, - "reward_std": 0.00044700295256916434, - "rewards/perpo_ocr_edit_distance_reward": 0.9963459074497223, + "advantages": -5.960464477539062e-07, + "completion_length": 1015.0, + "delta_ref_entropy_loss": 0.0859375, + "delta_ref_ppl": -0.07177734375, + "entropy_loss": -0.041748046875, + "epoch": 0.2468, + "grad_norm": 1.2675643365406284, + "k1_kl": 0.072265625, + "k3_kl": 0.037109375, + "kimi_kl": 0.07958984375, + "learning_rate": 3.7659999999999997e-07, + "loss": 0.0015, + "ppl": 0.0169677734375, + "reward": 0.8939085602760315, + "reward_std": 0.12447692453861237, + "rewards/perpo_ocr_edit_distance_reward": 0.893908679485321, "step": 1234, "temperature": 0.9 }, { - "advantages": -8.813398289930774e-05, - "completion_length": 980.5, - "delta_ref_entropy_loss": 0.02117919921875, - "delta_ref_ppl": -0.011505126953125, - "entropy_loss": -0.02490234375, - "epoch": 0.494, - "grad_norm": 0.40755996089552515, - "k1_kl": 0.011505126953125, - "k3_kl": 0.00565338134765625, - "kimi_kl": 0.011260986328125, - "learning_rate": 2.53e-07, - "loss": 0.0003, - "ppl": 0.0123291015625, - "reward": 0.9919647574424744, - "reward_std": 0.0006792779095121659, - "rewards/perpo_ocr_edit_distance_reward": 0.9919647872447968, + "advantages": -3.7670135498046875e-05, + "completion_length": 429.0, + "delta_ref_entropy_loss": 0.0771484375, + "delta_ref_ppl": -0.1318359375, + "entropy_loss": -0.047119140625, + "epoch": 0.247, + "grad_norm": 8.682039854961637, + "k1_kl": 0.1318359375, + "k3_kl": 0.087890625, + "kimi_kl": 0.318359375, + "learning_rate": 3.7649999999999996e-07, + "loss": 0.0036, + "ppl": 0.037353515625, + "reward": 0.9859550595283508, + "reward_std": 0.001256221323274076, + "rewards/perpo_ocr_edit_distance_reward": 0.9859551191329956, "step": 1235, "temperature": 0.9 }, { - "advantages": -6.35853848507395e-05, - "completion_length": 489.5, - "delta_ref_entropy_loss": 0.0313720703125, - "delta_ref_ppl": -0.033355712890625, - "entropy_loss": -0.02569580078125, - "epoch": 0.4944, - "grad_norm": 1.072299214098145, - "k1_kl": 0.0334625244140625, - "k3_kl": 0.02191162109375, - "kimi_kl": 0.05108642578125, - "learning_rate": 2.528e-07, - "loss": 0.0009, - "ppl": 0.011199951171875, - "reward": 0.9993175268173218, - "reward_std": 0.000887710164533928, - "rewards/perpo_ocr_edit_distance_reward": 0.9993175864219666, + "advantages": 0.0, + "completion_length": 326.0, + "delta_ref_entropy_loss": 0.265625, + "delta_ref_ppl": -0.1728515625, + "entropy_loss": -0.30078125, + "epoch": 0.2472, + "grad_norm": 3.3604525004560073, + "k1_kl": 0.1728515625, + "k3_kl": 0.091796875, + "kimi_kl": 0.263671875, + "learning_rate": 3.764e-07, + "loss": 0.0037, + "ppl": 0.1630859375, + "reward": 0.5048412084579468, + "reward_std": 0.012056994251906872, + "rewards/perpo_ocr_edit_distance_reward": 0.5048412680625916, "step": 1236, "temperature": 0.9 }, { - "advantages": -0.00011175445979461074, - "completion_length": 526.5, - "delta_ref_entropy_loss": 0.07464599609375, - "delta_ref_ppl": -0.132598876953125, - "entropy_loss": -0.0401611328125, - "epoch": 0.4948, - "grad_norm": 0.16695329983672896, - "k1_kl": 0.13262939453125, - "k3_kl": 0.093170166015625, - "kimi_kl": 0.283447265625, - "learning_rate": 2.5259999999999997e-07, - "loss": 0.0039, - "ppl": 0.0245361328125, - "reward": 0.9998582303524017, - "reward_std": 8.333644655067474e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9998582899570465, + "advantages": 2.278600550198462e-05, + "completion_length": 303.0, + "delta_ref_entropy_loss": 0.08349609375, + "delta_ref_ppl": -0.1552734375, + "entropy_loss": -0.08251953125, + "epoch": 0.2474, + "grad_norm": 1.2279915150045164, + "k1_kl": 0.154296875, + "k3_kl": 0.10888671875, + "kimi_kl": 0.392578125, + "learning_rate": 3.763e-07, + "loss": 0.0043, + "ppl": 0.04248046875, + "reward": 0.9845990538597107, + "reward_std": 0.0017679949523881078, + "rewards/perpo_ocr_edit_distance_reward": 0.9845989942550659, "step": 1237, "temperature": 0.9 }, { - "advantages": -0.00011556702429516008, - "completion_length": 723.5, - "delta_ref_entropy_loss": 0.053680419921875, - "delta_ref_ppl": -0.037445068359375, - "entropy_loss": -0.06353759765625, - "epoch": 0.4952, - "grad_norm": 1.838109944003741, - "k1_kl": 0.037445068359375, - "k3_kl": 0.0211181640625, - "kimi_kl": 0.05767822265625, - "learning_rate": 2.524e-07, - "loss": 0.001, - "ppl": 0.032012939453125, - "reward": 0.9923067986965179, - "reward_std": 0.001454820558137726, - "rewards/perpo_ocr_edit_distance_reward": 0.9923068583011627, + "advantages": -7.726465264568105e-05, + "completion_length": 290.0, + "delta_ref_entropy_loss": 0.083984375, + "delta_ref_ppl": -0.06396484375, + "entropy_loss": -0.02392578125, + "epoch": 0.2476, + "grad_norm": 0.6677788521279522, + "k1_kl": 0.06396484375, + "k3_kl": 0.034423828125, + "kimi_kl": 0.08154296875, + "learning_rate": 3.7619999999999994e-07, + "loss": 0.0015, + "ppl": 0.00909423828125, + "reward": 0.9969526529312134, + "reward_std": 0.0006713260081596673, + "rewards/perpo_ocr_edit_distance_reward": 0.9969527721405029, "step": 1238, "temperature": 0.9 }, { - "advantages": -5.158782278158469e-05, - "completion_length": 688.5, - "delta_ref_entropy_loss": 0.035888671875, - "delta_ref_ppl": -0.036376953125, - "entropy_loss": -0.0301513671875, - "epoch": 0.4956, - "grad_norm": 0.4444152207433388, - "k1_kl": 0.0364990234375, - "k3_kl": 0.0213623046875, - "kimi_kl": 0.0516357421875, - "learning_rate": 2.5219999999999994e-07, - "loss": 0.0009, - "ppl": 0.014495849609375, - "reward": 0.9865752160549164, - "reward_std": 0.0014536292874254286, - "rewards/perpo_ocr_edit_distance_reward": 0.9865753054618835, + "advantages": -1.8273081877850927e-05, + "completion_length": 2021.0, + "delta_ref_entropy_loss": 0.0284423828125, + "delta_ref_ppl": -0.03857421875, + "entropy_loss": -0.068359375, + "epoch": 0.2478, + "grad_norm": 11.176700320731923, + "k1_kl": 0.03857421875, + "k3_kl": 0.1328125, + "kimi_kl": 0.0966796875, + "learning_rate": 3.761e-07, + "loss": 0.0053, + "ppl": 0.047119140625, + "reward": 0.9881300926208496, + "reward_std": 0.004095882643014193, + "rewards/perpo_ocr_edit_distance_reward": 0.9881302118301392, "step": 1239, "temperature": 0.9 }, { - "advantages": -0.0002999561174874543, - "completion_length": 309.0, - "delta_ref_entropy_loss": 0.03082275390625, - "delta_ref_ppl": -0.0499267578125, - "entropy_loss": -0.027313232421875, - "epoch": 0.496, - "grad_norm": 1.3998073505694204, - "k1_kl": 0.0499267578125, - "k3_kl": 0.03277587890625, - "kimi_kl": 0.08544921875, - "learning_rate": 2.52e-07, - "loss": 0.0016, - "ppl": 0.01202392578125, - "reward": 0.9914600551128387, - "reward_std": 0.0032506436109542847, - "rewards/perpo_ocr_edit_distance_reward": 0.9914601147174835, - "step": 1240, - "temperature": 0.9 - }, - { - "advantages": -0.000170111660281691, - "completion_length": 601.0, - "delta_ref_entropy_loss": 0.03076171875, - "delta_ref_ppl": -0.0313262939453125, - "entropy_loss": -0.05059814453125, - "epoch": 0.4964, - "grad_norm": 0.9080338225761052, - "k1_kl": 0.0313262939453125, - "k3_kl": 0.01959228515625, - "kimi_kl": 0.048126220703125, - "learning_rate": 2.518e-07, - "loss": 0.001, - "ppl": 0.0259552001953125, - "reward": 0.9992011785507202, - "reward_std": 0.0007267611363204196, - "rewards/perpo_ocr_edit_distance_reward": 0.9992012083530426, + "advantages": -0.0005960464477539062, + "completion_length": 263.0, + "delta_ref_entropy_loss": 0.0986328125, + "delta_ref_ppl": -0.072265625, + "entropy_loss": -0.0216064453125, + "epoch": 0.248, + "grad_norm": 0.011195920385250184, + "k1_kl": 0.07177734375, + "k3_kl": 0.0341796875, + "kimi_kl": 0.06982421875, + "learning_rate": 3.76e-07, + "loss": 0.002, + "ppl": 0.00384521484375, + "reward": 0.996363639831543, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9963636994361877, + "step": 1240, + "temperature": 0.9 + }, + { + "advantages": -2.469335413479712e-06, + "completion_length": 466.0, + "delta_ref_entropy_loss": 0.06201171875, + "delta_ref_ppl": -0.09326171875, + "entropy_loss": -0.032470703125, + "epoch": 0.2482, + "grad_norm": 1.0221547875680086, + "k1_kl": 0.09326171875, + "k3_kl": 0.06103515625, + "kimi_kl": 0.1865234375, + "learning_rate": 3.7589999999999997e-07, + "loss": 0.0024, + "ppl": 0.01300048828125, + "reward": 0.9894234538078308, + "reward_std": 0.0033665925730019808, + "rewards/perpo_ocr_edit_distance_reward": 0.9894235134124756, "step": 1241, "temperature": 0.9 }, { - "advantages": -4.588706204788906e-05, - "completion_length": 595.5, - "delta_ref_entropy_loss": 0.036376953125, - "delta_ref_ppl": -0.0548095703125, - "entropy_loss": -0.02923583984375, - "epoch": 0.4968, - "grad_norm": 0.38082946310051646, - "k1_kl": 0.0548095703125, - "k3_kl": 0.0404052734375, - "kimi_kl": 0.23095703125, - "learning_rate": 2.516e-07, - "loss": 0.0017, - "ppl": 0.014678955078125, - "reward": 0.8959751427173615, - "reward_std": 0.0007462573412340134, - "rewards/perpo_ocr_edit_distance_reward": 0.8959752023220062, + "advantages": -0.0005960464477539062, + "completion_length": 507.0, + "delta_ref_entropy_loss": 0.047119140625, + "delta_ref_ppl": -0.03564453125, + "entropy_loss": -0.01129150390625, + "epoch": 0.2484, + "grad_norm": 0.001967540727373408, + "k1_kl": 0.03564453125, + "k3_kl": 0.019775390625, + "kimi_kl": 0.0625, + "learning_rate": 3.758e-07, + "loss": 0.0014, + "ppl": 0.00180816650390625, + "reward": 0.995276927947998, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9952769875526428, "step": 1242, "temperature": 0.9 }, { - "advantages": -2.1287374693201855e-05, - "completion_length": 488.0, - "delta_ref_entropy_loss": 0.0399169921875, - "delta_ref_ppl": -0.03948974609375, - "entropy_loss": -0.0545654296875, - "epoch": 0.4972, - "grad_norm": 0.4289423468222607, - "k1_kl": 0.03948974609375, - "k3_kl": 0.02783203125, - "kimi_kl": 0.095794677734375, - "learning_rate": 2.514e-07, + "advantages": -0.00010598557855701074, + "completion_length": 745.0, + "delta_ref_entropy_loss": 0.06591796875, + "delta_ref_ppl": -0.053466796875, + "entropy_loss": -0.03369140625, + "epoch": 0.2486, + "grad_norm": 0.2761430174801869, + "k1_kl": 0.053466796875, + "k3_kl": 0.0250244140625, + "kimi_kl": 0.0556640625, + "learning_rate": 3.7569999999999996e-07, "loss": 0.0011, - "ppl": 0.027313232421875, - "reward": 0.9966376721858978, - "reward_std": 0.0008494235808029771, - "rewards/perpo_ocr_edit_distance_reward": 0.996637761592865, + "ppl": 0.01202392578125, + "reward": 0.9886696338653564, + "reward_std": 0.0005426692659966648, + "rewards/perpo_ocr_edit_distance_reward": 0.9886696934700012, "step": 1243, "temperature": 0.9 }, { - "advantages": -2.118945258189342e-05, - "completion_length": 367.5, - "delta_ref_entropy_loss": 0.0484619140625, - "delta_ref_ppl": -0.05279541015625, - "entropy_loss": -0.0439453125, - "epoch": 0.4976, - "grad_norm": 1.003875458225098, - "k1_kl": 0.05279541015625, - "k3_kl": 0.0379638671875, - "kimi_kl": 0.183837890625, - "learning_rate": 2.5119999999999997e-07, - "loss": 0.0015, - "ppl": 0.021820068359375, - "reward": 0.9944864213466644, - "reward_std": 0.001564668258652091, - "rewards/perpo_ocr_edit_distance_reward": 0.9944865107536316, + "advantages": -8.299521141452715e-05, + "completion_length": 337.0, + "delta_ref_entropy_loss": 0.11376953125, + "delta_ref_ppl": -0.14453125, + "entropy_loss": -0.035888671875, + "epoch": 0.2488, + "grad_norm": 0.704150023580152, + "k1_kl": 0.14453125, + "k3_kl": 0.091796875, + "kimi_kl": 0.265625, + "learning_rate": 3.7559999999999995e-07, + "loss": 0.0038, + "ppl": 0.0166015625, + "reward": 0.9946994185447693, + "reward_std": 0.00041295913979411125, + "rewards/perpo_ocr_edit_distance_reward": 0.9946995377540588, "step": 1244, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 352.5, - "delta_ref_entropy_loss": 0.02777099609375, - "delta_ref_ppl": -0.020172119140625, - "entropy_loss": -0.01678466796875, - "epoch": 0.498, - "grad_norm": 0.3511745203988077, - "k1_kl": 0.020111083984375, - "k3_kl": 0.013397216796875, - "kimi_kl": 0.034576416015625, - "learning_rate": 2.51e-07, - "loss": 0.0005, - "ppl": 0.0086212158203125, - "reward": 0.9997223019599915, - "reward_std": 0.0003463511820882559, - "rewards/perpo_ocr_edit_distance_reward": 0.9997223019599915, + "advantages": -7.561275197076611e-06, + "completion_length": 419.0, + "delta_ref_entropy_loss": 0.10498046875, + "delta_ref_ppl": -0.08154296875, + "entropy_loss": -0.04833984375, + "epoch": 0.249, + "grad_norm": 1.7016368724136717, + "k1_kl": 0.08154296875, + "k3_kl": 0.048828125, + "kimi_kl": 0.115234375, + "learning_rate": 3.755e-07, + "loss": 0.002, + "ppl": 0.01953125, + "reward": 0.9334786534309387, + "reward_std": 0.012260179035365582, + "rewards/perpo_ocr_edit_distance_reward": 0.933478832244873, "step": 1245, "temperature": 0.9 }, { - "advantages": 1.0290316822647583e-05, - "completion_length": 429.5, - "delta_ref_entropy_loss": 0.046875, - "delta_ref_ppl": -0.25506591796875, - "entropy_loss": -0.13623046875, - "epoch": 0.4984, - "grad_norm": 6.764044679080464, - "k1_kl": 0.25408935546875, - "k3_kl": 0.200286865234375, - "kimi_kl": 0.66265869140625, - "learning_rate": 2.508e-07, - "loss": 0.008, - "ppl": 0.060882568359375, - "reward": 0.64910988509655, - "reward_std": 0.0991116976365447, - "rewards/perpo_ocr_edit_distance_reward": 0.6491098552942276, + "advantages": -2.081053753499873e-05, + "completion_length": 950.0, + "delta_ref_entropy_loss": 0.050537109375, + "delta_ref_ppl": -0.054931640625, + "entropy_loss": -0.04736328125, + "epoch": 0.2492, + "grad_norm": 2.0275837016646374, + "k1_kl": 0.05517578125, + "k3_kl": 0.031494140625, + "kimi_kl": 0.0888671875, + "learning_rate": 3.754e-07, + "loss": 0.0013, + "ppl": 0.0233154296875, + "reward": 0.9958279728889465, + "reward_std": 0.001127695431932807, + "rewards/perpo_ocr_edit_distance_reward": 0.9958279728889465, "step": 1246, "temperature": 0.9 }, { - "advantages": -3.4434455301379785e-05, - "completion_length": 353.0, - "delta_ref_entropy_loss": 0.03765869140625, - "delta_ref_ppl": -0.02288818359375, - "entropy_loss": -0.02490234375, - "epoch": 0.4988, - "grad_norm": 0.4872079054072862, - "k1_kl": 0.0228271484375, - "k3_kl": 0.011383056640625, - "kimi_kl": 0.0198974609375, - "learning_rate": 2.506e-07, - "loss": 0.0005, - "ppl": 0.013427734375, - "reward": 0.9990514814853668, - "reward_std": 0.0001971659658011049, - "rewards/perpo_ocr_edit_distance_reward": 0.9990515112876892, + "advantages": -7.782664397382177e-06, + "completion_length": 773.0, + "delta_ref_entropy_loss": 0.08154296875, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.080078125, + "epoch": 0.2494, + "grad_norm": 2.531974827155189, + "k1_kl": 0.06689453125, + "k3_kl": 0.0419921875, + "kimi_kl": 0.080078125, + "learning_rate": 3.753e-07, + "loss": 0.0017, + "ppl": 0.044189453125, + "reward": 0.8715924620628357, + "reward_std": 0.007543137297034264, + "rewards/perpo_ocr_edit_distance_reward": 0.8715925216674805, "step": 1247, "temperature": 0.9 }, { - "advantages": -2.4012157155084424e-05, - "completion_length": 448.5, - "delta_ref_entropy_loss": 0.04010009765625, - "delta_ref_ppl": -0.03558349609375, - "entropy_loss": -0.027618408203125, - "epoch": 0.4992, - "grad_norm": 2.1473944769122917, - "k1_kl": 0.03558349609375, - "k3_kl": 0.02166748046875, - "kimi_kl": 0.0584716796875, - "learning_rate": 2.504e-07, - "loss": 0.0009, - "ppl": 0.011474609375, - "reward": 0.9969954788684845, - "reward_std": 0.0009256084449589252, - "rewards/perpo_ocr_edit_distance_reward": 0.9969955384731293, + "advantages": -2.271788616781123e-05, + "completion_length": 197.0, + "delta_ref_entropy_loss": 0.1259765625, + "delta_ref_ppl": -0.23046875, + "entropy_loss": -0.052490234375, + "epoch": 0.2496, + "grad_norm": 2.178326886866042, + "k1_kl": 0.2314453125, + "k3_kl": 0.1640625, + "kimi_kl": 0.55859375, + "learning_rate": 3.7519999999999997e-07, + "loss": 0.0066, + "ppl": 0.0250244140625, + "reward": 0.9566434025764465, + "reward_std": 0.0013985882978886366, + "rewards/perpo_ocr_edit_distance_reward": 0.9566434025764465, "step": 1248, "temperature": 0.9 }, { - "advantages": -0.0003034770493286487, - "completion_length": 238.0, - "delta_ref_entropy_loss": 0.03448486328125, - "delta_ref_ppl": -0.046630859375, - "entropy_loss": -0.021820068359375, - "epoch": 0.4996, - "grad_norm": 3.2997911275192675, - "k1_kl": 0.046630859375, - "k3_kl": 0.029510498046875, - "kimi_kl": 0.07080078125, - "learning_rate": 2.5019999999999995e-07, - "loss": 0.0015, - "ppl": 0.0075836181640625, - "reward": 0.9964718520641327, - "reward_std": 0.001119155203923583, - "rewards/perpo_ocr_edit_distance_reward": 0.9964719116687775, + "advantages": -8.514949634275126e-09, + "completion_length": 195.0, + "delta_ref_entropy_loss": 0.10791015625, + "delta_ref_ppl": -0.17578125, + "entropy_loss": -0.09912109375, + "epoch": 0.2498, + "grad_norm": 2.9903763577068028, + "k1_kl": 0.17578125, + "k3_kl": 0.115234375, + "kimi_kl": 0.33203125, + "learning_rate": 3.7509999999999996e-07, + "loss": 0.0046, + "ppl": 0.0390625, + "reward": 0.8105332851409912, + "reward_std": 0.003429195610806346, + "rewards/perpo_ocr_edit_distance_reward": 0.8105334043502808, "step": 1249, "temperature": 0.9 }, { - "advantages": -4.001600518677151e-05, - "completion_length": 545.5, - "delta_ref_entropy_loss": 0.0816650390625, - "delta_ref_ppl": -0.0762939453125, - "entropy_loss": -0.12109375, - "epoch": 0.5, - "grad_norm": 1.4682038886741697, - "k1_kl": 0.0762939453125, - "k3_kl": 0.0472412109375, - "kimi_kl": 0.1475830078125, - "learning_rate": 2.5e-07, - "loss": 0.0019, - "ppl": 0.06439208984375, - "reward": 0.8509061634540558, - "reward_std": 0.009142678391071968, - "rewards/perpo_ocr_edit_distance_reward": 0.8509062230587006, + "advantages": -3.3208303307219467e-07, + "completion_length": 592.0, + "delta_ref_entropy_loss": 0.181640625, + "delta_ref_ppl": -0.10107421875, + "entropy_loss": -0.169921875, + "epoch": 0.25, + "grad_norm": 2.6967222702081397, + "k1_kl": 0.1015625, + "k3_kl": 0.0498046875, + "kimi_kl": 0.09619140625, + "learning_rate": 3.75e-07, + "loss": 0.002, + "ppl": 0.08642578125, + "reward": 0.9168123602867126, + "reward_std": 0.025797108188271523, + "rewards/perpo_ocr_edit_distance_reward": 0.9168123006820679, "step": 1250, "temperature": 0.9 }, { - "advantages": -0.0003028937749149918, - "completion_length": 495.0, - "delta_ref_entropy_loss": 0.03033447265625, - "delta_ref_ppl": -0.0338134765625, - "entropy_loss": -0.0289306640625, - "epoch": 0.5004, - "grad_norm": 0.6724970227008946, - "k1_kl": 0.033935546875, - "k3_kl": 0.02484130859375, - "kimi_kl": 0.07232666015625, - "learning_rate": 2.498e-07, - "loss": 0.0013, - "ppl": 0.015045166015625, - "reward": 0.942821204662323, - "reward_std": 0.00038680038414895535, - "rewards/perpo_ocr_edit_distance_reward": 0.9428212642669678, + "advantages": -9.087154467124492e-05, + "completion_length": 867.0, + "delta_ref_entropy_loss": 0.08935546875, + "delta_ref_ppl": -0.08349609375, + "entropy_loss": -0.04345703125, + "epoch": 0.2502, + "grad_norm": 32.230977630530894, + "k1_kl": 0.08349609375, + "k3_kl": 0.07763671875, + "kimi_kl": 0.1416015625, + "learning_rate": 3.749e-07, + "loss": 0.0032, + "ppl": 0.0198974609375, + "reward": 0.9919184446334839, + "reward_std": 0.0002748648403212428, + "rewards/perpo_ocr_edit_distance_reward": 0.9919184446334839, "step": 1251, "temperature": 0.9 }, { - "advantages": -2.233471377621754e-05, - "completion_length": 524.0, - "delta_ref_entropy_loss": 0.07421875, - "delta_ref_ppl": -0.07080078125, - "entropy_loss": -0.088134765625, - "epoch": 0.5008, - "grad_norm": 0.9929959539572859, - "k1_kl": 0.07080078125, - "k3_kl": 0.045166015625, - "kimi_kl": 0.1494140625, - "learning_rate": 2.4959999999999996e-07, - "loss": 0.0018, - "ppl": 0.049072265625, - "reward": 0.9754148423671722, - "reward_std": 0.002491112390998751, - "rewards/perpo_ocr_edit_distance_reward": 0.975414901971817, + "advantages": -4.552091922960244e-05, + "completion_length": 1177.0, + "delta_ref_entropy_loss": 0.02197265625, + "delta_ref_ppl": -0.031982421875, + "entropy_loss": -0.029296875, + "epoch": 0.2504, + "grad_norm": 0.7431836851890504, + "k1_kl": 0.031982421875, + "k3_kl": 0.0201416015625, + "kimi_kl": 0.051513671875, + "learning_rate": 3.748e-07, + "loss": 0.0009, + "ppl": 0.01361083984375, + "reward": 0.9957074522972107, + "reward_std": 0.00214412366040051, + "rewards/perpo_ocr_edit_distance_reward": 0.9957075715065002, "step": 1252, "temperature": 0.9 }, { - "advantages": -0.00031452093935513403, - "completion_length": 538.5, - "delta_ref_entropy_loss": 0.0576171875, - "delta_ref_ppl": -0.05035400390625, - "entropy_loss": -0.057647705078125, - "epoch": 0.5012, - "grad_norm": 0.7252922484404509, - "k1_kl": 0.05010986328125, - "k3_kl": 0.030242919921875, - "kimi_kl": 0.082275390625, - "learning_rate": 2.494e-07, - "loss": 0.0015, - "ppl": 0.0291290283203125, - "reward": 0.979759156703949, - "reward_std": 0.0008530033519491553, - "rewards/perpo_ocr_edit_distance_reward": 0.9797592461109161, + "advantages": 8.140292266034521e-06, + "completion_length": 151.0, + "delta_ref_entropy_loss": 0.189453125, + "delta_ref_ppl": -0.255859375, + "entropy_loss": -0.07958984375, + "epoch": 0.2506, + "grad_norm": 3.089514048201409, + "k1_kl": 0.255859375, + "k3_kl": 0.1884765625, + "kimi_kl": 0.87109375, + "learning_rate": 3.747e-07, + "loss": 0.0075, + "ppl": 0.0308837890625, + "reward": 0.9750397205352783, + "reward_std": 0.005130609031766653, + "rewards/perpo_ocr_edit_distance_reward": 0.9750397801399231, "step": 1253, "temperature": 0.9 }, { - "advantages": -3.8317273265420226e-07, - "completion_length": 647.0, - "delta_ref_entropy_loss": 0.04833984375, - "delta_ref_ppl": -0.0465087890625, - "entropy_loss": -0.05157470703125, - "epoch": 0.5016, - "grad_norm": 0.6215100281875795, - "k1_kl": 0.0467529296875, - "k3_kl": 0.03106689453125, - "kimi_kl": 0.0823974609375, - "learning_rate": 2.492e-07, - "loss": 0.0012, - "ppl": 0.024566650390625, - "reward": 0.8631349205970764, - "reward_std": 0.045490812510252, - "rewards/perpo_ocr_edit_distance_reward": 0.8631349802017212, + "advantages": -7.66345493730114e-08, + "completion_length": 748.0, + "delta_ref_entropy_loss": 0.125, + "delta_ref_ppl": -0.1064453125, + "entropy_loss": -0.248046875, + "epoch": 0.2508, + "grad_norm": 2.6561944301652227, + "k1_kl": 0.10595703125, + "k3_kl": 0.06298828125, + "kimi_kl": 0.1240234375, + "learning_rate": 3.746e-07, + "loss": 0.0025, + "ppl": 0.1337890625, + "reward": 0.7501343488693237, + "reward_std": 0.14056579768657684, + "rewards/perpo_ocr_edit_distance_reward": 0.7501344084739685, "step": 1254, "temperature": 0.9 }, { - "advantages": -0.00018362063747190405, - "completion_length": 605.5, - "delta_ref_entropy_loss": 0.05316162109375, - "delta_ref_ppl": -0.036041259765625, - "entropy_loss": -0.03509521484375, - "epoch": 0.502, - "grad_norm": 0.892236424321195, - "k1_kl": 0.036163330078125, - "k3_kl": 0.0188140869140625, - "kimi_kl": 0.045623779296875, - "learning_rate": 2.4899999999999997e-07, - "loss": 0.0009, - "ppl": 0.0129241943359375, - "reward": 0.9817739725112915, - "reward_std": 0.0005437276558950543, - "rewards/perpo_ocr_edit_distance_reward": 0.9817740619182587, + "advantages": 1.4305115882962127e-06, + "completion_length": 1093.0, + "delta_ref_entropy_loss": 0.09521484375, + "delta_ref_ppl": -0.052978515625, + "entropy_loss": -0.15234375, + "epoch": 0.251, + "grad_norm": 1.8468950104934942, + "k1_kl": 0.052978515625, + "k3_kl": 0.02685546875, + "kimi_kl": 0.044677734375, + "learning_rate": 3.7449999999999997e-07, + "loss": 0.0011, + "ppl": 0.0791015625, + "reward": 0.6348404884338379, + "reward_std": 0.03549577295780182, + "rewards/perpo_ocr_edit_distance_reward": 0.6348404288291931, "step": 1255, "temperature": 0.9 }, { - "advantages": -0.0003069383765250677, - "completion_length": 397.0, - "delta_ref_entropy_loss": 0.05572509765625, - "delta_ref_ppl": -0.0521240234375, - "entropy_loss": -0.0379638671875, - "epoch": 0.5024, - "grad_norm": 0.5747072738858374, - "k1_kl": 0.0521240234375, - "k3_kl": 0.034423828125, - "kimi_kl": 0.128662109375, - "learning_rate": 2.488e-07, - "loss": 0.0017, - "ppl": 0.014617919921875, - "reward": 0.9969442486763, - "reward_std": 0.0006666697445325553, - "rewards/perpo_ocr_edit_distance_reward": 0.9969443082809448, + "advantages": -2.0078250599908642e-05, + "completion_length": 627.0, + "delta_ref_entropy_loss": 0.15625, + "delta_ref_ppl": -0.11279296875, + "entropy_loss": -0.1328125, + "epoch": 0.2512, + "grad_norm": 3.1791839103636064, + "k1_kl": 0.11279296875, + "k3_kl": 0.0634765625, + "kimi_kl": 0.1806640625, + "learning_rate": 3.744e-07, + "loss": 0.0026, + "ppl": 0.07470703125, + "reward": 0.823867917060852, + "reward_std": 0.004143691621720791, + "rewards/perpo_ocr_edit_distance_reward": 0.8238679766654968, "step": 1256, "temperature": 0.9 }, { - "advantages": -7.28539120498084e-05, - "completion_length": 1049.5, - "delta_ref_entropy_loss": 0.025115966796875, - "delta_ref_ppl": -0.023590087890625, - "entropy_loss": -0.02490234375, - "epoch": 0.5028, - "grad_norm": 0.6141575420748571, - "k1_kl": 0.0236053466796875, - "k3_kl": 0.01563262939453125, - "kimi_kl": 0.048492431640625, - "learning_rate": 2.486e-07, - "loss": 0.0007, - "ppl": 0.011444091796875, - "reward": 0.9993060827255249, - "reward_std": 0.0003803767031058669, - "rewards/perpo_ocr_edit_distance_reward": 0.9993061721324921, + "advantages": 1.0865075637411792e-05, + "completion_length": 801.0, + "delta_ref_entropy_loss": 0.0869140625, + "delta_ref_ppl": -0.07373046875, + "entropy_loss": -0.031982421875, + "epoch": 0.2514, + "grad_norm": 1.2871358255317935, + "k1_kl": 0.07421875, + "k3_kl": 0.038818359375, + "kimi_kl": 0.09033203125, + "learning_rate": 3.743e-07, + "loss": 0.0015, + "ppl": 0.013671875, + "reward": 0.9942967891693115, + "reward_std": 0.0014657812425866723, + "rewards/perpo_ocr_edit_distance_reward": 0.9942967891693115, "step": 1257, "temperature": 0.9 }, { - "advantages": -5.0612861741683446e-05, - "completion_length": 468.5, - "delta_ref_entropy_loss": 0.0462646484375, - "delta_ref_ppl": -0.03533935546875, - "entropy_loss": -0.03094482421875, - "epoch": 0.5032, - "grad_norm": 0.6113126073189499, - "k1_kl": 0.0352783203125, - "k3_kl": 0.02264404296875, - "kimi_kl": 0.0819091796875, - "learning_rate": 2.484e-07, - "loss": 0.001, - "ppl": 0.01513671875, - "reward": 0.9981379806995392, - "reward_std": 0.0008463068807031959, - "rewards/perpo_ocr_edit_distance_reward": 0.9981380701065063, + "advantages": -2.0819052224396728e-05, + "completion_length": 797.0, + "delta_ref_entropy_loss": 0.0810546875, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.043701171875, + "epoch": 0.2516, + "grad_norm": 1.1244369297776504, + "k1_kl": 0.06591796875, + "k3_kl": 0.036376953125, + "kimi_kl": 0.1005859375, + "learning_rate": 3.7419999999999995e-07, + "loss": 0.0015, + "ppl": 0.01953125, + "reward": 0.9633879065513611, + "reward_std": 0.00317333173006773, + "rewards/perpo_ocr_edit_distance_reward": 0.9633879661560059, "step": 1258, "temperature": 0.9 }, { - "advantages": -0.00032774891224107705, - "completion_length": 550.0, - "delta_ref_entropy_loss": 0.0323486328125, - "delta_ref_ppl": -0.0379638671875, - "entropy_loss": -0.017364501953125, - "epoch": 0.5036, - "grad_norm": 0.19508743274068177, - "k1_kl": 0.0379638671875, - "k3_kl": 0.02435302734375, - "kimi_kl": 0.084716796875, - "learning_rate": 2.482e-07, - "loss": 0.0013, - "ppl": 0.008026123046875, - "reward": 0.9990110397338867, - "reward_std": 0.00016484335355926305, - "rewards/perpo_ocr_edit_distance_reward": 0.9990111589431763, + "advantages": 2.1457672119140625e-06, + "completion_length": 669.0, + "delta_ref_entropy_loss": 0.0830078125, + "delta_ref_ppl": -0.080078125, + "entropy_loss": -0.03662109375, + "epoch": 0.2518, + "grad_norm": 0.6496541947034316, + "k1_kl": 0.080078125, + "k3_kl": 0.04541015625, + "kimi_kl": 0.10595703125, + "learning_rate": 3.741e-07, + "loss": 0.0018, + "ppl": 0.0133056640625, + "reward": 0.9832970499992371, + "reward_std": 0.003884833073243499, + "rewards/perpo_ocr_edit_distance_reward": 0.9832971096038818, "step": 1259, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 423.0, - "delta_ref_entropy_loss": 0.0213623046875, - "delta_ref_ppl": -0.0263671875, - "entropy_loss": -0.016326904296875, - "epoch": 0.504, - "grad_norm": 0.009072754918557895, - "k1_kl": 0.0263671875, - "k3_kl": 0.01641845703125, - "kimi_kl": 0.043701171875, - "learning_rate": 2.48e-07, - "loss": 0.0007, - "ppl": 0.006591796875, - "reward": 0.992514967918396, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.992514967918396, + "advantages": -3.571595516405068e-05, + "completion_length": 299.0, + "delta_ref_entropy_loss": 0.1572265625, + "delta_ref_ppl": -0.1435546875, + "entropy_loss": -0.04443359375, + "epoch": 0.252, + "grad_norm": 1.3261706726446607, + "k1_kl": 0.1435546875, + "k3_kl": 0.09130859375, + "kimi_kl": 0.26171875, + "learning_rate": 3.74e-07, + "loss": 0.0037, + "ppl": 0.0162353515625, + "reward": 0.9094982147216797, + "reward_std": 0.002283157780766487, + "rewards/perpo_ocr_edit_distance_reward": 0.909498393535614, "step": 1260, "temperature": 0.9 }, { - "advantages": -3.427267165534431e-05, - "completion_length": 541.0, - "delta_ref_entropy_loss": 0.04241943359375, - "delta_ref_ppl": -0.0379638671875, - "entropy_loss": -0.02734375, - "epoch": 0.5044, - "grad_norm": 0.6790960068210736, - "k1_kl": 0.03802490234375, - "k3_kl": 0.02276611328125, - "kimi_kl": 0.075439453125, - "learning_rate": 2.478e-07, - "loss": 0.0009, - "ppl": 0.014007568359375, - "reward": 0.9968946278095245, - "reward_std": 0.0020876830094493926, - "rewards/perpo_ocr_edit_distance_reward": 0.9968946874141693, + "advantages": -7.141488458728418e-05, + "completion_length": 420.0, + "delta_ref_entropy_loss": 0.06201171875, + "delta_ref_ppl": -0.0634765625, + "entropy_loss": -0.04248046875, + "epoch": 0.2522, + "grad_norm": 0.621317231342486, + "k1_kl": 0.0634765625, + "k3_kl": 0.033935546875, + "kimi_kl": 0.09619140625, + "learning_rate": 3.739e-07, + "loss": 0.0014, + "ppl": 0.01275634765625, + "reward": 0.9850960373878479, + "reward_std": 0.0003771028423216194, + "rewards/perpo_ocr_edit_distance_reward": 0.9850960969924927, "step": 1261, "temperature": 0.9 }, { - "advantages": -8.525167822881485e-05, - "completion_length": 684.5, - "delta_ref_entropy_loss": 0.0355224609375, - "delta_ref_ppl": -0.02301025390625, - "entropy_loss": -0.0328369140625, - "epoch": 0.5048, - "grad_norm": 0.6190214269200871, - "k1_kl": 0.02313232421875, - "k3_kl": 0.0145263671875, - "kimi_kl": 0.03662109375, - "learning_rate": 2.4759999999999997e-07, - "loss": 0.0007, - "ppl": 0.015869140625, - "reward": 0.9859600961208344, - "reward_std": 0.006922114218468778, - "rewards/perpo_ocr_edit_distance_reward": 0.9859601855278015, + "advantages": 0.0, + "completion_length": 803.0, + "delta_ref_entropy_loss": 0.046875, + "delta_ref_ppl": -0.0498046875, + "entropy_loss": -0.017578125, + "epoch": 0.2524, + "grad_norm": 8.025727253750977, + "k1_kl": 0.0498046875, + "k3_kl": 0.0322265625, + "kimi_kl": 0.068359375, + "learning_rate": 3.738e-07, + "loss": 0.0013, + "ppl": 0.007476806640625, + "reward": 0.997326135635376, + "reward_std": 0.00034519436303526163, + "rewards/perpo_ocr_edit_distance_reward": 0.9973261952400208, "step": 1262, "temperature": 0.9 }, { - "advantages": 4.257474817137563e-09, - "completion_length": 374.5, - "delta_ref_entropy_loss": 0.0458984375, - "delta_ref_ppl": -0.063720703125, - "entropy_loss": -0.01776123046875, - "epoch": 0.5052, - "grad_norm": 0.1280110705164232, - "k1_kl": 0.0635986328125, - "k3_kl": 0.0401611328125, - "kimi_kl": 0.1151123046875, - "learning_rate": 2.474e-07, - "loss": 0.0016, - "ppl": 0.00545501708984375, - "reward": 0.9967485964298248, - "reward_std": 6.77270581945777e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9967485964298248, + "advantages": -1.2695790246652905e-05, + "completion_length": 945.0, + "delta_ref_entropy_loss": 0.057861328125, + "delta_ref_ppl": -0.06640625, + "entropy_loss": -0.04296875, + "epoch": 0.2526, + "grad_norm": 0.5404125412904847, + "k1_kl": 0.06640625, + "k3_kl": 0.043701171875, + "kimi_kl": 0.130859375, + "learning_rate": 3.7369999999999996e-07, + "loss": 0.0018, + "ppl": 0.019775390625, + "reward": 0.9438601732254028, + "reward_std": 0.0012418200494721532, + "rewards/perpo_ocr_edit_distance_reward": 0.9438602328300476, "step": 1263, "temperature": 0.9 }, { - "advantages": -5.974940359010361e-05, - "completion_length": 470.5, - "delta_ref_entropy_loss": 0.039306640625, - "delta_ref_ppl": -0.033935546875, - "entropy_loss": -0.0208740234375, - "epoch": 0.5056, - "grad_norm": 0.6063412861302531, - "k1_kl": 0.033935546875, - "k3_kl": 0.02166748046875, - "kimi_kl": 0.0849609375, - "learning_rate": 2.472e-07, - "loss": 0.0009, - "ppl": 0.01141357421875, - "reward": 0.9987403452396393, - "reward_std": 0.0009235565084964037, - "rewards/perpo_ocr_edit_distance_reward": 0.9987404644489288, + "advantages": -7.37735244911164e-05, + "completion_length": 1447.0, + "delta_ref_entropy_loss": 0.0262451171875, + "delta_ref_ppl": -0.0301513671875, + "entropy_loss": -0.0308837890625, + "epoch": 0.2528, + "grad_norm": 0.45824725531649585, + "k1_kl": 0.030029296875, + "k3_kl": 0.0179443359375, + "kimi_kl": 0.04541015625, + "learning_rate": 3.7359999999999996e-07, + "loss": 0.0008, + "ppl": 0.0135498046875, + "reward": 0.9919742941856384, + "reward_std": 0.0011694367276504636, + "rewards/perpo_ocr_edit_distance_reward": 0.991974413394928, "step": 1264, "temperature": 0.9 }, { - "advantages": -7.113814353942871e-05, - "completion_length": 465.5, - "delta_ref_entropy_loss": 0.03070068359375, - "delta_ref_ppl": -0.0372314453125, - "entropy_loss": -0.020843505859375, - "epoch": 0.506, - "grad_norm": 0.29639967853357385, - "k1_kl": 0.0372314453125, - "k3_kl": 0.02386474609375, - "kimi_kl": 0.0648193359375, - "learning_rate": 2.47e-07, - "loss": 0.001, - "ppl": 0.009307861328125, - "reward": 0.9990918338298798, - "reward_std": 0.00021935375116299838, - "rewards/perpo_ocr_edit_distance_reward": 0.9990918636322021, + "advantages": -8.685248758411035e-05, + "completion_length": 572.0, + "delta_ref_entropy_loss": 0.05126953125, + "delta_ref_ppl": -0.07373046875, + "entropy_loss": -0.01324462890625, + "epoch": 0.253, + "grad_norm": 0.46731801655139527, + "k1_kl": 0.07373046875, + "k3_kl": 0.048828125, + "kimi_kl": 0.16796875, + "learning_rate": 3.735e-07, + "loss": 0.002, + "ppl": 0.004119873046875, + "reward": 0.9978966116905212, + "reward_std": 0.00019416536088101566, + "rewards/perpo_ocr_edit_distance_reward": 0.997896671295166, "step": 1265, "temperature": 0.9 }, { - "advantages": -2.1581139662885107e-05, - "completion_length": 678.5, - "delta_ref_entropy_loss": 0.02838134765625, - "delta_ref_ppl": -0.02618408203125, - "entropy_loss": -0.0150909423828125, - "epoch": 0.5064, - "grad_norm": 0.2867859086414056, - "k1_kl": 0.026214599609375, - "k3_kl": 0.0157012939453125, - "kimi_kl": 0.045379638671875, - "learning_rate": 2.4679999999999996e-07, - "loss": 0.0006, - "ppl": 0.0062713623046875, - "reward": 0.9994423389434814, - "reward_std": 0.0002459061215631664, - "rewards/perpo_ocr_edit_distance_reward": 0.9994423389434814, + "advantages": -0.0005960464477539062, + "completion_length": 593.0, + "delta_ref_entropy_loss": 0.046142578125, + "delta_ref_ppl": -0.0546875, + "entropy_loss": -0.010009765625, + "epoch": 0.2532, + "grad_norm": 0.0032578154111178263, + "k1_kl": 0.054931640625, + "k3_kl": 0.034423828125, + "kimi_kl": 0.12060546875, + "learning_rate": 3.734e-07, + "loss": 0.002, + "ppl": 0.00188446044921875, + "reward": 0.9951292872428894, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9951293468475342, "step": 1266, "temperature": 0.9 }, { - "advantages": -8.676734069013037e-06, - "completion_length": 517.0, - "delta_ref_entropy_loss": 0.03271484375, - "delta_ref_ppl": -0.022003173828125, - "entropy_loss": -0.019805908203125, - "epoch": 0.5068, - "grad_norm": 0.44129543827712575, - "k1_kl": 0.022003173828125, - "k3_kl": 0.01263427734375, - "kimi_kl": 0.02447509765625, - "learning_rate": 2.466e-07, - "loss": 0.0005, - "ppl": 0.00791168212890625, - "reward": 0.8818548619747162, - "reward_std": 0.0009313328191637993, - "rewards/perpo_ocr_edit_distance_reward": 0.881854921579361, + "advantages": -2.385888910794165e-05, + "completion_length": 64.0, + "delta_ref_entropy_loss": 0.078125, + "delta_ref_ppl": -0.34765625, + "entropy_loss": -0.07666015625, + "epoch": 0.2534, + "grad_norm": 2.6384567505335084, + "k1_kl": 0.34765625, + "k3_kl": 0.26953125, + "kimi_kl": 1.15625, + "learning_rate": 3.733e-07, + "loss": 0.0108, + "ppl": 0.02685546875, + "reward": 0.9443520903587341, + "reward_std": 0.0031076837331056595, + "rewards/perpo_ocr_edit_distance_reward": 0.9443522095680237, "step": 1267, "temperature": 0.9 }, { - "advantages": -5.004661579732783e-05, - "completion_length": 417.5, - "delta_ref_entropy_loss": 0.0379638671875, - "delta_ref_ppl": -0.02813720703125, - "entropy_loss": -0.026611328125, - "epoch": 0.5072, - "grad_norm": 0.385732847500563, - "k1_kl": 0.02813720703125, - "k3_kl": 0.0145263671875, - "kimi_kl": 0.033447265625, - "learning_rate": 2.464e-07, - "loss": 0.0006, - "ppl": 0.01165771484375, - "reward": 0.9964656829833984, - "reward_std": 0.0008433469920419157, - "rewards/perpo_ocr_edit_distance_reward": 0.9964657723903656, + "advantages": -2.622604597490863e-06, + "completion_length": 529.0, + "delta_ref_entropy_loss": 0.027099609375, + "delta_ref_ppl": -0.03173828125, + "entropy_loss": -0.0230712890625, + "epoch": 0.2536, + "grad_norm": 0.6253473379431435, + "k1_kl": 0.03173828125, + "k3_kl": 0.02099609375, + "kimi_kl": 0.05615234375, + "learning_rate": 3.732e-07, + "loss": 0.0008, + "ppl": 0.00970458984375, + "reward": 0.9793973565101624, + "reward_std": 0.01289794035255909, + "rewards/perpo_ocr_edit_distance_reward": 0.9793974161148071, "step": 1268, "temperature": 0.9 }, { - "advantages": 6.722552825522143e-06, - "completion_length": 710.0, - "delta_ref_entropy_loss": 0.0250244140625, - "delta_ref_ppl": -0.031494140625, - "entropy_loss": -0.02947998046875, - "epoch": 0.5076, - "grad_norm": 0.24085660417138882, - "k1_kl": 0.0313720703125, - "k3_kl": 0.021484375, - "kimi_kl": 0.0701904296875, - "learning_rate": 2.4619999999999997e-07, - "loss": 0.0009, - "ppl": 0.012969970703125, - "reward": 0.9994968771934509, - "reward_std": 0.0002665567153599113, - "rewards/perpo_ocr_edit_distance_reward": 0.9994968771934509, + "advantages": -5.701610280084424e-05, + "completion_length": 1102.0, + "delta_ref_entropy_loss": 0.0810546875, + "delta_ref_ppl": -0.060302734375, + "entropy_loss": -0.033203125, + "epoch": 0.2538, + "grad_norm": 0.7618992537829057, + "k1_kl": 0.060546875, + "k3_kl": 0.027587890625, + "kimi_kl": 0.0595703125, + "learning_rate": 3.7309999999999997e-07, + "loss": 0.0012, + "ppl": 0.0159912109375, + "reward": 0.9867153167724609, + "reward_std": 0.0010946006514132023, + "rewards/perpo_ocr_edit_distance_reward": 0.9867153763771057, "step": 1269, "temperature": 0.9 }, { - "advantages": -7.356916285061743e-06, - "completion_length": 243.5, - "delta_ref_entropy_loss": 0.0662841796875, - "delta_ref_ppl": -0.0504150390625, - "entropy_loss": -0.10479736328125, - "epoch": 0.508, - "grad_norm": 1.1722312499079763, - "k1_kl": 0.0504150390625, - "k3_kl": 0.0283203125, - "kimi_kl": 0.073486328125, - "learning_rate": 2.46e-07, - "loss": 0.0011, - "ppl": 0.0560150146484375, - "reward": 0.9265743494033813, - "reward_std": 0.0013942292425781488, - "rewards/perpo_ocr_edit_distance_reward": 0.9265743792057037, + "advantages": 8.889607670425903e-06, + "completion_length": 557.0, + "delta_ref_entropy_loss": 0.1396484375, + "delta_ref_ppl": -0.1357421875, + "entropy_loss": -0.072265625, + "epoch": 0.254, + "grad_norm": 0.9420667859093191, + "k1_kl": 0.1357421875, + "k3_kl": 0.08349609375, + "kimi_kl": 0.2578125, + "learning_rate": 3.7299999999999997e-07, + "loss": 0.0033, + "ppl": 0.031982421875, + "reward": 0.959272563457489, + "reward_std": 0.0008580725407227874, + "rewards/perpo_ocr_edit_distance_reward": 0.959272563457489, "step": 1270, "temperature": 0.9 }, { - "advantages": -1.5071460666149505e-06, + "advantages": -5.619866897177417e-06, "completion_length": 556.0, - "delta_ref_entropy_loss": 0.020263671875, - "delta_ref_ppl": -0.051849365234375, - "entropy_loss": -0.0277099609375, - "epoch": 0.5084, - "grad_norm": 1.5941362111028894, - "k1_kl": 0.052154541015625, - "k3_kl": 0.040130615234375, - "kimi_kl": 0.1658935546875, - "learning_rate": 2.458e-07, - "loss": 0.0016, - "ppl": 0.01399993896484375, - "reward": 0.9718315005302429, - "reward_std": 0.011134534142911434, - "rewards/perpo_ocr_edit_distance_reward": 0.9718315601348877, + "delta_ref_entropy_loss": 0.08447265625, + "delta_ref_ppl": -0.0966796875, + "entropy_loss": -0.041259765625, + "epoch": 0.2542, + "grad_norm": 1.5774690379098897, + "k1_kl": 0.09619140625, + "k3_kl": 0.0556640625, + "kimi_kl": 0.142578125, + "learning_rate": 3.729e-07, + "loss": 0.0022, + "ppl": 0.01336669921875, + "reward": 0.786740243434906, + "reward_std": 0.015046413987874985, + "rewards/perpo_ocr_edit_distance_reward": 0.7867403626441956, "step": 1271, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 284.0, - "delta_ref_entropy_loss": 0.038726806640625, - "delta_ref_ppl": -0.0179443359375, - "entropy_loss": -0.019561767578125, - "epoch": 0.5088, - "grad_norm": 0.024258959713834852, - "k1_kl": 0.0179443359375, - "k3_kl": 0.0082244873046875, - "kimi_kl": 0.0162353515625, - "learning_rate": 2.456e-07, - "loss": 0.0003, - "ppl": 0.0080718994140625, - "reward": 0.9705128073692322, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9705128073692322, + "advantages": -3.008331623277627e-05, + "completion_length": 588.0, + "delta_ref_entropy_loss": 0.053466796875, + "delta_ref_ppl": -0.07666015625, + "entropy_loss": -0.04052734375, + "epoch": 0.2544, + "grad_norm": 0.6176100590304631, + "k1_kl": 0.07666015625, + "k3_kl": 0.044921875, + "kimi_kl": 0.119140625, + "learning_rate": 3.728e-07, + "loss": 0.0018, + "ppl": 0.015869140625, + "reward": 0.9945279955863953, + "reward_std": 0.0004661924613174051, + "rewards/perpo_ocr_edit_distance_reward": 0.9945279955863953, "step": 1272, "temperature": 0.9 }, { - "advantages": -0.00013635839786729775, - "completion_length": 855.0, - "delta_ref_entropy_loss": 0.02349853515625, - "delta_ref_ppl": -0.0211181640625, - "entropy_loss": -0.02020263671875, - "epoch": 0.5092, - "grad_norm": 0.27204728931866123, - "k1_kl": 0.0211181640625, - "k3_kl": 0.0136260986328125, - "kimi_kl": 0.0594482421875, - "learning_rate": 2.454e-07, - "loss": 0.0007, - "ppl": 0.0087890625, - "reward": 0.98983034491539, - "reward_std": 0.0002214020860265009, - "rewards/perpo_ocr_edit_distance_reward": 0.9898304045200348, + "advantages": -6.811959707420101e-08, + "completion_length": 64.0, + "delta_ref_entropy_loss": 0.154296875, + "delta_ref_ppl": -0.30859375, + "entropy_loss": -0.2333984375, + "epoch": 0.2546, + "grad_norm": 7.462515033084374, + "k1_kl": 0.30859375, + "k3_kl": 0.216796875, + "kimi_kl": 0.703125, + "learning_rate": 3.7269999999999994e-07, + "loss": 0.0087, + "ppl": 0.11083984375, + "reward": 0.5492895245552063, + "reward_std": 0.342136025428772, + "rewards/perpo_ocr_edit_distance_reward": 0.5492895245552063, "step": 1273, "temperature": 0.9 }, { - "advantages": -7.0993394594154324e-06, - "completion_length": 683.0, - "delta_ref_entropy_loss": 0.0369873046875, - "delta_ref_ppl": -0.0260009765625, - "entropy_loss": -0.04217529296875, - "epoch": 0.5096, - "grad_norm": 0.5954337544297499, - "k1_kl": 0.0260009765625, - "k3_kl": 0.014007568359375, - "kimi_kl": 0.0390625, - "learning_rate": 2.452e-07, - "loss": 0.0006, - "ppl": 0.02288818359375, - "reward": 0.7131513804197311, - "reward_std": 0.0024165139766409993, - "rewards/perpo_ocr_edit_distance_reward": 0.7131514400243759, + "advantages": 1.7029899268550253e-08, + "completion_length": 481.0, + "delta_ref_entropy_loss": 0.03759765625, + "delta_ref_ppl": -0.046630859375, + "entropy_loss": -0.0103759765625, + "epoch": 0.2548, + "grad_norm": 0.2592729636332995, + "k1_kl": 0.046630859375, + "k3_kl": 0.028564453125, + "kimi_kl": 0.09326171875, + "learning_rate": 3.726e-07, + "loss": 0.0011, + "ppl": 0.0023193359375, + "reward": 0.9940131902694702, + "reward_std": 0.0007282492588274181, + "rewards/perpo_ocr_edit_distance_reward": 0.9940131902694702, "step": 1274, "temperature": 0.9 }, { - "advantages": -2.086162709247219e-07, - "completion_length": 826.5, - "delta_ref_entropy_loss": 0.021484375, - "delta_ref_ppl": -0.03131103515625, - "entropy_loss": -0.09686279296875, - "epoch": 0.51, - "grad_norm": 0.623757912412376, - "k1_kl": 0.03125, - "k3_kl": 0.020965576171875, - "kimi_kl": 0.06414794921875, - "learning_rate": 2.45e-07, - "loss": 0.0008, - "ppl": 0.0453338623046875, - "reward": 0.9250443577766418, - "reward_std": 0.07073381543159485, - "rewards/perpo_ocr_edit_distance_reward": 0.9250443577766418, + "advantages": -1.0473388556420105e-06, + "completion_length": 812.0, + "delta_ref_entropy_loss": 0.09375, + "delta_ref_ppl": -0.09423828125, + "entropy_loss": -0.04296875, + "epoch": 0.255, + "grad_norm": 1.0491879548828182, + "k1_kl": 0.0947265625, + "k3_kl": 0.05419921875, + "kimi_kl": 0.15234375, + "learning_rate": 3.725e-07, + "loss": 0.0022, + "ppl": 0.0203857421875, + "reward": 0.9448461532592773, + "reward_std": 0.05749697610735893, + "rewards/perpo_ocr_edit_distance_reward": 0.9448462128639221, "step": 1275, "temperature": 0.9 }, { - "advantages": 6.846019459771924e-06, - "completion_length": 465.5, - "delta_ref_entropy_loss": 0.0301513671875, - "delta_ref_ppl": -0.0289306640625, - "entropy_loss": -0.028106689453125, - "epoch": 0.5104, - "grad_norm": 0.6269703642871693, - "k1_kl": 0.0288238525390625, - "k3_kl": 0.019439697265625, - "kimi_kl": 0.08392715454101562, - "learning_rate": 2.4479999999999997e-07, - "loss": 0.0008, - "ppl": 0.015655517578125, - "reward": 0.9818377792835236, - "reward_std": 0.0005718676256947219, - "rewards/perpo_ocr_edit_distance_reward": 0.981837809085846, + "advantages": -0.00011287417146377265, + "completion_length": 788.0, + "delta_ref_entropy_loss": 0.06787109375, + "delta_ref_ppl": -0.05322265625, + "entropy_loss": -0.02734375, + "epoch": 0.2552, + "grad_norm": 0.40406627680886753, + "k1_kl": 0.053466796875, + "k3_kl": 0.02783203125, + "kimi_kl": 0.059326171875, + "learning_rate": 3.7239999999999997e-07, + "loss": 0.0012, + "ppl": 0.0101318359375, + "reward": 0.9922690391540527, + "reward_std": 0.000428081548307091, + "rewards/perpo_ocr_edit_distance_reward": 0.9922690987586975, "step": 1276, "temperature": 0.9 }, { - "advantages": -9.98718442133395e-05, - "completion_length": 596.5, - "delta_ref_entropy_loss": 0.0228271484375, - "delta_ref_ppl": -0.01800537109375, - "entropy_loss": -0.021026611328125, - "epoch": 0.5108, - "grad_norm": 0.5147409484025497, - "k1_kl": 0.01800537109375, - "k3_kl": 0.010345458984375, - "kimi_kl": 0.029998779296875, - "learning_rate": 2.446e-07, - "loss": 0.0005, - "ppl": 0.00958251953125, - "reward": 0.9979354441165924, - "reward_std": 0.0007139743102015927, - "rewards/perpo_ocr_edit_distance_reward": 0.9979354739189148, + "advantages": -2.835478153428994e-05, + "completion_length": 389.0, + "delta_ref_entropy_loss": 0.07373046875, + "delta_ref_ppl": -0.0732421875, + "entropy_loss": -0.0478515625, + "epoch": 0.2554, + "grad_norm": 1.1483096024877162, + "k1_kl": 0.0732421875, + "k3_kl": 0.0498046875, + "kimi_kl": 0.1396484375, + "learning_rate": 3.723e-07, + "loss": 0.002, + "ppl": 0.025634765625, + "reward": 0.9888291358947754, + "reward_std": 0.0011006367858499289, + "rewards/perpo_ocr_edit_distance_reward": 0.9888292551040649, "step": 1277, "temperature": 0.9 }, { - "advantages": -1.5599387552356347e-05, - "completion_length": 737.0, - "delta_ref_entropy_loss": 0.0587158203125, - "delta_ref_ppl": -0.026947021484375, - "entropy_loss": -0.07342529296875, - "epoch": 0.5112, - "grad_norm": 0.709433139433293, - "k1_kl": 0.027069091796875, - "k3_kl": 0.01263427734375, - "kimi_kl": 0.019927978515625, - "learning_rate": 2.444e-07, - "loss": 0.0005, - "ppl": 0.040252685546875, - "reward": 0.9501221179962158, - "reward_std": 0.002924088970758021, - "rewards/perpo_ocr_edit_distance_reward": 0.9501221776008606, + "advantages": -1.1307853128528222e-05, + "completion_length": 466.0, + "delta_ref_entropy_loss": 0.07421875, + "delta_ref_ppl": -0.05810546875, + "entropy_loss": -0.030029296875, + "epoch": 0.2556, + "grad_norm": 0.5165338654669677, + "k1_kl": 0.05810546875, + "k3_kl": 0.03515625, + "kimi_kl": 0.1083984375, + "learning_rate": 3.7219999999999996e-07, + "loss": 0.0014, + "ppl": 0.01190185546875, + "reward": 0.8755517601966858, + "reward_std": 0.0014057295629754663, + "rewards/perpo_ocr_edit_distance_reward": 0.8755518198013306, "step": 1278, "temperature": 0.9 }, { - "advantages": -3.916876778475853e-07, - "completion_length": 380.5, - "delta_ref_entropy_loss": 0.05206298828125, - "delta_ref_ppl": -0.0352783203125, - "entropy_loss": -0.0358428955078125, - "epoch": 0.5116, - "grad_norm": 1.6089843782144013, - "k1_kl": 0.03515625, - "k3_kl": 0.019775390625, - "kimi_kl": 0.053131103515625, - "learning_rate": 2.442e-07, - "loss": 0.0008, - "ppl": 0.015842437744140625, - "reward": 0.9873743653297424, - "reward_std": 0.016027700155973434, - "rewards/perpo_ocr_edit_distance_reward": 0.9873743951320648, + "advantages": 6.5394815464969724e-06, + "completion_length": 996.0, + "delta_ref_entropy_loss": 0.07373046875, + "delta_ref_ppl": -0.055419921875, + "entropy_loss": -0.04296875, + "epoch": 0.2558, + "grad_norm": 234.23619221572218, + "k1_kl": 0.055419921875, + "k3_kl": 0.134765625, + "kimi_kl": 0.0986328125, + "learning_rate": 3.7209999999999995e-07, + "loss": 0.0054, + "ppl": 0.0208740234375, + "reward": 0.9917994737625122, + "reward_std": 0.0011981449788436294, + "rewards/perpo_ocr_edit_distance_reward": 0.991799533367157, "step": 1279, "temperature": 0.9 }, { - "advantages": -2.5544848540448584e-05, - "completion_length": 545.0, - "delta_ref_entropy_loss": 0.029296875, - "delta_ref_ppl": -0.025787353515625, - "entropy_loss": -0.026031494140625, - "epoch": 0.512, - "grad_norm": 6.201734617015122, - "k1_kl": 0.0257568359375, - "k3_kl": 0.017486572265625, - "kimi_kl": 0.06561279296875, - "learning_rate": 2.4399999999999996e-07, - "loss": 0.0007, - "ppl": 0.01444244384765625, - "reward": 0.9971437156200409, - "reward_std": 0.000783496187068522, - "rewards/perpo_ocr_edit_distance_reward": 0.9971437752246857, + "advantages": -0.00018211774295195937, + "completion_length": 1199.0, + "delta_ref_entropy_loss": 0.028564453125, + "delta_ref_ppl": -0.0361328125, + "entropy_loss": -0.0220947265625, + "epoch": 0.256, + "grad_norm": 3.6509722879216264, + "k1_kl": 0.0361328125, + "k3_kl": 0.021240234375, + "kimi_kl": 0.05908203125, + "learning_rate": 3.72e-07, + "loss": 0.001, + "ppl": 0.008056640625, + "reward": 0.9984354376792908, + "reward_std": 0.00027404361753724515, + "rewards/perpo_ocr_edit_distance_reward": 0.9984354972839355, "step": 1280, "temperature": 0.9 }, { - "advantages": -2.2820064700113107e-06, - "completion_length": 699.0, - "delta_ref_entropy_loss": 0.0601806640625, - "delta_ref_ppl": -0.04833984375, - "entropy_loss": -0.076171875, - "epoch": 0.5124, - "grad_norm": 1.0986068531629005, - "k1_kl": 0.04833984375, - "k3_kl": 0.02703857421875, - "kimi_kl": 0.0611572265625, - "learning_rate": 2.438e-07, - "loss": 0.0011, - "ppl": 0.0396728515625, - "reward": 0.9594543278217316, - "reward_std": 0.025548504665493965, - "rewards/perpo_ocr_edit_distance_reward": 0.959454357624054, - "step": 1281, - "temperature": 0.9 + "advantages": -5.28267482877709e-05, + "completion_length": 543.0, + "delta_ref_entropy_loss": 0.06298828125, + "delta_ref_ppl": -0.042236328125, + "entropy_loss": -0.029296875, + "epoch": 0.2562, + "grad_norm": 0.3695813101007829, + "k1_kl": 0.0419921875, + "k3_kl": 0.02099609375, + "kimi_kl": 0.044189453125, + "learning_rate": 3.719e-07, + "loss": 0.0009, + "ppl": 0.00848388671875, + "reward": 0.9949670433998108, + "reward_std": 0.00022237811936065555, + "rewards/perpo_ocr_edit_distance_reward": 0.9949671626091003, + "step": 1281, + "temperature": 0.9 }, { - "advantages": -7.482086402887944e-05, - "completion_length": 676.0, - "delta_ref_entropy_loss": 0.0279541015625, - "delta_ref_ppl": -0.02813720703125, - "entropy_loss": -0.04498291015625, - "epoch": 0.5128, - "grad_norm": 0.7032391688188908, - "k1_kl": 0.02801513671875, - "k3_kl": 0.0189208984375, - "kimi_kl": 0.046630859375, - "learning_rate": 2.436e-07, - "loss": 0.0008, - "ppl": 0.0249481201171875, - "reward": 0.991414874792099, - "reward_std": 0.0005643118493026122, - "rewards/perpo_ocr_edit_distance_reward": 0.9914149343967438, + "advantages": -0.0005960464477539062, + "completion_length": 1145.0, + "delta_ref_entropy_loss": 0.0267333984375, + "delta_ref_ppl": -0.0257568359375, + "entropy_loss": -0.01263427734375, + "epoch": 0.2564, + "grad_norm": 0.010840202850140436, + "k1_kl": 0.0257568359375, + "k3_kl": 0.01470947265625, + "kimi_kl": 0.04248046875, + "learning_rate": 3.718e-07, + "loss": 0.0012, + "ppl": 0.00396728515625, + "reward": 0.9858585000038147, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9858586192131042, "step": 1282, "temperature": 0.9 }, { - "advantages": -0.00016816174320410937, - "completion_length": 639.5, - "delta_ref_entropy_loss": 0.0223388671875, - "delta_ref_ppl": -0.03271484375, - "entropy_loss": -0.017120361328125, - "epoch": 0.5132, - "grad_norm": 0.24884990397304602, - "k1_kl": 0.03271484375, - "k3_kl": 0.0228271484375, - "kimi_kl": 0.0972900390625, - "learning_rate": 2.4339999999999997e-07, - "loss": 0.0011, - "ppl": 0.0079345703125, - "reward": 0.9998019337654114, - "reward_std": 0.00031129910348681733, - "rewards/perpo_ocr_edit_distance_reward": 0.9998020231723785, + "advantages": -3.232275048503652e-05, + "completion_length": 448.0, + "delta_ref_entropy_loss": 0.0869140625, + "delta_ref_ppl": -0.12060546875, + "entropy_loss": -0.053955078125, + "epoch": 0.2566, + "grad_norm": 1.8066434539692429, + "k1_kl": 0.12060546875, + "k3_kl": 0.08056640625, + "kimi_kl": 0.3046875, + "learning_rate": 3.7169999999999997e-07, + "loss": 0.0033, + "ppl": 0.022705078125, + "reward": 0.917842447757721, + "reward_std": 0.002008043695241213, + "rewards/perpo_ocr_edit_distance_reward": 0.9178425073623657, "step": 1283, "temperature": 0.9 }, { - "advantages": -4.478863502299646e-06, - "completion_length": 811.0, - "delta_ref_entropy_loss": 0.03948974609375, - "delta_ref_ppl": -0.0343017578125, - "entropy_loss": -0.0283203125, - "epoch": 0.5136, - "grad_norm": 0.674095716927816, - "k1_kl": 0.0343017578125, - "k3_kl": 0.02301025390625, - "kimi_kl": 0.0645751953125, - "learning_rate": 2.432e-07, - "loss": 0.0009, - "ppl": 0.016204833984375, - "reward": 0.9973128736019135, - "reward_std": 0.002319494029507041, - "rewards/perpo_ocr_edit_distance_reward": 0.9973129332065582, + "advantages": 0.0, + "completion_length": 593.0, + "delta_ref_entropy_loss": 0.2451171875, + "delta_ref_ppl": -0.1630859375, + "entropy_loss": -0.3515625, + "epoch": 0.2568, + "grad_norm": 3.20559966472682, + "k1_kl": 0.162109375, + "k3_kl": 0.09033203125, + "kimi_kl": 0.1611328125, + "learning_rate": 3.7159999999999997e-07, + "loss": 0.0036, + "ppl": 0.2119140625, + "reward": 0.7266548871994019, + "reward_std": 0.005226357840001583, + "rewards/perpo_ocr_edit_distance_reward": 0.7266549468040466, "step": 1284, "temperature": 0.9 }, { - "advantages": -8.33613557915669e-06, - "completion_length": 600.5, - "delta_ref_entropy_loss": 0.017822265625, - "delta_ref_ppl": -0.0109405517578125, - "entropy_loss": -0.01251220703125, - "epoch": 0.514, - "grad_norm": 0.1923648004752079, - "k1_kl": 0.0109405517578125, - "k3_kl": 0.0061798095703125, - "kimi_kl": 0.017913818359375, - "learning_rate": 2.43e-07, - "loss": 0.0003, - "ppl": 0.0052490234375, - "reward": 0.9986900389194489, - "reward_std": 0.0004607067967299372, - "rewards/perpo_ocr_edit_distance_reward": 0.9986900687217712, + "advantages": -5.934919954597717e-06, + "completion_length": 837.0, + "delta_ref_entropy_loss": 0.1103515625, + "delta_ref_ppl": -0.07568359375, + "entropy_loss": -0.1669921875, + "epoch": 0.257, + "grad_norm": 3.465364197895299, + "k1_kl": 0.07568359375, + "k3_kl": 0.04443359375, + "kimi_kl": 0.09033203125, + "learning_rate": 3.7149999999999996e-07, + "loss": 0.0018, + "ppl": 0.091796875, + "reward": 0.8887587189674377, + "reward_std": 0.008509562350809574, + "rewards/perpo_ocr_edit_distance_reward": 0.8887588381767273, "step": 1285, "temperature": 0.9 }, { - "advantages": -2.4821076749503845e-06, - "completion_length": 540.0, - "delta_ref_entropy_loss": 0.0863037109375, - "delta_ref_ppl": -0.0555419921875, - "entropy_loss": -0.098876953125, - "epoch": 0.5144, - "grad_norm": 1.0520363068916787, - "k1_kl": 0.0555419921875, - "k3_kl": 0.03009033203125, - "kimi_kl": 0.067626953125, - "learning_rate": 2.428e-07, - "loss": 0.0012, - "ppl": 0.052734375, - "reward": 0.9019427597522736, - "reward_std": 0.003083254676312208, - "rewards/perpo_ocr_edit_distance_reward": 0.9019428193569183, + "advantages": 9.093966582440771e-06, + "completion_length": 549.0, + "delta_ref_entropy_loss": 0.10107421875, + "delta_ref_ppl": -0.095703125, + "entropy_loss": -0.04150390625, + "epoch": 0.2572, + "grad_norm": 0.6505012946283546, + "k1_kl": 0.095703125, + "k3_kl": 0.057373046875, + "kimi_kl": 0.17578125, + "learning_rate": 3.714e-07, + "loss": 0.0023, + "ppl": 0.017578125, + "reward": 0.9851434230804443, + "reward_std": 0.0008365713292732835, + "rewards/perpo_ocr_edit_distance_reward": 0.9851434230804443, "step": 1286, "temperature": 0.9 }, { - "advantages": -0.00016657370360917412, - "completion_length": 894.5, - "delta_ref_entropy_loss": 0.0421142578125, - "delta_ref_ppl": -0.023681640625, - "entropy_loss": -0.02783203125, - "epoch": 0.5148, - "grad_norm": 1.3165496231411722, - "k1_kl": 0.02374267578125, - "k3_kl": 0.0128173828125, - "kimi_kl": 0.03826904296875, - "learning_rate": 2.426e-07, - "loss": 0.0007, - "ppl": 0.0181884765625, - "reward": 0.9984384477138519, - "reward_std": 0.00042887588642770424, - "rewards/perpo_ocr_edit_distance_reward": 0.9984385371208191, + "advantages": -4.801580143976025e-05, + "completion_length": 714.0, + "delta_ref_entropy_loss": 0.06591796875, + "delta_ref_ppl": -0.0966796875, + "entropy_loss": -0.0439453125, + "epoch": 0.2574, + "grad_norm": 0.6462447865302859, + "k1_kl": 0.0966796875, + "k3_kl": 0.06298828125, + "kimi_kl": 0.1884765625, + "learning_rate": 3.713e-07, + "loss": 0.0026, + "ppl": 0.020751953125, + "reward": 0.9930378198623657, + "reward_std": 0.0011411068262532353, + "rewards/perpo_ocr_edit_distance_reward": 0.9930379390716553, "step": 1287, "temperature": 0.9 }, { - "advantages": -1.2589352991199121e-05, - "completion_length": 493.0, - "delta_ref_entropy_loss": 0.047607421875, - "delta_ref_ppl": -0.0380859375, - "entropy_loss": -0.052001953125, - "epoch": 0.5152, - "grad_norm": 0.7081492321535265, - "k1_kl": 0.0379638671875, - "k3_kl": 0.021240234375, - "kimi_kl": 0.0496826171875, - "learning_rate": 2.424e-07, - "loss": 0.0009, - "ppl": 0.02593994140625, - "reward": 0.9856480956077576, - "reward_std": 0.0015043112798593938, - "rewards/perpo_ocr_edit_distance_reward": 0.9856481552124023, + "advantages": -8.953469659900293e-06, + "completion_length": 356.0, + "delta_ref_entropy_loss": 0.1845703125, + "delta_ref_ppl": -0.1416015625, + "entropy_loss": -0.1640625, + "epoch": 0.2576, + "grad_norm": 2.350773909739451, + "k1_kl": 0.1416015625, + "k3_kl": 0.0810546875, + "kimi_kl": 0.1953125, + "learning_rate": 3.7119999999999994e-07, + "loss": 0.0032, + "ppl": 0.0908203125, + "reward": 0.8773733973503113, + "reward_std": 0.004658096935600042, + "rewards/perpo_ocr_edit_distance_reward": 0.877373456954956, "step": 1288, "temperature": 0.9 }, { - "advantages": -2.9027463050113056e-05, - "completion_length": 520.5, - "delta_ref_entropy_loss": 0.0450439453125, - "delta_ref_ppl": -0.03338623046875, - "entropy_loss": -0.055023193359375, - "epoch": 0.5156, - "grad_norm": 2.378086145684954, - "k1_kl": 0.03350830078125, - "k3_kl": 0.018157958984375, - "kimi_kl": 0.044403076171875, - "learning_rate": 2.422e-07, - "loss": 0.0008, - "ppl": 0.0270843505859375, - "reward": 0.9064868092536926, - "reward_std": 0.06706207469687797, - "rewards/perpo_ocr_edit_distance_reward": 0.9064868986606598, + "advantages": -1.1920928955078125e-07, + "completion_length": 313.0, + "delta_ref_entropy_loss": 0.150390625, + "delta_ref_ppl": -0.1767578125, + "entropy_loss": -0.1845703125, + "epoch": 0.2578, + "grad_norm": 5.194982541334957, + "k1_kl": 0.1767578125, + "k3_kl": 0.11572265625, + "kimi_kl": 0.376953125, + "learning_rate": 3.711e-07, + "loss": 0.0046, + "ppl": 0.0673828125, + "reward": 0.8205382227897644, + "reward_std": 0.2911282479763031, + "rewards/perpo_ocr_edit_distance_reward": 0.820538341999054, "step": 1289, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 376.0, - "delta_ref_entropy_loss": 0.043701171875, - "delta_ref_ppl": -0.03216552734375, - "entropy_loss": -0.018646240234375, - "epoch": 0.516, - "grad_norm": 0.19436201468189185, - "k1_kl": 0.03216552734375, - "k3_kl": 0.0172119140625, - "kimi_kl": 0.032806396484375, - "learning_rate": 2.4199999999999997e-07, - "loss": 0.0007, - "ppl": 0.00897216796875, - "reward": 0.9862455129623413, - "reward_std": 0.00020797736942768097, - "rewards/perpo_ocr_edit_distance_reward": 0.9862455427646637, + "advantages": -9.377513924846426e-05, + "completion_length": 425.0, + "delta_ref_entropy_loss": 0.056396484375, + "delta_ref_ppl": -0.047119140625, + "entropy_loss": -0.01080322265625, + "epoch": 0.258, + "grad_norm": 0.3335983970951025, + "k1_kl": 0.047119140625, + "k3_kl": 0.026123046875, + "kimi_kl": 0.06640625, + "learning_rate": 3.71e-07, + "loss": 0.0011, + "ppl": 0.002532958984375, + "reward": 0.9976537823677063, + "reward_std": 0.00017242766625713557, + "rewards/perpo_ocr_edit_distance_reward": 0.9976538419723511, "step": 1290, "temperature": 0.9 }, { - "advantages": -7.131270467652939e-05, - "completion_length": 806.0, - "delta_ref_entropy_loss": 0.04461669921875, - "delta_ref_ppl": -0.022979736328125, - "entropy_loss": -0.032928466796875, - "epoch": 0.5164, - "grad_norm": 0.3956883999703251, - "k1_kl": 0.02301025390625, - "k3_kl": 0.011383056640625, - "kimi_kl": 0.0242919921875, - "learning_rate": 2.4179999999999995e-07, - "loss": 0.0005, - "ppl": 0.016754150390625, - "reward": 0.9864567220211029, - "reward_std": 0.0005695502914022654, - "rewards/perpo_ocr_edit_distance_reward": 0.9864567816257477, + "advantages": -3.3548901683388976e-06, + "completion_length": 602.0, + "delta_ref_entropy_loss": 0.1904296875, + "delta_ref_ppl": -0.13671875, + "entropy_loss": -0.2255859375, + "epoch": 0.2582, + "grad_norm": 3.038013514552557, + "k1_kl": 0.13671875, + "k3_kl": 0.0771484375, + "kimi_kl": 0.1806640625, + "learning_rate": 3.7089999999999997e-07, + "loss": 0.0031, + "ppl": 0.12109375, + "reward": 0.6087392568588257, + "reward_std": 0.007526048459112644, + "rewards/perpo_ocr_edit_distance_reward": 0.6087393164634705, "step": 1291, "temperature": 0.9 }, { - "advantages": -3.654616271830946e-05, - "completion_length": 843.0, - "delta_ref_entropy_loss": 0.0108642578125, - "delta_ref_ppl": -0.015869140625, - "entropy_loss": -0.03564453125, - "epoch": 0.5168, - "grad_norm": 0.6177559620368542, - "k1_kl": 0.015838623046875, - "k3_kl": 0.00927734375, - "kimi_kl": 0.0228729248046875, - "learning_rate": 2.416e-07, - "loss": 0.0004, - "ppl": 0.02069091796875, - "reward": 0.9598882496356964, - "reward_std": 0.09421068250958342, - "rewards/perpo_ocr_edit_distance_reward": 0.9598882496356964, + "advantages": -4.32559427281376e-05, + "completion_length": 1250.0, + "delta_ref_entropy_loss": 0.04638671875, + "delta_ref_ppl": -0.041015625, + "entropy_loss": -0.042724609375, + "epoch": 0.2584, + "grad_norm": 0.9340024639221557, + "k1_kl": 0.041259765625, + "k3_kl": 0.0240478515625, + "kimi_kl": 0.05810546875, + "learning_rate": 3.708e-07, + "loss": 0.001, + "ppl": 0.0211181640625, + "reward": 0.9867849349975586, + "reward_std": 0.0010804363992065191, + "rewards/perpo_ocr_edit_distance_reward": 0.9867849946022034, "step": 1292, "temperature": 0.9 }, { - "advantages": -5.0732069212244824e-05, - "completion_length": 616.0, - "delta_ref_entropy_loss": 0.02447509765625, - "delta_ref_ppl": -0.02752685546875, - "entropy_loss": -0.02093505859375, - "epoch": 0.5172, - "grad_norm": 0.15395604863988113, - "k1_kl": 0.02740478515625, - "k3_kl": 0.0194091796875, - "kimi_kl": 0.0655517578125, - "learning_rate": 2.414e-07, - "loss": 0.0008, - "ppl": 0.0099639892578125, - "reward": 0.8214285373687744, - "reward_std": 0.0002017974911723286, - "rewards/perpo_ocr_edit_distance_reward": 0.8214285969734192, + "advantages": 0.0, + "completion_length": 805.0, + "delta_ref_entropy_loss": 0.038818359375, + "delta_ref_ppl": -0.055419921875, + "entropy_loss": -0.0308837890625, + "epoch": 0.2586, + "grad_norm": 0.5301219946658893, + "k1_kl": 0.055419921875, + "k3_kl": 0.035400390625, + "kimi_kl": 0.1044921875, + "learning_rate": 3.7069999999999995e-07, + "loss": 0.0014, + "ppl": 0.012939453125, + "reward": 0.9837740659713745, + "reward_std": 0.005029510241001844, + "rewards/perpo_ocr_edit_distance_reward": 0.9837740659713745, "step": 1293, "temperature": 0.9 }, { - "advantages": -0.00030838804650556995, - "completion_length": 414.0, - "delta_ref_entropy_loss": 0.084716796875, - "delta_ref_ppl": -0.1087646484375, - "entropy_loss": -0.06390380859375, - "epoch": 0.5176, - "grad_norm": 0.17858749102499408, - "k1_kl": 0.1087646484375, - "k3_kl": 0.07281494140625, - "kimi_kl": 0.181884765625, - "learning_rate": 2.4119999999999996e-07, - "loss": 0.0032, - "ppl": 0.029022216796875, - "reward": 0.8973177969455719, - "reward_std": 0.00015539555170107633, - "rewards/perpo_ocr_edit_distance_reward": 0.8973178565502167, + "advantages": 2.1815301806782372e-05, + "completion_length": 883.0, + "delta_ref_entropy_loss": 0.05419921875, + "delta_ref_ppl": -0.043701171875, + "entropy_loss": -0.0595703125, + "epoch": 0.2588, + "grad_norm": 0.6687314891462001, + "k1_kl": 0.043701171875, + "k3_kl": 0.0233154296875, + "kimi_kl": 0.059326171875, + "learning_rate": 3.7059999999999994e-07, + "loss": 0.0009, + "ppl": 0.0283203125, + "reward": 0.9898455739021301, + "reward_std": 0.0010718146804720163, + "rewards/perpo_ocr_edit_distance_reward": 0.9898455739021301, "step": 1294, "temperature": 0.9 }, { - "advantages": -2.6796546080731787e-05, - "completion_length": 653.5, - "delta_ref_entropy_loss": 0.0330810546875, - "delta_ref_ppl": -0.0447540283203125, - "entropy_loss": -0.0379638671875, - "epoch": 0.518, - "grad_norm": 0.40735084595065213, - "k1_kl": 0.044769287109375, - "k3_kl": 0.0270538330078125, - "kimi_kl": 0.07171440124511719, - "learning_rate": 2.41e-07, - "loss": 0.0011, - "ppl": 0.018524169921875, - "reward": 0.9845029711723328, - "reward_std": 0.0002677480224519968, - "rewards/perpo_ocr_edit_distance_reward": 0.9845029711723328, + "advantages": -8.855547548591858e-07, + "completion_length": 745.0, + "delta_ref_entropy_loss": 0.10693359375, + "delta_ref_ppl": -0.07861328125, + "entropy_loss": -0.05419921875, + "epoch": 0.259, + "grad_norm": 1.0475855697220504, + "k1_kl": 0.07861328125, + "k3_kl": 0.04443359375, + "kimi_kl": 0.138671875, + "learning_rate": 3.705e-07, + "loss": 0.0018, + "ppl": 0.02490234375, + "reward": 0.7897321581840515, + "reward_std": 0.05704526975750923, + "rewards/perpo_ocr_edit_distance_reward": 0.7897322773933411, "step": 1295, "temperature": 0.9 }, { - "advantages": 2.425057625998761e-05, - "completion_length": 627.0, - "delta_ref_entropy_loss": 0.06884765625, - "delta_ref_ppl": -0.036041259765625, - "entropy_loss": -0.1201171875, - "epoch": 0.5184, - "grad_norm": 0.8522026883684021, - "k1_kl": 0.036041259765625, - "k3_kl": 0.0154571533203125, - "kimi_kl": 0.0301055908203125, - "learning_rate": 2.408e-07, - "loss": 0.0006, - "ppl": 0.0628662109375, - "reward": 0.8887720704078674, - "reward_std": 0.10037063524941914, - "rewards/perpo_ocr_edit_distance_reward": 0.8887721002101898, + "advantages": -4.26088081439957e-05, + "completion_length": 242.0, + "delta_ref_entropy_loss": 0.07763671875, + "delta_ref_ppl": -0.1240234375, + "entropy_loss": -0.0238037109375, + "epoch": 0.2592, + "grad_norm": 0.7750384170738726, + "k1_kl": 0.1240234375, + "k3_kl": 0.0869140625, + "kimi_kl": 0.2890625, + "learning_rate": 3.704e-07, + "loss": 0.0035, + "ppl": 0.00927734375, + "reward": 0.9928571581840515, + "reward_std": 0.000699926633387804, + "rewards/perpo_ocr_edit_distance_reward": 0.9928571581840515, "step": 1296, "temperature": 0.9 }, { - "advantages": -0.00027936271180806216, - "completion_length": 684.0, - "delta_ref_entropy_loss": 0.017425537109375, - "delta_ref_ppl": -0.011749267578125, - "entropy_loss": -0.011932373046875, - "epoch": 0.5188, - "grad_norm": 0.09047327134522692, - "k1_kl": 0.0116729736328125, - "k3_kl": 0.0069122314453125, - "kimi_kl": 0.0194091796875, - "learning_rate": 2.406e-07, - "loss": 0.0006, - "ppl": 0.00409698486328125, - "reward": 0.9852167069911957, - "reward_std": 6.408510671462864e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9852167963981628, + "advantages": -3.405979782655777e-07, + "completion_length": 92.0, + "delta_ref_entropy_loss": 0.146484375, + "delta_ref_ppl": -0.392578125, + "entropy_loss": -0.302734375, + "epoch": 0.2594, + "grad_norm": 6.781439277601064, + "k1_kl": 0.392578125, + "k3_kl": 0.30859375, + "kimi_kl": 1.09375, + "learning_rate": 3.7030000000000003e-07, + "loss": 0.0124, + "ppl": 0.13671875, + "reward": 0.34479522705078125, + "reward_std": 0.03751649335026741, + "rewards/perpo_ocr_edit_distance_reward": 0.34479525685310364, "step": 1297, "temperature": 0.9 }, { - "advantages": -1.762594592946698e-06, - "completion_length": 63.0, - "delta_ref_entropy_loss": 0.13671875, - "delta_ref_ppl": -0.150390625, - "entropy_loss": -0.17431640625, - "epoch": 0.5192, - "grad_norm": 2.6018622791092567, - "k1_kl": 0.150390625, - "k3_kl": 0.108154296875, - "kimi_kl": 0.3271484375, - "learning_rate": 2.404e-07, - "loss": 0.0043, - "ppl": 0.105224609375, - "reward": 0.9791959822177887, - "reward_std": 0.0035870354622602463, - "rewards/perpo_ocr_edit_distance_reward": 0.9791960120201111, + "advantages": -1.737049751682207e-05, + "completion_length": 1754.0, + "delta_ref_entropy_loss": 0.056884765625, + "delta_ref_ppl": -0.053955078125, + "entropy_loss": -0.09375, + "epoch": 0.2596, + "grad_norm": 15.425673678799024, + "k1_kl": 0.0537109375, + "k3_kl": 0.08984375, + "kimi_kl": 0.09716796875, + "learning_rate": 3.7019999999999997e-07, + "loss": 0.0036, + "ppl": 0.053466796875, + "reward": 0.9619489312171936, + "reward_std": 0.0023526502773165703, + "rewards/perpo_ocr_edit_distance_reward": 0.9619489908218384, "step": 1298, "temperature": 0.9 }, { - "advantages": -3.602675224101404e-05, - "completion_length": 383.5, - "delta_ref_entropy_loss": 0.0589599609375, - "delta_ref_ppl": -0.0576171875, - "entropy_loss": -0.06365966796875, - "epoch": 0.5196, - "grad_norm": 1.2441222604557811, - "k1_kl": 0.0579833984375, - "k3_kl": 0.03607177734375, - "kimi_kl": 0.095703125, - "learning_rate": 2.402e-07, - "loss": 0.0015, - "ppl": 0.0362548828125, - "reward": 0.9481878876686096, - "reward_std": 0.002861212589778006, - "rewards/perpo_ocr_edit_distance_reward": 0.9481880068778992, + "advantages": -1.7728125385474414e-05, + "completion_length": 534.0, + "delta_ref_entropy_loss": 0.039794921875, + "delta_ref_ppl": -0.0439453125, + "entropy_loss": -0.01556396484375, + "epoch": 0.2598, + "grad_norm": 0.4257467534822954, + "k1_kl": 0.044189453125, + "k3_kl": 0.0263671875, + "kimi_kl": 0.0712890625, + "learning_rate": 3.7009999999999996e-07, + "loss": 0.0011, + "ppl": 0.005767822265625, + "reward": 0.9748161435127258, + "reward_std": 0.0003806322056334466, + "rewards/perpo_ocr_edit_distance_reward": 0.9748161435127258, "step": 1299, "temperature": 0.9 }, { - "advantages": -4.06886865675915e-05, - "completion_length": 396.5, - "delta_ref_entropy_loss": 0.02386474609375, - "delta_ref_ppl": -0.03466796875, - "entropy_loss": -0.0272216796875, - "epoch": 0.52, - "grad_norm": 0.26690067170128334, - "k1_kl": 0.03472900390625, - "k3_kl": 0.02703857421875, - "kimi_kl": 0.125732421875, - "learning_rate": 2.4e-07, - "loss": 0.0011, - "ppl": 0.011932373046875, - "reward": 0.9999142289161682, - "reward_std": 0.00010695408855099231, - "rewards/perpo_ocr_edit_distance_reward": 0.9999142587184906, + "advantages": -2.5221281248377636e-05, + "completion_length": 433.0, + "delta_ref_entropy_loss": 0.0751953125, + "delta_ref_ppl": -0.09521484375, + "entropy_loss": -0.04150390625, + "epoch": 0.26, + "grad_norm": 1.3111573807641588, + "k1_kl": 0.09521484375, + "k3_kl": 0.0615234375, + "kimi_kl": 0.193359375, + "learning_rate": 3.7e-07, + "loss": 0.0025, + "ppl": 0.015380859375, + "reward": 0.9776752591133118, + "reward_std": 0.0025987927801907063, + "rewards/perpo_ocr_edit_distance_reward": 0.9776754379272461, "step": 1300, "temperature": 0.9 }, { - "advantages": -5.823267565574497e-05, - "completion_length": 374.5, - "delta_ref_entropy_loss": 0.03265380859375, - "delta_ref_ppl": -0.03594970703125, - "entropy_loss": -0.037445068359375, - "epoch": 0.5204, - "grad_norm": 3.2395369966132717, - "k1_kl": 0.035888671875, - "k3_kl": 0.02313232421875, - "kimi_kl": 0.0574951171875, - "learning_rate": 2.398e-07, - "loss": 0.001, - "ppl": 0.0203094482421875, - "reward": 0.9988665282726288, - "reward_std": 0.0016154773184098303, - "rewards/perpo_ocr_edit_distance_reward": 0.998866617679596, + "advantages": -1.1852809620904736e-05, + "completion_length": 457.0, + "delta_ref_entropy_loss": 0.07470703125, + "delta_ref_ppl": -0.07470703125, + "entropy_loss": -0.09033203125, + "epoch": 0.2602, + "grad_norm": 2.166816792880131, + "k1_kl": 0.07470703125, + "k3_kl": 0.048828125, + "kimi_kl": 0.1533203125, + "learning_rate": 3.699e-07, + "loss": 0.002, + "ppl": 0.0439453125, + "reward": 0.9267683625221252, + "reward_std": 0.00349412695504725, + "rewards/perpo_ocr_edit_distance_reward": 0.92676842212677, "step": 1301, "temperature": 0.9 }, { - "advantages": -1.7396041585016064e-05, - "completion_length": 428.0, - "delta_ref_entropy_loss": 0.03436279296875, - "delta_ref_ppl": -0.04925537109375, - "entropy_loss": -0.02099609375, - "epoch": 0.5208, - "grad_norm": 0.7835890225241597, - "k1_kl": 0.0491943359375, - "k3_kl": 0.031585693359375, - "kimi_kl": 0.0789794921875, - "learning_rate": 2.396e-07, - "loss": 0.0013, - "ppl": 0.0107879638671875, - "reward": 0.9654514789581299, - "reward_std": 0.004847641219384968, - "rewards/perpo_ocr_edit_distance_reward": 0.9654515087604523, + "advantages": -2.3007394702290185e-05, + "completion_length": 484.0, + "delta_ref_entropy_loss": 0.103515625, + "delta_ref_ppl": -0.11083984375, + "entropy_loss": -0.037353515625, + "epoch": 0.2604, + "grad_norm": 0.43298738642732587, + "k1_kl": 0.11083984375, + "k3_kl": 0.07373046875, + "kimi_kl": 0.26953125, + "learning_rate": 3.698e-07, + "loss": 0.003, + "ppl": 0.0133056640625, + "reward": 0.9908557534217834, + "reward_std": 0.001010359264910221, + "rewards/perpo_ocr_edit_distance_reward": 0.990855872631073, "step": 1302, "temperature": 0.9 }, { - "advantages": -8.584984971093945e-05, - "completion_length": 714.0, - "delta_ref_entropy_loss": 0.03277587890625, - "delta_ref_ppl": -0.0230712890625, - "entropy_loss": -0.0299072265625, - "epoch": 0.5212, - "grad_norm": 0.5431174670833174, - "k1_kl": 0.0230712890625, - "k3_kl": 0.013397216796875, - "kimi_kl": 0.03228759765625, - "learning_rate": 2.394e-07, - "loss": 0.0006, - "ppl": 0.0164794921875, - "reward": 0.9890462458133698, - "reward_std": 0.0007053587469272316, - "rewards/perpo_ocr_edit_distance_reward": 0.9890463054180145, + "advantages": -5.023820222049835e-07, + "completion_length": 709.0, + "delta_ref_entropy_loss": 0.09912109375, + "delta_ref_ppl": -0.12353515625, + "entropy_loss": -0.1474609375, + "epoch": 0.2606, + "grad_norm": 2.7565646475912717, + "k1_kl": 0.12353515625, + "k3_kl": 0.09033203125, + "kimi_kl": 0.298828125, + "learning_rate": 3.697e-07, + "loss": 0.0036, + "ppl": 0.072265625, + "reward": 0.8954753875732422, + "reward_std": 0.16283074021339417, + "rewards/perpo_ocr_edit_distance_reward": 0.895475447177887, "step": 1303, "temperature": 0.9 }, { - "advantages": -1.1937959527585917e-05, - "completion_length": 1042.0, - "delta_ref_entropy_loss": 0.030517578125, - "delta_ref_ppl": -0.017425537109375, - "entropy_loss": -0.06689453125, - "epoch": 0.5216, - "grad_norm": 3.7308076200291764, - "k1_kl": 0.0174560546875, - "k3_kl": 0.0116424560546875, - "kimi_kl": 0.0235443115234375, - "learning_rate": 2.3919999999999997e-07, - "loss": 0.0005, - "ppl": 0.0382080078125, - "reward": 0.9038077592849731, - "reward_std": 0.029206613427959383, - "rewards/perpo_ocr_edit_distance_reward": 0.9038078188896179, + "advantages": -6.386212135112146e-06, + "completion_length": 572.0, + "delta_ref_entropy_loss": 0.07958984375, + "delta_ref_ppl": -0.09375, + "entropy_loss": -0.03369140625, + "epoch": 0.2608, + "grad_norm": 3.44356714780972, + "k1_kl": 0.09375, + "k3_kl": 0.061767578125, + "kimi_kl": 0.1962890625, + "learning_rate": 3.696e-07, + "loss": 0.0025, + "ppl": 0.0185546875, + "reward": 0.9654015302658081, + "reward_std": 0.015927111729979515, + "rewards/perpo_ocr_edit_distance_reward": 0.9654016494750977, "step": 1304, "temperature": 0.9 }, { - "advantages": -7.095081673469394e-05, - "completion_length": 711.0, - "delta_ref_entropy_loss": 0.028564453125, - "delta_ref_ppl": -0.0184326171875, - "entropy_loss": -0.03338623046875, - "epoch": 0.522, - "grad_norm": 0.5957371732410439, - "k1_kl": 0.018310546875, - "k3_kl": 0.009490966796875, - "kimi_kl": 0.02337646484375, - "learning_rate": 2.3899999999999996e-07, - "loss": 0.0004, - "ppl": 0.0165252685546875, - "reward": 0.9962391555309296, - "reward_std": 0.00035144253342878073, - "rewards/perpo_ocr_edit_distance_reward": 0.9962392449378967, + "advantages": 0.0, + "completion_length": 523.0, + "delta_ref_entropy_loss": 0.1025390625, + "delta_ref_ppl": -0.1142578125, + "entropy_loss": -0.1884765625, + "epoch": 0.261, + "grad_norm": 1.3530193742714316, + "k1_kl": 0.1142578125, + "k3_kl": 0.0751953125, + "kimi_kl": 0.20703125, + "learning_rate": 3.6949999999999997e-07, + "loss": 0.003, + "ppl": 0.0927734375, + "reward": 0.7286285161972046, + "reward_std": 0.1752457469701767, + "rewards/perpo_ocr_edit_distance_reward": 0.7286284565925598, "step": 1305, "temperature": 0.9 }, { - "advantages": -9.26426559999527e-06, - "completion_length": 479.5, - "delta_ref_entropy_loss": 0.08349609375, - "delta_ref_ppl": -0.0618896484375, - "entropy_loss": -0.092041015625, - "epoch": 0.5224, - "grad_norm": 1.2989266326641433, - "k1_kl": 0.0615234375, - "k3_kl": 0.0322265625, - "kimi_kl": 0.0791015625, - "learning_rate": 2.388e-07, - "loss": 0.0013, - "ppl": 0.04931640625, - "reward": 0.9309160113334656, - "reward_std": 0.012715761549770832, - "rewards/perpo_ocr_edit_distance_reward": 0.9309160709381104, + "advantages": 0.0, + "completion_length": 225.0, + "delta_ref_entropy_loss": 0.0888671875, + "delta_ref_ppl": -0.130859375, + "entropy_loss": -0.025634765625, + "epoch": 0.2612, + "grad_norm": 0.023645733556180524, + "k1_kl": 0.130859375, + "k3_kl": 0.08740234375, + "kimi_kl": 0.279296875, + "learning_rate": 3.694e-07, + "loss": 0.0035, + "ppl": 0.007598876953125, + "reward": 0.791304349899292, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.791304349899292, "step": 1306, "temperature": 0.9 }, { - "advantages": -3.8934608141971694e-05, - "completion_length": 411.5, - "delta_ref_entropy_loss": 0.0394287109375, - "delta_ref_ppl": -0.07568359375, - "entropy_loss": -0.06817626953125, - "epoch": 0.5228, - "grad_norm": 1.5238814383766979, - "k1_kl": 0.07568359375, - "k3_kl": 0.058837890625, - "kimi_kl": 0.2568359375, - "learning_rate": 2.386e-07, - "loss": 0.0024, - "ppl": 0.03790283203125, - "reward": 0.9046781659126282, - "reward_std": 0.009869664325378835, - "rewards/perpo_ocr_edit_distance_reward": 0.9046782553195953, + "advantages": -6.277221109485254e-05, + "completion_length": 866.0, + "delta_ref_entropy_loss": 0.054443359375, + "delta_ref_ppl": -0.05126953125, + "entropy_loss": -0.0196533203125, + "epoch": 0.2614, + "grad_norm": 1.2684256511524796, + "k1_kl": 0.05126953125, + "k3_kl": 0.02490234375, + "kimi_kl": 0.06689453125, + "learning_rate": 3.693e-07, + "loss": 0.0011, + "ppl": 0.006622314453125, + "reward": 0.9965398907661438, + "reward_std": 0.00044273128150962293, + "rewards/perpo_ocr_edit_distance_reward": 0.9965399503707886, "step": 1307, "temperature": 0.9 }, { - "advantages": 7.024835213087499e-06, - "completion_length": 583.0, - "delta_ref_entropy_loss": 0.0377197265625, - "delta_ref_ppl": -0.050537109375, - "entropy_loss": -0.031280517578125, - "epoch": 0.5232, - "grad_norm": 0.5037159677922988, - "k1_kl": 0.050537109375, - "k3_kl": 0.03704833984375, - "kimi_kl": 0.17626953125, - "learning_rate": 2.384e-07, - "loss": 0.0015, - "ppl": 0.01495361328125, - "reward": 0.9996731877326965, - "reward_std": 0.0003894512992701493, - "rewards/perpo_ocr_edit_distance_reward": 0.9996732771396637, + "advantages": -1.374312887492124e-05, + "completion_length": 1598.0, + "delta_ref_entropy_loss": 0.021240234375, + "delta_ref_ppl": -0.0233154296875, + "entropy_loss": -0.0101318359375, + "epoch": 0.2616, + "grad_norm": 1.6671080910847804, + "k1_kl": 0.0233154296875, + "k3_kl": 0.018310546875, + "kimi_kl": 0.046630859375, + "learning_rate": 3.6919999999999994e-07, + "loss": 0.0007, + "ppl": 0.00537109375, + "reward": 0.8177139759063721, + "reward_std": 0.002380700083449483, + "rewards/perpo_ocr_edit_distance_reward": 0.8177140355110168, "step": 1308, "temperature": 0.9 }, { - "advantages": -7.033348538243445e-06, - "completion_length": 622.5, - "delta_ref_entropy_loss": 0.062744140625, - "delta_ref_ppl": -0.04248046875, - "entropy_loss": -0.0703125, - "epoch": 0.5236, - "grad_norm": 0.8159158549822038, - "k1_kl": 0.04248046875, - "k3_kl": 0.02093505859375, - "kimi_kl": 0.0458984375, - "learning_rate": 2.3819999999999998e-07, - "loss": 0.0008, - "ppl": 0.038330078125, - "reward": 0.9457544684410095, - "reward_std": 0.006564292940311134, - "rewards/perpo_ocr_edit_distance_reward": 0.9457545876502991, + "advantages": -0.00010007620585383847, + "completion_length": 1428.0, + "delta_ref_entropy_loss": 0.020263671875, + "delta_ref_ppl": -0.0322265625, + "entropy_loss": -0.0185546875, + "epoch": 0.2618, + "grad_norm": 0.5050227259623943, + "k1_kl": 0.0322265625, + "k3_kl": 0.022705078125, + "kimi_kl": 0.06689453125, + "learning_rate": 3.691e-07, + "loss": 0.001, + "ppl": 0.0067138671875, + "reward": 0.9896594882011414, + "reward_std": 0.0003254091425333172, + "rewards/perpo_ocr_edit_distance_reward": 0.9896595478057861, "step": 1309, "temperature": 0.9 }, { - "advantages": -0.0002047802809101995, - "completion_length": 564.5, - "delta_ref_entropy_loss": 0.03912353515625, - "delta_ref_ppl": -0.04217529296875, - "entropy_loss": -0.0400390625, - "epoch": 0.524, - "grad_norm": 0.4820888036051585, - "k1_kl": 0.0421142578125, - "k3_kl": 0.02667236328125, - "kimi_kl": 0.09423828125, - "learning_rate": 2.38e-07, - "loss": 0.0013, - "ppl": 0.01959228515625, - "reward": 0.997576117515564, - "reward_std": 0.0005084607983008027, - "rewards/perpo_ocr_edit_distance_reward": 0.9975762665271759, + "advantages": -2.043587983280304e-06, + "completion_length": 536.0, + "delta_ref_entropy_loss": 0.236328125, + "delta_ref_ppl": -0.16015625, + "entropy_loss": -0.216796875, + "epoch": 0.262, + "grad_norm": 2.0781185404224654, + "k1_kl": 0.16015625, + "k3_kl": 0.08984375, + "kimi_kl": 0.21875, + "learning_rate": 3.69e-07, + "loss": 0.0036, + "ppl": 0.11572265625, + "reward": 0.8024322986602783, + "reward_std": 0.004087510518729687, + "rewards/perpo_ocr_edit_distance_reward": 0.8024323582649231, "step": 1310, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 518.0, - "delta_ref_entropy_loss": 0.02130126953125, - "delta_ref_ppl": -0.01873779296875, - "entropy_loss": -0.01239013671875, - "epoch": 0.5244, - "grad_norm": 0.010785312525357775, - "k1_kl": 0.0186767578125, - "k3_kl": 0.010711669921875, - "kimi_kl": 0.0291748046875, - "learning_rate": 2.378e-07, - "loss": 0.0004, - "ppl": 0.0054473876953125, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": -1.3351441339182202e-05, + "completion_length": 220.0, + "delta_ref_entropy_loss": 0.10009765625, + "delta_ref_ppl": -0.1416015625, + "entropy_loss": -0.044189453125, + "epoch": 0.2622, + "grad_norm": 2.296767615068945, + "k1_kl": 0.140625, + "k3_kl": 0.09228515625, + "kimi_kl": 0.271484375, + "learning_rate": 3.689e-07, + "loss": 0.0037, + "ppl": 0.017333984375, + "reward": 0.9861031174659729, + "reward_std": 0.0030896493699401617, + "rewards/perpo_ocr_edit_distance_reward": 0.9861032366752625, "step": 1311, "temperature": 0.9 }, { - "advantages": -8.174351933121216e-07, - "completion_length": 317.0, - "delta_ref_entropy_loss": 0.0936279296875, - "delta_ref_ppl": -0.07659912109375, - "entropy_loss": -0.11529541015625, - "epoch": 0.5248, - "grad_norm": 1.0668687622401258, - "k1_kl": 0.0765380859375, - "k3_kl": 0.042510986328125, - "kimi_kl": 0.1185302734375, - "learning_rate": 2.3759999999999998e-07, - "loss": 0.0017, - "ppl": 0.0550994873046875, - "reward": 0.9490856230258942, - "reward_std": 0.025970248505473137, - "rewards/perpo_ocr_edit_distance_reward": 0.9490856528282166, + "advantages": -2.8337752155493945e-05, + "completion_length": 357.0, + "delta_ref_entropy_loss": 0.09716796875, + "delta_ref_ppl": -0.111328125, + "entropy_loss": -0.0244140625, + "epoch": 0.2624, + "grad_norm": 0.7580385542844565, + "k1_kl": 0.11181640625, + "k3_kl": 0.0654296875, + "kimi_kl": 0.1669921875, + "learning_rate": 3.688e-07, + "loss": 0.0026, + "ppl": 0.00958251953125, + "reward": 0.9769665002822876, + "reward_std": 0.002003959147259593, + "rewards/perpo_ocr_edit_distance_reward": 0.9769665598869324, "step": 1312, "temperature": 0.9 }, { - "advantages": -5.963870580671937e-05, - "completion_length": 738.5, - "delta_ref_entropy_loss": 0.0286865234375, - "delta_ref_ppl": -0.02490234375, - "entropy_loss": -0.03460693359375, - "epoch": 0.5252, - "grad_norm": 0.914323033811223, - "k1_kl": 0.02484130859375, - "k3_kl": 0.015380859375, - "kimi_kl": 0.05181884765625, - "learning_rate": 2.374e-07, - "loss": 0.0007, - "ppl": 0.016082763671875, - "reward": 0.9975229799747467, - "reward_std": 0.0017931708935066126, - "rewards/perpo_ocr_edit_distance_reward": 0.9975230395793915, + "advantages": -6.0328417021082714e-05, + "completion_length": 1027.0, + "delta_ref_entropy_loss": 0.023681640625, + "delta_ref_ppl": -0.0230712890625, + "entropy_loss": -0.0244140625, + "epoch": 0.2626, + "grad_norm": 0.2588970983070705, + "k1_kl": 0.0230712890625, + "k3_kl": 0.013427734375, + "kimi_kl": 0.038330078125, + "learning_rate": 3.687e-07, + "loss": 0.0006, + "ppl": 0.00994873046875, + "reward": 0.9976580739021301, + "reward_std": 0.00018226070096716285, + "rewards/perpo_ocr_edit_distance_reward": 0.9976580739021301, "step": 1313, "temperature": 0.9 }, { - "advantages": -5.986009649738833e-06, - "completion_length": 235.5, - "delta_ref_entropy_loss": 0.06201171875, - "delta_ref_ppl": -0.2548828125, - "entropy_loss": -0.10498046875, - "epoch": 0.5256, - "grad_norm": 4.867835469727697, - "k1_kl": 0.2547607421875, - "k3_kl": 0.20538330078125, - "kimi_kl": 0.805908203125, - "learning_rate": 2.3719999999999998e-07, - "loss": 0.0082, - "ppl": 0.060302734375, - "reward": 0.5189149081707001, - "reward_std": 0.0013304125750437379, - "rewards/perpo_ocr_edit_distance_reward": 0.5189149118959904, + "advantages": -1.498631149843277e-06, + "completion_length": 1639.0, + "delta_ref_entropy_loss": 0.08056640625, + "delta_ref_ppl": -0.064453125, + "entropy_loss": -0.0908203125, + "epoch": 0.2628, + "grad_norm": 5.334010769058515, + "k1_kl": 0.064453125, + "k3_kl": 0.0390625, + "kimi_kl": 0.0703125, + "learning_rate": 3.6859999999999995e-07, + "loss": 0.0016, + "ppl": 0.051025390625, + "reward": 0.6641461849212646, + "reward_std": 0.02851751632988453, + "rewards/perpo_ocr_edit_distance_reward": 0.6641462445259094, "step": 1314, "temperature": 0.9 }, { - "advantages": -6.684235358989099e-05, - "completion_length": 339.0, - "delta_ref_entropy_loss": 0.049072265625, - "delta_ref_ppl": -0.0531005859375, - "entropy_loss": -0.02520751953125, - "epoch": 0.526, - "grad_norm": 0.5805797616422349, - "k1_kl": 0.0531005859375, - "k3_kl": 0.03558349609375, - "kimi_kl": 0.0928955078125, - "learning_rate": 2.3699999999999996e-07, - "loss": 0.0015, - "ppl": 0.012786865234375, - "reward": 0.9990471005439758, - "reward_std": 0.0011320815538056195, - "rewards/perpo_ocr_edit_distance_reward": 0.9990471601486206, + "advantages": -2.895082786835701e-07, + "completion_length": 1337.0, + "delta_ref_entropy_loss": 0.06640625, + "delta_ref_ppl": -0.0537109375, + "entropy_loss": -0.09912109375, + "epoch": 0.263, + "grad_norm": 3.200398241926439, + "k1_kl": 0.0537109375, + "k3_kl": 0.0296630859375, + "kimi_kl": 0.0576171875, + "learning_rate": 3.685e-07, + "loss": 0.0012, + "ppl": 0.049560546875, + "reward": 0.8001798987388611, + "reward_std": 0.12063043564558029, + "rewards/perpo_ocr_edit_distance_reward": 0.8001800179481506, "step": 1315, "temperature": 0.9 }, { - "advantages": -4.044601169539419e-06, - "completion_length": 1627.0, - "delta_ref_entropy_loss": 0.029052734375, - "delta_ref_ppl": -0.0144500732421875, - "entropy_loss": -0.0477294921875, - "epoch": 0.5264, - "grad_norm": 0.9083137995945505, - "k1_kl": 0.0144500732421875, - "k3_kl": 0.00920867919921875, - "kimi_kl": 0.01514434814453125, - "learning_rate": 2.368e-07, - "loss": 0.0004, - "ppl": 0.025299072265625, - "reward": 0.8898071050643921, - "reward_std": 0.07841686089523137, - "rewards/perpo_ocr_edit_distance_reward": 0.8898071646690369, + "advantages": -8.063657332968432e-06, + "completion_length": 1227.0, + "delta_ref_entropy_loss": 0.031982421875, + "delta_ref_ppl": -0.032470703125, + "entropy_loss": -0.019287109375, + "epoch": 0.2632, + "grad_norm": 0.4153170794067065, + "k1_kl": 0.032470703125, + "k3_kl": 0.0177001953125, + "kimi_kl": 0.041015625, + "learning_rate": 3.684e-07, + "loss": 0.0007, + "ppl": 0.007354736328125, + "reward": 0.9952051639556885, + "reward_std": 0.004122433718293905, + "rewards/perpo_ocr_edit_distance_reward": 0.9952052235603333, "step": 1316, "temperature": 0.9 }, { - "advantages": -5.3720817959401757e-05, - "completion_length": 604.5, - "delta_ref_entropy_loss": 0.03363037109375, - "delta_ref_ppl": -0.02386474609375, - "entropy_loss": -0.027587890625, - "epoch": 0.5268, - "grad_norm": 0.3863477498001348, - "k1_kl": 0.02392578125, - "k3_kl": 0.0133056640625, - "kimi_kl": 0.030517578125, - "learning_rate": 2.366e-07, - "loss": 0.0006, - "ppl": 0.0154571533203125, - "reward": 0.9962545037269592, - "reward_std": 0.0003463429748080671, - "rewards/perpo_ocr_edit_distance_reward": 0.9962545335292816, + "advantages": -2.2309168343781494e-06, + "completion_length": 61.0, + "delta_ref_entropy_loss": 0.244140625, + "delta_ref_ppl": -0.51171875, + "entropy_loss": -0.203125, + "epoch": 0.2634, + "grad_norm": 3.8911644499838856, + "k1_kl": 0.51171875, + "k3_kl": 0.380859375, + "kimi_kl": 1.328125, + "learning_rate": 3.683e-07, + "loss": 0.0152, + "ppl": 0.083984375, + "reward": 0.7363530397415161, + "reward_std": 0.0036574576515704393, + "rewards/perpo_ocr_edit_distance_reward": 0.7363530993461609, "step": 1317, "temperature": 0.9 }, { - "advantages": 6.301062853708572e-07, - "completion_length": 327.0, - "delta_ref_entropy_loss": 0.0291748046875, - "delta_ref_ppl": -0.02825927734375, - "entropy_loss": -0.026153564453125, - "epoch": 0.5272, - "grad_norm": 1.0032874334874868, - "k1_kl": 0.0283203125, - "k3_kl": 0.019775390625, - "kimi_kl": 0.054443359375, - "learning_rate": 2.364e-07, - "loss": 0.0008, - "ppl": 0.01434326171875, - "reward": 0.995886892080307, - "reward_std": 0.006666550878435373, - "rewards/perpo_ocr_edit_distance_reward": 0.995886892080307, + "advantages": -4.598924351739697e-05, + "completion_length": 548.0, + "delta_ref_entropy_loss": 0.06591796875, + "delta_ref_ppl": -0.06884765625, + "entropy_loss": -0.0191650390625, + "epoch": 0.2636, + "grad_norm": 0.4021923177529885, + "k1_kl": 0.06884765625, + "k3_kl": 0.04296875, + "kimi_kl": 0.1435546875, + "learning_rate": 3.6820000000000003e-07, + "loss": 0.0018, + "ppl": 0.00567626953125, + "reward": 0.9874446392059326, + "reward_std": 0.0008254441199824214, + "rewards/perpo_ocr_edit_distance_reward": 0.9874447584152222, "step": 1318, "temperature": 0.9 }, { - "advantages": -0.0001101195884984918, - "completion_length": 404.5, - "delta_ref_entropy_loss": 0.05224609375, - "delta_ref_ppl": -0.0357666015625, - "entropy_loss": -0.031982421875, - "epoch": 0.5276, - "grad_norm": 0.17816310402996882, - "k1_kl": 0.0360107421875, - "k3_kl": 0.02545166015625, - "kimi_kl": 0.078369140625, - "learning_rate": 2.3619999999999998e-07, - "loss": 0.0011, - "ppl": 0.010650634765625, - "reward": 0.9990652203559875, - "reward_std": 6.598122126888484e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9990652799606323, + "advantages": 7.76222805143334e-05, + "completion_length": 655.0, + "delta_ref_entropy_loss": 0.035888671875, + "delta_ref_ppl": -0.031005859375, + "entropy_loss": -0.0201416015625, + "epoch": 0.2638, + "grad_norm": 0.3326889537875522, + "k1_kl": 0.031005859375, + "k3_kl": 0.0172119140625, + "kimi_kl": 0.043212890625, + "learning_rate": 3.6809999999999997e-07, + "loss": 0.0006, + "ppl": 0.00726318359375, + "reward": 0.997517466545105, + "reward_std": 0.00011942762648686767, + "rewards/perpo_ocr_edit_distance_reward": 0.997517466545105, "step": 1319, "temperature": 0.9 }, { - "advantages": -5.2473378936213066e-05, - "completion_length": 321.5, - "delta_ref_entropy_loss": 0.058349609375, - "delta_ref_ppl": -0.0462646484375, - "entropy_loss": -0.0872802734375, - "epoch": 0.528, - "grad_norm": 1.0691336002687688, - "k1_kl": 0.0462646484375, - "k3_kl": 0.0283203125, - "kimi_kl": 0.066162109375, - "learning_rate": 2.3599999999999997e-07, - "loss": 0.0012, - "ppl": 0.0487060546875, - "reward": 0.8198392391204834, - "reward_std": 0.10891722262022085, - "rewards/perpo_ocr_edit_distance_reward": 0.8198392987251282, + "advantages": -1.7029899268550253e-08, + "completion_length": 1298.0, + "delta_ref_entropy_loss": 0.13671875, + "delta_ref_ppl": -0.115234375, + "entropy_loss": -0.146484375, + "epoch": 0.264, + "grad_norm": 4.507855583130729, + "k1_kl": 0.115234375, + "k3_kl": 0.08349609375, + "kimi_kl": 0.1396484375, + "learning_rate": 3.6799999999999996e-07, + "loss": 0.0033, + "ppl": 0.0869140625, + "reward": 0.9553412199020386, + "reward_std": 0.0019219088135287166, + "rewards/perpo_ocr_edit_distance_reward": 0.9553412199020386, "step": 1320, "temperature": 0.9 }, { - "advantages": -1.662969702920236e-05, - "completion_length": 499.5, - "delta_ref_entropy_loss": 0.064697265625, - "delta_ref_ppl": -0.0474853515625, - "entropy_loss": -0.109619140625, - "epoch": 0.5284, - "grad_norm": 1.533827746222626, - "k1_kl": 0.0474853515625, - "k3_kl": 0.02642822265625, - "kimi_kl": 0.088897705078125, - "learning_rate": 2.358e-07, - "loss": 0.0011, - "ppl": 0.0601806640625, - "reward": 0.8604118824005127, - "reward_std": 0.0052832585060968995, - "rewards/perpo_ocr_edit_distance_reward": 0.8604119718074799, + "advantages": -2.556187791924458e-05, + "completion_length": 361.0, + "delta_ref_entropy_loss": 0.1123046875, + "delta_ref_ppl": -0.09130859375, + "entropy_loss": -0.06884765625, + "epoch": 0.2642, + "grad_norm": 1.5369100887240317, + "k1_kl": 0.09130859375, + "k3_kl": 0.047119140625, + "kimi_kl": 0.109375, + "learning_rate": 3.679e-07, + "loss": 0.0019, + "ppl": 0.034912109375, + "reward": 0.9917171001434326, + "reward_std": 0.0012311635073274374, + "rewards/perpo_ocr_edit_distance_reward": 0.9917171001434326, "step": 1321, "temperature": 0.9 }, { - "advantages": -7.926460057205986e-05, - "completion_length": 813.5, - "delta_ref_entropy_loss": 0.026123046875, - "delta_ref_ppl": -0.023590087890625, - "entropy_loss": -0.03155517578125, - "epoch": 0.5288, - "grad_norm": 0.4437632842998506, - "k1_kl": 0.0235595703125, - "k3_kl": 0.01446533203125, - "kimi_kl": 0.03778076171875, - "learning_rate": 2.356e-07, - "loss": 0.0007, - "ppl": 0.016754150390625, - "reward": 0.9731772840023041, - "reward_std": 0.0006555180007126182, - "rewards/perpo_ocr_edit_distance_reward": 0.9731773436069489, + "advantages": -0.0005960464477539062, + "completion_length": 54.0, + "delta_ref_entropy_loss": 0.14453125, + "delta_ref_ppl": -0.33203125, + "entropy_loss": -0.04736328125, + "epoch": 0.2644, + "grad_norm": 0.047671759036951804, + "k1_kl": 0.330078125, + "k3_kl": 0.2353515625, + "kimi_kl": 0.69140625, + "learning_rate": 3.678e-07, + "loss": 0.01, + "ppl": 0.00677490234375, + "reward": 0.9516128897666931, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9516129493713379, "step": 1322, "temperature": 0.9 }, { - "advantages": -0.00033032894134521484, - "completion_length": 355.0, - "delta_ref_entropy_loss": 0.03875732421875, - "delta_ref_ppl": -0.04638671875, - "entropy_loss": -0.03851318359375, - "epoch": 0.5292, - "grad_norm": 0.4143272425886145, - "k1_kl": 0.04638671875, - "k3_kl": 0.03070068359375, - "kimi_kl": 0.09375, - "learning_rate": 2.3539999999999998e-07, - "loss": 0.0016, - "ppl": 0.0184326171875, - "reward": 0.9496392607688904, - "reward_std": 0.00027939677238464355, - "rewards/perpo_ocr_edit_distance_reward": 0.9496393799781799, + "advantages": 2.2138868871479644e-07, + "completion_length": 667.0, + "delta_ref_entropy_loss": 0.058349609375, + "delta_ref_ppl": -0.0732421875, + "entropy_loss": -0.10498046875, + "epoch": 0.2646, + "grad_norm": 1.0419999508798368, + "k1_kl": 0.0732421875, + "k3_kl": 0.0458984375, + "kimi_kl": 0.10986328125, + "learning_rate": 3.677e-07, + "loss": 0.0018, + "ppl": 0.054443359375, + "reward": 0.7887739539146423, + "reward_std": 0.15593592822551727, + "rewards/perpo_ocr_edit_distance_reward": 0.7887739539146423, "step": 1323, "temperature": 0.9 }, { - "advantages": -0.00011591401016630698, - "completion_length": 504.0, - "delta_ref_entropy_loss": 0.0374755859375, - "delta_ref_ppl": -0.03253173828125, - "entropy_loss": -0.04547119140625, - "epoch": 0.5296, - "grad_norm": 0.845555440055374, - "k1_kl": 0.03253173828125, - "k3_kl": 0.018463134765625, - "kimi_kl": 0.0474853515625, - "learning_rate": 2.352e-07, - "loss": 0.0009, - "ppl": 0.023223876953125, - "reward": 0.9943458437919617, - "reward_std": 0.0006634661549469456, - "rewards/perpo_ocr_edit_distance_reward": 0.9943459331989288, + "advantages": -8.756774332141504e-05, + "completion_length": 628.0, + "delta_ref_entropy_loss": 0.048828125, + "delta_ref_ppl": -0.04345703125, + "entropy_loss": -0.023193359375, + "epoch": 0.2648, + "grad_norm": 0.46827710926128147, + "k1_kl": 0.04345703125, + "k3_kl": 0.0263671875, + "kimi_kl": 0.06298828125, + "learning_rate": 3.676e-07, + "loss": 0.0011, + "ppl": 0.0101318359375, + "reward": 0.7812235951423645, + "reward_std": 0.00048344823881052434, + "rewards/perpo_ocr_edit_distance_reward": 0.7812236547470093, "step": 1324, "temperature": 0.9 }, { - "advantages": -6.624630799478837e-06, - "completion_length": 1380.0, - "delta_ref_entropy_loss": 0.0518798828125, - "delta_ref_ppl": -0.04168701171875, - "entropy_loss": -0.0986328125, - "epoch": 0.53, - "grad_norm": 1.3857637863656547, - "k1_kl": 0.04156494140625, - "k3_kl": 0.027191162109375, - "kimi_kl": 0.069580078125, - "learning_rate": 2.3499999999999997e-07, + "advantages": -4.456724491319619e-05, + "completion_length": 959.0, + "delta_ref_entropy_loss": 0.06005859375, + "delta_ref_ppl": -0.050537109375, + "entropy_loss": -0.0390625, + "epoch": 0.265, + "grad_norm": 0.44640775720241305, + "k1_kl": 0.050537109375, + "k3_kl": 0.0252685546875, + "kimi_kl": 0.05810546875, + "learning_rate": 3.675e-07, "loss": 0.0011, - "ppl": 0.053192138671875, - "reward": 0.927556723356247, - "reward_std": 0.05799363669939339, - "rewards/perpo_ocr_edit_distance_reward": 0.9275568127632141, + "ppl": 0.0157470703125, + "reward": 0.9926354289054871, + "reward_std": 0.0010462289210408926, + "rewards/perpo_ocr_edit_distance_reward": 0.9926355481147766, "step": 1325, "temperature": 0.9 }, { - "advantages": -6.496906735264929e-06, - "completion_length": 771.5, - "delta_ref_entropy_loss": 0.0494384765625, - "delta_ref_ppl": -0.023651123046875, - "entropy_loss": -0.0682373046875, - "epoch": 0.5304, - "grad_norm": 0.5915490305191397, - "k1_kl": 0.023651123046875, - "k3_kl": 0.010562896728515625, - "kimi_kl": 0.01739501953125, - "learning_rate": 2.3479999999999998e-07, - "loss": 0.0004, - "ppl": 0.0341033935546875, - "reward": 0.7939989268779755, - "reward_std": 0.07145745342131704, - "rewards/perpo_ocr_edit_distance_reward": 0.7939989864826202, + "advantages": -1.3640948964166455e-05, + "completion_length": 748.0, + "delta_ref_entropy_loss": 0.0712890625, + "delta_ref_ppl": -0.06982421875, + "entropy_loss": -0.0264892578125, + "epoch": 0.2652, + "grad_norm": 0.4803565356716616, + "k1_kl": 0.06982421875, + "k3_kl": 0.036865234375, + "kimi_kl": 0.08935546875, + "learning_rate": 3.6739999999999997e-07, + "loss": 0.0015, + "ppl": 0.0106201171875, + "reward": 0.992530882358551, + "reward_std": 0.0005244827480055392, + "rewards/perpo_ocr_edit_distance_reward": 0.992530882358551, "step": 1326, "temperature": 0.9 }, { - "advantages": -3.611615738918772e-05, - "completion_length": 619.0, - "delta_ref_entropy_loss": 0.0841064453125, - "delta_ref_ppl": -0.045166015625, - "entropy_loss": -0.08203125, - "epoch": 0.5308, - "grad_norm": 0.9176691942610314, - "k1_kl": 0.045166015625, - "k3_kl": 0.0211944580078125, - "kimi_kl": 0.039520263671875, - "learning_rate": 2.346e-07, - "loss": 0.0009, - "ppl": 0.0408935546875, - "reward": 0.9754653871059418, - "reward_std": 0.0019482504285406321, - "rewards/perpo_ocr_edit_distance_reward": 0.9754654765129089, + "advantages": -0.00010090215073432773, + "completion_length": 464.0, + "delta_ref_entropy_loss": 0.07373046875, + "delta_ref_ppl": -0.058349609375, + "entropy_loss": -0.032470703125, + "epoch": 0.2654, + "grad_norm": 0.776072144807283, + "k1_kl": 0.05859375, + "k3_kl": 0.0299072265625, + "kimi_kl": 0.0654296875, + "learning_rate": 3.673e-07, + "loss": 0.0013, + "ppl": 0.01025390625, + "reward": 0.9987101554870605, + "reward_std": 0.00040620891377329826, + "rewards/perpo_ocr_edit_distance_reward": 0.9987101554870605, "step": 1327, "temperature": 0.9 }, { - "advantages": -2.9031721311412184e-05, - "completion_length": 956.0, - "delta_ref_entropy_loss": 0.020660400390625, - "delta_ref_ppl": -0.0224609375, - "entropy_loss": -0.02264404296875, - "epoch": 0.5312, - "grad_norm": 0.38860333166411637, - "k1_kl": 0.0225830078125, - "k3_kl": 0.01458740234375, - "kimi_kl": 0.03570556640625, - "learning_rate": 2.3439999999999998e-07, - "loss": 0.0006, - "ppl": 0.0103759765625, - "reward": 0.9867807328701019, - "reward_std": 0.030787796567892656, - "rewards/perpo_ocr_edit_distance_reward": 0.9867807924747467, + "advantages": -0.00012957198487129062, + "completion_length": 433.0, + "delta_ref_entropy_loss": 0.052490234375, + "delta_ref_ppl": -0.08642578125, + "entropy_loss": -0.024658203125, + "epoch": 0.2656, + "grad_norm": 0.5487389402455882, + "k1_kl": 0.0869140625, + "k3_kl": 0.05615234375, + "kimi_kl": 0.166015625, + "learning_rate": 3.672e-07, + "loss": 0.0024, + "ppl": 0.0089111328125, + "reward": 0.9969347715377808, + "reward_std": 0.0004914935561828315, + "rewards/perpo_ocr_edit_distance_reward": 0.9969348907470703, "step": 1328, "temperature": 0.9 }, { - "advantages": -0.00023617490660399199, - "completion_length": 632.0, - "delta_ref_entropy_loss": 0.029052734375, - "delta_ref_ppl": -0.02557373046875, - "entropy_loss": -0.02435302734375, - "epoch": 0.5316, - "grad_norm": 0.23805100683101948, - "k1_kl": 0.02545166015625, - "k3_kl": 0.016998291015625, - "kimi_kl": 0.0738525390625, - "learning_rate": 2.342e-07, - "loss": 0.0009, - "ppl": 0.010498046875, - "reward": 0.9887761771678925, - "reward_std": 0.0003404467352083884, - "rewards/perpo_ocr_edit_distance_reward": 0.9887763261795044, + "advantages": -1.5292849639081396e-05, + "completion_length": 627.0, + "delta_ref_entropy_loss": 0.1357421875, + "delta_ref_ppl": -0.103515625, + "entropy_loss": -0.1513671875, + "epoch": 0.2658, + "grad_norm": 1.8361770681337701, + "k1_kl": 0.103515625, + "k3_kl": 0.059326171875, + "kimi_kl": 0.171875, + "learning_rate": 3.6709999999999995e-07, + "loss": 0.0024, + "ppl": 0.080078125, + "reward": 0.8250809907913208, + "reward_std": 0.0043574003502726555, + "rewards/perpo_ocr_edit_distance_reward": 0.8250810503959656, "step": 1329, "temperature": 0.9 }, { - "advantages": -4.2263951399945654e-05, - "completion_length": 876.5, - "delta_ref_entropy_loss": 0.07318115234375, - "delta_ref_ppl": -0.04052734375, - "entropy_loss": -0.0963134765625, - "epoch": 0.532, - "grad_norm": 1.0991810137449394, - "k1_kl": 0.0404052734375, - "k3_kl": 0.022735595703125, - "kimi_kl": 0.04071044921875, - "learning_rate": 2.34e-07, - "loss": 0.001, - "ppl": 0.0538330078125, - "reward": 0.9088208675384521, - "reward_std": 0.0017854587058536708, - "rewards/perpo_ocr_edit_distance_reward": 0.9088209569454193, + "advantages": -4.257474756741431e-06, + "completion_length": 755.0, + "delta_ref_entropy_loss": 0.10888671875, + "delta_ref_ppl": -0.095703125, + "entropy_loss": -0.1591796875, + "epoch": 0.266, + "grad_norm": 2.8783936226247233, + "k1_kl": 0.095703125, + "k3_kl": 0.0615234375, + "kimi_kl": 0.1923828125, + "learning_rate": 3.67e-07, + "loss": 0.0025, + "ppl": 0.0888671875, + "reward": 0.506310760974884, + "reward_std": 0.011831066571176052, + "rewards/perpo_ocr_edit_distance_reward": 0.5063108205795288, "step": 1330, "temperature": 0.9 }, { - "advantages": -2.2607191567658447e-06, - "completion_length": 455.0, - "delta_ref_entropy_loss": 0.063446044921875, - "delta_ref_ppl": -0.03363037109375, - "entropy_loss": -0.0706024169921875, - "epoch": 0.5324, - "grad_norm": 0.813610188324068, - "k1_kl": 0.03375244140625, - "k3_kl": 0.01617431640625, - "kimi_kl": 0.0384979248046875, - "learning_rate": 2.338e-07, - "loss": 0.0006, - "ppl": 0.0393218994140625, - "reward": 0.7053743004798889, - "reward_std": 0.002779511734843254, - "rewards/perpo_ocr_edit_distance_reward": 0.7053743153810501, + "advantages": -0.00010638577805366367, + "completion_length": 348.0, + "delta_ref_entropy_loss": 0.0771484375, + "delta_ref_ppl": -0.08203125, + "entropy_loss": -0.017333984375, + "epoch": 0.2662, + "grad_norm": 0.7520981541260979, + "k1_kl": 0.08203125, + "k3_kl": 0.052978515625, + "kimi_kl": 0.1640625, + "learning_rate": 3.669e-07, + "loss": 0.0022, + "ppl": 0.00604248046875, + "reward": 0.9942857027053833, + "reward_std": 0.0003002631710842252, + "rewards/perpo_ocr_edit_distance_reward": 0.9942857623100281, "step": 1331, "temperature": 0.9 }, { - "advantages": -6.0502972701215185e-05, - "completion_length": 611.5, - "delta_ref_entropy_loss": 0.0333251953125, - "delta_ref_ppl": -0.023956298828125, - "entropy_loss": -0.02520751953125, - "epoch": 0.5328, - "grad_norm": 0.8426569145241974, - "k1_kl": 0.023834228515625, - "k3_kl": 0.013427734375, - "kimi_kl": 0.032501220703125, - "learning_rate": 2.336e-07, - "loss": 0.0006, - "ppl": 0.0113525390625, - "reward": 0.9752473831176758, - "reward_std": 0.0017237466527149081, - "rewards/perpo_ocr_edit_distance_reward": 0.9752475023269653, + "advantages": 0.0, + "completion_length": 555.0, + "delta_ref_entropy_loss": 0.031005859375, + "delta_ref_ppl": -0.04638671875, + "entropy_loss": -0.01422119140625, + "epoch": 0.2664, + "grad_norm": 0.006923278309846952, + "k1_kl": 0.046142578125, + "k3_kl": 0.03125, + "kimi_kl": 0.0830078125, + "learning_rate": 3.668e-07, + "loss": 0.0013, + "ppl": 0.0030517578125, + "reward": 0.9919354915618896, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9919354915618896, "step": 1332, "temperature": 0.9 }, { - "advantages": -1.055853772413684e-06, - "completion_length": 409.5, - "delta_ref_entropy_loss": 0.05194091796875, - "delta_ref_ppl": -0.03924560546875, - "entropy_loss": -0.032073974609375, - "epoch": 0.5332, - "grad_norm": 0.7219172408101749, - "k1_kl": 0.03924560546875, - "k3_kl": 0.026641845703125, - "kimi_kl": 0.070068359375, - "learning_rate": 2.3339999999999999e-07, - "loss": 0.0011, - "ppl": 0.0129852294921875, - "reward": 0.9770163297653198, - "reward_std": 0.007933475077152252, - "rewards/perpo_ocr_edit_distance_reward": 0.9770163595676422, + "advantages": -1.1431319762778003e-05, + "completion_length": 920.0, + "delta_ref_entropy_loss": 0.051513671875, + "delta_ref_ppl": -0.054931640625, + "entropy_loss": -0.03173828125, + "epoch": 0.2666, + "grad_norm": 4.736068114616486, + "k1_kl": 0.05517578125, + "k3_kl": 0.030517578125, + "kimi_kl": 0.0869140625, + "learning_rate": 3.667e-07, + "loss": 0.0012, + "ppl": 0.0118408203125, + "reward": 0.9983241558074951, + "reward_std": 0.0006447896594181657, + "rewards/perpo_ocr_edit_distance_reward": 0.9983241558074951, "step": 1333, "temperature": 0.9 }, { - "advantages": -1.519386273685086e-05, - "completion_length": 1483.0, - "delta_ref_entropy_loss": 0.029022216796875, - "delta_ref_ppl": -0.0238494873046875, - "entropy_loss": -0.0311279296875, - "epoch": 0.5336, - "grad_norm": 7.248911426945179, - "k1_kl": 0.02394866943359375, - "k3_kl": 0.028564453125, - "kimi_kl": 0.04457855224609375, - "learning_rate": 2.3319999999999997e-07, - "loss": 0.0012, - "ppl": 0.0155029296875, - "reward": 0.9599143862724304, - "reward_std": 0.003293334331829101, - "rewards/perpo_ocr_edit_distance_reward": 0.9599144458770752, + "advantages": 4.385199190437561e-06, + "completion_length": 1538.0, + "delta_ref_entropy_loss": 0.06396484375, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.080078125, + "epoch": 0.2668, + "grad_norm": 1.9873781991460138, + "k1_kl": 0.06591796875, + "k3_kl": 0.0556640625, + "kimi_kl": 0.09326171875, + "learning_rate": 3.6659999999999996e-07, + "loss": 0.0022, + "ppl": 0.043701171875, + "reward": 0.9633291959762573, + "reward_std": 0.003788991831243038, + "rewards/perpo_ocr_edit_distance_reward": 0.9633291959762573, "step": 1334, "temperature": 0.9 }, { - "advantages": -2.7469227916299133e-05, - "completion_length": 737.0, - "delta_ref_entropy_loss": 0.03594970703125, - "delta_ref_ppl": -0.020263671875, - "entropy_loss": -0.04412841796875, - "epoch": 0.534, - "grad_norm": 0.8806347445512327, - "k1_kl": 0.020263671875, - "k3_kl": 0.00885009765625, - "kimi_kl": 0.0145263671875, - "learning_rate": 2.33e-07, - "loss": 0.0004, - "ppl": 0.021881103515625, - "reward": 0.9934839904308319, - "reward_std": 0.0025590640143491328, - "rewards/perpo_ocr_edit_distance_reward": 0.9934840500354767, + "advantages": -9.093966582440771e-06, + "completion_length": 868.0, + "delta_ref_entropy_loss": 0.068359375, + "delta_ref_ppl": -0.050048828125, + "entropy_loss": -0.03125, + "epoch": 0.267, + "grad_norm": 0.8591007972541711, + "k1_kl": 0.050048828125, + "k3_kl": 0.028564453125, + "kimi_kl": 0.06689453125, + "learning_rate": 3.6649999999999995e-07, + "loss": 0.0012, + "ppl": 0.01287841796875, + "reward": 0.9746537804603577, + "reward_std": 0.0027076604310423136, + "rewards/perpo_ocr_edit_distance_reward": 0.9746538400650024, "step": 1335, "temperature": 0.9 }, { - "advantages": -7.327965613512788e-05, - "completion_length": 870.5, - "delta_ref_entropy_loss": 0.04962158203125, - "delta_ref_ppl": -0.0330810546875, - "entropy_loss": -0.08319091796875, - "epoch": 0.5344, - "grad_norm": 0.7865618181368773, - "k1_kl": 0.0330810546875, - "k3_kl": 0.02099609375, - "kimi_kl": 0.033203125, - "learning_rate": 2.328e-07, - "loss": 0.0009, - "ppl": 0.04400634765625, - "reward": 0.9693260788917542, - "reward_std": 0.0011844084947369993, - "rewards/perpo_ocr_edit_distance_reward": 0.9693261384963989, + "advantages": -6.54458999633789e-05, + "completion_length": 619.0, + "delta_ref_entropy_loss": 0.1240234375, + "delta_ref_ppl": -0.0888671875, + "entropy_loss": -0.038330078125, + "epoch": 0.2672, + "grad_norm": 0.7740059169562452, + "k1_kl": 0.08837890625, + "k3_kl": 0.044189453125, + "kimi_kl": 0.1279296875, + "learning_rate": 3.664e-07, + "loss": 0.0018, + "ppl": 0.0157470703125, + "reward": 0.9726661443710327, + "reward_std": 0.0009408171172253788, + "rewards/perpo_ocr_edit_distance_reward": 0.9726662635803223, "step": 1336, "temperature": 0.9 }, { - "advantages": -7.663454653084045e-07, - "completion_length": 647.5, - "delta_ref_entropy_loss": 0.075927734375, - "delta_ref_ppl": -0.052734375, - "entropy_loss": -0.105224609375, - "epoch": 0.5348, - "grad_norm": 1.147374876528536, - "k1_kl": 0.052734375, - "k3_kl": 0.0299072265625, - "kimi_kl": 0.0966796875, - "learning_rate": 2.3259999999999998e-07, - "loss": 0.0012, - "ppl": 0.0589599609375, - "reward": 0.8265974521636963, - "reward_std": 0.0048390887677669525, - "rewards/perpo_ocr_edit_distance_reward": 0.8265975117683411, + "advantages": -7.89506157161668e-05, + "completion_length": 702.0, + "delta_ref_entropy_loss": 0.04248046875, + "delta_ref_ppl": -0.06640625, + "entropy_loss": -0.0191650390625, + "epoch": 0.2674, + "grad_norm": 0.5229417029597103, + "k1_kl": 0.06640625, + "k3_kl": 0.041015625, + "kimi_kl": 0.1181640625, + "learning_rate": 3.663e-07, + "loss": 0.0017, + "ppl": 0.0101318359375, + "reward": 0.9763146042823792, + "reward_std": 0.0005469808238558471, + "rewards/perpo_ocr_edit_distance_reward": 0.9763146638870239, "step": 1337, "temperature": 0.9 }, { - "advantages": -9.724072697281372e-06, - "completion_length": 980.0, - "delta_ref_entropy_loss": 0.02618408203125, - "delta_ref_ppl": -0.0496826171875, - "entropy_loss": -0.0677490234375, - "epoch": 0.5352, - "grad_norm": 1374.1814686857697, - "k1_kl": 0.0499267578125, - "k3_kl": 1.6104736328125, - "kimi_kl": 0.126708984375, - "learning_rate": 2.324e-07, - "loss": 0.0646, - "ppl": 0.03790283203125, - "reward": 0.9863676726818085, - "reward_std": 0.0021357759833335876, - "rewards/perpo_ocr_edit_distance_reward": 0.9863677322864532, + "advantages": 0.0, + "completion_length": 485.0, + "delta_ref_entropy_loss": 0.1767578125, + "delta_ref_ppl": -0.1474609375, + "entropy_loss": -0.2080078125, + "epoch": 0.2676, + "grad_norm": 4.293188110496839, + "k1_kl": 0.1474609375, + "k3_kl": 0.08740234375, + "kimi_kl": 0.2099609375, + "learning_rate": 3.662e-07, + "loss": 0.0035, + "ppl": 0.10205078125, + "reward": 0.5118386149406433, + "reward_std": 0.24856729805469513, + "rewards/perpo_ocr_edit_distance_reward": 0.5118386745452881, "step": 1338, "temperature": 0.9 }, { - "advantages": -1.3198171586736862e-06, - "completion_length": 403.0, - "delta_ref_entropy_loss": 0.0582275390625, - "delta_ref_ppl": -0.11181640625, - "entropy_loss": -0.083984375, - "epoch": 0.5356, - "grad_norm": 3.1942947756639226, - "k1_kl": 0.1123046875, - "k3_kl": 0.08062744140625, - "kimi_kl": 0.3072509765625, - "learning_rate": 2.3219999999999997e-07, - "loss": 0.0032, - "ppl": 0.0438232421875, - "reward": 0.8562654256820679, - "reward_std": 0.026157676242291927, - "rewards/perpo_ocr_edit_distance_reward": 0.8562654554843903, + "advantages": -0.0001378953456878662, + "completion_length": 763.0, + "delta_ref_entropy_loss": 0.055908203125, + "delta_ref_ppl": -0.07421875, + "entropy_loss": -0.03271484375, + "epoch": 0.2678, + "grad_norm": 1.0611724651858645, + "k1_kl": 0.07421875, + "k3_kl": 0.0478515625, + "kimi_kl": 0.14453125, + "learning_rate": 3.661e-07, + "loss": 0.002, + "ppl": 0.0166015625, + "reward": 0.996981143951416, + "reward_std": 0.00045573231182061136, + "rewards/perpo_ocr_edit_distance_reward": 0.9969812631607056, "step": 1339, "temperature": 0.9 }, { - "advantages": -0.00030518429639414535, - "completion_length": 674.0, - "delta_ref_entropy_loss": 0.028564453125, - "delta_ref_ppl": -0.0186767578125, - "entropy_loss": -0.018096923828125, - "epoch": 0.536, - "grad_norm": 0.18250952858979408, - "k1_kl": 0.0186767578125, - "k3_kl": 0.009490966796875, - "kimi_kl": 0.02471923828125, - "learning_rate": 2.32e-07, - "loss": 0.0007, - "ppl": 0.00714874267578125, - "reward": 0.9989918172359467, - "reward_std": 0.0002471828192938119, - "rewards/perpo_ocr_edit_distance_reward": 0.9989918768405914, + "advantages": -1.5267305570887402e-05, + "completion_length": 848.0, + "delta_ref_entropy_loss": 0.1162109375, + "delta_ref_ppl": -0.08935546875, + "entropy_loss": -0.126953125, + "epoch": 0.268, + "grad_norm": 1.5823076007212538, + "k1_kl": 0.08935546875, + "k3_kl": 0.051025390625, + "kimi_kl": 0.1142578125, + "learning_rate": 3.6599999999999997e-07, + "loss": 0.002, + "ppl": 0.0693359375, + "reward": 0.8404036164283752, + "reward_std": 0.003804859472438693, + "rewards/perpo_ocr_edit_distance_reward": 0.8404037356376648, "step": 1340, "temperature": 0.9 }, { - "advantages": -5.266496191325132e-05, - "completion_length": 497.0, - "delta_ref_entropy_loss": 0.02618408203125, - "delta_ref_ppl": -0.02349853515625, - "entropy_loss": -0.02001953125, - "epoch": 0.5364, - "grad_norm": 0.6687246870654716, - "k1_kl": 0.0235595703125, - "k3_kl": 0.014312744140625, - "kimi_kl": 0.03424072265625, - "learning_rate": 2.318e-07, - "loss": 0.0006, - "ppl": 0.011260986328125, - "reward": 0.9962820708751678, - "reward_std": 0.0005994633975205943, - "rewards/perpo_ocr_edit_distance_reward": 0.996282160282135, + "advantages": -7.770743104629219e-05, + "completion_length": 794.0, + "delta_ref_entropy_loss": 0.051513671875, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.02685546875, + "epoch": 0.2682, + "grad_norm": 0.6373117323601315, + "k1_kl": 0.06689453125, + "k3_kl": 0.04345703125, + "kimi_kl": 0.1396484375, + "learning_rate": 3.6589999999999996e-07, + "loss": 0.0018, + "ppl": 0.01287841796875, + "reward": 0.9978376030921936, + "reward_std": 0.0008861892274580896, + "rewards/perpo_ocr_edit_distance_reward": 0.9978376626968384, "step": 1341, "temperature": 0.9 }, { - "advantages": -7.485917785743368e-05, - "completion_length": 575.5, - "delta_ref_entropy_loss": 0.022125244140625, - "delta_ref_ppl": -0.09576416015625, - "entropy_loss": -0.03564453125, - "epoch": 0.5368, - "grad_norm": 0.9181278702763438, - "k1_kl": 0.09576416015625, - "k3_kl": 0.07928466796875, - "kimi_kl": 0.4775390625, - "learning_rate": 2.3159999999999998e-07, - "loss": 0.0032, - "ppl": 0.016357421875, - "reward": 0.9728271961212158, - "reward_std": 0.0017513963975943625, - "rewards/perpo_ocr_edit_distance_reward": 0.972827285528183, + "advantages": -6.323201523628086e-05, + "completion_length": 388.0, + "delta_ref_entropy_loss": 0.061767578125, + "delta_ref_ppl": -0.08056640625, + "entropy_loss": -0.016845703125, + "epoch": 0.2684, + "grad_norm": 0.5118746115898303, + "k1_kl": 0.08056640625, + "k3_kl": 0.050048828125, + "kimi_kl": 0.1494140625, + "learning_rate": 3.658e-07, + "loss": 0.0021, + "ppl": 0.005096435546875, + "reward": 0.9974518418312073, + "reward_std": 0.0008427437278442085, + "rewards/perpo_ocr_edit_distance_reward": 0.997451901435852, "step": 1342, "temperature": 0.9 }, { - "advantages": -5.489587874762947e-05, - "completion_length": 493.5, - "delta_ref_entropy_loss": 0.0447998046875, - "delta_ref_ppl": -0.0372314453125, - "entropy_loss": -0.03863525390625, - "epoch": 0.5372, - "grad_norm": 0.6737640780595742, - "k1_kl": 0.03717041015625, - "k3_kl": 0.020751953125, - "kimi_kl": 0.05419921875, - "learning_rate": 2.314e-07, - "loss": 0.0009, - "ppl": 0.019287109375, - "reward": 0.9862757325172424, - "reward_std": 0.0018927358905784786, - "rewards/perpo_ocr_edit_distance_reward": 0.986275851726532, + "advantages": -9.86031136562815e-06, + "completion_length": 33.0, + "delta_ref_entropy_loss": 0.1708984375, + "delta_ref_ppl": -0.58203125, + "entropy_loss": -0.04638671875, + "epoch": 0.2686, + "grad_norm": 6.0802377556455856, + "k1_kl": 0.58203125, + "k3_kl": 0.515625, + "kimi_kl": 2.203125, + "learning_rate": 3.657e-07, + "loss": 0.0206, + "ppl": 0.02001953125, + "reward": 0.9662542343139648, + "reward_std": 0.00595219386741519, + "rewards/perpo_ocr_edit_distance_reward": 0.9662542939186096, "step": 1343, "temperature": 0.9 }, { - "advantages": -4.178285807654447e-05, - "completion_length": 615.0, - "delta_ref_entropy_loss": 0.0572509765625, - "delta_ref_ppl": -0.0491943359375, - "entropy_loss": -0.06781005859375, - "epoch": 0.5376, - "grad_norm": 0.8182861454001445, - "k1_kl": 0.0491943359375, - "k3_kl": 0.0299072265625, - "kimi_kl": 0.086181640625, - "learning_rate": 2.3119999999999998e-07, - "loss": 0.0012, - "ppl": 0.03790283203125, - "reward": 0.969284176826477, - "reward_std": 0.011001066595781595, - "rewards/perpo_ocr_edit_distance_reward": 0.9692842662334442, + "advantages": 0.0, + "completion_length": 330.0, + "delta_ref_entropy_loss": 0.0654296875, + "delta_ref_ppl": -0.08544921875, + "entropy_loss": -0.0263671875, + "epoch": 0.2688, + "grad_norm": 0.9625632750090506, + "k1_kl": 0.0859375, + "k3_kl": 0.053955078125, + "kimi_kl": 0.1396484375, + "learning_rate": 3.6559999999999994e-07, + "loss": 0.0022, + "ppl": 0.0103759765625, + "reward": 0.9960429668426514, + "reward_std": 0.0004615577054210007, + "rewards/perpo_ocr_edit_distance_reward": 0.9960429668426514, "step": 1344, "temperature": 0.9 }, { - "advantages": -1.788139479685924e-05, - "completion_length": 749.0, - "delta_ref_entropy_loss": 0.02569580078125, - "delta_ref_ppl": -0.02642822265625, - "entropy_loss": -0.022705078125, - "epoch": 0.538, - "grad_norm": 0.44375310957329606, - "k1_kl": 0.02630615234375, - "k3_kl": 0.019134521484375, - "kimi_kl": 0.060302734375, - "learning_rate": 2.31e-07, - "loss": 0.0008, - "ppl": 0.009185791015625, - "reward": 0.9897143244743347, - "reward_std": 0.008608338859630749, - "rewards/perpo_ocr_edit_distance_reward": 0.9897143840789795, + "advantages": -1.2874604180979077e-05, + "completion_length": 262.0, + "delta_ref_entropy_loss": 0.201171875, + "delta_ref_ppl": -0.205078125, + "entropy_loss": -0.234375, + "epoch": 0.269, + "grad_norm": 2.3752661898287157, + "k1_kl": 0.205078125, + "k3_kl": 0.12158203125, + "kimi_kl": 0.31640625, + "learning_rate": 3.655e-07, + "loss": 0.0049, + "ppl": 0.1259765625, + "reward": 0.9270516633987427, + "reward_std": 0.003862694837152958, + "rewards/perpo_ocr_edit_distance_reward": 0.9270517230033875, "step": 1345, "temperature": 0.9 }, { - "advantages": -6.190368503666832e-05, - "completion_length": 518.5, - "delta_ref_entropy_loss": 0.02203369140625, - "delta_ref_ppl": -0.023223876953125, - "entropy_loss": -0.02215576171875, - "epoch": 0.5384, - "grad_norm": 0.4466475263389591, - "k1_kl": 0.0230712890625, - "k3_kl": 0.017333984375, - "kimi_kl": 0.0765380859375, - "learning_rate": 2.308e-07, - "loss": 0.0008, - "ppl": 0.0116119384765625, - "reward": 0.9979471862316132, - "reward_std": 0.001132171746576205, - "rewards/perpo_ocr_edit_distance_reward": 0.9979472458362579, + "advantages": 1.6178404393940582e-06, + "completion_length": 423.0, + "delta_ref_entropy_loss": 0.12890625, + "delta_ref_ppl": -0.1435546875, + "entropy_loss": -0.20703125, + "epoch": 0.2692, + "grad_norm": 1.741175918300385, + "k1_kl": 0.1435546875, + "k3_kl": 0.07861328125, + "kimi_kl": 0.232421875, + "learning_rate": 3.654e-07, + "loss": 0.0031, + "ppl": 0.1279296875, + "reward": 0.43467774987220764, + "reward_std": 0.0025519661139696836, + "rewards/perpo_ocr_edit_distance_reward": 0.43467774987220764, "step": 1346, "temperature": 0.9 }, { - "advantages": -2.1713121611810493e-07, - "completion_length": 975.0, - "delta_ref_entropy_loss": 0.047119140625, - "delta_ref_ppl": -0.0400390625, - "entropy_loss": -0.0616455078125, - "epoch": 0.5388, - "grad_norm": 0.6405603123096373, - "k1_kl": 0.03997802734375, - "k3_kl": 0.021728515625, - "kimi_kl": 0.0576171875, - "learning_rate": 2.306e-07, - "loss": 0.0009, - "ppl": 0.0340576171875, - "reward": 0.9023622572422028, - "reward_std": 0.0521836734842509, - "rewards/perpo_ocr_edit_distance_reward": 0.9023623168468475, + "advantages": 5.59602485736832e-05, + "completion_length": 896.0, + "delta_ref_entropy_loss": 0.0634765625, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.02197265625, + "epoch": 0.2694, + "grad_norm": 0.40733663189189295, + "k1_kl": 0.06591796875, + "k3_kl": 0.040283203125, + "kimi_kl": 0.13671875, + "learning_rate": 3.653e-07, + "loss": 0.0016, + "ppl": 0.0098876953125, + "reward": 0.9912412166595459, + "reward_std": 0.0002043283311650157, + "rewards/perpo_ocr_edit_distance_reward": 0.9912412166595459, "step": 1347, "temperature": 0.9 }, { - "advantages": 2.022726221184712e-05, - "completion_length": 621.0, - "delta_ref_entropy_loss": 0.0263824462890625, - "delta_ref_ppl": -0.0293121337890625, - "entropy_loss": -0.012115478515625, - "epoch": 0.5392, - "grad_norm": 0.8182840121190416, - "k1_kl": 0.0291900634765625, - "k3_kl": 0.019805908203125, - "kimi_kl": 0.063232421875, - "learning_rate": 2.3039999999999997e-07, - "loss": 0.0008, - "ppl": 0.0054931640625, - "reward": 0.9923420548439026, - "reward_std": 0.0018351400649407879, - "rewards/perpo_ocr_edit_distance_reward": 0.9923420548439026, + "advantages": -2.0980836779926904e-05, + "completion_length": 568.0, + "delta_ref_entropy_loss": 0.1484375, + "delta_ref_ppl": -0.11279296875, + "entropy_loss": -0.1259765625, + "epoch": 0.2696, + "grad_norm": 1.5167533554739472, + "k1_kl": 0.11328125, + "k3_kl": 0.062255859375, + "kimi_kl": 0.1865234375, + "learning_rate": 3.652e-07, + "loss": 0.0025, + "ppl": 0.061279296875, + "reward": 0.8095988631248474, + "reward_std": 0.003146218368783593, + "rewards/perpo_ocr_edit_distance_reward": 0.809598982334137, "step": 1348, "temperature": 0.9 }, { - "advantages": -0.00016179680824279785, - "completion_length": 402.5, - "delta_ref_entropy_loss": 0.03826904296875, - "delta_ref_ppl": -0.0364990234375, - "entropy_loss": -0.0230712890625, - "epoch": 0.5396, - "grad_norm": 0.16871564352846424, - "k1_kl": 0.0364990234375, - "k3_kl": 0.0235595703125, - "kimi_kl": 0.0823974609375, - "learning_rate": 2.3019999999999998e-07, - "loss": 0.0011, - "ppl": 0.009613037109375, - "reward": 0.9967746138572693, - "reward_std": 9.472556121181697e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9967747032642365, + "advantages": -4.407337837619707e-05, + "completion_length": 308.0, + "delta_ref_entropy_loss": 0.09375, + "delta_ref_ppl": -0.115234375, + "entropy_loss": -0.06298828125, + "epoch": 0.2698, + "grad_norm": 1.4810048385063388, + "k1_kl": 0.115234375, + "k3_kl": 0.06787109375, + "kimi_kl": 0.158203125, + "learning_rate": 3.6509999999999995e-07, + "loss": 0.0028, + "ppl": 0.0291748046875, + "reward": 0.9689082503318787, + "reward_std": 0.0022179502993822098, + "rewards/perpo_ocr_edit_distance_reward": 0.9689083695411682, "step": 1349, "temperature": 0.9 }, { - "advantages": -0.00010042531357612461, - "completion_length": 927.0, - "delta_ref_entropy_loss": 0.026611328125, - "delta_ref_ppl": -0.01934814453125, - "entropy_loss": -0.044189453125, - "epoch": 0.54, - "grad_norm": 5.509376294138333, - "k1_kl": 0.019378662109375, - "k3_kl": 0.0115966796875, - "kimi_kl": 0.0223388671875, - "learning_rate": 2.3e-07, - "loss": 0.0006, - "ppl": 0.02203369140625, - "reward": 0.9980128109455109, - "reward_std": 0.0004505194374360144, - "rewards/perpo_ocr_edit_distance_reward": 0.998012900352478, + "advantages": -1.5488692952203564e-05, + "completion_length": 360.0, + "delta_ref_entropy_loss": 0.1279296875, + "delta_ref_ppl": -0.12890625, + "entropy_loss": -0.1376953125, + "epoch": 0.27, + "grad_norm": 2.00057455575529, + "k1_kl": 0.12890625, + "k3_kl": 0.078125, + "kimi_kl": 0.2138671875, + "learning_rate": 3.65e-07, + "loss": 0.0031, + "ppl": 0.07470703125, + "reward": 0.7947794795036316, + "reward_std": 0.0037473731208592653, + "rewards/perpo_ocr_edit_distance_reward": 0.7947795391082764, "step": 1350, "temperature": 0.9 }, { - "advantages": -4.9901862439583056e-05, - "completion_length": 481.5, - "delta_ref_entropy_loss": 0.0313720703125, - "delta_ref_ppl": -0.05712890625, - "entropy_loss": -0.025909423828125, - "epoch": 0.5404, - "grad_norm": 0.7733472853452008, - "k1_kl": 0.057373046875, - "k3_kl": 0.04156494140625, - "kimi_kl": 0.163330078125, - "learning_rate": 2.298e-07, - "loss": 0.0017, - "ppl": 0.01412200927734375, - "reward": 0.960974782705307, - "reward_std": 0.0015413229193654843, - "rewards/perpo_ocr_edit_distance_reward": 0.9609748125076294, + "advantages": -4.592112236423418e-05, + "completion_length": 438.0, + "delta_ref_entropy_loss": 0.08740234375, + "delta_ref_ppl": -0.0693359375, + "entropy_loss": -0.031494140625, + "epoch": 0.2702, + "grad_norm": 0.90974851908715, + "k1_kl": 0.0693359375, + "k3_kl": 0.039306640625, + "kimi_kl": 0.11083984375, + "learning_rate": 3.649e-07, + "loss": 0.0016, + "ppl": 0.01220703125, + "reward": 0.9916321039199829, + "reward_std": 0.0010121791856363416, + "rewards/perpo_ocr_edit_distance_reward": 0.9916322231292725, "step": 1351, "temperature": 0.9 }, { - "advantages": -7.288796950888354e-06, - "completion_length": 611.0, - "delta_ref_entropy_loss": 0.0811767578125, - "delta_ref_ppl": -0.06829833984375, - "entropy_loss": -0.080810546875, - "epoch": 0.5408, - "grad_norm": 1.4664229032776408, - "k1_kl": 0.0682373046875, - "k3_kl": 0.04638671875, - "kimi_kl": 0.1988525390625, - "learning_rate": 2.296e-07, - "loss": 0.0019, - "ppl": 0.04705810546875, - "reward": 0.9754533171653748, - "reward_std": 0.004460747761186212, - "rewards/perpo_ocr_edit_distance_reward": 0.9754534065723419, + "advantages": 8.191381311917212e-06, + "completion_length": 676.0, + "delta_ref_entropy_loss": 0.0654296875, + "delta_ref_ppl": -0.06494140625, + "entropy_loss": -0.06396484375, + "epoch": 0.2704, + "grad_norm": 1.0472092169119585, + "k1_kl": 0.06494140625, + "k3_kl": 0.035400390625, + "kimi_kl": 0.07275390625, + "learning_rate": 3.648e-07, + "loss": 0.0014, + "ppl": 0.03271484375, + "reward": 0.9039047956466675, + "reward_std": 0.0019760574214160442, + "rewards/perpo_ocr_edit_distance_reward": 0.9039047956466675, "step": 1352, "temperature": 0.9 }, { - "advantages": 9.025846168242424e-07, - "completion_length": 1130.0, - "delta_ref_entropy_loss": 0.072265625, - "delta_ref_ppl": -0.0625, - "entropy_loss": -0.11669921875, - "epoch": 0.5412, - "grad_norm": 0.7396887430515756, - "k1_kl": 0.0626220703125, - "k3_kl": 0.041748046875, - "kimi_kl": 0.1392822265625, - "learning_rate": 2.2939999999999998e-07, - "loss": 0.0017, - "ppl": 0.06085205078125, - "reward": 0.946205347776413, - "reward_std": 0.05582535173743963, - "rewards/perpo_ocr_edit_distance_reward": 0.9462053775787354, + "advantages": -3.283790283603594e-05, + "completion_length": 494.0, + "delta_ref_entropy_loss": 0.0751953125, + "delta_ref_ppl": -0.07763671875, + "entropy_loss": -0.0130615234375, + "epoch": 0.2706, + "grad_norm": 0.8072952187288567, + "k1_kl": 0.07763671875, + "k3_kl": 0.046142578125, + "kimi_kl": 0.1318359375, + "learning_rate": 3.6470000000000003e-07, + "loss": 0.0019, + "ppl": 0.0037841796875, + "reward": 0.9988960027694702, + "reward_std": 0.0004190071776974946, + "rewards/perpo_ocr_edit_distance_reward": 0.9988959431648254, "step": 1353, "temperature": 0.9 }, { - "advantages": -0.00011443240873632021, - "completion_length": 782.5, - "delta_ref_entropy_loss": 0.02197265625, - "delta_ref_ppl": -0.0247802734375, - "entropy_loss": -0.019866943359375, - "epoch": 0.5416, - "grad_norm": 0.4170294176933375, - "k1_kl": 0.02490234375, - "k3_kl": 0.0172882080078125, - "kimi_kl": 0.07623291015625, - "learning_rate": 2.292e-07, - "loss": 0.0008, - "ppl": 0.00860595703125, - "reward": 0.9414070844650269, - "reward_std": 0.0006258785142563283, - "rewards/perpo_ocr_edit_distance_reward": 0.941407173871994, + "advantages": -1.9618444639490917e-05, + "completion_length": 521.0, + "delta_ref_entropy_loss": 0.05712890625, + "delta_ref_ppl": -0.07373046875, + "entropy_loss": -0.0322265625, + "epoch": 0.2708, + "grad_norm": 0.6607179815808611, + "k1_kl": 0.07421875, + "k3_kl": 0.046875, + "kimi_kl": 0.1337890625, + "learning_rate": 3.6459999999999997e-07, + "loss": 0.0019, + "ppl": 0.0159912109375, + "reward": 0.9966809749603271, + "reward_std": 0.00033399453968741, + "rewards/perpo_ocr_edit_distance_reward": 0.9966809749603271, "step": 1354, "temperature": 0.9 }, { - "advantages": -3.4264157875441015e-05, - "completion_length": 846.0, - "delta_ref_entropy_loss": 0.0423583984375, - "delta_ref_ppl": -0.03204345703125, - "entropy_loss": -0.0582275390625, - "epoch": 0.542, - "grad_norm": 0.6232227670830048, - "k1_kl": 0.03216552734375, - "k3_kl": 0.01806640625, - "kimi_kl": 0.0380859375, - "learning_rate": 2.29e-07, - "loss": 0.0008, - "ppl": 0.03009033203125, - "reward": 0.971232682466507, - "reward_std": 0.001947554701473564, - "rewards/perpo_ocr_edit_distance_reward": 0.9712328016757965, + "advantages": 4.257474728319721e-08, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.020751953125, + "delta_ref_ppl": -0.1611328125, + "entropy_loss": -0.30078125, + "epoch": 0.271, + "grad_norm": 4.706514148278303, + "k1_kl": 0.1611328125, + "k3_kl": 0.1318359375, + "kimi_kl": 0.400390625, + "learning_rate": 3.6449999999999996e-07, + "loss": 0.0053, + "ppl": 0.1240234375, + "reward": 0.2904305160045624, + "reward_std": 0.19250628352165222, + "rewards/perpo_ocr_edit_distance_reward": 0.2904305160045624, "step": 1355, "temperature": 0.9 }, { - "advantages": -2.729041398197296e-06, - "completion_length": 389.0, - "delta_ref_entropy_loss": 0.0352783203125, - "delta_ref_ppl": -0.0487060546875, - "entropy_loss": -0.02337646484375, - "epoch": 0.5424, - "grad_norm": 0.5126754553963606, - "k1_kl": 0.048828125, - "k3_kl": 0.034912109375, - "kimi_kl": 0.163818359375, - "learning_rate": 2.2879999999999998e-07, - "loss": 0.0014, - "ppl": 0.00860595703125, - "reward": 0.9529979825019836, - "reward_std": 0.005431133322417736, - "rewards/perpo_ocr_edit_distance_reward": 0.9529980719089508, + "advantages": -2.4978604415082373e-05, + "completion_length": 423.0, + "delta_ref_entropy_loss": 0.1572265625, + "delta_ref_ppl": -0.154296875, + "entropy_loss": -0.11328125, + "epoch": 0.2712, + "grad_norm": 1.4290195839947293, + "k1_kl": 0.1552734375, + "k3_kl": 0.08837890625, + "kimi_kl": 0.24609375, + "learning_rate": 3.644e-07, + "loss": 0.0036, + "ppl": 0.047607421875, + "reward": 0.8572468757629395, + "reward_std": 0.0036493903025984764, + "rewards/perpo_ocr_edit_distance_reward": 0.8572469353675842, "step": 1356, "temperature": 0.9 }, { - "advantages": 3.1079566724656615e-06, - "completion_length": 663.5, - "delta_ref_entropy_loss": 0.038330078125, - "delta_ref_ppl": -0.0218505859375, - "entropy_loss": -0.0360107421875, - "epoch": 0.5428, - "grad_norm": 1.4692989728577652, - "k1_kl": 0.021820068359375, - "k3_kl": 0.011749267578125, - "kimi_kl": 0.025665283203125, - "learning_rate": 2.286e-07, - "loss": 0.0005, - "ppl": 0.01776123046875, - "reward": 0.9942238330841064, - "reward_std": 0.0006333366618491709, - "rewards/perpo_ocr_edit_distance_reward": 0.9942238330841064, + "advantages": 1.0796956303238403e-05, + "completion_length": 821.0, + "delta_ref_entropy_loss": 0.09326171875, + "delta_ref_ppl": -0.08154296875, + "entropy_loss": -0.08154296875, + "epoch": 0.2714, + "grad_norm": 1.1463573833918363, + "k1_kl": 0.08154296875, + "k3_kl": 0.052001953125, + "kimi_kl": 0.150390625, + "learning_rate": 3.643e-07, + "loss": 0.0021, + "ppl": 0.043212890625, + "reward": 0.953906238079071, + "reward_std": 0.0006890021613799036, + "rewards/perpo_ocr_edit_distance_reward": 0.953906238079071, "step": 1357, "temperature": 0.9 }, { - "advantages": -3.064530420715528e-05, - "completion_length": 318.5, - "delta_ref_entropy_loss": 0.033660888671875, - "delta_ref_ppl": -0.04510498046875, - "entropy_loss": -0.028076171875, - "epoch": 0.5432, - "grad_norm": 1.7369326406653394, - "k1_kl": 0.04534912109375, - "k3_kl": 0.030181884765625, - "kimi_kl": 0.08465576171875, - "learning_rate": 2.2839999999999998e-07, - "loss": 0.0012, - "ppl": 0.0166015625, - "reward": 0.9975769221782684, - "reward_std": 0.004240046604536474, - "rewards/perpo_ocr_edit_distance_reward": 0.9975769519805908, + "advantages": -2.6038715077447705e-05, + "completion_length": 925.0, + "delta_ref_entropy_loss": 0.12451171875, + "delta_ref_ppl": -0.10693359375, + "entropy_loss": -0.10986328125, + "epoch": 0.2716, + "grad_norm": 3.9840923631448657, + "k1_kl": 0.10693359375, + "k3_kl": 0.06689453125, + "kimi_kl": 0.2265625, + "learning_rate": 3.642e-07, + "loss": 0.0027, + "ppl": 0.058837890625, + "reward": 0.9427080750465393, + "reward_std": 0.0008807433769106865, + "rewards/perpo_ocr_edit_distance_reward": 0.9427081346511841, "step": 1358, "temperature": 0.9 }, { - "advantages": -2.784388470900012e-06, - "completion_length": 354.5, - "delta_ref_entropy_loss": 0.1650390625, - "delta_ref_ppl": -0.140625, - "entropy_loss": -0.138671875, - "epoch": 0.5436, - "grad_norm": 1.1669432004608173, - "k1_kl": 0.140869140625, - "k3_kl": 0.089111328125, - "kimi_kl": 0.231689453125, - "learning_rate": 2.2819999999999997e-07, - "loss": 0.0036, - "ppl": 0.07000732421875, - "reward": 0.8887663781642914, - "reward_std": 0.0022489188704639673, - "rewards/perpo_ocr_edit_distance_reward": 0.8887663781642914, + "advantages": -2.7069025236414745e-05, + "completion_length": 533.0, + "delta_ref_entropy_loss": 0.0986328125, + "delta_ref_ppl": -0.07421875, + "entropy_loss": -0.0556640625, + "epoch": 0.2718, + "grad_norm": 0.9237383437374721, + "k1_kl": 0.07421875, + "k3_kl": 0.04052734375, + "kimi_kl": 0.0810546875, + "learning_rate": 3.641e-07, + "loss": 0.0017, + "ppl": 0.0242919921875, + "reward": 0.9564631581306458, + "reward_std": 0.0011588848428800702, + "rewards/perpo_ocr_edit_distance_reward": 0.9564632177352905, "step": 1359, "temperature": 0.9 }, { - "advantages": -3.595863381633535e-05, - "completion_length": 236.5, - "delta_ref_entropy_loss": 0.0238037109375, - "delta_ref_ppl": -0.04205322265625, - "entropy_loss": -0.02581787109375, - "epoch": 0.544, - "grad_norm": 1.2748770063557135, - "k1_kl": 0.0418701171875, - "k3_kl": 0.03173828125, - "kimi_kl": 0.09197998046875, - "learning_rate": 2.28e-07, - "loss": 0.0013, - "ppl": 0.011627197265625, - "reward": 0.9990391135215759, - "reward_std": 0.00042373366886749864, - "rewards/perpo_ocr_edit_distance_reward": 0.9990391135215759, + "advantages": -1.1682511285471264e-05, + "completion_length": 1482.0, + "delta_ref_entropy_loss": 0.05810546875, + "delta_ref_ppl": -0.045654296875, + "entropy_loss": -0.0654296875, + "epoch": 0.272, + "grad_norm": 2.243970706688739, + "k1_kl": 0.045654296875, + "k3_kl": 0.0269775390625, + "kimi_kl": 0.058837890625, + "learning_rate": 3.64e-07, + "loss": 0.0011, + "ppl": 0.034423828125, + "reward": 0.8377576470375061, + "reward_std": 0.004263678099960089, + "rewards/perpo_ocr_edit_distance_reward": 0.8377576470375061, "step": 1360, "temperature": 0.9 }, { - "advantages": -2.060617816823651e-06, - "completion_length": 369.5, - "delta_ref_entropy_loss": 0.03594970703125, - "delta_ref_ppl": -0.04229736328125, - "entropy_loss": -0.0340576171875, - "epoch": 0.5444, - "grad_norm": 0.7002751795660365, - "k1_kl": 0.042236328125, - "k3_kl": 0.02874755859375, - "kimi_kl": 0.098388671875, - "learning_rate": 2.278e-07, - "loss": 0.0012, - "ppl": 0.016143798828125, - "reward": 0.9844445586204529, - "reward_std": 0.004170928892563097, - "rewards/perpo_ocr_edit_distance_reward": 0.9844445586204529, + "advantages": -3.9611546526430175e-05, + "completion_length": 928.0, + "delta_ref_entropy_loss": 0.032470703125, + "delta_ref_ppl": -0.050537109375, + "entropy_loss": -0.022705078125, + "epoch": 0.2722, + "grad_norm": 0.5112453977496788, + "k1_kl": 0.050537109375, + "k3_kl": 0.03369140625, + "kimi_kl": 0.07763671875, + "learning_rate": 3.6389999999999997e-07, + "loss": 0.0014, + "ppl": 0.01068115234375, + "reward": 0.9894769191741943, + "reward_std": 0.0005449583986774087, + "rewards/perpo_ocr_edit_distance_reward": 0.9894769191741943, "step": 1361, "temperature": 0.9 }, { - "advantages": -0.00027480297393367437, - "completion_length": 600.0, - "delta_ref_entropy_loss": 0.0311279296875, - "delta_ref_ppl": -0.021636962890625, - "entropy_loss": -0.05157470703125, - "epoch": 0.5448, - "grad_norm": 1.0235171494199131, - "k1_kl": 0.0216064453125, - "k3_kl": 0.011383056640625, - "kimi_kl": 0.0240936279296875, - "learning_rate": 2.2759999999999997e-07, - "loss": 0.0007, - "ppl": 0.027801513671875, - "reward": 0.9898563921451569, - "reward_std": 0.0029273036852828227, - "rewards/perpo_ocr_edit_distance_reward": 0.9898564219474792, + "advantages": 0.0, + "completion_length": 321.0, + "delta_ref_entropy_loss": 0.08056640625, + "delta_ref_ppl": -0.1015625, + "entropy_loss": -0.0194091796875, + "epoch": 0.2724, + "grad_norm": 0.0067819868814135206, + "k1_kl": 0.1015625, + "k3_kl": 0.0673828125, + "kimi_kl": 0.2138671875, + "learning_rate": 3.638e-07, + "loss": 0.0027, + "ppl": 0.0029754638671875, + "reward": 0.2682492434978485, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.2682492434978485, "step": 1362, "temperature": 0.9 }, { - "advantages": -4.127408934095911e-05, - "completion_length": 369.5, - "delta_ref_entropy_loss": 0.042724609375, - "delta_ref_ppl": -0.0555419921875, - "entropy_loss": -0.07269287109375, - "epoch": 0.5452, - "grad_norm": 1.074398004070438, - "k1_kl": 0.055419921875, - "k3_kl": 0.03729248046875, - "kimi_kl": 0.125732421875, - "learning_rate": 2.2739999999999998e-07, - "loss": 0.0015, - "ppl": 0.036834716796875, - "reward": 0.8674931824207306, - "reward_std": 0.045357669005170465, - "rewards/perpo_ocr_edit_distance_reward": 0.8674932718276978, + "advantages": -6.146090890979394e-05, + "completion_length": 604.0, + "delta_ref_entropy_loss": 0.083984375, + "delta_ref_ppl": -0.0673828125, + "entropy_loss": -0.0595703125, + "epoch": 0.2726, + "grad_norm": 1.1223307189233171, + "k1_kl": 0.0673828125, + "k3_kl": 0.04296875, + "kimi_kl": 0.09521484375, + "learning_rate": 3.637e-07, + "loss": 0.0018, + "ppl": 0.03466796875, + "reward": 0.9944641590118408, + "reward_std": 0.0011470845201984048, + "rewards/perpo_ocr_edit_distance_reward": 0.9944641590118408, "step": 1363, "temperature": 0.9 }, { - "advantages": -3.995214512997336e-05, - "completion_length": 532.5, - "delta_ref_entropy_loss": 0.0489501953125, - "delta_ref_ppl": -0.0341796875, - "entropy_loss": -0.041015625, - "epoch": 0.5456, - "grad_norm": 1.2202267232568837, - "k1_kl": 0.0340576171875, - "k3_kl": 0.021453857421875, - "kimi_kl": 0.03948974609375, - "learning_rate": 2.272e-07, - "loss": 0.0009, - "ppl": 0.0204620361328125, - "reward": 0.9248997271060944, - "reward_std": 0.006425170664442703, - "rewards/perpo_ocr_edit_distance_reward": 0.9248997569084167, + "advantages": -0.00014581000141333789, + "completion_length": 469.0, + "delta_ref_entropy_loss": 0.0673828125, + "delta_ref_ppl": -0.056640625, + "entropy_loss": -0.0142822265625, + "epoch": 0.2728, + "grad_norm": 0.3084604155855206, + "k1_kl": 0.056884765625, + "k3_kl": 0.030517578125, + "kimi_kl": 0.07177734375, + "learning_rate": 3.6359999999999995e-07, + "loss": 0.0014, + "ppl": 0.005706787109375, + "reward": 0.9941653609275818, + "reward_std": 0.000600748579017818, + "rewards/perpo_ocr_edit_distance_reward": 0.9941654205322266, "step": 1364, "temperature": 0.9 }, { - "advantages": -3.131372795905918e-05, - "completion_length": 287.5, - "delta_ref_entropy_loss": 0.05511474609375, - "delta_ref_ppl": -0.053466796875, - "entropy_loss": -0.030853271484375, - "epoch": 0.546, - "grad_norm": 1.6197444948432405, - "k1_kl": 0.053466796875, - "k3_kl": 0.03375244140625, - "kimi_kl": 0.08514404296875, - "learning_rate": 2.27e-07, - "loss": 0.0014, - "ppl": 0.013458251953125, - "reward": 0.9994405210018158, - "reward_std": 0.0006978129968047142, - "rewards/perpo_ocr_edit_distance_reward": 0.9994405508041382, + "advantages": -0.00018416132661513984, + "completion_length": 995.0, + "delta_ref_entropy_loss": 0.047119140625, + "delta_ref_ppl": -0.0439453125, + "entropy_loss": -0.0238037109375, + "epoch": 0.273, + "grad_norm": 0.256076145709371, + "k1_kl": 0.0439453125, + "k3_kl": 0.024169921875, + "kimi_kl": 0.06494140625, + "learning_rate": 3.635e-07, + "loss": 0.0012, + "ppl": 0.00921630859375, + "reward": 0.9987595081329346, + "reward_std": 0.0001774070697138086, + "rewards/perpo_ocr_edit_distance_reward": 0.9987595677375793, "step": 1365, "temperature": 0.9 }, { - "advantages": -0.00014915637075318955, - "completion_length": 877.0, - "delta_ref_entropy_loss": 0.08203125, - "delta_ref_ppl": -0.08056640625, - "entropy_loss": -0.0926513671875, - "epoch": 0.5464, - "grad_norm": 1.433794022965801, - "k1_kl": 0.08056640625, - "k3_kl": 0.0511474609375, - "kimi_kl": 0.132080078125, - "learning_rate": 2.268e-07, - "loss": 0.0022, - "ppl": 0.049560546875, - "reward": 0.9732857644557953, - "reward_std": 0.0011889301385963336, - "rewards/perpo_ocr_edit_distance_reward": 0.9732858538627625, + "advantages": -0.00019386837084311992, + "completion_length": 472.0, + "delta_ref_entropy_loss": 0.057861328125, + "delta_ref_ppl": -0.052978515625, + "entropy_loss": -0.0167236328125, + "epoch": 0.2732, + "grad_norm": 0.34017202333856666, + "k1_kl": 0.052978515625, + "k3_kl": 0.03271484375, + "kimi_kl": 0.08251953125, + "learning_rate": 3.634e-07, + "loss": 0.0015, + "ppl": 0.005767822265625, + "reward": 0.9942960143089294, + "reward_std": 0.0003391602949704975, + "rewards/perpo_ocr_edit_distance_reward": 0.9942960739135742, "step": 1366, "temperature": 0.9 }, { - "advantages": -9.959084854926914e-05, - "completion_length": 449.0, - "delta_ref_entropy_loss": 0.0316162109375, - "delta_ref_ppl": -0.01904296875, - "entropy_loss": -0.019622802734375, - "epoch": 0.5468, - "grad_norm": 0.6988916306712033, - "k1_kl": 0.01898193359375, - "k3_kl": 0.0100555419921875, - "kimi_kl": 0.02642822265625, - "learning_rate": 2.2659999999999998e-07, - "loss": 0.0005, - "ppl": 0.0084381103515625, - "reward": 0.9991690814495087, - "reward_std": 0.000480340066133067, - "rewards/perpo_ocr_edit_distance_reward": 0.9991690814495087, + "advantages": -3.2527107123314636e-06, + "completion_length": 761.0, + "delta_ref_entropy_loss": 0.15234375, + "delta_ref_ppl": -0.1162109375, + "entropy_loss": -0.197265625, + "epoch": 0.2734, + "grad_norm": 1.255036514263579, + "k1_kl": 0.1162109375, + "k3_kl": 0.061767578125, + "kimi_kl": 0.1201171875, + "learning_rate": 3.633e-07, + "loss": 0.0025, + "ppl": 0.107421875, + "reward": 0.9251084327697754, + "reward_std": 0.023575564846396446, + "rewards/perpo_ocr_edit_distance_reward": 0.9251085519790649, "step": 1367, "temperature": 0.9 }, { - "advantages": -5.138771939527942e-05, - "completion_length": 670.5, - "delta_ref_entropy_loss": 0.0340576171875, - "delta_ref_ppl": -0.015960693359375, - "entropy_loss": -0.03021240234375, - "epoch": 0.5472, - "grad_norm": 0.5304380029634238, - "k1_kl": 0.015960693359375, - "k3_kl": 0.0076904296875, - "kimi_kl": 0.011627197265625, - "learning_rate": 2.264e-07, - "loss": 0.0004, - "ppl": 0.014129638671875, - "reward": 0.9985867440700531, - "reward_std": 0.0008381952939089388, - "rewards/perpo_ocr_edit_distance_reward": 0.9985868036746979, + "advantages": -4.8518184485146776e-05, + "completion_length": 273.0, + "delta_ref_entropy_loss": 0.08837890625, + "delta_ref_ppl": -0.0810546875, + "entropy_loss": -0.016357421875, + "epoch": 0.2736, + "grad_norm": 0.8564922099132753, + "k1_kl": 0.0810546875, + "k3_kl": 0.044921875, + "kimi_kl": 0.119140625, + "learning_rate": 3.632e-07, + "loss": 0.0018, + "ppl": 0.00518798828125, + "reward": 0.9871535301208496, + "reward_std": 0.0016553278546780348, + "rewards/perpo_ocr_edit_distance_reward": 0.9871536493301392, "step": 1368, "temperature": 0.9 }, { - "advantages": -1.2363706900941906e-05, - "completion_length": 297.5, - "delta_ref_entropy_loss": 0.044189453125, - "delta_ref_ppl": -0.0654296875, - "entropy_loss": -0.03277587890625, - "epoch": 0.5476, - "grad_norm": 1.0350985847874525, - "k1_kl": 0.0654296875, - "k3_kl": 0.0457763671875, - "kimi_kl": 0.14404296875, - "learning_rate": 2.262e-07, - "loss": 0.0018, - "ppl": 0.01605224609375, - "reward": 0.9922068119049072, - "reward_std": 0.0007752776582492515, - "rewards/perpo_ocr_edit_distance_reward": 0.992206871509552, + "advantages": -1.171018448076211e-05, + "completion_length": 564.0, + "delta_ref_entropy_loss": 0.0751953125, + "delta_ref_ppl": -0.0908203125, + "entropy_loss": -0.0220947265625, + "epoch": 0.2738, + "grad_norm": 1.95460853625864, + "k1_kl": 0.09033203125, + "k3_kl": 0.059814453125, + "kimi_kl": 0.1962890625, + "learning_rate": 3.6309999999999996e-07, + "loss": 0.0024, + "ppl": 0.0106201171875, + "reward": 0.9817768335342407, + "reward_std": 0.0006279752124100924, + "rewards/perpo_ocr_edit_distance_reward": 0.9817769527435303, "step": 1369, "temperature": 0.9 }, { - "advantages": -9.707042408990674e-06, - "completion_length": 675.5, - "delta_ref_entropy_loss": 0.0460205078125, - "delta_ref_ppl": -0.0380859375, - "entropy_loss": -0.0555419921875, - "epoch": 0.548, - "grad_norm": 0.6761117718330156, - "k1_kl": 0.0382080078125, - "k3_kl": 0.02239990234375, - "kimi_kl": 0.0526123046875, - "learning_rate": 2.2599999999999999e-07, - "loss": 0.0009, - "ppl": 0.03070068359375, - "reward": 0.9752035140991211, - "reward_std": 0.003726653754711151, - "rewards/perpo_ocr_edit_distance_reward": 0.9752035737037659, + "advantages": -0.00010066373215522617, + "completion_length": 940.0, + "delta_ref_entropy_loss": 0.04443359375, + "delta_ref_ppl": -0.04248046875, + "entropy_loss": -0.0291748046875, + "epoch": 0.274, + "grad_norm": 1.2354407664340794, + "k1_kl": 0.04248046875, + "k3_kl": 0.023193359375, + "kimi_kl": 0.0654296875, + "learning_rate": 3.6299999999999995e-07, + "loss": 0.001, + "ppl": 0.01165771484375, + "reward": 0.9891911745071411, + "reward_std": 0.00023832960869185627, + "rewards/perpo_ocr_edit_distance_reward": 0.9891911745071411, "step": 1370, "temperature": 0.9 }, { - "advantages": -2.384185791015625e-07, - "completion_length": 543.0, - "delta_ref_entropy_loss": 0.0400390625, - "delta_ref_ppl": -0.021331787109375, - "entropy_loss": -0.0562744140625, - "epoch": 0.5484, - "grad_norm": 13.492676400198508, - "k1_kl": 0.021331787109375, - "k3_kl": 0.01837158203125, - "kimi_kl": 0.057498931884765625, - "learning_rate": 2.258e-07, - "loss": 0.0007, - "ppl": 0.023468017578125, - "reward": 0.9492691159248352, - "reward_std": 0.08665696531534195, - "rewards/perpo_ocr_edit_distance_reward": 0.9492692351341248, + "advantages": -0.00010726282198447734, + "completion_length": 349.0, + "delta_ref_entropy_loss": 0.05126953125, + "delta_ref_ppl": -0.08447265625, + "entropy_loss": -0.0189208984375, + "epoch": 0.2742, + "grad_norm": 0.6768359077777469, + "k1_kl": 0.0849609375, + "k3_kl": 0.057861328125, + "kimi_kl": 0.18359375, + "learning_rate": 3.629e-07, + "loss": 0.0024, + "ppl": 0.005126953125, + "reward": 0.9930952191352844, + "reward_std": 0.0002969690249301493, + "rewards/perpo_ocr_edit_distance_reward": 0.9930952787399292, "step": 1371, "temperature": 0.9 }, { - "advantages": -0.00034333980875089765, - "completion_length": 815.0, - "delta_ref_entropy_loss": 0.024658203125, - "delta_ref_ppl": -0.015106201171875, - "entropy_loss": -0.018035888671875, - "epoch": 0.5488, - "grad_norm": 0.298347836723149, - "k1_kl": 0.015106201171875, - "k3_kl": 0.006988525390625, - "kimi_kl": 0.01572418212890625, - "learning_rate": 2.2559999999999998e-07, - "loss": 0.0006, - "ppl": 0.0102081298828125, - "reward": 0.9952572584152222, - "reward_std": 0.0001982268295250833, - "rewards/perpo_ocr_edit_distance_reward": 0.9952573478221893, + "advantages": -4.5465574658010155e-05, + "completion_length": 625.0, + "delta_ref_entropy_loss": 0.052001953125, + "delta_ref_ppl": -0.0478515625, + "entropy_loss": -0.022216796875, + "epoch": 0.2744, + "grad_norm": 0.3082157836393694, + "k1_kl": 0.0478515625, + "k3_kl": 0.025634765625, + "kimi_kl": 0.07958984375, + "learning_rate": 3.628e-07, + "loss": 0.0011, + "ppl": 0.007293701171875, + "reward": 0.9606616497039795, + "reward_std": 0.00027459129341877997, + "rewards/perpo_ocr_edit_distance_reward": 0.9606617093086243, "step": 1372, "temperature": 0.9 }, { - "advantages": -0.0001157777705884655, - "completion_length": 765.5, - "delta_ref_entropy_loss": 0.030517578125, - "delta_ref_ppl": -0.01953125, - "entropy_loss": -0.05450439453125, - "epoch": 0.5492, - "grad_norm": 1.0108798237189573, - "k1_kl": 0.0194091796875, - "k3_kl": 0.0101318359375, - "kimi_kl": 0.018218994140625, - "learning_rate": 2.2539999999999997e-07, - "loss": 0.0005, - "ppl": 0.024383544921875, - "reward": 0.9958153069019318, - "reward_std": 0.0012400672130752355, - "rewards/perpo_ocr_edit_distance_reward": 0.9958153665065765, + "advantages": -0.00012052059901179746, + "completion_length": 666.0, + "delta_ref_entropy_loss": 0.04638671875, + "delta_ref_ppl": -0.05078125, + "entropy_loss": -0.0250244140625, + "epoch": 0.2746, + "grad_norm": 0.4523368313024408, + "k1_kl": 0.050537109375, + "k3_kl": 0.0306396484375, + "kimi_kl": 0.095703125, + "learning_rate": 3.627e-07, + "loss": 0.0013, + "ppl": 0.00994873046875, + "reward": 0.9897082448005676, + "reward_std": 0.0007479223422706127, + "rewards/perpo_ocr_edit_distance_reward": 0.989708423614502, "step": 1373, "temperature": 0.9 }, { - "advantages": -6.886465598654468e-06, - "completion_length": 340.5, - "delta_ref_entropy_loss": 0.01910400390625, - "delta_ref_ppl": -0.0443115234375, - "entropy_loss": -0.01934814453125, - "epoch": 0.5496, - "grad_norm": 0.5927812213180215, - "k1_kl": 0.0443115234375, - "k3_kl": 0.035888671875, - "kimi_kl": 0.19384765625, - "learning_rate": 2.252e-07, - "loss": 0.0014, - "ppl": 0.0087432861328125, - "reward": 0.999848484992981, - "reward_std": 0.000258722371654585, - "rewards/perpo_ocr_edit_distance_reward": 0.9998485147953033, + "advantages": 0.0, + "completion_length": 159.0, + "delta_ref_entropy_loss": 0.09130859375, + "delta_ref_ppl": -0.1884765625, + "entropy_loss": -0.0279541015625, + "epoch": 0.2748, + "grad_norm": 0.04111033976503761, + "k1_kl": 0.189453125, + "k3_kl": 0.1396484375, + "kimi_kl": 0.578125, + "learning_rate": 3.626e-07, + "loss": 0.0056, + "ppl": 0.00823974609375, + "reward": 0.9353312253952026, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9353312253952026, "step": 1374, "temperature": 0.9 }, { - "advantages": -4.433308549778303e-05, - "completion_length": 894.5, - "delta_ref_entropy_loss": 0.04791259765625, - "delta_ref_ppl": -0.04132080078125, - "entropy_loss": -0.043609619140625, - "epoch": 0.55, - "grad_norm": 2.7205868519234735, - "k1_kl": 0.04119873046875, - "k3_kl": 0.0283203125, - "kimi_kl": 0.074462890625, - "learning_rate": 2.25e-07, - "loss": 0.0012, - "ppl": 0.0234375, - "reward": 0.9857337176799774, - "reward_std": 0.001651920290896669, - "rewards/perpo_ocr_edit_distance_reward": 0.9857337772846222, + "advantages": -6.464550097007304e-05, + "completion_length": 934.0, + "delta_ref_entropy_loss": 0.046875, + "delta_ref_ppl": -0.046875, + "entropy_loss": -0.0164794921875, + "epoch": 0.275, + "grad_norm": 0.23908817258143913, + "k1_kl": 0.046875, + "k3_kl": 0.025634765625, + "kimi_kl": 0.06787109375, + "learning_rate": 3.6249999999999997e-07, + "loss": 0.0011, + "ppl": 0.00579833984375, + "reward": 0.9928188323974609, + "reward_std": 0.00042696320451796055, + "rewards/perpo_ocr_edit_distance_reward": 0.9928188323974609, "step": 1375, "temperature": 0.9 }, { - "advantages": -0.00019699335825862363, - "completion_length": 748.5, - "delta_ref_entropy_loss": 0.0302734375, - "delta_ref_ppl": -0.017303466796875, - "entropy_loss": -0.0263671875, - "epoch": 0.5504, - "grad_norm": 0.5624717999041604, - "k1_kl": 0.017333984375, - "k3_kl": 0.008941650390625, - "kimi_kl": 0.018768310546875, - "learning_rate": 2.248e-07, - "loss": 0.0006, - "ppl": 0.013580322265625, - "reward": 0.9597108960151672, - "reward_std": 0.00018321351672057062, - "rewards/perpo_ocr_edit_distance_reward": 0.959710955619812, + "advantages": -4.087175966560608e-07, + "completion_length": 93.0, + "delta_ref_entropy_loss": 0.0181884765625, + "delta_ref_ppl": -0.279296875, + "entropy_loss": -0.1201171875, + "epoch": 0.2752, + "grad_norm": 2.8535070289457383, + "k1_kl": 0.279296875, + "k3_kl": 0.21484375, + "kimi_kl": 0.74609375, + "learning_rate": 3.6239999999999996e-07, + "loss": 0.0086, + "ppl": 0.05078125, + "reward": 0.8690105080604553, + "reward_std": 0.1674104779958725, + "rewards/perpo_ocr_edit_distance_reward": 0.8690106272697449, "step": 1376, "temperature": 0.9 }, { - "advantages": -7.199815809144638e-05, - "completion_length": 540.5, - "delta_ref_entropy_loss": 0.0372314453125, - "delta_ref_ppl": -0.0289306640625, - "entropy_loss": -0.02777099609375, - "epoch": 0.5508, - "grad_norm": 0.6920435574585775, - "k1_kl": 0.0289306640625, - "k3_kl": 0.015716552734375, - "kimi_kl": 0.0301513671875, - "learning_rate": 2.2459999999999999e-07, - "loss": 0.0007, - "ppl": 0.0143280029296875, - "reward": 0.9984178245067596, - "reward_std": 0.00038889610732439905, - "rewards/perpo_ocr_edit_distance_reward": 0.998417854309082, + "advantages": -8.159024582710117e-05, + "completion_length": 723.0, + "delta_ref_entropy_loss": 0.06787109375, + "delta_ref_ppl": -0.050537109375, + "entropy_loss": -0.04296875, + "epoch": 0.2754, + "grad_norm": 0.4282380538713682, + "k1_kl": 0.050537109375, + "k3_kl": 0.0247802734375, + "kimi_kl": 0.052490234375, + "learning_rate": 3.623e-07, + "loss": 0.0011, + "ppl": 0.016845703125, + "reward": 0.971406102180481, + "reward_std": 0.0006305747665464878, + "rewards/perpo_ocr_edit_distance_reward": 0.9714061617851257, "step": 1377, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 129.5, - "delta_ref_entropy_loss": 0.0875244140625, - "delta_ref_ppl": -0.16461181640625, - "entropy_loss": -0.048095703125, - "epoch": 0.5512, - "grad_norm": 0.0586246907460938, - "k1_kl": 0.16461181640625, - "k3_kl": 0.121826171875, - "kimi_kl": 0.4998779296875, - "learning_rate": 2.2439999999999997e-07, - "loss": 0.0052, - "ppl": 0.021026611328125, - "reward": 0.8353474736213684, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.8353474736213684, + "advantages": -1.1461122085165698e-05, + "completion_length": 715.0, + "delta_ref_entropy_loss": 0.1328125, + "delta_ref_ppl": -0.09130859375, + "entropy_loss": -0.126953125, + "epoch": 0.2756, + "grad_norm": 1.584966286969312, + "k1_kl": 0.09130859375, + "k3_kl": 0.04833984375, + "kimi_kl": 0.1259765625, + "learning_rate": 3.622e-07, + "loss": 0.0019, + "ppl": 0.0673828125, + "reward": 0.964601457118988, + "reward_std": 0.005844390485435724, + "rewards/perpo_ocr_edit_distance_reward": 0.9646015167236328, "step": 1378, "temperature": 0.9 }, { - "advantages": -7.458670006599277e-05, - "completion_length": 473.0, - "delta_ref_entropy_loss": 0.017364501953125, - "delta_ref_ppl": -0.012908935546875, - "entropy_loss": -0.0294189453125, - "epoch": 0.5516, - "grad_norm": 26.498704360396736, - "k1_kl": 0.013031005859375, - "k3_kl": 0.04729461669921875, - "kimi_kl": 0.0465087890625, - "learning_rate": 2.242e-07, - "loss": 0.002, - "ppl": 0.0395355224609375, - "reward": 0.8240008652210236, - "reward_std": 6.415999087039381e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.8240009248256683, + "advantages": -1.3887883142160717e-05, + "completion_length": 1201.0, + "delta_ref_entropy_loss": 0.0791015625, + "delta_ref_ppl": -0.056396484375, + "entropy_loss": -0.0986328125, + "epoch": 0.2758, + "grad_norm": 0.9188037771987465, + "k1_kl": 0.056396484375, + "k3_kl": 0.031494140625, + "kimi_kl": 0.0693359375, + "learning_rate": 3.6209999999999994e-07, + "loss": 0.0013, + "ppl": 0.054931640625, + "reward": 0.908679187297821, + "reward_std": 0.002352308016270399, + "rewards/perpo_ocr_edit_distance_reward": 0.908679187297821, "step": 1379, "temperature": 0.9 }, { - "advantages": -0.00024110079434080944, - "completion_length": 871.5, - "delta_ref_entropy_loss": 0.11029052734375, - "delta_ref_ppl": -0.07177734375, - "entropy_loss": -0.16339111328125, - "epoch": 0.552, - "grad_norm": 1.7300435794719817, - "k1_kl": 0.07208251953125, - "k3_kl": 0.043212890625, - "kimi_kl": 0.147216796875, - "learning_rate": 2.24e-07, - "loss": 0.002, - "ppl": 0.09136962890625, - "reward": 0.5711818560957909, - "reward_std": 0.0023250044032465667, - "rewards/perpo_ocr_edit_distance_reward": 0.5711819157004356, + "advantages": -8.549009180569556e-06, + "completion_length": 61.0, + "delta_ref_entropy_loss": 0.11767578125, + "delta_ref_ppl": -0.40625, + "entropy_loss": -0.1025390625, + "epoch": 0.276, + "grad_norm": 3.7337579612011567, + "k1_kl": 0.408203125, + "k3_kl": 0.30859375, + "kimi_kl": 0.984375, + "learning_rate": 3.62e-07, + "loss": 0.0124, + "ppl": 0.0380859375, + "reward": 0.9005975127220154, + "reward_std": 0.008897808380424976, + "rewards/perpo_ocr_edit_distance_reward": 0.9005975723266602, "step": 1380, "temperature": 0.9 }, { - "advantages": -1.584419305800111e-05, - "completion_length": 471.5, - "delta_ref_entropy_loss": 0.0516357421875, - "delta_ref_ppl": -0.0460205078125, - "entropy_loss": -0.0572509765625, - "epoch": 0.5524, - "grad_norm": 1.8176526943639562, - "k1_kl": 0.0460205078125, - "k3_kl": 0.0284423828125, - "kimi_kl": 0.0743408203125, - "learning_rate": 2.2379999999999998e-07, - "loss": 0.0012, - "ppl": 0.03289794921875, - "reward": 0.8932982087135315, - "reward_std": 0.003657081979326904, - "rewards/perpo_ocr_edit_distance_reward": 0.8932982385158539, + "advantages": -0.0002220443420810625, + "completion_length": 784.0, + "delta_ref_entropy_loss": 0.047607421875, + "delta_ref_ppl": -0.0400390625, + "entropy_loss": -0.0135498046875, + "epoch": 0.2762, + "grad_norm": 0.5461631978556472, + "k1_kl": 0.0400390625, + "k3_kl": 0.0208740234375, + "kimi_kl": 0.059814453125, + "learning_rate": 3.619e-07, + "loss": 0.0011, + "ppl": 0.00469970703125, + "reward": 0.9885575175285339, + "reward_std": 9.173079888569191e-05, + "rewards/perpo_ocr_edit_distance_reward": 0.9885575771331787, "step": 1381, "temperature": 0.9 }, { - "advantages": -5.487885573529638e-06, - "completion_length": 505.5, - "delta_ref_entropy_loss": 0.0367431640625, - "delta_ref_ppl": -0.05145263671875, - "entropy_loss": -0.02984619140625, - "epoch": 0.5528, - "grad_norm": 0.5978140774423079, - "k1_kl": 0.0513916015625, - "k3_kl": 0.03680419921875, - "kimi_kl": 0.1123046875, - "learning_rate": 2.236e-07, - "loss": 0.0015, - "ppl": 0.014251708984375, - "reward": 0.9943016469478607, - "reward_std": 0.0012451917282305658, - "rewards/perpo_ocr_edit_distance_reward": 0.9943017065525055, + "advantages": -4.4124470150563866e-05, + "completion_length": 419.0, + "delta_ref_entropy_loss": 0.04736328125, + "delta_ref_ppl": -0.10107421875, + "entropy_loss": -0.0157470703125, + "epoch": 0.2764, + "grad_norm": 0.8213178935978203, + "k1_kl": 0.10107421875, + "k3_kl": 0.08056640625, + "kimi_kl": 0.30859375, + "learning_rate": 3.6179999999999997e-07, + "loss": 0.0033, + "ppl": 0.0069580078125, + "reward": 0.9913259744644165, + "reward_std": 0.0006718370714224875, + "rewards/perpo_ocr_edit_distance_reward": 0.9913260340690613, "step": 1382, "temperature": 0.9 }, { - "advantages": -2.906577992689563e-05, - "completion_length": 455.0, - "delta_ref_entropy_loss": 0.029541015625, - "delta_ref_ppl": -0.03125, - "entropy_loss": -0.0185546875, - "epoch": 0.5532, - "grad_norm": 0.6797773987194359, - "k1_kl": 0.0313720703125, - "k3_kl": 0.019012451171875, - "kimi_kl": 0.0482177734375, - "learning_rate": 2.2339999999999998e-07, - "loss": 0.0008, - "ppl": 0.00994873046875, - "reward": 0.9995772242546082, - "reward_std": 0.0005344930686987936, - "rewards/perpo_ocr_edit_distance_reward": 0.9995772242546082, + "advantages": -7.173845006036572e-06, + "completion_length": 1225.0, + "delta_ref_entropy_loss": 0.03515625, + "delta_ref_ppl": -0.037841796875, + "entropy_loss": -0.038818359375, + "epoch": 0.2766, + "grad_norm": 2.346543504372697, + "k1_kl": 0.03759765625, + "k3_kl": 0.0211181640625, + "kimi_kl": 0.0517578125, + "learning_rate": 3.617e-07, + "loss": 0.0009, + "ppl": 0.0181884765625, + "reward": 0.9897682070732117, + "reward_std": 0.001083696843124926, + "rewards/perpo_ocr_edit_distance_reward": 0.9897682070732117, "step": 1383, "temperature": 0.9 }, { - "advantages": -6.0541290849869256e-05, - "completion_length": 796.0, - "delta_ref_entropy_loss": 0.022216796875, - "delta_ref_ppl": -0.00860595703125, - "entropy_loss": -0.016448974609375, - "epoch": 0.5536, - "grad_norm": 0.2941054551598428, - "k1_kl": 0.0086517333984375, - "k3_kl": 0.003261566162109375, - "kimi_kl": 0.00525665283203125, - "learning_rate": 2.232e-07, - "loss": 0.0002, - "ppl": 0.0072784423828125, - "reward": 0.9944376349449158, - "reward_std": 0.0044416588061721995, - "rewards/perpo_ocr_edit_distance_reward": 0.9944377243518829, + "advantages": 0.0, + "completion_length": 490.0, + "delta_ref_entropy_loss": 0.07080078125, + "delta_ref_ppl": -0.08349609375, + "entropy_loss": -0.0247802734375, + "epoch": 0.2768, + "grad_norm": 0.038244994053489904, + "k1_kl": 0.083984375, + "k3_kl": 0.0517578125, + "kimi_kl": 0.1591796875, + "learning_rate": 3.6159999999999996e-07, + "loss": 0.0021, + "ppl": 0.00628662109375, + "reward": 0.9650614261627197, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.965061366558075, "step": 1384, "temperature": 0.9 }, { - "advantages": -6.620373045507222e-05, - "completion_length": 511.0, - "delta_ref_entropy_loss": 0.0296630859375, - "delta_ref_ppl": -0.0257568359375, - "entropy_loss": -0.03009033203125, - "epoch": 0.554, - "grad_norm": 0.6680571213297014, - "k1_kl": 0.02569580078125, - "k3_kl": 0.013946533203125, - "kimi_kl": 0.0345458984375, - "learning_rate": 2.23e-07, + "advantages": 3.6750523577211425e-05, + "completion_length": 1058.0, + "delta_ref_entropy_loss": 0.0213623046875, + "delta_ref_ppl": -0.024169921875, + "entropy_loss": -0.02587890625, + "epoch": 0.277, + "grad_norm": 0.7289692371575962, + "k1_kl": 0.024169921875, + "k3_kl": 0.014892578125, + "kimi_kl": 0.034912109375, + "learning_rate": 3.6149999999999995e-07, "loss": 0.0006, - "ppl": 0.01123046875, - "reward": 0.9317564368247986, - "reward_std": 0.16573405277449638, - "rewards/perpo_ocr_edit_distance_reward": 0.9317565560340881, + "ppl": 0.01055908203125, + "reward": 0.9741013050079346, + "reward_std": 0.0005955393426120281, + "rewards/perpo_ocr_edit_distance_reward": 0.9741012454032898, "step": 1385, "temperature": 0.9 }, { - "advantages": 6.982258469179214e-07, - "completion_length": 738.5, - "delta_ref_entropy_loss": 0.02423095703125, - "delta_ref_ppl": -0.022705078125, - "entropy_loss": -0.01507568359375, - "epoch": 0.5544, - "grad_norm": 0.3742892506494102, - "k1_kl": 0.02264404296875, - "k3_kl": 0.013031005859375, - "kimi_kl": 0.0272216796875, - "learning_rate": 2.2279999999999998e-07, - "loss": 0.0005, - "ppl": 0.0049896240234375, - "reward": 0.9762088060379028, - "reward_std": 0.003042369382455945, - "rewards/perpo_ocr_edit_distance_reward": 0.9762088358402252, + "advantages": -0.0005960464477539062, + "completion_length": 579.0, + "delta_ref_entropy_loss": 0.056884765625, + "delta_ref_ppl": -0.056884765625, + "entropy_loss": -0.010009765625, + "epoch": 0.2772, + "grad_norm": 0.004425116349877347, + "k1_kl": 0.05712890625, + "k3_kl": 0.034912109375, + "kimi_kl": 0.12451171875, + "learning_rate": 3.614e-07, + "loss": 0.002, + "ppl": 0.0026092529296875, + "reward": 0.998581051826477, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9985811114311218, "step": 1386, "temperature": 0.9 }, { - "advantages": -6.285735798883252e-05, - "completion_length": 465.5, - "delta_ref_entropy_loss": 0.06640625, - "delta_ref_ppl": -0.032012939453125, - "entropy_loss": -0.0540771484375, - "epoch": 0.5548, - "grad_norm": 0.9250602463872771, - "k1_kl": 0.032012939453125, - "k3_kl": 0.0134429931640625, - "kimi_kl": 0.02716827392578125, - "learning_rate": 2.226e-07, - "loss": 0.0006, - "ppl": 0.0257568359375, - "reward": 0.9255347847938538, - "reward_std": 0.0010056744213216007, - "rewards/perpo_ocr_edit_distance_reward": 0.9255348742008209, + "advantages": -5.9757916460512206e-05, + "completion_length": 580.0, + "delta_ref_entropy_loss": 0.04052734375, + "delta_ref_ppl": -0.03662109375, + "entropy_loss": -0.01953125, + "epoch": 0.2774, + "grad_norm": 0.39108224589179447, + "k1_kl": 0.03662109375, + "k3_kl": 0.0179443359375, + "kimi_kl": 0.04052734375, + "learning_rate": 3.613e-07, + "loss": 0.0008, + "ppl": 0.009033203125, + "reward": 0.9954217076301575, + "reward_std": 0.0007552166352979839, + "rewards/perpo_ocr_edit_distance_reward": 0.9954218864440918, "step": 1387, "temperature": 0.9 }, { - "advantages": -5.5428063205908984e-05, - "completion_length": 771.5, - "delta_ref_entropy_loss": 0.02850341796875, - "delta_ref_ppl": -0.022705078125, - "entropy_loss": -0.01373291015625, - "epoch": 0.5552, - "grad_norm": 0.28189370681971493, - "k1_kl": 0.02264404296875, - "k3_kl": 0.013031005859375, - "kimi_kl": 0.04583740234375, - "learning_rate": 2.2239999999999998e-07, - "loss": 0.0006, - "ppl": 0.00489044189453125, - "reward": 0.9990680813789368, - "reward_std": 0.0005491354822879657, - "rewards/perpo_ocr_edit_distance_reward": 0.9990681111812592, + "advantages": -2.7929033876716858e-06, + "completion_length": 371.0, + "delta_ref_entropy_loss": 0.146484375, + "delta_ref_ppl": -0.12109375, + "entropy_loss": -0.1533203125, + "epoch": 0.2776, + "grad_norm": 2.2679997363555024, + "k1_kl": 0.12060546875, + "k3_kl": 0.0751953125, + "kimi_kl": 0.236328125, + "learning_rate": 3.612e-07, + "loss": 0.003, + "ppl": 0.0673828125, + "reward": 0.6799293756484985, + "reward_std": 0.021288787946105003, + "rewards/perpo_ocr_edit_distance_reward": 0.6799294352531433, "step": 1388, "temperature": 0.9 }, { - "advantages": -4.2029792439279845e-05, - "completion_length": 664.5, - "delta_ref_entropy_loss": 0.0673828125, - "delta_ref_ppl": -0.0491943359375, - "entropy_loss": -0.0670166015625, - "epoch": 0.5556, - "grad_norm": 0.7719172417847755, - "k1_kl": 0.049072265625, - "k3_kl": 0.02752685546875, - "kimi_kl": 0.07861328125, - "learning_rate": 2.222e-07, - "loss": 0.0011, - "ppl": 0.03448486328125, - "reward": 0.9690268933773041, - "reward_std": 0.002959944133181125, - "rewards/perpo_ocr_edit_distance_reward": 0.9690269529819489, + "advantages": -3.931777973775752e-05, + "completion_length": 567.0, + "delta_ref_entropy_loss": 0.07861328125, + "delta_ref_ppl": -0.06640625, + "entropy_loss": -0.02978515625, + "epoch": 0.2778, + "grad_norm": 0.4463401083336634, + "k1_kl": 0.06640625, + "k3_kl": 0.038330078125, + "kimi_kl": 0.1298828125, + "learning_rate": 3.6109999999999997e-07, + "loss": 0.0016, + "ppl": 0.0101318359375, + "reward": 0.862642765045166, + "reward_std": 0.0011993848020210862, + "rewards/perpo_ocr_edit_distance_reward": 0.8626428246498108, "step": 1389, "temperature": 0.9 }, { - "advantages": -0.00016588825383223593, - "completion_length": 655.5, - "delta_ref_entropy_loss": 0.0318603515625, - "delta_ref_ppl": -0.024444580078125, - "entropy_loss": -0.0157470703125, - "epoch": 0.556, - "grad_norm": 0.08665349668523828, - "k1_kl": 0.02435302734375, - "k3_kl": 0.01299285888671875, - "kimi_kl": 0.03369903564453125, - "learning_rate": 2.22e-07, - "loss": 0.0007, - "ppl": 0.00506591796875, - "reward": 0.9999437034130096, - "reward_std": 5.266461812425405e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.999943733215332, + "advantages": -0.0005960464477539062, + "completion_length": 599.0, + "delta_ref_entropy_loss": 0.041259765625, + "delta_ref_ppl": -0.039306640625, + "entropy_loss": -0.01458740234375, + "epoch": 0.278, + "grad_norm": 0.018451515724477778, + "k1_kl": 0.0390625, + "k3_kl": 0.0234375, + "kimi_kl": 0.0693359375, + "learning_rate": 3.6099999999999996e-07, + "loss": 0.0015, + "ppl": 0.003997802734375, + "reward": 0.830562174320221, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.8305622339248657, "step": 1390, "temperature": 0.9 }, { - "advantages": -1.0677747013687622e-05, - "completion_length": 987.5, - "delta_ref_entropy_loss": 0.03326416015625, - "delta_ref_ppl": -0.026458740234375, - "entropy_loss": -0.021148681640625, - "epoch": 0.5564, - "grad_norm": 0.42191584722035924, - "k1_kl": 0.026458740234375, - "k3_kl": 0.018829345703125, - "kimi_kl": 0.0477294921875, - "learning_rate": 2.218e-07, - "loss": 0.0008, - "ppl": 0.0145721435546875, - "reward": 0.9936333298683167, - "reward_std": 0.000548274430911988, - "rewards/perpo_ocr_edit_distance_reward": 0.993633359670639, + "advantages": 1.7029899268550253e-08, + "completion_length": 990.0, + "delta_ref_entropy_loss": 0.08349609375, + "delta_ref_ppl": -0.083984375, + "entropy_loss": -0.057373046875, + "epoch": 0.2782, + "grad_norm": 1.1487309133245802, + "k1_kl": 0.083984375, + "k3_kl": 0.049560546875, + "kimi_kl": 0.1357421875, + "learning_rate": 3.6089999999999996e-07, + "loss": 0.002, + "ppl": 0.0260009765625, + "reward": 0.9585091471672058, + "reward_std": 0.0012307015713304281, + "rewards/perpo_ocr_edit_distance_reward": 0.9585092067718506, "step": 1391, "temperature": 0.9 }, { - "advantages": -0.0003123538845102303, - "completion_length": 228.5, - "delta_ref_entropy_loss": 0.0565185546875, - "delta_ref_ppl": -0.0562744140625, - "entropy_loss": -0.028778076171875, - "epoch": 0.5568, - "grad_norm": 0.47230979340752594, - "k1_kl": 0.0560302734375, - "k3_kl": 0.034912109375, - "kimi_kl": 0.102783203125, - "learning_rate": 2.2159999999999997e-07, - "loss": 0.0017, - "ppl": 0.012603759765625, - "reward": 0.879737913608551, - "reward_std": 0.0011374016758054495, - "rewards/perpo_ocr_edit_distance_reward": 0.8797379732131958, + "advantages": -1.6961779692792334e-05, + "completion_length": 760.0, + "delta_ref_entropy_loss": 0.12890625, + "delta_ref_ppl": -0.10693359375, + "entropy_loss": -0.14453125, + "epoch": 0.2784, + "grad_norm": 2.3459489177408512, + "k1_kl": 0.10693359375, + "k3_kl": 0.06494140625, + "kimi_kl": 0.1650390625, + "learning_rate": 3.608e-07, + "loss": 0.0026, + "ppl": 0.076171875, + "reward": 0.9289900660514832, + "reward_std": 0.002909902250394225, + "rewards/perpo_ocr_edit_distance_reward": 0.9289902448654175, "step": 1392, "temperature": 0.9 }, { - "advantages": 4.2283109678464825e-05, - "completion_length": 816.0, - "delta_ref_entropy_loss": 0.05328369140625, - "delta_ref_ppl": -0.039306640625, - "entropy_loss": -0.0972900390625, - "epoch": 0.5572, - "grad_norm": 1.1664339074831653, - "k1_kl": 0.039306640625, - "k3_kl": 0.0252685546875, - "kimi_kl": 0.086181640625, - "learning_rate": 2.214e-07, - "loss": 0.001, - "ppl": 0.04986572265625, - "reward": 0.8774694204330444, - "reward_std": 0.014560961921233684, - "rewards/perpo_ocr_edit_distance_reward": 0.8774694204330444, + "advantages": 4.2574748704282683e-07, + "completion_length": 761.0, + "delta_ref_entropy_loss": 0.1064453125, + "delta_ref_ppl": -0.0888671875, + "entropy_loss": -0.10302734375, + "epoch": 0.2786, + "grad_norm": 1.9316934504431196, + "k1_kl": 0.0888671875, + "k3_kl": 0.046630859375, + "kimi_kl": 0.099609375, + "learning_rate": 3.607e-07, + "loss": 0.0019, + "ppl": 0.0478515625, + "reward": 0.8432895541191101, + "reward_std": 0.018529165536165237, + "rewards/perpo_ocr_edit_distance_reward": 0.8432895541191101, "step": 1393, "temperature": 0.9 }, { - "advantages": -0.0003241215435991762, - "completion_length": 299.0, - "delta_ref_entropy_loss": 0.0400390625, - "delta_ref_ppl": -0.02886962890625, - "entropy_loss": -0.03057861328125, - "epoch": 0.5576, - "grad_norm": 0.5025073875585535, - "k1_kl": 0.029052734375, - "k3_kl": 0.0174713134765625, - "kimi_kl": 0.059234619140625, - "learning_rate": 2.212e-07, - "loss": 0.001, - "ppl": 0.0119171142578125, - "reward": 0.9823171198368073, - "reward_std": 0.0006838699337095022, - "rewards/perpo_ocr_edit_distance_reward": 0.9823172092437744, + "advantages": -3.75679592252709e-05, + "completion_length": 439.0, + "delta_ref_entropy_loss": 0.08349609375, + "delta_ref_ppl": -0.119140625, + "entropy_loss": -0.046630859375, + "epoch": 0.2788, + "grad_norm": 0.9750641778626215, + "k1_kl": 0.11962890625, + "k3_kl": 0.078125, + "kimi_kl": 0.296875, + "learning_rate": 3.6059999999999993e-07, + "loss": 0.0032, + "ppl": 0.0224609375, + "reward": 0.9923962950706482, + "reward_std": 0.0023928978480398655, + "rewards/perpo_ocr_edit_distance_reward": 0.9923964142799377, "step": 1394, "temperature": 0.9 }, { - "advantages": -0.00015207700425889925, - "completion_length": 836.5, - "delta_ref_entropy_loss": 0.03594970703125, - "delta_ref_ppl": -0.026092529296875, - "entropy_loss": -0.023681640625, - "epoch": 0.558, - "grad_norm": 0.3069109928887802, - "k1_kl": 0.026092529296875, - "k3_kl": 0.0136260986328125, - "kimi_kl": 0.04046630859375, - "learning_rate": 2.2099999999999998e-07, - "loss": 0.0007, - "ppl": 0.0095062255859375, - "reward": 0.9903141558170319, - "reward_std": 0.002294565703778062, - "rewards/perpo_ocr_edit_distance_reward": 0.9903142750263214, + "advantages": -8.65459514898248e-05, + "completion_length": 715.0, + "delta_ref_entropy_loss": 0.02880859375, + "delta_ref_ppl": -0.03466796875, + "entropy_loss": -0.01416015625, + "epoch": 0.279, + "grad_norm": 0.2045960676940931, + "k1_kl": 0.03466796875, + "k3_kl": 0.0203857421875, + "kimi_kl": 0.062255859375, + "learning_rate": 3.605e-07, + "loss": 0.0009, + "ppl": 0.003936767578125, + "reward": 0.9983773827552795, + "reward_std": 0.0001951363228727132, + "rewards/perpo_ocr_edit_distance_reward": 0.9983775019645691, "step": 1395, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 271.5, - "delta_ref_entropy_loss": 0.02496337890625, - "delta_ref_ppl": -0.0225067138671875, - "entropy_loss": -0.018463134765625, - "epoch": 0.5584, - "grad_norm": 0.044010294178508914, - "k1_kl": 0.022491455078125, - "k3_kl": 0.0137939453125, - "kimi_kl": 0.0264892578125, - "learning_rate": 2.208e-07, - "loss": 0.0009, - "ppl": 0.008087158203125, - "reward": 0.9997608363628387, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9997608959674835, + "advantages": -0.00012121456529712304, + "completion_length": 845.0, + "delta_ref_entropy_loss": 0.07470703125, + "delta_ref_ppl": -0.061767578125, + "entropy_loss": -0.0390625, + "epoch": 0.2792, + "grad_norm": 0.425905685000156, + "k1_kl": 0.061767578125, + "k3_kl": 0.0289306640625, + "kimi_kl": 0.07470703125, + "learning_rate": 3.6039999999999997e-07, + "loss": 0.0013, + "ppl": 0.01531982421875, + "reward": 0.9881244897842407, + "reward_std": 0.00039175304118543863, + "rewards/perpo_ocr_edit_distance_reward": 0.9881246089935303, "step": 1396, "temperature": 0.9 }, { - "advantages": -0.00014939905486244243, - "completion_length": 585.0, - "delta_ref_entropy_loss": 0.03759765625, - "delta_ref_ppl": -0.0211181640625, - "entropy_loss": -0.03607177734375, - "epoch": 0.5588, - "grad_norm": 0.46358890114268625, - "k1_kl": 0.0211181640625, - "k3_kl": 0.0102081298828125, - "kimi_kl": 0.019134521484375, - "learning_rate": 2.2059999999999998e-07, - "loss": 0.0006, - "ppl": 0.0179443359375, - "reward": 0.9914686679840088, - "reward_std": 0.00047430551785510033, - "rewards/perpo_ocr_edit_distance_reward": 0.9914688169956207, + "advantages": -1.8221992377220886e-06, + "completion_length": 1094.0, + "delta_ref_entropy_loss": 0.0576171875, + "delta_ref_ppl": -0.05908203125, + "entropy_loss": -0.0830078125, + "epoch": 0.2794, + "grad_norm": 1.9451064986600997, + "k1_kl": 0.05908203125, + "k3_kl": 0.04296875, + "kimi_kl": 0.1015625, + "learning_rate": 3.603e-07, + "loss": 0.0017, + "ppl": 0.04052734375, + "reward": 0.9039315581321716, + "reward_std": 0.04603467509150505, + "rewards/perpo_ocr_edit_distance_reward": 0.9039316773414612, "step": 1397, "temperature": 0.9 }, { - "advantages": -9.847539331531152e-06, - "completion_length": 640.0, - "delta_ref_entropy_loss": 0.02703857421875, - "delta_ref_ppl": -0.02313232421875, - "entropy_loss": -0.02008056640625, - "epoch": 0.5592, - "grad_norm": 0.2745150405110693, - "k1_kl": 0.023193359375, - "k3_kl": 0.01336669921875, - "kimi_kl": 0.036376953125, - "learning_rate": 2.2040000000000001e-07, - "loss": 0.0005, - "ppl": 0.0086822509765625, - "reward": 0.9982879161834717, - "reward_std": 0.000310431671096012, - "rewards/perpo_ocr_edit_distance_reward": 0.9982879459857941, + "advantages": -0.0005960464477539062, + "completion_length": 577.0, + "delta_ref_entropy_loss": 0.0556640625, + "delta_ref_ppl": -0.055419921875, + "entropy_loss": -0.0108642578125, + "epoch": 0.2796, + "grad_norm": 0.003623906705909418, + "k1_kl": 0.055419921875, + "k3_kl": 0.03369140625, + "kimi_kl": 0.10546875, + "learning_rate": 3.602e-07, + "loss": 0.0019, + "ppl": 0.001861572265625, + "reward": 0.9969488382339478, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9969489574432373, "step": 1398, "temperature": 0.9 }, { - "advantages": -7.987448771018535e-05, - "completion_length": 1511.5, - "delta_ref_entropy_loss": 0.0263671875, - "delta_ref_ppl": -0.0114288330078125, - "entropy_loss": -0.094970703125, - "epoch": 0.5596, - "grad_norm": 1.3804896021541466, - "k1_kl": 0.01129150390625, - "k3_kl": 0.0114288330078125, - "kimi_kl": 0.01158905029296875, - "learning_rate": 2.202e-07, - "loss": 0.0005, - "ppl": 0.061553955078125, - "reward": 0.7962624728679657, - "reward_std": 0.007119356421753764, - "rewards/perpo_ocr_edit_distance_reward": 0.7962625324726105, + "advantages": -6.35385513305664e-05, + "completion_length": 705.0, + "delta_ref_entropy_loss": 0.061767578125, + "delta_ref_ppl": -0.051513671875, + "entropy_loss": -0.0184326171875, + "epoch": 0.2798, + "grad_norm": 6.438185500018916, + "k1_kl": 0.0517578125, + "k3_kl": 0.0277099609375, + "kimi_kl": 0.06787109375, + "learning_rate": 3.6009999999999995e-07, + "loss": 0.0012, + "ppl": 0.0230712890625, + "reward": 0.9970369935035706, + "reward_std": 0.0008380438084714115, + "rewards/perpo_ocr_edit_distance_reward": 0.9970371127128601, "step": 1399, "temperature": 0.9 }, { - "advantages": -2.566405760262569e-05, - "completion_length": 744.5, - "delta_ref_entropy_loss": 0.08856201171875, - "delta_ref_ppl": -0.04681396484375, - "entropy_loss": -0.1243896484375, - "epoch": 0.56, - "grad_norm": 2.0694613889526137, - "k1_kl": 0.04656982421875, - "k3_kl": 0.02520751953125, - "kimi_kl": 0.0406494140625, - "learning_rate": 2.1999999999999998e-07, - "loss": 0.001, - "ppl": 0.07049560546875, - "reward": 0.8946853280067444, - "reward_std": 0.006157191935926676, - "rewards/perpo_ocr_edit_distance_reward": 0.8946853876113892, + "advantages": -0.0001304149627685547, + "completion_length": 769.0, + "delta_ref_entropy_loss": 0.05126953125, + "delta_ref_ppl": -0.07568359375, + "entropy_loss": -0.0269775390625, + "epoch": 0.28, + "grad_norm": 0.47005589156748717, + "k1_kl": 0.07568359375, + "k3_kl": 0.048583984375, + "kimi_kl": 0.1572265625, + "learning_rate": 3.6e-07, + "loss": 0.0021, + "ppl": 0.01275634765625, + "reward": 0.9929681420326233, + "reward_std": 0.00022647669538855553, + "rewards/perpo_ocr_edit_distance_reward": 0.9929682016372681, "step": 1400, "temperature": 0.9 }, { - "advantages": -0.00013837645019521005, - "completion_length": 521.5, - "delta_ref_entropy_loss": 0.0643310546875, - "delta_ref_ppl": -0.03118896484375, - "entropy_loss": -0.05810546875, - "epoch": 0.5604, - "grad_norm": 0.624831864679258, - "k1_kl": 0.03118896484375, - "k3_kl": 0.01318359375, - "kimi_kl": 0.022003173828125, - "learning_rate": 2.198e-07, - "loss": 0.0007, - "ppl": 0.030181884765625, - "reward": 0.9635811150074005, - "reward_std": 0.00077876950672362, - "rewards/perpo_ocr_edit_distance_reward": 0.9635812342166901, + "advantages": -2.5851386453723535e-05, + "completion_length": 778.0, + "delta_ref_entropy_loss": 0.0693359375, + "delta_ref_ppl": -0.08154296875, + "entropy_loss": -0.05908203125, + "epoch": 0.2802, + "grad_norm": 0.8855869164447557, + "k1_kl": 0.08154296875, + "k3_kl": 0.046630859375, + "kimi_kl": 0.1025390625, + "learning_rate": 3.599e-07, + "loss": 0.0019, + "ppl": 0.0301513671875, + "reward": 0.9914566278457642, + "reward_std": 0.001549293170683086, + "rewards/perpo_ocr_edit_distance_reward": 0.9914566278457642, "step": 1401, "temperature": 0.9 }, { - "advantages": -2.221124617562964e-05, - "completion_length": 254.0, - "delta_ref_entropy_loss": 0.0966796875, - "delta_ref_ppl": -0.2183837890625, - "entropy_loss": -0.3204345703125, - "epoch": 0.5608, - "grad_norm": 3.5525989216206657, - "k1_kl": 0.2183837890625, - "k3_kl": 0.15631103515625, - "kimi_kl": 0.53125, - "learning_rate": 2.1959999999999998e-07, - "loss": 0.0063, - "ppl": 0.1688232421875, - "reward": 0.7171434611082077, - "reward_std": 0.040538499131798744, - "rewards/perpo_ocr_edit_distance_reward": 0.7171435505151749, + "advantages": 1.5284334949683398e-05, + "completion_length": 851.0, + "delta_ref_entropy_loss": 0.083984375, + "delta_ref_ppl": -0.07080078125, + "entropy_loss": -0.045654296875, + "epoch": 0.2804, + "grad_norm": 0.6128749813192105, + "k1_kl": 0.0703125, + "k3_kl": 0.0341796875, + "kimi_kl": 0.0712890625, + "learning_rate": 3.598e-07, + "loss": 0.0014, + "ppl": 0.02197265625, + "reward": 0.9514301419258118, + "reward_std": 0.00045739489723928273, + "rewards/perpo_ocr_edit_distance_reward": 0.9514302015304565, "step": 1402, "temperature": 0.9 }, { - "advantages": -8.694615098647773e-05, - "completion_length": 493.5, - "delta_ref_entropy_loss": 0.036376953125, - "delta_ref_ppl": -0.038726806640625, - "entropy_loss": -0.0205078125, - "epoch": 0.5612, - "grad_norm": 1.2725633585890583, - "k1_kl": 0.0386962890625, - "k3_kl": 0.026153564453125, - "kimi_kl": 0.125, - "learning_rate": 2.194e-07, - "loss": 0.0011, - "ppl": 0.0080718994140625, - "reward": 0.9988646507263184, - "reward_std": 0.0004682705766754225, - "rewards/perpo_ocr_edit_distance_reward": 0.9988647401332855, + "advantages": -3.5881996154785156e-05, + "completion_length": 320.0, + "delta_ref_entropy_loss": 0.09912109375, + "delta_ref_ppl": -0.1005859375, + "entropy_loss": -0.07421875, + "epoch": 0.2806, + "grad_norm": 0.9078509237895368, + "k1_kl": 0.1005859375, + "k3_kl": 0.06201171875, + "kimi_kl": 0.1630859375, + "learning_rate": 3.597e-07, + "loss": 0.0025, + "ppl": 0.0242919921875, + "reward": 0.5427724123001099, + "reward_std": 0.0008489463361911476, + "rewards/perpo_ocr_edit_distance_reward": 0.5427724719047546, "step": 1403, "temperature": 0.9 }, { - "advantages": -3.065381974920456e-07, - "completion_length": 492.5, - "delta_ref_entropy_loss": 0.056640625, - "delta_ref_ppl": -0.04949951171875, - "entropy_loss": -0.0775146484375, - "epoch": 0.5616, - "grad_norm": 0.6717112889337223, - "k1_kl": 0.04949951171875, - "k3_kl": 0.03045654296875, - "kimi_kl": 0.09423828125, - "learning_rate": 2.192e-07, - "loss": 0.0012, - "ppl": 0.035400390625, - "reward": 0.9372269809246063, - "reward_std": 0.06729472428560257, - "rewards/perpo_ocr_edit_distance_reward": 0.9372270107269287, + "advantages": -9.205086098518223e-05, + "completion_length": 1006.0, + "delta_ref_entropy_loss": 0.0150146484375, + "delta_ref_ppl": -0.0244140625, + "entropy_loss": -0.01165771484375, + "epoch": 0.2808, + "grad_norm": 0.4260454474311375, + "k1_kl": 0.0242919921875, + "k3_kl": 0.01611328125, + "kimi_kl": 0.042724609375, + "learning_rate": 3.5959999999999996e-07, + "loss": 0.0007, + "ppl": 0.004150390625, + "reward": 0.9924233555793762, + "reward_std": 0.0011026199208572507, + "rewards/perpo_ocr_edit_distance_reward": 0.9924235343933105, "step": 1404, "temperature": 0.9 }, { - "advantages": -2.343314112707162e-05, - "completion_length": 436.0, - "delta_ref_entropy_loss": 0.04925537109375, - "delta_ref_ppl": -0.0576171875, - "entropy_loss": -0.05548095703125, - "epoch": 0.562, - "grad_norm": 1.6417575764513541, - "k1_kl": 0.057373046875, - "k3_kl": 0.03851318359375, - "kimi_kl": 0.12646484375, - "learning_rate": 2.19e-07, - "loss": 0.0016, - "ppl": 0.0250244140625, - "reward": 0.9201382696628571, - "reward_std": 0.055818492255639285, - "rewards/perpo_ocr_edit_distance_reward": 0.9201382994651794, + "advantages": -5.46659748579259e-06, + "completion_length": 1641.0, + "delta_ref_entropy_loss": 0.0235595703125, + "delta_ref_ppl": -0.0244140625, + "entropy_loss": -0.02685546875, + "epoch": 0.281, + "grad_norm": 0.3718287872953929, + "k1_kl": 0.0244140625, + "k3_kl": 0.01409912109375, + "kimi_kl": 0.036865234375, + "learning_rate": 3.5949999999999996e-07, + "loss": 0.0006, + "ppl": 0.012451171875, + "reward": 0.988423764705658, + "reward_std": 0.007672385312616825, + "rewards/perpo_ocr_edit_distance_reward": 0.9884238243103027, "step": 1405, "temperature": 0.9 }, { - "advantages": -6.726810397594818e-07, - "completion_length": 1099.5, - "delta_ref_entropy_loss": 0.07696533203125, - "delta_ref_ppl": -0.0484619140625, - "entropy_loss": -0.13616943359375, - "epoch": 0.5624, - "grad_norm": 1.3867202321598542, - "k1_kl": 0.04864501953125, - "k3_kl": 0.028350830078125, - "kimi_kl": 0.076416015625, - "learning_rate": 2.1879999999999997e-07, - "loss": 0.0011, - "ppl": 0.081787109375, - "reward": 0.8182291984558105, - "reward_std": 0.02365813683718443, - "rewards/perpo_ocr_edit_distance_reward": 0.8182292282581329, + "advantages": -0.00021028945047874004, + "completion_length": 759.0, + "delta_ref_entropy_loss": 0.04150390625, + "delta_ref_ppl": -0.050048828125, + "entropy_loss": -0.019287109375, + "epoch": 0.2812, + "grad_norm": 0.20484636365885925, + "k1_kl": 0.050048828125, + "k3_kl": 0.033203125, + "kimi_kl": 0.1298828125, + "learning_rate": 3.594e-07, + "loss": 0.0015, + "ppl": 0.00701904296875, + "reward": 0.9985402822494507, + "reward_std": 0.00030489382334053516, + "rewards/perpo_ocr_edit_distance_reward": 0.9985404014587402, "step": 1406, "temperature": 0.9 }, { - "advantages": -7.3015694397327024e-06, - "completion_length": 487.5, - "delta_ref_entropy_loss": 0.07818603515625, - "delta_ref_ppl": -0.056396484375, - "entropy_loss": -0.0784912109375, - "epoch": 0.5628, - "grad_norm": 1.91267766947463, - "k1_kl": 0.056396484375, - "k3_kl": 0.029266357421875, - "kimi_kl": 0.062255859375, - "learning_rate": 2.1859999999999999e-07, - "loss": 0.0012, - "ppl": 0.04541015625, - "reward": 0.9063586294651031, - "reward_std": 0.003333259839564562, - "rewards/perpo_ocr_edit_distance_reward": 0.9063586890697479, + "advantages": -0.0005960464477539062, + "completion_length": 464.0, + "delta_ref_entropy_loss": 0.05078125, + "delta_ref_ppl": -0.0751953125, + "entropy_loss": -0.0115966796875, + "epoch": 0.2814, + "grad_norm": 0.006432365474707447, + "k1_kl": 0.07470703125, + "k3_kl": 0.05224609375, + "kimi_kl": 0.2119140625, + "learning_rate": 3.593e-07, + "loss": 0.0027, + "ppl": 0.001983642578125, + "reward": 0.998073160648346, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9980732798576355, "step": 1407, "temperature": 0.9 }, { - "advantages": -1.106943500417401e-06, - "completion_length": 458.5, - "delta_ref_entropy_loss": 0.088134765625, - "delta_ref_ppl": -0.096923828125, - "entropy_loss": -0.2900390625, - "epoch": 0.5632, - "grad_norm": 3.286726313762289, - "k1_kl": 0.0966796875, - "k3_kl": 0.0899658203125, - "kimi_kl": 0.17041015625, - "learning_rate": 2.184e-07, - "loss": 0.0036, - "ppl": 0.1395263671875, - "reward": 0.6985311210155487, - "reward_std": 0.017349930480122566, - "rewards/perpo_ocr_edit_distance_reward": 0.6985311508178711, + "advantages": -7.159369852161035e-05, + "completion_length": 656.0, + "delta_ref_entropy_loss": 0.08642578125, + "delta_ref_ppl": -0.05615234375, + "entropy_loss": -0.04345703125, + "epoch": 0.2816, + "grad_norm": 0.4845507055345235, + "k1_kl": 0.05615234375, + "k3_kl": 0.02880859375, + "kimi_kl": 0.06396484375, + "learning_rate": 3.592e-07, + "loss": 0.0012, + "ppl": 0.019287109375, + "reward": 0.9964057803153992, + "reward_std": 0.0007327489438466728, + "rewards/perpo_ocr_edit_distance_reward": 0.9964057803153992, "step": 1408, "temperature": 0.9 }, { - "advantages": -0.00014281699259299785, - "completion_length": 1072.0, - "delta_ref_entropy_loss": 0.02593994140625, - "delta_ref_ppl": -0.0255126953125, - "entropy_loss": -0.034332275390625, - "epoch": 0.5636, - "grad_norm": 0.3731182884486702, - "k1_kl": 0.02557373046875, - "k3_kl": 0.018798828125, - "kimi_kl": 0.060302734375, - "learning_rate": 2.182e-07, - "loss": 0.0009, - "ppl": 0.018890380859375, - "reward": 0.9961920976638794, - "reward_std": 0.0005193160759517923, - "rewards/perpo_ocr_edit_distance_reward": 0.996192216873169, + "advantages": -3.2237599953077734e-05, + "completion_length": 472.0, + "delta_ref_entropy_loss": 0.042236328125, + "delta_ref_ppl": -0.062255859375, + "entropy_loss": -0.0111083984375, + "epoch": 0.2818, + "grad_norm": 0.5508463996791969, + "k1_kl": 0.062255859375, + "k3_kl": 0.04248046875, + "kimi_kl": 0.1376953125, + "learning_rate": 3.591e-07, + "loss": 0.0017, + "ppl": 0.0028076171875, + "reward": 0.9988706707954407, + "reward_std": 0.00042858312372118235, + "rewards/perpo_ocr_edit_distance_reward": 0.9988707304000854, "step": 1409, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 645.5, - "delta_ref_entropy_loss": 0.03277587890625, - "delta_ref_ppl": -0.033935546875, - "entropy_loss": -0.017974853515625, - "epoch": 0.564, - "grad_norm": 0.1771166178285621, - "k1_kl": 0.0340576171875, - "k3_kl": 0.02099609375, - "kimi_kl": 0.0738525390625, - "learning_rate": 2.18e-07, - "loss": 0.0011, - "ppl": 0.0080413818359375, - "reward": 0.9990241527557373, - "reward_std": 9.798895189305767e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9990242123603821, + "advantages": -5.818264980916865e-05, + "completion_length": 916.0, + "delta_ref_entropy_loss": 0.0771484375, + "delta_ref_ppl": -0.050537109375, + "entropy_loss": -0.04541015625, + "epoch": 0.282, + "grad_norm": 1.1446378279312963, + "k1_kl": 0.050537109375, + "k3_kl": 0.0283203125, + "kimi_kl": 0.052978515625, + "learning_rate": 3.5899999999999997e-07, + "loss": 0.0012, + "ppl": 0.020751953125, + "reward": 0.9780189394950867, + "reward_std": 0.0010703403968364, + "rewards/perpo_ocr_edit_distance_reward": 0.9780189990997314, "step": 1410, "temperature": 0.9 }, { - "advantages": -0.00031132357526075793, - "completion_length": 522.0, - "delta_ref_entropy_loss": 0.0394287109375, - "delta_ref_ppl": -0.02642822265625, - "entropy_loss": -0.0279541015625, - "epoch": 0.5644, - "grad_norm": 0.4060464146143822, - "k1_kl": 0.0263671875, - "k3_kl": 0.015350341796875, - "kimi_kl": 0.04827880859375, - "learning_rate": 2.1779999999999998e-07, - "loss": 0.0009, - "ppl": 0.01372528076171875, - "reward": 0.9828782975673676, - "reward_std": 0.0007500915671698749, - "rewards/perpo_ocr_edit_distance_reward": 0.9828783571720123, + "advantages": -4.087175966560608e-06, + "completion_length": 716.0, + "delta_ref_entropy_loss": 0.13671875, + "delta_ref_ppl": -0.08837890625, + "entropy_loss": -0.0732421875, + "epoch": 0.2822, + "grad_norm": 0.9344794441158618, + "k1_kl": 0.08837890625, + "k3_kl": 0.039306640625, + "kimi_kl": 0.0791015625, + "learning_rate": 3.5889999999999996e-07, + "loss": 0.0016, + "ppl": 0.031494140625, + "reward": 0.9630305767059326, + "reward_std": 0.004049140494316816, + "rewards/perpo_ocr_edit_distance_reward": 0.9630306363105774, "step": 1411, "temperature": 0.9 }, { - "advantages": -0.00033479928970336914, - "completion_length": 567.5, - "delta_ref_entropy_loss": 0.02557373046875, - "delta_ref_ppl": -0.01837158203125, - "entropy_loss": -0.0318603515625, - "epoch": 0.5648, - "grad_norm": 0.40609773393522824, - "k1_kl": 0.01837158203125, - "k3_kl": 0.010345458984375, - "kimi_kl": 0.01702880859375, - "learning_rate": 2.176e-07, - "loss": 0.0007, - "ppl": 0.015533447265625, - "reward": 0.960745632648468, - "reward_std": 0.00029721891041845083, - "rewards/perpo_ocr_edit_distance_reward": 0.9607456624507904, + "advantages": -4.9727306759450585e-05, + "completion_length": 160.0, + "delta_ref_entropy_loss": 0.035400390625, + "delta_ref_ppl": -0.07763671875, + "entropy_loss": -0.0284423828125, + "epoch": 0.2824, + "grad_norm": 1.467943128233875, + "k1_kl": 0.078125, + "k3_kl": 0.048828125, + "kimi_kl": 0.130859375, + "learning_rate": 3.588e-07, + "loss": 0.002, + "ppl": 0.0093994140625, + "reward": 0.9967408776283264, + "reward_std": 0.0009276660857722163, + "rewards/perpo_ocr_edit_distance_reward": 0.9967409372329712, "step": 1412, "temperature": 0.9 }, { - "advantages": 1.8562590184956207e-06, - "completion_length": 531.5, - "delta_ref_entropy_loss": 0.1173095703125, - "delta_ref_ppl": -0.0927734375, - "entropy_loss": -0.08984375, - "epoch": 0.5652, - "grad_norm": 0.8651100020520995, - "k1_kl": 0.0927734375, - "k3_kl": 0.0496826171875, - "kimi_kl": 0.117919921875, - "learning_rate": 2.174e-07, - "loss": 0.002, - "ppl": 0.043701171875, - "reward": 0.9404933750629425, - "reward_std": 0.006157462950795889, - "rewards/perpo_ocr_edit_distance_reward": 0.9404934048652649, + "advantages": 0.0, + "completion_length": 757.0, + "delta_ref_entropy_loss": 0.09033203125, + "delta_ref_ppl": -0.0751953125, + "entropy_loss": -0.05859375, + "epoch": 0.2826, + "grad_norm": 0.7296218820985265, + "k1_kl": 0.0751953125, + "k3_kl": 0.039306640625, + "kimi_kl": 0.09326171875, + "learning_rate": 3.587e-07, + "loss": 0.0016, + "ppl": 0.023681640625, + "reward": 0.9573567509651184, + "reward_std": 0.008328991942107677, + "rewards/perpo_ocr_edit_distance_reward": 0.9573567509651184, "step": 1413, "temperature": 0.9 }, { - "advantages": -1.249994630825313e-05, - "completion_length": 1171.0, - "delta_ref_entropy_loss": 0.0411376953125, - "delta_ref_ppl": -0.0338134765625, - "entropy_loss": -0.1334228515625, - "epoch": 0.5656, - "grad_norm": 10.960780606543734, - "k1_kl": 0.03387451171875, - "k3_kl": 0.078125, - "kimi_kl": 0.0611572265625, - "learning_rate": 2.1719999999999999e-07, - "loss": 0.0031, - "ppl": 0.0843505859375, - "reward": 0.9717152714729309, - "reward_std": 0.007264438580023125, - "rewards/perpo_ocr_edit_distance_reward": 0.9717153012752533, + "advantages": -9.962491276382934e-07, + "completion_length": 912.0, + "delta_ref_entropy_loss": 0.10791015625, + "delta_ref_ppl": -0.09716796875, + "entropy_loss": -0.33984375, + "epoch": 0.2828, + "grad_norm": 1.9252623261672455, + "k1_kl": 0.09716796875, + "k3_kl": 0.0732421875, + "kimi_kl": 0.1396484375, + "learning_rate": 3.5859999999999994e-07, + "loss": 0.0029, + "ppl": 0.1787109375, + "reward": 0.644025981426239, + "reward_std": 0.025589119642972946, + "rewards/perpo_ocr_edit_distance_reward": 0.6440260410308838, "step": 1414, "temperature": 0.9 }, { - "advantages": -4.86075915659967e-05, - "completion_length": 1466.5, - "delta_ref_entropy_loss": 0.03448486328125, - "delta_ref_ppl": -0.035888671875, - "entropy_loss": -0.0484619140625, - "epoch": 0.566, - "grad_norm": 1.4989799867726923, - "k1_kl": 0.035888671875, - "k3_kl": 0.021728515625, - "kimi_kl": 0.05560302734375, - "learning_rate": 2.17e-07, - "loss": 0.0009, - "ppl": 0.0272216796875, - "reward": 0.9163593947887421, - "reward_std": 0.007204267778433859, - "rewards/perpo_ocr_edit_distance_reward": 0.9163595139980316, + "advantages": -0.0005960464477539062, + "completion_length": 412.0, + "delta_ref_entropy_loss": 0.08251953125, + "delta_ref_ppl": -0.058349609375, + "entropy_loss": -0.0191650390625, + "epoch": 0.283, + "grad_norm": 0.008433255766230129, + "k1_kl": 0.057861328125, + "k3_kl": 0.0279541015625, + "kimi_kl": 0.05859375, + "learning_rate": 3.585e-07, + "loss": 0.0017, + "ppl": 0.0037384033203125, + "reward": 0.9808142185211182, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9808142781257629, "step": 1415, "temperature": 0.9 }, { - "advantages": -2.86698349185599e-05, - "completion_length": 220.5, - "delta_ref_entropy_loss": 0.0242919921875, - "delta_ref_ppl": -0.033599853515625, - "entropy_loss": -0.03594970703125, - "epoch": 0.5664, - "grad_norm": 1.364463153611568, - "k1_kl": 0.0335693359375, - "k3_kl": 0.0231781005859375, - "kimi_kl": 0.082000732421875, - "learning_rate": 2.1679999999999998e-07, - "loss": 0.001, - "ppl": 0.016357421875, - "reward": 0.700263500213623, - "reward_std": 0.05475187345291488, - "rewards/perpo_ocr_edit_distance_reward": 0.7002635151147842, + "advantages": -0.0005960464477539062, + "completion_length": 281.0, + "delta_ref_entropy_loss": 0.06396484375, + "delta_ref_ppl": -0.10107421875, + "entropy_loss": -0.0159912109375, + "epoch": 0.2832, + "grad_norm": 0.010741741737719598, + "k1_kl": 0.1015625, + "k3_kl": 0.0703125, + "kimi_kl": 0.251953125, + "learning_rate": 3.584e-07, + "loss": 0.0034, + "ppl": 0.0029449462890625, + "reward": 0.9625849723815918, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9625850915908813, "step": 1416, "temperature": 0.9 }, { - "advantages": -8.693763447809033e-06, - "completion_length": 473.5, - "delta_ref_entropy_loss": 0.0477294921875, - "delta_ref_ppl": -0.039306640625, - "entropy_loss": -0.02728271484375, - "epoch": 0.5668, - "grad_norm": 0.2652795121947788, - "k1_kl": 0.0394287109375, - "k3_kl": 0.023681640625, - "kimi_kl": 0.0657958984375, - "learning_rate": 2.1659999999999997e-07, - "loss": 0.001, - "ppl": 0.013916015625, - "reward": 0.9984745681285858, - "reward_std": 0.0009280673111788929, - "rewards/perpo_ocr_edit_distance_reward": 0.9984746277332306, + "advantages": -2.404621773166582e-05, + "completion_length": 530.0, + "delta_ref_entropy_loss": 0.0908203125, + "delta_ref_ppl": -0.07861328125, + "entropy_loss": -0.042724609375, + "epoch": 0.2834, + "grad_norm": 0.7373733815636572, + "k1_kl": 0.07861328125, + "k3_kl": 0.044189453125, + "kimi_kl": 0.1220703125, + "learning_rate": 3.5829999999999997e-07, + "loss": 0.0018, + "ppl": 0.0184326171875, + "reward": 0.983991801738739, + "reward_std": 0.0009623761870898306, + "rewards/perpo_ocr_edit_distance_reward": 0.9839919209480286, "step": 1417, "temperature": 0.9 }, { - "advantages": 4.989759872842114e-06, - "completion_length": 648.5, - "delta_ref_entropy_loss": 0.085693359375, - "delta_ref_ppl": -0.0494384765625, - "entropy_loss": -0.0670166015625, - "epoch": 0.5672, - "grad_norm": 0.7763071948472176, - "k1_kl": 0.0494384765625, - "k3_kl": 0.028076171875, - "kimi_kl": 0.0701904296875, - "learning_rate": 2.164e-07, - "loss": 0.0011, - "ppl": 0.03460693359375, - "reward": 0.9532105326652527, - "reward_std": 0.0013076157629257068, - "rewards/perpo_ocr_edit_distance_reward": 0.9532105922698975, + "advantages": 3.0738967325305566e-05, + "completion_length": 498.0, + "delta_ref_entropy_loss": 0.055419921875, + "delta_ref_ppl": -0.08154296875, + "entropy_loss": -0.0498046875, + "epoch": 0.2836, + "grad_norm": 0.7285463415072362, + "k1_kl": 0.08154296875, + "k3_kl": 0.048828125, + "kimi_kl": 0.1279296875, + "learning_rate": 3.582e-07, + "loss": 0.0019, + "ppl": 0.020263671875, + "reward": 0.9806644320487976, + "reward_std": 0.00045389062142930925, + "rewards/perpo_ocr_edit_distance_reward": 0.9806644320487976, "step": 1418, "temperature": 0.9 }, { - "advantages": -1.8596650733115894e-05, - "completion_length": 425.0, - "delta_ref_entropy_loss": 0.06787109375, - "delta_ref_ppl": -0.04779052734375, - "entropy_loss": -0.0423583984375, - "epoch": 0.5676, - "grad_norm": 1.1214940198497907, - "k1_kl": 0.04779052734375, - "k3_kl": 0.0259552001953125, - "kimi_kl": 0.045806884765625, - "learning_rate": 2.162e-07, - "loss": 0.0011, - "ppl": 0.0195159912109375, - "reward": 0.906193882226944, - "reward_std": 0.006577315274626017, - "rewards/perpo_ocr_edit_distance_reward": 0.9061939716339111, + "advantages": -1.021793991640152e-06, + "completion_length": 1778.0, + "delta_ref_entropy_loss": 0.09326171875, + "delta_ref_ppl": -0.09033203125, + "entropy_loss": -0.1689453125, + "epoch": 0.2838, + "grad_norm": 8.915407102781394, + "k1_kl": 0.09033203125, + "k3_kl": 0.057373046875, + "kimi_kl": 0.1484375, + "learning_rate": 3.5809999999999996e-07, + "loss": 0.0023, + "ppl": 0.09375, + "reward": 0.8871284127235413, + "reward_std": 0.04069362208247185, + "rewards/perpo_ocr_edit_distance_reward": 0.887128472328186, "step": 1419, "temperature": 0.9 }, { - "advantages": -1.3198171927797375e-05, - "completion_length": 769.5, - "delta_ref_entropy_loss": 0.04931640625, - "delta_ref_ppl": -0.0435791015625, - "entropy_loss": -0.05615234375, - "epoch": 0.568, - "grad_norm": 1.1302444214885126, - "k1_kl": 0.04345703125, - "k3_kl": 0.02789306640625, - "kimi_kl": 0.088134765625, - "learning_rate": 2.1599999999999998e-07, - "loss": 0.0011, - "ppl": 0.0272216796875, - "reward": 0.8866372406482697, - "reward_std": 0.005914152367040515, - "rewards/perpo_ocr_edit_distance_reward": 0.8866373002529144, + "advantages": -2.2547586922883056e-05, + "completion_length": 433.0, + "delta_ref_entropy_loss": 0.05078125, + "delta_ref_ppl": -0.0830078125, + "entropy_loss": -0.038330078125, + "epoch": 0.284, + "grad_norm": 1.051409176971662, + "k1_kl": 0.0830078125, + "k3_kl": 0.0576171875, + "kimi_kl": 0.169921875, + "learning_rate": 3.5799999999999995e-07, + "loss": 0.0023, + "ppl": 0.01287841796875, + "reward": 0.9974666833877563, + "reward_std": 0.003675838466733694, + "rewards/perpo_ocr_edit_distance_reward": 0.9974668025970459, "step": 1420, "temperature": 0.9 }, { - "advantages": -4.375406945200666e-05, - "completion_length": 873.0, - "delta_ref_entropy_loss": 0.048583984375, - "delta_ref_ppl": -0.02545166015625, - "entropy_loss": -0.0894775390625, - "epoch": 0.5684, - "grad_norm": 2.9148603720068658, - "k1_kl": 0.02557373046875, - "k3_kl": 0.01348876953125, - "kimi_kl": 0.0252685546875, - "learning_rate": 2.158e-07, - "loss": 0.0006, - "ppl": 0.053619384765625, - "reward": 0.985209196805954, - "reward_std": 0.0015966735081747174, - "rewards/perpo_ocr_edit_distance_reward": 0.9852092564105988, + "advantages": -9.528228474664502e-06, + "completion_length": 33.0, + "delta_ref_entropy_loss": 0.130859375, + "delta_ref_ppl": -0.59375, + "entropy_loss": -0.1279296875, + "epoch": 0.2842, + "grad_norm": 10.406604114754698, + "k1_kl": 0.59375, + "k3_kl": 0.466796875, + "kimi_kl": 1.546875, + "learning_rate": 3.579e-07, + "loss": 0.0187, + "ppl": 0.0634765625, + "reward": 0.9538239240646362, + "reward_std": 0.007947439327836037, + "rewards/perpo_ocr_edit_distance_reward": 0.9538240432739258, "step": 1421, "temperature": 0.9 }, { - "advantages": -0.00013323341408977285, - "completion_length": 564.5, - "delta_ref_entropy_loss": 0.031005859375, - "delta_ref_ppl": -0.02960205078125, - "entropy_loss": -0.020751953125, - "epoch": 0.5688, - "grad_norm": 0.6548033218486397, - "k1_kl": 0.02960205078125, - "k3_kl": 0.01910400390625, - "kimi_kl": 0.0562744140625, - "learning_rate": 2.156e-07, - "loss": 0.0009, - "ppl": 0.009552001953125, - "reward": 0.9969851672649384, - "reward_std": 0.0003979091561632231, - "rewards/perpo_ocr_edit_distance_reward": 0.9969852864742279, + "advantages": -3.7508352761506103e-06, + "completion_length": 800.0, + "delta_ref_entropy_loss": 0.099609375, + "delta_ref_ppl": -0.07568359375, + "entropy_loss": -0.1044921875, + "epoch": 0.2844, + "grad_norm": 1.206969949281437, + "k1_kl": 0.07568359375, + "k3_kl": 0.037109375, + "kimi_kl": 0.083984375, + "learning_rate": 3.578e-07, + "loss": 0.0015, + "ppl": 0.050537109375, + "reward": 0.964931309223175, + "reward_std": 0.0021732335444539785, + "rewards/perpo_ocr_edit_distance_reward": 0.9649313688278198, "step": 1422, "temperature": 0.9 }, { - "advantages": -3.81895506507135e-06, - "completion_length": 594.5, - "delta_ref_entropy_loss": 0.0616455078125, - "delta_ref_ppl": -0.0491943359375, - "entropy_loss": -0.04095458984375, - "epoch": 0.5692, - "grad_norm": 0.7778005232662177, - "k1_kl": 0.0491943359375, - "k3_kl": 0.0299072265625, - "kimi_kl": 0.100830078125, - "learning_rate": 2.154e-07, - "loss": 0.0012, - "ppl": 0.018218994140625, - "reward": 0.9917417168617249, - "reward_std": 0.0021817947272211313, - "rewards/perpo_ocr_edit_distance_reward": 0.9917417168617249, + "advantages": -8.33613557915669e-06, + "completion_length": 1176.0, + "delta_ref_entropy_loss": 0.083984375, + "delta_ref_ppl": -0.0615234375, + "entropy_loss": -0.10546875, + "epoch": 0.2846, + "grad_norm": 1.0463177881412848, + "k1_kl": 0.0615234375, + "k3_kl": 0.0361328125, + "kimi_kl": 0.0849609375, + "learning_rate": 3.577e-07, + "loss": 0.0015, + "ppl": 0.0546875, + "reward": 0.9576942324638367, + "reward_std": 0.006019219756126404, + "rewards/perpo_ocr_edit_distance_reward": 0.9576942920684814, "step": 1423, "temperature": 0.9 }, { - "advantages": -1.2900148902872388e-05, - "completion_length": 982.0, - "delta_ref_entropy_loss": 0.03680419921875, - "delta_ref_ppl": -0.02972412109375, - "entropy_loss": -0.039306640625, - "epoch": 0.5696, - "grad_norm": 0.858206933362142, - "k1_kl": 0.02972412109375, - "k3_kl": 0.020233154296875, - "kimi_kl": 0.0416259765625, - "learning_rate": 2.152e-07, + "advantages": 4.257474817137563e-09, + "completion_length": 1175.0, + "delta_ref_entropy_loss": 0.04296875, + "delta_ref_ppl": -0.04150390625, + "entropy_loss": -0.05419921875, + "epoch": 0.2848, + "grad_norm": 0.6120409090076976, + "k1_kl": 0.04150390625, + "k3_kl": 0.020751953125, + "kimi_kl": 0.04052734375, + "learning_rate": 3.5759999999999997e-07, "loss": 0.0008, - "ppl": 0.0202484130859375, - "reward": 0.9587700963020325, - "reward_std": 0.01290956325829029, - "rewards/perpo_ocr_edit_distance_reward": 0.9587701857089996, + "ppl": 0.0233154296875, + "reward": 0.9907118678092957, + "reward_std": 0.0003528278321027756, + "rewards/perpo_ocr_edit_distance_reward": 0.9907119274139404, "step": 1424, "temperature": 0.9 }, { - "advantages": -1.307044772147492e-05, - "completion_length": 699.0, - "delta_ref_entropy_loss": 0.09429931640625, - "delta_ref_ppl": -0.05938720703125, - "entropy_loss": -0.105712890625, - "epoch": 0.57, - "grad_norm": 0.9312293931056252, - "k1_kl": 0.05908203125, - "k3_kl": 0.03070068359375, - "kimi_kl": 0.057281494140625, - "learning_rate": 2.1499999999999998e-07, - "loss": 0.0012, - "ppl": 0.05517578125, - "reward": 0.9366699159145355, - "reward_std": 0.003004382480867207, - "rewards/perpo_ocr_edit_distance_reward": 0.9366700053215027, + "advantages": -1.3964516938358429e-06, + "completion_length": 536.0, + "delta_ref_entropy_loss": 0.061279296875, + "delta_ref_ppl": -0.0830078125, + "entropy_loss": -0.0673828125, + "epoch": 0.285, + "grad_norm": 0.9034748426660998, + "k1_kl": 0.08349609375, + "k3_kl": 0.052734375, + "kimi_kl": 0.1728515625, + "learning_rate": 3.5749999999999997e-07, + "loss": 0.0021, + "ppl": 0.0277099609375, + "reward": 0.7740065455436707, + "reward_std": 0.029920831322669983, + "rewards/perpo_ocr_edit_distance_reward": 0.7740066647529602, "step": 1425, "temperature": 0.9 }, { - "advantages": -1.1682511313892974e-05, - "completion_length": 568.5, - "delta_ref_entropy_loss": 0.05987548828125, - "delta_ref_ppl": -0.090728759765625, - "entropy_loss": -0.052001953125, - "epoch": 0.5704, - "grad_norm": 1.8097403995554968, - "k1_kl": 0.090240478515625, - "k3_kl": 0.062408447265625, - "kimi_kl": 0.23748779296875, - "learning_rate": 2.148e-07, - "loss": 0.0025, - "ppl": 0.02349853515625, - "reward": 0.972982794046402, - "reward_std": 0.02571909106336534, - "rewards/perpo_ocr_edit_distance_reward": 0.9729828834533691, + "advantages": -7.91890329310263e-07, + "completion_length": 1147.0, + "delta_ref_entropy_loss": 0.0693359375, + "delta_ref_ppl": -0.048095703125, + "entropy_loss": -0.053955078125, + "epoch": 0.2852, + "grad_norm": 0.6995863083200676, + "k1_kl": 0.048095703125, + "k3_kl": 0.0274658203125, + "kimi_kl": 0.06396484375, + "learning_rate": 3.5739999999999996e-07, + "loss": 0.0011, + "ppl": 0.022705078125, + "reward": 0.9661962389945984, + "reward_std": 0.010788671672344208, + "rewards/perpo_ocr_edit_distance_reward": 0.9661962985992432, "step": 1426, "temperature": 0.9 }, { - "advantages": -3.702300205077336e-05, - "completion_length": 393.5, - "delta_ref_entropy_loss": 0.0570068359375, - "delta_ref_ppl": -0.04400634765625, - "entropy_loss": -0.1080322265625, - "epoch": 0.5708, - "grad_norm": 1.364071329875263, - "k1_kl": 0.04376220703125, - "k3_kl": 0.02532958984375, - "kimi_kl": 0.061920166015625, - "learning_rate": 2.146e-07, + "advantages": -2.9802324206684716e-05, + "completion_length": 589.0, + "delta_ref_entropy_loss": 0.058837890625, + "delta_ref_ppl": -0.0478515625, + "entropy_loss": -0.02685546875, + "epoch": 0.2854, + "grad_norm": 0.5425707633966327, + "k1_kl": 0.0478515625, + "k3_kl": 0.0244140625, + "kimi_kl": 0.05419921875, + "learning_rate": 3.573e-07, "loss": 0.001, - "ppl": 0.05572509765625, - "reward": 0.922848105430603, - "reward_std": 0.007529011927545071, - "rewards/perpo_ocr_edit_distance_reward": 0.9228481948375702, + "ppl": 0.0091552734375, + "reward": 0.9937116503715515, + "reward_std": 0.00047160257236100733, + "rewards/perpo_ocr_edit_distance_reward": 0.9937117099761963, "step": 1427, "temperature": 0.9 }, { - "advantages": -5.4095474297355395e-05, - "completion_length": 525.5, - "delta_ref_entropy_loss": 0.0867919921875, - "delta_ref_ppl": -0.044921875, - "entropy_loss": -0.1407470703125, - "epoch": 0.5712, - "grad_norm": 1.2915836952173, - "k1_kl": 0.044921875, - "k3_kl": 0.020172119140625, - "kimi_kl": 0.0313720703125, - "learning_rate": 2.144e-07, - "loss": 0.0009, - "ppl": 0.07855224609375, - "reward": 0.9121772348880768, - "reward_std": 0.0015840618580114096, - "rewards/perpo_ocr_edit_distance_reward": 0.9121773540973663, + "advantages": -0.0002245945652248338, + "completion_length": 532.0, + "delta_ref_entropy_loss": 0.05078125, + "delta_ref_ppl": -0.05810546875, + "entropy_loss": -0.01141357421875, + "epoch": 0.2856, + "grad_norm": 0.4426308664624213, + "k1_kl": 0.057861328125, + "k3_kl": 0.037353515625, + "kimi_kl": 0.10546875, + "learning_rate": 3.572e-07, + "loss": 0.0017, + "ppl": 0.0028839111328125, + "reward": 0.9986308813095093, + "reward_std": 0.0002032963529927656, + "rewards/perpo_ocr_edit_distance_reward": 0.9986310005187988, "step": 1428, "temperature": 0.9 }, { - "advantages": -3.583942202567414e-05, - "completion_length": 344.0, - "delta_ref_entropy_loss": 0.07794189453125, - "delta_ref_ppl": -0.056427001953125, - "entropy_loss": -0.062744140625, - "epoch": 0.5716, - "grad_norm": 2.2788138605263732, - "k1_kl": 0.05645751953125, - "k3_kl": 0.033111572265625, - "kimi_kl": 0.08978271484375, - "learning_rate": 2.142e-07, - "loss": 0.0014, - "ppl": 0.0308837890625, - "reward": 0.9670697748661041, - "reward_std": 0.002625510824145749, - "rewards/perpo_ocr_edit_distance_reward": 0.9670698344707489, + "advantages": -2.4097307687043212e-05, + "completion_length": 1175.0, + "delta_ref_entropy_loss": 0.0361328125, + "delta_ref_ppl": -0.046875, + "entropy_loss": -0.0220947265625, + "epoch": 0.2858, + "grad_norm": 0.7146575953374624, + "k1_kl": 0.046875, + "k3_kl": 0.0274658203125, + "kimi_kl": 0.08251953125, + "learning_rate": 3.5709999999999994e-07, + "loss": 0.0011, + "ppl": 0.00732421875, + "reward": 0.9986644983291626, + "reward_std": 0.0006075142882764339, + "rewards/perpo_ocr_edit_distance_reward": 0.9986646175384521, "step": 1429, "temperature": 0.9 }, { - "advantages": -1.8353974155616015e-05, - "completion_length": 705.0, - "delta_ref_entropy_loss": 0.033203125, - "delta_ref_ppl": -0.02734375, - "entropy_loss": -0.03118896484375, - "epoch": 0.572, - "grad_norm": 0.49338892775631565, - "k1_kl": 0.02734375, - "k3_kl": 0.0159912109375, - "kimi_kl": 0.0308837890625, - "learning_rate": 2.1399999999999998e-07, - "loss": 0.0007, - "ppl": 0.017364501953125, - "reward": 0.9963855743408203, - "reward_std": 0.00037929913378320634, - "rewards/perpo_ocr_edit_distance_reward": 0.9963855743408203, + "advantages": -6.365776062011719e-05, + "completion_length": 534.0, + "delta_ref_entropy_loss": 0.07080078125, + "delta_ref_ppl": -0.05126953125, + "entropy_loss": -0.01531982421875, + "epoch": 0.286, + "grad_norm": 0.3289653525464005, + "k1_kl": 0.05126953125, + "k3_kl": 0.0255126953125, + "kimi_kl": 0.054443359375, + "learning_rate": 3.57e-07, + "loss": 0.0011, + "ppl": 0.00341796875, + "reward": 0.9953311085700989, + "reward_std": 0.0003013009554706514, + "rewards/perpo_ocr_edit_distance_reward": 0.9953311085700989, "step": 1430, "temperature": 0.9 }, { - "advantages": -1.5710081862074787e-06, - "completion_length": 1053.0, - "delta_ref_entropy_loss": 0.03558349609375, - "delta_ref_ppl": -0.02294921875, - "entropy_loss": -0.07177734375, - "epoch": 0.5724, - "grad_norm": 1.060197244551964, - "k1_kl": 0.022918701171875, - "k3_kl": 0.015228271484375, - "kimi_kl": 0.024383544921875, - "learning_rate": 2.1379999999999997e-07, - "loss": 0.0006, - "ppl": 0.04052734375, - "reward": 0.942877858877182, - "reward_std": 0.0031712341005913913, - "rewards/perpo_ocr_edit_distance_reward": 0.9428778886795044, + "advantages": -0.00022244454885367304, + "completion_length": 629.0, + "delta_ref_entropy_loss": 0.04736328125, + "delta_ref_ppl": -0.0517578125, + "entropy_loss": -0.01904296875, + "epoch": 0.2862, + "grad_norm": 0.3100752686421649, + "k1_kl": 0.051513671875, + "k3_kl": 0.033203125, + "kimi_kl": 0.1044921875, + "learning_rate": 3.5689999999999997e-07, + "loss": 0.0015, + "ppl": 0.00933837890625, + "reward": 0.7405467629432678, + "reward_std": 0.00016794000111985952, + "rewards/perpo_ocr_edit_distance_reward": 0.7405468821525574, "step": 1431, "temperature": 0.9 }, { - "advantages": -0.00012642997216971708, - "completion_length": 512.5, - "delta_ref_entropy_loss": 0.02978515625, - "delta_ref_ppl": -0.021209716796875, - "entropy_loss": -0.02874755859375, - "epoch": 0.5728, - "grad_norm": 0.9923983717583831, - "k1_kl": 0.021331787109375, - "k3_kl": 0.01293182373046875, - "kimi_kl": 0.0279083251953125, - "learning_rate": 2.136e-07, - "loss": 0.0006, - "ppl": 0.01483154296875, - "reward": 0.9968642294406891, - "reward_std": 0.001328470796579495, - "rewards/perpo_ocr_edit_distance_reward": 0.9968642592430115, + "advantages": -0.00011107751925010234, + "completion_length": 746.0, + "delta_ref_entropy_loss": 0.054443359375, + "delta_ref_ppl": -0.0732421875, + "entropy_loss": -0.0242919921875, + "epoch": 0.2864, + "grad_norm": 0.32712002341438157, + "k1_kl": 0.07373046875, + "k3_kl": 0.0458984375, + "kimi_kl": 0.1337890625, + "learning_rate": 3.5679999999999997e-07, + "loss": 0.0019, + "ppl": 0.00738525390625, + "reward": 0.20363782346248627, + "reward_std": 0.00018748726870398968, + "rewards/perpo_ocr_edit_distance_reward": 0.20363785326480865, "step": 1432, "temperature": 0.9 }, { - "advantages": -2.2224019176064758e-06, - "completion_length": 409.5, - "delta_ref_entropy_loss": 0.0614013671875, - "delta_ref_ppl": -0.0736083984375, - "entropy_loss": -0.02679443359375, - "epoch": 0.5732, - "grad_norm": 0.7871201282995639, - "k1_kl": 0.0733642578125, - "k3_kl": 0.0496826171875, - "kimi_kl": 0.15966796875, - "learning_rate": 2.134e-07, - "loss": 0.002, - "ppl": 0.0145416259765625, - "reward": 0.9980436861515045, - "reward_std": 0.0009087711805477738, - "rewards/perpo_ocr_edit_distance_reward": 0.9980436861515045, + "advantages": -2.087865686917212e-05, + "completion_length": 442.0, + "delta_ref_entropy_loss": 0.1669921875, + "delta_ref_ppl": -0.12451171875, + "entropy_loss": -0.1708984375, + "epoch": 0.2866, + "grad_norm": 2.2299629047590583, + "k1_kl": 0.12451171875, + "k3_kl": 0.07421875, + "kimi_kl": 0.1865234375, + "learning_rate": 3.567e-07, + "loss": 0.003, + "ppl": 0.083984375, + "reward": 0.9711635708808899, + "reward_std": 0.0023450027219951153, + "rewards/perpo_ocr_edit_distance_reward": 0.9711636304855347, "step": 1433, "temperature": 0.9 }, { - "advantages": -7.542116327385884e-05, - "completion_length": 538.5, - "delta_ref_entropy_loss": 0.03662109375, - "delta_ref_ppl": -0.0362548828125, - "entropy_loss": -0.03338623046875, - "epoch": 0.5736, - "grad_norm": 0.5928933470682998, - "k1_kl": 0.0362548828125, - "k3_kl": 0.022216796875, - "kimi_kl": 0.07177734375, - "learning_rate": 2.132e-07, - "loss": 0.001, - "ppl": 0.018280029296875, - "reward": 0.9424203932285309, - "reward_std": 0.0010695790988393128, - "rewards/perpo_ocr_edit_distance_reward": 0.9424204230308533, + "advantages": -5.1106726459693164e-05, + "completion_length": 355.0, + "delta_ref_entropy_loss": 0.06201171875, + "delta_ref_ppl": -0.10986328125, + "entropy_loss": -0.0257568359375, + "epoch": 0.2868, + "grad_norm": 0.7857360320670348, + "k1_kl": 0.10986328125, + "k3_kl": 0.07568359375, + "kimi_kl": 0.26953125, + "learning_rate": 3.5659999999999995e-07, + "loss": 0.0031, + "ppl": 0.01043701171875, + "reward": 0.9846764802932739, + "reward_std": 0.0007326801423914731, + "rewards/perpo_ocr_edit_distance_reward": 0.9846765398979187, "step": 1434, "temperature": 0.9 }, { - "advantages": -0.0003625069366535172, - "completion_length": 881.0, - "delta_ref_entropy_loss": 0.013458251953125, - "delta_ref_ppl": -0.0213623046875, - "entropy_loss": -0.02374267578125, - "epoch": 0.574, - "grad_norm": 0.19513904715644068, - "k1_kl": 0.021392822265625, - "k3_kl": 0.013763427734375, - "kimi_kl": 0.03265380859375, - "learning_rate": 2.13e-07, - "loss": 0.0009, - "ppl": 0.011383056640625, - "reward": 0.9986604452133179, - "reward_std": 0.00024709341232664883, - "rewards/perpo_ocr_edit_distance_reward": 0.998660534620285, + "advantages": -0.0005960464477539062, + "completion_length": 340.0, + "delta_ref_entropy_loss": 0.06591796875, + "delta_ref_ppl": -0.0810546875, + "entropy_loss": -0.0164794921875, + "epoch": 0.287, + "grad_norm": 0.011170943407603407, + "k1_kl": 0.0810546875, + "k3_kl": 0.05224609375, + "kimi_kl": 0.203125, + "learning_rate": 3.5649999999999994e-07, + "loss": 0.0027, + "ppl": 0.0037689208984375, + "reward": 0.9965337514877319, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9965338706970215, "step": 1435, "temperature": 0.9 }, { - "advantages": -8.883647024049424e-05, - "completion_length": 583.0, - "delta_ref_entropy_loss": 0.03076171875, - "delta_ref_ppl": -0.02337646484375, - "entropy_loss": -0.0225830078125, - "epoch": 0.5744, - "grad_norm": 0.4460626376572443, - "k1_kl": 0.02337646484375, - "k3_kl": 0.01434326171875, - "kimi_kl": 0.03363037109375, - "learning_rate": 2.1279999999999997e-07, - "loss": 0.0007, - "ppl": 0.01177978515625, - "reward": 0.9924111068248749, - "reward_std": 0.0005868246662430465, - "rewards/perpo_ocr_edit_distance_reward": 0.992411196231842, + "advantages": -3.341691990499385e-05, + "completion_length": 737.0, + "delta_ref_entropy_loss": 0.038330078125, + "delta_ref_ppl": -0.043212890625, + "entropy_loss": -0.040771484375, + "epoch": 0.2872, + "grad_norm": 0.7514792819132092, + "k1_kl": 0.04296875, + "k3_kl": 0.025390625, + "kimi_kl": 0.07275390625, + "learning_rate": 3.564e-07, + "loss": 0.001, + "ppl": 0.0172119140625, + "reward": 0.9958573579788208, + "reward_std": 0.0006642534863203764, + "rewards/perpo_ocr_edit_distance_reward": 0.9958574175834656, "step": 1436, "temperature": 0.9 }, { - "advantages": -9.23148233722415e-05, - "completion_length": 843.0, - "delta_ref_entropy_loss": 0.023681640625, - "delta_ref_ppl": -0.01934814453125, - "entropy_loss": -0.028045654296875, - "epoch": 0.5748, - "grad_norm": 0.43765384362755677, - "k1_kl": 0.01934814453125, - "k3_kl": 0.012481689453125, - "kimi_kl": 0.03143310546875, - "learning_rate": 2.126e-07, - "loss": 0.0006, - "ppl": 0.013397216796875, - "reward": 0.9991702437400818, - "reward_std": 0.0004015758167952299, - "rewards/perpo_ocr_edit_distance_reward": 0.9991702735424042, + "advantages": -3.759350147447549e-05, + "completion_length": 255.0, + "delta_ref_entropy_loss": 0.052001953125, + "delta_ref_ppl": -0.1103515625, + "entropy_loss": -0.035888671875, + "epoch": 0.2874, + "grad_norm": 1.9276525292593938, + "k1_kl": 0.109375, + "k3_kl": 0.07666015625, + "kimi_kl": 0.2197265625, + "learning_rate": 3.563e-07, + "loss": 0.0031, + "ppl": 0.00946044921875, + "reward": 0.9215046167373657, + "reward_std": 0.0014853825559839606, + "rewards/perpo_ocr_edit_distance_reward": 0.9215047359466553, "step": 1437, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 624.5, - "delta_ref_entropy_loss": 0.0377197265625, - "delta_ref_ppl": -0.037353515625, - "entropy_loss": -0.017425537109375, - "epoch": 0.5752, - "grad_norm": 0.016528091547044158, - "k1_kl": 0.0374755859375, - "k3_kl": 0.02484130859375, - "kimi_kl": 0.073974609375, - "learning_rate": 2.124e-07, + "advantages": -2.537454975026776e-06, + "completion_length": 1800.0, + "delta_ref_entropy_loss": 0.056640625, + "delta_ref_ppl": -0.045654296875, + "entropy_loss": -0.1015625, + "epoch": 0.2876, + "grad_norm": 4.080455250031751, + "k1_kl": 0.0458984375, + "k3_kl": 0.03369140625, + "kimi_kl": 0.055419921875, + "learning_rate": 3.562e-07, "loss": 0.0013, - "ppl": 0.005645751953125, - "reward": 0.9997659027576447, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.999765932559967, + "ppl": 0.055419921875, + "reward": 0.9151288270950317, + "reward_std": 0.036669231951236725, + "rewards/perpo_ocr_edit_distance_reward": 0.9151288866996765, "step": 1438, "temperature": 0.9 }, { - "advantages": -4.865442224399885e-05, - "completion_length": 449.5, - "delta_ref_entropy_loss": 0.03955078125, - "delta_ref_ppl": -0.04046630859375, - "entropy_loss": -0.01995849609375, - "epoch": 0.5756, - "grad_norm": 0.4115982691723905, - "k1_kl": 0.04046630859375, - "k3_kl": 0.022979736328125, - "kimi_kl": 0.067474365234375, - "learning_rate": 2.1219999999999998e-07, - "loss": 0.001, - "ppl": 0.0075531005859375, - "reward": 0.9622677266597748, - "reward_std": 0.0078326647344511, - "rewards/perpo_ocr_edit_distance_reward": 0.9622677564620972, + "advantages": -0.00012101446191081777, + "completion_length": 835.0, + "delta_ref_entropy_loss": 0.05322265625, + "delta_ref_ppl": -0.04541015625, + "entropy_loss": -0.0234375, + "epoch": 0.2878, + "grad_norm": 0.2338131492872523, + "k1_kl": 0.045166015625, + "k3_kl": 0.0233154296875, + "kimi_kl": 0.057373046875, + "learning_rate": 3.561e-07, + "loss": 0.0011, + "ppl": 0.007080078125, + "reward": 0.9933913946151733, + "reward_std": 0.0003221603692509234, + "rewards/perpo_ocr_edit_distance_reward": 0.9933913946151733, "step": 1439, "temperature": 0.9 }, { - "advantages": -0.00017297269005212002, - "completion_length": 515.0, - "delta_ref_entropy_loss": 0.04266357421875, - "delta_ref_ppl": -0.06201171875, - "entropy_loss": -0.046875, - "epoch": 0.576, - "grad_norm": 1.2661325108270134, - "k1_kl": 0.06201171875, - "k3_kl": 0.04345703125, - "kimi_kl": 0.14306640625, - "learning_rate": 2.12e-07, - "loss": 0.0019, - "ppl": 0.0250244140625, - "reward": 0.9989842772483826, - "reward_std": 0.0006307813164312392, - "rewards/perpo_ocr_edit_distance_reward": 0.9989843964576721, + "advantages": -6.450925866374746e-05, + "completion_length": 1114.0, + "delta_ref_entropy_loss": 0.02783203125, + "delta_ref_ppl": -0.029296875, + "entropy_loss": -0.033447265625, + "epoch": 0.288, + "grad_norm": 0.6464970906208982, + "k1_kl": 0.029296875, + "k3_kl": 0.015869140625, + "kimi_kl": 0.0380859375, + "learning_rate": 3.5599999999999996e-07, + "loss": 0.0007, + "ppl": 0.0137939453125, + "reward": 0.9943077564239502, + "reward_std": 0.00082375667989254, + "rewards/perpo_ocr_edit_distance_reward": 0.994307816028595, "step": 1440, "temperature": 0.9 }, { - "advantages": -2.767358637356665e-06, - "completion_length": 143.0, - "delta_ref_entropy_loss": 0.1087646484375, - "delta_ref_ppl": -0.11572265625, - "entropy_loss": -0.1380615234375, - "epoch": 0.5764, - "grad_norm": 1.8949358761420918, - "k1_kl": 0.115478515625, - "k3_kl": 0.075439453125, - "kimi_kl": 0.29443359375, - "learning_rate": 2.1179999999999998e-07, - "loss": 0.003, - "ppl": 0.075836181640625, - "reward": 0.8904331028461456, - "reward_std": 0.01215263083577156, - "rewards/perpo_ocr_edit_distance_reward": 0.8904331624507904, - "step": 1441, - "temperature": 0.9 - }, - { - "advantages": -7.663454852036011e-06, - "completion_length": 127.5, - "delta_ref_entropy_loss": 0.16650390625, - "delta_ref_ppl": -0.28125, - "entropy_loss": -0.1904296875, - "epoch": 0.5768, - "grad_norm": 19.748005331534735, - "k1_kl": 0.28173828125, - "k3_kl": 0.379150390625, - "kimi_kl": 1.2509765625, - "learning_rate": 2.116e-07, - "loss": 0.0151, - "ppl": 0.1268310546875, - "reward": 0.6376240849494934, - "reward_std": 0.07871092483401299, - "rewards/perpo_ocr_edit_distance_reward": 0.6376241594552994, + "advantages": -2.384185791015625e-07, + "completion_length": 733.0, + "delta_ref_entropy_loss": 0.046142578125, + "delta_ref_ppl": -0.04736328125, + "entropy_loss": -0.185546875, + "epoch": 0.2882, + "grad_norm": 1.8316099686737617, + "k1_kl": 0.047119140625, + "k3_kl": 0.0311279296875, + "kimi_kl": 0.06298828125, + "learning_rate": 3.559e-07, + "loss": 0.0012, + "ppl": 0.08642578125, + "reward": 0.7948904037475586, + "reward_std": 0.2881690561771393, + "rewards/perpo_ocr_edit_distance_reward": 0.7948904633522034, + "step": 1441, + "temperature": 0.9 + }, + { + "advantages": -5.3082196245668456e-05, + "completion_length": 533.0, + "delta_ref_entropy_loss": 0.1318359375, + "delta_ref_ppl": -0.1259765625, + "entropy_loss": -0.115234375, + "epoch": 0.2884, + "grad_norm": 1.1497682562243505, + "k1_kl": 0.126953125, + "k3_kl": 0.07421875, + "kimi_kl": 0.2265625, + "learning_rate": 3.558e-07, + "loss": 0.003, + "ppl": 0.056884765625, + "reward": 0.9653126001358032, + "reward_std": 0.0013436307199299335, + "rewards/perpo_ocr_edit_distance_reward": 0.9653127193450928, "step": 1442, "temperature": 0.9 }, { - "advantages": -3.645675769803347e-05, - "completion_length": 618.5, - "delta_ref_entropy_loss": 0.0579833984375, - "delta_ref_ppl": -0.0531005859375, - "entropy_loss": -0.0697021484375, - "epoch": 0.5772, - "grad_norm": 1.2059600107745452, - "k1_kl": 0.05322265625, - "k3_kl": 0.0364990234375, - "kimi_kl": 0.089599609375, - "learning_rate": 2.114e-07, - "loss": 0.0015, - "ppl": 0.03656005859375, - "reward": 0.9877653121948242, - "reward_std": 0.001749550981912762, - "rewards/perpo_ocr_edit_distance_reward": 0.9877654016017914, + "advantages": 5.10896995820076e-07, + "completion_length": 315.0, + "delta_ref_entropy_loss": 0.0947265625, + "delta_ref_ppl": -0.0908203125, + "entropy_loss": -0.080078125, + "epoch": 0.2886, + "grad_norm": 1.9095886418182901, + "k1_kl": 0.09033203125, + "k3_kl": 0.051513671875, + "kimi_kl": 0.1259765625, + "learning_rate": 3.557e-07, + "loss": 0.0021, + "ppl": 0.031982421875, + "reward": 0.897787868976593, + "reward_std": 0.03375518321990967, + "rewards/perpo_ocr_edit_distance_reward": 0.8977879285812378, "step": 1443, "temperature": 0.9 }, { - "advantages": -6.575244333362207e-05, - "completion_length": 496.0, - "delta_ref_entropy_loss": 0.0263671875, - "delta_ref_ppl": -0.03387451171875, - "entropy_loss": -0.065673828125, - "epoch": 0.5776, - "grad_norm": 0.8712799994402531, - "k1_kl": 0.0338134765625, - "k3_kl": 0.024169921875, - "kimi_kl": 0.070068359375, - "learning_rate": 2.1119999999999999e-07, - "loss": 0.001, - "ppl": 0.032958984375, - "reward": 0.885697215795517, - "reward_std": 0.030040197205380537, - "rewards/perpo_ocr_edit_distance_reward": 0.8856972754001617, + "advantages": -3.2356808787881164e-06, + "completion_length": 548.0, + "delta_ref_entropy_loss": 0.09765625, + "delta_ref_ppl": -0.07421875, + "entropy_loss": -0.09326171875, + "epoch": 0.2888, + "grad_norm": 1.4262116809687526, + "k1_kl": 0.07373046875, + "k3_kl": 0.0361328125, + "kimi_kl": 0.0693359375, + "learning_rate": 3.5560000000000003e-07, + "loss": 0.0014, + "ppl": 0.047119140625, + "reward": 0.9043885469436646, + "reward_std": 0.01566138118505478, + "rewards/perpo_ocr_edit_distance_reward": 0.9043886065483093, "step": 1444, "temperature": 0.9 }, { - "advantages": -4.0105412608681945e-06, - "completion_length": 710.5, - "delta_ref_entropy_loss": 0.0791015625, - "delta_ref_ppl": -0.0440673828125, - "entropy_loss": -0.105224609375, - "epoch": 0.578, - "grad_norm": 0.9899572309966209, - "k1_kl": 0.0440673828125, - "k3_kl": 0.0233154296875, - "kimi_kl": 0.0504150390625, - "learning_rate": 2.1099999999999997e-07, - "loss": 0.0009, - "ppl": 0.055419921875, - "reward": 0.8382419347763062, - "reward_std": 0.012292447965592146, - "rewards/perpo_ocr_edit_distance_reward": 0.8382420241832733, + "advantages": -1.3283321322887787e-06, + "completion_length": 913.0, + "delta_ref_entropy_loss": 0.09228515625, + "delta_ref_ppl": -0.09521484375, + "entropy_loss": -0.126953125, + "epoch": 0.289, + "grad_norm": 2.8726055402017323, + "k1_kl": 0.09521484375, + "k3_kl": 0.056884765625, + "kimi_kl": 0.134765625, + "learning_rate": 3.555e-07, + "loss": 0.0023, + "ppl": 0.06494140625, + "reward": 0.7902413010597229, + "reward_std": 0.025422297418117523, + "rewards/perpo_ocr_edit_distance_reward": 0.7902413010597229, "step": 1445, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 399.5, - "delta_ref_entropy_loss": 0.03668212890625, - "delta_ref_ppl": -0.044921875, - "entropy_loss": -0.03057861328125, - "epoch": 0.5784, - "grad_norm": 0.38558237441994764, - "k1_kl": 0.0450439453125, - "k3_kl": 0.02978515625, - "kimi_kl": 0.0872802734375, - "learning_rate": 2.1079999999999998e-07, - "loss": 0.0015, - "ppl": 0.013397216796875, - "reward": 0.930225670337677, - "reward_std": 0.0016227468149736524, - "rewards/perpo_ocr_edit_distance_reward": 0.9302257597446442, + "advantages": 1.7029899268550253e-08, + "completion_length": 488.0, + "delta_ref_entropy_loss": 0.06298828125, + "delta_ref_ppl": -0.078125, + "entropy_loss": -0.0228271484375, + "epoch": 0.2892, + "grad_norm": 0.4005368235464786, + "k1_kl": 0.078125, + "k3_kl": 0.049560546875, + "kimi_kl": 0.181640625, + "learning_rate": 3.5539999999999997e-07, + "loss": 0.002, + "ppl": 0.00775146484375, + "reward": 0.9977620840072632, + "reward_std": 0.0004652218194678426, + "rewards/perpo_ocr_edit_distance_reward": 0.9977620244026184, "step": 1446, "temperature": 0.9 }, { - "advantages": -4.547834487311775e-05, - "completion_length": 907.5, - "delta_ref_entropy_loss": 0.03985595703125, - "delta_ref_ppl": -0.0333251953125, - "entropy_loss": -0.03826904296875, - "epoch": 0.5788, - "grad_norm": 0.5788253657908018, - "k1_kl": 0.033203125, - "k3_kl": 0.019805908203125, - "kimi_kl": 0.0472412109375, - "learning_rate": 2.106e-07, - "loss": 0.0008, - "ppl": 0.019256591796875, - "reward": 0.9960710108280182, - "reward_std": 0.001203529245685786, - "rewards/perpo_ocr_edit_distance_reward": 0.996071070432663, + "advantages": 0.0, + "completion_length": 683.0, + "delta_ref_entropy_loss": 0.05859375, + "delta_ref_ppl": -0.058349609375, + "entropy_loss": -0.0206298828125, + "epoch": 0.2894, + "grad_norm": 0.21101453360936928, + "k1_kl": 0.05859375, + "k3_kl": 0.03564453125, + "kimi_kl": 0.1279296875, + "learning_rate": 3.553e-07, + "loss": 0.0014, + "ppl": 0.0059814453125, + "reward": 0.9981740117073059, + "reward_std": 0.0002594362013041973, + "rewards/perpo_ocr_edit_distance_reward": 0.9981740117073059, "step": 1447, "temperature": 0.9 }, { - "advantages": -0.0002979210444777891, - "completion_length": 1295.5, - "delta_ref_entropy_loss": 0.018280029296875, - "delta_ref_ppl": -0.0224609375, - "entropy_loss": -0.04815673828125, - "epoch": 0.5792, - "grad_norm": 1.481406900578744, - "k1_kl": 0.0224609375, - "k3_kl": 0.0147705078125, - "kimi_kl": 0.04132080078125, - "learning_rate": 2.104e-07, - "loss": 0.0009, - "ppl": 0.025665283203125, - "reward": 0.9491125643253326, - "reward_std": 0.04546816274523735, - "rewards/perpo_ocr_edit_distance_reward": 0.949112594127655, + "advantages": -1.881803837022744e-05, + "completion_length": 589.0, + "delta_ref_entropy_loss": 0.0654296875, + "delta_ref_ppl": -0.0625, + "entropy_loss": -0.036376953125, + "epoch": 0.2896, + "grad_norm": 0.7876616337552204, + "k1_kl": 0.0625, + "k3_kl": 0.03466796875, + "kimi_kl": 0.1005859375, + "learning_rate": 3.552e-07, + "loss": 0.0014, + "ppl": 0.0150146484375, + "reward": 0.9698400497436523, + "reward_std": 0.0008053151541389525, + "rewards/perpo_ocr_edit_distance_reward": 0.9698401689529419, "step": 1448, "temperature": 0.9 }, { - "advantages": -1.3283321322887787e-06, - "completion_length": 473.0, - "delta_ref_entropy_loss": 0.1026611328125, - "delta_ref_ppl": -0.07666015625, - "entropy_loss": -0.1898193359375, - "epoch": 0.5796, - "grad_norm": 2.164753068189586, - "k1_kl": 0.076904296875, - "k3_kl": 0.046142578125, - "kimi_kl": 0.102783203125, - "learning_rate": 2.102e-07, - "loss": 0.0018, - "ppl": 0.108184814453125, - "reward": 0.607901468873024, - "reward_std": 0.06760654551908374, - "rewards/perpo_ocr_edit_distance_reward": 0.6079014837741852, + "advantages": -2.997262299686554e-06, + "completion_length": 83.0, + "delta_ref_entropy_loss": 0.1806640625, + "delta_ref_ppl": -0.439453125, + "entropy_loss": -0.095703125, + "epoch": 0.2898, + "grad_norm": 2.9190268919455984, + "k1_kl": 0.439453125, + "k3_kl": 0.333984375, + "kimi_kl": 1.40625, + "learning_rate": 3.551e-07, + "loss": 0.0134, + "ppl": 0.0322265625, + "reward": 0.9428570866584778, + "reward_std": 0.002748829545453191, + "rewards/perpo_ocr_edit_distance_reward": 0.9428571462631226, "step": 1449, "temperature": 0.9 }, { - "advantages": -2.226659425730304e-06, - "completion_length": 855.5, - "delta_ref_entropy_loss": 0.09161376953125, - "delta_ref_ppl": -0.05218505859375, - "entropy_loss": -0.1287841796875, - "epoch": 0.58, - "grad_norm": 1.4309654321893057, - "k1_kl": 0.05218505859375, - "k3_kl": 0.0258941650390625, - "kimi_kl": 0.047515869140625, - "learning_rate": 2.0999999999999997e-07, - "loss": 0.001, - "ppl": 0.07476806640625, - "reward": 0.8857944309711456, - "reward_std": 0.005962765397271141, - "rewards/perpo_ocr_edit_distance_reward": 0.885794460773468, + "advantages": -0.00016339337162207812, + "completion_length": 447.0, + "delta_ref_entropy_loss": 0.06787109375, + "delta_ref_ppl": -0.08544921875, + "entropy_loss": -0.026123046875, + "epoch": 0.29, + "grad_norm": 0.5190875627878492, + "k1_kl": 0.08544921875, + "k3_kl": 0.053466796875, + "kimi_kl": 0.2041015625, + "learning_rate": 3.55e-07, + "loss": 0.0023, + "ppl": 0.0084228515625, + "reward": 0.9931244850158691, + "reward_std": 0.0003690310986712575, + "rewards/perpo_ocr_edit_distance_reward": 0.9931246042251587, "step": 1450, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 580.0, - "delta_ref_entropy_loss": 0.026031494140625, - "delta_ref_ppl": -0.022705078125, - "entropy_loss": -0.0088653564453125, - "epoch": 0.5804, - "grad_norm": 0.016449055548806123, - "k1_kl": 0.022705078125, - "k3_kl": 0.015533447265625, - "kimi_kl": 0.0640869140625, - "learning_rate": 2.0979999999999999e-07, - "loss": 0.0009, - "ppl": 0.00270843505859375, - "reward": 0.996333658695221, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9963337481021881, + "advantages": -5.6726596085354686e-05, + "completion_length": 833.0, + "delta_ref_entropy_loss": 0.0380859375, + "delta_ref_ppl": -0.044921875, + "entropy_loss": -0.0308837890625, + "epoch": 0.2902, + "grad_norm": 0.324076225471425, + "k1_kl": 0.044921875, + "k3_kl": 0.0272216796875, + "kimi_kl": 0.07373046875, + "learning_rate": 3.549e-07, + "loss": 0.0011, + "ppl": 0.01190185546875, + "reward": 0.9960635304450989, + "reward_std": 0.0006505926721729338, + "rewards/perpo_ocr_edit_distance_reward": 0.9960635304450989, "step": 1451, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 277.0, - "delta_ref_entropy_loss": 0.0357666015625, - "delta_ref_ppl": -0.07220458984375, - "entropy_loss": -0.02490234375, - "epoch": 0.5808, - "grad_norm": 0.04661325380997372, - "k1_kl": 0.072265625, - "k3_kl": 0.05511474609375, - "kimi_kl": 0.22662353515625, - "learning_rate": 2.096e-07, - "loss": 0.0022, - "ppl": 0.012664794921875, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": -8.174351933121216e-07, + "completion_length": 300.0, + "delta_ref_entropy_loss": 0.2041015625, + "delta_ref_ppl": -0.158203125, + "entropy_loss": -0.10107421875, + "epoch": 0.2904, + "grad_norm": 2.5163213266228213, + "k1_kl": 0.158203125, + "k3_kl": 0.0849609375, + "kimi_kl": 0.2060546875, + "learning_rate": 3.548e-07, + "loss": 0.0034, + "ppl": 0.0390625, + "reward": 0.9221757054328918, + "reward_std": 0.010591440834105015, + "rewards/perpo_ocr_edit_distance_reward": 0.9221757650375366, "step": 1452, "temperature": 0.9 }, { - "advantages": 3.320830273878528e-06, - "completion_length": 390.0, - "delta_ref_entropy_loss": 0.09600830078125, - "delta_ref_ppl": -0.07025146484375, - "entropy_loss": -0.084991455078125, - "epoch": 0.5812, - "grad_norm": 1.1801988077847811, - "k1_kl": 0.07049560546875, - "k3_kl": 0.0380859375, - "kimi_kl": 0.0816650390625, - "learning_rate": 2.0939999999999998e-07, - "loss": 0.0015, - "ppl": 0.04389190673828125, - "reward": 0.8389454185962677, - "reward_std": 0.0018709630239754915, - "rewards/perpo_ocr_edit_distance_reward": 0.8389454185962677, + "advantages": -2.0631723600672558e-05, + "completion_length": 350.0, + "delta_ref_entropy_loss": 0.0849609375, + "delta_ref_ppl": -0.10009765625, + "entropy_loss": -0.060791015625, + "epoch": 0.2906, + "grad_norm": 0.7676231812769397, + "k1_kl": 0.10009765625, + "k3_kl": 0.06884765625, + "kimi_kl": 0.2041015625, + "learning_rate": 3.547e-07, + "loss": 0.0028, + "ppl": 0.0208740234375, + "reward": 0.8763432502746582, + "reward_std": 0.001550349872559309, + "rewards/perpo_ocr_edit_distance_reward": 0.8763432502746582, "step": 1453, "temperature": 0.9 }, { - "advantages": -6.922654165464337e-05, - "completion_length": 418.0, - "delta_ref_entropy_loss": 0.0411376953125, - "delta_ref_ppl": -0.03741455078125, - "entropy_loss": -0.026580810546875, - "epoch": 0.5816, - "grad_norm": 0.5241219436500749, - "k1_kl": 0.03753662109375, - "k3_kl": 0.0228271484375, - "kimi_kl": 0.07562255859375, - "learning_rate": 2.092e-07, - "loss": 0.001, - "ppl": 0.0105133056640625, - "reward": 0.9936156570911407, - "reward_std": 0.0007335145564866252, - "rewards/perpo_ocr_edit_distance_reward": 0.9936157166957855, + "advantages": 4.283019734430127e-06, + "completion_length": 777.0, + "delta_ref_entropy_loss": 0.0712890625, + "delta_ref_ppl": -0.06787109375, + "entropy_loss": -0.0400390625, + "epoch": 0.2908, + "grad_norm": 0.519726781221934, + "k1_kl": 0.06787109375, + "k3_kl": 0.04248046875, + "kimi_kl": 0.15234375, + "learning_rate": 3.546e-07, + "loss": 0.0017, + "ppl": 0.01495361328125, + "reward": 0.9836829900741577, + "reward_std": 0.007848933339118958, + "rewards/perpo_ocr_edit_distance_reward": 0.9836829304695129, "step": 1454, "temperature": 0.9 }, { - "advantages": -7.95636897237273e-05, - "completion_length": 502.0, - "delta_ref_entropy_loss": 0.0218505859375, - "delta_ref_ppl": -0.017578125, - "entropy_loss": -0.03173828125, - "epoch": 0.582, - "grad_norm": 0.805019500951677, - "k1_kl": 0.01751708984375, - "k3_kl": 0.010467529296875, - "kimi_kl": 0.018310546875, - "learning_rate": 2.0899999999999998e-07, - "loss": 0.0005, - "ppl": 0.0172119140625, - "reward": 0.9989149570465088, - "reward_std": 0.0010057963081635535, - "rewards/perpo_ocr_edit_distance_reward": 0.998915046453476, + "advantages": 0.0, + "completion_length": 1085.0, + "delta_ref_entropy_loss": 0.036376953125, + "delta_ref_ppl": -0.037353515625, + "entropy_loss": -0.04833984375, + "epoch": 0.291, + "grad_norm": 1.6379915535608234, + "k1_kl": 0.03759765625, + "k3_kl": 0.020263671875, + "kimi_kl": 0.04833984375, + "learning_rate": 3.5449999999999995e-07, + "loss": 0.0008, + "ppl": 0.0211181640625, + "reward": 0.9909176230430603, + "reward_std": 0.004872059915214777, + "rewards/perpo_ocr_edit_distance_reward": 0.9909176826477051, "step": 1455, "temperature": 0.9 }, { - "advantages": -6.863049520688946e-06, - "completion_length": 400.0, - "delta_ref_entropy_loss": 0.06744384765625, - "delta_ref_ppl": -0.2603759765625, - "entropy_loss": -0.135498046875, - "epoch": 0.5824, - "grad_norm": 7.06314581629398, - "k1_kl": 0.26043701171875, - "k3_kl": 0.210205078125, - "kimi_kl": 0.8719482421875, - "learning_rate": 2.0880000000000002e-07, - "loss": 0.0084, - "ppl": 0.10821533203125, - "reward": 0.6683570295572281, - "reward_std": 0.005888100014999509, - "rewards/perpo_ocr_edit_distance_reward": 0.6683570444583893, + "advantages": 0.0, + "completion_length": 737.0, + "delta_ref_entropy_loss": 0.09326171875, + "delta_ref_ppl": -0.09814453125, + "entropy_loss": -0.158203125, + "epoch": 0.2912, + "grad_norm": 1.8381794672287664, + "k1_kl": 0.0986328125, + "k3_kl": 0.0693359375, + "kimi_kl": 0.14453125, + "learning_rate": 3.544e-07, + "loss": 0.0028, + "ppl": 0.09228515625, + "reward": 0.800165593624115, + "reward_std": 0.019333835691213608, + "rewards/perpo_ocr_edit_distance_reward": 0.800165593624115, "step": 1456, "temperature": 0.9 }, { - "advantages": -3.610764315453707e-05, - "completion_length": 315.0, - "delta_ref_entropy_loss": 0.02703857421875, - "delta_ref_ppl": -0.05078125, - "entropy_loss": -0.02899169921875, - "epoch": 0.5828, - "grad_norm": 2.015232542541809, - "k1_kl": 0.051025390625, - "k3_kl": 0.037872314453125, - "kimi_kl": 0.1248779296875, - "learning_rate": 2.086e-07, - "loss": 0.0015, - "ppl": 0.01397705078125, - "reward": 0.998571366071701, - "reward_std": 0.0027178689488209784, - "rewards/perpo_ocr_edit_distance_reward": 0.9985714554786682, + "advantages": -8.360403444385156e-05, + "completion_length": 490.0, + "delta_ref_entropy_loss": 0.07373046875, + "delta_ref_ppl": -0.10205078125, + "entropy_loss": -0.026611328125, + "epoch": 0.2914, + "grad_norm": 0.3990492444986717, + "k1_kl": 0.10205078125, + "k3_kl": 0.06787109375, + "kimi_kl": 0.244140625, + "learning_rate": 3.543e-07, + "loss": 0.0028, + "ppl": 0.0106201171875, + "reward": 0.9899846315383911, + "reward_std": 0.0004091129812877625, + "rewards/perpo_ocr_edit_distance_reward": 0.9899846911430359, "step": 1457, "temperature": 0.9 }, { - "advantages": -4.349010396254016e-06, - "completion_length": 1163.5, - "delta_ref_entropy_loss": 0.014892578125, - "delta_ref_ppl": -0.0077056884765625, - "entropy_loss": -0.018096923828125, - "epoch": 0.5832, - "grad_norm": 0.24797998897085963, - "k1_kl": 0.0076904296875, - "k3_kl": 0.003948211669921875, - "kimi_kl": 0.00617218017578125, - "learning_rate": 2.0839999999999999e-07, - "loss": 0.0002, - "ppl": 0.00927734375, - "reward": 0.988118052482605, - "reward_std": 0.0009302125545218587, - "rewards/perpo_ocr_edit_distance_reward": 0.988118052482605, + "advantages": -8.514949740856537e-07, + "completion_length": 136.0, + "delta_ref_entropy_loss": 0.1708984375, + "delta_ref_ppl": -0.345703125, + "entropy_loss": -0.220703125, + "epoch": 0.2916, + "grad_norm": 4.285474838416877, + "k1_kl": 0.345703125, + "k3_kl": 0.24609375, + "kimi_kl": 0.87890625, + "learning_rate": 3.542e-07, + "loss": 0.0098, + "ppl": 0.095703125, + "reward": 0.3920265734195709, + "reward_std": 0.030654488131403923, + "rewards/perpo_ocr_edit_distance_reward": 0.3920266032218933, "step": 1458, "temperature": 0.9 }, { - "advantages": -1.8988337160408264e-06, - "completion_length": 639.0, - "delta_ref_entropy_loss": 0.0128021240234375, - "delta_ref_ppl": -0.01055908203125, - "entropy_loss": -0.02716064453125, - "epoch": 0.5836, - "grad_norm": 1.2040042975574607, - "k1_kl": 0.010528564453125, - "k3_kl": 0.00545501708984375, - "kimi_kl": 0.0102081298828125, - "learning_rate": 2.082e-07, - "loss": 0.0002, - "ppl": 0.0126495361328125, - "reward": 0.9583449363708496, - "reward_std": 0.006687099114060402, - "rewards/perpo_ocr_edit_distance_reward": 0.958344966173172, + "advantages": -2.5408609872101806e-05, + "completion_length": 359.0, + "delta_ref_entropy_loss": 0.1845703125, + "delta_ref_ppl": -0.1474609375, + "entropy_loss": -0.1533203125, + "epoch": 0.2918, + "grad_norm": 2.494365383263415, + "k1_kl": 0.1474609375, + "k3_kl": 0.0849609375, + "kimi_kl": 0.1787109375, + "learning_rate": 3.5410000000000003e-07, + "loss": 0.0034, + "ppl": 0.0751953125, + "reward": 0.37023305892944336, + "reward_std": 0.0015763555420562625, + "rewards/perpo_ocr_edit_distance_reward": 0.37023311853408813, "step": 1459, "temperature": 0.9 }, { - "advantages": -6.974595362407854e-05, - "completion_length": 683.5, - "delta_ref_entropy_loss": 0.0400390625, - "delta_ref_ppl": -0.055908203125, - "entropy_loss": -0.04541015625, - "epoch": 0.584, - "grad_norm": 0.609760109539336, - "k1_kl": 0.055908203125, - "k3_kl": 0.03948974609375, - "kimi_kl": 0.1217041015625, - "learning_rate": 2.0799999999999998e-07, - "loss": 0.0016, - "ppl": 0.0238037109375, - "reward": 0.9581279456615448, - "reward_std": 0.0006820491107646376, - "rewards/perpo_ocr_edit_distance_reward": 0.9581279754638672, + "advantages": 4.3545453081605956e-05, + "completion_length": 1191.0, + "delta_ref_entropy_loss": 0.061767578125, + "delta_ref_ppl": -0.0400390625, + "entropy_loss": -0.06591796875, + "epoch": 0.292, + "grad_norm": 19.382770930553075, + "k1_kl": 0.039794921875, + "k3_kl": 0.287109375, + "kimi_kl": 0.06982421875, + "learning_rate": 3.5399999999999997e-07, + "loss": 0.0114, + "ppl": 0.04736328125, + "reward": 0.8041768074035645, + "reward_std": 0.0010732585797086358, + "rewards/perpo_ocr_edit_distance_reward": 0.8041767477989197, "step": 1460, "temperature": 0.9 }, { - "advantages": -5.9817521105287597e-05, - "completion_length": 479.0, - "delta_ref_entropy_loss": 0.02001953125, - "delta_ref_ppl": -0.01220703125, - "entropy_loss": -0.01678466796875, - "epoch": 0.5844, - "grad_norm": 0.24469608289332603, - "k1_kl": 0.01220703125, - "k3_kl": 0.0056304931640625, - "kimi_kl": 0.0089263916015625, - "learning_rate": 2.078e-07, - "loss": 0.0003, - "ppl": 0.0062255859375, - "reward": 0.9876639246940613, - "reward_std": 0.00012795746442861855, - "rewards/perpo_ocr_edit_distance_reward": 0.987663984298706, + "advantages": 5.10896995820076e-07, + "completion_length": 1024.0, + "delta_ref_entropy_loss": 0.126953125, + "delta_ref_ppl": -0.10888671875, + "entropy_loss": -0.478515625, + "epoch": 0.2922, + "grad_norm": 5.658023618540854, + "k1_kl": 0.10888671875, + "k3_kl": 0.08154296875, + "kimi_kl": 0.1669921875, + "learning_rate": 3.5389999999999996e-07, + "loss": 0.0033, + "ppl": 0.2734375, + "reward": 0.6241503357887268, + "reward_std": 0.06223520636558533, + "rewards/perpo_ocr_edit_distance_reward": 0.624150276184082, "step": 1461, "temperature": 0.9 }, { - "advantages": -1.1522855857037939e-05, - "completion_length": 435.0, - "delta_ref_entropy_loss": 0.0767822265625, - "delta_ref_ppl": -0.05419921875, - "entropy_loss": -0.075439453125, - "epoch": 0.5848, - "grad_norm": 0.8425290397981048, - "k1_kl": 0.0540771484375, - "k3_kl": 0.029541015625, - "kimi_kl": 0.088134765625, - "learning_rate": 2.076e-07, - "loss": 0.0012, - "ppl": 0.035736083984375, - "reward": 0.9855639934539795, - "reward_std": 0.001059359055943787, - "rewards/perpo_ocr_edit_distance_reward": 0.9855640530586243, + "advantages": -1.021793991640152e-07, + "completion_length": 500.0, + "delta_ref_entropy_loss": 0.158203125, + "delta_ref_ppl": -0.10888671875, + "entropy_loss": -0.27734375, + "epoch": 0.2924, + "grad_norm": 3.2583079000189876, + "k1_kl": 0.10888671875, + "k3_kl": 0.0615234375, + "kimi_kl": 0.173828125, + "learning_rate": 3.538e-07, + "loss": 0.0025, + "ppl": 0.1279296875, + "reward": 0.6597886085510254, + "reward_std": 0.16588705778121948, + "rewards/perpo_ocr_edit_distance_reward": 0.6597886085510254, "step": 1462, "temperature": 0.9 }, { - "advantages": -3.414494754849784e-05, - "completion_length": 376.0, - "delta_ref_entropy_loss": 0.0616455078125, - "delta_ref_ppl": -0.066162109375, - "entropy_loss": -0.048583984375, - "epoch": 0.5852, - "grad_norm": 0.8449167294894535, - "k1_kl": 0.06640625, - "k3_kl": 0.03692626953125, - "kimi_kl": 0.082763671875, - "learning_rate": 2.074e-07, - "loss": 0.0015, - "ppl": 0.02484130859375, - "reward": 0.9111864268779755, - "reward_std": 0.15793630958069116, - "rewards/perpo_ocr_edit_distance_reward": 0.9111865162849426, + "advantages": -1.1920928955078125e-06, + "completion_length": 268.0, + "delta_ref_entropy_loss": 0.1474609375, + "delta_ref_ppl": -0.1513671875, + "entropy_loss": -0.0986328125, + "epoch": 0.2926, + "grad_norm": 1.7703712592609517, + "k1_kl": 0.150390625, + "k3_kl": 0.08984375, + "kimi_kl": 0.251953125, + "learning_rate": 3.537e-07, + "loss": 0.0036, + "ppl": 0.044677734375, + "reward": 0.950644314289093, + "reward_std": 0.014254559762775898, + "rewards/perpo_ocr_edit_distance_reward": 0.9506443738937378, "step": 1463, "temperature": 0.9 }, { - "advantages": -9.825400138652185e-05, - "completion_length": 897.0, - "delta_ref_entropy_loss": 0.03216552734375, - "delta_ref_ppl": -0.03497314453125, - "entropy_loss": -0.0341796875, - "epoch": 0.5856, - "grad_norm": 0.6532377348023137, - "k1_kl": 0.03497314453125, - "k3_kl": 0.027435302734375, - "kimi_kl": 0.1031494140625, - "learning_rate": 2.0719999999999998e-07, - "loss": 0.0012, - "ppl": 0.01983642578125, - "reward": 0.9917141497135162, - "reward_std": 0.0006526022625621408, - "rewards/perpo_ocr_edit_distance_reward": 0.991714209318161, + "advantages": -1.7029899268550253e-08, + "completion_length": 772.0, + "delta_ref_entropy_loss": 0.1318359375, + "delta_ref_ppl": -0.09423828125, + "entropy_loss": -0.1552734375, + "epoch": 0.2928, + "grad_norm": 1.4118949677196846, + "k1_kl": 0.09423828125, + "k3_kl": 0.050537109375, + "kimi_kl": 0.11767578125, + "learning_rate": 3.536e-07, + "loss": 0.002, + "ppl": 0.0791015625, + "reward": 0.7629270553588867, + "reward_std": 0.010582818649709225, + "rewards/perpo_ocr_edit_distance_reward": 0.7629271149635315, "step": 1464, "temperature": 0.9 }, { - "advantages": -1.7212970305990893e-05, - "completion_length": 531.5, - "delta_ref_entropy_loss": 0.06182861328125, - "delta_ref_ppl": -0.0509033203125, - "entropy_loss": -0.0775146484375, - "epoch": 0.586, - "grad_norm": 1.354268416190249, - "k1_kl": 0.0506591796875, - "k3_kl": 0.027740478515625, - "kimi_kl": 0.05657958984375, - "learning_rate": 2.07e-07, - "loss": 0.0011, - "ppl": 0.042236328125, - "reward": 0.9802415668964386, - "reward_std": 0.0033066654577851295, - "rewards/perpo_ocr_edit_distance_reward": 0.9802416563034058, + "advantages": -3.121580448350869e-05, + "completion_length": 93.0, + "delta_ref_entropy_loss": 0.049072265625, + "delta_ref_ppl": -0.2890625, + "entropy_loss": -0.08544921875, + "epoch": 0.293, + "grad_norm": 2.081058720616768, + "k1_kl": 0.2890625, + "k3_kl": 0.2158203125, + "kimi_kl": 0.76171875, + "learning_rate": 3.535e-07, + "loss": 0.0087, + "ppl": 0.032470703125, + "reward": 0.9655171632766724, + "reward_std": 0.0018098901491612196, + "rewards/perpo_ocr_edit_distance_reward": 0.9655172824859619, "step": 1465, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 453.5, - "delta_ref_entropy_loss": 0.0216064453125, - "delta_ref_ppl": -0.016754150390625, - "entropy_loss": -0.00994873046875, - "epoch": 0.5864, - "grad_norm": 0.009079402867709172, - "k1_kl": 0.016754150390625, - "k3_kl": 0.010040283203125, - "kimi_kl": 0.022918701171875, - "learning_rate": 2.068e-07, - "loss": 0.0004, - "ppl": 0.00434112548828125, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": 1.9550323486328125e-05, + "completion_length": 77.0, + "delta_ref_entropy_loss": 0.0595703125, + "delta_ref_ppl": -0.310546875, + "entropy_loss": -0.0390625, + "epoch": 0.2932, + "grad_norm": 1.7908719633506833, + "k1_kl": 0.3125, + "k3_kl": 0.251953125, + "kimi_kl": 0.921875, + "learning_rate": 3.534e-07, + "loss": 0.0101, + "ppl": 0.01116943359375, + "reward": 0.9748973250389099, + "reward_std": 0.0012075643753632903, + "rewards/perpo_ocr_edit_distance_reward": 0.9748972654342651, "step": 1466, "temperature": 0.9 }, { - "advantages": -1.1324882791541313e-06, - "completion_length": 452.5, - "delta_ref_entropy_loss": 0.045654296875, - "delta_ref_ppl": -0.06732177734375, - "entropy_loss": -0.099609375, - "epoch": 0.5868, - "grad_norm": 3.8768268951672424, - "k1_kl": 0.0673828125, - "k3_kl": 0.05474853515625, - "kimi_kl": 0.156097412109375, - "learning_rate": 2.0659999999999998e-07, - "loss": 0.0022, - "ppl": 0.056640625, - "reward": 0.779366135597229, - "reward_std": 0.08625385444611311, - "rewards/perpo_ocr_edit_distance_reward": 0.7793661952018738, + "advantages": -5.8020865253638476e-05, + "completion_length": 423.0, + "delta_ref_entropy_loss": 0.03564453125, + "delta_ref_ppl": -0.053466796875, + "entropy_loss": -0.03125, + "epoch": 0.2934, + "grad_norm": 1.1208294402288106, + "k1_kl": 0.053466796875, + "k3_kl": 0.03759765625, + "kimi_kl": 0.1328125, + "learning_rate": 3.5329999999999997e-07, + "loss": 0.0016, + "ppl": 0.0140380859375, + "reward": 0.9851548671722412, + "reward_std": 0.001074520987458527, + "rewards/perpo_ocr_edit_distance_reward": 0.9851549863815308, "step": 1467, "temperature": 0.9 }, { - "advantages": -7.175122118496802e-05, - "completion_length": 510.5, - "delta_ref_entropy_loss": 0.03656005859375, - "delta_ref_ppl": -0.03851318359375, - "entropy_loss": -0.02264404296875, - "epoch": 0.5872, - "grad_norm": 0.46524208968480635, - "k1_kl": 0.03851318359375, - "k3_kl": 0.023345947265625, - "kimi_kl": 0.05731201171875, - "learning_rate": 2.064e-07, - "loss": 0.001, - "ppl": 0.0099639892578125, - "reward": 0.9995243549346924, - "reward_std": 0.0004915967947454192, - "rewards/perpo_ocr_edit_distance_reward": 0.9995244443416595, + "advantages": -5.194119239604333e-06, + "completion_length": 420.0, + "delta_ref_entropy_loss": 0.0478515625, + "delta_ref_ppl": -0.0810546875, + "entropy_loss": -0.032958984375, + "epoch": 0.2936, + "grad_norm": 0.757584218657531, + "k1_kl": 0.08154296875, + "k3_kl": 0.05322265625, + "kimi_kl": 0.1376953125, + "learning_rate": 3.532e-07, + "loss": 0.0021, + "ppl": 0.01043701171875, + "reward": 0.9831864833831787, + "reward_std": 0.0015339668607339263, + "rewards/perpo_ocr_edit_distance_reward": 0.9831864833831787, "step": 1468, "temperature": 0.9 }, { - "advantages": -1.4833041859674267e-05, - "completion_length": 873.5, - "delta_ref_entropy_loss": 0.021240234375, - "delta_ref_ppl": -0.028472900390625, - "entropy_loss": -0.03961181640625, - "epoch": 0.5876, - "grad_norm": 0.11367279868180905, - "k1_kl": 0.028472900390625, - "k3_kl": 0.017425537109375, - "kimi_kl": 0.04217529296875, - "learning_rate": 2.0619999999999998e-07, - "loss": 0.0007, - "ppl": 0.01983642578125, - "reward": 0.997796356678009, - "reward_std": 0.00023708185472059995, - "rewards/perpo_ocr_edit_distance_reward": 0.9977964162826538, + "advantages": 0.0, + "completion_length": 773.0, + "delta_ref_entropy_loss": 0.058837890625, + "delta_ref_ppl": -0.052734375, + "entropy_loss": -0.024658203125, + "epoch": 0.2938, + "grad_norm": 0.8607214430614367, + "k1_kl": 0.052734375, + "k3_kl": 0.032470703125, + "kimi_kl": 0.0908203125, + "learning_rate": 3.531e-07, + "loss": 0.0013, + "ppl": 0.0081787109375, + "reward": 0.9931157827377319, + "reward_std": 0.001563269179314375, + "rewards/perpo_ocr_edit_distance_reward": 0.9931157827377319, "step": 1469, "temperature": 0.9 }, { - "advantages": -6.908604336786084e-05, - "completion_length": 885.0, - "delta_ref_entropy_loss": 0.040008544921875, - "delta_ref_ppl": -0.025177001953125, - "entropy_loss": -0.04620361328125, - "epoch": 0.588, - "grad_norm": 0.6987405442234453, - "k1_kl": 0.025177001953125, - "k3_kl": 0.015289306640625, - "kimi_kl": 0.0362548828125, - "learning_rate": 2.06e-07, - "loss": 0.0007, - "ppl": 0.0252532958984375, - "reward": 0.9881788492202759, - "reward_std": 0.000602633343078196, - "rewards/perpo_ocr_edit_distance_reward": 0.9881789088249207, + "advantages": -6.867307092761621e-05, + "completion_length": 1661.0, + "delta_ref_entropy_loss": 0.018310546875, + "delta_ref_ppl": -0.02392578125, + "entropy_loss": -0.0267333984375, + "epoch": 0.294, + "grad_norm": 8.22403856613265, + "k1_kl": 0.02392578125, + "k3_kl": 0.0654296875, + "kimi_kl": 0.044189453125, + "learning_rate": 3.5299999999999994e-07, + "loss": 0.0027, + "ppl": 0.014892578125, + "reward": 0.9971121549606323, + "reward_std": 0.0007678742404095829, + "rewards/perpo_ocr_edit_distance_reward": 0.9971122741699219, "step": 1470, "temperature": 0.9 }, { - "advantages": -5.752700087668927e-05, - "completion_length": 584.5, - "delta_ref_entropy_loss": 0.033447265625, - "delta_ref_ppl": -0.0233154296875, - "entropy_loss": -0.043212890625, - "epoch": 0.5884, - "grad_norm": 0.7539725781143889, - "k1_kl": 0.0233154296875, - "k3_kl": 0.01177978515625, - "kimi_kl": 0.027069091796875, - "learning_rate": 2.058e-07, - "loss": 0.0005, - "ppl": 0.01812744140625, - "reward": 0.9749354720115662, - "reward_std": 0.006595095073862467, - "rewards/perpo_ocr_edit_distance_reward": 0.9749355316162109, + "advantages": -7.746049959678203e-05, + "completion_length": 534.0, + "delta_ref_entropy_loss": 0.062255859375, + "delta_ref_ppl": -0.07373046875, + "entropy_loss": -0.0206298828125, + "epoch": 0.2942, + "grad_norm": 0.4843188609284894, + "k1_kl": 0.07421875, + "k3_kl": 0.044921875, + "kimi_kl": 0.1376953125, + "learning_rate": 3.529e-07, + "loss": 0.0019, + "ppl": 0.006378173828125, + "reward": 0.9920384883880615, + "reward_std": 0.0003397419932298362, + "rewards/perpo_ocr_edit_distance_reward": 0.9920385479927063, "step": 1471, "temperature": 0.9 }, { - "advantages": -2.343314224617643e-05, - "completion_length": 484.0, - "delta_ref_entropy_loss": 0.03753662109375, - "delta_ref_ppl": -0.0343017578125, - "entropy_loss": -0.021484375, - "epoch": 0.5888, - "grad_norm": 0.41501546268986345, - "k1_kl": 0.0341796875, - "k3_kl": 0.0205078125, - "kimi_kl": 0.0654296875, - "learning_rate": 2.056e-07, - "loss": 0.0008, - "ppl": 0.009124755859375, - "reward": 0.9975274801254272, - "reward_std": 0.0035443765809759498, - "rewards/perpo_ocr_edit_distance_reward": 0.9975275099277496, + "advantages": -0.0005960464477539062, + "completion_length": 462.0, + "delta_ref_entropy_loss": 0.050537109375, + "delta_ref_ppl": -0.064453125, + "entropy_loss": -0.01348876953125, + "epoch": 0.2944, + "grad_norm": 0.0076914380243619995, + "k1_kl": 0.06494140625, + "k3_kl": 0.03759765625, + "kimi_kl": 0.1044921875, + "learning_rate": 3.528e-07, + "loss": 0.0021, + "ppl": 0.00360107421875, + "reward": 0.9911779165267944, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.991178035736084, "step": 1472, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 542.5, - "delta_ref_entropy_loss": 0.017669677734375, - "delta_ref_ppl": -0.0112152099609375, - "entropy_loss": -0.01129150390625, - "epoch": 0.5892, - "grad_norm": 0.014517659425804354, - "k1_kl": 0.01123046875, - "k3_kl": 0.004730224609375, - "kimi_kl": 0.0076446533203125, - "learning_rate": 2.054e-07, - "loss": 0.0005, - "ppl": 0.003204345703125, - "reward": 0.9930795729160309, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9930796027183533, + "advantages": -0.0002201455063186586, + "completion_length": 562.0, + "delta_ref_entropy_loss": 0.062255859375, + "delta_ref_ppl": -0.08056640625, + "entropy_loss": -0.043212890625, + "epoch": 0.2946, + "grad_norm": 0.7438482627761983, + "k1_kl": 0.08056640625, + "k3_kl": 0.04931640625, + "kimi_kl": 0.19140625, + "learning_rate": 3.527e-07, + "loss": 0.0022, + "ppl": 0.018798828125, + "reward": 0.975757896900177, + "reward_std": 0.00048016165965236723, + "rewards/perpo_ocr_edit_distance_reward": 0.9757580757141113, "step": 1473, "temperature": 0.9 }, { - "advantages": -1.0379723789810669e-05, - "completion_length": 422.0, - "delta_ref_entropy_loss": 0.072021484375, - "delta_ref_ppl": -0.0523681640625, - "entropy_loss": -0.110107421875, - "epoch": 0.5896, - "grad_norm": 1.0022985305413934, - "k1_kl": 0.05242919921875, - "k3_kl": 0.027740478515625, - "kimi_kl": 0.0538330078125, - "learning_rate": 2.0519999999999998e-07, - "loss": 0.0011, - "ppl": 0.0655517578125, - "reward": 0.9499669671058655, - "reward_std": 0.0009754609200172126, - "rewards/perpo_ocr_edit_distance_reward": 0.9499669671058655, + "advantages": -0.0005960464477539062, + "completion_length": 238.0, + "delta_ref_entropy_loss": 0.0810546875, + "delta_ref_ppl": -0.10693359375, + "entropy_loss": -0.05615234375, + "epoch": 0.2948, + "grad_norm": 0.052097068222976384, + "k1_kl": 0.10693359375, + "k3_kl": 0.0654296875, + "kimi_kl": 0.18359375, + "learning_rate": 3.526e-07, + "loss": 0.0032, + "ppl": 0.0208740234375, + "reward": 0.9679408073425293, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9679408669471741, "step": 1474, "temperature": 0.9 }, { - "advantages": -1.7919712263392285e-05, - "completion_length": 585.0, - "delta_ref_entropy_loss": 0.05712890625, - "delta_ref_ppl": -0.05908203125, - "entropy_loss": -0.0594482421875, - "epoch": 0.59, - "grad_norm": 0.5685951317708798, - "k1_kl": 0.0589599609375, - "k3_kl": 0.0379638671875, - "kimi_kl": 0.1396484375, - "learning_rate": 2.0499999999999997e-07, - "loss": 0.0015, - "ppl": 0.0295257568359375, - "reward": 0.9711411893367767, - "reward_std": 0.0016129782889038324, - "rewards/perpo_ocr_edit_distance_reward": 0.9711412489414215, + "advantages": -3.70400300653273e-07, + "completion_length": 784.0, + "delta_ref_entropy_loss": -0.0137939453125, + "delta_ref_ppl": -0.033203125, + "entropy_loss": -0.0947265625, + "epoch": 0.295, + "grad_norm": 1.9184850001885991, + "k1_kl": 0.033447265625, + "k3_kl": 0.0303955078125, + "kimi_kl": 0.10107421875, + "learning_rate": 3.5249999999999996e-07, + "loss": 0.0012, + "ppl": 0.034423828125, + "reward": 0.6435257792472839, + "reward_std": 0.12093645334243774, + "rewards/perpo_ocr_edit_distance_reward": 0.6435258388519287, "step": 1475, "temperature": 0.9 }, { - "advantages": -3.485168781480752e-05, - "completion_length": 630.5, - "delta_ref_entropy_loss": 0.0277099609375, - "delta_ref_ppl": -0.032958984375, - "entropy_loss": -0.0225830078125, - "epoch": 0.5904, - "grad_norm": 0.37708962693934495, - "k1_kl": 0.032958984375, - "k3_kl": 0.02374267578125, - "kimi_kl": 0.07666015625, - "learning_rate": 2.048e-07, - "loss": 0.001, - "ppl": 0.010498046875, - "reward": 0.9986349940299988, - "reward_std": 0.00038783062336733565, - "rewards/perpo_ocr_edit_distance_reward": 0.9986350536346436, + "advantages": -7.608959276694804e-05, + "completion_length": 325.0, + "delta_ref_entropy_loss": 0.10009765625, + "delta_ref_ppl": -0.11767578125, + "entropy_loss": -0.025390625, + "epoch": 0.2952, + "grad_norm": 0.5242992352390792, + "k1_kl": 0.1181640625, + "k3_kl": 0.07470703125, + "kimi_kl": 0.251953125, + "learning_rate": 3.5239999999999995e-07, + "loss": 0.0031, + "ppl": 0.00830078125, + "reward": 0.9813665151596069, + "reward_std": 0.000683570047840476, + "rewards/perpo_ocr_edit_distance_reward": 0.9813665151596069, "step": 1476, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 212.0, - "delta_ref_entropy_loss": 0.0428466796875, - "delta_ref_ppl": -0.0634765625, - "entropy_loss": -0.017608642578125, - "epoch": 0.5908, - "grad_norm": 0.03263167989310292, - "k1_kl": 0.0633544921875, - "k3_kl": 0.0484619140625, - "kimi_kl": 0.20556640625, - "learning_rate": 2.046e-07, - "loss": 0.0019, - "ppl": 0.00788116455078125, - "reward": 0.6537054032087326, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.6537054032087326, + "advantages": -1.532690987460228e-07, + "completion_length": 796.0, + "delta_ref_entropy_loss": 0.0283203125, + "delta_ref_ppl": -0.04052734375, + "entropy_loss": -0.0137939453125, + "epoch": 0.2954, + "grad_norm": 0.8803869312002027, + "k1_kl": 0.040771484375, + "k3_kl": 0.02587890625, + "kimi_kl": 0.06640625, + "learning_rate": 3.523e-07, + "loss": 0.001, + "ppl": 0.003448486328125, + "reward": 0.35078486800193787, + "reward_std": 0.1257561296224594, + "rewards/perpo_ocr_edit_distance_reward": 0.35078489780426025, "step": 1477, "temperature": 0.9 }, { - "advantages": -4.302603909867031e-05, - "completion_length": 848.0, - "delta_ref_entropy_loss": 0.024261474609375, - "delta_ref_ppl": -0.021728515625, - "entropy_loss": -0.0245361328125, - "epoch": 0.5912, - "grad_norm": 0.5111746168953067, - "k1_kl": 0.021759033203125, - "k3_kl": 0.013702392578125, - "kimi_kl": 0.0391845703125, - "learning_rate": 2.0439999999999998e-07, - "loss": 0.0006, - "ppl": 0.0103607177734375, - "reward": 0.995049387216568, - "reward_std": 0.0014767180546186864, - "rewards/perpo_ocr_edit_distance_reward": 0.9950494468212128, + "advantages": 8.514949456639442e-08, + "completion_length": 309.0, + "delta_ref_entropy_loss": 0.09326171875, + "delta_ref_ppl": -0.2470703125, + "entropy_loss": -0.45703125, + "epoch": 0.2956, + "grad_norm": 3.4978152046609647, + "k1_kl": 0.2470703125, + "k3_kl": 0.1806640625, + "kimi_kl": 0.64453125, + "learning_rate": 3.522e-07, + "loss": 0.0072, + "ppl": 0.2392578125, + "reward": 0.2896079421043396, + "reward_std": 0.14547891914844513, + "rewards/perpo_ocr_edit_distance_reward": 0.2896079421043396, "step": 1478, "temperature": 0.9 }, { - "advantages": -8.30633342729925e-06, - "completion_length": 1090.5, - "delta_ref_entropy_loss": 0.0506591796875, - "delta_ref_ppl": -0.053924560546875, - "entropy_loss": -0.066162109375, - "epoch": 0.5916, - "grad_norm": 2.9878881746763133, - "k1_kl": 0.053955078125, - "k3_kl": 0.035797119140625, - "kimi_kl": 0.115264892578125, - "learning_rate": 2.042e-07, - "loss": 0.0014, - "ppl": 0.036865234375, - "reward": 0.9686374068260193, - "reward_std": 0.03832136816345155, - "rewards/perpo_ocr_edit_distance_reward": 0.9686374664306641, + "advantages": -0.0005960464477539062, + "completion_length": 381.0, + "delta_ref_entropy_loss": 0.05126953125, + "delta_ref_ppl": -0.0693359375, + "entropy_loss": -0.0120849609375, + "epoch": 0.2958, + "grad_norm": 0.003993078283121707, + "k1_kl": 0.0693359375, + "k3_kl": 0.04931640625, + "kimi_kl": 0.23046875, + "learning_rate": 3.521e-07, + "loss": 0.0026, + "ppl": 0.0019683837890625, + "reward": 0.9919314384460449, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9919314980506897, "step": 1479, "temperature": 0.9 }, { - "advantages": -0.00015524455511695123, - "completion_length": 433.5, - "delta_ref_entropy_loss": 0.05078125, - "delta_ref_ppl": -0.14935302734375, - "entropy_loss": -0.056396484375, - "epoch": 0.592, - "grad_norm": 0.5578395989323983, - "k1_kl": 0.14935302734375, - "k3_kl": 0.121795654296875, - "kimi_kl": 0.59405517578125, - "learning_rate": 2.0399999999999997e-07, - "loss": 0.005, - "ppl": 0.025848388671875, - "reward": 0.740971565246582, - "reward_std": 0.0009748244192451239, - "rewards/perpo_ocr_edit_distance_reward": 0.7409716099500656, + "advantages": -1.927784614963457e-05, + "completion_length": 1017.0, + "delta_ref_entropy_loss": 0.051025390625, + "delta_ref_ppl": -0.068359375, + "entropy_loss": -0.033447265625, + "epoch": 0.296, + "grad_norm": 2.0814025573439103, + "k1_kl": 0.068359375, + "k3_kl": 0.04052734375, + "kimi_kl": 0.12158203125, + "learning_rate": 3.52e-07, + "loss": 0.0016, + "ppl": 0.01495361328125, + "reward": 0.9934625029563904, + "reward_std": 0.00034173502353951335, + "rewards/perpo_ocr_edit_distance_reward": 0.9934626221656799, "step": 1480, "temperature": 0.9 }, { - "advantages": -1.6263553561657318e-06, - "completion_length": 274.0, - "delta_ref_entropy_loss": 0.055908203125, - "delta_ref_ppl": -0.083984375, - "entropy_loss": -0.0660400390625, - "epoch": 0.5924, - "grad_norm": 2.1309986268458845, - "k1_kl": 0.083984375, - "k3_kl": 0.05517578125, - "kimi_kl": 0.169677734375, - "learning_rate": 2.038e-07, - "loss": 0.0022, - "ppl": 0.0372314453125, - "reward": 0.9794512391090393, - "reward_std": 0.011661775410175323, - "rewards/perpo_ocr_edit_distance_reward": 0.9794512689113617, + "advantages": -0.00014162063598632812, + "completion_length": 734.0, + "delta_ref_entropy_loss": 0.058837890625, + "delta_ref_ppl": -0.0615234375, + "entropy_loss": -0.0361328125, + "epoch": 0.2962, + "grad_norm": 0.5246999996480624, + "k1_kl": 0.0615234375, + "k3_kl": 0.04052734375, + "kimi_kl": 0.1259765625, + "learning_rate": 3.5189999999999997e-07, + "loss": 0.0018, + "ppl": 0.01556396484375, + "reward": 0.9794902801513672, + "reward_std": 0.0006215075263753533, + "rewards/perpo_ocr_edit_distance_reward": 0.9794904589653015, "step": 1481, "temperature": 0.9 }, { - "advantages": -0.00029903650283813477, - "completion_length": 436.0, - "delta_ref_entropy_loss": 0.05126953125, - "delta_ref_ppl": -0.03961181640625, - "entropy_loss": -0.15460205078125, - "epoch": 0.5928, - "grad_norm": 1.1250905775014468, - "k1_kl": 0.03936767578125, - "k3_kl": 0.024078369140625, - "kimi_kl": 0.05059814453125, - "learning_rate": 2.036e-07, - "loss": 0.0013, - "ppl": 0.084075927734375, - "reward": 0.9357093274593353, - "reward_std": 0.00839210581034422, - "rewards/perpo_ocr_edit_distance_reward": 0.9357093870639801, + "advantages": -1.3794218602924957e-06, + "completion_length": 756.0, + "delta_ref_entropy_loss": 0.10009765625, + "delta_ref_ppl": -0.076171875, + "entropy_loss": -0.23828125, + "epoch": 0.2964, + "grad_norm": 2.0488211265982543, + "k1_kl": 0.076171875, + "k3_kl": 0.049072265625, + "kimi_kl": 0.10888671875, + "learning_rate": 3.5179999999999996e-07, + "loss": 0.002, + "ppl": 0.11865234375, + "reward": 0.6690394878387451, + "reward_std": 0.012301565147936344, + "rewards/perpo_ocr_edit_distance_reward": 0.6690395474433899, "step": 1482, "temperature": 0.9 }, { - "advantages": -4.7096185880945995e-05, - "completion_length": 805.5, - "delta_ref_entropy_loss": 0.05035400390625, - "delta_ref_ppl": -0.0411376953125, - "entropy_loss": -0.06280517578125, - "epoch": 0.5932, - "grad_norm": 0.9567305680984761, - "k1_kl": 0.0411376953125, - "k3_kl": 0.025390625, - "kimi_kl": 0.05169677734375, - "learning_rate": 2.0339999999999998e-07, - "loss": 0.0011, - "ppl": 0.035858154296875, - "reward": 0.9873223602771759, - "reward_std": 0.0019326887268107384, - "rewards/perpo_ocr_edit_distance_reward": 0.9873223900794983, + "advantages": 0.0, + "completion_length": 564.0, + "delta_ref_entropy_loss": 0.054443359375, + "delta_ref_ppl": -0.060791015625, + "entropy_loss": -0.011474609375, + "epoch": 0.2966, + "grad_norm": 0.00541739270459636, + "k1_kl": 0.060791015625, + "k3_kl": 0.037353515625, + "kimi_kl": 0.125, + "learning_rate": 3.517e-07, + "loss": 0.0015, + "ppl": 0.0021514892578125, + "reward": 0.9992706179618835, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9992706179618835, "step": 1483, "temperature": 0.9 }, { - "advantages": -4.896096470474731e-07, - "completion_length": 613.0, - "delta_ref_entropy_loss": 0.052490234375, - "delta_ref_ppl": -0.03759765625, - "entropy_loss": -0.04052734375, - "epoch": 0.5936, - "grad_norm": 0.5097696637400805, - "k1_kl": 0.03778076171875, - "k3_kl": 0.023345947265625, - "kimi_kl": 0.0792236328125, - "learning_rate": 2.032e-07, - "loss": 0.0009, - "ppl": 0.0198974609375, - "reward": 0.9901238977909088, - "reward_std": 0.001158945175120607, - "rewards/perpo_ocr_edit_distance_reward": 0.9901239573955536, + "advantages": -3.405979782655777e-07, + "completion_length": 225.0, + "delta_ref_entropy_loss": 0.1591796875, + "delta_ref_ppl": -0.1806640625, + "entropy_loss": -0.265625, + "epoch": 0.2968, + "grad_norm": 3.008335537739294, + "k1_kl": 0.1806640625, + "k3_kl": 0.12890625, + "kimi_kl": 0.388671875, + "learning_rate": 3.516e-07, + "loss": 0.0051, + "ppl": 0.125, + "reward": 0.441290020942688, + "reward_std": 0.04829910770058632, + "rewards/perpo_ocr_edit_distance_reward": 0.4412900507450104, "step": 1484, "temperature": 0.9 }, { - "advantages": -4.5329334625421325e-05, - "completion_length": 652.5, - "delta_ref_entropy_loss": 0.03411865234375, - "delta_ref_ppl": -0.028961181640625, - "entropy_loss": -0.021240234375, - "epoch": 0.594, - "grad_norm": 0.49582042520007147, - "k1_kl": 0.02899169921875, - "k3_kl": 0.02008056640625, - "kimi_kl": 0.08807373046875, - "learning_rate": 2.03e-07, - "loss": 0.0009, - "ppl": 0.0104827880859375, - "reward": 0.9988359212875366, - "reward_std": 0.0010834459390025586, - "rewards/perpo_ocr_edit_distance_reward": 0.9988359808921814, + "advantages": -0.00020260896417312324, + "completion_length": 1027.0, + "delta_ref_entropy_loss": 0.028564453125, + "delta_ref_ppl": -0.036865234375, + "entropy_loss": -0.034912109375, + "epoch": 0.297, + "grad_norm": 0.2714048700899911, + "k1_kl": 0.037109375, + "k3_kl": 0.022216796875, + "kimi_kl": 0.052734375, + "learning_rate": 3.5149999999999994e-07, + "loss": 0.0011, + "ppl": 0.01434326171875, + "reward": 0.9833323955535889, + "reward_std": 0.0002782215306069702, + "rewards/perpo_ocr_edit_distance_reward": 0.9833325147628784, "step": 1485, "temperature": 0.9 }, { - "advantages": -6.778325457545975e-05, - "completion_length": 875.0, - "delta_ref_entropy_loss": 0.052978515625, - "delta_ref_ppl": -0.05078125, - "entropy_loss": -0.2427978515625, - "epoch": 0.5944, - "grad_norm": 1.7826703935679196, - "k1_kl": 0.051025390625, - "k3_kl": 0.03460693359375, - "kimi_kl": 0.07861328125, - "learning_rate": 2.028e-07, - "loss": 0.0015, - "ppl": 0.1351318359375, - "reward": 0.7020958811044693, - "reward_std": 0.02189456077030627, - "rewards/perpo_ocr_edit_distance_reward": 0.7020959109067917, + "advantages": -4.257474817137563e-09, + "completion_length": 622.0, + "delta_ref_entropy_loss": 0.032470703125, + "delta_ref_ppl": -0.06298828125, + "entropy_loss": -0.091796875, + "epoch": 0.2972, + "grad_norm": 1.741222728912109, + "k1_kl": 0.06298828125, + "k3_kl": 0.042724609375, + "kimi_kl": 0.09619140625, + "learning_rate": 3.514e-07, + "loss": 0.0017, + "ppl": 0.04443359375, + "reward": 0.8576541543006897, + "reward_std": 0.057817332446575165, + "rewards/perpo_ocr_edit_distance_reward": 0.8576542139053345, "step": 1486, "temperature": 0.9 }, { - "advantages": -8.959430124377832e-05, - "completion_length": 447.0, - "delta_ref_entropy_loss": 0.04296875, - "delta_ref_ppl": -0.0350341796875, - "entropy_loss": -0.017242431640625, - "epoch": 0.5948, - "grad_norm": 0.35136180801086925, - "k1_kl": 0.03509521484375, - "k3_kl": 0.02252197265625, - "kimi_kl": 0.096923828125, - "learning_rate": 2.026e-07, - "loss": 0.001, - "ppl": 0.0056915283203125, - "reward": 0.9965797960758209, - "reward_std": 0.0006668144051218405, - "rewards/perpo_ocr_edit_distance_reward": 0.9965799152851105, + "advantages": -1.021793991640152e-07, + "completion_length": 20.0, + "delta_ref_entropy_loss": 0.0927734375, + "delta_ref_ppl": -0.7734375, + "entropy_loss": -0.7734375, + "epoch": 0.2974, + "grad_norm": 18.665526598252626, + "k1_kl": 0.77734375, + "k3_kl": 0.67578125, + "kimi_kl": 3.875, + "learning_rate": 3.513e-07, + "loss": 0.027, + "ppl": 0.375, + "reward": 0.2749256491661072, + "reward_std": 0.1299862563610077, + "rewards/perpo_ocr_edit_distance_reward": 0.27492567896842957, "step": 1487, "temperature": 0.9 }, { - "advantages": -6.421549414881156e-05, - "completion_length": 1114.0, - "delta_ref_entropy_loss": 0.02032470703125, - "delta_ref_ppl": -0.0116119384765625, - "entropy_loss": -0.0318603515625, - "epoch": 0.5952, - "grad_norm": 0.45220572821422983, - "k1_kl": 0.0115509033203125, - "k3_kl": 0.0069580078125, - "kimi_kl": 0.011016845703125, - "learning_rate": 2.0239999999999999e-07, - "loss": 0.0003, - "ppl": 0.01898193359375, - "reward": 0.9954248368740082, - "reward_std": 0.002054603595752269, - "rewards/perpo_ocr_edit_distance_reward": 0.9954248666763306, + "advantages": -4.728351632365957e-05, + "completion_length": 523.0, + "delta_ref_entropy_loss": 0.07421875, + "delta_ref_ppl": -0.09228515625, + "entropy_loss": -0.0380859375, + "epoch": 0.2976, + "grad_norm": 0.5702627536290399, + "k1_kl": 0.091796875, + "k3_kl": 0.057861328125, + "kimi_kl": 0.1953125, + "learning_rate": 3.512e-07, + "loss": 0.0024, + "ppl": 0.01446533203125, + "reward": 0.9785181879997253, + "reward_std": 0.0009803265566006303, + "rewards/perpo_ocr_edit_distance_reward": 0.9785182476043701, "step": 1488, "temperature": 0.9 }, { - "advantages": 5.245209194981726e-06, - "completion_length": 536.5, - "delta_ref_entropy_loss": 0.035888671875, - "delta_ref_ppl": -0.03643798828125, - "entropy_loss": -0.0164794921875, - "epoch": 0.5956, - "grad_norm": 0.3849209101374273, - "k1_kl": 0.03643798828125, - "k3_kl": 0.02325439453125, - "kimi_kl": 0.0892333984375, - "learning_rate": 2.0219999999999997e-07, - "loss": 0.0009, - "ppl": 0.005828857421875, - "reward": 0.9997819066047668, - "reward_std": 0.00043813740921905264, - "rewards/perpo_ocr_edit_distance_reward": 0.9997819066047668, + "advantages": -1.0490418389963452e-05, + "completion_length": 610.0, + "delta_ref_entropy_loss": 0.058837890625, + "delta_ref_ppl": -0.0634765625, + "entropy_loss": -0.03759765625, + "epoch": 0.2978, + "grad_norm": 0.9012442123997003, + "k1_kl": 0.0634765625, + "k3_kl": 0.040283203125, + "kimi_kl": 0.12890625, + "learning_rate": 3.511e-07, + "loss": 0.0016, + "ppl": 0.01556396484375, + "reward": 0.9923665523529053, + "reward_std": 0.0015251061413437128, + "rewards/perpo_ocr_edit_distance_reward": 0.9923665523529053, "step": 1489, "temperature": 0.9 }, { - "advantages": -2.9657568575203186e-05, - "completion_length": 687.5, - "delta_ref_entropy_loss": 0.0712890625, - "delta_ref_ppl": -0.059326171875, - "entropy_loss": -0.08135986328125, - "epoch": 0.596, - "grad_norm": 1.0008620842297693, - "k1_kl": 0.05908203125, - "k3_kl": 0.0379638671875, - "kimi_kl": 0.11822509765625, - "learning_rate": 2.02e-07, - "loss": 0.0015, - "ppl": 0.040863037109375, - "reward": 0.9689984917640686, - "reward_std": 0.002952850132714957, - "rewards/perpo_ocr_edit_distance_reward": 0.9689985513687134, + "advantages": -4.359654212748865e-06, + "completion_length": 638.0, + "delta_ref_entropy_loss": 0.053466796875, + "delta_ref_ppl": -0.07470703125, + "entropy_loss": -0.028076171875, + "epoch": 0.298, + "grad_norm": 0.6853983745614576, + "k1_kl": 0.07470703125, + "k3_kl": 0.046875, + "kimi_kl": 0.1474609375, + "learning_rate": 3.5099999999999995e-07, + "loss": 0.0019, + "ppl": 0.01287841796875, + "reward": 0.9901926517486572, + "reward_std": 0.0018534051487222314, + "rewards/perpo_ocr_edit_distance_reward": 0.9901926517486572, "step": 1490, "temperature": 0.9 }, { - "advantages": -5.1093953516101465e-05, - "completion_length": 606.0, - "delta_ref_entropy_loss": 0.039794921875, - "delta_ref_ppl": -0.02655029296875, - "entropy_loss": -0.0193023681640625, - "epoch": 0.5964, - "grad_norm": 0.4078262496022622, - "k1_kl": 0.02655029296875, - "k3_kl": 0.013336181640625, - "kimi_kl": 0.022918701171875, - "learning_rate": 2.018e-07, - "loss": 0.0006, - "ppl": 0.009227752685546875, - "reward": 0.9880303740501404, - "reward_std": 0.0003666588745545596, - "rewards/perpo_ocr_edit_distance_reward": 0.9880304336547852, + "advantages": -5.711828271159902e-05, + "completion_length": 1052.0, + "delta_ref_entropy_loss": 0.0272216796875, + "delta_ref_ppl": -0.031494140625, + "entropy_loss": -0.0274658203125, + "epoch": 0.2982, + "grad_norm": 0.31903110727811157, + "k1_kl": 0.031494140625, + "k3_kl": 0.019287109375, + "kimi_kl": 0.045166015625, + "learning_rate": 3.509e-07, + "loss": 0.0008, + "ppl": 0.0113525390625, + "reward": 0.9976761937141418, + "reward_std": 0.00034723032149486244, + "rewards/perpo_ocr_edit_distance_reward": 0.9976762533187866, "step": 1491, "temperature": 0.9 }, { - "advantages": -2.7877944944521005e-05, - "completion_length": 521.5, - "delta_ref_entropy_loss": 0.05828857421875, - "delta_ref_ppl": -0.0521240234375, - "entropy_loss": -0.0858154296875, - "epoch": 0.5968, - "grad_norm": 1.4456041270714162, - "k1_kl": 0.0521240234375, - "k3_kl": 0.0332183837890625, - "kimi_kl": 0.086181640625, - "learning_rate": 2.016e-07, - "loss": 0.0014, - "ppl": 0.0474853515625, - "reward": 0.993110328912735, - "reward_std": 0.006320924963802099, - "rewards/perpo_ocr_edit_distance_reward": 0.9931104183197021, + "advantages": -9.775161743164062e-06, + "completion_length": 87.0, + "delta_ref_entropy_loss": 0.11083984375, + "delta_ref_ppl": -0.328125, + "entropy_loss": -0.1083984375, + "epoch": 0.2984, + "grad_norm": 3.350542373737891, + "k1_kl": 0.328125, + "k3_kl": 0.26953125, + "kimi_kl": 0.890625, + "learning_rate": 3.508e-07, + "loss": 0.0108, + "ppl": 0.047607421875, + "reward": 0.7647058963775635, + "reward_std": 0.006003652233630419, + "rewards/perpo_ocr_edit_distance_reward": 0.7647059559822083, "step": 1492, "temperature": 0.9 }, { - "advantages": -0.0002725890699366573, - "completion_length": 360.0, - "delta_ref_entropy_loss": 0.03399658203125, - "delta_ref_ppl": -0.037078857421875, - "entropy_loss": -0.019073486328125, - "epoch": 0.5972, - "grad_norm": 0.23031596987137315, - "k1_kl": 0.03704833984375, - "k3_kl": 0.023712158203125, - "kimi_kl": 0.076629638671875, - "learning_rate": 2.014e-07, - "loss": 0.0012, - "ppl": 0.00872802734375, - "reward": 0.9955801069736481, - "reward_std": 0.00011734557483578101, - "rewards/perpo_ocr_edit_distance_reward": 0.9955801665782928, + "advantages": -0.00019426006474532187, + "completion_length": 497.0, + "delta_ref_entropy_loss": 0.0576171875, + "delta_ref_ppl": -0.040771484375, + "entropy_loss": -0.02734375, + "epoch": 0.2986, + "grad_norm": 0.2615121930575543, + "k1_kl": 0.04052734375, + "k3_kl": 0.02099609375, + "kimi_kl": 0.053466796875, + "learning_rate": 3.507e-07, + "loss": 0.001, + "ppl": 0.0078125, + "reward": 0.9926723837852478, + "reward_std": 0.00020683827460743487, + "rewards/perpo_ocr_edit_distance_reward": 0.9926725029945374, "step": 1493, "temperature": 0.9 }, { - "advantages": -5.956207246526901e-06, - "completion_length": 972.0, - "delta_ref_entropy_loss": 0.090087890625, - "delta_ref_ppl": -0.0574951171875, - "entropy_loss": -0.1015625, - "epoch": 0.5976, - "grad_norm": 2603.823778299571, - "k1_kl": 0.0576171875, - "k3_kl": 3.1727294921875, - "kimi_kl": 0.0885009765625, - "learning_rate": 2.0119999999999998e-07, - "loss": 0.1272, - "ppl": 0.0635986328125, - "reward": 0.890220582485199, - "reward_std": 0.007480425061658025, - "rewards/perpo_ocr_edit_distance_reward": 0.8902206718921661, + "advantages": -2.384185791015625e-07, + "completion_length": 2005.0, + "delta_ref_entropy_loss": -0.037109375, + "delta_ref_ppl": -0.0185546875, + "entropy_loss": -0.263671875, + "epoch": 0.2988, + "grad_norm": 7.539404511553595, + "k1_kl": 0.0185546875, + "k3_kl": 0.0213623046875, + "kimi_kl": 0.040283203125, + "learning_rate": 3.5060000000000003e-07, + "loss": 0.0009, + "ppl": 0.115234375, + "reward": 0.368831992149353, + "reward_std": 0.1858171671628952, + "rewards/perpo_ocr_edit_distance_reward": 0.368831992149353, "step": 1494, "temperature": 0.9 }, { - "advantages": -8.933459321269765e-05, - "completion_length": 637.0, - "delta_ref_entropy_loss": 0.046875, - "delta_ref_ppl": -0.0330810546875, - "entropy_loss": -0.0201416015625, - "epoch": 0.598, - "grad_norm": 0.3545252702310745, - "k1_kl": 0.0330810546875, - "k3_kl": 0.0182952880859375, - "kimi_kl": 0.03948974609375, - "learning_rate": 2.01e-07, - "loss": 0.0008, - "ppl": 0.0073089599609375, - "reward": 0.9997985064983368, - "reward_std": 0.00016447616508230567, - "rewards/perpo_ocr_edit_distance_reward": 0.9997985661029816, + "advantages": -2.309254341525957e-05, + "completion_length": 312.0, + "delta_ref_entropy_loss": 0.05029296875, + "delta_ref_ppl": -0.09814453125, + "entropy_loss": -0.046630859375, + "epoch": 0.299, + "grad_norm": 1.0173975417053924, + "k1_kl": 0.09814453125, + "k3_kl": 0.0625, + "kimi_kl": 0.1953125, + "learning_rate": 3.5049999999999997e-07, + "loss": 0.0025, + "ppl": 0.016845703125, + "reward": 0.9473130106925964, + "reward_std": 0.0006366872694343328, + "rewards/perpo_ocr_edit_distance_reward": 0.9473130702972412, "step": 1495, "temperature": 0.9 }, { - "advantages": -4.675133106957219e-05, - "completion_length": 335.0, - "delta_ref_entropy_loss": 0.150146484375, - "delta_ref_ppl": -0.0994873046875, - "entropy_loss": -0.2047119140625, - "epoch": 0.5984, - "grad_norm": 1.46929774643365, - "k1_kl": 0.099609375, - "k3_kl": 0.0517578125, - "kimi_kl": 0.1043701171875, - "learning_rate": 2.008e-07, - "loss": 0.0021, - "ppl": 0.1138916015625, - "reward": 0.8944490849971771, - "reward_std": 0.006974282849114388, - "rewards/perpo_ocr_edit_distance_reward": 0.8944491446018219, + "advantages": -4.3123964132973924e-05, + "completion_length": 1319.0, + "delta_ref_entropy_loss": 0.072265625, + "delta_ref_ppl": -0.04638671875, + "entropy_loss": -0.10009765625, + "epoch": 0.2992, + "grad_norm": 531.368003803043, + "k1_kl": 0.04638671875, + "k3_kl": 1.21875, + "kimi_kl": 0.0478515625, + "learning_rate": 3.5039999999999996e-07, + "loss": 0.0484, + "ppl": 0.05224609375, + "reward": 0.9748523831367493, + "reward_std": 0.0012825109297409654, + "rewards/perpo_ocr_edit_distance_reward": 0.9748525619506836, "step": 1496, "temperature": 0.9 }, { - "advantages": -2.0589147879945813e-05, - "completion_length": 1379.0, - "delta_ref_entropy_loss": 0.02813720703125, - "delta_ref_ppl": -0.022003173828125, - "entropy_loss": -0.03192138671875, - "epoch": 0.5988, - "grad_norm": 4.6132941287060145, - "k1_kl": 0.022003173828125, - "k3_kl": 0.02215576171875, - "kimi_kl": 0.04595947265625, - "learning_rate": 2.0059999999999998e-07, - "loss": 0.0009, - "ppl": 0.017913818359375, - "reward": 0.988083004951477, - "reward_std": 0.0034049375681206584, - "rewards/perpo_ocr_edit_distance_reward": 0.9880830347537994, + "advantages": -1.7711095097183716e-06, + "completion_length": 552.0, + "delta_ref_entropy_loss": 0.056396484375, + "delta_ref_ppl": -0.078125, + "entropy_loss": -0.060546875, + "epoch": 0.2994, + "grad_norm": 1.0732412773494568, + "k1_kl": 0.078125, + "k3_kl": 0.045654296875, + "kimi_kl": 0.11962890625, + "learning_rate": 3.503e-07, + "loss": 0.0018, + "ppl": 0.0294189453125, + "reward": 0.9676096439361572, + "reward_std": 0.019192948937416077, + "rewards/perpo_ocr_edit_distance_reward": 0.967609703540802, "step": 1497, "temperature": 0.9 }, { - "advantages": -1.4292343252009232e-05, - "completion_length": 817.5, - "delta_ref_entropy_loss": 0.02691650390625, - "delta_ref_ppl": -0.03204345703125, - "entropy_loss": -0.02789306640625, - "epoch": 0.5992, - "grad_norm": 0.5438073896183587, - "k1_kl": 0.03204345703125, - "k3_kl": 0.02288818359375, - "kimi_kl": 0.0672607421875, - "learning_rate": 2.004e-07, - "loss": 0.0009, - "ppl": 0.0129852294921875, - "reward": 0.9189507067203522, - "reward_std": 0.10218216938665137, - "rewards/perpo_ocr_edit_distance_reward": 0.9189507961273193, + "advantages": -1.7915454009198584e-05, + "completion_length": 642.0, + "delta_ref_entropy_loss": 0.08935546875, + "delta_ref_ppl": -0.1015625, + "entropy_loss": -0.05126953125, + "epoch": 0.2996, + "grad_norm": 1.0534945605242743, + "k1_kl": 0.10107421875, + "k3_kl": 0.062255859375, + "kimi_kl": 0.1748046875, + "learning_rate": 3.502e-07, + "loss": 0.0025, + "ppl": 0.0198974609375, + "reward": 0.991037130355835, + "reward_std": 0.0022771244402974844, + "rewards/perpo_ocr_edit_distance_reward": 0.9910372495651245, "step": 1498, "temperature": 0.9 }, { - "advantages": -0.00011090296180782389, - "completion_length": 658.5, - "delta_ref_entropy_loss": 0.03314208984375, - "delta_ref_ppl": -0.03009033203125, - "entropy_loss": -0.03656005859375, - "epoch": 0.5996, - "grad_norm": 0.8836851181604168, - "k1_kl": 0.0301513671875, - "k3_kl": 0.020538330078125, - "kimi_kl": 0.068603515625, - "learning_rate": 2.0019999999999998e-07, - "loss": 0.0009, - "ppl": 0.0160980224609375, - "reward": 0.9799700677394867, - "reward_std": 0.0054987973417155445, - "rewards/perpo_ocr_edit_distance_reward": 0.9799700975418091, + "advantages": -5.119187699165195e-05, + "completion_length": 654.0, + "delta_ref_entropy_loss": 0.052490234375, + "delta_ref_ppl": -0.06494140625, + "entropy_loss": -0.03173828125, + "epoch": 0.2998, + "grad_norm": 0.5245817841689133, + "k1_kl": 0.06494140625, + "k3_kl": 0.0341796875, + "kimi_kl": 0.07373046875, + "learning_rate": 3.501e-07, + "loss": 0.0014, + "ppl": 0.011474609375, + "reward": 0.9940997958183289, + "reward_std": 0.0010641004191711545, + "rewards/perpo_ocr_edit_distance_reward": 0.9940999150276184, "step": 1499, "temperature": 0.9 }, { - "advantages": -0.00010566200944595039, - "completion_length": 548.5, - "delta_ref_entropy_loss": 0.060302734375, - "delta_ref_ppl": -0.037139892578125, - "entropy_loss": -0.04217529296875, - "epoch": 0.6, - "grad_norm": 0.1887142990841563, - "k1_kl": 0.036865234375, - "k3_kl": 0.017486572265625, - "kimi_kl": 0.031005859375, - "learning_rate": 2e-07, - "loss": 0.0008, - "ppl": 0.01763916015625, - "reward": 0.9992128312587738, - "reward_std": 0.00011115851521026343, - "rewards/perpo_ocr_edit_distance_reward": 0.9992128908634186, + "advantages": -0.00015237502520903945, + "completion_length": 441.0, + "delta_ref_entropy_loss": 0.0615234375, + "delta_ref_ppl": -0.115234375, + "entropy_loss": -0.032958984375, + "epoch": 0.3, + "grad_norm": 0.6998374745988509, + "k1_kl": 0.11474609375, + "k3_kl": 0.08056640625, + "kimi_kl": 0.330078125, + "learning_rate": 3.5e-07, + "loss": 0.0034, + "ppl": 0.01470947265625, + "reward": 0.8676766753196716, + "reward_std": 0.0005705802468582988, + "rewards/perpo_ocr_edit_distance_reward": 0.8676767945289612, "step": 1500, "temperature": 0.9 }, { - "advantages": -1.3283321095514111e-05, - "completion_length": 648.0, - "delta_ref_entropy_loss": 0.0274658203125, - "delta_ref_ppl": -0.017578125, - "entropy_loss": -0.01556396484375, - "epoch": 0.6004, - "grad_norm": 0.48360133812543726, - "k1_kl": 0.017578125, - "k3_kl": 0.010284423828125, - "kimi_kl": 0.02520751953125, - "learning_rate": 1.998e-07, - "loss": 0.0004, - "ppl": 0.00624847412109375, - "reward": 0.9980463981628418, - "reward_std": 0.00011034656199626625, - "rewards/perpo_ocr_edit_distance_reward": 0.9980464279651642, + "advantages": -4.564012851915322e-05, + "completion_length": 260.0, + "delta_ref_entropy_loss": 0.08154296875, + "delta_ref_ppl": -0.1396484375, + "entropy_loss": -0.031494140625, + "epoch": 0.3002, + "grad_norm": 2.3265849548059516, + "k1_kl": 0.1396484375, + "k3_kl": 0.09521484375, + "kimi_kl": 0.375, + "learning_rate": 3.499e-07, + "loss": 0.0039, + "ppl": 0.0081787109375, + "reward": 0.7959697842597961, + "reward_std": 0.0012058252468705177, + "rewards/perpo_ocr_edit_distance_reward": 0.7959698438644409, "step": 1501, "temperature": 0.9 }, { - "advantages": 1.2870345926785376e-05, - "completion_length": 95.0, - "delta_ref_entropy_loss": 0.038818359375, - "delta_ref_ppl": -0.0821533203125, - "entropy_loss": -0.0704345703125, - "epoch": 0.6008, - "grad_norm": 2.1711183696535867, - "k1_kl": 0.08203125, - "k3_kl": 0.0631103515625, - "kimi_kl": 0.1728515625, - "learning_rate": 1.996e-07, - "loss": 0.0025, - "ppl": 0.0401611328125, - "reward": 0.9972600340843201, - "reward_std": 0.0007769821095280349, - "rewards/perpo_ocr_edit_distance_reward": 0.9972600340843201, + "advantages": -5.3967749408911914e-05, + "completion_length": 1441.0, + "delta_ref_entropy_loss": 0.0203857421875, + "delta_ref_ppl": -0.0306396484375, + "entropy_loss": -0.0322265625, + "epoch": 0.3004, + "grad_norm": 0.35763027602814845, + "k1_kl": 0.0306396484375, + "k3_kl": 0.0205078125, + "kimi_kl": 0.051025390625, + "learning_rate": 3.4979999999999997e-07, + "loss": 0.0009, + "ppl": 0.01324462890625, + "reward": 0.9950371980667114, + "reward_std": 0.0008466497529298067, + "rewards/perpo_ocr_edit_distance_reward": 0.9950372576713562, "step": 1502, "temperature": 0.9 }, { - "advantages": -0.00014354076483868994, - "completion_length": 326.0, - "delta_ref_entropy_loss": 0.039794921875, - "delta_ref_ppl": -0.04058837890625, - "entropy_loss": -0.024261474609375, - "epoch": 0.6012, - "grad_norm": 0.6987315660240698, - "k1_kl": 0.04071044921875, - "k3_kl": 0.026123046875, - "kimi_kl": 0.09112548828125, - "learning_rate": 1.9939999999999997e-07, - "loss": 0.0012, - "ppl": 0.00946044921875, - "reward": 0.6944669932126999, - "reward_std": 0.0005622030876111239, - "rewards/perpo_ocr_edit_distance_reward": 0.6944671124219894, + "advantages": -1.4305115882962127e-06, + "completion_length": 573.0, + "delta_ref_entropy_loss": 0.111328125, + "delta_ref_ppl": -0.09716796875, + "entropy_loss": -0.0810546875, + "epoch": 0.3006, + "grad_norm": 1.0208230407912804, + "k1_kl": 0.09716796875, + "k3_kl": 0.053955078125, + "kimi_kl": 0.15234375, + "learning_rate": 3.497e-07, + "loss": 0.0022, + "ppl": 0.03564453125, + "reward": 0.942604660987854, + "reward_std": 0.023366723209619522, + "rewards/perpo_ocr_edit_distance_reward": 0.9426047205924988, "step": 1503, "temperature": 0.9 }, { - "advantages": 1.5548297596978955e-05, - "completion_length": 387.0, - "delta_ref_entropy_loss": 0.06005859375, - "delta_ref_ppl": -0.044921875, - "entropy_loss": -0.019012451171875, - "epoch": 0.6016, - "grad_norm": 0.180847449455247, - "k1_kl": 0.045166015625, - "k3_kl": 0.02117919921875, - "kimi_kl": 0.04046630859375, - "learning_rate": 1.9919999999999998e-07, - "loss": 0.0008, - "ppl": 0.00403594970703125, - "reward": 0.9990848898887634, - "reward_std": 0.0004972464521415532, - "rewards/perpo_ocr_edit_distance_reward": 0.9990848898887634, + "advantages": -1.7149108316516504e-05, + "completion_length": 369.0, + "delta_ref_entropy_loss": 0.0439453125, + "delta_ref_ppl": -0.060546875, + "entropy_loss": -0.020263671875, + "epoch": 0.3008, + "grad_norm": 0.5633883971496098, + "k1_kl": 0.060546875, + "k3_kl": 0.037353515625, + "kimi_kl": 0.11962890625, + "learning_rate": 3.496e-07, + "loss": 0.0015, + "ppl": 0.004913330078125, + "reward": 0.9943786859512329, + "reward_std": 0.00039659059257246554, + "rewards/perpo_ocr_edit_distance_reward": 0.9943786859512329, "step": 1504, "temperature": 0.9 }, { - "advantages": -0.00011772343441407429, - "completion_length": 817.5, - "delta_ref_entropy_loss": 0.0224609375, - "delta_ref_ppl": -0.015838623046875, - "entropy_loss": -0.013641357421875, - "epoch": 0.602, - "grad_norm": 0.17036160348475923, - "k1_kl": 0.015869140625, - "k3_kl": 0.0084686279296875, - "kimi_kl": 0.0284576416015625, - "learning_rate": 1.99e-07, - "loss": 0.0005, - "ppl": 0.0054779052734375, - "reward": 0.9985001087188721, - "reward_std": 0.00020357415996841155, - "rewards/perpo_ocr_edit_distance_reward": 0.9985001385211945, + "advantages": -5.449567765936081e-07, + "completion_length": 409.0, + "delta_ref_entropy_loss": 0.228515625, + "delta_ref_ppl": -0.1494140625, + "entropy_loss": -0.453125, + "epoch": 0.301, + "grad_norm": 2.9383973436244406, + "k1_kl": 0.1494140625, + "k3_kl": 0.0859375, + "kimi_kl": 0.173828125, + "learning_rate": 3.4949999999999995e-07, + "loss": 0.0034, + "ppl": 0.25, + "reward": 0.8028432726860046, + "reward_std": 0.015868481248617172, + "rewards/perpo_ocr_edit_distance_reward": 0.8028433322906494, "step": 1505, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 50.5, - "delta_ref_entropy_loss": 0.0374755859375, - "delta_ref_ppl": -0.1441650390625, - "entropy_loss": -0.0269775390625, - "epoch": 0.6024, - "grad_norm": 0.0321506227077258, - "k1_kl": 0.1436767578125, - "k3_kl": 0.11572265625, - "kimi_kl": 0.55517578125, - "learning_rate": 1.988e-07, - "loss": 0.0046, - "ppl": 0.00494384765625, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": -4.377961522550322e-05, + "completion_length": 947.0, + "delta_ref_entropy_loss": 0.030517578125, + "delta_ref_ppl": -0.04248046875, + "entropy_loss": -0.03369140625, + "epoch": 0.3012, + "grad_norm": 30.304278393455398, + "k1_kl": 0.042236328125, + "k3_kl": 0.03173828125, + "kimi_kl": 0.08642578125, + "learning_rate": 3.494e-07, + "loss": 0.0013, + "ppl": 0.021484375, + "reward": 0.9858426451683044, + "reward_std": 0.0012620665365830064, + "rewards/perpo_ocr_edit_distance_reward": 0.9858427047729492, "step": 1506, "temperature": 0.9 }, { - "advantages": -4.336663732829038e-05, - "completion_length": 596.0, - "delta_ref_entropy_loss": 0.06689453125, - "delta_ref_ppl": -0.0419921875, - "entropy_loss": -0.076904296875, - "epoch": 0.6028, - "grad_norm": 1.081917779372459, - "k1_kl": 0.0421142578125, - "k3_kl": 0.0240478515625, - "kimi_kl": 0.0760498046875, - "learning_rate": 1.986e-07, - "loss": 0.001, - "ppl": 0.03955078125, - "reward": 0.8632274866104126, - "reward_std": 0.001682286208961159, - "rewards/perpo_ocr_edit_distance_reward": 0.863227516412735, + "advantages": -3.537110023899004e-05, + "completion_length": 711.0, + "delta_ref_entropy_loss": 0.072265625, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.053955078125, + "epoch": 0.3014, + "grad_norm": 3.0905528208838877, + "k1_kl": 0.06689453125, + "k3_kl": 0.056640625, + "kimi_kl": 0.107421875, + "learning_rate": 3.493e-07, + "loss": 0.0023, + "ppl": 0.022705078125, + "reward": 0.9420415163040161, + "reward_std": 0.001344786724075675, + "rewards/perpo_ocr_edit_distance_reward": 0.9420415759086609, "step": 1507, "temperature": 0.9 }, { - "advantages": -3.581600822144537e-05, - "completion_length": 604.0, - "delta_ref_entropy_loss": 0.04058837890625, - "delta_ref_ppl": -0.03240966796875, - "entropy_loss": -0.0223388671875, - "epoch": 0.6032, - "grad_norm": 1.5526295830404029, - "k1_kl": 0.03240966796875, - "k3_kl": 0.01904296875, - "kimi_kl": 0.04718017578125, - "learning_rate": 1.9839999999999998e-07, - "loss": 0.0008, - "ppl": 0.012176513671875, - "reward": 0.9976276159286499, - "reward_std": 0.0008856685308273882, - "rewards/perpo_ocr_edit_distance_reward": 0.9976276755332947, + "advantages": -2.0010131265735254e-05, + "completion_length": 655.0, + "delta_ref_entropy_loss": 0.061767578125, + "delta_ref_ppl": -0.08056640625, + "entropy_loss": -0.047119140625, + "epoch": 0.3016, + "grad_norm": 1.0845774492691624, + "k1_kl": 0.08056640625, + "k3_kl": 0.0498046875, + "kimi_kl": 0.12060546875, + "learning_rate": 3.492e-07, + "loss": 0.002, + "ppl": 0.0245361328125, + "reward": 0.9802787899971008, + "reward_std": 0.0011763626243919134, + "rewards/perpo_ocr_edit_distance_reward": 0.9802788496017456, "step": 1508, "temperature": 0.9 }, { - "advantages": -3.874727667607658e-05, - "completion_length": 492.5, - "delta_ref_entropy_loss": 0.0892333984375, - "delta_ref_ppl": -0.06195068359375, - "entropy_loss": -0.0972900390625, - "epoch": 0.6036, - "grad_norm": 2.4000489554430158, - "k1_kl": 0.0621337890625, - "k3_kl": 0.0401611328125, - "kimi_kl": 0.12774658203125, - "learning_rate": 1.982e-07, - "loss": 0.0016, - "ppl": 0.054290771484375, - "reward": 0.9677838683128357, - "reward_std": 0.007058982562739402, - "rewards/perpo_ocr_edit_distance_reward": 0.9677838981151581, + "advantages": -0.00020829269487876445, + "completion_length": 512.0, + "delta_ref_entropy_loss": 0.0380859375, + "delta_ref_ppl": -0.035400390625, + "entropy_loss": -0.02392578125, + "epoch": 0.3018, + "grad_norm": 0.6506765014688083, + "k1_kl": 0.035400390625, + "k3_kl": 0.02001953125, + "kimi_kl": 0.04833984375, + "learning_rate": 3.491e-07, + "loss": 0.001, + "ppl": 0.00970458984375, + "reward": 0.9987826347351074, + "reward_std": 0.00026794057339429855, + "rewards/perpo_ocr_edit_distance_reward": 0.998782753944397, "step": 1509, "temperature": 0.9 }, { - "advantages": -3.937738392778556e-05, - "completion_length": 397.5, - "delta_ref_entropy_loss": 0.03497314453125, - "delta_ref_ppl": -0.03515625, - "entropy_loss": -0.03094482421875, - "epoch": 0.604, - "grad_norm": 0.5939283574013445, - "k1_kl": 0.0352783203125, - "k3_kl": 0.02191162109375, - "kimi_kl": 0.0535888671875, - "learning_rate": 1.98e-07, - "loss": 0.0009, - "ppl": 0.01580810546875, - "reward": 0.9971738457679749, - "reward_std": 0.001689939308562316, - "rewards/perpo_ocr_edit_distance_reward": 0.9971738755702972, + "advantages": -6.462846613430884e-06, + "completion_length": 738.0, + "delta_ref_entropy_loss": 0.09033203125, + "delta_ref_ppl": -0.08349609375, + "entropy_loss": -0.1220703125, + "epoch": 0.302, + "grad_norm": 1.1972879006868091, + "k1_kl": 0.0830078125, + "k3_kl": 0.05126953125, + "kimi_kl": 0.10302734375, + "learning_rate": 3.4899999999999996e-07, + "loss": 0.0021, + "ppl": 0.06787109375, + "reward": 0.8883360028266907, + "reward_std": 0.003861512988805771, + "rewards/perpo_ocr_edit_distance_reward": 0.8883360624313354, "step": 1510, "temperature": 0.9 }, { - "advantages": -9.70704263636435e-07, - "completion_length": 518.0, - "delta_ref_entropy_loss": 0.0443115234375, - "delta_ref_ppl": -0.059326171875, - "entropy_loss": -0.0528564453125, - "epoch": 0.6044, - "grad_norm": 0.92057458934843, - "k1_kl": 0.0592041015625, - "k3_kl": 0.039306640625, - "kimi_kl": 0.111328125, - "learning_rate": 1.9779999999999998e-07, - "loss": 0.0016, - "ppl": 0.025634765625, - "reward": 0.9681862592697144, - "reward_std": 0.008652454242110252, - "rewards/perpo_ocr_edit_distance_reward": 0.9681863188743591, + "advantages": -6.757464143447578e-05, + "completion_length": 983.0, + "delta_ref_entropy_loss": 0.10888671875, + "delta_ref_ppl": -0.083984375, + "entropy_loss": -0.0908203125, + "epoch": 0.3022, + "grad_norm": 0.9684455309617214, + "k1_kl": 0.083984375, + "k3_kl": 0.045654296875, + "kimi_kl": 0.1142578125, + "learning_rate": 3.4889999999999995e-07, + "loss": 0.0019, + "ppl": 0.040283203125, + "reward": 0.8819335699081421, + "reward_std": 0.001412169192917645, + "rewards/perpo_ocr_edit_distance_reward": 0.8819336891174316, "step": 1511, "temperature": 0.9 }, { - "advantages": -2.2547586013388354e-05, - "completion_length": 758.5, - "delta_ref_entropy_loss": 0.0654296875, - "delta_ref_ppl": -0.0367431640625, - "entropy_loss": -0.06005859375, - "epoch": 0.6048, - "grad_norm": 1.3056395833104406, - "k1_kl": 0.0367431640625, - "k3_kl": 0.0179443359375, - "kimi_kl": 0.03887939453125, - "learning_rate": 1.976e-07, - "loss": 0.0007, - "ppl": 0.03192138671875, - "reward": 0.9011220037937164, - "reward_std": 0.0015096983697731048, - "rewards/perpo_ocr_edit_distance_reward": 0.9011220335960388, + "advantages": -0.0005960464477539062, + "completion_length": 524.0, + "delta_ref_entropy_loss": 0.083984375, + "delta_ref_ppl": -0.080078125, + "entropy_loss": -0.030029296875, + "epoch": 0.3024, + "grad_norm": 0.03289128642559821, + "k1_kl": 0.080078125, + "k3_kl": 0.047119140625, + "kimi_kl": 0.1357421875, + "learning_rate": 3.488e-07, + "loss": 0.0025, + "ppl": 0.0081787109375, + "reward": 0.909958004951477, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9099580645561218, "step": 1512, "temperature": 0.9 }, { - "advantages": -0.0002980572836754902, - "completion_length": 363.0, - "delta_ref_entropy_loss": 0.0843505859375, - "delta_ref_ppl": -0.1490478515625, - "entropy_loss": -0.644073486328125, - "epoch": 0.6052, - "grad_norm": 3.668679306329589, - "k1_kl": 0.149169921875, - "k3_kl": 0.1129150390625, - "kimi_kl": 0.2825927734375, - "learning_rate": 1.9739999999999998e-07, - "loss": 0.0048, - "ppl": 0.34600830078125, - "reward": 0.6200512424111366, - "reward_std": 0.052864328026771545, - "rewards/perpo_ocr_edit_distance_reward": 0.6200513020157814, + "advantages": -3.320830273878528e-06, + "completion_length": 504.0, + "delta_ref_entropy_loss": 0.115234375, + "delta_ref_ppl": -0.10595703125, + "entropy_loss": -0.310546875, + "epoch": 0.3026, + "grad_norm": 2.5480974643282783, + "k1_kl": 0.10595703125, + "k3_kl": 0.06103515625, + "kimi_kl": 0.12158203125, + "learning_rate": 3.487e-07, + "loss": 0.0024, + "ppl": 0.1591796875, + "reward": 0.768195629119873, + "reward_std": 0.020282745361328125, + "rewards/perpo_ocr_edit_distance_reward": 0.7681956887245178, "step": 1513, "temperature": 0.9 }, { - "advantages": -5.066395000419277e-06, - "completion_length": 865.0, - "delta_ref_entropy_loss": 0.0941162109375, - "delta_ref_ppl": -0.06787109375, - "entropy_loss": -0.1533203125, - "epoch": 0.6056, - "grad_norm": 1.6266641723917383, - "k1_kl": 0.06787109375, - "k3_kl": 0.04052734375, - "kimi_kl": 0.0927734375, - "learning_rate": 1.9719999999999997e-07, - "loss": 0.0016, - "ppl": 0.085693359375, - "reward": 0.7149868011474609, - "reward_std": 0.15245920862071216, - "rewards/perpo_ocr_edit_distance_reward": 0.7149868905544281, + "advantages": -9.025846452459518e-07, + "completion_length": 1130.0, + "delta_ref_entropy_loss": 0.0595703125, + "delta_ref_ppl": -0.0712890625, + "entropy_loss": -0.0947265625, + "epoch": 0.3028, + "grad_norm": 2.139272573631219, + "k1_kl": 0.0712890625, + "k3_kl": 0.04736328125, + "kimi_kl": 0.1064453125, + "learning_rate": 3.486e-07, + "loss": 0.0019, + "ppl": 0.0546875, + "reward": 0.8149099349975586, + "reward_std": 0.07539675384759903, + "rewards/perpo_ocr_edit_distance_reward": 0.8149099946022034, "step": 1514, "temperature": 0.9 }, { - "advantages": -8.719308198124054e-06, - "completion_length": 1242.5, - "delta_ref_entropy_loss": 0.0777587890625, - "delta_ref_ppl": -0.0465087890625, - "entropy_loss": -0.126220703125, - "epoch": 0.606, - "grad_norm": 11.768221561547925, - "k1_kl": 0.046630859375, - "k3_kl": 0.0733642578125, - "kimi_kl": 0.0657958984375, - "learning_rate": 1.97e-07, - "loss": 0.0029, - "ppl": 0.0863037109375, - "reward": 0.6147259920835495, - "reward_std": 0.0037992190336808562, - "rewards/perpo_ocr_edit_distance_reward": 0.6147260516881943, + "advantages": -1.704480018815957e-05, + "completion_length": 876.0, + "delta_ref_entropy_loss": 0.039306640625, + "delta_ref_ppl": -0.038818359375, + "entropy_loss": -0.03955078125, + "epoch": 0.303, + "grad_norm": 0.5761110140800706, + "k1_kl": 0.03857421875, + "k3_kl": 0.0208740234375, + "kimi_kl": 0.048828125, + "learning_rate": 3.485e-07, + "loss": 0.0009, + "ppl": 0.018798828125, + "reward": 0.9859076142311096, + "reward_std": 0.0008985827444121242, + "rewards/perpo_ocr_edit_distance_reward": 0.9859076142311096, "step": 1515, "temperature": 0.9 }, { - "advantages": -5.521093407878652e-05, - "completion_length": 322.5, - "delta_ref_entropy_loss": 0.087646484375, - "delta_ref_ppl": -0.127197265625, - "entropy_loss": -0.053955078125, - "epoch": 0.6064, - "grad_norm": 0.32741657358518184, - "k1_kl": 0.127685546875, - "k3_kl": 0.09130859375, - "kimi_kl": 0.2894287109375, - "learning_rate": 1.968e-07, - "loss": 0.0037, - "ppl": 0.0367431640625, - "reward": 0.9848047196865082, - "reward_std": 0.0001813300623325631, - "rewards/perpo_ocr_edit_distance_reward": 0.984804779291153, + "advantages": -1.183577978736139e-06, + "completion_length": 546.0, + "delta_ref_entropy_loss": 0.1005859375, + "delta_ref_ppl": -0.1328125, + "entropy_loss": -0.10107421875, + "epoch": 0.3032, + "grad_norm": 1.3965480930150447, + "k1_kl": 0.1318359375, + "k3_kl": 0.07763671875, + "kimi_kl": 0.228515625, + "learning_rate": 3.4839999999999997e-07, + "loss": 0.0031, + "ppl": 0.04541015625, + "reward": 0.8768553137779236, + "reward_std": 0.03563791140913963, + "rewards/perpo_ocr_edit_distance_reward": 0.8768554329872131, "step": 1516, "temperature": 0.9 }, { - "advantages": -1.3589859236162738e-05, - "completion_length": 656.0, - "delta_ref_entropy_loss": 0.020538330078125, - "delta_ref_ppl": -0.014862060546875, - "entropy_loss": -0.0157470703125, - "epoch": 0.6068, - "grad_norm": 0.27297669857946955, - "k1_kl": 0.014862060546875, - "k3_kl": 0.009796142578125, - "kimi_kl": 0.03385162353515625, - "learning_rate": 1.966e-07, - "loss": 0.0004, - "ppl": 0.00594329833984375, - "reward": 0.9953066110610962, - "reward_std": 0.0015729879960417747, - "rewards/perpo_ocr_edit_distance_reward": 0.995306670665741, + "advantages": -7.931676009320654e-06, + "completion_length": 969.0, + "delta_ref_entropy_loss": 0.0703125, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.1806640625, + "epoch": 0.3034, + "grad_norm": 2.897196054407004, + "k1_kl": 0.06982421875, + "k3_kl": 0.039794921875, + "kimi_kl": 0.0791015625, + "learning_rate": 3.4829999999999996e-07, + "loss": 0.0016, + "ppl": 0.09326171875, + "reward": 0.8478017449378967, + "reward_std": 0.008508453145623207, + "rewards/perpo_ocr_edit_distance_reward": 0.8478018045425415, "step": 1517, "temperature": 0.9 }, { - "advantages": -8.395740744404634e-06, - "completion_length": 1307.0, - "delta_ref_entropy_loss": 0.0048828125, - "delta_ref_ppl": -0.103759765625, - "entropy_loss": -0.281005859375, - "epoch": 0.6072, - "grad_norm": 5.313670220010317, - "k1_kl": 0.1026611328125, - "k3_kl": 0.08135986328125, - "kimi_kl": 0.365234375, - "learning_rate": 1.9639999999999999e-07, - "loss": 0.0033, - "ppl": 0.1617431640625, - "reward": 0.490741565823555, - "reward_std": 0.05061150586698204, - "rewards/perpo_ocr_edit_distance_reward": 0.4907415956258774, + "advantages": -0.00017901830142363906, + "completion_length": 813.0, + "delta_ref_entropy_loss": 0.032470703125, + "delta_ref_ppl": -0.032958984375, + "entropy_loss": -0.017578125, + "epoch": 0.3036, + "grad_norm": 0.36904498462877305, + "k1_kl": 0.033203125, + "k3_kl": 0.0191650390625, + "kimi_kl": 0.056884765625, + "learning_rate": 3.482e-07, + "loss": 0.0009, + "ppl": 0.005340576171875, + "reward": 0.9959656000137329, + "reward_std": 0.00013780040899291635, + "rewards/perpo_ocr_edit_distance_reward": 0.9959656596183777, "step": 1518, "temperature": 0.9 }, { - "advantages": 7.101467929260252e-06, - "completion_length": 672.5, - "delta_ref_entropy_loss": 0.03057861328125, - "delta_ref_ppl": -0.04669189453125, - "entropy_loss": -0.0355224609375, - "epoch": 0.6076, - "grad_norm": 4.918525205168879, - "k1_kl": 0.0467529296875, - "k3_kl": 0.034088134765625, - "kimi_kl": 0.18743896484375, - "learning_rate": 1.962e-07, - "loss": 0.0014, - "ppl": 0.01849365234375, - "reward": 0.8817571997642517, - "reward_std": 0.027443165483418852, - "rewards/perpo_ocr_edit_distance_reward": 0.8817572295665741, + "advantages": -4.070146133017261e-06, + "completion_length": 72.0, + "delta_ref_entropy_loss": 0.0693359375, + "delta_ref_ppl": -0.59375, + "entropy_loss": -0.2451171875, + "epoch": 0.3038, + "grad_norm": 6.262440030091246, + "k1_kl": 0.59375, + "k3_kl": 0.462890625, + "kimi_kl": 1.796875, + "learning_rate": 3.481e-07, + "loss": 0.0185, + "ppl": 0.1005859375, + "reward": 0.4304825961589813, + "reward_std": 0.0030311818700283766, + "rewards/perpo_ocr_edit_distance_reward": 0.43048256635665894, "step": 1519, "temperature": 0.9 }, { - "advantages": -9.704488184070215e-05, - "completion_length": 925.0, - "delta_ref_entropy_loss": 0.0552978515625, - "delta_ref_ppl": -0.05108642578125, - "entropy_loss": -0.03662109375, - "epoch": 0.608, - "grad_norm": 0.4575846103758506, - "k1_kl": 0.051025390625, - "k3_kl": 0.02972412109375, - "kimi_kl": 0.0762939453125, - "learning_rate": 1.96e-07, - "loss": 0.0013, - "ppl": 0.0181884765625, - "reward": 0.9980440139770508, - "reward_std": 0.0005573198432102799, - "rewards/perpo_ocr_edit_distance_reward": 0.9980440735816956, + "advantages": -4.930155682814075e-06, + "completion_length": 454.0, + "delta_ref_entropy_loss": 0.177734375, + "delta_ref_ppl": -0.1845703125, + "entropy_loss": -0.341796875, + "epoch": 0.304, + "grad_norm": 2.2632713104860414, + "k1_kl": 0.1845703125, + "k3_kl": 0.11767578125, + "kimi_kl": 0.349609375, + "learning_rate": 3.4799999999999994e-07, + "loss": 0.0047, + "ppl": 0.1845703125, + "reward": 0.9080860018730164, + "reward_std": 0.005082553718239069, + "rewards/perpo_ocr_edit_distance_reward": 0.9080861210823059, "step": 1520, "temperature": 0.9 }, { - "advantages": -3.595863173710967e-05, - "completion_length": 633.0, - "delta_ref_entropy_loss": 0.044677734375, - "delta_ref_ppl": -0.0552978515625, - "entropy_loss": -0.0389404296875, - "epoch": 0.6084, - "grad_norm": 0.9147171881624798, - "k1_kl": 0.0552978515625, - "k3_kl": 0.03704833984375, - "kimi_kl": 0.1201171875, - "learning_rate": 1.958e-07, - "loss": 0.0015, - "ppl": 0.019287109375, - "reward": 0.9907247424125671, - "reward_std": 0.0024785739660728723, - "rewards/perpo_ocr_edit_distance_reward": 0.9907247722148895, + "advantages": -5.234991112956777e-05, + "completion_length": 556.0, + "delta_ref_entropy_loss": 0.0458984375, + "delta_ref_ppl": -0.043212890625, + "entropy_loss": -0.0257568359375, + "epoch": 0.3042, + "grad_norm": 0.5063367594621215, + "k1_kl": 0.04345703125, + "k3_kl": 0.02490234375, + "kimi_kl": 0.06640625, + "learning_rate": 3.479e-07, + "loss": 0.001, + "ppl": 0.0123291015625, + "reward": 0.9826672673225403, + "reward_std": 0.0007133110193535686, + "rewards/perpo_ocr_edit_distance_reward": 0.9826672673225403, "step": 1521, "temperature": 0.9 }, { - "advantages": -2.4608204967080383e-06, - "completion_length": 540.0, - "delta_ref_entropy_loss": 0.0562744140625, - "delta_ref_ppl": -0.0784912109375, - "entropy_loss": -0.0340576171875, - "epoch": 0.6088, - "grad_norm": 0.2627127743298425, - "k1_kl": 0.0780029296875, - "k3_kl": 0.052032470703125, - "kimi_kl": 0.19110107421875, - "learning_rate": 1.9559999999999998e-07, - "loss": 0.0021, - "ppl": 0.014312744140625, - "reward": 0.9855944514274597, - "reward_std": 0.0008134819800034165, - "rewards/perpo_ocr_edit_distance_reward": 0.9855944216251373, + "advantages": -1.5803747373865917e-05, + "completion_length": 771.0, + "delta_ref_entropy_loss": 0.0771484375, + "delta_ref_ppl": -0.078125, + "entropy_loss": -0.04248046875, + "epoch": 0.3044, + "grad_norm": 0.4947937760476813, + "k1_kl": 0.07763671875, + "k3_kl": 0.043701171875, + "kimi_kl": 0.11669921875, + "learning_rate": 3.478e-07, + "loss": 0.0018, + "ppl": 0.0185546875, + "reward": 0.9908447265625, + "reward_std": 0.0009767048759385943, + "rewards/perpo_ocr_edit_distance_reward": 0.9908447861671448, "step": 1522, "temperature": 0.9 }, { - "advantages": -2.8282404855417553e-05, - "completion_length": 565.0, - "delta_ref_entropy_loss": 0.068359375, - "delta_ref_ppl": -0.051727294921875, - "entropy_loss": -0.0777587890625, - "epoch": 0.6092, - "grad_norm": 1.0874981873545477, - "k1_kl": 0.051483154296875, - "k3_kl": 0.03338623046875, - "kimi_kl": 0.0840911865234375, - "learning_rate": 1.954e-07, - "loss": 0.0014, - "ppl": 0.0431060791015625, - "reward": 0.9241521656513214, - "reward_std": 0.0013720109564019367, - "rewards/perpo_ocr_edit_distance_reward": 0.9241522252559662, + "advantages": -0.0001846041122917086, + "completion_length": 742.0, + "delta_ref_entropy_loss": 0.0732421875, + "delta_ref_ppl": -0.068359375, + "entropy_loss": -0.04541015625, + "epoch": 0.3046, + "grad_norm": 0.6014406511178076, + "k1_kl": 0.068359375, + "k3_kl": 0.035400390625, + "kimi_kl": 0.09326171875, + "learning_rate": 3.4769999999999997e-07, + "loss": 0.0016, + "ppl": 0.0142822265625, + "reward": 0.9810212254524231, + "reward_std": 0.0006380483391694725, + "rewards/perpo_ocr_edit_distance_reward": 0.9810214042663574, "step": 1523, "temperature": 0.9 }, { - "advantages": -1.7740896770312986e-05, - "completion_length": 899.0, - "delta_ref_entropy_loss": 0.0257568359375, - "delta_ref_ppl": -0.020050048828125, - "entropy_loss": -0.0289306640625, - "epoch": 0.6096, - "grad_norm": 2.7140349887967217, - "k1_kl": 0.02008056640625, - "k3_kl": 0.0107269287109375, - "kimi_kl": 0.023101806640625, - "learning_rate": 1.952e-07, - "loss": 0.0004, - "ppl": 0.01226806640625, - "reward": 0.9989030361175537, - "reward_std": 0.000622000967268832, - "rewards/perpo_ocr_edit_distance_reward": 0.9989030361175537, + "advantages": -0.0002333990269107744, + "completion_length": 522.0, + "delta_ref_entropy_loss": 0.0546875, + "delta_ref_ppl": -0.06396484375, + "entropy_loss": -0.019287109375, + "epoch": 0.3048, + "grad_norm": 0.32320726539863537, + "k1_kl": 0.0634765625, + "k3_kl": 0.038330078125, + "kimi_kl": 0.130859375, + "learning_rate": 3.476e-07, + "loss": 0.0018, + "ppl": 0.006805419921875, + "reward": 0.9969664216041565, + "reward_std": 0.0001918769266922027, + "rewards/perpo_ocr_edit_distance_reward": 0.996966540813446, "step": 1524, "temperature": 0.9 }, { - "advantages": -8.743150101508945e-05, - "completion_length": 1102.5, - "delta_ref_entropy_loss": 0.0172119140625, - "delta_ref_ppl": -0.014892578125, - "entropy_loss": -0.0238037109375, - "epoch": 0.61, - "grad_norm": 0.547286099270771, - "k1_kl": 0.014892578125, - "k3_kl": 0.0098114013671875, - "kimi_kl": 0.020294189453125, - "learning_rate": 1.9499999999999999e-07, - "loss": 0.0005, - "ppl": 0.01214599609375, - "reward": 0.9985470473766327, - "reward_std": 0.00037080397305544466, - "rewards/perpo_ocr_edit_distance_reward": 0.9985471069812775, + "advantages": -3.015995207533706e-05, + "completion_length": 266.0, + "delta_ref_entropy_loss": 0.080078125, + "delta_ref_ppl": -0.12158203125, + "entropy_loss": -0.056396484375, + "epoch": 0.305, + "grad_norm": 1.066757539966869, + "k1_kl": 0.12109375, + "k3_kl": 0.08642578125, + "kimi_kl": 0.265625, + "learning_rate": 3.4749999999999996e-07, + "loss": 0.0035, + "ppl": 0.0208740234375, + "reward": 0.9429693222045898, + "reward_std": 0.001029851147904992, + "rewards/perpo_ocr_edit_distance_reward": 0.9429693818092346, "step": 1525, "temperature": 0.9 }, { - "advantages": -5.313328529155115e-06, - "completion_length": 243.0, - "delta_ref_entropy_loss": 0.09356689453125, - "delta_ref_ppl": -0.088134765625, - "entropy_loss": -0.1007080078125, - "epoch": 0.6104, - "grad_norm": 1.8348426662648885, - "k1_kl": 0.0880126953125, - "k3_kl": 0.047119140625, - "kimi_kl": 0.093505859375, - "learning_rate": 1.948e-07, - "loss": 0.0019, - "ppl": 0.058929443359375, - "reward": 0.9324395954608917, - "reward_std": 0.0027543846517801285, - "rewards/perpo_ocr_edit_distance_reward": 0.9324396252632141, + "advantages": -6.301062512648059e-06, + "completion_length": 437.0, + "delta_ref_entropy_loss": 0.1357421875, + "delta_ref_ppl": -0.134765625, + "entropy_loss": -0.1494140625, + "epoch": 0.3052, + "grad_norm": 1.6544450779675182, + "k1_kl": 0.134765625, + "k3_kl": 0.0791015625, + "kimi_kl": 0.1982421875, + "learning_rate": 3.4739999999999995e-07, + "loss": 0.0032, + "ppl": 0.07568359375, + "reward": 0.9536901116371155, + "reward_std": 0.0026107945013791323, + "rewards/perpo_ocr_edit_distance_reward": 0.9536901712417603, "step": 1526, "temperature": 0.9 }, { - "advantages": 2.6685851992169773e-05, - "completion_length": 1145.0, - "delta_ref_entropy_loss": 0.0572509765625, - "delta_ref_ppl": -0.03558349609375, - "entropy_loss": -0.0963134765625, - "epoch": 0.6108, - "grad_norm": 5.53796872616578, - "k1_kl": 0.03546142578125, - "k3_kl": 0.0570068359375, - "kimi_kl": 0.05157470703125, - "learning_rate": 1.9459999999999998e-07, - "loss": 0.0022, - "ppl": 0.052001953125, - "reward": 0.8801741898059845, - "reward_std": 0.04481078125536442, - "rewards/perpo_ocr_edit_distance_reward": 0.8801741898059845, + "advantages": -8.854696352500468e-05, + "completion_length": 473.0, + "delta_ref_entropy_loss": 0.0625, + "delta_ref_ppl": -0.056396484375, + "entropy_loss": -0.025146484375, + "epoch": 0.3054, + "grad_norm": 0.2855978701563169, + "k1_kl": 0.05615234375, + "k3_kl": 0.0284423828125, + "kimi_kl": 0.0576171875, + "learning_rate": 3.473e-07, + "loss": 0.0012, + "ppl": 0.00640869140625, + "reward": 0.9999287724494934, + "reward_std": 0.0001885129458969459, + "rewards/perpo_ocr_edit_distance_reward": 0.9999287724494934, "step": 1527, "temperature": 0.9 }, { - "advantages": -1.6817025425552856e-05, - "completion_length": 525.0, - "delta_ref_entropy_loss": 0.0762939453125, - "delta_ref_ppl": -0.0560302734375, - "entropy_loss": -0.091064453125, - "epoch": 0.6112, - "grad_norm": 0.819639060911115, - "k1_kl": 0.055908203125, - "k3_kl": 0.031005859375, - "kimi_kl": 0.0889892578125, - "learning_rate": 1.944e-07, - "loss": 0.0013, - "ppl": 0.050537109375, - "reward": 0.7737466096878052, - "reward_std": 0.0033106948249042034, - "rewards/perpo_ocr_edit_distance_reward": 0.7737466990947723, + "advantages": -6.335122634482104e-06, + "completion_length": 750.0, + "delta_ref_entropy_loss": 0.1279296875, + "delta_ref_ppl": -0.11474609375, + "entropy_loss": -0.181640625, + "epoch": 0.3056, + "grad_norm": 1.348882193488194, + "k1_kl": 0.1142578125, + "k3_kl": 0.06787109375, + "kimi_kl": 0.2041015625, + "learning_rate": 3.472e-07, + "loss": 0.0027, + "ppl": 0.0966796875, + "reward": 0.9224302172660828, + "reward_std": 0.006607783958315849, + "rewards/perpo_ocr_edit_distance_reward": 0.9224303364753723, "step": 1528, "temperature": 0.9 }, { - "advantages": -3.6614283544622594e-05, - "completion_length": 417.0, - "delta_ref_entropy_loss": 0.065185546875, - "delta_ref_ppl": -0.0587158203125, - "entropy_loss": -0.080810546875, - "epoch": 0.6116, - "grad_norm": 1.0031371025847746, - "k1_kl": 0.058837890625, - "k3_kl": 0.0367431640625, - "kimi_kl": 0.118408203125, - "learning_rate": 1.942e-07, - "loss": 0.0015, - "ppl": 0.0440673828125, - "reward": 0.9531998932361603, - "reward_std": 0.0022921400668565184, - "rewards/perpo_ocr_edit_distance_reward": 0.953199952840805, + "advantages": -4.087175966560608e-07, + "completion_length": 241.0, + "delta_ref_entropy_loss": 0.11083984375, + "delta_ref_ppl": -0.2080078125, + "entropy_loss": -0.10986328125, + "epoch": 0.3058, + "grad_norm": 5.692988615139284, + "k1_kl": 0.2080078125, + "k3_kl": 0.1552734375, + "kimi_kl": 0.68359375, + "learning_rate": 3.471e-07, + "loss": 0.0062, + "ppl": 0.049560546875, + "reward": 0.6885038018226624, + "reward_std": 0.1446073204278946, + "rewards/perpo_ocr_edit_distance_reward": 0.6885038614273071, "step": 1529, "temperature": 0.9 }, { - "advantages": -4.120171251997817e-05, - "completion_length": 290.5, - "delta_ref_entropy_loss": 0.03387451171875, - "delta_ref_ppl": -0.0390625, - "entropy_loss": -0.0216064453125, - "epoch": 0.612, - "grad_norm": 0.7153496401120434, - "k1_kl": 0.0390625, - "k3_kl": 0.02557373046875, - "kimi_kl": 0.091339111328125, - "learning_rate": 1.94e-07, - "loss": 0.0011, - "ppl": 0.0118408203125, - "reward": 0.998432070016861, - "reward_std": 0.0007374017586698756, - "rewards/perpo_ocr_edit_distance_reward": 0.9984320998191833, + "advantages": -6.973743893468054e-06, + "completion_length": 505.0, + "delta_ref_entropy_loss": 0.0537109375, + "delta_ref_ppl": -0.06201171875, + "entropy_loss": -0.0341796875, + "epoch": 0.306, + "grad_norm": 0.5208861313439479, + "k1_kl": 0.061767578125, + "k3_kl": 0.043701171875, + "kimi_kl": 0.10986328125, + "learning_rate": 3.4699999999999997e-07, + "loss": 0.0018, + "ppl": 0.01513671875, + "reward": 0.9917152523994446, + "reward_std": 0.0011222257744520903, + "rewards/perpo_ocr_edit_distance_reward": 0.9917153120040894, "step": 1530, "temperature": 0.9 }, { - "advantages": -5.921721640333999e-05, - "completion_length": 393.0, - "delta_ref_entropy_loss": 0.0421142578125, - "delta_ref_ppl": -0.02178955078125, - "entropy_loss": -0.03076171875, - "epoch": 0.6124, - "grad_norm": 0.5811400390023523, - "k1_kl": 0.02178955078125, - "k3_kl": 0.0095062255859375, - "kimi_kl": 0.015625, - "learning_rate": 1.938e-07, - "loss": 0.0004, - "ppl": 0.01568603515625, - "reward": 0.9723663032054901, - "reward_std": 0.0005757809849455953, - "rewards/perpo_ocr_edit_distance_reward": 0.9723663926124573, + "advantages": -4.368169356894214e-06, + "completion_length": 1019.0, + "delta_ref_entropy_loss": 0.123046875, + "delta_ref_ppl": -0.10791015625, + "entropy_loss": -0.1845703125, + "epoch": 0.3062, + "grad_norm": 2.2659178695081925, + "k1_kl": 0.107421875, + "k3_kl": 0.06005859375, + "kimi_kl": 0.1279296875, + "learning_rate": 3.4689999999999996e-07, + "loss": 0.0024, + "ppl": 0.10009765625, + "reward": 0.946382462978363, + "reward_std": 0.011616707779467106, + "rewards/perpo_ocr_edit_distance_reward": 0.9463825821876526, "step": 1531, "temperature": 0.9 }, { - "advantages": -4.19999896621448e-05, - "completion_length": 768.0, - "delta_ref_entropy_loss": 0.0904541015625, - "delta_ref_ppl": -0.062652587890625, - "entropy_loss": -0.12689208984375, - "epoch": 0.6128, - "grad_norm": 0.9973640590664753, - "k1_kl": 0.062408447265625, - "k3_kl": 0.034423828125, - "kimi_kl": 0.082550048828125, - "learning_rate": 1.9359999999999999e-07, - "loss": 0.0014, - "ppl": 0.073638916015625, - "reward": 0.9263454973697662, - "reward_std": 0.0013893037466914393, - "rewards/perpo_ocr_edit_distance_reward": 0.9263455271720886, + "advantages": -2.895082786835701e-07, + "completion_length": 569.0, + "delta_ref_entropy_loss": 0.0791015625, + "delta_ref_ppl": -0.08251953125, + "entropy_loss": -0.06396484375, + "epoch": 0.3064, + "grad_norm": 2.581465241272757, + "k1_kl": 0.0830078125, + "k3_kl": 0.049560546875, + "kimi_kl": 0.11328125, + "learning_rate": 3.4679999999999996e-07, + "loss": 0.002, + "ppl": 0.0291748046875, + "reward": 0.9082605838775635, + "reward_std": 0.08636725693941116, + "rewards/perpo_ocr_edit_distance_reward": 0.9082606434822083, "step": 1532, "temperature": 0.9 }, { - "advantages": 1.9669532775878906e-06, - "completion_length": 711.5, - "delta_ref_entropy_loss": 0.06005859375, - "delta_ref_ppl": -0.032196044921875, - "entropy_loss": -0.071533203125, - "epoch": 0.6132, - "grad_norm": 0.8978544055830451, - "k1_kl": 0.032196044921875, - "k3_kl": 0.0174102783203125, - "kimi_kl": 0.03582763671875, - "learning_rate": 1.9339999999999997e-07, - "loss": 0.0007, - "ppl": 0.0411376953125, - "reward": 0.95817431807518, - "reward_std": 0.002337850572075695, - "rewards/perpo_ocr_edit_distance_reward": 0.9581743776798248, + "advantages": -5.755680103902705e-05, + "completion_length": 389.0, + "delta_ref_entropy_loss": 0.060791015625, + "delta_ref_ppl": -0.083984375, + "entropy_loss": -0.03125, + "epoch": 0.3066, + "grad_norm": 0.7892685659062729, + "k1_kl": 0.083984375, + "k3_kl": 0.056396484375, + "kimi_kl": 0.2138671875, + "learning_rate": 3.467e-07, + "loss": 0.0023, + "ppl": 0.01300048828125, + "reward": 0.9965083599090576, + "reward_std": 0.0009357518283650279, + "rewards/perpo_ocr_edit_distance_reward": 0.9965084791183472, "step": 1533, "temperature": 0.9 }, { - "advantages": -1.456056388349225e-06, - "completion_length": 394.0, - "delta_ref_entropy_loss": 0.0531005859375, - "delta_ref_ppl": -0.04522705078125, - "entropy_loss": -0.03857421875, - "epoch": 0.6136, - "grad_norm": 1.5139659763071167, - "k1_kl": 0.04522705078125, - "k3_kl": 0.0303955078125, - "kimi_kl": 0.1177978515625, - "learning_rate": 1.932e-07, - "loss": 0.0012, - "ppl": 0.0194091796875, - "reward": 0.7005039751529694, - "reward_std": 0.053801434114575386, - "rewards/perpo_ocr_edit_distance_reward": 0.7005040347576141, + "advantages": -3.174373341607861e-05, + "completion_length": 942.0, + "delta_ref_entropy_loss": 0.05126953125, + "delta_ref_ppl": -0.068359375, + "entropy_loss": -0.03955078125, + "epoch": 0.3068, + "grad_norm": 9.724807497555918, + "k1_kl": 0.068359375, + "k3_kl": 0.0439453125, + "kimi_kl": 0.1083984375, + "learning_rate": 3.466e-07, + "loss": 0.0018, + "ppl": 0.019775390625, + "reward": 0.9933390617370605, + "reward_std": 0.0015113090630620718, + "rewards/perpo_ocr_edit_distance_reward": 0.9933391809463501, "step": 1534, "temperature": 0.9 }, { - "advantages": -9.653398228692822e-05, - "completion_length": 520.0, - "delta_ref_entropy_loss": 0.0323486328125, - "delta_ref_ppl": -0.01812744140625, - "entropy_loss": -0.01776123046875, - "epoch": 0.614, - "grad_norm": 0.3744862401336765, - "k1_kl": 0.0181884765625, - "k3_kl": 0.009063720703125, - "kimi_kl": 0.015838623046875, - "learning_rate": 1.93e-07, - "loss": 0.0005, - "ppl": 0.009124755859375, - "reward": 0.9984560310840607, - "reward_std": 0.00046255465713329613, - "rewards/perpo_ocr_edit_distance_reward": 0.9984560608863831, + "advantages": -3.2867706067918334e-06, + "completion_length": 405.0, + "delta_ref_entropy_loss": 0.1875, + "delta_ref_ppl": -0.15625, + "entropy_loss": -0.1845703125, + "epoch": 0.307, + "grad_norm": 1.4532698980473087, + "k1_kl": 0.1552734375, + "k3_kl": 0.0849609375, + "kimi_kl": 0.23828125, + "learning_rate": 3.4649999999999993e-07, + "loss": 0.0034, + "ppl": 0.08984375, + "reward": 0.9037920832633972, + "reward_std": 0.0024993084371089935, + "rewards/perpo_ocr_edit_distance_reward": 0.903792142868042, "step": 1535, "temperature": 0.9 }, { - "advantages": -6.80387038300978e-05, - "completion_length": 391.0, - "delta_ref_entropy_loss": 0.0472412109375, - "delta_ref_ppl": -0.0433349609375, - "entropy_loss": -0.038330078125, - "epoch": 0.6144, - "grad_norm": 0.7153839790496674, - "k1_kl": 0.04345703125, - "k3_kl": 0.02532958984375, - "kimi_kl": 0.057373046875, - "learning_rate": 1.9279999999999998e-07, - "loss": 0.0011, - "ppl": 0.0203857421875, - "reward": 0.9362349808216095, - "reward_std": 0.0006864751921966672, - "rewards/perpo_ocr_edit_distance_reward": 0.9362350404262543, + "advantages": -1.8903187992691528e-06, + "completion_length": 535.0, + "delta_ref_entropy_loss": 0.0869140625, + "delta_ref_ppl": -0.119140625, + "entropy_loss": -0.44140625, + "epoch": 0.3072, + "grad_norm": 4.486358847317947, + "k1_kl": 0.11962890625, + "k3_kl": 0.08447265625, + "kimi_kl": 0.17578125, + "learning_rate": 3.464e-07, + "loss": 0.0034, + "ppl": 0.22265625, + "reward": 0.4986417293548584, + "reward_std": 0.027433188632130623, + "rewards/perpo_ocr_edit_distance_reward": 0.4986417889595032, "step": 1536, "temperature": 0.9 }, { - "advantages": -2.299036481190342e-07, - "completion_length": 1137.5, - "delta_ref_entropy_loss": 0.02166748046875, - "delta_ref_ppl": -0.18359375, - "entropy_loss": -0.529296875, - "epoch": 0.6148, - "grad_norm": 3.8231002995502665, - "k1_kl": 0.184326171875, - "k3_kl": 0.144287109375, - "kimi_kl": 0.4765625, - "learning_rate": 1.926e-07, - "loss": 0.0058, - "ppl": 0.2841796875, - "reward": 0.36277204751968384, - "reward_std": 0.0857729073613882, - "rewards/perpo_ocr_edit_distance_reward": 0.3627720773220062, + "advantages": -3.503050174913369e-05, + "completion_length": 748.0, + "delta_ref_entropy_loss": 0.06494140625, + "delta_ref_ppl": -0.08154296875, + "entropy_loss": -0.05078125, + "epoch": 0.3074, + "grad_norm": 0.8208496422465683, + "k1_kl": 0.08154296875, + "k3_kl": 0.052978515625, + "kimi_kl": 0.18359375, + "learning_rate": 3.4629999999999997e-07, + "loss": 0.0022, + "ppl": 0.02099609375, + "reward": 0.9196087121963501, + "reward_std": 0.0016021698247641325, + "rewards/perpo_ocr_edit_distance_reward": 0.9196088314056396, "step": 1537, "temperature": 0.9 }, { - "advantages": -0.00010314796963939443, - "completion_length": 391.0, - "delta_ref_entropy_loss": 0.01910400390625, - "delta_ref_ppl": -0.03546142578125, - "entropy_loss": -0.0244140625, - "epoch": 0.6152, - "grad_norm": 0.6416799587699803, - "k1_kl": 0.03533935546875, - "k3_kl": 0.0263671875, - "kimi_kl": 0.1103515625, - "learning_rate": 1.9239999999999998e-07, - "loss": 0.0012, - "ppl": 0.01177978515625, - "reward": 0.9984890222549438, - "reward_std": 0.0001563848927617073, - "rewards/perpo_ocr_edit_distance_reward": 0.9984890520572662, + "advantages": -4.121235633647302e-06, + "completion_length": 578.0, + "delta_ref_entropy_loss": 0.0908203125, + "delta_ref_ppl": -0.10205078125, + "entropy_loss": -0.046630859375, + "epoch": 0.3076, + "grad_norm": 0.9992689029706202, + "k1_kl": 0.10205078125, + "k3_kl": 0.062255859375, + "kimi_kl": 0.1689453125, + "learning_rate": 3.462e-07, + "loss": 0.0025, + "ppl": 0.0203857421875, + "reward": 0.9902822971343994, + "reward_std": 0.01031545177102089, + "rewards/perpo_ocr_edit_distance_reward": 0.9902823567390442, "step": 1538, "temperature": 0.9 }, { - "advantages": -6.570134974026587e-05, - "completion_length": 843.0, - "delta_ref_entropy_loss": 0.054443359375, - "delta_ref_ppl": -0.0601806640625, - "entropy_loss": -0.0494384765625, - "epoch": 0.6156, - "grad_norm": 1.0364115649251113, - "k1_kl": 0.0601806640625, - "k3_kl": 0.03729248046875, - "kimi_kl": 0.098388671875, - "learning_rate": 1.9220000000000001e-07, - "loss": 0.0016, - "ppl": 0.02691650390625, - "reward": 0.9694360196590424, - "reward_std": 0.0008051061886362731, - "rewards/perpo_ocr_edit_distance_reward": 0.9694360792636871, + "advantages": -0.0002329860581085086, + "completion_length": 597.0, + "delta_ref_entropy_loss": 0.044921875, + "delta_ref_ppl": -0.0869140625, + "entropy_loss": -0.031494140625, + "epoch": 0.3078, + "grad_norm": 1.3182253688692642, + "k1_kl": 0.08740234375, + "k3_kl": 0.0634765625, + "kimi_kl": 0.283203125, + "learning_rate": 3.461e-07, + "loss": 0.0028, + "ppl": 0.012451171875, + "reward": 0.9954342246055603, + "reward_std": 0.00041165429865941405, + "rewards/perpo_ocr_edit_distance_reward": 0.9954344034194946, "step": 1539, "temperature": 0.9 }, { - "advantages": -0.0001575946807861328, - "completion_length": 819.0, - "delta_ref_entropy_loss": 0.02496337890625, - "delta_ref_ppl": -0.0228271484375, - "entropy_loss": -0.009246826171875, - "epoch": 0.616, - "grad_norm": 0.0650376474464895, - "k1_kl": 0.022796630859375, - "k3_kl": 0.0137786865234375, - "kimi_kl": 0.03515625, - "learning_rate": 1.92e-07, - "loss": 0.0007, - "ppl": 0.003662109375, - "reward": 0.999983161687851, - "reward_std": 4.4554679334396496e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9999831914901733, + "advantages": -1.246588635694934e-05, + "completion_length": 812.0, + "delta_ref_entropy_loss": 0.0908203125, + "delta_ref_ppl": -0.09130859375, + "entropy_loss": -0.12109375, + "epoch": 0.308, + "grad_norm": 1.6894962122778066, + "k1_kl": 0.09130859375, + "k3_kl": 0.05859375, + "kimi_kl": 0.2080078125, + "learning_rate": 3.4599999999999995e-07, + "loss": 0.0024, + "ppl": 0.064453125, + "reward": 0.9846009612083435, + "reward_std": 0.004672377370297909, + "rewards/perpo_ocr_edit_distance_reward": 0.9846010208129883, "step": 1540, "temperature": 0.9 }, { - "advantages": -2.5838615336226667e-05, - "completion_length": 1278.0, - "delta_ref_entropy_loss": 0.0404052734375, - "delta_ref_ppl": -0.041229248046875, - "entropy_loss": -0.0762939453125, - "epoch": 0.6164, - "grad_norm": 0.8145268931853328, - "k1_kl": 0.04119873046875, - "k3_kl": 0.031463623046875, - "kimi_kl": 0.0828857421875, - "learning_rate": 1.9179999999999998e-07, - "loss": 0.0013, - "ppl": 0.044189453125, - "reward": 0.7916481792926788, - "reward_std": 0.0243315328261815, - "rewards/perpo_ocr_edit_distance_reward": 0.7916482388973236, + "advantages": 0.0, + "completion_length": 709.0, + "delta_ref_entropy_loss": 0.0263671875, + "delta_ref_ppl": -0.02392578125, + "entropy_loss": -0.01483154296875, + "epoch": 0.3082, + "grad_norm": 0.026406144006840288, + "k1_kl": 0.0238037109375, + "k3_kl": 0.013427734375, + "kimi_kl": 0.04296875, + "learning_rate": 3.459e-07, + "loss": 0.0005, + "ppl": 0.00384521484375, + "reward": 0.9989780187606812, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9989780187606812, "step": 1541, "temperature": 0.9 }, { - "advantages": -1.498631149843277e-06, - "completion_length": 554.0, - "delta_ref_entropy_loss": 0.043701171875, - "delta_ref_ppl": -0.0294189453125, - "entropy_loss": -0.015411376953125, - "epoch": 0.6168, - "grad_norm": 0.5157750147427113, - "k1_kl": 0.02935791015625, - "k3_kl": 0.01873779296875, - "kimi_kl": 0.06365966796875, - "learning_rate": 1.916e-07, - "loss": 0.0008, - "ppl": 0.006511688232421875, - "reward": 0.9756229817867279, - "reward_std": 0.008476867340505123, - "rewards/perpo_ocr_edit_distance_reward": 0.9756230115890503, + "advantages": 1.6519002201675903e-06, + "completion_length": 43.0, + "delta_ref_entropy_loss": 0.10888671875, + "delta_ref_ppl": -0.6171875, + "entropy_loss": -0.09033203125, + "epoch": 0.3084, + "grad_norm": 4.519449644437149, + "k1_kl": 0.6171875, + "k3_kl": 0.5234375, + "kimi_kl": 2.34375, + "learning_rate": 3.458e-07, + "loss": 0.021, + "ppl": 0.038818359375, + "reward": 0.9079577326774597, + "reward_std": 0.005073359701782465, + "rewards/perpo_ocr_edit_distance_reward": 0.9079577922821045, "step": 1542, "temperature": 0.9 }, { - "advantages": -6.199309219834248e-05, - "completion_length": 329.5, - "delta_ref_entropy_loss": 0.0413818359375, - "delta_ref_ppl": -0.03460693359375, - "entropy_loss": -0.032958984375, - "epoch": 0.6172, - "grad_norm": 0.6554210571761436, - "k1_kl": 0.03460693359375, - "k3_kl": 0.01910400390625, - "kimi_kl": 0.040283203125, - "learning_rate": 1.9139999999999998e-07, - "loss": 0.0008, - "ppl": 0.01800537109375, - "reward": 0.9989013373851776, - "reward_std": 0.00046155884774634615, - "rewards/perpo_ocr_edit_distance_reward": 0.9989013671875, + "advantages": -4.4669424823950976e-05, + "completion_length": 620.0, + "delta_ref_entropy_loss": 0.1044921875, + "delta_ref_ppl": -0.10205078125, + "entropy_loss": -0.2060546875, + "epoch": 0.3086, + "grad_norm": 9.082535103814497, + "k1_kl": 0.10205078125, + "k3_kl": 0.0625, + "kimi_kl": 0.1796875, + "learning_rate": 3.457e-07, + "loss": 0.0025, + "ppl": 0.10546875, + "reward": 0.8669329285621643, + "reward_std": 0.0021864695008844137, + "rewards/perpo_ocr_edit_distance_reward": 0.8669330477714539, "step": 1543, "temperature": 0.9 }, { - "advantages": -8.234382039518096e-05, - "completion_length": 435.0, - "delta_ref_entropy_loss": 0.04833984375, - "delta_ref_ppl": -0.04510498046875, - "entropy_loss": -0.03515625, - "epoch": 0.6176, - "grad_norm": 0.8818670455428652, - "k1_kl": 0.0447998046875, - "k3_kl": 0.027587890625, - "kimi_kl": 0.08990478515625, - "learning_rate": 1.912e-07, - "loss": 0.0012, - "ppl": 0.0128326416015625, - "reward": 0.9974089562892914, - "reward_std": 0.0005978029294055887, - "rewards/perpo_ocr_edit_distance_reward": 0.9974090158939362, + "advantages": 2.628565016493667e-05, + "completion_length": 1202.0, + "delta_ref_entropy_loss": 0.0294189453125, + "delta_ref_ppl": -0.0311279296875, + "entropy_loss": -0.031494140625, + "epoch": 0.3088, + "grad_norm": 4.499872143388611, + "k1_kl": 0.0311279296875, + "k3_kl": 0.0291748046875, + "kimi_kl": 0.043701171875, + "learning_rate": 3.456e-07, + "loss": 0.0011, + "ppl": 0.015380859375, + "reward": 0.9978805780410767, + "reward_std": 0.0005474562058225274, + "rewards/perpo_ocr_edit_distance_reward": 0.9978805780410767, "step": 1544, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 293.0, - "delta_ref_entropy_loss": 0.02825927734375, - "delta_ref_ppl": -0.0150604248046875, - "entropy_loss": -0.0264892578125, - "epoch": 0.618, - "grad_norm": 0.02553249854444744, - "k1_kl": 0.015106201171875, - "k3_kl": 0.007598876953125, - "kimi_kl": 0.01711273193359375, - "learning_rate": 1.91e-07, - "loss": 0.0006, - "ppl": 0.013427734375, - "reward": 0.9512390196323395, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9512390494346619, + "advantages": -4.938671054333099e-07, + "completion_length": 972.0, + "delta_ref_entropy_loss": 0.0869140625, + "delta_ref_ppl": -0.061279296875, + "entropy_loss": -0.310546875, + "epoch": 0.309, + "grad_norm": 121.83165789167971, + "k1_kl": 0.0615234375, + "k3_kl": 0.166015625, + "kimi_kl": 0.12890625, + "learning_rate": 3.4549999999999996e-07, + "loss": 0.0066, + "ppl": 0.181640625, + "reward": 0.49233198165893555, + "reward_std": 0.06892596930265427, + "rewards/perpo_ocr_edit_distance_reward": 0.4923320412635803, "step": 1545, "temperature": 0.9 }, { - "advantages": -3.089223901042715e-05, - "completion_length": 691.5, - "delta_ref_entropy_loss": 0.020477294921875, - "delta_ref_ppl": -0.012847900390625, - "entropy_loss": -0.02508544921875, - "epoch": 0.6184, - "grad_norm": 0.41464964183163167, - "k1_kl": 0.0128173828125, - "k3_kl": 0.00714111328125, - "kimi_kl": 0.011199951171875, - "learning_rate": 1.908e-07, - "loss": 0.0003, - "ppl": 0.013427734375, - "reward": 0.997974306344986, - "reward_std": 0.0006389411282725632, - "rewards/perpo_ocr_edit_distance_reward": 0.9979743361473083, + "advantages": -5.286080704536289e-05, + "completion_length": 286.0, + "delta_ref_entropy_loss": 0.1142578125, + "delta_ref_ppl": -0.1611328125, + "entropy_loss": -0.0546875, + "epoch": 0.3092, + "grad_norm": 0.9903042188865057, + "k1_kl": 0.1611328125, + "k3_kl": 0.1103515625, + "kimi_kl": 0.42578125, + "learning_rate": 3.4539999999999996e-07, + "loss": 0.0045, + "ppl": 0.0225830078125, + "reward": 0.9443871378898621, + "reward_std": 0.0011891901958733797, + "rewards/perpo_ocr_edit_distance_reward": 0.9443873167037964, "step": 1546, "temperature": 0.9 }, { - "advantages": -4.90120510221459e-05, - "completion_length": 777.5, - "delta_ref_entropy_loss": 0.0252685546875, - "delta_ref_ppl": -0.02667236328125, - "entropy_loss": -0.02581787109375, - "epoch": 0.6188, - "grad_norm": 0.3200302032216842, - "k1_kl": 0.026824951171875, - "k3_kl": 0.016937255859375, - "kimi_kl": 0.04595947265625, - "learning_rate": 1.9059999999999997e-07, - "loss": 0.0007, - "ppl": 0.012054443359375, - "reward": 0.9991604387760162, - "reward_std": 0.0002105900930473581, - "rewards/perpo_ocr_edit_distance_reward": 0.9991604685783386, + "advantages": -1.2704304936050903e-05, + "completion_length": 1371.0, + "delta_ref_entropy_loss": 0.0233154296875, + "delta_ref_ppl": -0.0262451171875, + "entropy_loss": -0.033935546875, + "epoch": 0.3094, + "grad_norm": 0.5924869360686278, + "k1_kl": 0.026123046875, + "k3_kl": 0.01434326171875, + "kimi_kl": 0.032470703125, + "learning_rate": 3.453e-07, + "loss": 0.0006, + "ppl": 0.01385498046875, + "reward": 0.9956756830215454, + "reward_std": 0.005263970699161291, + "rewards/perpo_ocr_edit_distance_reward": 0.9956756830215454, "step": 1547, "temperature": 0.9 }, { - "advantages": -2.3283064365386963e-06, - "completion_length": 369.0, - "delta_ref_entropy_loss": -0.0068359375, - "delta_ref_ppl": -0.1821441650390625, - "entropy_loss": -0.154205322265625, - "epoch": 0.6192, - "grad_norm": 0.24339666465203363, - "k1_kl": 0.182159423828125, - "k3_kl": 0.10858154296875, - "kimi_kl": 0.17303466796875, - "learning_rate": 1.904e-07, - "loss": 0.0043, - "ppl": 0.0315704345703125, - "reward": 0.5022807796485722, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.5022807796485722, + "advantages": 0.0, + "completion_length": 496.0, + "delta_ref_entropy_loss": 0.09423828125, + "delta_ref_ppl": -0.08154296875, + "entropy_loss": -0.09228515625, + "epoch": 0.3096, + "grad_norm": 1.0865147431815902, + "k1_kl": 0.08154296875, + "k3_kl": 0.04931640625, + "kimi_kl": 0.11962890625, + "learning_rate": 3.452e-07, + "loss": 0.002, + "ppl": 0.052001953125, + "reward": 0.96818608045578, + "reward_std": 0.0009884544415399432, + "rewards/perpo_ocr_edit_distance_reward": 0.9681861400604248, "step": 1548, "temperature": 0.9 }, { - "advantages": -0.00022642529074801132, - "completion_length": 756.0, - "delta_ref_entropy_loss": 0.02960205078125, - "delta_ref_ppl": -0.0262451171875, - "entropy_loss": -0.02471923828125, - "epoch": 0.6196, - "grad_norm": 1.057834957334971, - "k1_kl": 0.02630615234375, - "k3_kl": 0.015655517578125, - "kimi_kl": 0.0452880859375, - "learning_rate": 1.902e-07, - "loss": 0.0009, - "ppl": 0.011993408203125, - "reward": 0.9990397095680237, - "reward_std": 0.00037646804412361234, - "rewards/perpo_ocr_edit_distance_reward": 0.9990397691726685, + "advantages": -2.0963805582141504e-05, + "completion_length": 187.0, + "delta_ref_entropy_loss": 0.14453125, + "delta_ref_ppl": -0.263671875, + "entropy_loss": -0.08837890625, + "epoch": 0.3098, + "grad_norm": 2.2236500604927154, + "k1_kl": 0.263671875, + "k3_kl": 0.1953125, + "kimi_kl": 0.72265625, + "learning_rate": 3.451e-07, + "loss": 0.0078, + "ppl": 0.04248046875, + "reward": 0.9666971564292908, + "reward_std": 0.0047728451900184155, + "rewards/perpo_ocr_edit_distance_reward": 0.9666972160339355, "step": 1549, "temperature": 0.9 }, { - "advantages": -3.539664405138865e-05, - "completion_length": 1008.5, - "delta_ref_entropy_loss": 0.0452880859375, - "delta_ref_ppl": -0.04168701171875, - "entropy_loss": -0.0439453125, - "epoch": 0.62, - "grad_norm": 0.8059257514030581, - "k1_kl": 0.04156494140625, - "k3_kl": 0.02569580078125, - "kimi_kl": 0.0792236328125, - "learning_rate": 1.8999999999999998e-07, + "advantages": -1.6076224710559472e-05, + "completion_length": 827.0, + "delta_ref_entropy_loss": 0.053955078125, + "delta_ref_ppl": -0.051513671875, + "entropy_loss": -0.03466796875, + "epoch": 0.31, + "grad_norm": 0.6511382296491869, + "k1_kl": 0.051513671875, + "k3_kl": 0.02734375, + "kimi_kl": 0.060302734375, + "learning_rate": 3.45e-07, "loss": 0.0011, - "ppl": 0.0213623046875, - "reward": 0.783784806728363, - "reward_std": 0.15200599198578857, - "rewards/perpo_ocr_edit_distance_reward": 0.7837848961353302, + "ppl": 0.015380859375, + "reward": 0.9156296849250793, + "reward_std": 0.0009586805826984346, + "rewards/perpo_ocr_edit_distance_reward": 0.9156297445297241, "step": 1550, "temperature": 0.9 }, { - "advantages": -5.347388446352852e-05, - "completion_length": 674.5, - "delta_ref_entropy_loss": 0.0234375, - "delta_ref_ppl": -0.0345458984375, - "entropy_loss": -0.03741455078125, - "epoch": 0.6204, - "grad_norm": 0.6178933292434211, - "k1_kl": 0.034423828125, - "k3_kl": 0.02349853515625, - "kimi_kl": 0.07763671875, - "learning_rate": 1.898e-07, - "loss": 0.001, - "ppl": 0.018157958984375, - "reward": 0.6433026343584061, - "reward_std": 0.01178690989036113, - "rewards/perpo_ocr_edit_distance_reward": 0.6433026790618896, + "advantages": -1.863922443590127e-05, + "completion_length": 744.0, + "delta_ref_entropy_loss": 0.046630859375, + "delta_ref_ppl": -0.0390625, + "entropy_loss": -0.0205078125, + "epoch": 0.3102, + "grad_norm": 0.35509761780604676, + "k1_kl": 0.0390625, + "k3_kl": 0.0184326171875, + "kimi_kl": 0.03466796875, + "learning_rate": 3.4489999999999997e-07, + "loss": 0.0008, + "ppl": 0.00677490234375, + "reward": 0.9819690585136414, + "reward_std": 0.00035686319461092353, + "rewards/perpo_ocr_edit_distance_reward": 0.9819691181182861, "step": 1551, "temperature": 0.9 }, { - "advantages": 1.800911923055537e-05, - "completion_length": 467.0, - "delta_ref_entropy_loss": 0.04425048828125, - "delta_ref_ppl": -0.0631103515625, - "entropy_loss": -0.02496337890625, - "epoch": 0.6208, - "grad_norm": 0.24284902931463623, - "k1_kl": 0.0631103515625, - "k3_kl": 0.0428466796875, - "kimi_kl": 0.14697265625, - "learning_rate": 1.8959999999999998e-07, - "loss": 0.0017, - "ppl": 0.011138916015625, - "reward": 0.9795311689376831, - "reward_std": 0.00018644632655195892, - "rewards/perpo_ocr_edit_distance_reward": 0.9795311689376831, + "advantages": -2.0895686247968115e-05, + "completion_length": 628.0, + "delta_ref_entropy_loss": 0.1083984375, + "delta_ref_ppl": -0.1171875, + "entropy_loss": -0.07373046875, + "epoch": 0.3104, + "grad_norm": 0.7368132091019458, + "k1_kl": 0.11669921875, + "k3_kl": 0.07568359375, + "kimi_kl": 0.255859375, + "learning_rate": 3.4479999999999996e-07, + "loss": 0.0031, + "ppl": 0.03125, + "reward": 0.8914586305618286, + "reward_std": 0.002343685133382678, + "rewards/perpo_ocr_edit_distance_reward": 0.8914586305618286, "step": 1552, "temperature": 0.9 }, { - "advantages": -7.2802820909601e-06, - "completion_length": 448.0, - "delta_ref_entropy_loss": 0.0491943359375, - "delta_ref_ppl": -0.04803466796875, - "entropy_loss": -0.0416259765625, - "epoch": 0.6212, - "grad_norm": 0.9995944446469651, - "k1_kl": 0.04827880859375, - "k3_kl": 0.0289306640625, - "kimi_kl": 0.06689453125, - "learning_rate": 1.8940000000000002e-07, - "loss": 0.0012, - "ppl": 0.0233154296875, - "reward": 0.939052164554596, - "reward_std": 0.011085973004810512, - "rewards/perpo_ocr_edit_distance_reward": 0.939052164554596, + "advantages": -7.322856845348724e-07, + "completion_length": 611.0, + "delta_ref_entropy_loss": 0.11376953125, + "delta_ref_ppl": -0.11474609375, + "entropy_loss": -0.16015625, + "epoch": 0.3106, + "grad_norm": 6.1551646365378465, + "k1_kl": 0.115234375, + "k3_kl": 0.07373046875, + "kimi_kl": 0.15234375, + "learning_rate": 3.447e-07, + "loss": 0.003, + "ppl": 0.080078125, + "reward": 0.6938909292221069, + "reward_std": 0.09304474294185638, + "rewards/perpo_ocr_edit_distance_reward": 0.6938909888267517, "step": 1553, "temperature": 0.9 }, { - "advantages": -4.317079401516821e-06, - "completion_length": 454.5, - "delta_ref_entropy_loss": 0.03460693359375, - "delta_ref_ppl": -0.042144775390625, - "entropy_loss": -0.037109375, - "epoch": 0.6216, - "grad_norm": 1.2890204152840945, - "k1_kl": 0.042388916015625, - "k3_kl": 0.030487060546875, - "kimi_kl": 0.0982666015625, - "learning_rate": 1.892e-07, - "loss": 0.0012, - "ppl": 0.02166748046875, - "reward": 0.9838171899318695, - "reward_std": 0.016274301015073434, - "rewards/perpo_ocr_edit_distance_reward": 0.9838171601295471, + "advantages": -3.065381861233618e-06, + "completion_length": 44.0, + "delta_ref_entropy_loss": 0.1201171875, + "delta_ref_ppl": -0.578125, + "entropy_loss": -0.10498046875, + "epoch": 0.3108, + "grad_norm": 4.709916525115992, + "k1_kl": 0.578125, + "k3_kl": 0.470703125, + "kimi_kl": 2.28125, + "learning_rate": 3.446e-07, + "loss": 0.0188, + "ppl": 0.041015625, + "reward": 0.978863000869751, + "reward_std": 0.005454310216009617, + "rewards/perpo_ocr_edit_distance_reward": 0.978863000869751, "step": 1554, "temperature": 0.9 }, { - "advantages": -5.846364365424961e-05, - "completion_length": 1215.5, - "delta_ref_entropy_loss": 0.01959228515625, - "delta_ref_ppl": -0.01275634765625, - "entropy_loss": -0.032073974609375, - "epoch": 0.622, - "grad_norm": 1.7713746154679741, - "k1_kl": 0.0127716064453125, - "k3_kl": 0.00896453857421875, - "kimi_kl": 0.0141143798828125, - "learning_rate": 1.8899999999999999e-07, - "loss": 0.0004, - "ppl": 0.0154876708984375, - "reward": 0.998136967420578, - "reward_std": 0.00042325418326072395, - "rewards/perpo_ocr_edit_distance_reward": 0.9981370270252228, + "advantages": -4.572527814161731e-06, + "completion_length": 452.0, + "delta_ref_entropy_loss": 0.06396484375, + "delta_ref_ppl": -0.07568359375, + "entropy_loss": -0.11083984375, + "epoch": 0.311, + "grad_norm": 2.1142244152249705, + "k1_kl": 0.0751953125, + "k3_kl": 0.047607421875, + "kimi_kl": 0.1171875, + "learning_rate": 3.4449999999999994e-07, + "loss": 0.0019, + "ppl": 0.05224609375, + "reward": 0.9755722284317017, + "reward_std": 0.01107757817953825, + "rewards/perpo_ocr_edit_distance_reward": 0.9755722880363464, "step": 1555, "temperature": 0.9 }, { - "advantages": -9.877341629049852e-07, - "completion_length": 231.0, - "delta_ref_entropy_loss": 0.09033203125, - "delta_ref_ppl": -0.118896484375, - "entropy_loss": -0.1055908203125, - "epoch": 0.6224, - "grad_norm": 1.6922491223033416, - "k1_kl": 0.118896484375, - "k3_kl": 0.0880126953125, - "kimi_kl": 0.34716796875, - "learning_rate": 1.888e-07, - "loss": 0.0035, - "ppl": 0.0506591796875, - "reward": 0.6345645040273666, - "reward_std": 0.10449833981692791, - "rewards/perpo_ocr_edit_distance_reward": 0.6345645636320114, + "advantages": -0.0001404498325427994, + "completion_length": 510.0, + "delta_ref_entropy_loss": 0.04833984375, + "delta_ref_ppl": -0.0751953125, + "entropy_loss": -0.03125, + "epoch": 0.3112, + "grad_norm": 0.6800123608755679, + "k1_kl": 0.0751953125, + "k3_kl": 0.050048828125, + "kimi_kl": 0.15625, + "learning_rate": 3.444e-07, + "loss": 0.0021, + "ppl": 0.01531982421875, + "reward": 0.9961665868759155, + "reward_std": 0.00044563887058757246, + "rewards/perpo_ocr_edit_distance_reward": 0.9961667060852051, "step": 1556, "temperature": 0.9 }, { - "advantages": -9.304711056756787e-06, - "completion_length": 705.0, - "delta_ref_entropy_loss": 0.0501708984375, - "delta_ref_ppl": -0.0394287109375, - "entropy_loss": -0.0537109375, - "epoch": 0.6228, - "grad_norm": 0.6312538089546401, - "k1_kl": 0.03936767578125, - "k3_kl": 0.0242919921875, - "kimi_kl": 0.06982421875, - "learning_rate": 1.8859999999999998e-07, - "loss": 0.001, - "ppl": 0.030975341796875, - "reward": 0.9812949299812317, - "reward_std": 0.0017802055226638913, - "rewards/perpo_ocr_edit_distance_reward": 0.9812949597835541, + "advantages": -2.384185791015625e-06, + "completion_length": 97.0, + "delta_ref_entropy_loss": 0.154296875, + "delta_ref_ppl": -0.43359375, + "entropy_loss": -0.28515625, + "epoch": 0.3114, + "grad_norm": 3.90146780354289, + "k1_kl": 0.43359375, + "k3_kl": 0.34765625, + "kimi_kl": 1.84375, + "learning_rate": 3.443e-07, + "loss": 0.0139, + "ppl": 0.10498046875, + "reward": 0.6770182251930237, + "reward_std": 0.028381818905472755, + "rewards/perpo_ocr_edit_distance_reward": 0.6770183444023132, "step": 1557, "temperature": 0.9 }, { - "advantages": -1.7655747797107324e-05, - "completion_length": 775.0, - "delta_ref_entropy_loss": 0.0416259765625, - "delta_ref_ppl": -0.03094482421875, - "entropy_loss": -0.03363037109375, - "epoch": 0.6232, - "grad_norm": 0.6776058925656081, - "k1_kl": 0.0308837890625, - "k3_kl": 0.016845703125, - "kimi_kl": 0.039337158203125, - "learning_rate": 1.884e-07, - "loss": 0.0007, - "ppl": 0.0155029296875, - "reward": 0.9973916709423065, - "reward_std": 0.0022906879894435406, - "rewards/perpo_ocr_edit_distance_reward": 0.9973916709423065, + "advantages": -5.66414455533959e-05, + "completion_length": 172.0, + "delta_ref_entropy_loss": 0.078125, + "delta_ref_ppl": -0.240234375, + "entropy_loss": -0.03857421875, + "epoch": 0.3116, + "grad_norm": 0.7567963991873636, + "k1_kl": 0.240234375, + "k3_kl": 0.1923828125, + "kimi_kl": 0.8515625, + "learning_rate": 3.4419999999999997e-07, + "loss": 0.0078, + "ppl": 0.01226806640625, + "reward": 0.9847290515899658, + "reward_std": 0.0006516575813293457, + "rewards/perpo_ocr_edit_distance_reward": 0.9847291111946106, "step": 1558, "temperature": 0.9 }, { - "advantages": -0.00012177654934930615, - "completion_length": 484.5, - "delta_ref_entropy_loss": 0.04266357421875, - "delta_ref_ppl": -0.04107666015625, - "entropy_loss": -0.03375244140625, - "epoch": 0.6236, - "grad_norm": 1.0107669141539, - "k1_kl": 0.041015625, - "k3_kl": 0.0264892578125, - "kimi_kl": 0.06591796875, - "learning_rate": 1.882e-07, - "loss": 0.0012, - "ppl": 0.016876220703125, - "reward": 0.992087334394455, - "reward_std": 0.0012225423779455014, - "rewards/perpo_ocr_edit_distance_reward": 0.9920873641967773, + "advantages": -0.00041699278517626226, + "completion_length": 496.0, + "delta_ref_entropy_loss": 0.04248046875, + "delta_ref_ppl": -0.0625, + "entropy_loss": -0.0216064453125, + "epoch": 0.3118, + "grad_norm": 0.015886321153143975, + "k1_kl": 0.06298828125, + "k3_kl": 0.04150390625, + "kimi_kl": 0.1298828125, + "learning_rate": 3.441e-07, + "loss": 0.0021, + "ppl": 0.00604248046875, + "reward": 0.9971105456352234, + "reward_std": 2.0995441900595324e-06, + "rewards/perpo_ocr_edit_distance_reward": 0.9971106648445129, "step": 1559, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 484.5, - "delta_ref_entropy_loss": 0.0224609375, - "delta_ref_ppl": -0.01641845703125, - "entropy_loss": -0.02105712890625, - "epoch": 0.624, - "grad_norm": 0.019465118165781142, - "k1_kl": 0.016448974609375, - "k3_kl": 0.0085601806640625, - "kimi_kl": 0.017852783203125, - "learning_rate": 1.88e-07, - "loss": 0.0003, - "ppl": 0.010223388671875, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": -7.459095741069177e-06, + "completion_length": 424.0, + "delta_ref_entropy_loss": 0.10302734375, + "delta_ref_ppl": -0.11865234375, + "entropy_loss": -0.08251953125, + "epoch": 0.312, + "grad_norm": 1.3712437638495323, + "k1_kl": 0.11865234375, + "k3_kl": 0.0751953125, + "kimi_kl": 0.251953125, + "learning_rate": 3.4399999999999996e-07, + "loss": 0.003, + "ppl": 0.040283203125, + "reward": 0.969549834728241, + "reward_std": 0.0010423845378682017, + "rewards/perpo_ocr_edit_distance_reward": 0.9695498943328857, "step": 1560, "temperature": 0.9 }, { - "advantages": -1.640405025682412e-05, - "completion_length": 456.0, - "delta_ref_entropy_loss": 0.06005859375, - "delta_ref_ppl": -0.04840087890625, - "entropy_loss": -0.0423583984375, - "epoch": 0.6244, - "grad_norm": 0.8920439744747959, - "k1_kl": 0.048431396484375, - "k3_kl": 0.032745361328125, - "kimi_kl": 0.1574859619140625, - "learning_rate": 1.8779999999999997e-07, - "loss": 0.0013, - "ppl": 0.021759033203125, - "reward": 0.9875039160251617, - "reward_std": 0.005031436419812962, - "rewards/perpo_ocr_edit_distance_reward": 0.9875040054321289, + "advantages": -1.0899135531872162e-06, + "completion_length": 992.0, + "delta_ref_entropy_loss": 0.07275390625, + "delta_ref_ppl": -0.08203125, + "entropy_loss": -0.12109375, + "epoch": 0.3122, + "grad_norm": 1.2441887786673, + "k1_kl": 0.08203125, + "k3_kl": 0.052978515625, + "kimi_kl": 0.12060546875, + "learning_rate": 3.4389999999999995e-07, + "loss": 0.0021, + "ppl": 0.06103515625, + "reward": 0.9047282934188843, + "reward_std": 0.015735315158963203, + "rewards/perpo_ocr_edit_distance_reward": 0.9047282934188843, "step": 1561, "temperature": 0.9 }, { - "advantages": -0.00010982581898133503, - "completion_length": 706.0, - "delta_ref_entropy_loss": 0.080810546875, - "delta_ref_ppl": -0.05767822265625, - "entropy_loss": -0.0994873046875, - "epoch": 0.6248, - "grad_norm": 0.9997912413783254, - "k1_kl": 0.05743408203125, - "k3_kl": 0.033050537109375, - "kimi_kl": 0.09375, - "learning_rate": 1.8759999999999999e-07, - "loss": 0.0014, - "ppl": 0.0558929443359375, - "reward": 0.8764782845973969, - "reward_std": 0.003014158981386572, - "rewards/perpo_ocr_edit_distance_reward": 0.876478374004364, + "advantages": -8.97475729288999e-06, + "completion_length": 612.0, + "delta_ref_entropy_loss": 0.053466796875, + "delta_ref_ppl": -0.0625, + "entropy_loss": -0.03759765625, + "epoch": 0.3124, + "grad_norm": 3.6508767261674184, + "k1_kl": 0.0625, + "k3_kl": 0.04052734375, + "kimi_kl": 0.166015625, + "learning_rate": 3.438e-07, + "loss": 0.0016, + "ppl": 0.01373291015625, + "reward": 0.9971147179603577, + "reward_std": 0.0036946346517652273, + "rewards/perpo_ocr_edit_distance_reward": 0.9971147775650024, "step": 1562, "temperature": 0.9 }, { - "advantages": -2.4650778414070373e-06, - "completion_length": 504.0, - "delta_ref_entropy_loss": 0.0657958984375, - "delta_ref_ppl": -0.04217529296875, - "entropy_loss": -0.0760498046875, - "epoch": 0.6252, - "grad_norm": 0.8386937420574558, - "k1_kl": 0.04217529296875, - "k3_kl": 0.020721435546875, - "kimi_kl": 0.0396728515625, - "learning_rate": 1.874e-07, - "loss": 0.0008, - "ppl": 0.039825439453125, - "reward": 0.9402806460857391, - "reward_std": 0.0034055416472256184, - "rewards/perpo_ocr_edit_distance_reward": 0.9402806758880615, + "advantages": -5.211149073147681e-06, + "completion_length": 51.0, + "delta_ref_entropy_loss": 0.1171875, + "delta_ref_ppl": -0.73828125, + "entropy_loss": -0.09228515625, + "epoch": 0.3126, + "grad_norm": 4.029333851291017, + "k1_kl": 0.73828125, + "k3_kl": 0.62109375, + "kimi_kl": 3.09375, + "learning_rate": 3.437e-07, + "loss": 0.0249, + "ppl": 0.0306396484375, + "reward": 0.7072484493255615, + "reward_std": 0.004807405173778534, + "rewards/perpo_ocr_edit_distance_reward": 0.7072484493255615, "step": 1563, "temperature": 0.9 }, { - "advantages": -1.0311603773516254e-05, - "completion_length": 1154.0, - "delta_ref_entropy_loss": 0.096435546875, - "delta_ref_ppl": -0.08251953125, - "entropy_loss": -0.1259765625, - "epoch": 0.6256, - "grad_norm": 1.9414382902929395, - "k1_kl": 0.08251953125, - "k3_kl": 0.0672607421875, - "kimi_kl": 0.1513671875, - "learning_rate": 1.872e-07, + "advantages": -8.259501555585302e-06, + "completion_length": 526.0, + "delta_ref_entropy_loss": 0.0400390625, + "delta_ref_ppl": -0.0908203125, + "entropy_loss": -0.044189453125, + "epoch": 0.3128, + "grad_norm": 0.8507938626413453, + "k1_kl": 0.09033203125, + "k3_kl": 0.0673828125, + "kimi_kl": 0.220703125, + "learning_rate": 3.436e-07, "loss": 0.0027, - "ppl": 0.06787109375, - "reward": 0.9013842046260834, - "reward_std": 0.01026983861811459, - "rewards/perpo_ocr_edit_distance_reward": 0.9013842940330505, + "ppl": 0.020751953125, + "reward": 0.972958505153656, + "reward_std": 0.0081584881991148, + "rewards/perpo_ocr_edit_distance_reward": 0.9729586243629456, "step": 1564, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 621.0, - "delta_ref_entropy_loss": 0.02044677734375, - "delta_ref_ppl": -0.014007568359375, - "entropy_loss": -0.01800537109375, - "epoch": 0.626, - "grad_norm": 0.04861724664324108, - "k1_kl": 0.014068603515625, - "k3_kl": 0.0088348388671875, - "kimi_kl": 0.0182037353515625, - "learning_rate": 1.87e-07, - "loss": 0.0007, - "ppl": 0.0087890625, - "reward": 0.9988751113414764, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9988751709461212, + "advantages": -2.060617771348916e-05, + "completion_length": 546.0, + "delta_ref_entropy_loss": 0.01611328125, + "delta_ref_ppl": -0.04541015625, + "entropy_loss": -0.03076171875, + "epoch": 0.313, + "grad_norm": 0.6894263999978717, + "k1_kl": 0.04541015625, + "k3_kl": 0.03564453125, + "kimi_kl": 0.119140625, + "learning_rate": 3.435e-07, + "loss": 0.0014, + "ppl": 0.0145263671875, + "reward": 0.9947324991226196, + "reward_std": 0.004031137563288212, + "rewards/perpo_ocr_edit_distance_reward": 0.9947326183319092, "step": 1565, "temperature": 0.9 }, { - "advantages": -7.408006268860845e-06, - "completion_length": 541.0, - "delta_ref_entropy_loss": 0.063720703125, - "delta_ref_ppl": -0.0574951171875, - "entropy_loss": -0.1064453125, - "epoch": 0.6264, - "grad_norm": 1.102973524318546, - "k1_kl": 0.0577392578125, - "k3_kl": 0.0369873046875, - "kimi_kl": 0.098876953125, - "learning_rate": 1.8679999999999998e-07, - "loss": 0.0015, - "ppl": 0.0546875, - "reward": 0.9087520241737366, - "reward_std": 0.013439883477985859, - "rewards/perpo_ocr_edit_distance_reward": 0.9087521135807037, + "advantages": -1.3010842849325854e-05, + "completion_length": 284.0, + "delta_ref_entropy_loss": 0.07861328125, + "delta_ref_ppl": -0.142578125, + "entropy_loss": -0.07568359375, + "epoch": 0.3132, + "grad_norm": 2.022384448409514, + "k1_kl": 0.142578125, + "k3_kl": 0.10107421875, + "kimi_kl": 0.34375, + "learning_rate": 3.4339999999999996e-07, + "loss": 0.004, + "ppl": 0.03759765625, + "reward": 0.942925751209259, + "reward_std": 0.002519747940823436, + "rewards/perpo_ocr_edit_distance_reward": 0.9429258108139038, "step": 1566, "temperature": 0.9 }, { - "advantages": -2.784388470900012e-06, - "completion_length": 692.5, - "delta_ref_entropy_loss": 0.0770263671875, - "delta_ref_ppl": -0.0478515625, - "entropy_loss": -0.072723388671875, - "epoch": 0.6268, - "grad_norm": 0.8653154048097073, - "k1_kl": 0.0478515625, - "k3_kl": 0.02764892578125, - "kimi_kl": 0.0888671875, - "learning_rate": 1.866e-07, - "loss": 0.0011, - "ppl": 0.037685394287109375, - "reward": 0.96901735663414, - "reward_std": 0.003012077184394002, - "rewards/perpo_ocr_edit_distance_reward": 0.9690173864364624, + "advantages": 1.2908663848065771e-05, + "completion_length": 524.0, + "delta_ref_entropy_loss": 0.06396484375, + "delta_ref_ppl": -0.0693359375, + "entropy_loss": -0.0262451171875, + "epoch": 0.3134, + "grad_norm": 0.48680886120399797, + "k1_kl": 0.0693359375, + "k3_kl": 0.0400390625, + "kimi_kl": 0.12451171875, + "learning_rate": 3.4329999999999996e-07, + "loss": 0.0016, + "ppl": 0.01080322265625, + "reward": 0.9982288479804993, + "reward_std": 0.0005589530919678509, + "rewards/perpo_ocr_edit_distance_reward": 0.998228907585144, "step": 1567, "temperature": 0.9 }, { - "advantages": -6.292547709563223e-06, - "completion_length": 355.0, - "delta_ref_entropy_loss": 0.05621337890625, - "delta_ref_ppl": -0.039794921875, - "entropy_loss": -0.050048828125, - "epoch": 0.6272, - "grad_norm": 1.1849295433423002, - "k1_kl": 0.039794921875, - "k3_kl": 0.0228271484375, - "kimi_kl": 0.05029296875, - "learning_rate": 1.864e-07, - "loss": 0.0009, - "ppl": 0.0264892578125, - "reward": 0.8999609053134918, - "reward_std": 0.03612030833028257, - "rewards/perpo_ocr_edit_distance_reward": 0.8999609649181366, + "advantages": -4.208088284940459e-05, + "completion_length": 408.0, + "delta_ref_entropy_loss": 0.0732421875, + "delta_ref_ppl": -0.1201171875, + "entropy_loss": -0.061279296875, + "epoch": 0.3136, + "grad_norm": 1.057042410625441, + "k1_kl": 0.1201171875, + "k3_kl": 0.08349609375, + "kimi_kl": 0.291015625, + "learning_rate": 3.432e-07, + "loss": 0.0034, + "ppl": 0.026123046875, + "reward": 0.990875244140625, + "reward_std": 0.002530178287997842, + "rewards/perpo_ocr_edit_distance_reward": 0.9908753037452698, "step": 1568, "temperature": 0.9 }, { - "advantages": -0.00015423127842950635, - "completion_length": 635.5, - "delta_ref_entropy_loss": 0.0274658203125, - "delta_ref_ppl": -0.035888671875, - "entropy_loss": -0.02020263671875, - "epoch": 0.6276, - "grad_norm": 0.4895263621881399, - "k1_kl": 0.035888671875, - "k3_kl": 0.02508544921875, - "kimi_kl": 0.10113525390625, - "learning_rate": 1.8619999999999999e-07, - "loss": 0.0012, - "ppl": 0.0103759765625, - "reward": 0.9926966428756714, - "reward_std": 0.000581699627218768, - "rewards/perpo_ocr_edit_distance_reward": 0.9926967322826385, + "advantages": 0.0, + "completion_length": 723.0, + "delta_ref_entropy_loss": 0.058349609375, + "delta_ref_ppl": -0.06982421875, + "entropy_loss": -0.150390625, + "epoch": 0.3138, + "grad_norm": 1.462567978577939, + "k1_kl": 0.0703125, + "k3_kl": 0.046630859375, + "kimi_kl": 0.1220703125, + "learning_rate": 3.431e-07, + "loss": 0.0019, + "ppl": 0.0771484375, + "reward": 0.43170222640037537, + "reward_std": 0.02044546790421009, + "rewards/perpo_ocr_edit_distance_reward": 0.43170222640037537, "step": 1569, "temperature": 0.9 }, { - "advantages": -4.606587935995776e-06, - "completion_length": 675.0, - "delta_ref_entropy_loss": 0.0455322265625, - "delta_ref_ppl": -0.04962158203125, - "entropy_loss": -0.068603515625, - "epoch": 0.628, - "grad_norm": 1.7315364796249042, - "k1_kl": 0.0496826171875, - "k3_kl": 0.02972412109375, - "kimi_kl": 0.067626953125, - "learning_rate": 1.86e-07, - "loss": 0.0012, - "ppl": 0.03662109375, - "reward": 0.7468773424625397, - "reward_std": 0.06296904478222132, - "rewards/perpo_ocr_edit_distance_reward": 0.7468774318695068, + "advantages": -2.4029188352869824e-05, + "completion_length": 559.0, + "delta_ref_entropy_loss": 0.099609375, + "delta_ref_ppl": -0.08203125, + "entropy_loss": -0.146484375, + "epoch": 0.314, + "grad_norm": 17.73229252848852, + "k1_kl": 0.08203125, + "k3_kl": 0.1025390625, + "kimi_kl": 0.126953125, + "learning_rate": 3.43e-07, + "loss": 0.0041, + "ppl": 0.07421875, + "reward": 0.9517744183540344, + "reward_std": 0.0013178990921005607, + "rewards/perpo_ocr_edit_distance_reward": 0.9517744779586792, "step": 1570, "temperature": 0.9 }, { - "advantages": -2.0154886442469433e-05, - "completion_length": 198.0, - "delta_ref_entropy_loss": 0.0372314453125, - "delta_ref_ppl": -0.0657958984375, - "entropy_loss": -0.0438232421875, - "epoch": 0.6284, - "grad_norm": 2.1264824728100336, - "k1_kl": 0.065673828125, - "k3_kl": 0.03619384765625, - "kimi_kl": 0.0701904296875, - "learning_rate": 1.8579999999999998e-07, - "loss": 0.0015, - "ppl": 0.02764892578125, - "reward": 0.9994728565216064, - "reward_std": 0.0009002790320664644, - "rewards/perpo_ocr_edit_distance_reward": 0.9994728863239288, + "advantages": -7.44036296964623e-05, + "completion_length": 858.0, + "delta_ref_entropy_loss": 0.0537109375, + "delta_ref_ppl": -0.0517578125, + "entropy_loss": -0.0224609375, + "epoch": 0.3142, + "grad_norm": 0.365391681808425, + "k1_kl": 0.0517578125, + "k3_kl": 0.0262451171875, + "kimi_kl": 0.072265625, + "learning_rate": 3.429e-07, + "loss": 0.0011, + "ppl": 0.007415771484375, + "reward": 0.9989961981773376, + "reward_std": 0.0010443058563396335, + "rewards/perpo_ocr_edit_distance_reward": 0.9989962577819824, "step": 1571, "temperature": 0.9 }, { - "advantages": 1.395813092130993e-05, - "completion_length": 934.0, - "delta_ref_entropy_loss": 0.03509521484375, - "delta_ref_ppl": -0.04388427734375, - "entropy_loss": -0.06732177734375, - "epoch": 0.6288, - "grad_norm": 2.1071519293162435, - "k1_kl": 0.04388427734375, - "k3_kl": 0.028350830078125, - "kimi_kl": 0.07452392578125, - "learning_rate": 1.8559999999999997e-07, - "loss": 0.0011, - "ppl": 0.0396728515625, - "reward": 0.9338712692260742, - "reward_std": 0.022514913929626346, - "rewards/perpo_ocr_edit_distance_reward": 0.9338712990283966, + "advantages": 1.6348703866242431e-06, + "completion_length": 103.0, + "delta_ref_entropy_loss": 0.125, + "delta_ref_ppl": -0.458984375, + "entropy_loss": -0.177734375, + "epoch": 0.3144, + "grad_norm": 5.002288859927942, + "k1_kl": 0.458984375, + "k3_kl": 0.359375, + "kimi_kl": 1.6953125, + "learning_rate": 3.4279999999999997e-07, + "loss": 0.0144, + "ppl": 0.09423828125, + "reward": 0.7723916172981262, + "reward_std": 0.01027454063296318, + "rewards/perpo_ocr_edit_distance_reward": 0.7723916172981262, "step": 1572, "temperature": 0.9 }, { - "advantages": -5.3167344191251686e-05, - "completion_length": 411.5, - "delta_ref_entropy_loss": 0.03326416015625, - "delta_ref_ppl": -0.045135498046875, - "entropy_loss": -0.019683837890625, - "epoch": 0.6292, - "grad_norm": 0.5377960583958165, - "k1_kl": 0.045135498046875, - "k3_kl": 0.030670166015625, - "kimi_kl": 0.09625244140625, - "learning_rate": 1.854e-07, - "loss": 0.0013, - "ppl": 0.0084991455078125, - "reward": 0.9901179373264313, - "reward_std": 0.005501628635101952, - "rewards/perpo_ocr_edit_distance_reward": 0.9901179671287537, + "advantages": -1.7029898913278885e-07, + "completion_length": 154.0, + "delta_ref_entropy_loss": 0.044921875, + "delta_ref_ppl": -0.1904296875, + "entropy_loss": -0.0888671875, + "epoch": 0.3146, + "grad_norm": 2.8023614105702452, + "k1_kl": 0.189453125, + "k3_kl": 0.154296875, + "kimi_kl": 0.59375, + "learning_rate": 3.4269999999999997e-07, + "loss": 0.0062, + "ppl": 0.04833984375, + "reward": 0.539229154586792, + "reward_std": 0.30404457449913025, + "rewards/perpo_ocr_edit_distance_reward": 0.5392292141914368, "step": 1573, "temperature": 0.9 }, { - "advantages": -1.6369991499232128e-05, - "completion_length": 346.5, - "delta_ref_entropy_loss": 0.048583984375, - "delta_ref_ppl": -0.0498046875, - "entropy_loss": -0.030731201171875, - "epoch": 0.6296, - "grad_norm": 1.4249634126997306, - "k1_kl": 0.0499267578125, - "k3_kl": 0.0325927734375, - "kimi_kl": 0.1309814453125, - "learning_rate": 1.852e-07, - "loss": 0.0013, - "ppl": 0.0123443603515625, - "reward": 0.9997342824935913, - "reward_std": 0.0003401620197109878, - "rewards/perpo_ocr_edit_distance_reward": 0.9997343122959137, + "advantages": 2.9798065952491015e-05, + "completion_length": 616.0, + "delta_ref_entropy_loss": 0.0654296875, + "delta_ref_ppl": -0.055419921875, + "entropy_loss": -0.045654296875, + "epoch": 0.3148, + "grad_norm": 0.6424751293602246, + "k1_kl": 0.055419921875, + "k3_kl": 0.027587890625, + "kimi_kl": 0.06591796875, + "learning_rate": 3.426e-07, + "loss": 0.0011, + "ppl": 0.02099609375, + "reward": 0.9949647188186646, + "reward_std": 0.0007574196206405759, + "rewards/perpo_ocr_edit_distance_reward": 0.9949647784233093, "step": 1574, "temperature": 0.9 }, { - "advantages": -0.0003344161195855122, - "completion_length": 522.5, - "delta_ref_entropy_loss": 0.033935546875, - "delta_ref_ppl": -0.03076171875, - "entropy_loss": -0.0189208984375, - "epoch": 0.63, - "grad_norm": 0.19177163288650864, - "k1_kl": 0.0308837890625, - "k3_kl": 0.0213623046875, - "kimi_kl": 0.0968017578125, - "learning_rate": 1.85e-07, - "loss": 0.0012, - "ppl": 0.0066375732421875, - "reward": 0.9626035690307617, - "reward_std": 0.0001254396775038913, - "rewards/perpo_ocr_edit_distance_reward": 0.9626036286354065, + "advantages": -1.0524478057050146e-05, + "completion_length": 476.0, + "delta_ref_entropy_loss": 0.0625, + "delta_ref_ppl": -0.09619140625, + "entropy_loss": -0.03369140625, + "epoch": 0.315, + "grad_norm": 0.41878064823376077, + "k1_kl": 0.09619140625, + "k3_kl": 0.0673828125, + "kimi_kl": 0.263671875, + "learning_rate": 3.425e-07, + "loss": 0.0027, + "ppl": 0.01373291015625, + "reward": 0.9978553652763367, + "reward_std": 0.0007091594161465764, + "rewards/perpo_ocr_edit_distance_reward": 0.9978554248809814, "step": 1575, "temperature": 0.9 }, { - "advantages": -2.982786827487871e-05, - "completion_length": 394.5, - "delta_ref_entropy_loss": 0.02239990234375, - "delta_ref_ppl": -0.03509521484375, - "entropy_loss": -0.03143310546875, - "epoch": 0.6304, - "grad_norm": 0.8043829690445867, - "k1_kl": 0.03521728515625, - "k3_kl": 0.02691650390625, - "kimi_kl": 0.12396240234375, - "learning_rate": 1.848e-07, - "loss": 0.0011, - "ppl": 0.0172271728515625, - "reward": 0.9983818829059601, - "reward_std": 0.000521034118719399, - "rewards/perpo_ocr_edit_distance_reward": 0.9983819425106049, + "advantages": -2.1355494027375244e-05, + "completion_length": 413.0, + "delta_ref_entropy_loss": 0.11376953125, + "delta_ref_ppl": -0.1318359375, + "entropy_loss": -0.0439453125, + "epoch": 0.3152, + "grad_norm": 0.8368449101538945, + "k1_kl": 0.1318359375, + "k3_kl": 0.08642578125, + "kimi_kl": 0.26953125, + "learning_rate": 3.4239999999999994e-07, + "loss": 0.0035, + "ppl": 0.0185546875, + "reward": 0.9926521182060242, + "reward_std": 0.0014937942614778876, + "rewards/perpo_ocr_edit_distance_reward": 0.9926521182060242, "step": 1576, "temperature": 0.9 }, { - "advantages": -5.2884222895954736e-05, - "completion_length": 589.5, - "delta_ref_entropy_loss": 0.03717041015625, - "delta_ref_ppl": -0.0347900390625, - "entropy_loss": -0.0355224609375, - "epoch": 0.6308, - "grad_norm": 0.6351340903249046, - "k1_kl": 0.03472900390625, - "k3_kl": 0.02264404296875, - "kimi_kl": 0.062255859375, - "learning_rate": 1.8459999999999997e-07, - "loss": 0.001, - "ppl": 0.02105712890625, - "reward": 0.9960430562496185, - "reward_std": 0.0005801131919724867, - "rewards/perpo_ocr_edit_distance_reward": 0.9960431158542633, + "advantages": -1.9788742065429688e-05, + "completion_length": 1268.0, + "delta_ref_entropy_loss": 0.0250244140625, + "delta_ref_ppl": -0.040771484375, + "entropy_loss": -0.046875, + "epoch": 0.3154, + "grad_norm": 1.0008788490714717, + "k1_kl": 0.040771484375, + "k3_kl": 0.03076171875, + "kimi_kl": 0.08984375, + "learning_rate": 3.423e-07, + "loss": 0.0012, + "ppl": 0.02392578125, + "reward": 0.9931257963180542, + "reward_std": 0.002479452406987548, + "rewards/perpo_ocr_edit_distance_reward": 0.9931259155273438, "step": 1577, "temperature": 0.9 }, { - "advantages": -9.949293053068686e-05, - "completion_length": 471.5, - "delta_ref_entropy_loss": 0.0538330078125, - "delta_ref_ppl": -0.034637451171875, - "entropy_loss": -0.062744140625, - "epoch": 0.6312, - "grad_norm": 0.7351088634191996, - "k1_kl": 0.034637451171875, - "k3_kl": 0.018218994140625, - "kimi_kl": 0.034149169921875, - "learning_rate": 1.844e-07, - "loss": 0.0008, - "ppl": 0.0340576171875, - "reward": 0.9695386290550232, - "reward_std": 0.0006297782529145479, - "rewards/perpo_ocr_edit_distance_reward": 0.9695387184619904, + "advantages": -1.7029899268550253e-08, + "completion_length": 620.0, + "delta_ref_entropy_loss": 0.045166015625, + "delta_ref_ppl": -0.083984375, + "entropy_loss": -0.0380859375, + "epoch": 0.3156, + "grad_norm": 1.614695025141436, + "k1_kl": 0.083984375, + "k3_kl": 0.06005859375, + "kimi_kl": 0.22265625, + "learning_rate": 3.422e-07, + "loss": 0.0024, + "ppl": 0.01708984375, + "reward": 0.613341748714447, + "reward_std": 0.002200832823291421, + "rewards/perpo_ocr_edit_distance_reward": 0.613341748714447, "step": 1578, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 261.0, - "delta_ref_entropy_loss": 0.0289306640625, - "delta_ref_ppl": -0.0811767578125, - "entropy_loss": -0.05511474609375, - "epoch": 0.6316, - "grad_norm": 0.12878915329057666, - "k1_kl": 0.0811767578125, - "k3_kl": 0.06243896484375, - "kimi_kl": 0.236083984375, - "learning_rate": 1.842e-07, - "loss": 0.0028, - "ppl": 0.026611328125, - "reward": 0.999775767326355, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9997758269309998, + "advantages": -2.690724159037927e-06, + "completion_length": 578.0, + "delta_ref_entropy_loss": 0.13671875, + "delta_ref_ppl": -0.0986328125, + "entropy_loss": -0.1923828125, + "epoch": 0.3158, + "grad_norm": 2.1695417976581797, + "k1_kl": 0.0986328125, + "k3_kl": 0.056884765625, + "kimi_kl": 0.1337890625, + "learning_rate": 3.421e-07, + "loss": 0.0023, + "ppl": 0.08935546875, + "reward": 0.7996107339859009, + "reward_std": 0.015843600034713745, + "rewards/perpo_ocr_edit_distance_reward": 0.7996107935905457, "step": 1579, "temperature": 0.9 }, { - "advantages": -7.0163187047000974e-06, - "completion_length": 167.5, - "delta_ref_entropy_loss": 0.047607421875, - "delta_ref_ppl": -0.108154296875, - "entropy_loss": -0.027587890625, - "epoch": 0.632, - "grad_norm": 0.6929319868850378, - "k1_kl": 0.108154296875, - "k3_kl": 0.0810546875, - "kimi_kl": 0.30615234375, - "learning_rate": 1.8399999999999998e-07, - "loss": 0.0033, - "ppl": 0.00888824462890625, - "reward": 0.9862280786037445, - "reward_std": 0.001466436660848558, - "rewards/perpo_ocr_edit_distance_reward": 0.9862280786037445, + "advantages": -3.4332275390625e-05, + "completion_length": 873.0, + "delta_ref_entropy_loss": 0.05029296875, + "delta_ref_ppl": -0.07373046875, + "entropy_loss": -0.057373046875, + "epoch": 0.316, + "grad_norm": 0.8110403511070485, + "k1_kl": 0.07421875, + "k3_kl": 0.04931640625, + "kimi_kl": 0.15625, + "learning_rate": 3.42e-07, + "loss": 0.002, + "ppl": 0.02734375, + "reward": 0.9929527640342712, + "reward_std": 0.0021320038940757513, + "rewards/perpo_ocr_edit_distance_reward": 0.992952823638916, "step": 1580, "temperature": 0.9 }, { - "advantages": -3.49794136127457e-05, - "completion_length": 547.5, - "delta_ref_entropy_loss": 0.02825927734375, - "delta_ref_ppl": -0.0250244140625, - "entropy_loss": -0.017181396484375, - "epoch": 0.6324, - "grad_norm": 0.18410658871964555, - "k1_kl": 0.02508544921875, - "k3_kl": 0.01409912109375, - "kimi_kl": 0.036865234375, - "learning_rate": 1.838e-07, - "loss": 0.0006, - "ppl": 0.00698089599609375, - "reward": 0.9943639039993286, - "reward_std": 0.0001934797182912007, - "rewards/perpo_ocr_edit_distance_reward": 0.9943639039993286, + "advantages": -2.8227057555341162e-05, + "completion_length": 460.0, + "delta_ref_entropy_loss": 0.043212890625, + "delta_ref_ppl": -0.07373046875, + "entropy_loss": -0.0263671875, + "epoch": 0.3162, + "grad_norm": 0.7363529055410427, + "k1_kl": 0.0732421875, + "k3_kl": 0.05224609375, + "kimi_kl": 0.234375, + "learning_rate": 3.4189999999999996e-07, + "loss": 0.0021, + "ppl": 0.0107421875, + "reward": 0.9953916668891907, + "reward_std": 0.0014085498405620456, + "rewards/perpo_ocr_edit_distance_reward": 0.9953917264938354, "step": 1581, "temperature": 0.9 }, { - "advantages": -0.00010416338045615703, - "completion_length": 386.0, - "delta_ref_entropy_loss": 0.035400390625, - "delta_ref_ppl": -0.03143310546875, - "entropy_loss": -0.01483154296875, - "epoch": 0.6328, - "grad_norm": 0.4231509520134122, - "k1_kl": 0.03143310546875, - "k3_kl": 0.02093505859375, - "kimi_kl": 0.0693359375, - "learning_rate": 1.836e-07, - "loss": 0.0009, - "ppl": 0.005279541015625, - "reward": 0.9960265755653381, - "reward_std": 0.000403930782340467, - "rewards/perpo_ocr_edit_distance_reward": 0.9960266351699829, + "advantages": -4.393713879835559e-06, + "completion_length": 179.0, + "delta_ref_entropy_loss": 0.0859375, + "delta_ref_ppl": -0.220703125, + "entropy_loss": -0.1640625, + "epoch": 0.3164, + "grad_norm": 3.33978143889623, + "k1_kl": 0.2197265625, + "k3_kl": 0.1552734375, + "kimi_kl": 0.4140625, + "learning_rate": 3.418e-07, + "loss": 0.0062, + "ppl": 0.076171875, + "reward": 0.6714478731155396, + "reward_std": 0.007652623578906059, + "rewards/perpo_ocr_edit_distance_reward": 0.6714479923248291, "step": 1582, "temperature": 0.9 }, { - "advantages": -5.15324754815083e-05, - "completion_length": 224.0, - "delta_ref_entropy_loss": 0.0657958984375, - "delta_ref_ppl": -0.0697021484375, - "entropy_loss": -0.03875732421875, - "epoch": 0.6332, - "grad_norm": 0.4639072883714993, - "k1_kl": 0.0697021484375, - "k3_kl": 0.0469970703125, - "kimi_kl": 0.144775390625, - "learning_rate": 1.834e-07, - "loss": 0.0019, - "ppl": 0.0211181640625, - "reward": 0.9640571475028992, - "reward_std": 0.0004044051165692508, - "rewards/perpo_ocr_edit_distance_reward": 0.964057207107544, + "advantages": -1.5429088307428174e-05, + "completion_length": 390.0, + "delta_ref_entropy_loss": 0.06591796875, + "delta_ref_ppl": -0.08056640625, + "entropy_loss": -0.032470703125, + "epoch": 0.3166, + "grad_norm": 0.89120674048165, + "k1_kl": 0.0810546875, + "k3_kl": 0.05615234375, + "kimi_kl": 0.16796875, + "learning_rate": 3.417e-07, + "loss": 0.0023, + "ppl": 0.01434326171875, + "reward": 0.9951381683349609, + "reward_std": 0.004878579638898373, + "rewards/perpo_ocr_edit_distance_reward": 0.9951382279396057, "step": 1583, "temperature": 0.9 }, { - "advantages": 2.0031418115351585e-06, - "completion_length": 881.0, - "delta_ref_entropy_loss": 0.1285400390625, - "delta_ref_ppl": -0.06756591796875, - "entropy_loss": -0.2041015625, - "epoch": 0.6336, - "grad_norm": 3.910023007390029, - "k1_kl": 0.06756591796875, - "k3_kl": 0.034271240234375, - "kimi_kl": 0.0611572265625, - "learning_rate": 1.832e-07, - "loss": 0.0014, - "ppl": 0.1151123046875, - "reward": 0.8540866374969482, - "reward_std": 0.008026161463931203, - "rewards/perpo_ocr_edit_distance_reward": 0.8540866672992706, + "advantages": -9.67298274190398e-06, + "completion_length": 275.0, + "delta_ref_entropy_loss": 0.10546875, + "delta_ref_ppl": -0.19140625, + "entropy_loss": -0.150390625, + "epoch": 0.3168, + "grad_norm": 2.3025345680327605, + "k1_kl": 0.19140625, + "k3_kl": 0.134765625, + "kimi_kl": 0.44140625, + "learning_rate": 3.416e-07, + "loss": 0.0054, + "ppl": 0.0712890625, + "reward": 0.43720781803131104, + "reward_std": 0.0025358626153320074, + "rewards/perpo_ocr_edit_distance_reward": 0.4372078478336334, "step": 1584, "temperature": 0.9 }, { - "advantages": -1.1567559113245807e-05, - "completion_length": 1408.0, - "delta_ref_entropy_loss": 0.056884765625, - "delta_ref_ppl": -0.02587890625, - "entropy_loss": -0.0797119140625, - "epoch": 0.634, - "grad_norm": 47.77734513101581, - "k1_kl": 0.02569580078125, - "k3_kl": 0.29736328125, - "kimi_kl": 0.040283203125, - "learning_rate": 1.8299999999999998e-07, - "loss": 0.012, - "ppl": 0.053466796875, - "reward": 0.9332691133022308, - "reward_std": 0.0062183638801798224, - "rewards/perpo_ocr_edit_distance_reward": 0.9332691729068756, + "advantages": -5.53471727471333e-06, + "completion_length": 435.0, + "delta_ref_entropy_loss": 0.169921875, + "delta_ref_ppl": -0.181640625, + "entropy_loss": -0.28125, + "epoch": 0.317, + "grad_norm": 1.790441054175501, + "k1_kl": 0.181640625, + "k3_kl": 0.11083984375, + "kimi_kl": 0.26171875, + "learning_rate": 3.4150000000000003e-07, + "loss": 0.0045, + "ppl": 0.14453125, + "reward": 0.7184295654296875, + "reward_std": 0.019833996891975403, + "rewards/perpo_ocr_edit_distance_reward": 0.7184296250343323, "step": 1585, "temperature": 0.9 }, { - "advantages": -2.4203744487749645e-05, - "completion_length": 751.0, - "delta_ref_entropy_loss": 0.05078125, - "delta_ref_ppl": -0.043701171875, - "entropy_loss": -0.03131103515625, - "epoch": 0.6344, - "grad_norm": 0.9325667726767631, - "k1_kl": 0.04351806640625, - "k3_kl": 0.025146484375, - "kimi_kl": 0.056640625, - "learning_rate": 1.8279999999999997e-07, - "loss": 0.001, - "ppl": 0.015777587890625, - "reward": 0.8887083232402802, - "reward_std": 0.005089307756861672, - "rewards/perpo_ocr_edit_distance_reward": 0.8887083530426025, + "advantages": 2.588544703030493e-06, + "completion_length": 257.0, + "delta_ref_entropy_loss": 0.033935546875, + "delta_ref_ppl": -0.0908203125, + "entropy_loss": -0.0284423828125, + "epoch": 0.3172, + "grad_norm": 1.0502204605281087, + "k1_kl": 0.0908203125, + "k3_kl": 0.06689453125, + "kimi_kl": 0.271484375, + "learning_rate": 3.4139999999999997e-07, + "loss": 0.0027, + "ppl": 0.00885009765625, + "reward": 0.9960803389549255, + "reward_std": 0.0031906103249639273, + "rewards/perpo_ocr_edit_distance_reward": 0.9960802793502808, "step": 1586, "temperature": 0.9 }, { - "advantages": -2.5153161914204247e-05, - "completion_length": 727.0, - "delta_ref_entropy_loss": 0.0579833984375, - "delta_ref_ppl": -0.050018310546875, - "entropy_loss": -0.0479736328125, - "epoch": 0.6348, - "grad_norm": 0.7179464587413961, - "k1_kl": 0.050018310546875, - "k3_kl": 0.0293731689453125, - "kimi_kl": 0.1120758056640625, - "learning_rate": 1.826e-07, - "loss": 0.0012, - "ppl": 0.024810791015625, - "reward": 0.8441294729709625, - "reward_std": 0.01262603560462594, - "rewards/perpo_ocr_edit_distance_reward": 0.8441295027732849, + "advantages": -3.440039654378779e-05, + "completion_length": 451.0, + "delta_ref_entropy_loss": 0.06298828125, + "delta_ref_ppl": -0.09375, + "entropy_loss": -0.06396484375, + "epoch": 0.3174, + "grad_norm": 1.160019199183625, + "k1_kl": 0.09375, + "k3_kl": 0.060546875, + "kimi_kl": 0.2099609375, + "learning_rate": 3.4129999999999997e-07, + "loss": 0.0025, + "ppl": 0.0255126953125, + "reward": 0.9894136786460876, + "reward_std": 0.002374283503741026, + "rewards/perpo_ocr_edit_distance_reward": 0.9894137382507324, "step": 1587, "temperature": 0.9 }, { - "advantages": -7.527640991611406e-05, - "completion_length": 509.5, - "delta_ref_entropy_loss": 0.0465087890625, - "delta_ref_ppl": -0.0543212890625, - "entropy_loss": -0.02508544921875, - "epoch": 0.6352, - "grad_norm": 0.24704708729028904, - "k1_kl": 0.0543212890625, - "k3_kl": 0.03680419921875, - "kimi_kl": 0.12646484375, - "learning_rate": 1.824e-07, - "loss": 0.0015, - "ppl": 0.010345458984375, - "reward": 0.9997313618659973, - "reward_std": 9.137454617302865e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9997313916683197, + "advantages": -2.1236284737824462e-05, + "completion_length": 676.0, + "delta_ref_entropy_loss": 0.06982421875, + "delta_ref_ppl": -0.10888671875, + "entropy_loss": -0.056884765625, + "epoch": 0.3176, + "grad_norm": 0.7638774000024335, + "k1_kl": 0.10888671875, + "k3_kl": 0.0693359375, + "kimi_kl": 0.248046875, + "learning_rate": 3.412e-07, + "loss": 0.0028, + "ppl": 0.0279541015625, + "reward": 0.9929130673408508, + "reward_std": 0.0007017385214567184, + "rewards/perpo_ocr_edit_distance_reward": 0.9929130673408508, "step": 1588, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 351.0, - "delta_ref_entropy_loss": 0.0587158203125, - "delta_ref_ppl": -0.0638427734375, - "entropy_loss": -0.0439453125, - "epoch": 0.6356, - "grad_norm": 0.5553172471083546, - "k1_kl": 0.0638427734375, - "k3_kl": 0.044921875, - "kimi_kl": 0.16357421875, - "learning_rate": 1.822e-07, - "loss": 0.0018, - "ppl": 0.0205078125, - "reward": 0.6718563884496689, - "reward_std": 0.003252812195569277, - "rewards/perpo_ocr_edit_distance_reward": 0.6718563884496689, + "advantages": -4.214899945509387e-06, + "completion_length": 39.0, + "delta_ref_entropy_loss": 0.294921875, + "delta_ref_ppl": -0.6640625, + "entropy_loss": -0.32421875, + "epoch": 0.3178, + "grad_norm": 5.669515007004751, + "k1_kl": 0.6640625, + "k3_kl": 0.51953125, + "kimi_kl": 1.8046875, + "learning_rate": 3.411e-07, + "loss": 0.0208, + "ppl": 0.1416015625, + "reward": 0.9505135416984558, + "reward_std": 0.014047040604054928, + "rewards/perpo_ocr_edit_distance_reward": 0.9505136013031006, "step": 1589, "temperature": 0.9 }, { - "advantages": -2.0980836779926904e-05, - "completion_length": 481.5, - "delta_ref_entropy_loss": 0.06268310546875, - "delta_ref_ppl": -0.039886474609375, - "entropy_loss": -0.04779052734375, - "epoch": 0.636, - "grad_norm": 0.770789604176264, - "k1_kl": 0.03985595703125, - "k3_kl": 0.0194549560546875, - "kimi_kl": 0.04315185546875, - "learning_rate": 1.82e-07, - "loss": 0.0008, - "ppl": 0.02313232421875, - "reward": 0.9662558734416962, - "reward_std": 0.0012692036107182503, - "rewards/perpo_ocr_edit_distance_reward": 0.9662559628486633, + "advantages": -5.563667946262285e-05, + "completion_length": 621.0, + "delta_ref_entropy_loss": 0.087890625, + "delta_ref_ppl": -0.072265625, + "entropy_loss": -0.056640625, + "epoch": 0.318, + "grad_norm": 0.9681577627222174, + "k1_kl": 0.07275390625, + "k3_kl": 0.04296875, + "kimi_kl": 0.11328125, + "learning_rate": 3.41e-07, + "loss": 0.0018, + "ppl": 0.02294921875, + "reward": 0.9922056794166565, + "reward_std": 0.0012772453483194113, + "rewards/perpo_ocr_edit_distance_reward": 0.992205798625946, "step": 1590, "temperature": 0.9 }, { - "advantages": -3.071768105655792e-05, - "completion_length": 530.5, - "delta_ref_entropy_loss": 0.06005859375, - "delta_ref_ppl": -0.0523681640625, - "entropy_loss": -0.035186767578125, - "epoch": 0.6364, - "grad_norm": 0.4771499515410182, - "k1_kl": 0.0523681640625, - "k3_kl": 0.0340576171875, - "kimi_kl": 0.104248046875, - "learning_rate": 1.8179999999999997e-07, - "loss": 0.0014, - "ppl": 0.016082763671875, - "reward": 0.975641667842865, - "reward_std": 0.0006456959090428427, - "rewards/perpo_ocr_edit_distance_reward": 0.9756416976451874, + "advantages": -2.5357519916724414e-05, + "completion_length": 102.0, + "delta_ref_entropy_loss": 0.09033203125, + "delta_ref_ppl": -0.302734375, + "entropy_loss": -0.099609375, + "epoch": 0.3182, + "grad_norm": 3.2310580334419594, + "k1_kl": 0.30078125, + "k3_kl": 0.232421875, + "kimi_kl": 0.90625, + "learning_rate": 3.409e-07, + "loss": 0.0093, + "ppl": 0.03564453125, + "reward": 0.944000244140625, + "reward_std": 0.0035897763445973396, + "rewards/perpo_ocr_edit_distance_reward": 0.9440003037452698, "step": 1591, "temperature": 0.9 }, { - "advantages": -9.025846452459518e-07, - "completion_length": 511.0, - "delta_ref_entropy_loss": 0.0791015625, - "delta_ref_ppl": -0.04925537109375, - "entropy_loss": -0.08221435546875, - "epoch": 0.6368, - "grad_norm": 0.9505192752147498, - "k1_kl": 0.04925537109375, - "k3_kl": 0.027496337890625, - "kimi_kl": 0.06982421875, - "learning_rate": 1.816e-07, - "loss": 0.0011, - "ppl": 0.0421905517578125, - "reward": 0.8859735131263733, - "reward_std": 0.002312413649633527, - "rewards/perpo_ocr_edit_distance_reward": 0.8859735429286957, + "advantages": -0.00014001131057739258, + "completion_length": 1003.0, + "delta_ref_entropy_loss": 0.044677734375, + "delta_ref_ppl": -0.0634765625, + "entropy_loss": -0.021728515625, + "epoch": 0.3184, + "grad_norm": 0.2773225475905858, + "k1_kl": 0.0634765625, + "k3_kl": 0.0419921875, + "kimi_kl": 0.201171875, + "learning_rate": 3.408e-07, + "loss": 0.0018, + "ppl": 0.007171630859375, + "reward": 0.9987448453903198, + "reward_std": 0.0006906046182848513, + "rewards/perpo_ocr_edit_distance_reward": 0.9987450242042542, "step": 1592, "temperature": 0.9 }, { - "advantages": -2.3041454028316366e-05, - "completion_length": 711.5, - "delta_ref_entropy_loss": 0.01885986328125, - "delta_ref_ppl": -0.015869140625, - "entropy_loss": -0.01824951171875, - "epoch": 0.6372, - "grad_norm": 0.5600612185034259, - "k1_kl": 0.015869140625, - "k3_kl": 0.010955810546875, - "kimi_kl": 0.019775390625, - "learning_rate": 1.814e-07, - "loss": 0.0005, - "ppl": 0.010009765625, - "reward": 0.9941843152046204, - "reward_std": 0.005822401930345222, - "rewards/perpo_ocr_edit_distance_reward": 0.9941843748092651, + "advantages": -2.920627775893081e-05, + "completion_length": 1312.0, + "delta_ref_entropy_loss": 0.0272216796875, + "delta_ref_ppl": -0.035400390625, + "entropy_loss": -0.032958984375, + "epoch": 0.3186, + "grad_norm": 0.5178470735669596, + "k1_kl": 0.035400390625, + "k3_kl": 0.02294921875, + "kimi_kl": 0.0654296875, + "learning_rate": 3.407e-07, + "loss": 0.001, + "ppl": 0.0159912109375, + "reward": 0.9930304288864136, + "reward_std": 0.0007749555516056716, + "rewards/perpo_ocr_edit_distance_reward": 0.9930304288864136, "step": 1593, "temperature": 0.9 }, { - "advantages": -4.253642919138656e-05, - "completion_length": 794.0, - "delta_ref_entropy_loss": 0.0245361328125, - "delta_ref_ppl": -0.019989013671875, - "entropy_loss": -0.01800537109375, - "epoch": 0.6376, - "grad_norm": 0.4785089910075402, - "k1_kl": 0.01995849609375, - "k3_kl": 0.01226806640625, - "kimi_kl": 0.039642333984375, - "learning_rate": 1.8119999999999998e-07, - "loss": 0.0005, - "ppl": 0.008758544921875, - "reward": 0.9991533756256104, - "reward_std": 0.0006136186348157935, - "rewards/perpo_ocr_edit_distance_reward": 0.9991534352302551, + "advantages": -2.8099334485887084e-06, + "completion_length": 814.0, + "delta_ref_entropy_loss": 0.058837890625, + "delta_ref_ppl": -0.06640625, + "entropy_loss": -0.046630859375, + "epoch": 0.3188, + "grad_norm": 0.7340287449804491, + "k1_kl": 0.06640625, + "k3_kl": 0.04150390625, + "kimi_kl": 0.12109375, + "learning_rate": 3.406e-07, + "loss": 0.0017, + "ppl": 0.0169677734375, + "reward": 0.8553535342216492, + "reward_std": 0.02424677088856697, + "rewards/perpo_ocr_edit_distance_reward": 0.8553535342216492, "step": 1594, "temperature": 0.9 }, { - "advantages": -0.00014779823686694726, - "completion_length": 386.0, - "delta_ref_entropy_loss": 0.0394287109375, - "delta_ref_ppl": -0.041015625, - "entropy_loss": -0.03082275390625, - "epoch": 0.638, - "grad_norm": 0.6629113707320679, - "k1_kl": 0.04107666015625, - "k3_kl": 0.029022216796875, - "kimi_kl": 0.1148681640625, - "learning_rate": 1.81e-07, - "loss": 0.0013, - "ppl": 0.013946533203125, - "reward": 0.9952484369277954, - "reward_std": 0.0007183180132415146, - "rewards/perpo_ocr_edit_distance_reward": 0.995248556137085, + "advantages": 1.7498221041023498e-06, + "completion_length": 499.0, + "delta_ref_entropy_loss": 0.1552734375, + "delta_ref_ppl": -0.15234375, + "entropy_loss": -0.259765625, + "epoch": 0.319, + "grad_norm": 2.864268769626692, + "k1_kl": 0.1533203125, + "k3_kl": 0.09375, + "kimi_kl": 0.24609375, + "learning_rate": 3.405e-07, + "loss": 0.0037, + "ppl": 0.138671875, + "reward": 0.7134298086166382, + "reward_std": 0.004783822223544121, + "rewards/perpo_ocr_edit_distance_reward": 0.7134298086166382, "step": 1595, "temperature": 0.9 }, { - "advantages": -6.672314520983491e-05, - "completion_length": 406.0, - "delta_ref_entropy_loss": 0.051025390625, - "delta_ref_ppl": -0.0283203125, - "entropy_loss": -0.04302978515625, - "epoch": 0.6384, - "grad_norm": 1.0322111183060776, - "k1_kl": 0.0283203125, - "k3_kl": 0.01300048828125, - "kimi_kl": 0.02056884765625, - "learning_rate": 1.8079999999999998e-07, - "loss": 0.0006, - "ppl": 0.02056884765625, - "reward": 0.9877979159355164, - "reward_std": 0.000983813893981278, - "rewards/perpo_ocr_edit_distance_reward": 0.9877980649471283, + "advantages": -3.69037916243542e-05, + "completion_length": 1087.0, + "delta_ref_entropy_loss": 0.07177734375, + "delta_ref_ppl": -0.08984375, + "entropy_loss": -0.06884765625, + "epoch": 0.3192, + "grad_norm": 2.103052220697242, + "k1_kl": 0.08984375, + "k3_kl": 0.05908203125, + "kimi_kl": 0.1708984375, + "learning_rate": 3.4039999999999995e-07, + "loss": 0.0024, + "ppl": 0.03662109375, + "reward": 0.9218205809593201, + "reward_std": 0.0017461910611018538, + "rewards/perpo_ocr_edit_distance_reward": 0.9218206405639648, "step": 1596, "temperature": 0.9 }, { - "advantages": -1.1678253031277563e-05, + "advantages": -2.2564616301679052e-05, "completion_length": 279.0, - "delta_ref_entropy_loss": 0.0357666015625, - "delta_ref_ppl": -0.02642822265625, - "entropy_loss": -0.03790283203125, - "epoch": 0.6388, - "grad_norm": 0.9548701651253273, - "k1_kl": 0.02642822265625, - "k3_kl": 0.013580322265625, - "kimi_kl": 0.0269775390625, - "learning_rate": 1.806e-07, - "loss": 0.0006, - "ppl": 0.017681121826171875, - "reward": 0.9895226061344147, - "reward_std": 0.000679364544339478, - "rewards/perpo_ocr_edit_distance_reward": 0.9895226359367371, + "delta_ref_entropy_loss": 0.10205078125, + "delta_ref_ppl": -0.162109375, + "entropy_loss": -0.054931640625, + "epoch": 0.3194, + "grad_norm": 1.246803843883083, + "k1_kl": 0.1630859375, + "k3_kl": 0.1142578125, + "kimi_kl": 0.40234375, + "learning_rate": 3.403e-07, + "loss": 0.0046, + "ppl": 0.0218505859375, + "reward": 0.9788302779197693, + "reward_std": 0.0029149765614420176, + "rewards/perpo_ocr_edit_distance_reward": 0.9788303971290588, "step": 1597, "temperature": 0.9 }, { - "advantages": -0.000316602843668079, - "completion_length": 389.0, - "delta_ref_entropy_loss": 0.02301025390625, - "delta_ref_ppl": -0.040283203125, - "entropy_loss": -0.016845703125, - "epoch": 0.6392, - "grad_norm": 0.33065547752164864, - "k1_kl": 0.0404052734375, - "k3_kl": 0.03167724609375, - "kimi_kl": 0.140625, - "learning_rate": 1.804e-07, - "loss": 0.0016, - "ppl": 0.00537872314453125, - "reward": 0.9983364045619965, - "reward_std": 0.00040833043749444187, - "rewards/perpo_ocr_edit_distance_reward": 0.9983364641666412, + "advantages": -7.896764145698398e-05, + "completion_length": 542.0, + "delta_ref_entropy_loss": 0.06396484375, + "delta_ref_ppl": -0.07177734375, + "entropy_loss": -0.037109375, + "epoch": 0.3196, + "grad_norm": 0.8390328744922935, + "k1_kl": 0.07177734375, + "k3_kl": 0.0458984375, + "kimi_kl": 0.1318359375, + "learning_rate": 3.402e-07, + "loss": 0.0019, + "ppl": 0.0203857421875, + "reward": 0.9930773377418518, + "reward_std": 0.0013017350574955344, + "rewards/perpo_ocr_edit_distance_reward": 0.9930774569511414, "step": 1598, "temperature": 0.9 }, { - "advantages": -1.4645713690697448e-05, - "completion_length": 922.0, - "delta_ref_entropy_loss": 0.04364013671875, - "delta_ref_ppl": -0.03741455078125, - "entropy_loss": -0.0518798828125, - "epoch": 0.6396, - "grad_norm": 1.200794567651062, - "k1_kl": 0.03753662109375, - "k3_kl": 0.022216796875, - "kimi_kl": 0.0531005859375, - "learning_rate": 1.8019999999999999e-07, - "loss": 0.0009, - "ppl": 0.023712158203125, - "reward": 0.9945278465747833, - "reward_std": 0.00243828579550609, - "rewards/perpo_ocr_edit_distance_reward": 0.9945278465747833, + "advantages": -2.9053007892798632e-05, + "completion_length": 404.0, + "delta_ref_entropy_loss": 0.08056640625, + "delta_ref_ppl": -0.125, + "entropy_loss": -0.047119140625, + "epoch": 0.3198, + "grad_norm": 0.5353433672913521, + "k1_kl": 0.125, + "k3_kl": 0.07958984375, + "kimi_kl": 0.33203125, + "learning_rate": 3.401e-07, + "loss": 0.0032, + "ppl": 0.01611328125, + "reward": 0.9842649698257446, + "reward_std": 0.0007798238075338304, + "rewards/perpo_ocr_edit_distance_reward": 0.9842650294303894, "step": 1599, "temperature": 0.9 }, { - "advantages": -7.021853161859326e-05, - "completion_length": 424.0, - "delta_ref_entropy_loss": 0.0606689453125, - "delta_ref_ppl": -0.064453125, - "entropy_loss": -0.03021240234375, - "epoch": 0.64, - "grad_norm": 1.0490942115167405, - "k1_kl": 0.064697265625, - "k3_kl": 0.04083251953125, - "kimi_kl": 0.117919921875, - "learning_rate": 1.8e-07, - "loss": 0.0017, - "ppl": 0.015167236328125, - "reward": 0.9176939427852631, - "reward_std": 0.0009016317053465173, - "rewards/perpo_ocr_edit_distance_reward": 0.9176940321922302, + "advantages": -7.816723518772051e-05, + "completion_length": 590.0, + "delta_ref_entropy_loss": 0.06103515625, + "delta_ref_ppl": -0.076171875, + "entropy_loss": -0.03271484375, + "epoch": 0.32, + "grad_norm": 0.5654491680648905, + "k1_kl": 0.076171875, + "k3_kl": 0.047119140625, + "kimi_kl": 0.1376953125, + "learning_rate": 3.4000000000000003e-07, + "loss": 0.002, + "ppl": 0.01177978515625, + "reward": 0.9613233208656311, + "reward_std": 0.0006625732639804482, + "rewards/perpo_ocr_edit_distance_reward": 0.9613233804702759, "step": 1600, "temperature": 0.9 }, { - "advantages": 9.013074304675683e-06, - "completion_length": 603.0, - "delta_ref_entropy_loss": 0.0465087890625, - "delta_ref_ppl": -0.02911376953125, - "entropy_loss": -0.0233154296875, - "epoch": 0.6404, - "grad_norm": 0.4312360673058033, - "k1_kl": 0.02899169921875, - "k3_kl": 0.015106201171875, - "kimi_kl": 0.0404052734375, - "learning_rate": 1.7979999999999998e-07, - "loss": 0.0006, - "ppl": 0.0106201171875, - "reward": 0.9430166780948639, - "reward_std": 0.00018593278946354985, - "rewards/perpo_ocr_edit_distance_reward": 0.9430166780948639, + "advantages": -1.370906943520822e-06, + "completion_length": 492.0, + "delta_ref_entropy_loss": 0.083984375, + "delta_ref_ppl": -0.11669921875, + "entropy_loss": -0.1103515625, + "epoch": 0.3202, + "grad_norm": 1.6749256240709534, + "k1_kl": 0.11669921875, + "k3_kl": 0.072265625, + "kimi_kl": 0.25390625, + "learning_rate": 3.3989999999999997e-07, + "loss": 0.0029, + "ppl": 0.056396484375, + "reward": 0.9719487428665161, + "reward_std": 0.012234740890562534, + "rewards/perpo_ocr_edit_distance_reward": 0.9719487428665161, "step": 1601, "temperature": 0.9 }, { - "advantages": -0.00043212942546233535, - "completion_length": 613.5, - "delta_ref_entropy_loss": 0.02435302734375, - "delta_ref_ppl": -0.015045166015625, - "entropy_loss": -0.017822265625, - "epoch": 0.6408, - "grad_norm": 0.1480784824266917, - "k1_kl": 0.0150146484375, - "k3_kl": 0.007720947265625, - "kimi_kl": 0.0196533203125, - "learning_rate": 1.796e-07, - "loss": 0.0007, - "ppl": 0.007904052734375, - "reward": 0.9983730316162109, - "reward_std": 0.0001722359738778323, - "rewards/perpo_ocr_edit_distance_reward": 0.9983732104301453, + "advantages": -1.8471055227564648e-05, + "completion_length": 210.0, + "delta_ref_entropy_loss": 0.1201171875, + "delta_ref_ppl": -0.185546875, + "entropy_loss": -0.051025390625, + "epoch": 0.3204, + "grad_norm": 1.3099213239134795, + "k1_kl": 0.185546875, + "k3_kl": 0.1318359375, + "kimi_kl": 0.48828125, + "learning_rate": 3.3979999999999996e-07, + "loss": 0.0053, + "ppl": 0.02001953125, + "reward": 0.9915278553962708, + "reward_std": 0.0012817580718547106, + "rewards/perpo_ocr_edit_distance_reward": 0.9915279150009155, "step": 1602, "temperature": 0.9 }, { - "advantages": -2.92658824037062e-05, - "completion_length": 855.5, - "delta_ref_entropy_loss": 0.02783203125, - "delta_ref_ppl": -0.012603759765625, - "entropy_loss": -0.0233154296875, - "epoch": 0.6412, - "grad_norm": 0.3073097347788459, - "k1_kl": 0.0126190185546875, - "k3_kl": 0.00643157958984375, - "kimi_kl": 0.01551055908203125, - "learning_rate": 1.794e-07, - "loss": 0.0003, - "ppl": 0.01080322265625, - "reward": 0.9848664402961731, - "reward_std": 0.0016468456014990807, - "rewards/perpo_ocr_edit_distance_reward": 0.9848664999008179, + "advantages": -1.3181142094254028e-05, + "completion_length": 181.0, + "delta_ref_entropy_loss": 0.02734375, + "delta_ref_ppl": -0.2138671875, + "entropy_loss": -0.04541015625, + "epoch": 0.3206, + "grad_norm": 1.3973442053390437, + "k1_kl": 0.2138671875, + "k3_kl": 0.1796875, + "kimi_kl": 0.78125, + "learning_rate": 3.397e-07, + "loss": 0.0072, + "ppl": 0.0206298828125, + "reward": 0.9922233819961548, + "reward_std": 0.0011922784615308046, + "rewards/perpo_ocr_edit_distance_reward": 0.9922233819961548, "step": 1603, "temperature": 0.9 }, { - "advantages": -7.693257066421211e-05, - "completion_length": 1224.0, - "delta_ref_entropy_loss": 0.02508544921875, - "delta_ref_ppl": -0.02398681640625, - "entropy_loss": -0.056884765625, - "epoch": 0.6416, - "grad_norm": 0.5705277234322058, - "k1_kl": 0.0240478515625, - "k3_kl": 0.015869140625, - "kimi_kl": 0.043701171875, - "learning_rate": 1.792e-07, - "loss": 0.0007, - "ppl": 0.03125, - "reward": 0.9897441267967224, - "reward_std": 0.0007259675476234406, - "rewards/perpo_ocr_edit_distance_reward": 0.9897442162036896, + "advantages": -1.0388238251834991e-06, + "completion_length": 561.0, + "delta_ref_entropy_loss": 0.1015625, + "delta_ref_ppl": -0.12060546875, + "entropy_loss": -0.0771484375, + "epoch": 0.3208, + "grad_norm": 1.6924693719166002, + "k1_kl": 0.12060546875, + "k3_kl": 0.07373046875, + "kimi_kl": 0.2041015625, + "learning_rate": 3.396e-07, + "loss": 0.003, + "ppl": 0.0302734375, + "reward": 0.8145859241485596, + "reward_std": 0.016088318079710007, + "rewards/perpo_ocr_edit_distance_reward": 0.8145859837532043, "step": 1604, "temperature": 0.9 }, { - "advantages": -7.2036473284242675e-06, - "completion_length": 868.5, - "delta_ref_entropy_loss": 0.0926513671875, - "delta_ref_ppl": -0.07470703125, - "entropy_loss": -0.10791015625, - "epoch": 0.642, - "grad_norm": 1.1272091750472946, - "k1_kl": 0.0748291015625, - "k3_kl": 0.040771484375, - "kimi_kl": 0.095947265625, - "learning_rate": 1.7899999999999997e-07, - "loss": 0.0016, - "ppl": 0.055908203125, - "reward": 0.927216500043869, - "reward_std": 0.004306803806684911, - "rewards/perpo_ocr_edit_distance_reward": 0.9272165298461914, + "advantages": -2.4310180378961377e-05, + "completion_length": 341.0, + "delta_ref_entropy_loss": 0.07666015625, + "delta_ref_ppl": -0.1005859375, + "entropy_loss": -0.042236328125, + "epoch": 0.321, + "grad_norm": 0.6393311144859387, + "k1_kl": 0.10107421875, + "k3_kl": 0.07080078125, + "kimi_kl": 0.26953125, + "learning_rate": 3.395e-07, + "loss": 0.0029, + "ppl": 0.01708984375, + "reward": 0.9890775084495544, + "reward_std": 0.001300699426792562, + "rewards/perpo_ocr_edit_distance_reward": 0.9890775680541992, "step": 1605, "temperature": 0.9 }, { - "advantages": -4.3720008306991076e-05, - "completion_length": 523.0, - "delta_ref_entropy_loss": 0.0472412109375, - "delta_ref_ppl": -0.03662109375, - "entropy_loss": -0.06805419921875, - "epoch": 0.6424, - "grad_norm": 0.9905438818860625, - "k1_kl": 0.03668212890625, - "k3_kl": 0.022796630859375, - "kimi_kl": 0.069580078125, - "learning_rate": 1.7879999999999999e-07, - "loss": 0.001, - "ppl": 0.036376953125, - "reward": 0.9819220900535583, - "reward_std": 0.0018070971127599478, - "rewards/perpo_ocr_edit_distance_reward": 0.9819221496582031, + "advantages": -7.067408205330139e-07, + "completion_length": 718.0, + "delta_ref_entropy_loss": 0.0703125, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.08544921875, + "epoch": 0.3212, + "grad_norm": 0.9864814494388773, + "k1_kl": 0.06591796875, + "k3_kl": 0.038330078125, + "kimi_kl": 0.08837890625, + "learning_rate": 3.394e-07, + "loss": 0.0015, + "ppl": 0.037353515625, + "reward": 0.9671888947486877, + "reward_std": 0.011658621951937675, + "rewards/perpo_ocr_edit_distance_reward": 0.9671890139579773, "step": 1606, "temperature": 0.9 }, { - "advantages": -7.152557657263969e-06, - "completion_length": 1322.0, - "delta_ref_entropy_loss": 0.05517578125, - "delta_ref_ppl": -0.054931640625, - "entropy_loss": -0.53662109375, - "epoch": 0.6428, - "grad_norm": 2.4083098609604, - "k1_kl": 0.0548095703125, - "k3_kl": 0.04388427734375, - "kimi_kl": 0.063720703125, - "learning_rate": 1.786e-07, - "loss": 0.0018, - "ppl": 0.30859375, - "reward": 0.6292123794555664, - "reward_std": 0.02370428352151066, - "rewards/perpo_ocr_edit_distance_reward": 0.6292124539613724, - "step": 1607, - "temperature": 0.9 - }, - { - "advantages": -5.381448204389017e-06, - "completion_length": 433.0, - "delta_ref_entropy_loss": 0.060791015625, - "delta_ref_ppl": -0.0714111328125, - "entropy_loss": -0.0645751953125, - "epoch": 0.6432, - "grad_norm": 1.7110974506675094, - "k1_kl": 0.071533203125, - "k3_kl": 0.04168701171875, - "kimi_kl": 0.08935546875, - "learning_rate": 1.7839999999999998e-07, + "advantages": -6.393875810317695e-05, + "completion_length": 737.0, + "delta_ref_entropy_loss": 0.0751953125, + "delta_ref_ppl": -0.0712890625, + "entropy_loss": -0.059326171875, + "epoch": 0.3214, + "grad_norm": 0.8928354569172933, + "k1_kl": 0.0712890625, + "k3_kl": 0.04052734375, + "kimi_kl": 0.103515625, + "learning_rate": 3.393e-07, "loss": 0.0017, - "ppl": 0.0355224609375, - "reward": 0.9143287837505341, - "reward_std": 0.010140843864064664, - "rewards/perpo_ocr_edit_distance_reward": 0.9143288433551788, + "ppl": 0.0286865234375, + "reward": 0.981959879398346, + "reward_std": 0.0013648958411067724, + "rewards/perpo_ocr_edit_distance_reward": 0.9819599390029907, + "step": 1607, + "temperature": 0.9 + }, + { + "advantages": -2.8780530101357726e-06, + "completion_length": 345.0, + "delta_ref_entropy_loss": 0.0390625, + "delta_ref_ppl": -0.11376953125, + "entropy_loss": -0.078125, + "epoch": 0.3216, + "grad_norm": 1.911318788091248, + "k1_kl": 0.11328125, + "k3_kl": 0.08251953125, + "kimi_kl": 0.31640625, + "learning_rate": 3.3919999999999997e-07, + "loss": 0.0033, + "ppl": 0.033203125, + "reward": 0.4920269548892975, + "reward_std": 0.019147509709000587, + "rewards/perpo_ocr_edit_distance_reward": 0.49202704429626465, "step": 1608, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 473.0, - "delta_ref_entropy_loss": 0.03045654296875, - "delta_ref_ppl": -0.017486572265625, - "entropy_loss": -0.0165252685546875, - "epoch": 0.6436, - "grad_norm": 0.014073973136419156, - "k1_kl": 0.017425537109375, - "k3_kl": 0.008411407470703125, - "kimi_kl": 0.0158843994140625, - "learning_rate": 1.782e-07, - "loss": 0.0003, - "ppl": 0.00632476806640625, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": -1.6519003111170605e-05, + "completion_length": 493.0, + "delta_ref_entropy_loss": 0.036865234375, + "delta_ref_ppl": -0.060302734375, + "entropy_loss": -0.034912109375, + "epoch": 0.3218, + "grad_norm": 0.9474720185125217, + "k1_kl": 0.060302734375, + "k3_kl": 0.03955078125, + "kimi_kl": 0.115234375, + "learning_rate": 3.391e-07, + "loss": 0.0016, + "ppl": 0.0146484375, + "reward": 0.9770580530166626, + "reward_std": 0.005049492232501507, + "rewards/perpo_ocr_edit_distance_reward": 0.9770581126213074, "step": 1609, "temperature": 0.9 }, { - "advantages": -0.00017689594824332744, - "completion_length": 589.0, - "delta_ref_entropy_loss": 0.02947998046875, - "delta_ref_ppl": -0.02008056640625, - "entropy_loss": -0.02655029296875, - "epoch": 0.644, - "grad_norm": 0.4547489465365209, - "k1_kl": 0.0201416015625, - "k3_kl": 0.011077880859375, - "kimi_kl": 0.02099609375, - "learning_rate": 1.7799999999999998e-07, - "loss": 0.0006, - "ppl": 0.0152587890625, - "reward": 0.9955000579357147, - "reward_std": 0.000334351891069673, - "rewards/perpo_ocr_edit_distance_reward": 0.9955001473426819, + "advantages": -9.809222319745459e-06, + "completion_length": 1902.0, + "delta_ref_entropy_loss": 0.0303955078125, + "delta_ref_ppl": -0.03857421875, + "entropy_loss": -0.06005859375, + "epoch": 0.322, + "grad_norm": 1.2376607148235759, + "k1_kl": 0.03857421875, + "k3_kl": 0.0244140625, + "kimi_kl": 0.05419921875, + "learning_rate": 3.39e-07, + "loss": 0.001, + "ppl": 0.0296630859375, + "reward": 0.9710022807121277, + "reward_std": 0.00510117644444108, + "rewards/perpo_ocr_edit_distance_reward": 0.9710023403167725, "step": 1610, "temperature": 0.9 }, { - "advantages": -0.00030549083476216765, - "completion_length": 699.5, - "delta_ref_entropy_loss": 0.056884765625, - "delta_ref_ppl": -0.02825927734375, - "entropy_loss": -0.04852294921875, - "epoch": 0.6444, - "grad_norm": 0.5216234362545236, - "k1_kl": 0.02825927734375, - "k3_kl": 0.0140228271484375, - "kimi_kl": 0.042999267578125, - "learning_rate": 1.7780000000000002e-07, - "loss": 0.0009, - "ppl": 0.02069091796875, - "reward": 0.9914376139640808, - "reward_std": 0.0008054006029851735, - "rewards/perpo_ocr_edit_distance_reward": 0.9914376735687256, + "advantages": -4.938671054333099e-07, + "completion_length": 497.0, + "delta_ref_entropy_loss": 0.015380859375, + "delta_ref_ppl": -0.07177734375, + "entropy_loss": -0.166015625, + "epoch": 0.3222, + "grad_norm": 2.1034219424382443, + "k1_kl": 0.07177734375, + "k3_kl": 0.05419921875, + "kimi_kl": 0.205078125, + "learning_rate": 3.3889999999999994e-07, + "loss": 0.0022, + "ppl": 0.07568359375, + "reward": 0.9180201292037964, + "reward_std": 0.18120571970939636, + "rewards/perpo_ocr_edit_distance_reward": 0.9180202484130859, "step": 1611, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 465.5, - "delta_ref_entropy_loss": 0.0255126953125, - "delta_ref_ppl": -0.02386474609375, - "entropy_loss": -0.018463134765625, - "epoch": 0.6448, - "grad_norm": 0.04058685700706187, - "k1_kl": 0.02386474609375, - "k3_kl": 0.01593017578125, - "kimi_kl": 0.038848876953125, - "learning_rate": 1.776e-07, - "loss": 0.0009, - "ppl": 0.007110595703125, - "reward": 0.9998347759246826, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.999834805727005, + "advantages": -4.955700660502771e-06, + "completion_length": 1197.0, + "delta_ref_entropy_loss": 0.07275390625, + "delta_ref_ppl": -0.0791015625, + "entropy_loss": -0.4375, + "epoch": 0.3224, + "grad_norm": 2.110990278851683, + "k1_kl": 0.0791015625, + "k3_kl": 0.06201171875, + "kimi_kl": 0.1181640625, + "learning_rate": 3.388e-07, + "loss": 0.0025, + "ppl": 0.26171875, + "reward": 0.8024513721466064, + "reward_std": 0.006789566017687321, + "rewards/perpo_ocr_edit_distance_reward": 0.8024514317512512, "step": 1612, "temperature": 0.9 }, { - "advantages": -9.559733734931797e-05, - "completion_length": 858.5, - "delta_ref_entropy_loss": 0.02294921875, - "delta_ref_ppl": -0.02886962890625, - "entropy_loss": -0.02337646484375, - "epoch": 0.6452, - "grad_norm": 1.200047010953052, - "k1_kl": 0.028839111328125, - "k3_kl": 0.0200347900390625, - "kimi_kl": 0.080657958984375, - "learning_rate": 1.774e-07, + "advantages": -1.7336436940240674e-05, + "completion_length": 854.0, + "delta_ref_entropy_loss": 0.027587890625, + "delta_ref_ppl": -0.03369140625, + "entropy_loss": -0.0245361328125, + "epoch": 0.3226, + "grad_norm": 0.5505732650852685, + "k1_kl": 0.03369140625, + "k3_kl": 0.0211181640625, + "kimi_kl": 0.06982421875, + "learning_rate": 3.387e-07, "loss": 0.0009, - "ppl": 0.011077880859375, - "reward": 0.9975870549678802, - "reward_std": 0.000326825596857816, - "rewards/perpo_ocr_edit_distance_reward": 0.997587114572525, + "ppl": 0.007781982421875, + "reward": 0.9937704801559448, + "reward_std": 0.003831929061561823, + "rewards/perpo_ocr_edit_distance_reward": 0.9937705993652344, "step": 1613, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 583.0, - "delta_ref_entropy_loss": 0.0391845703125, - "delta_ref_ppl": -0.05474853515625, - "entropy_loss": -0.011627197265625, - "epoch": 0.6456, - "grad_norm": 0.037653086007563355, - "k1_kl": 0.0548095703125, - "k3_kl": 0.036376953125, - "kimi_kl": 0.12744140625, - "learning_rate": 1.772e-07, - "loss": 0.0017, - "ppl": 0.00460052490234375, - "reward": 0.9991319477558136, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.999131977558136, + "advantages": -8.457899821223691e-05, + "completion_length": 444.0, + "delta_ref_entropy_loss": 0.0654296875, + "delta_ref_ppl": -0.09423828125, + "entropy_loss": -0.0225830078125, + "epoch": 0.3228, + "grad_norm": 0.27255380171660276, + "k1_kl": 0.0947265625, + "k3_kl": 0.0615234375, + "kimi_kl": 0.21875, + "learning_rate": 3.386e-07, + "loss": 0.0025, + "ppl": 0.006927490234375, + "reward": 0.9889434576034546, + "reward_std": 0.00040326849557459354, + "rewards/perpo_ocr_edit_distance_reward": 0.9889434576034546, "step": 1614, "temperature": 0.9 }, { - "advantages": -2.086162709247219e-07, - "completion_length": 727.0, - "delta_ref_entropy_loss": 0.064453125, - "delta_ref_ppl": -0.0745849609375, - "entropy_loss": -0.12103271484375, - "epoch": 0.646, - "grad_norm": 0.918817760026696, - "k1_kl": 0.0740966796875, - "k3_kl": 0.0517578125, - "kimi_kl": 0.193359375, - "learning_rate": 1.7699999999999998e-07, - "loss": 0.0021, - "ppl": 0.0687255859375, - "reward": 0.9278661012649536, - "reward_std": 0.07195863127708435, - "rewards/perpo_ocr_edit_distance_reward": 0.927866131067276, + "advantages": -2.1781241230200976e-05, + "completion_length": 625.0, + "delta_ref_entropy_loss": 0.07470703125, + "delta_ref_ppl": -0.07470703125, + "entropy_loss": -0.039306640625, + "epoch": 0.323, + "grad_norm": 1.3422193765413533, + "k1_kl": 0.07470703125, + "k3_kl": 0.043212890625, + "kimi_kl": 0.11767578125, + "learning_rate": 3.385e-07, + "loss": 0.0018, + "ppl": 0.0174560546875, + "reward": 0.9829224348068237, + "reward_std": 0.004590999335050583, + "rewards/perpo_ocr_edit_distance_reward": 0.9829226136207581, "step": 1615, "temperature": 0.9 }, { - "advantages": -6.761295662727207e-05, - "completion_length": 873.0, - "delta_ref_entropy_loss": 0.02447509765625, - "delta_ref_ppl": -0.020416259765625, - "entropy_loss": -0.04736328125, - "epoch": 0.6464, - "grad_norm": 0.6364708355778976, - "k1_kl": 0.020416259765625, - "k3_kl": 0.011505126953125, - "kimi_kl": 0.023223876953125, - "learning_rate": 1.768e-07, - "loss": 0.0005, - "ppl": 0.02459716796875, - "reward": 0.9636458456516266, - "reward_std": 0.0011137141846120358, - "rewards/perpo_ocr_edit_distance_reward": 0.9636459052562714, + "advantages": -4.9693244363879785e-05, + "completion_length": 545.0, + "delta_ref_entropy_loss": 0.07568359375, + "delta_ref_ppl": -0.0751953125, + "entropy_loss": -0.08056640625, + "epoch": 0.3232, + "grad_norm": 0.7459516526002816, + "k1_kl": 0.0751953125, + "k3_kl": 0.04150390625, + "kimi_kl": 0.10888671875, + "learning_rate": 3.3839999999999996e-07, + "loss": 0.0017, + "ppl": 0.0294189453125, + "reward": 0.9818372130393982, + "reward_std": 0.0010999152436852455, + "rewards/perpo_ocr_edit_distance_reward": 0.981837272644043, "step": 1616, "temperature": 0.9 }, { - "advantages": -3.2356808787881164e-06, - "completion_length": 259.5, - "delta_ref_entropy_loss": 0.05810546875, - "delta_ref_ppl": -0.065673828125, - "entropy_loss": -0.05322265625, - "epoch": 0.6468, - "grad_norm": 0.4868365764783007, - "k1_kl": 0.0653076171875, - "k3_kl": 0.0445556640625, - "kimi_kl": 0.1728515625, - "learning_rate": 1.766e-07, - "loss": 0.0018, - "ppl": 0.027130126953125, - "reward": 0.9836039245128632, - "reward_std": 0.0006096430588513613, - "rewards/perpo_ocr_edit_distance_reward": 0.9836039543151855, + "advantages": -6.215913163032383e-05, + "completion_length": 1312.0, + "delta_ref_entropy_loss": 0.043701171875, + "delta_ref_ppl": -0.034912109375, + "entropy_loss": -0.050537109375, + "epoch": 0.3234, + "grad_norm": 3.243855305975211, + "k1_kl": 0.034912109375, + "k3_kl": 0.0177001953125, + "kimi_kl": 0.041259765625, + "learning_rate": 3.3829999999999995e-07, + "loss": 0.0008, + "ppl": 0.022705078125, + "reward": 0.9950313568115234, + "reward_std": 0.0008588250493630767, + "rewards/perpo_ocr_edit_distance_reward": 0.9950315356254578, "step": 1617, "temperature": 0.9 }, { - "advantages": -5.542380677070469e-05, - "completion_length": 407.0, - "delta_ref_entropy_loss": 0.02349853515625, - "delta_ref_ppl": -0.01727294921875, - "entropy_loss": -0.02880859375, - "epoch": 0.6472, - "grad_norm": 0.35448496069095575, - "k1_kl": 0.017303466796875, - "k3_kl": 0.00946044921875, - "kimi_kl": 0.021087646484375, - "learning_rate": 1.764e-07, - "loss": 0.0004, - "ppl": 0.01702880859375, - "reward": 0.996452122926712, - "reward_std": 0.00029564567375928164, - "rewards/perpo_ocr_edit_distance_reward": 0.9964521527290344, + "advantages": -0.0001100216613849625, + "completion_length": 420.0, + "delta_ref_entropy_loss": 0.09033203125, + "delta_ref_ppl": -0.12353515625, + "entropy_loss": -0.03515625, + "epoch": 0.3236, + "grad_norm": 0.7581464598871548, + "k1_kl": 0.12353515625, + "k3_kl": 0.0771484375, + "kimi_kl": 0.26171875, + "learning_rate": 3.382e-07, + "loss": 0.0032, + "ppl": 0.0155029296875, + "reward": 0.9950055480003357, + "reward_std": 0.0009062077151611447, + "rewards/perpo_ocr_edit_distance_reward": 0.9950056672096252, "step": 1618, "temperature": 0.9 }, { - "advantages": -0.00012474401000872604, - "completion_length": 524.0, - "delta_ref_entropy_loss": 0.040771484375, - "delta_ref_ppl": -0.0244140625, - "entropy_loss": -0.021697998046875, - "epoch": 0.6476, - "grad_norm": 0.4056530132472409, - "k1_kl": 0.0245361328125, - "k3_kl": 0.01300048828125, - "kimi_kl": 0.0357666015625, - "learning_rate": 1.7619999999999998e-07, - "loss": 0.0006, - "ppl": 0.009521484375, - "reward": 0.9960814714431763, - "reward_std": 0.0005740004125982523, - "rewards/perpo_ocr_edit_distance_reward": 0.9960815012454987, + "advantages": 4.3341092350601684e-06, + "completion_length": 93.0, + "delta_ref_entropy_loss": 0.1494140625, + "delta_ref_ppl": -0.337890625, + "entropy_loss": -0.1298828125, + "epoch": 0.3238, + "grad_norm": 3.409734194479474, + "k1_kl": 0.337890625, + "k3_kl": 0.2353515625, + "kimi_kl": 1.0234375, + "learning_rate": 3.381e-07, + "loss": 0.0094, + "ppl": 0.05126953125, + "reward": 0.9657614231109619, + "reward_std": 0.007768581621348858, + "rewards/perpo_ocr_edit_distance_reward": 0.9657614827156067, "step": 1619, "temperature": 0.9 }, { - "advantages": -1.5386514860438183e-05, - "completion_length": 388.5, - "delta_ref_entropy_loss": 0.079833984375, - "delta_ref_ppl": -0.07958984375, - "entropy_loss": -0.062469482421875, - "epoch": 0.648, - "grad_norm": 0.6786351386978973, - "k1_kl": 0.079833984375, - "k3_kl": 0.05126953125, - "kimi_kl": 0.138916015625, - "learning_rate": 1.76e-07, - "loss": 0.0021, - "ppl": 0.03662109375, - "reward": 0.9611974358558655, - "reward_std": 0.001332552288658917, - "rewards/perpo_ocr_edit_distance_reward": 0.9611974656581879, + "advantages": -1.7029898913278885e-07, + "completion_length": 810.0, + "delta_ref_entropy_loss": 0.043212890625, + "delta_ref_ppl": -0.0634765625, + "entropy_loss": -0.134765625, + "epoch": 0.324, + "grad_norm": 1.437331100789495, + "k1_kl": 0.0634765625, + "k3_kl": 0.04345703125, + "kimi_kl": 0.11767578125, + "learning_rate": 3.38e-07, + "loss": 0.0017, + "ppl": 0.06640625, + "reward": 0.19724024832248688, + "reward_std": 0.040851786732673645, + "rewards/perpo_ocr_edit_distance_reward": 0.19724026322364807, "step": 1620, "temperature": 0.9 }, { - "advantages": -0.0003003903798344254, - "completion_length": 1145.0, - "delta_ref_entropy_loss": 0.02978515625, - "delta_ref_ppl": -0.05889892578125, - "entropy_loss": -0.0382080078125, - "epoch": 0.6484, - "grad_norm": 5.321876225494304, - "k1_kl": 0.05889892578125, - "k3_kl": 0.047210693359375, - "kimi_kl": 0.2503662109375, - "learning_rate": 1.758e-07, - "loss": 0.0022, - "ppl": 0.017425537109375, - "reward": 0.9459295272827148, - "reward_std": 0.0008478958043269813, - "rewards/perpo_ocr_edit_distance_reward": 0.9459295570850372, + "advantages": -1.2074198821210302e-05, + "completion_length": 1269.0, + "delta_ref_entropy_loss": 0.0771484375, + "delta_ref_ppl": -0.059814453125, + "entropy_loss": -0.12060546875, + "epoch": 0.3242, + "grad_norm": 2.0418921183010665, + "k1_kl": 0.059814453125, + "k3_kl": 0.030029296875, + "kimi_kl": 0.06494140625, + "learning_rate": 3.379e-07, + "loss": 0.0012, + "ppl": 0.0634765625, + "reward": 0.9525370597839355, + "reward_std": 0.0034285071305930614, + "rewards/perpo_ocr_edit_distance_reward": 0.9525371193885803, "step": 1621, "temperature": 0.9 }, { - "advantages": -7.66345493730114e-08, - "completion_length": 688.5, - "delta_ref_entropy_loss": 0.0465087890625, - "delta_ref_ppl": -0.04296875, - "entropy_loss": -0.046142578125, - "epoch": 0.6488, - "grad_norm": 1.092735549837488, - "k1_kl": 0.04296875, - "k3_kl": 0.024322509765625, - "kimi_kl": 0.05206298828125, - "learning_rate": 1.756e-07, - "loss": 0.001, - "ppl": 0.02532958984375, - "reward": 0.9593437314033508, - "reward_std": 0.09512384980916977, - "rewards/perpo_ocr_edit_distance_reward": 0.9593437910079956, + "advantages": -6.965229113120586e-05, + "completion_length": 390.0, + "delta_ref_entropy_loss": 0.091796875, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.045166015625, + "epoch": 0.3244, + "grad_norm": 0.6742829675195477, + "k1_kl": 0.0703125, + "k3_kl": 0.039794921875, + "kimi_kl": 0.1103515625, + "learning_rate": 3.3779999999999997e-07, + "loss": 0.0017, + "ppl": 0.0157470703125, + "reward": 0.9812147617340088, + "reward_std": 0.000511264952365309, + "rewards/perpo_ocr_edit_distance_reward": 0.9812148213386536, "step": 1622, "temperature": 0.9 }, { - "advantages": -9.525673885946162e-05, - "completion_length": 446.5, - "delta_ref_entropy_loss": 0.0465087890625, - "delta_ref_ppl": -0.0626220703125, - "entropy_loss": -0.02716064453125, - "epoch": 0.6492, - "grad_norm": 0.778037293683809, - "k1_kl": 0.0626220703125, - "k3_kl": 0.0418701171875, - "kimi_kl": 0.137451171875, - "learning_rate": 1.754e-07, - "loss": 0.0018, - "ppl": 0.01531982421875, - "reward": 0.9962552487850189, - "reward_std": 0.0005670713726431131, - "rewards/perpo_ocr_edit_distance_reward": 0.9962553679943085, + "advantages": -7.985319825820625e-05, + "completion_length": 862.0, + "delta_ref_entropy_loss": 0.041015625, + "delta_ref_ppl": -0.06005859375, + "entropy_loss": -0.0299072265625, + "epoch": 0.3246, + "grad_norm": 0.4358431575313319, + "k1_kl": 0.06005859375, + "k3_kl": 0.037109375, + "kimi_kl": 0.1513671875, + "learning_rate": 3.3769999999999996e-07, + "loss": 0.0016, + "ppl": 0.01220703125, + "reward": 0.9918453097343445, + "reward_std": 0.0006463615573011339, + "rewards/perpo_ocr_edit_distance_reward": 0.991845428943634, "step": 1623, "temperature": 0.9 }, { - "advantages": -6.80301891406998e-05, - "completion_length": 894.0, - "delta_ref_entropy_loss": 0.08544921875, - "delta_ref_ppl": -0.0478515625, - "entropy_loss": -0.0960693359375, - "epoch": 0.6496, - "grad_norm": 1.286884839944591, - "k1_kl": 0.04779052734375, - "k3_kl": 0.02606201171875, - "kimi_kl": 0.05096435546875, - "learning_rate": 1.7519999999999998e-07, - "loss": 0.0011, - "ppl": 0.05029296875, - "reward": 0.9271104633808136, - "reward_std": 0.0011967344908043742, - "rewards/perpo_ocr_edit_distance_reward": 0.9271105825901031, + "advantages": 8.514949634275126e-09, + "completion_length": 885.0, + "delta_ref_entropy_loss": 0.107421875, + "delta_ref_ppl": -0.0986328125, + "entropy_loss": -0.1865234375, + "epoch": 0.3248, + "grad_norm": 1.7412220502640157, + "k1_kl": 0.0986328125, + "k3_kl": 0.064453125, + "kimi_kl": 0.11865234375, + "learning_rate": 3.376e-07, + "loss": 0.0026, + "ppl": 0.10400390625, + "reward": 0.7874619960784912, + "reward_std": 0.003730301046743989, + "rewards/perpo_ocr_edit_distance_reward": 0.7874619960784912, "step": 1624, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 589.0, - "delta_ref_entropy_loss": 0.029052734375, - "delta_ref_ppl": -0.02935791015625, - "entropy_loss": -0.019317626953125, - "epoch": 0.65, - "grad_norm": 0.82773902340314, - "k1_kl": 0.02935791015625, - "k3_kl": 0.01959228515625, - "kimi_kl": 0.09344482421875, - "learning_rate": 1.75e-07, - "loss": 0.0008, - "ppl": 0.00868988037109375, - "reward": 0.9996062219142914, - "reward_std": 0.00013121300435159355, - "rewards/perpo_ocr_edit_distance_reward": 0.9996062219142914, + "advantages": -1.9541808796930127e-05, + "completion_length": 681.0, + "delta_ref_entropy_loss": 0.06005859375, + "delta_ref_ppl": -0.0849609375, + "entropy_loss": -0.051513671875, + "epoch": 0.325, + "grad_norm": 1.1392626252160476, + "k1_kl": 0.0849609375, + "k3_kl": 0.05224609375, + "kimi_kl": 0.142578125, + "learning_rate": 3.375e-07, + "loss": 0.0021, + "ppl": 0.02392578125, + "reward": 0.9951995611190796, + "reward_std": 0.0012063406175002456, + "rewards/perpo_ocr_edit_distance_reward": 0.9951995611190796, "step": 1625, "temperature": 0.9 }, { - "advantages": -5.744610643887427e-05, - "completion_length": 1514.5, - "delta_ref_entropy_loss": 0.0179443359375, - "delta_ref_ppl": -0.01416015625, - "entropy_loss": -0.03631591796875, - "epoch": 0.6504, - "grad_norm": 71.6017221642426, - "k1_kl": 0.0141448974609375, - "k3_kl": 0.039825439453125, - "kimi_kl": 0.0274658203125, - "learning_rate": 1.748e-07, - "loss": 0.0017, - "ppl": 0.0198974609375, - "reward": 0.9941720068454742, - "reward_std": 0.0016588052531005815, - "rewards/perpo_ocr_edit_distance_reward": 0.994172066450119, + "advantages": -3.529446621541865e-05, + "completion_length": 402.0, + "delta_ref_entropy_loss": 0.0439453125, + "delta_ref_ppl": -0.10595703125, + "entropy_loss": -0.04248046875, + "epoch": 0.3252, + "grad_norm": 1.0406751022178875, + "k1_kl": 0.10595703125, + "k3_kl": 0.0771484375, + "kimi_kl": 0.263671875, + "learning_rate": 3.3739999999999994e-07, + "loss": 0.0031, + "ppl": 0.016357421875, + "reward": 0.9939448833465576, + "reward_std": 0.0013462856877595186, + "rewards/perpo_ocr_edit_distance_reward": 0.9939448833465576, "step": 1626, "temperature": 0.9 }, { - "advantages": -3.0355795388459228e-05, - "completion_length": 363.0, - "delta_ref_entropy_loss": 0.03515625, - "delta_ref_ppl": -0.01873779296875, - "entropy_loss": -0.02728271484375, - "epoch": 0.6508, - "grad_norm": 0.6744112201890365, - "k1_kl": 0.0186767578125, - "k3_kl": 0.00933837890625, - "kimi_kl": 0.01678466796875, - "learning_rate": 1.746e-07, - "loss": 0.0004, - "ppl": 0.009979248046875, - "reward": 0.9995285272598267, - "reward_std": 0.0004410285619087517, - "rewards/perpo_ocr_edit_distance_reward": 0.999528557062149, + "advantages": -7.76563410909148e-06, + "completion_length": 143.0, + "delta_ref_entropy_loss": 0.11376953125, + "delta_ref_ppl": -0.345703125, + "entropy_loss": -0.1083984375, + "epoch": 0.3254, + "grad_norm": 1.6804874220964123, + "k1_kl": 0.345703125, + "k3_kl": 0.265625, + "kimi_kl": 1.109375, + "learning_rate": 3.373e-07, + "loss": 0.0106, + "ppl": 0.039306640625, + "reward": 0.9865281581878662, + "reward_std": 0.00210026977583766, + "rewards/perpo_ocr_edit_distance_reward": 0.986528217792511, "step": 1627, "temperature": 0.9 }, { - "advantages": -1.3755901363765588e-05, - "completion_length": 726.0, - "delta_ref_entropy_loss": 0.048828125, - "delta_ref_ppl": -0.04046630859375, - "entropy_loss": -0.035888671875, - "epoch": 0.6512, - "grad_norm": 140.73581618987478, - "k1_kl": 0.04046630859375, - "k3_kl": 0.4208984375, - "kimi_kl": 0.09466552734375, - "learning_rate": 1.744e-07, - "loss": 0.0169, - "ppl": 0.021728515625, - "reward": 0.9852463603019714, - "reward_std": 0.002340231672860682, - "rewards/perpo_ocr_edit_distance_reward": 0.9852464199066162, + "advantages": -6.467955972766504e-05, + "completion_length": 1123.0, + "delta_ref_entropy_loss": 0.06298828125, + "delta_ref_ppl": -0.05810546875, + "entropy_loss": -0.07568359375, + "epoch": 0.3256, + "grad_norm": 1.1511925529263676, + "k1_kl": 0.05810546875, + "k3_kl": 0.03369140625, + "kimi_kl": 0.0712890625, + "learning_rate": 3.372e-07, + "loss": 0.0014, + "ppl": 0.034423828125, + "reward": 0.9786099791526794, + "reward_std": 0.001216820441186428, + "rewards/perpo_ocr_edit_distance_reward": 0.9786100387573242, "step": 1628, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 277.5, - "delta_ref_entropy_loss": 0.06707763671875, - "delta_ref_ppl": -0.04833984375, - "entropy_loss": -0.06591796875, - "epoch": 0.6516, - "grad_norm": 1.0080647800169629, - "k1_kl": 0.0484619140625, - "k3_kl": 0.0284423828125, - "kimi_kl": 0.05712890625, - "learning_rate": 1.7419999999999998e-07, + "advantages": -4.742827059089905e-06, + "completion_length": 841.0, + "delta_ref_entropy_loss": 0.060302734375, + "delta_ref_ppl": -0.0625, + "entropy_loss": -0.07080078125, + "epoch": 0.3258, + "grad_norm": 1.0562515889036388, + "k1_kl": 0.0625, + "k3_kl": 0.034912109375, + "kimi_kl": 0.0830078125, + "learning_rate": 3.371e-07, "loss": 0.0014, - "ppl": 0.0345458984375, - "reward": 0.9006098806858063, - "reward_std": 0.01816902495920658, - "rewards/perpo_ocr_edit_distance_reward": 0.900609940290451, + "ppl": 0.037841796875, + "reward": 0.986996054649353, + "reward_std": 0.0016893220599740744, + "rewards/perpo_ocr_edit_distance_reward": 0.986996054649353, "step": 1629, "temperature": 0.9 }, { - "advantages": -6.624630714213708e-06, - "completion_length": 427.5, - "delta_ref_entropy_loss": 0.03662109375, - "delta_ref_ppl": -0.035888671875, - "entropy_loss": -0.0482177734375, - "epoch": 0.652, - "grad_norm": 0.7008909474714937, - "k1_kl": 0.03607177734375, - "k3_kl": 0.02276611328125, - "kimi_kl": 0.0555419921875, - "learning_rate": 1.7399999999999997e-07, - "loss": 0.0009, - "ppl": 0.02471923828125, - "reward": 0.9810745716094971, - "reward_std": 0.002843013731762767, - "rewards/perpo_ocr_edit_distance_reward": 0.9810746610164642, + "advantages": 5.46659748579259e-06, + "completion_length": 65.0, + "delta_ref_entropy_loss": 0.146484375, + "delta_ref_ppl": -0.57421875, + "entropy_loss": -0.1826171875, + "epoch": 0.326, + "grad_norm": 4.000264145131009, + "k1_kl": 0.57421875, + "k3_kl": 0.458984375, + "kimi_kl": 2.078125, + "learning_rate": 3.37e-07, + "loss": 0.0183, + "ppl": 0.07763671875, + "reward": 0.9806116819381714, + "reward_std": 0.004575501661747694, + "rewards/perpo_ocr_edit_distance_reward": 0.9806116819381714, "step": 1630, "temperature": 0.9 }, { - "advantages": -3.549456596374512e-05, - "completion_length": 466.0, - "delta_ref_entropy_loss": 0.0369873046875, - "delta_ref_ppl": -0.02581787109375, - "entropy_loss": -0.021759033203125, - "epoch": 0.6524, - "grad_norm": 0.3463773600765912, - "k1_kl": 0.02593994140625, - "k3_kl": 0.013824462890625, - "kimi_kl": 0.029541015625, - "learning_rate": 1.738e-07, - "loss": 0.0006, - "ppl": 0.00974273681640625, - "reward": 0.9982864260673523, - "reward_std": 0.0005497059319168329, - "rewards/perpo_ocr_edit_distance_reward": 0.9982864856719971, + "advantages": -4.9216409934160765e-06, + "completion_length": 32.0, + "delta_ref_entropy_loss": 0.2265625, + "delta_ref_ppl": -1.0859375, + "entropy_loss": -0.234375, + "epoch": 0.3262, + "grad_norm": 6.0491146707039585, + "k1_kl": 1.09375, + "k3_kl": 0.8984375, + "kimi_kl": 3.875, + "learning_rate": 3.3689999999999995e-07, + "loss": 0.036, + "ppl": 0.08642578125, + "reward": 0.9320389032363892, + "reward_std": 0.013730247505009174, + "rewards/perpo_ocr_edit_distance_reward": 0.9320389032363892, "step": 1631, "temperature": 0.9 }, { - "advantages": -0.00029812540327611714, - "completion_length": 987.0, - "delta_ref_entropy_loss": 0.02215576171875, - "delta_ref_ppl": -0.043548583984375, - "entropy_loss": -0.088775634765625, - "epoch": 0.6528, - "grad_norm": 1.830737415916828, - "k1_kl": 0.043548583984375, - "k3_kl": 0.0338287353515625, - "kimi_kl": 0.0830078125, - "learning_rate": 1.736e-07, - "loss": 0.0017, - "ppl": 0.0410614013671875, - "reward": 0.8049885332584381, - "reward_std": 0.18519112467765808, - "rewards/perpo_ocr_edit_distance_reward": 0.8049886524677277, + "advantages": -1.5922955753921997e-06, + "completion_length": 872.0, + "delta_ref_entropy_loss": 0.02685546875, + "delta_ref_ppl": -0.055908203125, + "entropy_loss": -0.029052734375, + "epoch": 0.3264, + "grad_norm": 0.3467784772252401, + "k1_kl": 0.05615234375, + "k3_kl": 0.03857421875, + "kimi_kl": 0.1171875, + "learning_rate": 3.368e-07, + "loss": 0.0015, + "ppl": 0.0101318359375, + "reward": 0.9927475452423096, + "reward_std": 0.005294490605592728, + "rewards/perpo_ocr_edit_distance_reward": 0.9927476048469543, "step": 1632, "temperature": 0.9 }, { - "advantages": -1.2568065926643612e-05, - "completion_length": 644.5, - "delta_ref_entropy_loss": 0.1016845703125, - "delta_ref_ppl": -0.06500244140625, - "entropy_loss": -0.18670654296875, - "epoch": 0.6532, - "grad_norm": 3.0699341084120464, - "k1_kl": 0.06475830078125, - "k3_kl": 0.033050537109375, - "kimi_kl": 0.05682373046875, - "learning_rate": 1.7339999999999998e-07, - "loss": 0.0013, - "ppl": 0.108245849609375, - "reward": 0.719963014125824, - "reward_std": 0.008401598839554936, - "rewards/perpo_ocr_edit_distance_reward": 0.7199630290269852, + "advantages": -3.37021701852791e-05, + "completion_length": 581.0, + "delta_ref_entropy_loss": 0.080078125, + "delta_ref_ppl": -0.095703125, + "entropy_loss": -0.0498046875, + "epoch": 0.3266, + "grad_norm": 0.6268973775160702, + "k1_kl": 0.095703125, + "k3_kl": 0.05908203125, + "kimi_kl": 0.1865234375, + "learning_rate": 3.367e-07, + "loss": 0.0024, + "ppl": 0.0201416015625, + "reward": 0.9890663027763367, + "reward_std": 0.0004055578901898116, + "rewards/perpo_ocr_edit_distance_reward": 0.9890663027763367, "step": 1633, "temperature": 0.9 }, { - "advantages": -4.296217697907778e-05, - "completion_length": 603.5, - "delta_ref_entropy_loss": 0.02032470703125, - "delta_ref_ppl": -0.01593017578125, - "entropy_loss": -0.0166015625, - "epoch": 0.6536, - "grad_norm": 0.29706195519356127, - "k1_kl": 0.01593017578125, - "k3_kl": 0.0092315673828125, - "kimi_kl": 0.02099609375, - "learning_rate": 1.732e-07, - "loss": 0.0004, - "ppl": 0.006591796875, - "reward": 0.9997687041759491, - "reward_std": 0.0002502994393580593, - "rewards/perpo_ocr_edit_distance_reward": 0.9997687339782715, + "advantages": -2.9802324661432067e-06, + "completion_length": 30.0, + "delta_ref_entropy_loss": 0.05078125, + "delta_ref_ppl": -1.09375, + "entropy_loss": -0.2890625, + "epoch": 0.3268, + "grad_norm": 5.792420592932055, + "k1_kl": 1.09375, + "k3_kl": 0.94140625, + "kimi_kl": 4.40625, + "learning_rate": 3.366e-07, + "loss": 0.0377, + "ppl": 0.10498046875, + "reward": 0.9430719017982483, + "reward_std": 0.01136736012995243, + "rewards/perpo_ocr_edit_distance_reward": 0.9430720210075378, "step": 1634, "temperature": 0.9 }, { - "advantages": -6.364924684021389e-05, - "completion_length": 491.5, - "delta_ref_entropy_loss": 0.037109375, - "delta_ref_ppl": -0.03466796875, - "entropy_loss": -0.0262451171875, - "epoch": 0.654, - "grad_norm": 0.46820554177653645, - "k1_kl": 0.03466796875, - "k3_kl": 0.022003173828125, - "kimi_kl": 0.070068359375, - "learning_rate": 1.7299999999999997e-07, - "loss": 0.0009, - "ppl": 0.01214599609375, - "reward": 0.9985023140907288, - "reward_std": 0.00057070299226325, - "rewards/perpo_ocr_edit_distance_reward": 0.9985023736953735, + "advantages": 2.7247838261246216e-06, + "completion_length": 683.0, + "delta_ref_entropy_loss": 0.09912109375, + "delta_ref_ppl": -0.08740234375, + "entropy_loss": -0.1767578125, + "epoch": 0.327, + "grad_norm": 1.2824250940605166, + "k1_kl": 0.0869140625, + "k3_kl": 0.0537109375, + "kimi_kl": 0.11865234375, + "learning_rate": 3.3650000000000003e-07, + "loss": 0.0021, + "ppl": 0.0888671875, + "reward": 0.7379946112632751, + "reward_std": 0.003024999750778079, + "rewards/perpo_ocr_edit_distance_reward": 0.7379946112632751, "step": 1635, "temperature": 0.9 }, { - "advantages": -0.00018305012417840771, - "completion_length": 406.0, - "delta_ref_entropy_loss": 0.03497314453125, - "delta_ref_ppl": -0.03668212890625, - "entropy_loss": -0.021728515625, - "epoch": 0.6544, - "grad_norm": 0.36539969242119286, - "k1_kl": 0.03668212890625, - "k3_kl": 0.02252197265625, - "kimi_kl": 0.0557861328125, - "learning_rate": 1.728e-07, - "loss": 0.0011, - "ppl": 0.0105743408203125, - "reward": 0.9535046517848969, - "reward_std": 0.00037268023152137175, - "rewards/perpo_ocr_edit_distance_reward": 0.953504741191864, + "advantages": -7.592780457343906e-05, + "completion_length": 361.0, + "delta_ref_entropy_loss": 0.115234375, + "delta_ref_ppl": -0.1484375, + "entropy_loss": -0.0654296875, + "epoch": 0.3272, + "grad_norm": 1.210196730843417, + "k1_kl": 0.1484375, + "k3_kl": 0.10205078125, + "kimi_kl": 0.3984375, + "learning_rate": 3.3639999999999997e-07, + "loss": 0.0042, + "ppl": 0.0322265625, + "reward": 0.9916328191757202, + "reward_std": 0.0010216481750831008, + "rewards/perpo_ocr_edit_distance_reward": 0.9916329383850098, "step": 1636, "temperature": 0.9 }, { - "advantages": -2.4020672611868576e-05, - "completion_length": 296.0, - "delta_ref_entropy_loss": 0.062744140625, - "delta_ref_ppl": -0.0623779296875, - "entropy_loss": -0.0526123046875, - "epoch": 0.6548, - "grad_norm": 1.0722602362839824, - "k1_kl": 0.0621337890625, - "k3_kl": 0.036865234375, - "kimi_kl": 0.0694580078125, - "learning_rate": 1.726e-07, - "loss": 0.0015, - "ppl": 0.03009033203125, - "reward": 0.9385689496994019, - "reward_std": 0.010933803743682802, - "rewards/perpo_ocr_edit_distance_reward": 0.9385690093040466, + "advantages": -7.600444223498926e-05, + "completion_length": 1073.0, + "delta_ref_entropy_loss": 0.03125, + "delta_ref_ppl": -0.045654296875, + "entropy_loss": -0.042236328125, + "epoch": 0.3274, + "grad_norm": 0.6447821011824084, + "k1_kl": 0.045654296875, + "k3_kl": 0.0294189453125, + "kimi_kl": 0.0830078125, + "learning_rate": 3.3629999999999996e-07, + "loss": 0.0012, + "ppl": 0.019287109375, + "reward": 0.9905930161476135, + "reward_std": 0.0005722393980249763, + "rewards/perpo_ocr_edit_distance_reward": 0.9905930757522583, "step": 1637, "temperature": 0.9 }, { - "advantages": -8.514949634275126e-09, - "completion_length": 928.5, - "delta_ref_entropy_loss": 0.04705810546875, - "delta_ref_ppl": -0.0235595703125, - "entropy_loss": -0.09112548828125, - "epoch": 0.6552, - "grad_norm": 1.2419282728062313, - "k1_kl": 0.023681640625, - "k3_kl": 0.018157958984375, - "kimi_kl": 0.030517578125, - "learning_rate": 1.7239999999999998e-07, - "loss": 0.0007, - "ppl": 0.0508575439453125, - "reward": 0.9833630919456482, - "reward_std": 0.0018708637217059731, - "rewards/perpo_ocr_edit_distance_reward": 0.9833630919456482, + "advantages": -5.0067901611328125e-06, + "completion_length": 635.0, + "delta_ref_entropy_loss": 0.1181640625, + "delta_ref_ppl": -0.10107421875, + "entropy_loss": -0.10791015625, + "epoch": 0.3276, + "grad_norm": 1.1389490431933114, + "k1_kl": 0.1005859375, + "k3_kl": 0.06103515625, + "kimi_kl": 0.1953125, + "learning_rate": 3.362e-07, + "loss": 0.0024, + "ppl": 0.055419921875, + "reward": 0.9683611989021301, + "reward_std": 0.003305908292531967, + "rewards/perpo_ocr_edit_distance_reward": 0.9683611989021301, "step": 1638, "temperature": 0.9 }, { - "advantages": -2.1287373641598606e-08, - "completion_length": 225.0, - "delta_ref_entropy_loss": -0.01336669921875, - "delta_ref_ppl": -0.2706298828125, - "entropy_loss": -0.19281005859375, - "epoch": 0.6556, - "grad_norm": 4.672622662400486, - "k1_kl": 0.2686767578125, - "k3_kl": 0.257904052734375, - "kimi_kl": 1.622314453125, - "learning_rate": 1.722e-07, - "loss": 0.0103, - "ppl": 0.1432952880859375, - "reward": 0.8414452075958252, - "reward_std": 0.17971575260162354, - "rewards/perpo_ocr_edit_distance_reward": 0.8414452075958252, + "advantages": -2.3228782083606347e-05, + "completion_length": 821.0, + "delta_ref_entropy_loss": 0.06396484375, + "delta_ref_ppl": -0.08984375, + "entropy_loss": -0.076171875, + "epoch": 0.3278, + "grad_norm": 6.3887360727813975, + "k1_kl": 0.09033203125, + "k3_kl": 0.0888671875, + "kimi_kl": 0.1875, + "learning_rate": 3.361e-07, + "loss": 0.0036, + "ppl": 0.043212890625, + "reward": 0.6568748950958252, + "reward_std": 0.0017340496415272355, + "rewards/perpo_ocr_edit_distance_reward": 0.65687495470047, "step": 1639, "temperature": 0.9 }, { - "advantages": -5.475112629937939e-06, - "completion_length": 1212.5, - "delta_ref_entropy_loss": 0.04827880859375, - "delta_ref_ppl": -0.031707763671875, - "entropy_loss": -0.06103515625, - "epoch": 0.656, - "grad_norm": 4.841066078198102, - "k1_kl": 0.03179931640625, - "k3_kl": 0.027313232421875, - "kimi_kl": 0.0615234375, - "learning_rate": 1.7199999999999998e-07, - "loss": 0.0011, - "ppl": 0.037109375, - "reward": 0.9777486324310303, - "reward_std": 0.003689856850542128, - "rewards/perpo_ocr_edit_distance_reward": 0.9777486622333527, + "advantages": -8.824894030112773e-05, + "completion_length": 461.0, + "delta_ref_entropy_loss": 0.05908203125, + "delta_ref_ppl": -0.1064453125, + "entropy_loss": -0.0306396484375, + "epoch": 0.328, + "grad_norm": 0.4742825184466371, + "k1_kl": 0.1064453125, + "k3_kl": 0.0732421875, + "kimi_kl": 0.26953125, + "learning_rate": 3.36e-07, + "loss": 0.003, + "ppl": 0.01220703125, + "reward": 0.9485624432563782, + "reward_std": 0.0005754028679803014, + "rewards/perpo_ocr_edit_distance_reward": 0.948562502861023, "step": 1640, "temperature": 0.9 }, { - "advantages": -3.704428945638938e-05, - "completion_length": 268.5, - "delta_ref_entropy_loss": 0.0709228515625, - "delta_ref_ppl": -0.065185546875, - "entropy_loss": -0.054931640625, - "epoch": 0.6564, - "grad_norm": 2.3113287112747734, - "k1_kl": 0.0654296875, - "k3_kl": 0.0379638671875, - "kimi_kl": 0.10205078125, - "learning_rate": 1.718e-07, - "loss": 0.0016, - "ppl": 0.02398681640625, - "reward": 0.9882711172103882, - "reward_std": 0.0006490838713943958, - "rewards/perpo_ocr_edit_distance_reward": 0.988271176815033, + "advantages": 4.39030809502583e-05, + "completion_length": 465.0, + "delta_ref_entropy_loss": 0.0296630859375, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.0284423828125, + "epoch": 0.3282, + "grad_norm": 0.7435968028270543, + "k1_kl": 0.06689453125, + "k3_kl": 0.047119140625, + "kimi_kl": 0.208984375, + "learning_rate": 3.359e-07, + "loss": 0.0018, + "ppl": 0.01080322265625, + "reward": 0.9985645413398743, + "reward_std": 0.0006756830844096839, + "rewards/perpo_ocr_edit_distance_reward": 0.998564600944519, "step": 1641, "temperature": 0.9 }, { - "advantages": -2.7503287128638476e-05, - "completion_length": 669.5, - "delta_ref_entropy_loss": 0.03643798828125, - "delta_ref_ppl": -0.03167724609375, - "entropy_loss": -0.02325439453125, - "epoch": 0.6568, - "grad_norm": 0.26070067076099923, - "k1_kl": 0.03167724609375, - "k3_kl": 0.019561767578125, - "kimi_kl": 0.079833984375, - "learning_rate": 1.716e-07, - "loss": 0.0008, - "ppl": 0.008758544921875, - "reward": 0.9949747025966644, - "reward_std": 0.0002596195845399052, - "rewards/perpo_ocr_edit_distance_reward": 0.9949747622013092, + "advantages": -8.07046890258789e-05, + "completion_length": 531.0, + "delta_ref_entropy_loss": 0.00640869140625, + "delta_ref_ppl": -0.04248046875, + "entropy_loss": -0.045166015625, + "epoch": 0.3284, + "grad_norm": 0.41907163485646737, + "k1_kl": 0.04248046875, + "k3_kl": 0.03564453125, + "kimi_kl": 0.142578125, + "learning_rate": 3.358e-07, + "loss": 0.0015, + "ppl": 0.015869140625, + "reward": 0.9344309568405151, + "reward_std": 0.0008498944225721061, + "rewards/perpo_ocr_edit_distance_reward": 0.9344310760498047, "step": 1642, "temperature": 0.9 }, { - "advantages": -0.00031621115704183467, - "completion_length": 326.5, - "delta_ref_entropy_loss": 0.011993408203125, - "delta_ref_ppl": -0.05615234375, - "entropy_loss": -0.03350830078125, - "epoch": 0.6572, - "grad_norm": 1.4023421854806104, - "k1_kl": 0.05609130859375, - "k3_kl": 0.04718017578125, - "kimi_kl": 0.154052734375, - "learning_rate": 1.7139999999999999e-07, - "loss": 0.0022, - "ppl": 0.016815185546875, - "reward": 0.9949429929256439, - "reward_std": 0.0004181081021670252, - "rewards/perpo_ocr_edit_distance_reward": 0.9949430525302887, + "advantages": -0.00010139602090930566, + "completion_length": 756.0, + "delta_ref_entropy_loss": 0.044677734375, + "delta_ref_ppl": -0.0439453125, + "entropy_loss": -0.0179443359375, + "epoch": 0.3286, + "grad_norm": 0.21384882566358318, + "k1_kl": 0.0439453125, + "k3_kl": 0.0234375, + "kimi_kl": 0.09033203125, + "learning_rate": 3.3569999999999997e-07, + "loss": 0.001, + "ppl": 0.005096435546875, + "reward": 0.998077929019928, + "reward_std": 0.0003199287748429924, + "rewards/perpo_ocr_edit_distance_reward": 0.9980779886245728, "step": 1643, "temperature": 0.9 }, { - "advantages": -1.4492444279312622e-05, - "completion_length": 376.5, - "delta_ref_entropy_loss": 0.045166015625, - "delta_ref_ppl": -0.0379638671875, - "entropy_loss": -0.04046630859375, - "epoch": 0.6576, - "grad_norm": 0.308731562938758, - "k1_kl": 0.037841796875, - "k3_kl": 0.022491455078125, - "kimi_kl": 0.06939697265625, - "learning_rate": 1.7119999999999997e-07, - "loss": 0.0009, - "ppl": 0.021484375, - "reward": 0.8337208330631256, - "reward_std": 9.685001714387909e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.833720862865448, + "advantages": -1.7370497289448394e-06, + "completion_length": 346.0, + "delta_ref_entropy_loss": 0.09423828125, + "delta_ref_ppl": -0.314453125, + "entropy_loss": -0.1796875, + "epoch": 0.3288, + "grad_norm": 5.970104021071078, + "k1_kl": 0.314453125, + "k3_kl": 0.244140625, + "kimi_kl": 0.9453125, + "learning_rate": 3.356e-07, + "loss": 0.0098, + "ppl": 0.06591796875, + "reward": 0.7578105926513672, + "reward_std": 0.029826825484633446, + "rewards/perpo_ocr_edit_distance_reward": 0.7578107118606567, "step": 1644, "temperature": 0.9 }, { - "advantages": -0.0003472481475910172, - "completion_length": 861.0, - "delta_ref_entropy_loss": 0.01953125, - "delta_ref_ppl": -0.02386474609375, - "entropy_loss": -0.014556884765625, - "epoch": 0.658, - "grad_norm": 0.29756426819627246, - "k1_kl": 0.02386474609375, - "k3_kl": 0.016510009765625, - "kimi_kl": 0.062744140625, - "learning_rate": 1.71e-07, - "loss": 0.001, - "ppl": 0.0054931640625, - "reward": 0.9996534883975983, - "reward_std": 7.971377635840327e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9996535181999207, + "advantages": -8.514949513482861e-06, + "completion_length": 1304.0, + "delta_ref_entropy_loss": 0.0257568359375, + "delta_ref_ppl": -0.033935546875, + "entropy_loss": -0.051513671875, + "epoch": 0.329, + "grad_norm": 1.1427934770517436, + "k1_kl": 0.033935546875, + "k3_kl": 0.021484375, + "kimi_kl": 0.05712890625, + "learning_rate": 3.355e-07, + "loss": 0.0009, + "ppl": 0.02490234375, + "reward": 0.9951642155647278, + "reward_std": 0.0038958960212767124, + "rewards/perpo_ocr_edit_distance_reward": 0.9951642751693726, "step": 1645, "temperature": 0.9 }, { - "advantages": -5.702461567125283e-05, - "completion_length": 603.5, - "delta_ref_entropy_loss": 0.03631591796875, - "delta_ref_ppl": -0.03594970703125, - "entropy_loss": -0.03155517578125, - "epoch": 0.6584, - "grad_norm": 0.8178372252394471, - "k1_kl": 0.03594970703125, - "k3_kl": 0.0206298828125, - "kimi_kl": 0.04541015625, - "learning_rate": 1.708e-07, - "loss": 0.0009, - "ppl": 0.015899658203125, - "reward": 0.9942232072353363, - "reward_std": 0.0006494831177406013, - "rewards/perpo_ocr_edit_distance_reward": 0.9942232966423035, + "advantages": -4.163810444879346e-06, + "completion_length": 96.0, + "delta_ref_entropy_loss": 0.05859375, + "delta_ref_ppl": -0.337890625, + "entropy_loss": -0.1865234375, + "epoch": 0.3292, + "grad_norm": 2.0928691433211113, + "k1_kl": 0.33984375, + "k3_kl": 0.267578125, + "kimi_kl": 1.015625, + "learning_rate": 3.3539999999999995e-07, + "loss": 0.0107, + "ppl": 0.0615234375, + "reward": 0.979656457901001, + "reward_std": 0.0060202027671039104, + "rewards/perpo_ocr_edit_distance_reward": 0.979656457901001, "step": 1646, "temperature": 0.9 }, { - "advantages": -3.491129234589607e-07, - "completion_length": 737.5, - "delta_ref_entropy_loss": 0.01715087890625, - "delta_ref_ppl": -0.0343017578125, - "entropy_loss": -0.0648193359375, - "epoch": 0.6588, - "grad_norm": 0.9978152523820255, - "k1_kl": 0.0343017578125, - "k3_kl": 0.027130126953125, - "kimi_kl": 0.07861328125, - "learning_rate": 1.706e-07, - "loss": 0.0011, - "ppl": 0.03797149658203125, - "reward": 0.916382223367691, - "reward_std": 0.0498785525560379, - "rewards/perpo_ocr_edit_distance_reward": 0.9163822531700134, + "advantages": -4.092284871148877e-05, + "completion_length": 990.0, + "delta_ref_entropy_loss": 0.0419921875, + "delta_ref_ppl": -0.057373046875, + "entropy_loss": -0.03857421875, + "epoch": 0.3294, + "grad_norm": 9.750754010876106, + "k1_kl": 0.057373046875, + "k3_kl": 0.03369140625, + "kimi_kl": 0.0947265625, + "learning_rate": 3.353e-07, + "loss": 0.0014, + "ppl": 0.01507568359375, + "reward": 0.997280478477478, + "reward_std": 0.0005240774480625987, + "rewards/perpo_ocr_edit_distance_reward": 0.997280478477478, "step": 1647, "temperature": 0.9 }, { - "advantages": -0.0003744406421901658, - "completion_length": 474.5, - "delta_ref_entropy_loss": 0.06298828125, - "delta_ref_ppl": -0.052215576171875, - "entropy_loss": -0.03265380859375, - "epoch": 0.6592, - "grad_norm": 0.2709051203210259, - "k1_kl": 0.05194091796875, - "k3_kl": 0.030609130859375, - "kimi_kl": 0.06109619140625, - "learning_rate": 1.704e-07, - "loss": 0.0016, - "ppl": 0.01409912109375, - "reward": 0.965267539024353, - "reward_std": 0.0001449864503229037, - "rewards/perpo_ocr_edit_distance_reward": 0.9652675986289978, - "step": 1648, - "temperature": 0.9 - }, - { - "advantages": 0.0, - "completion_length": 388.0, - "delta_ref_entropy_loss": 0.014739990234375, - "delta_ref_ppl": -0.009765625, - "entropy_loss": -0.011688232421875, - "epoch": 0.6596, - "grad_norm": 0.00699280100497551, - "k1_kl": 0.009765625, - "k3_kl": 0.0048828125, - "kimi_kl": 0.01146697998046875, - "learning_rate": 1.7019999999999998e-07, - "loss": 0.0002, - "ppl": 0.00438690185546875, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": -1.549720877846994e-06, + "completion_length": 975.0, + "delta_ref_entropy_loss": 0.06787109375, + "delta_ref_ppl": -0.0595703125, + "entropy_loss": -0.0673828125, + "epoch": 0.3296, + "grad_norm": 3.7847219744662515, + "k1_kl": 0.059814453125, + "k3_kl": 0.0341796875, + "kimi_kl": 0.06787109375, + "learning_rate": 3.352e-07, + "loss": 0.0014, + "ppl": 0.03271484375, + "reward": 0.8194618821144104, + "reward_std": 0.04950014501810074, + "rewards/perpo_ocr_edit_distance_reward": 0.8194620013237, + "step": 1648, + "temperature": 0.9 + }, + { + "advantages": -0.00013215202488936484, + "completion_length": 502.0, + "delta_ref_entropy_loss": 0.061279296875, + "delta_ref_ppl": -0.1044921875, + "entropy_loss": -0.02685546875, + "epoch": 0.3298, + "grad_norm": 0.4349817892452603, + "k1_kl": 0.1044921875, + "k3_kl": 0.076171875, + "kimi_kl": 0.37109375, + "learning_rate": 3.351e-07, + "loss": 0.0032, + "ppl": 0.01055908203125, + "reward": 0.9973887801170349, + "reward_std": 0.0007375433924607933, + "rewards/perpo_ocr_edit_distance_reward": 0.9973888993263245, "step": 1649, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 294.0, - "delta_ref_entropy_loss": 0.0753173828125, - "delta_ref_ppl": -0.121337890625, - "entropy_loss": -0.0364990234375, - "epoch": 0.66, - "grad_norm": 0.03530560603526073, - "k1_kl": 0.1212158203125, - "k3_kl": 0.09130859375, - "kimi_kl": 0.461669921875, - "learning_rate": 1.7000000000000001e-07, - "loss": 0.0037, - "ppl": 0.011688232421875, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": -3.161600761814043e-05, + "completion_length": 875.0, + "delta_ref_entropy_loss": 0.03857421875, + "delta_ref_ppl": -0.06201171875, + "entropy_loss": -0.15234375, + "epoch": 0.33, + "grad_norm": 2.3849989018667825, + "k1_kl": 0.061767578125, + "k3_kl": 0.046630859375, + "kimi_kl": 0.11181640625, + "learning_rate": 3.35e-07, + "loss": 0.0019, + "ppl": 0.08056640625, + "reward": 0.5962468385696411, + "reward_std": 0.0015157220186665654, + "rewards/perpo_ocr_edit_distance_reward": 0.5962468981742859, "step": 1650, "temperature": 0.9 }, { - "advantages": -4.721539653473883e-06, - "completion_length": 542.0, - "delta_ref_entropy_loss": 0.0777587890625, - "delta_ref_ppl": -0.080810546875, - "entropy_loss": -0.0692138671875, - "epoch": 0.6604, - "grad_norm": 0.9434168292027092, - "k1_kl": 0.0809326171875, - "k3_kl": 0.0513916015625, - "kimi_kl": 0.142578125, - "learning_rate": 1.698e-07, - "loss": 0.0021, - "ppl": 0.03448486328125, - "reward": 0.5594534277915955, - "reward_std": 0.0029993432108312845, - "rewards/perpo_ocr_edit_distance_reward": 0.5594534650444984, + "advantages": -2.4829592803143896e-05, + "completion_length": 707.0, + "delta_ref_entropy_loss": 0.06201171875, + "delta_ref_ppl": -0.0986328125, + "entropy_loss": -0.0439453125, + "epoch": 0.3302, + "grad_norm": 0.7025174104625178, + "k1_kl": 0.0986328125, + "k3_kl": 0.06298828125, + "kimi_kl": 0.2041015625, + "learning_rate": 3.3489999999999996e-07, + "loss": 0.0025, + "ppl": 0.01806640625, + "reward": 0.996314287185669, + "reward_std": 0.0005862795514985919, + "rewards/perpo_ocr_edit_distance_reward": 0.9963143467903137, "step": 1651, "temperature": 0.9 }, { - "advantages": -0.00015098282983672107, - "completion_length": 411.5, - "delta_ref_entropy_loss": 0.082275390625, - "delta_ref_ppl": -0.05499267578125, - "entropy_loss": -0.055419921875, - "epoch": 0.6608, - "grad_norm": 1.1171598403056684, - "k1_kl": 0.05499267578125, - "k3_kl": 0.032562255859375, - "kimi_kl": 0.1099853515625, - "learning_rate": 1.6959999999999998e-07, - "loss": 0.0014, - "ppl": 0.02728271484375, - "reward": 0.9925881922245026, - "reward_std": 0.0015626070744474418, - "rewards/perpo_ocr_edit_distance_reward": 0.9925882816314697, + "advantages": -6.965228749322705e-06, + "completion_length": 521.0, + "delta_ref_entropy_loss": 0.046630859375, + "delta_ref_ppl": -0.0654296875, + "entropy_loss": -0.047607421875, + "epoch": 0.3304, + "grad_norm": 0.6046556312742674, + "k1_kl": 0.0654296875, + "k3_kl": 0.046142578125, + "kimi_kl": 0.1845703125, + "learning_rate": 3.3479999999999995e-07, + "loss": 0.0018, + "ppl": 0.02294921875, + "reward": 0.9773519039154053, + "reward_std": 0.004794497508555651, + "rewards/perpo_ocr_edit_distance_reward": 0.97735196352005, "step": 1652, "temperature": 0.9 }, { - "advantages": -1.1307853128528222e-05, - "completion_length": 394.0, - "delta_ref_entropy_loss": 0.033172607421875, - "delta_ref_ppl": -0.02740478515625, - "entropy_loss": -0.015289306640625, - "epoch": 0.6612, - "grad_norm": 0.4160194221236123, - "k1_kl": 0.0274658203125, - "k3_kl": 0.016204833984375, - "kimi_kl": 0.0360107421875, - "learning_rate": 1.694e-07, - "loss": 0.0007, - "ppl": 0.0069427490234375, - "reward": 0.998382568359375, - "reward_std": 0.0008919626707211137, - "rewards/perpo_ocr_edit_distance_reward": 0.9983826279640198, + "advantages": -2.4225032575486694e-06, + "completion_length": 1014.0, + "delta_ref_entropy_loss": 0.119140625, + "delta_ref_ppl": -0.142578125, + "entropy_loss": -0.1689453125, + "epoch": 0.3306, + "grad_norm": 1.917179314452025, + "k1_kl": 0.142578125, + "k3_kl": 0.0849609375, + "kimi_kl": 0.21875, + "learning_rate": 3.347e-07, + "loss": 0.0034, + "ppl": 0.08642578125, + "reward": 0.9221706390380859, + "reward_std": 0.020962711423635483, + "rewards/perpo_ocr_edit_distance_reward": 0.9221707582473755, "step": 1653, "temperature": 0.9 }, { - "advantages": -0.00011620564373515663, - "completion_length": 1075.0, - "delta_ref_entropy_loss": 0.02020263671875, - "delta_ref_ppl": -0.024505615234375, - "entropy_loss": -0.018585205078125, - "epoch": 0.6616, - "grad_norm": 0.7974698750436945, - "k1_kl": 0.024383544921875, - "k3_kl": 0.01824951171875, - "kimi_kl": 0.0613555908203125, - "learning_rate": 1.6919999999999998e-07, - "loss": 0.0008, - "ppl": 0.0091552734375, - "reward": 0.9981629550457001, - "reward_std": 0.0006122328559285961, - "rewards/perpo_ocr_edit_distance_reward": 0.9981630146503448, + "advantages": 8.736338713788427e-06, + "completion_length": 591.0, + "delta_ref_entropy_loss": 0.043212890625, + "delta_ref_ppl": -0.06494140625, + "entropy_loss": -0.061767578125, + "epoch": 0.3308, + "grad_norm": 0.5511595106974926, + "k1_kl": 0.06494140625, + "k3_kl": 0.040283203125, + "kimi_kl": 0.09814453125, + "learning_rate": 3.346e-07, + "loss": 0.0016, + "ppl": 0.0277099609375, + "reward": 0.9705575108528137, + "reward_std": 0.0008736568852327764, + "rewards/perpo_ocr_edit_distance_reward": 0.9705575108528137, "step": 1654, "temperature": 0.9 }, { - "advantages": -8.514949634275126e-09, - "completion_length": 361.5, - "delta_ref_entropy_loss": 0.0384521484375, - "delta_ref_ppl": -0.0245361328125, - "entropy_loss": -0.028076171875, - "epoch": 0.662, - "grad_norm": 0.5879219955821626, - "k1_kl": 0.02459716796875, - "k3_kl": 0.01214599609375, - "kimi_kl": 0.0281982421875, - "learning_rate": 1.69e-07, - "loss": 0.0005, - "ppl": 0.0153350830078125, - "reward": 0.9897459745407104, - "reward_std": 0.0009406788158230484, - "rewards/perpo_ocr_edit_distance_reward": 0.9897459745407104, + "advantages": -1.021793991640152e-07, + "completion_length": 453.0, + "delta_ref_entropy_loss": 0.1416015625, + "delta_ref_ppl": -0.212890625, + "entropy_loss": -0.337890625, + "epoch": 0.331, + "grad_norm": 6.8109895568729115, + "k1_kl": 0.2138671875, + "k3_kl": 0.1494140625, + "kimi_kl": 0.5390625, + "learning_rate": 3.345e-07, + "loss": 0.006, + "ppl": 0.1728515625, + "reward": 0.583069384098053, + "reward_std": 0.16861827671527863, + "rewards/perpo_ocr_edit_distance_reward": 0.5830694437026978, "step": 1655, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 302.0, - "delta_ref_entropy_loss": 0.0228271484375, - "delta_ref_ppl": -0.029296875, - "entropy_loss": -0.01715087890625, - "epoch": 0.6624, - "grad_norm": 0.006719247841364344, - "k1_kl": 0.029296875, - "k3_kl": 0.020355224609375, - "kimi_kl": 0.07745361328125, - "learning_rate": 1.688e-07, - "loss": 0.0008, - "ppl": 0.0073699951171875, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": -2.486365247023059e-06, + "completion_length": 489.0, + "delta_ref_entropy_loss": 0.033203125, + "delta_ref_ppl": -0.08984375, + "entropy_loss": -0.049072265625, + "epoch": 0.3312, + "grad_norm": 1.2431355137509954, + "k1_kl": 0.08984375, + "k3_kl": 0.06787109375, + "kimi_kl": 0.234375, + "learning_rate": 3.344e-07, + "loss": 0.0027, + "ppl": 0.0238037109375, + "reward": 0.9681096076965332, + "reward_std": 0.013574582524597645, + "rewards/perpo_ocr_edit_distance_reward": 0.9681096076965332, "step": 1656, "temperature": 0.9 }, { - "advantages": -1.2606383279489819e-05, - "completion_length": 477.0, - "delta_ref_entropy_loss": 0.03143310546875, - "delta_ref_ppl": -0.04632568359375, - "entropy_loss": -0.03289794921875, - "epoch": 0.6628, - "grad_norm": 0.5345438766486531, - "k1_kl": 0.046356201171875, - "k3_kl": 0.03338623046875, - "kimi_kl": 0.122802734375, - "learning_rate": 1.686e-07, - "loss": 0.0013, - "ppl": 0.01798248291015625, - "reward": 0.989304780960083, - "reward_std": 0.0009627927211113274, - "rewards/perpo_ocr_edit_distance_reward": 0.9893048107624054, + "advantages": 2.5289400582551025e-06, + "completion_length": 672.0, + "delta_ref_entropy_loss": 0.12109375, + "delta_ref_ppl": -0.07373046875, + "entropy_loss": -0.201171875, + "epoch": 0.3314, + "grad_norm": 1.5679578586176206, + "k1_kl": 0.07373046875, + "k3_kl": 0.037109375, + "kimi_kl": 0.0625, + "learning_rate": 3.3429999999999997e-07, + "loss": 0.0015, + "ppl": 0.0927734375, + "reward": 0.780341625213623, + "reward_std": 0.00999276340007782, + "rewards/perpo_ocr_edit_distance_reward": 0.7803415656089783, "step": 1657, "temperature": 0.9 }, { - "advantages": -2.254120136058191e-05, - "completion_length": 444.0, - "delta_ref_entropy_loss": 0.113037109375, - "delta_ref_ppl": -0.083984375, - "entropy_loss": -0.1240234375, - "epoch": 0.6632, - "grad_norm": 1.099192586324865, - "k1_kl": 0.083740234375, - "k3_kl": 0.0462646484375, - "kimi_kl": 0.11474609375, - "learning_rate": 1.684e-07, + "advantages": -7.734980317763984e-05, + "completion_length": 605.0, + "delta_ref_entropy_loss": 0.039794921875, + "delta_ref_ppl": -0.0693359375, + "entropy_loss": -0.0306396484375, + "epoch": 0.3316, + "grad_norm": 0.9257208799420846, + "k1_kl": 0.0693359375, + "k3_kl": 0.0458984375, + "kimi_kl": 0.15234375, + "learning_rate": 3.3419999999999996e-07, "loss": 0.0019, - "ppl": 0.07037353515625, - "reward": 0.923439621925354, - "reward_std": 0.0020234030671417713, - "rewards/perpo_ocr_edit_distance_reward": 0.9234396815299988, + "ppl": 0.0120849609375, + "reward": 0.9962959289550781, + "reward_std": 0.0012208687840029597, + "rewards/perpo_ocr_edit_distance_reward": 0.9962959885597229, "step": 1658, "temperature": 0.9 }, { - "advantages": -4.0871759026117616e-05, - "completion_length": 424.5, - "delta_ref_entropy_loss": 0.090576171875, - "delta_ref_ppl": -0.078857421875, - "entropy_loss": -0.166015625, - "epoch": 0.6636, - "grad_norm": 1.177126516575136, - "k1_kl": 0.0791015625, - "k3_kl": 0.0511474609375, - "kimi_kl": 0.167724609375, - "learning_rate": 1.6819999999999998e-07, + "advantages": -3.462178574409336e-05, + "completion_length": 632.0, + "delta_ref_entropy_loss": 0.06982421875, + "delta_ref_ppl": -0.0810546875, + "entropy_loss": -0.04833984375, + "epoch": 0.3318, + "grad_norm": 0.9644420129851892, + "k1_kl": 0.0810546875, + "k3_kl": 0.0517578125, + "kimi_kl": 0.171875, + "learning_rate": 3.341e-07, "loss": 0.0021, - "ppl": 0.0926513671875, - "reward": 0.7634766399860382, - "reward_std": 0.07965042773867026, - "rewards/perpo_ocr_edit_distance_reward": 0.7634767293930054, + "ppl": 0.019775390625, + "reward": 0.9082331657409668, + "reward_std": 0.0008832346647977829, + "rewards/perpo_ocr_edit_distance_reward": 0.9082331657409668, "step": 1659, "temperature": 0.9 }, { - "advantages": -2.9291427381394897e-06, - "completion_length": 380.5, - "delta_ref_entropy_loss": 0.0574951171875, - "delta_ref_ppl": -0.0677490234375, - "entropy_loss": -0.03173828125, - "epoch": 0.664, - "grad_norm": 0.5716395855837666, - "k1_kl": 0.0679931640625, - "k3_kl": 0.0435791015625, - "kimi_kl": 0.1259765625, - "learning_rate": 1.68e-07, - "loss": 0.0017, - "ppl": 0.01611328125, - "reward": 0.9986113607883453, - "reward_std": 0.0006760053802281618, - "rewards/perpo_ocr_edit_distance_reward": 0.9986113607883453, + "advantages": 4.649162292480469e-06, + "completion_length": 456.0, + "delta_ref_entropy_loss": 0.10302734375, + "delta_ref_ppl": -0.11865234375, + "entropy_loss": -0.06396484375, + "epoch": 0.332, + "grad_norm": 0.7351161498227701, + "k1_kl": 0.1181640625, + "k3_kl": 0.07861328125, + "kimi_kl": 0.265625, + "learning_rate": 3.34e-07, + "loss": 0.0031, + "ppl": 0.030029296875, + "reward": 0.9807308316230774, + "reward_std": 0.0017324612708762288, + "rewards/perpo_ocr_edit_distance_reward": 0.9807308912277222, "step": 1660, "temperature": 0.9 }, { - "advantages": -4.384347579389214e-05, - "completion_length": 614.5, - "delta_ref_entropy_loss": 0.113037109375, - "delta_ref_ppl": -0.0680389404296875, - "entropy_loss": -0.164337158203125, - "epoch": 0.6644, - "grad_norm": 1.875934609509994, - "k1_kl": 0.0680389404296875, - "k3_kl": 0.04129791259765625, - "kimi_kl": 0.078582763671875, - "learning_rate": 1.678e-07, - "loss": 0.0017, - "ppl": 0.096649169921875, - "reward": 0.8307804763317108, - "reward_std": 0.015288256865460426, - "rewards/perpo_ocr_edit_distance_reward": 0.8307805359363556, + "advantages": 0.0, + "completion_length": 114.0, + "delta_ref_entropy_loss": 0.0269775390625, + "delta_ref_ppl": -0.451171875, + "entropy_loss": -0.166015625, + "epoch": 0.3322, + "grad_norm": 2.3074918538258125, + "k1_kl": 0.451171875, + "k3_kl": 0.376953125, + "kimi_kl": 1.7265625, + "learning_rate": 3.3389999999999994e-07, + "loss": 0.0151, + "ppl": 0.07470703125, + "reward": 0.7941579222679138, + "reward_std": 0.05930248275399208, + "rewards/perpo_ocr_edit_distance_reward": 0.7941579222679138, "step": 1661, "temperature": 0.9 }, { - "advantages": -5.609223080682568e-05, - "completion_length": 439.0, - "delta_ref_entropy_loss": 0.03240966796875, - "delta_ref_ppl": -0.02679443359375, - "entropy_loss": -0.02374267578125, - "epoch": 0.6648, - "grad_norm": 0.18403097993117926, - "k1_kl": 0.02679443359375, - "k3_kl": 0.01568603515625, - "kimi_kl": 0.0374755859375, - "learning_rate": 1.676e-07, - "loss": 0.0007, - "ppl": 0.009185791015625, - "reward": 0.9999184906482697, - "reward_std": 0.00021564876078628004, - "rewards/perpo_ocr_edit_distance_reward": 0.999918520450592, + "advantages": 0.0, + "completion_length": 329.0, + "delta_ref_entropy_loss": 0.050537109375, + "delta_ref_ppl": -0.12890625, + "entropy_loss": -0.0361328125, + "epoch": 0.3324, + "grad_norm": 1.0952943421874954, + "k1_kl": 0.12890625, + "k3_kl": 0.10009765625, + "kimi_kl": 0.427734375, + "learning_rate": 3.338e-07, + "loss": 0.004, + "ppl": 0.01483154296875, + "reward": 0.9817940592765808, + "reward_std": 0.0015785590512678027, + "rewards/perpo_ocr_edit_distance_reward": 0.9817941188812256, "step": 1662, "temperature": 0.9 }, { - "advantages": -2.3646015506528784e-05, - "completion_length": 597.0, - "delta_ref_entropy_loss": 0.04931640625, - "delta_ref_ppl": -0.040283203125, - "entropy_loss": -0.0400390625, - "epoch": 0.6652, - "grad_norm": 0.5827659647265485, - "k1_kl": 0.0401611328125, - "k3_kl": 0.02325439453125, - "kimi_kl": 0.0557861328125, - "learning_rate": 1.6739999999999998e-07, - "loss": 0.001, - "ppl": 0.01885986328125, - "reward": 0.9727357029914856, - "reward_std": 0.0017298281018156558, - "rewards/perpo_ocr_edit_distance_reward": 0.9727357625961304, + "advantages": -1.1665481451927917e-06, + "completion_length": 102.0, + "delta_ref_entropy_loss": 0.0296630859375, + "delta_ref_ppl": -0.515625, + "entropy_loss": -0.1318359375, + "epoch": 0.3326, + "grad_norm": 3.7810345031399346, + "k1_kl": 0.515625, + "k3_kl": 0.451171875, + "kimi_kl": 2.40625, + "learning_rate": 3.337e-07, + "loss": 0.018, + "ppl": 0.05810546875, + "reward": 0.9531087875366211, + "reward_std": 0.021367112174630165, + "rewards/perpo_ocr_edit_distance_reward": 0.9531088471412659, "step": 1663, "temperature": 0.9 }, { - "advantages": -9.499277507529769e-05, - "completion_length": 577.0, - "delta_ref_entropy_loss": 0.03436279296875, - "delta_ref_ppl": -0.023284912109375, - "entropy_loss": -0.0394287109375, - "epoch": 0.6656, - "grad_norm": 0.4785016678979643, - "k1_kl": 0.023284912109375, - "k3_kl": 0.01290130615234375, - "kimi_kl": 0.03326416015625, - "learning_rate": 1.672e-07, - "loss": 0.0006, - "ppl": 0.019134521484375, - "reward": 0.9981539845466614, - "reward_std": 0.000735117013391573, - "rewards/perpo_ocr_edit_distance_reward": 0.9981540441513062, + "advantages": -3.4187523851869628e-06, + "completion_length": 209.0, + "delta_ref_entropy_loss": 0.0537109375, + "delta_ref_ppl": -0.181640625, + "entropy_loss": -0.05224609375, + "epoch": 0.3328, + "grad_norm": 1.2255383527375971, + "k1_kl": 0.181640625, + "k3_kl": 0.1396484375, + "kimi_kl": 0.70703125, + "learning_rate": 3.3359999999999997e-07, + "loss": 0.0056, + "ppl": 0.025146484375, + "reward": 0.9790172576904297, + "reward_std": 0.0023889942094683647, + "rewards/perpo_ocr_edit_distance_reward": 0.9790173172950745, "step": 1664, "temperature": 0.9 }, { - "advantages": -0.00010954056779155508, - "completion_length": 603.0, - "delta_ref_entropy_loss": 0.057861328125, - "delta_ref_ppl": -0.060791015625, - "entropy_loss": -0.038177490234375, - "epoch": 0.666, - "grad_norm": 0.4570497680596116, - "k1_kl": 0.0607757568359375, - "k3_kl": 0.03914642333984375, - "kimi_kl": 0.088775634765625, - "learning_rate": 1.67e-07, - "loss": 0.0017, - "ppl": 0.024993896484375, - "reward": 0.9994674026966095, - "reward_std": 0.0001443465007469058, - "rewards/perpo_ocr_edit_distance_reward": 0.9994674623012543, + "advantages": -4.51292316938634e-06, + "completion_length": 113.0, + "delta_ref_entropy_loss": 0.1259765625, + "delta_ref_ppl": -0.466796875, + "entropy_loss": -0.1748046875, + "epoch": 0.333, + "grad_norm": 3.7396535643279667, + "k1_kl": 0.46875, + "k3_kl": 0.376953125, + "kimi_kl": 1.640625, + "learning_rate": 3.335e-07, + "loss": 0.0151, + "ppl": 0.08740234375, + "reward": 0.9718956351280212, + "reward_std": 0.009390339255332947, + "rewards/perpo_ocr_edit_distance_reward": 0.9718957543373108, "step": 1665, "temperature": 0.9 }, { - "advantages": 2.207500619988423e-05, - "completion_length": 924.0, - "delta_ref_entropy_loss": 0.048309326171875, - "delta_ref_ppl": -0.0217132568359375, - "entropy_loss": -0.0625, - "epoch": 0.6664, - "grad_norm": 1.092957203865379, - "k1_kl": 0.0217132568359375, - "k3_kl": 0.0208587646484375, - "kimi_kl": 0.022003173828125, - "learning_rate": 1.6679999999999998e-07, - "loss": 0.0008, - "ppl": 0.039306640625, - "reward": 0.9716993868350983, - "reward_std": 0.00033560182782821357, - "rewards/perpo_ocr_edit_distance_reward": 0.9716993868350983, + "advantages": -6.846019459771924e-06, + "completion_length": 259.0, + "delta_ref_entropy_loss": 0.043212890625, + "delta_ref_ppl": -0.1357421875, + "entropy_loss": -0.076171875, + "epoch": 0.3332, + "grad_norm": 2.4393634010402656, + "k1_kl": 0.1357421875, + "k3_kl": 0.10986328125, + "kimi_kl": 0.416015625, + "learning_rate": 3.3339999999999995e-07, + "loss": 0.0044, + "ppl": 0.035400390625, + "reward": 0.9919869303703308, + "reward_std": 0.0036287426482886076, + "rewards/perpo_ocr_edit_distance_reward": 0.9919869899749756, "step": 1666, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 478.0, - "delta_ref_entropy_loss": 0.04010009765625, - "delta_ref_ppl": -0.02587890625, - "entropy_loss": -0.02691650390625, - "epoch": 0.6668, - "grad_norm": 0.0165826716766128, - "k1_kl": 0.02587890625, - "k3_kl": 0.01177978515625, - "kimi_kl": 0.0231170654296875, - "learning_rate": 1.666e-07, - "loss": 0.0005, - "ppl": 0.012725830078125, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": 1.2091228427379974e-06, + "completion_length": 824.0, + "delta_ref_entropy_loss": 0.0751953125, + "delta_ref_ppl": -0.0712890625, + "entropy_loss": -0.1162109375, + "epoch": 0.3334, + "grad_norm": 3.2705253832199763, + "k1_kl": 0.0712890625, + "k3_kl": 0.037841796875, + "kimi_kl": 0.103515625, + "learning_rate": 3.3329999999999995e-07, + "loss": 0.0015, + "ppl": 0.057373046875, + "reward": 0.9579750299453735, + "reward_std": 0.007009937893599272, + "rewards/perpo_ocr_edit_distance_reward": 0.9579750299453735, "step": 1667, "temperature": 0.9 }, { - "advantages": -6.313409539870918e-05, - "completion_length": 909.5, - "delta_ref_entropy_loss": 0.0181884765625, - "delta_ref_ppl": -0.009918212890625, - "entropy_loss": -0.01654052734375, - "epoch": 0.6672, - "grad_norm": 0.2962835255228231, - "k1_kl": 0.0099334716796875, - "k3_kl": 0.006072998046875, - "kimi_kl": 0.0121612548828125, - "learning_rate": 1.6639999999999998e-07, - "loss": 0.0003, - "ppl": 0.00717926025390625, - "reward": 0.9969315528869629, - "reward_std": 0.00021976942662149668, - "rewards/perpo_ocr_edit_distance_reward": 0.9969315826892853, + "advantages": -4.625320798368193e-05, + "completion_length": 128.0, + "delta_ref_entropy_loss": 0.0888671875, + "delta_ref_ppl": -0.2314453125, + "entropy_loss": -0.0859375, + "epoch": 0.3336, + "grad_norm": 1.1892388242724323, + "k1_kl": 0.2314453125, + "k3_kl": 0.17578125, + "kimi_kl": 0.65234375, + "learning_rate": 3.332e-07, + "loss": 0.0071, + "ppl": 0.0301513671875, + "reward": 0.9864082932472229, + "reward_std": 0.0019254754297435284, + "rewards/perpo_ocr_edit_distance_reward": 0.9864084124565125, "step": 1668, "temperature": 0.9 }, { - "advantages": -7.097210527717834e-06, - "completion_length": 1012.0, - "delta_ref_entropy_loss": 0.0604248046875, - "delta_ref_ppl": -0.052978515625, - "entropy_loss": -0.117919921875, - "epoch": 0.6676, - "grad_norm": 1.3056378309062209, - "k1_kl": 0.05303955078125, - "k3_kl": 0.03717041015625, - "kimi_kl": 0.130126953125, - "learning_rate": 1.6619999999999997e-07, - "loss": 0.0015, - "ppl": 0.066650390625, - "reward": 0.9466970562934875, - "reward_std": 0.0023805865785107017, - "rewards/perpo_ocr_edit_distance_reward": 0.9466971158981323, + "advantages": 1.6246523955487646e-05, + "completion_length": 422.0, + "delta_ref_entropy_loss": 0.060302734375, + "delta_ref_ppl": -0.09033203125, + "entropy_loss": -0.044921875, + "epoch": 0.3338, + "grad_norm": 0.5988691106302677, + "k1_kl": 0.08984375, + "k3_kl": 0.05615234375, + "kimi_kl": 0.1728515625, + "learning_rate": 3.331e-07, + "loss": 0.0022, + "ppl": 0.0174560546875, + "reward": 0.9922474026679993, + "reward_std": 0.0009480859152972698, + "rewards/perpo_ocr_edit_distance_reward": 0.992247462272644, "step": 1669, "temperature": 0.9 }, { - "advantages": -4.257687760400586e-05, - "completion_length": 740.5, - "delta_ref_entropy_loss": 0.0147705078125, - "delta_ref_ppl": -0.014007568359375, - "entropy_loss": -0.02288818359375, - "epoch": 0.668, - "grad_norm": 0.4269610731678324, - "k1_kl": 0.0140380859375, - "k3_kl": 0.0107421875, - "kimi_kl": 0.02166748046875, - "learning_rate": 1.66e-07, - "loss": 0.0005, - "ppl": 0.012542724609375, - "reward": 0.9987029433250427, - "reward_std": 0.000836659804917872, - "rewards/perpo_ocr_edit_distance_reward": 0.9987030327320099, + "advantages": 0.0, + "completion_length": 250.0, + "delta_ref_entropy_loss": 0.06884765625, + "delta_ref_ppl": -0.1025390625, + "entropy_loss": -0.1220703125, + "epoch": 0.334, + "grad_norm": 1.9666261706766452, + "k1_kl": 0.1025390625, + "k3_kl": 0.06689453125, + "kimi_kl": 0.1455078125, + "learning_rate": 3.33e-07, + "loss": 0.0027, + "ppl": 0.05419921875, + "reward": 0.9794105291366577, + "reward_std": 0.0011993983061984181, + "rewards/perpo_ocr_edit_distance_reward": 0.9794105291366577, "step": 1670, "temperature": 0.9 }, { - "advantages": -4.5299530029296875e-06, - "completion_length": 400.0, - "delta_ref_entropy_loss": 0.09423828125, - "delta_ref_ppl": -0.072509765625, - "entropy_loss": -0.126220703125, - "epoch": 0.6684, - "grad_norm": 1.3686428345800212, - "k1_kl": 0.072509765625, - "k3_kl": 0.0445556640625, - "kimi_kl": 0.14013671875, - "learning_rate": 1.658e-07, - "loss": 0.0018, - "ppl": 0.07684326171875, - "reward": 0.9382783770561218, - "reward_std": 0.00418460089713335, - "rewards/perpo_ocr_edit_distance_reward": 0.9382784366607666, + "advantages": -1.2431826689862646e-05, + "completion_length": 754.0, + "delta_ref_entropy_loss": 0.053955078125, + "delta_ref_ppl": -0.0693359375, + "entropy_loss": -0.09814453125, + "epoch": 0.3342, + "grad_norm": 1.677516523120735, + "k1_kl": 0.06884765625, + "k3_kl": 0.0537109375, + "kimi_kl": 0.130859375, + "learning_rate": 3.3289999999999997e-07, + "loss": 0.0022, + "ppl": 0.054443359375, + "reward": 0.9856741428375244, + "reward_std": 0.004008937627077103, + "rewards/perpo_ocr_edit_distance_reward": 0.9856742024421692, "step": 1671, "temperature": 0.9 }, { - "advantages": -1.1333398106216919e-05, - "completion_length": 528.0, - "delta_ref_entropy_loss": 0.05859375, - "delta_ref_ppl": -0.06201171875, - "entropy_loss": -0.04974365234375, - "epoch": 0.6688, - "grad_norm": 0.8112447517073589, - "k1_kl": 0.06201171875, - "k3_kl": 0.04205322265625, - "kimi_kl": 0.138671875, - "learning_rate": 1.656e-07, - "loss": 0.0017, - "ppl": 0.0245361328125, - "reward": 0.9859281480312347, - "reward_std": 0.0008878900553099811, - "rewards/perpo_ocr_edit_distance_reward": 0.9859281778335571, + "advantages": -8.514949456639442e-08, + "completion_length": 239.0, + "delta_ref_entropy_loss": 0.02880859375, + "delta_ref_ppl": -0.24609375, + "entropy_loss": -0.10888671875, + "epoch": 0.3344, + "grad_norm": 5.581135972492534, + "k1_kl": 0.24609375, + "k3_kl": 0.212890625, + "kimi_kl": 0.96875, + "learning_rate": 3.3279999999999996e-07, + "loss": 0.0085, + "ppl": 0.042724609375, + "reward": 0.24325866997241974, + "reward_std": 0.09340117871761322, + "rewards/perpo_ocr_edit_distance_reward": 0.24325868487358093, "step": 1672, "temperature": 0.9 }, { - "advantages": -6.576095756827272e-05, - "completion_length": 487.5, - "delta_ref_entropy_loss": 0.05126953125, - "delta_ref_ppl": -0.0653076171875, - "entropy_loss": -0.07232666015625, - "epoch": 0.6692, - "grad_norm": 4.512348725927746, - "k1_kl": 0.0655517578125, - "k3_kl": 0.0435791015625, - "kimi_kl": 0.19287109375, - "learning_rate": 1.6539999999999999e-07, - "loss": 0.0018, - "ppl": 0.046478271484375, - "reward": 0.9680590927600861, - "reward_std": 0.0022302252909867093, - "rewards/perpo_ocr_edit_distance_reward": 0.9680591821670532, + "advantages": -1.3538769962906372e-05, + "completion_length": 360.0, + "delta_ref_entropy_loss": 0.0281982421875, + "delta_ref_ppl": -0.072265625, + "entropy_loss": -0.043212890625, + "epoch": 0.3346, + "grad_norm": 0.666500849159395, + "k1_kl": 0.072265625, + "k3_kl": 0.0498046875, + "kimi_kl": 0.1748046875, + "learning_rate": 3.3269999999999995e-07, + "loss": 0.002, + "ppl": 0.017333984375, + "reward": 0.9780464768409729, + "reward_std": 0.0017856985796242952, + "rewards/perpo_ocr_edit_distance_reward": 0.9780464768409729, "step": 1673, "temperature": 0.9 }, { - "advantages": -1.1927315313187137e-05, - "completion_length": 551.0, - "delta_ref_entropy_loss": 0.072509765625, - "delta_ref_ppl": -0.057373046875, - "entropy_loss": -0.15753173828125, - "epoch": 0.6696, - "grad_norm": 1.4662752245482844, - "k1_kl": 0.05731201171875, - "k3_kl": 0.03314208984375, - "kimi_kl": 0.0863037109375, - "learning_rate": 1.652e-07, - "loss": 0.0013, - "ppl": 0.09178924560546875, - "reward": 0.8302672207355499, - "reward_std": 0.010024701114161871, - "rewards/perpo_ocr_edit_distance_reward": 0.8302672505378723, + "advantages": -9.34941454033833e-06, + "completion_length": 456.0, + "delta_ref_entropy_loss": 0.10107421875, + "delta_ref_ppl": -0.0888671875, + "entropy_loss": -0.2490234375, + "epoch": 0.3348, + "grad_norm": 2.146473599626046, + "k1_kl": 0.0888671875, + "k3_kl": 0.050048828125, + "kimi_kl": 0.09619140625, + "learning_rate": 3.326e-07, + "loss": 0.002, + "ppl": 0.1259765625, + "reward": 0.8401456475257874, + "reward_std": 0.00719590624794364, + "rewards/perpo_ocr_edit_distance_reward": 0.8401457667350769, "step": 1674, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 507.0, - "delta_ref_entropy_loss": 0.0264892578125, - "delta_ref_ppl": -0.02838134765625, - "entropy_loss": -0.011444091796875, - "epoch": 0.67, - "grad_norm": 0.007497336907666106, - "k1_kl": 0.02850341796875, - "k3_kl": 0.018707275390625, - "kimi_kl": 0.06854248046875, - "learning_rate": 1.65e-07, - "loss": 0.0008, - "ppl": 0.00379180908203125, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": -5.568776941800024e-06, + "completion_length": 474.0, + "delta_ref_entropy_loss": 0.10205078125, + "delta_ref_ppl": -0.12890625, + "entropy_loss": -0.345703125, + "epoch": 0.335, + "grad_norm": 2.328558745509083, + "k1_kl": 0.1298828125, + "k3_kl": 0.091796875, + "kimi_kl": 0.267578125, + "learning_rate": 3.325e-07, + "loss": 0.0037, + "ppl": 0.19140625, + "reward": 0.8115102052688599, + "reward_std": 0.010605750605463982, + "rewards/perpo_ocr_edit_distance_reward": 0.8115102648735046, "step": 1675, "temperature": 0.9 }, { - "advantages": -7.646424637641758e-05, - "completion_length": 530.0, - "delta_ref_entropy_loss": 0.0523681640625, - "delta_ref_ppl": -0.605712890625, - "entropy_loss": -0.3037109375, - "epoch": 0.6704, - "grad_norm": 1.3532716489484438, - "k1_kl": 0.605712890625, - "k3_kl": 0.4528045654296875, - "kimi_kl": 1.294891357421875, - "learning_rate": 1.648e-07, - "loss": 0.0182, - "ppl": 0.15087890625, - "reward": 0.53861965239048, - "reward_std": 0.000492885010316968, - "rewards/perpo_ocr_edit_distance_reward": 0.5386197157204151, + "advantages": -4.466091195354238e-05, + "completion_length": 890.0, + "delta_ref_entropy_loss": 0.05810546875, + "delta_ref_ppl": -0.076171875, + "entropy_loss": -0.08837890625, + "epoch": 0.3352, + "grad_norm": 1.3526652969794002, + "k1_kl": 0.07568359375, + "k3_kl": 0.043701171875, + "kimi_kl": 0.10302734375, + "learning_rate": 3.3239999999999993e-07, + "loss": 0.0018, + "ppl": 0.040283203125, + "reward": 0.9872736930847168, + "reward_std": 0.0006623945664614439, + "rewards/perpo_ocr_edit_distance_reward": 0.9872738122940063, "step": 1676, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 53.0, - "delta_ref_entropy_loss": 0.089599609375, - "delta_ref_ppl": -0.13720703125, - "entropy_loss": -0.06787109375, - "epoch": 0.6708, - "grad_norm": 0.15653059716232406, - "k1_kl": 0.1373291015625, - "k3_kl": 0.08953857421875, - "kimi_kl": 0.26190185546875, - "learning_rate": 1.6459999999999998e-07, - "loss": 0.0039, - "ppl": 0.0350341796875, - "reward": 0.953125, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9531250298023224, + "advantages": -0.0001812321861507371, + "completion_length": 755.0, + "delta_ref_entropy_loss": 0.04052734375, + "delta_ref_ppl": -0.034423828125, + "entropy_loss": -0.018798828125, + "epoch": 0.3354, + "grad_norm": 0.47297196038809863, + "k1_kl": 0.034423828125, + "k3_kl": 0.018798828125, + "kimi_kl": 0.0439453125, + "learning_rate": 3.323e-07, + "loss": 0.0009, + "ppl": 0.00555419921875, + "reward": 0.9960861206054688, + "reward_std": 0.00036985776387155056, + "rewards/perpo_ocr_edit_distance_reward": 0.9960861802101135, "step": 1677, "temperature": 0.9 }, { - "advantages": -9.467559826248362e-05, - "completion_length": 1377.5, - "delta_ref_entropy_loss": 0.022216796875, - "delta_ref_ppl": -0.017242431640625, - "entropy_loss": -0.18304443359375, - "epoch": 0.6712, - "grad_norm": 5.312992266773765, - "k1_kl": 0.017364501953125, - "k3_kl": 0.04730224609375, - "kimi_kl": 0.029449462890625, - "learning_rate": 1.644e-07, - "loss": 0.002, - "ppl": 0.119049072265625, - "reward": 0.7622545957565308, - "reward_std": 0.01954048891639104, - "rewards/perpo_ocr_edit_distance_reward": 0.7622545957565308, + "advantages": -5.577292085945373e-06, + "completion_length": 699.0, + "delta_ref_entropy_loss": 0.09130859375, + "delta_ref_ppl": -0.09033203125, + "entropy_loss": -0.1103515625, + "epoch": 0.3356, + "grad_norm": 2.0135305154103866, + "k1_kl": 0.08984375, + "k3_kl": 0.054443359375, + "kimi_kl": 0.15234375, + "learning_rate": 3.3219999999999997e-07, + "loss": 0.0022, + "ppl": 0.05419921875, + "reward": 0.9481774568557739, + "reward_std": 0.0029525950085371733, + "rewards/perpo_ocr_edit_distance_reward": 0.9481775164604187, "step": 1678, "temperature": 0.9 }, { - "advantages": -1.5935728356453183e-05, - "completion_length": 508.0, - "delta_ref_entropy_loss": 0.07470703125, - "delta_ref_ppl": -0.0347900390625, - "entropy_loss": -0.0654296875, - "epoch": 0.6716, - "grad_norm": 1.2645805971060795, - "k1_kl": 0.0347900390625, - "k3_kl": 0.01690673828125, - "kimi_kl": 0.03228759765625, - "learning_rate": 1.642e-07, - "loss": 0.0007, - "ppl": 0.0330810546875, - "reward": 0.6387442052364349, - "reward_std": 0.003882404649630189, - "rewards/perpo_ocr_edit_distance_reward": 0.6387442946434021, + "advantages": -3.0824117857264355e-05, + "completion_length": 465.0, + "delta_ref_entropy_loss": 0.08154296875, + "delta_ref_ppl": -0.09326171875, + "entropy_loss": -0.0439453125, + "epoch": 0.3358, + "grad_norm": 0.7598143477254808, + "k1_kl": 0.09326171875, + "k3_kl": 0.060302734375, + "kimi_kl": 0.1943359375, + "learning_rate": 3.321e-07, + "loss": 0.0024, + "ppl": 0.01708984375, + "reward": 0.9946846961975098, + "reward_std": 0.002939257537946105, + "rewards/perpo_ocr_edit_distance_reward": 0.9946848154067993, "step": 1679, "temperature": 0.9 }, { - "advantages": -6.667205752819427e-06, - "completion_length": 941.0, - "delta_ref_entropy_loss": 0.0313720703125, - "delta_ref_ppl": -0.03173828125, - "entropy_loss": -0.03094482421875, - "epoch": 0.672, - "grad_norm": 0.6921832368824329, - "k1_kl": 0.03173828125, - "k3_kl": 0.01873779296875, - "kimi_kl": 0.0523681640625, - "learning_rate": 1.64e-07, - "loss": 0.0008, - "ppl": 0.014678955078125, - "reward": 0.9519019424915314, - "reward_std": 0.01273683225736022, - "rewards/perpo_ocr_edit_distance_reward": 0.9519020318984985, + "advantages": -1.0317990017938428e-05, + "completion_length": 1159.0, + "delta_ref_entropy_loss": 0.08544921875, + "delta_ref_ppl": -0.08544921875, + "entropy_loss": -0.1083984375, + "epoch": 0.336, + "grad_norm": 6.959045633734197, + "k1_kl": 0.08544921875, + "k3_kl": 0.05712890625, + "kimi_kl": 0.12109375, + "learning_rate": 3.32e-07, + "loss": 0.0023, + "ppl": 0.06201171875, + "reward": 0.9892359972000122, + "reward_std": 0.0015520843444392085, + "rewards/perpo_ocr_edit_distance_reward": 0.9892359972000122, "step": 1680, "temperature": 0.9 }, { - "advantages": 1.0950225259875879e-05, - "completion_length": 763.5, - "delta_ref_entropy_loss": 0.03460693359375, - "delta_ref_ppl": -0.02166748046875, - "entropy_loss": -0.0936279296875, - "epoch": 0.6724, - "grad_norm": 1.776515330555561, - "k1_kl": 0.021636962890625, - "k3_kl": 0.0097503662109375, - "kimi_kl": 0.0147705078125, - "learning_rate": 1.638e-07, - "loss": 0.0004, - "ppl": 0.0538330078125, - "reward": 0.7902864813804626, - "reward_std": 0.0005331914289854467, - "rewards/perpo_ocr_edit_distance_reward": 0.7902864813804626, + "advantages": -4.725797043647617e-05, + "completion_length": 1334.0, + "delta_ref_entropy_loss": 0.0274658203125, + "delta_ref_ppl": -0.040771484375, + "entropy_loss": -0.041015625, + "epoch": 0.3362, + "grad_norm": 0.4094676642462872, + "k1_kl": 0.041015625, + "k3_kl": 0.0283203125, + "kimi_kl": 0.06787109375, + "learning_rate": 3.3189999999999995e-07, + "loss": 0.0012, + "ppl": 0.0191650390625, + "reward": 0.9950374364852905, + "reward_std": 0.0009806904708966613, + "rewards/perpo_ocr_edit_distance_reward": 0.9950375556945801, "step": 1681, "temperature": 0.9 }, { - "advantages": 1.047026671585627e-06, - "completion_length": 354.0, - "delta_ref_entropy_loss": 0.0509033203125, - "delta_ref_ppl": -0.04345703125, - "entropy_loss": -0.02703857421875, - "epoch": 0.6728, - "grad_norm": 1.122254326185491, - "k1_kl": 0.04327392578125, - "k3_kl": 0.0264892578125, - "kimi_kl": 0.0474853515625, - "learning_rate": 1.6359999999999998e-07, - "loss": 0.0011, - "ppl": 0.015472412109375, - "reward": 0.9996034801006317, - "reward_std": 0.0006865741743240505, - "rewards/perpo_ocr_edit_distance_reward": 0.9996035695075989, + "advantages": -9.6670220955275e-05, + "completion_length": 369.0, + "delta_ref_entropy_loss": 0.033203125, + "delta_ref_ppl": -0.0693359375, + "entropy_loss": -0.0361328125, + "epoch": 0.3364, + "grad_norm": 0.8474584806545943, + "k1_kl": 0.0693359375, + "k3_kl": 0.048583984375, + "kimi_kl": 0.1533203125, + "learning_rate": 3.318e-07, + "loss": 0.002, + "ppl": 0.0157470703125, + "reward": 0.9778603315353394, + "reward_std": 0.0006046704947948456, + "rewards/perpo_ocr_edit_distance_reward": 0.9778604507446289, "step": 1682, "temperature": 0.9 }, { - "advantages": -0.00010364396575823775, - "completion_length": 1264.0, - "delta_ref_entropy_loss": 0.022613525390625, - "delta_ref_ppl": -0.02020263671875, - "entropy_loss": -0.038818359375, - "epoch": 0.6732, - "grad_norm": 1.1289527012527951, - "k1_kl": 0.020172119140625, - "k3_kl": 0.0147552490234375, - "kimi_kl": 0.04620361328125, - "learning_rate": 1.634e-07, - "loss": 0.0007, - "ppl": 0.0218505859375, - "reward": 0.9321631193161011, - "reward_std": 0.003565754450391978, - "rewards/perpo_ocr_edit_distance_reward": 0.9321631789207458, + "advantages": -5.79016568735824e-06, + "completion_length": 56.0, + "delta_ref_entropy_loss": 0.0303955078125, + "delta_ref_ppl": -0.349609375, + "entropy_loss": -0.10400390625, + "epoch": 0.3366, + "grad_norm": 3.6580671476297932, + "k1_kl": 0.349609375, + "k3_kl": 0.28515625, + "kimi_kl": 1.1796875, + "learning_rate": 3.317e-07, + "loss": 0.0114, + "ppl": 0.033203125, + "reward": 0.979345977306366, + "reward_std": 0.00429332721978426, + "rewards/perpo_ocr_edit_distance_reward": 0.979345977306366, "step": 1683, "temperature": 0.9 }, { - "advantages": -7.797139369358774e-05, - "completion_length": 800.0, - "delta_ref_entropy_loss": 0.041656494140625, - "delta_ref_ppl": -0.02337646484375, - "entropy_loss": -0.02569580078125, - "epoch": 0.6736, - "grad_norm": 3.31713244654554, - "k1_kl": 0.02325439453125, - "k3_kl": 0.0208740234375, - "kimi_kl": 0.033843994140625, - "learning_rate": 1.632e-07, - "loss": 0.0009, - "ppl": 0.01055908203125, - "reward": 0.9898772835731506, - "reward_std": 0.00044351929682306945, - "rewards/perpo_ocr_edit_distance_reward": 0.9898773729801178, + "advantages": -5.960464841336943e-05, + "completion_length": 1577.0, + "delta_ref_entropy_loss": 0.048828125, + "delta_ref_ppl": -0.05322265625, + "entropy_loss": -0.06298828125, + "epoch": 0.3368, + "grad_norm": 2.7098655920317, + "k1_kl": 0.052978515625, + "k3_kl": 0.031982421875, + "kimi_kl": 0.07421875, + "learning_rate": 3.316e-07, + "loss": 0.0013, + "ppl": 0.028564453125, + "reward": 0.9750440716743469, + "reward_std": 0.0009001230355352163, + "rewards/perpo_ocr_edit_distance_reward": 0.9750441312789917, "step": 1684, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 294.5, - "delta_ref_entropy_loss": 0.0404052734375, - "delta_ref_ppl": -0.070068359375, - "entropy_loss": -0.02325439453125, - "epoch": 0.674, - "grad_norm": 0.026244702169459413, - "k1_kl": 0.0703125, - "k3_kl": 0.04827880859375, - "kimi_kl": 0.156494140625, - "learning_rate": 1.63e-07, - "loss": 0.0019, - "ppl": 0.01177978515625, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": -1.7029898913278885e-07, + "completion_length": 329.0, + "delta_ref_entropy_loss": 0.0810546875, + "delta_ref_ppl": -0.12451171875, + "entropy_loss": -0.14453125, + "epoch": 0.337, + "grad_norm": 2.070131066195314, + "k1_kl": 0.1240234375, + "k3_kl": 0.08056640625, + "kimi_kl": 0.248046875, + "learning_rate": 3.315e-07, + "loss": 0.0032, + "ppl": 0.07275390625, + "reward": 0.9680750966072083, + "reward_std": 0.05169430002570152, + "rewards/perpo_ocr_edit_distance_reward": 0.968075156211853, "step": 1685, "temperature": 0.9 }, { - "advantages": -1.8562590184956207e-06, - "completion_length": 406.0, - "delta_ref_entropy_loss": 0.0631103515625, - "delta_ref_ppl": -0.08837890625, - "entropy_loss": -0.043212890625, - "epoch": 0.6744, - "grad_norm": 0.6451443006703182, - "k1_kl": 0.08837890625, - "k3_kl": 0.05865478515625, - "kimi_kl": 0.2154541015625, - "learning_rate": 1.628e-07, - "loss": 0.0023, - "ppl": 0.0211181640625, - "reward": 0.9414779543876648, - "reward_std": 0.0010890621924772859, - "rewards/perpo_ocr_edit_distance_reward": 0.9414779543876648, + "advantages": -3.7874495319556445e-05, + "completion_length": 280.0, + "delta_ref_entropy_loss": 0.05029296875, + "delta_ref_ppl": -0.0986328125, + "entropy_loss": -0.06396484375, + "epoch": 0.3372, + "grad_norm": 0.9139569882632966, + "k1_kl": 0.0986328125, + "k3_kl": 0.06982421875, + "kimi_kl": 0.205078125, + "learning_rate": 3.3139999999999996e-07, + "loss": 0.0028, + "ppl": 0.0255126953125, + "reward": 0.9595823884010315, + "reward_std": 0.0014727930538356304, + "rewards/perpo_ocr_edit_distance_reward": 0.9595824480056763, "step": 1686, "temperature": 0.9 }, { - "advantages": -2.0435878411717567e-07, - "completion_length": 469.5, - "delta_ref_entropy_loss": -0.03387451171875, - "delta_ref_ppl": -0.0531005859375, - "entropy_loss": -0.3759765625, - "epoch": 0.6748, - "grad_norm": 1.9369743026050108, - "k1_kl": 0.0533447265625, - "k3_kl": 0.0517578125, - "kimi_kl": 0.14501953125, - "learning_rate": 1.626e-07, - "loss": 0.0021, - "ppl": 0.189208984375, - "reward": 0.21382996439933777, - "reward_std": 0.10634474828839302, - "rewards/perpo_ocr_edit_distance_reward": 0.21382997930049896, + "advantages": -8.130925561999902e-05, + "completion_length": 524.0, + "delta_ref_entropy_loss": 0.039306640625, + "delta_ref_ppl": -0.07568359375, + "entropy_loss": -0.0322265625, + "epoch": 0.3374, + "grad_norm": 0.391911082280555, + "k1_kl": 0.076171875, + "k3_kl": 0.04833984375, + "kimi_kl": 0.1435546875, + "learning_rate": 3.3129999999999996e-07, + "loss": 0.002, + "ppl": 0.00946044921875, + "reward": 0.9984477758407593, + "reward_std": 0.0006332531338557601, + "rewards/perpo_ocr_edit_distance_reward": 0.9984478950500488, "step": 1687, "temperature": 0.9 }, { - "advantages": -3.499005651974585e-05, - "completion_length": 891.0, - "delta_ref_entropy_loss": 0.0762939453125, - "delta_ref_ppl": -0.053466796875, - "entropy_loss": -0.0733642578125, - "epoch": 0.6752, - "grad_norm": 1.7520779085102984, - "k1_kl": 0.053466796875, - "k3_kl": 0.031829833984375, - "kimi_kl": 0.0909423828125, - "learning_rate": 1.6239999999999997e-07, - "loss": 0.0013, - "ppl": 0.038116455078125, - "reward": 0.9570463597774506, - "reward_std": 0.0028195553459227085, - "rewards/perpo_ocr_edit_distance_reward": 0.9570464193820953, + "advantages": 4.257474817137563e-09, + "completion_length": 561.0, + "delta_ref_entropy_loss": 0.091796875, + "delta_ref_ppl": -0.09521484375, + "entropy_loss": -0.06494140625, + "epoch": 0.3376, + "grad_norm": 0.6367800782407306, + "k1_kl": 0.09521484375, + "k3_kl": 0.052001953125, + "kimi_kl": 0.140625, + "learning_rate": 3.312e-07, + "loss": 0.0021, + "ppl": 0.027587890625, + "reward": 0.25792062282562256, + "reward_std": 0.000682278536260128, + "rewards/perpo_ocr_edit_distance_reward": 0.25792062282562256, "step": 1688, "temperature": 0.9 }, { - "advantages": -0.00043258070945739746, - "completion_length": 664.5, - "delta_ref_entropy_loss": 0.0213623046875, - "delta_ref_ppl": -0.01971435546875, - "entropy_loss": -0.01373291015625, - "epoch": 0.6756, - "grad_norm": 0.13289801851148078, - "k1_kl": 0.01971435546875, - "k3_kl": 0.011932373046875, - "kimi_kl": 0.03363037109375, - "learning_rate": 1.622e-07, - "loss": 0.0009, - "ppl": 0.005950927734375, - "reward": 0.9994909465312958, - "reward_std": 0.00012401801359374076, - "rewards/perpo_ocr_edit_distance_reward": 0.9994910061359406, + "advantages": -2.8950827982043847e-05, + "completion_length": 2035.0, + "delta_ref_entropy_loss": 0.04150390625, + "delta_ref_ppl": -0.04345703125, + "entropy_loss": -0.15234375, + "epoch": 0.3378, + "grad_norm": 1.7925588052131065, + "k1_kl": 0.04345703125, + "k3_kl": 0.0341796875, + "kimi_kl": 0.061767578125, + "learning_rate": 3.311e-07, + "loss": 0.0014, + "ppl": 0.0908203125, + "reward": 0.9565321207046509, + "reward_std": 0.00195940094999969, + "rewards/perpo_ocr_edit_distance_reward": 0.9565322399139404, "step": 1689, "temperature": 0.9 }, { - "advantages": -1.9814287952613086e-05, - "completion_length": 486.0, - "delta_ref_entropy_loss": 0.059326171875, - "delta_ref_ppl": -0.0330810546875, - "entropy_loss": -0.02496337890625, - "epoch": 0.676, - "grad_norm": 1.9983032487335095, - "k1_kl": 0.0330810546875, - "k3_kl": 0.01702880859375, - "kimi_kl": 0.0540771484375, - "learning_rate": 1.62e-07, - "loss": 0.0007, - "ppl": 0.0107269287109375, - "reward": 0.9989092946052551, - "reward_std": 0.0005946924793533981, - "rewards/perpo_ocr_edit_distance_reward": 0.9989093244075775, + "advantages": -5.3048138397571165e-06, + "completion_length": 232.0, + "delta_ref_entropy_loss": 0.0213623046875, + "delta_ref_ppl": -0.12060546875, + "entropy_loss": -0.061279296875, + "epoch": 0.338, + "grad_norm": 1.6041453266543073, + "k1_kl": 0.12060546875, + "k3_kl": 0.09423828125, + "kimi_kl": 0.455078125, + "learning_rate": 3.31e-07, + "loss": 0.0038, + "ppl": 0.0235595703125, + "reward": 0.9872742295265198, + "reward_std": 0.0031092525459825993, + "rewards/perpo_ocr_edit_distance_reward": 0.9872742891311646, "step": 1690, "temperature": 0.9 }, { - "advantages": 3.3804349186539184e-06, - "completion_length": 1047.5, - "delta_ref_entropy_loss": 0.039306640625, - "delta_ref_ppl": -0.023712158203125, - "entropy_loss": -0.0472412109375, - "epoch": 0.6764, - "grad_norm": 5491.097962769116, - "k1_kl": 0.023681640625, - "k3_kl": 50.259765625, - "kimi_kl": 0.0487060546875, - "learning_rate": 1.6179999999999998e-07, - "loss": 2.0223, - "ppl": 0.02691650390625, - "reward": 0.9926476776599884, - "reward_std": 0.001536341616883874, - "rewards/perpo_ocr_edit_distance_reward": 0.9926476776599884, - "step": 1691, - "temperature": 0.9 - }, - { - "advantages": -2.1840846557097393e-05, - "completion_length": 575.0, - "delta_ref_entropy_loss": 0.07257080078125, - "delta_ref_ppl": -0.05682373046875, - "entropy_loss": -0.07696533203125, - "epoch": 0.6768, - "grad_norm": 1.4302261033871861, - "k1_kl": 0.05682373046875, - "k3_kl": 0.030120849609375, - "kimi_kl": 0.0626220703125, - "learning_rate": 1.616e-07, - "loss": 0.0012, - "ppl": 0.049102783203125, - "reward": 0.9235391914844513, - "reward_std": 0.006258100358536467, - "rewards/perpo_ocr_edit_distance_reward": 0.9235392510890961, + "advantages": -3.358296089572832e-05, + "completion_length": 596.0, + "delta_ref_entropy_loss": 0.05224609375, + "delta_ref_ppl": -0.061279296875, + "entropy_loss": -0.072265625, + "epoch": 0.3382, + "grad_norm": 0.8791296014169566, + "k1_kl": 0.061279296875, + "k3_kl": 0.03564453125, + "kimi_kl": 0.091796875, + "learning_rate": 3.3090000000000003e-07, + "loss": 0.0015, + "ppl": 0.03515625, + "reward": 0.9592212438583374, + "reward_std": 0.0024369421880692244, + "rewards/perpo_ocr_edit_distance_reward": 0.959221363067627, + "step": 1691, + "temperature": 0.9 + }, + { + "advantages": -5.778670674772002e-05, + "completion_length": 1431.0, + "delta_ref_entropy_loss": 0.044189453125, + "delta_ref_ppl": -0.03466796875, + "entropy_loss": -0.064453125, + "epoch": 0.3384, + "grad_norm": 0.5798268876034808, + "k1_kl": 0.03466796875, + "k3_kl": 0.0164794921875, + "kimi_kl": 0.0267333984375, + "learning_rate": 3.3079999999999997e-07, + "loss": 0.0007, + "ppl": 0.0308837890625, + "reward": 0.9742862582206726, + "reward_std": 0.0006366092711687088, + "rewards/perpo_ocr_edit_distance_reward": 0.9742863774299622, "step": 1692, "temperature": 0.9 }, { - "advantages": -3.486020341370022e-05, - "completion_length": 412.5, - "delta_ref_entropy_loss": 0.03857421875, - "delta_ref_ppl": -0.03729248046875, - "entropy_loss": -0.03253173828125, - "epoch": 0.6772, - "grad_norm": 0.6705225299506223, - "k1_kl": 0.03729248046875, - "k3_kl": 0.02166748046875, - "kimi_kl": 0.04864501953125, - "learning_rate": 1.6139999999999998e-07, - "loss": 0.0009, - "ppl": 0.018310546875, - "reward": 0.9886762201786041, - "reward_std": 0.0007142489484976977, - "rewards/perpo_ocr_edit_distance_reward": 0.9886762499809265, + "advantages": -4.659380283555947e-05, + "completion_length": 506.0, + "delta_ref_entropy_loss": 0.0986328125, + "delta_ref_ppl": -0.1005859375, + "entropy_loss": -0.0703125, + "epoch": 0.3386, + "grad_norm": 0.7909891416932704, + "k1_kl": 0.10009765625, + "k3_kl": 0.0615234375, + "kimi_kl": 0.1884765625, + "learning_rate": 3.3069999999999996e-07, + "loss": 0.0025, + "ppl": 0.0289306640625, + "reward": 0.9686322212219238, + "reward_std": 0.0013615689240396023, + "rewards/perpo_ocr_edit_distance_reward": 0.9686322808265686, "step": 1693, "temperature": 0.9 }, { - "advantages": -1.634870386624243e-05, - "completion_length": 282.5, - "delta_ref_entropy_loss": 0.03985595703125, - "delta_ref_ppl": -0.0562744140625, - "entropy_loss": -0.02593994140625, - "epoch": 0.6776, - "grad_norm": 0.8973839885863353, - "k1_kl": 0.0562744140625, - "k3_kl": 0.03839111328125, - "kimi_kl": 0.15673828125, - "learning_rate": 1.6120000000000001e-07, - "loss": 0.0016, - "ppl": 0.011260986328125, - "reward": 0.9995269775390625, - "reward_std": 0.001251536188647151, - "rewards/perpo_ocr_edit_distance_reward": 0.9995270073413849, + "advantages": -6.183556251926348e-05, + "completion_length": 880.0, + "delta_ref_entropy_loss": 0.0247802734375, + "delta_ref_ppl": -0.04833984375, + "entropy_loss": -0.02978515625, + "epoch": 0.3388, + "grad_norm": 0.3364330923184881, + "k1_kl": 0.04833984375, + "k3_kl": 0.032958984375, + "kimi_kl": 0.11181640625, + "learning_rate": 3.306e-07, + "loss": 0.0014, + "ppl": 0.0111083984375, + "reward": 0.9237943291664124, + "reward_std": 0.00031323314760811627, + "rewards/perpo_ocr_edit_distance_reward": 0.9237943887710571, "step": 1694, "temperature": 0.9 }, { - "advantages": -2.3488489335310447e-05, - "completion_length": 464.5, - "delta_ref_entropy_loss": 0.0277099609375, - "delta_ref_ppl": -0.046173095703125, - "entropy_loss": -0.015625, - "epoch": 0.678, - "grad_norm": 0.5716148447801421, - "k1_kl": 0.0458984375, - "k3_kl": 0.03217315673828125, - "kimi_kl": 0.1132354736328125, - "learning_rate": 1.61e-07, - "loss": 0.0013, - "ppl": 0.0089111328125, - "reward": 0.9979381561279297, - "reward_std": 0.0005049986793892458, - "rewards/perpo_ocr_edit_distance_reward": 0.9979382157325745, + "advantages": -5.091939783596899e-06, + "completion_length": 580.0, + "delta_ref_entropy_loss": 0.0615234375, + "delta_ref_ppl": -0.0830078125, + "entropy_loss": -0.0537109375, + "epoch": 0.339, + "grad_norm": 0.6482170410811696, + "k1_kl": 0.0830078125, + "k3_kl": 0.05517578125, + "kimi_kl": 0.216796875, + "learning_rate": 3.305e-07, + "loss": 0.0022, + "ppl": 0.020263671875, + "reward": 0.9863652586936951, + "reward_std": 0.009956536814570427, + "rewards/perpo_ocr_edit_distance_reward": 0.9863653182983398, "step": 1695, "temperature": 0.9 }, { - "advantages": -0.0002630608396430034, - "completion_length": 629.5, - "delta_ref_entropy_loss": 0.02545166015625, - "delta_ref_ppl": -0.0148468017578125, - "entropy_loss": -0.01458740234375, - "epoch": 0.6784, - "grad_norm": 0.1591685430011283, - "k1_kl": 0.01483154296875, - "k3_kl": 0.007537841796875, - "kimi_kl": 0.0133819580078125, - "learning_rate": 1.6079999999999998e-07, - "loss": 0.0006, - "ppl": 0.0054168701171875, - "reward": 0.9124262928962708, - "reward_std": 7.17441871529445e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9124262928962708, + "advantages": -1.9243785573053174e-05, + "completion_length": 395.0, + "delta_ref_entropy_loss": 0.060791015625, + "delta_ref_ppl": -0.0908203125, + "entropy_loss": -0.04052734375, + "epoch": 0.3392, + "grad_norm": 0.903275080957164, + "k1_kl": 0.09130859375, + "k3_kl": 0.056396484375, + "kimi_kl": 0.169921875, + "learning_rate": 3.304e-07, + "loss": 0.0023, + "ppl": 0.015380859375, + "reward": 0.9803306460380554, + "reward_std": 0.0025546823162585497, + "rewards/perpo_ocr_edit_distance_reward": 0.9803306460380554, "step": 1696, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 492.5, - "delta_ref_entropy_loss": 0.02679443359375, - "delta_ref_ppl": -0.0206298828125, - "entropy_loss": -0.024658203125, - "epoch": 0.6788, - "grad_norm": 0.02202022952265776, - "k1_kl": 0.0206298828125, - "k3_kl": 0.012176513671875, - "kimi_kl": 0.0361328125, - "learning_rate": 1.606e-07, - "loss": 0.0005, - "ppl": 0.0111083984375, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": -3.576278970740532e-07, + "completion_length": 646.0, + "delta_ref_entropy_loss": 0.05126953125, + "delta_ref_ppl": -0.08544921875, + "entropy_loss": -0.1572265625, + "epoch": 0.3394, + "grad_norm": 2.1198708869622633, + "k1_kl": 0.0859375, + "k3_kl": 0.058349609375, + "kimi_kl": 0.177734375, + "learning_rate": 3.303e-07, + "loss": 0.0023, + "ppl": 0.072265625, + "reward": 0.7083389759063721, + "reward_std": 0.07399699091911316, + "rewards/perpo_ocr_edit_distance_reward": 0.7083390355110168, "step": 1697, "temperature": 0.9 }, { - "advantages": -2.322026739420835e-05, - "completion_length": 584.5, - "delta_ref_entropy_loss": 0.06195068359375, - "delta_ref_ppl": -0.07183837890625, - "entropy_loss": -0.06512451171875, - "epoch": 0.6792, - "grad_norm": 1.3185574044704882, - "k1_kl": 0.07183837890625, - "k3_kl": 0.050140380859375, - "kimi_kl": 0.155029296875, - "learning_rate": 1.6039999999999998e-07, - "loss": 0.002, - "ppl": 0.035308837890625, - "reward": 0.9165930449962616, - "reward_std": 0.07946515292860568, - "rewards/perpo_ocr_edit_distance_reward": 0.9165930449962616, + "advantages": -2.634525480971206e-05, + "completion_length": 215.0, + "delta_ref_entropy_loss": 0.1376953125, + "delta_ref_ppl": -0.2177734375, + "entropy_loss": -0.10302734375, + "epoch": 0.3396, + "grad_norm": 1.2171022097434172, + "k1_kl": 0.216796875, + "k3_kl": 0.16015625, + "kimi_kl": 0.73828125, + "learning_rate": 3.302e-07, + "loss": 0.0064, + "ppl": 0.043701171875, + "reward": 0.991730809211731, + "reward_std": 0.0024868755135685205, + "rewards/perpo_ocr_edit_distance_reward": 0.9917308688163757, "step": 1698, "temperature": 0.9 }, { - "advantages": -3.603952479558359e-05, - "completion_length": 298.0, - "delta_ref_entropy_loss": -0.09954833984375, - "delta_ref_ppl": -0.721954345703125, - "entropy_loss": -0.342620849609375, - "epoch": 0.6796, - "grad_norm": 16.578510463682544, - "k1_kl": 0.72198486328125, - "k3_kl": 0.6596221923828125, - "kimi_kl": 3.912750244140625, - "learning_rate": 1.602e-07, - "loss": 0.0264, - "ppl": 0.14703369140625, - "reward": 0.5387169383466244, - "reward_std": 0.06169455306371674, - "rewards/perpo_ocr_edit_distance_reward": 0.5387169979512691, + "advantages": -2.282006425957661e-05, + "completion_length": 773.0, + "delta_ref_entropy_loss": 0.0311279296875, + "delta_ref_ppl": -0.04931640625, + "entropy_loss": -0.033935546875, + "epoch": 0.3398, + "grad_norm": 0.58600786675775, + "k1_kl": 0.04931640625, + "k3_kl": 0.031494140625, + "kimi_kl": 0.0908203125, + "learning_rate": 3.3009999999999997e-07, + "loss": 0.0013, + "ppl": 0.016357421875, + "reward": 0.9881230592727661, + "reward_std": 0.003630612511187792, + "rewards/perpo_ocr_edit_distance_reward": 0.9881231188774109, "step": 1699, "temperature": 0.9 }, { - "advantages": -1.8468925645720446e-05, - "completion_length": 427.5, - "delta_ref_entropy_loss": 0.038330078125, - "delta_ref_ppl": -0.03985595703125, - "entropy_loss": -0.03057861328125, - "epoch": 0.68, - "grad_norm": 1.0460577189291316, - "k1_kl": 0.0399169921875, - "k3_kl": 0.0291748046875, - "kimi_kl": 0.112548828125, - "learning_rate": 1.6e-07, - "loss": 0.0012, - "ppl": 0.015594482421875, - "reward": 0.9978238046169281, - "reward_std": 0.0017468034056946635, - "rewards/perpo_ocr_edit_distance_reward": 0.9978238344192505, + "advantages": -1.9243786937295226e-06, + "completion_length": 571.0, + "delta_ref_entropy_loss": 0.123046875, + "delta_ref_ppl": -0.119140625, + "entropy_loss": -0.283203125, + "epoch": 0.34, + "grad_norm": 2.222276310319345, + "k1_kl": 0.11865234375, + "k3_kl": 0.072265625, + "kimi_kl": 0.1826171875, + "learning_rate": 3.3e-07, + "loss": 0.0029, + "ppl": 0.158203125, + "reward": 0.8360196352005005, + "reward_std": 0.013216491788625717, + "rewards/perpo_ocr_edit_distance_reward": 0.8360196352005005, "step": 1700, "temperature": 0.9 }, { - "advantages": -4.504408167349538e-06, - "completion_length": 1193.5, - "delta_ref_entropy_loss": 0.04351806640625, - "delta_ref_ppl": -0.027923583984375, - "entropy_loss": -0.073974609375, - "epoch": 0.6804, - "grad_norm": 29.60542767300363, - "k1_kl": 0.02789306640625, - "k3_kl": 0.341552734375, - "kimi_kl": 0.05419921875, - "learning_rate": 1.598e-07, - "loss": 0.0137, - "ppl": 0.047607421875, - "reward": 0.7974275350570679, - "reward_std": 0.03395225363783538, - "rewards/perpo_ocr_edit_distance_reward": 0.7974275648593903, + "advantages": -2.1176680093049072e-05, + "completion_length": 77.0, + "delta_ref_entropy_loss": 0.007537841796875, + "delta_ref_ppl": -0.345703125, + "entropy_loss": -0.052978515625, + "epoch": 0.3402, + "grad_norm": 3.4035930783353594, + "k1_kl": 0.345703125, + "k3_kl": 0.298828125, + "kimi_kl": 1.84375, + "learning_rate": 3.299e-07, + "loss": 0.012, + "ppl": 0.0234375, + "reward": 0.9679415822029114, + "reward_std": 0.0027167920488864183, + "rewards/perpo_ocr_edit_distance_reward": 0.9679415822029114, "step": 1701, "temperature": 0.9 }, { - "advantages": -3.3983163120865356e-05, - "completion_length": 652.5, - "delta_ref_entropy_loss": 0.03875732421875, - "delta_ref_ppl": -0.0318603515625, - "entropy_loss": -0.02764892578125, - "epoch": 0.6808, - "grad_norm": 0.6726409525799619, - "k1_kl": 0.03173828125, - "k3_kl": 0.0196533203125, - "kimi_kl": 0.0638427734375, - "learning_rate": 1.5959999999999997e-07, - "loss": 0.0008, - "ppl": 0.0140380859375, - "reward": 0.9988324046134949, - "reward_std": 0.0003056103305425495, - "rewards/perpo_ocr_edit_distance_reward": 0.9988324642181396, + "advantages": -1.4356204701471142e-05, + "completion_length": 58.0, + "delta_ref_entropy_loss": 0.125, + "delta_ref_ppl": -0.46484375, + "entropy_loss": -0.08837890625, + "epoch": 0.3404, + "grad_norm": 3.4426945792711874, + "k1_kl": 0.46484375, + "k3_kl": 0.390625, + "kimi_kl": 2.1875, + "learning_rate": 3.2979999999999995e-07, + "loss": 0.0157, + "ppl": 0.03125, + "reward": 0.989564836025238, + "reward_std": 0.005233217030763626, + "rewards/perpo_ocr_edit_distance_reward": 0.9895649552345276, "step": 1702, "temperature": 0.9 }, { - "advantages": -1.3070447494101245e-05, - "completion_length": 548.0, - "delta_ref_entropy_loss": 0.0255126953125, - "delta_ref_ppl": -0.0206298828125, - "entropy_loss": -0.016754150390625, - "epoch": 0.6812, - "grad_norm": 0.309302586953362, - "k1_kl": 0.020660400390625, - "k3_kl": 0.013427734375, - "kimi_kl": 0.047119140625, - "learning_rate": 1.5939999999999998e-07, - "loss": 0.0006, - "ppl": 0.00970458984375, - "reward": 0.9997950792312622, - "reward_std": 0.00011285056098131463, - "rewards/perpo_ocr_edit_distance_reward": 0.9997951090335846, + "advantages": -2.8005670174025e-05, + "completion_length": 435.0, + "delta_ref_entropy_loss": 0.07958984375, + "delta_ref_ppl": -0.08935546875, + "entropy_loss": -0.034912109375, + "epoch": 0.3406, + "grad_norm": 0.6004083709170025, + "k1_kl": 0.08935546875, + "k3_kl": 0.05859375, + "kimi_kl": 0.1845703125, + "learning_rate": 3.297e-07, + "loss": 0.0024, + "ppl": 0.01495361328125, + "reward": 0.9971444010734558, + "reward_std": 0.0008127999608404934, + "rewards/perpo_ocr_edit_distance_reward": 0.9971444606781006, "step": 1703, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 529.5, - "delta_ref_entropy_loss": 0.029510498046875, - "delta_ref_ppl": -0.021636962890625, - "entropy_loss": -0.011871337890625, - "epoch": 0.6816, - "grad_norm": 0.022240812902842803, - "k1_kl": 0.021697998046875, - "k3_kl": 0.0098419189453125, - "kimi_kl": 0.018768310546875, - "learning_rate": 1.592e-07, - "loss": 0.0007, - "ppl": 0.00347137451171875, - "reward": 0.9998324513435364, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9998324811458588, + "advantages": -2.54341539402958e-05, + "completion_length": 1315.0, + "delta_ref_entropy_loss": 0.0150146484375, + "delta_ref_ppl": -0.0272216796875, + "entropy_loss": -0.01953125, + "epoch": 0.3408, + "grad_norm": 0.19211284409450738, + "k1_kl": 0.02734375, + "k3_kl": 0.018310546875, + "kimi_kl": 0.055908203125, + "learning_rate": 3.296e-07, + "loss": 0.0008, + "ppl": 0.007049560546875, + "reward": 0.9991658329963684, + "reward_std": 0.0005700653418898582, + "rewards/perpo_ocr_edit_distance_reward": 0.9991658329963684, "step": 1704, "temperature": 0.9 }, { - "advantages": -0.00030944177160563413, - "completion_length": 400.0, - "delta_ref_entropy_loss": 0.08428955078125, - "delta_ref_ppl": -0.04693603515625, - "entropy_loss": -0.08245849609375, - "epoch": 0.682, - "grad_norm": 0.9532971626224269, - "k1_kl": 0.046905517578125, - "k3_kl": 0.0227203369140625, - "kimi_kl": 0.0458831787109375, - "learning_rate": 1.59e-07, - "loss": 0.0012, - "ppl": 0.03974151611328125, - "reward": 0.9633941352367401, - "reward_std": 0.001068761688657105, - "rewards/perpo_ocr_edit_distance_reward": 0.9633942246437073, + "advantages": -5.757809049100615e-05, + "completion_length": 208.0, + "delta_ref_entropy_loss": 0.1328125, + "delta_ref_ppl": -0.1630859375, + "entropy_loss": -0.051025390625, + "epoch": 0.341, + "grad_norm": 0.9528959048795717, + "k1_kl": 0.1630859375, + "k3_kl": 0.11328125, + "kimi_kl": 0.443359375, + "learning_rate": 3.295e-07, + "loss": 0.0046, + "ppl": 0.020751953125, + "reward": 0.9862536787986755, + "reward_std": 0.0015268614515662193, + "rewards/perpo_ocr_edit_distance_reward": 0.9862537980079651, "step": 1705, "temperature": 0.9 }, { - "advantages": -1.4134816410660278e-06, - "completion_length": 479.0, - "delta_ref_entropy_loss": 0.1234130859375, - "delta_ref_ppl": -0.08282470703125, - "entropy_loss": -0.22021484375, - "epoch": 0.6824, - "grad_norm": 1.9141078714689541, - "k1_kl": 0.0828857421875, - "k3_kl": 0.048828125, - "kimi_kl": 0.1129150390625, - "learning_rate": 1.588e-07, - "loss": 0.002, - "ppl": 0.124755859375, - "reward": 0.5764186680316925, - "reward_std": 0.025121578015387058, - "rewards/perpo_ocr_edit_distance_reward": 0.5764186680316925, + "advantages": -1.3819762898492627e-05, + "completion_length": 188.0, + "delta_ref_entropy_loss": 0.06982421875, + "delta_ref_ppl": -0.11376953125, + "entropy_loss": -0.059814453125, + "epoch": 0.3412, + "grad_norm": 1.1646794700370584, + "k1_kl": 0.11376953125, + "k3_kl": 0.0830078125, + "kimi_kl": 0.27734375, + "learning_rate": 3.294e-07, + "loss": 0.0033, + "ppl": 0.0198974609375, + "reward": 0.9686696529388428, + "reward_std": 0.001749649876728654, + "rewards/perpo_ocr_edit_distance_reward": 0.9686696529388428, "step": 1706, "temperature": 0.9 }, { - "advantages": -0.00014037320215720683, - "completion_length": 851.0, - "delta_ref_entropy_loss": 0.021453857421875, - "delta_ref_ppl": -0.0145263671875, - "entropy_loss": -0.0201416015625, - "epoch": 0.6828, - "grad_norm": 0.49771686228854706, - "k1_kl": 0.0145263671875, - "k3_kl": 0.00708770751953125, - "kimi_kl": 0.01763153076171875, - "learning_rate": 1.5859999999999998e-07, - "loss": 0.0004, - "ppl": 0.00970458984375, - "reward": 0.9991235136985779, - "reward_std": 0.000620445454842411, - "rewards/perpo_ocr_edit_distance_reward": 0.9991236329078674, + "advantages": -7.944448043417651e-06, + "completion_length": 390.0, + "delta_ref_entropy_loss": 0.10546875, + "delta_ref_ppl": -0.1396484375, + "entropy_loss": -0.07861328125, + "epoch": 0.3414, + "grad_norm": 1.5108998505009923, + "k1_kl": 0.1396484375, + "k3_kl": 0.0927734375, + "kimi_kl": 0.375, + "learning_rate": 3.2929999999999996e-07, + "loss": 0.0037, + "ppl": 0.02783203125, + "reward": 0.9548649787902832, + "reward_std": 0.010609602555632591, + "rewards/perpo_ocr_edit_distance_reward": 0.9548650979995728, "step": 1707, "temperature": 0.9 }, { - "advantages": -0.00015605773660354316, - "completion_length": 319.5, - "delta_ref_entropy_loss": 0.05712890625, - "delta_ref_ppl": -0.063232421875, - "entropy_loss": -0.0438232421875, - "epoch": 0.6832, - "grad_norm": 0.6691494219926983, - "k1_kl": 0.06298828125, - "k3_kl": 0.0401611328125, - "kimi_kl": 0.1201171875, - "learning_rate": 1.5840000000000002e-07, - "loss": 0.0018, - "ppl": 0.02362060546875, - "reward": 0.9062942266464233, - "reward_std": 0.0002764749660855159, - "rewards/perpo_ocr_edit_distance_reward": 0.9062943160533905, + "advantages": -1.2964011148142163e-05, + "completion_length": 391.0, + "delta_ref_entropy_loss": 0.03515625, + "delta_ref_ppl": -0.035400390625, + "entropy_loss": -0.0191650390625, + "epoch": 0.3416, + "grad_norm": 0.4346198444156749, + "k1_kl": 0.03515625, + "k3_kl": 0.021728515625, + "kimi_kl": 0.07470703125, + "learning_rate": 3.2919999999999996e-07, + "loss": 0.0009, + "ppl": 0.005462646484375, + "reward": 0.9962469935417175, + "reward_std": 0.000557327235583216, + "rewards/perpo_ocr_edit_distance_reward": 0.9962470531463623, "step": 1708, "temperature": 0.9 }, { - "advantages": -5.5623907428525854e-05, - "completion_length": 601.0, - "delta_ref_entropy_loss": 0.09136962890625, - "delta_ref_ppl": -0.062530517578125, - "entropy_loss": -0.05108642578125, - "epoch": 0.6836, - "grad_norm": 1.0731973514040811, - "k1_kl": 0.06256103515625, - "k3_kl": 0.0384521484375, - "kimi_kl": 0.11767578125, - "learning_rate": 1.582e-07, - "loss": 0.0016, - "ppl": 0.0234375, - "reward": 0.9712752997875214, - "reward_std": 0.0020751605698023923, - "rewards/perpo_ocr_edit_distance_reward": 0.9712753295898438, + "advantages": -4.6338354877661914e-05, + "completion_length": 829.0, + "delta_ref_entropy_loss": 0.058837890625, + "delta_ref_ppl": -0.06982421875, + "entropy_loss": -0.05029296875, + "epoch": 0.3418, + "grad_norm": 1.0097285700652214, + "k1_kl": 0.06982421875, + "k3_kl": 0.04296875, + "kimi_kl": 0.1689453125, + "learning_rate": 3.291e-07, + "loss": 0.0018, + "ppl": 0.0208740234375, + "reward": 0.9562010169029236, + "reward_std": 0.0010024711955338717, + "rewards/perpo_ocr_edit_distance_reward": 0.9562010765075684, "step": 1709, "temperature": 0.9 }, { - "advantages": -2.0733901806124777e-05, - "completion_length": 950.0, - "delta_ref_entropy_loss": 0.0552978515625, - "delta_ref_ppl": -0.05780029296875, - "entropy_loss": -0.0936279296875, - "epoch": 0.684, - "grad_norm": 1.3527173726457544, - "k1_kl": 0.05780029296875, - "k3_kl": 0.03802490234375, - "kimi_kl": 0.101806640625, - "learning_rate": 1.5799999999999999e-07, - "loss": 0.0015, - "ppl": 0.0494384765625, - "reward": 0.8539297580718994, - "reward_std": 0.013852813834091648, - "rewards/perpo_ocr_edit_distance_reward": 0.8539297580718994, + "advantages": -1.5854835510253906e-05, + "completion_length": 76.0, + "delta_ref_entropy_loss": 0.26953125, + "delta_ref_ppl": -0.486328125, + "entropy_loss": -0.1064453125, + "epoch": 0.342, + "grad_norm": 1.908230420895615, + "k1_kl": 0.484375, + "k3_kl": 0.373046875, + "kimi_kl": 1.46875, + "learning_rate": 3.29e-07, + "loss": 0.0149, + "ppl": 0.03564453125, + "reward": 0.840205192565918, + "reward_std": 0.003124360227957368, + "rewards/perpo_ocr_edit_distance_reward": 0.8402053117752075, "step": 1710, "temperature": 0.9 }, { - "advantages": 0.00012009484271402471, - "completion_length": 310.5, - "delta_ref_entropy_loss": 0.140380859375, - "delta_ref_ppl": -0.1732177734375, - "entropy_loss": -0.112060546875, - "epoch": 0.6844, - "grad_norm": 1.441125817541886, - "k1_kl": 0.1732177734375, - "k3_kl": 0.1171875, - "kimi_kl": 0.469482421875, - "learning_rate": 1.578e-07, - "loss": 0.0046, - "ppl": 0.0504150390625, - "reward": 0.9733225107192993, - "reward_std": 0.00029295396052475553, - "rewards/perpo_ocr_edit_distance_reward": 0.9733225703239441, + "advantages": -2.3373537260340527e-05, + "completion_length": 461.0, + "delta_ref_entropy_loss": 0.06787109375, + "delta_ref_ppl": -0.1064453125, + "entropy_loss": -0.03173828125, + "epoch": 0.3422, + "grad_norm": 0.5295722134726171, + "k1_kl": 0.1064453125, + "k3_kl": 0.072265625, + "kimi_kl": 0.314453125, + "learning_rate": 3.289e-07, + "loss": 0.0029, + "ppl": 0.01470947265625, + "reward": 0.9961931705474854, + "reward_std": 0.0006288930308073759, + "rewards/perpo_ocr_edit_distance_reward": 0.9961931705474854, "step": 1711, "temperature": 0.9 }, { - "advantages": -0.00010325227776775137, - "completion_length": 738.5, - "delta_ref_entropy_loss": 0.04931640625, - "delta_ref_ppl": -0.0635986328125, - "entropy_loss": -0.0263671875, - "epoch": 0.6848, - "grad_norm": 0.4311548413691729, - "k1_kl": 0.063720703125, - "k3_kl": 0.0421142578125, - "kimi_kl": 0.13134765625, - "learning_rate": 1.5759999999999998e-07, - "loss": 0.0018, - "ppl": 0.0110321044921875, - "reward": 0.9964582324028015, - "reward_std": 0.000197414614376612, - "rewards/perpo_ocr_edit_distance_reward": 0.9964582920074463, + "advantages": -7.976804772624746e-05, + "completion_length": 357.0, + "delta_ref_entropy_loss": 0.056640625, + "delta_ref_ppl": -0.10986328125, + "entropy_loss": -0.03515625, + "epoch": 0.3424, + "grad_norm": 1.228846144413037, + "k1_kl": 0.10986328125, + "k3_kl": 0.07568359375, + "kimi_kl": 0.283203125, + "learning_rate": 3.288e-07, + "loss": 0.0031, + "ppl": 0.0167236328125, + "reward": 0.9977973699569702, + "reward_std": 0.0008606568444520235, + "rewards/perpo_ocr_edit_distance_reward": 0.9977974891662598, "step": 1712, "temperature": 0.9 }, { - "advantages": -2.2351743155013537e-05, - "completion_length": 925.0, - "delta_ref_entropy_loss": 0.02215576171875, - "delta_ref_ppl": -0.010955810546875, - "entropy_loss": -0.010894775390625, - "epoch": 0.6852, - "grad_norm": 0.40406988284246087, - "k1_kl": 0.010955810546875, - "k3_kl": 0.00482177734375, - "kimi_kl": 0.0102081298828125, - "learning_rate": 1.574e-07, - "loss": 0.0002, - "ppl": 0.00440216064453125, - "reward": 0.9994372129440308, - "reward_std": 0.001256187300896272, - "rewards/perpo_ocr_edit_distance_reward": 0.9994373023509979, + "advantages": -5.9689796216844115e-06, + "completion_length": 178.0, + "delta_ref_entropy_loss": 0.10400390625, + "delta_ref_ppl": -0.228515625, + "entropy_loss": -0.1083984375, + "epoch": 0.3426, + "grad_norm": 5.729332024317761, + "k1_kl": 0.228515625, + "k3_kl": 0.17578125, + "kimi_kl": 0.70703125, + "learning_rate": 3.2869999999999997e-07, + "loss": 0.007, + "ppl": 0.05419921875, + "reward": 0.9780668616294861, + "reward_std": 0.012715321965515614, + "rewards/perpo_ocr_edit_distance_reward": 0.9780669808387756, "step": 1713, "temperature": 0.9 }, { - "advantages": -0.00032461541195516475, - "completion_length": 833.0, - "delta_ref_entropy_loss": 0.02276611328125, - "delta_ref_ppl": -0.020050048828125, - "entropy_loss": -0.01995849609375, - "epoch": 0.6856, - "grad_norm": 0.21068620894708812, - "k1_kl": 0.02008056640625, - "k3_kl": 0.0118408203125, - "kimi_kl": 0.0279541015625, - "learning_rate": 1.572e-07, - "loss": 0.0008, - "ppl": 0.0094451904296875, - "reward": 0.998738557100296, - "reward_std": 0.00027033951482735574, - "rewards/perpo_ocr_edit_distance_reward": 0.9987386465072632, + "advantages": -6.4849853515625e-05, + "completion_length": 323.0, + "delta_ref_entropy_loss": 0.07666015625, + "delta_ref_ppl": -0.10693359375, + "entropy_loss": -0.031494140625, + "epoch": 0.3428, + "grad_norm": 0.5522531325791916, + "k1_kl": 0.10693359375, + "k3_kl": 0.0732421875, + "kimi_kl": 0.3125, + "learning_rate": 3.2859999999999996e-07, + "loss": 0.003, + "ppl": 0.009521484375, + "reward": 0.994660496711731, + "reward_std": 0.0006882090237922966, + "rewards/perpo_ocr_edit_distance_reward": 0.9946605563163757, "step": 1714, "temperature": 0.9 }, { - "advantages": -1.3794218830298632e-05, - "completion_length": 927.0, - "delta_ref_entropy_loss": 0.02490234375, - "delta_ref_ppl": -0.01806640625, - "entropy_loss": -0.035064697265625, - "epoch": 0.686, - "grad_norm": 0.5121854793870287, - "k1_kl": 0.01806640625, - "k3_kl": 0.011322021484375, - "kimi_kl": 0.030517578125, - "learning_rate": 1.57e-07, - "loss": 0.0005, - "ppl": 0.017059326171875, - "reward": 0.9970949292182922, - "reward_std": 0.0011841978412121534, - "rewards/perpo_ocr_edit_distance_reward": 0.997094988822937, + "advantages": -1.2993813470529858e-05, + "completion_length": 335.0, + "delta_ref_entropy_loss": 0.083984375, + "delta_ref_ppl": -0.12060546875, + "entropy_loss": -0.06640625, + "epoch": 0.343, + "grad_norm": 1.4495132799822181, + "k1_kl": 0.12109375, + "k3_kl": 0.08447265625, + "kimi_kl": 0.2734375, + "learning_rate": 3.285e-07, + "loss": 0.0034, + "ppl": 0.0341796875, + "reward": 0.9913730621337891, + "reward_std": 0.002520754234865308, + "rewards/perpo_ocr_edit_distance_reward": 0.9913731217384338, "step": 1715, "temperature": 0.9 }, { - "advantages": -1.0820372153830249e-05, - "completion_length": 559.0, - "delta_ref_entropy_loss": 0.080810546875, - "delta_ref_ppl": -0.062744140625, - "entropy_loss": -0.07080078125, - "epoch": 0.6864, - "grad_norm": 0.7036359831543046, - "k1_kl": 0.062744140625, - "k3_kl": 0.037841796875, - "kimi_kl": 0.11181640625, - "learning_rate": 1.5679999999999997e-07, - "loss": 0.0015, - "ppl": 0.0377197265625, - "reward": 0.952584981918335, - "reward_std": 0.003527272492647171, - "rewards/perpo_ocr_edit_distance_reward": 0.9525850713253021, + "advantages": -6.602492067031562e-05, + "completion_length": 560.0, + "delta_ref_entropy_loss": 0.034912109375, + "delta_ref_ppl": -0.0595703125, + "entropy_loss": -0.0264892578125, + "epoch": 0.3432, + "grad_norm": 0.49146507195202355, + "k1_kl": 0.0595703125, + "k3_kl": 0.037353515625, + "kimi_kl": 0.12158203125, + "learning_rate": 3.284e-07, + "loss": 0.0016, + "ppl": 0.00860595703125, + "reward": 0.9770475625991821, + "reward_std": 0.0009314997005276382, + "rewards/perpo_ocr_edit_distance_reward": 0.9770476818084717, "step": 1716, "temperature": 0.9 }, { - "advantages": -1.7698322608339367e-05, - "completion_length": 366.5, - "delta_ref_entropy_loss": 0.12158203125, - "delta_ref_ppl": -0.078857421875, - "entropy_loss": -0.07275390625, - "epoch": 0.6868, - "grad_norm": 1.092931540453187, - "k1_kl": 0.07861328125, - "k3_kl": 0.0440673828125, - "kimi_kl": 0.112548828125, - "learning_rate": 1.5659999999999999e-07, - "loss": 0.0018, - "ppl": 0.0377197265625, - "reward": 0.9447518587112427, - "reward_std": 0.0010817622824106365, - "rewards/perpo_ocr_edit_distance_reward": 0.9447519183158875, + "advantages": -3.3889500627992675e-06, + "completion_length": 114.0, + "delta_ref_entropy_loss": 0.06787109375, + "delta_ref_ppl": -0.38671875, + "entropy_loss": -0.06494140625, + "epoch": 0.3434, + "grad_norm": 1.5227938207673704, + "k1_kl": 0.38671875, + "k3_kl": 0.326171875, + "kimi_kl": 1.609375, + "learning_rate": 3.2829999999999994e-07, + "loss": 0.013, + "ppl": 0.025146484375, + "reward": 0.9521778225898743, + "reward_std": 0.002417699433863163, + "rewards/perpo_ocr_edit_distance_reward": 0.952177882194519, "step": 1717, "temperature": 0.9 }, { - "advantages": -7.921031647128984e-05, - "completion_length": 396.0, - "delta_ref_entropy_loss": 0.0347900390625, - "delta_ref_ppl": -0.02630615234375, - "entropy_loss": -0.02227783203125, - "epoch": 0.6872, - "grad_norm": 0.24616263869744323, - "k1_kl": 0.02630615234375, - "k3_kl": 0.01385498046875, - "kimi_kl": 0.0352783203125, - "learning_rate": 1.564e-07, - "loss": 0.0006, - "ppl": 0.009246826171875, - "reward": 0.9981962740421295, - "reward_std": 8.436622738372535e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9981962740421295, + "advantages": -1.3623919414840202e-07, + "completion_length": 543.0, + "delta_ref_entropy_loss": 0.06787109375, + "delta_ref_ppl": -0.10791015625, + "entropy_loss": -0.0830078125, + "epoch": 0.3436, + "grad_norm": 1.9927173070222546, + "k1_kl": 0.10791015625, + "k3_kl": 0.0732421875, + "kimi_kl": 0.1982421875, + "learning_rate": 3.282e-07, + "loss": 0.0029, + "ppl": 0.040771484375, + "reward": 0.7242749929428101, + "reward_std": 0.21645809710025787, + "rewards/perpo_ocr_edit_distance_reward": 0.7242750525474548, "step": 1718, "temperature": 0.9 }, { - "advantages": -1.952052161868778e-05, - "completion_length": 725.0, - "delta_ref_entropy_loss": 0.0355224609375, - "delta_ref_ppl": -0.04473876953125, - "entropy_loss": -0.05078125, - "epoch": 0.6876, - "grad_norm": 0.8020281856428418, - "k1_kl": 0.04473876953125, - "k3_kl": 0.03118896484375, - "kimi_kl": 0.104248046875, - "learning_rate": 1.562e-07, - "loss": 0.0013, - "ppl": 0.02471923828125, - "reward": 0.9854504764080048, - "reward_std": 0.005595609138254076, - "rewards/perpo_ocr_edit_distance_reward": 0.9854505360126495, + "advantages": -1.8562590412329882e-05, + "completion_length": 88.0, + "delta_ref_entropy_loss": 0.1279296875, + "delta_ref_ppl": -0.388671875, + "entropy_loss": -0.10400390625, + "epoch": 0.3438, + "grad_norm": 2.7582833541852616, + "k1_kl": 0.390625, + "k3_kl": 0.306640625, + "kimi_kl": 1.3984375, + "learning_rate": 3.281e-07, + "loss": 0.0123, + "ppl": 0.0439453125, + "reward": 0.8835978507995605, + "reward_std": 0.004483157303184271, + "rewards/perpo_ocr_edit_distance_reward": 0.8835979700088501, "step": 1719, "temperature": 0.9 }, { - "advantages": 4.3439014120849606e-05, - "completion_length": 839.0, - "delta_ref_entropy_loss": 0.1181640625, - "delta_ref_ppl": -0.0882568359375, - "entropy_loss": -0.17596435546875, - "epoch": 0.688, - "grad_norm": 2.473171289337837, - "k1_kl": 0.0882568359375, - "k3_kl": 0.05072021484375, - "kimi_kl": 0.1270751953125, - "learning_rate": 1.56e-07, - "loss": 0.002, - "ppl": 0.096832275390625, - "reward": 0.8509098291397095, - "reward_std": 0.014087534480495378, - "rewards/perpo_ocr_edit_distance_reward": 0.8509098589420319, + "advantages": -6.577798785656341e-07, + "completion_length": 773.0, + "delta_ref_entropy_loss": 0.08203125, + "delta_ref_ppl": -0.0751953125, + "entropy_loss": -0.10595703125, + "epoch": 0.344, + "grad_norm": 1.9554662711957265, + "k1_kl": 0.07470703125, + "k3_kl": 0.0458984375, + "kimi_kl": 0.11572265625, + "learning_rate": 3.28e-07, + "loss": 0.0018, + "ppl": 0.044677734375, + "reward": 0.7352234125137329, + "reward_std": 0.1569293737411499, + "rewards/perpo_ocr_edit_distance_reward": 0.7352235317230225, "step": 1720, "temperature": 0.9 }, { - "advantages": -5.705016405954666e-07, - "completion_length": 340.5, - "delta_ref_entropy_loss": 0.0352783203125, - "delta_ref_ppl": -0.02587890625, - "entropy_loss": -0.016265869140625, - "epoch": 0.6884, - "grad_norm": 0.3943186581458921, - "k1_kl": 0.02587890625, - "k3_kl": 0.014007568359375, - "kimi_kl": 0.039215087890625, - "learning_rate": 1.5579999999999998e-07, - "loss": 0.0006, - "ppl": 0.0073089599609375, - "reward": 0.9935542345046997, - "reward_std": 0.0037476723082363605, - "rewards/perpo_ocr_edit_distance_reward": 0.9935542643070221, + "advantages": -3.378732071723789e-05, + "completion_length": 1182.0, + "delta_ref_entropy_loss": 0.09375, + "delta_ref_ppl": -0.07861328125, + "entropy_loss": -0.0908203125, + "epoch": 0.3442, + "grad_norm": 3.9800029447875622, + "k1_kl": 0.0791015625, + "k3_kl": 0.045166015625, + "kimi_kl": 0.09375, + "learning_rate": 3.279e-07, + "loss": 0.0018, + "ppl": 0.0478515625, + "reward": 0.9781767725944519, + "reward_std": 0.0026706072967499495, + "rewards/perpo_ocr_edit_distance_reward": 0.9781768321990967, "step": 1721, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 538.5, - "delta_ref_entropy_loss": 0.02423095703125, - "delta_ref_ppl": -0.02239990234375, - "entropy_loss": -0.01470947265625, - "epoch": 0.6888, - "grad_norm": 0.012237569856415822, - "k1_kl": 0.02239990234375, - "k3_kl": 0.0137786865234375, - "kimi_kl": 0.0532989501953125, - "learning_rate": 1.556e-07, - "loss": 0.0006, - "ppl": 0.006500244140625, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": -7.154260674724355e-05, + "completion_length": 806.0, + "delta_ref_entropy_loss": 0.04443359375, + "delta_ref_ppl": -0.044677734375, + "entropy_loss": -0.031494140625, + "epoch": 0.3444, + "grad_norm": 0.27666750055735534, + "k1_kl": 0.044677734375, + "k3_kl": 0.022705078125, + "kimi_kl": 0.05078125, + "learning_rate": 3.2779999999999996e-07, + "loss": 0.001, + "ppl": 0.0107421875, + "reward": 0.9966128468513489, + "reward_std": 0.00037619794602505863, + "rewards/perpo_ocr_edit_distance_reward": 0.9966129064559937, "step": 1722, "temperature": 0.9 }, { - "advantages": -1.536096922905017e-05, - "completion_length": 604.0, - "delta_ref_entropy_loss": 0.12237548828125, - "delta_ref_ppl": -0.08154296875, - "entropy_loss": -0.1285400390625, - "epoch": 0.6892, - "grad_norm": 2.3327060229316223, - "k1_kl": 0.08154296875, - "k3_kl": 0.05157470703125, - "kimi_kl": 0.18939208984375, - "learning_rate": 1.554e-07, - "loss": 0.0021, - "ppl": 0.07373046875, - "reward": 0.8235026299953461, - "reward_std": 0.02639700227882713, - "rewards/perpo_ocr_edit_distance_reward": 0.8235026597976685, + "advantages": 9.553773452353198e-06, + "completion_length": 667.0, + "delta_ref_entropy_loss": 0.0289306640625, + "delta_ref_ppl": -0.05615234375, + "entropy_loss": -0.035400390625, + "epoch": 0.3446, + "grad_norm": 0.5903527902112029, + "k1_kl": 0.05615234375, + "k3_kl": 0.0390625, + "kimi_kl": 0.1259765625, + "learning_rate": 3.277e-07, + "loss": 0.0016, + "ppl": 0.01531982421875, + "reward": 0.9822505116462708, + "reward_std": 0.0025833849795162678, + "rewards/perpo_ocr_edit_distance_reward": 0.9822505116462708, "step": 1723, "temperature": 0.9 }, { - "advantages": -0.00046138679317664355, - "completion_length": 684.0, - "delta_ref_entropy_loss": 0.020751953125, - "delta_ref_ppl": -0.02569580078125, - "entropy_loss": -0.0157623291015625, - "epoch": 0.6896, - "grad_norm": 0.08842958869716666, - "k1_kl": 0.02569580078125, - "k3_kl": 0.01751708984375, - "kimi_kl": 0.056396484375, - "learning_rate": 1.552e-07, + "advantages": -4.4694970711134374e-05, + "completion_length": 1081.0, + "delta_ref_entropy_loss": 0.037109375, + "delta_ref_ppl": -0.04638671875, + "entropy_loss": -0.064453125, + "epoch": 0.3448, + "grad_norm": 0.5240437144031773, + "k1_kl": 0.04638671875, + "k3_kl": 0.0289306640625, + "kimi_kl": 0.0810546875, + "learning_rate": 3.276e-07, "loss": 0.0012, - "ppl": 0.00698089599609375, - "reward": 0.9997848570346832, - "reward_std": 6.727648724336177e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9997848868370056, + "ppl": 0.03076171875, + "reward": 0.9894229769706726, + "reward_std": 0.0016144586261361837, + "rewards/perpo_ocr_edit_distance_reward": 0.9894230961799622, "step": 1724, "temperature": 0.9 }, { - "advantages": 8.174351933121216e-07, - "completion_length": 920.0, - "delta_ref_entropy_loss": 0.048583984375, - "delta_ref_ppl": -0.033935546875, - "entropy_loss": -0.0325927734375, - "epoch": 0.69, - "grad_norm": 1.1295756835503472, - "k1_kl": 0.033935546875, - "k3_kl": 0.017974853515625, - "kimi_kl": 0.0465087890625, - "learning_rate": 1.55e-07, - "loss": 0.0007, - "ppl": 0.0130767822265625, - "reward": 0.9705004692077637, - "reward_std": 0.002672833375982009, - "rewards/perpo_ocr_edit_distance_reward": 0.9705004990100861, + "advantages": -8.20841160020791e-06, + "completion_length": 301.0, + "delta_ref_entropy_loss": 0.0693359375, + "delta_ref_ppl": -0.1064453125, + "entropy_loss": -0.07275390625, + "epoch": 0.345, + "grad_norm": 0.9006012410075108, + "k1_kl": 0.1064453125, + "k3_kl": 0.06787109375, + "kimi_kl": 0.18359375, + "learning_rate": 3.275e-07, + "loss": 0.0027, + "ppl": 0.034912109375, + "reward": 0.9929291009902954, + "reward_std": 0.003006405895575881, + "rewards/perpo_ocr_edit_distance_reward": 0.9929291009902954, "step": 1725, "temperature": 0.9 }, { - "advantages": -0.00011527113201736938, - "completion_length": 663.0, - "delta_ref_entropy_loss": 0.0399169921875, - "delta_ref_ppl": -0.0411376953125, - "entropy_loss": -0.02520751953125, - "epoch": 0.6904, - "grad_norm": 0.29150330324447943, - "k1_kl": 0.0411376953125, - "k3_kl": 0.02496337890625, - "kimi_kl": 0.071044921875, - "learning_rate": 1.5479999999999998e-07, - "loss": 0.0011, - "ppl": 0.01318359375, - "reward": 0.9960972964763641, - "reward_std": 0.00018290297884959728, - "rewards/perpo_ocr_edit_distance_reward": 0.9960973560810089, + "advantages": -0.00017418181232642382, + "completion_length": 475.0, + "delta_ref_entropy_loss": 0.08349609375, + "delta_ref_ppl": -0.09130859375, + "entropy_loss": -0.03857421875, + "epoch": 0.3452, + "grad_norm": 0.5736894239625947, + "k1_kl": 0.0908203125, + "k3_kl": 0.054931640625, + "kimi_kl": 0.166015625, + "learning_rate": 3.2740000000000003e-07, + "loss": 0.0024, + "ppl": 0.01544189453125, + "reward": 0.9948087334632874, + "reward_std": 0.00043779550469480455, + "rewards/perpo_ocr_edit_distance_reward": 0.9948087930679321, "step": 1726, "temperature": 0.9 }, { - "advantages": -2.6353768589615356e-05, - "completion_length": 417.5, - "delta_ref_entropy_loss": 0.0491943359375, - "delta_ref_ppl": -0.03521728515625, - "entropy_loss": -0.0286865234375, - "epoch": 0.6908, - "grad_norm": 0.5742211701296774, - "k1_kl": 0.035491943359375, - "k3_kl": 0.0197296142578125, - "kimi_kl": 0.05173492431640625, - "learning_rate": 1.5459999999999997e-07, - "loss": 0.0008, - "ppl": 0.014556884765625, - "reward": 0.9902209341526031, - "reward_std": 0.0009134681313298643, - "rewards/perpo_ocr_edit_distance_reward": 0.9902209341526031, + "advantages": -2.9257367714308202e-05, + "completion_length": 486.0, + "delta_ref_entropy_loss": 0.0771484375, + "delta_ref_ppl": -0.08251953125, + "entropy_loss": -0.04833984375, + "epoch": 0.3454, + "grad_norm": 0.8862852936151072, + "k1_kl": 0.08251953125, + "k3_kl": 0.050048828125, + "kimi_kl": 0.13671875, + "learning_rate": 3.2729999999999997e-07, + "loss": 0.002, + "ppl": 0.0206298828125, + "reward": 0.9849488139152527, + "reward_std": 0.0007736477418802679, + "rewards/perpo_ocr_edit_distance_reward": 0.9849488735198975, "step": 1727, "temperature": 0.9 }, { - "advantages": -4.4694970711134374e-05, - "completion_length": 500.5, - "delta_ref_entropy_loss": 0.07177734375, - "delta_ref_ppl": -0.081787109375, - "entropy_loss": -0.052215576171875, - "epoch": 0.6912, - "grad_norm": 1.3620757613223784, - "k1_kl": 0.08154296875, - "k3_kl": 0.0537109375, - "kimi_kl": 0.1650390625, - "learning_rate": 1.544e-07, - "loss": 0.0022, - "ppl": 0.023529052734375, - "reward": 0.9870539903640747, - "reward_std": 0.0010936038743238896, - "rewards/perpo_ocr_edit_distance_reward": 0.9870540499687195, + "advantages": -0.00022598676150664687, + "completion_length": 854.0, + "delta_ref_entropy_loss": 0.0546875, + "delta_ref_ppl": -0.0693359375, + "entropy_loss": -0.030517578125, + "epoch": 0.3456, + "grad_norm": 0.2816591662485653, + "k1_kl": 0.0693359375, + "k3_kl": 0.040283203125, + "kimi_kl": 0.138671875, + "learning_rate": 3.2719999999999997e-07, + "loss": 0.0018, + "ppl": 0.01025390625, + "reward": 0.9894870519638062, + "reward_std": 0.0003521641483530402, + "rewards/perpo_ocr_edit_distance_reward": 0.9894871711730957, "step": 1728, "temperature": 0.9 }, { - "advantages": -3.988402477261843e-05, - "completion_length": 1040.5, - "delta_ref_entropy_loss": 0.037353515625, - "delta_ref_ppl": -0.026123046875, - "entropy_loss": -0.03289794921875, - "epoch": 0.6916, - "grad_norm": 0.5773141700505664, - "k1_kl": 0.026123046875, - "k3_kl": 0.01690673828125, - "kimi_kl": 0.0455322265625, - "learning_rate": 1.542e-07, - "loss": 0.0007, - "ppl": 0.01806640625, - "reward": 0.9821591973304749, - "reward_std": 0.000993712106719613, - "rewards/perpo_ocr_edit_distance_reward": 0.982159286737442, + "advantages": -0.00010727133485488594, + "completion_length": 1348.0, + "delta_ref_entropy_loss": 0.0274658203125, + "delta_ref_ppl": -0.0458984375, + "entropy_loss": -0.043212890625, + "epoch": 0.3458, + "grad_norm": 0.4793564090966663, + "k1_kl": 0.0458984375, + "k3_kl": 0.0296630859375, + "kimi_kl": 0.07373046875, + "learning_rate": 3.271e-07, + "loss": 0.0013, + "ppl": 0.0220947265625, + "reward": 0.996761679649353, + "reward_std": 0.0008526176679879427, + "rewards/perpo_ocr_edit_distance_reward": 0.9967617988586426, "step": 1729, "temperature": 0.9 }, { - "advantages": -2.7682100153469946e-05, - "completion_length": 378.5, - "delta_ref_entropy_loss": 0.0733642578125, - "delta_ref_ppl": -0.0411376953125, - "entropy_loss": -0.0797119140625, - "epoch": 0.692, - "grad_norm": 1.33545283487021, - "k1_kl": 0.0411376953125, - "k3_kl": 0.019317626953125, - "kimi_kl": 0.0447998046875, - "learning_rate": 1.54e-07, - "loss": 0.0008, - "ppl": 0.0423126220703125, - "reward": 0.9612157344818115, - "reward_std": 0.0018853252404369414, - "rewards/perpo_ocr_edit_distance_reward": 0.9612157642841339, + "advantages": 1.7029898913278885e-07, + "completion_length": 1185.0, + "delta_ref_entropy_loss": 0.06884765625, + "delta_ref_ppl": -0.076171875, + "entropy_loss": -0.06640625, + "epoch": 0.346, + "grad_norm": 1.0891943846244492, + "k1_kl": 0.07666015625, + "k3_kl": 0.046630859375, + "kimi_kl": 0.1259765625, + "learning_rate": 3.27e-07, + "loss": 0.0019, + "ppl": 0.033447265625, + "reward": 0.8357548713684082, + "reward_std": 0.16699494421482086, + "rewards/perpo_ocr_edit_distance_reward": 0.8357548117637634, "step": 1730, "temperature": 0.9 }, { - "advantages": -1.2227467777847778e-05, - "completion_length": 980.0, - "delta_ref_entropy_loss": 0.0215606689453125, - "delta_ref_ppl": -0.0248260498046875, - "entropy_loss": -0.01258087158203125, - "epoch": 0.6924, - "grad_norm": 0.27503094416387647, - "k1_kl": 0.02484130859375, - "k3_kl": 0.015106201171875, - "kimi_kl": 0.049713134765625, - "learning_rate": 1.538e-07, - "loss": 0.0006, - "ppl": 0.00617218017578125, - "reward": 0.9946728050708771, - "reward_std": 0.0015163117786869407, - "rewards/perpo_ocr_edit_distance_reward": 0.9946728646755219, + "advantages": -8.07728138170205e-05, + "completion_length": 1034.0, + "delta_ref_entropy_loss": 0.03369140625, + "delta_ref_ppl": -0.04541015625, + "entropy_loss": -0.039794921875, + "epoch": 0.3462, + "grad_norm": 0.48135623635207425, + "k1_kl": 0.04541015625, + "k3_kl": 0.0260009765625, + "kimi_kl": 0.06396484375, + "learning_rate": 3.269e-07, + "loss": 0.0011, + "ppl": 0.0146484375, + "reward": 0.9880313277244568, + "reward_std": 0.000848717347253114, + "rewards/perpo_ocr_edit_distance_reward": 0.9880313873291016, "step": 1731, "temperature": 0.9 }, { - "advantages": -2.7748092747970077e-05, - "completion_length": 927.0, - "delta_ref_entropy_loss": 0.0294189453125, - "delta_ref_ppl": -0.0250396728515625, - "entropy_loss": -0.03485107421875, - "epoch": 0.6928, - "grad_norm": 12.593114778991161, - "k1_kl": 0.0250396728515625, - "k3_kl": 0.2353515625, - "kimi_kl": 0.061279296875, - "learning_rate": 1.5359999999999997e-07, - "loss": 0.0094, - "ppl": 0.0240478515625, - "reward": 0.9712643325328827, - "reward_std": 0.006188567407662049, - "rewards/perpo_ocr_edit_distance_reward": 0.9712643623352051, + "advantages": -0.00011604173050727695, + "completion_length": 861.0, + "delta_ref_entropy_loss": 0.03515625, + "delta_ref_ppl": -0.04931640625, + "entropy_loss": -0.0198974609375, + "epoch": 0.3464, + "grad_norm": 0.48955194407058156, + "k1_kl": 0.04931640625, + "k3_kl": 0.03173828125, + "kimi_kl": 0.1220703125, + "learning_rate": 3.268e-07, + "loss": 0.0014, + "ppl": 0.00555419921875, + "reward": 0.9946393370628357, + "reward_std": 0.00034023166517727077, + "rewards/perpo_ocr_edit_distance_reward": 0.9946393966674805, "step": 1732, "temperature": 0.9 }, { - "advantages": -2.5476729206275195e-05, - "completion_length": 224.0, - "delta_ref_entropy_loss": 0.06884765625, - "delta_ref_ppl": -0.0802001953125, - "entropy_loss": -0.047454833984375, - "epoch": 0.6932, - "grad_norm": 1.9803814267159954, - "k1_kl": 0.080078125, - "k3_kl": 0.0546875, - "kimi_kl": 0.15283203125, - "learning_rate": 1.534e-07, - "loss": 0.0022, - "ppl": 0.0201263427734375, - "reward": 0.9905461966991425, - "reward_std": 0.0007860504556447268, - "rewards/perpo_ocr_edit_distance_reward": 0.9905462563037872, + "advantages": 9.5367431640625e-07, + "completion_length": 262.0, + "delta_ref_entropy_loss": 0.125, + "delta_ref_ppl": -0.1279296875, + "entropy_loss": -0.10107421875, + "epoch": 0.3466, + "grad_norm": 3.2583514735141175, + "k1_kl": 0.1279296875, + "k3_kl": 0.08642578125, + "kimi_kl": 0.255859375, + "learning_rate": 3.267e-07, + "loss": 0.0035, + "ppl": 0.03271484375, + "reward": 0.669464111328125, + "reward_std": 0.008793837390840054, + "rewards/perpo_ocr_edit_distance_reward": 0.6694640517234802, "step": 1733, "temperature": 0.9 }, { - "advantages": -3.726141858351184e-05, - "completion_length": 795.0, - "delta_ref_entropy_loss": 0.04541015625, - "delta_ref_ppl": -0.03143310546875, - "entropy_loss": -0.027069091796875, - "epoch": 0.6936, - "grad_norm": 0.31832400915074294, - "k1_kl": 0.031494140625, - "k3_kl": 0.017578125, - "kimi_kl": 0.04486083984375, - "learning_rate": 1.532e-07, - "loss": 0.0007, - "ppl": 0.01226806640625, - "reward": 0.9682779312133789, - "reward_std": 0.0013879148464184254, - "rewards/perpo_ocr_edit_distance_reward": 0.9682780206203461, + "advantages": -6.685938569717109e-05, + "completion_length": 439.0, + "delta_ref_entropy_loss": 0.1298828125, + "delta_ref_ppl": -0.1318359375, + "entropy_loss": -0.10693359375, + "epoch": 0.3468, + "grad_norm": 1.3242227561794782, + "k1_kl": 0.1328125, + "k3_kl": 0.083984375, + "kimi_kl": 0.251953125, + "learning_rate": 3.2659999999999997e-07, + "loss": 0.0034, + "ppl": 0.054443359375, + "reward": 0.9398649334907532, + "reward_std": 0.0010466125095263124, + "rewards/perpo_ocr_edit_distance_reward": 0.939864993095398, "step": 1734, "temperature": 0.9 }, { - "advantages": -0.0003128903263132088, - "completion_length": 739.0, - "delta_ref_entropy_loss": 0.04193115234375, - "delta_ref_ppl": -0.03594970703125, - "entropy_loss": -0.019775390625, - "epoch": 0.694, - "grad_norm": 0.4666558752541475, - "k1_kl": 0.0357666015625, - "k3_kl": 0.0198974609375, - "kimi_kl": 0.04541015625, - "learning_rate": 1.5299999999999998e-07, - "loss": 0.0011, - "ppl": 0.00765228271484375, - "reward": 0.9646042287349701, - "reward_std": 9.32591428863816e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9646042585372925, + "advantages": -4.863739377469756e-05, + "completion_length": 591.0, + "delta_ref_entropy_loss": 0.0498046875, + "delta_ref_ppl": -0.07373046875, + "entropy_loss": -0.0390625, + "epoch": 0.347, + "grad_norm": 0.44628507374918575, + "k1_kl": 0.07373046875, + "k3_kl": 0.045166015625, + "kimi_kl": 0.1328125, + "learning_rate": 3.265e-07, + "loss": 0.0019, + "ppl": 0.0120849609375, + "reward": 0.8315702080726624, + "reward_std": 0.0006002724985592067, + "rewards/perpo_ocr_edit_distance_reward": 0.8315702676773071, "step": 1735, "temperature": 0.9 }, { - "advantages": -0.00011484538640615938, - "completion_length": 528.0, - "delta_ref_entropy_loss": 0.06439208984375, - "delta_ref_ppl": -0.0369873046875, - "entropy_loss": -0.0738067626953125, - "epoch": 0.6944, - "grad_norm": 1.0051510742027603, - "k1_kl": 0.0367431640625, - "k3_kl": 0.02088165283203125, - "kimi_kl": 0.0595550537109375, - "learning_rate": 1.528e-07, - "loss": 0.001, - "ppl": 0.03966522216796875, - "reward": 0.9310454726219177, - "reward_std": 0.0058101356771658175, - "rewards/perpo_ocr_edit_distance_reward": 0.9310455024242401, + "advantages": -5.1685743528651074e-05, + "completion_length": 258.0, + "delta_ref_entropy_loss": 0.08642578125, + "delta_ref_ppl": -0.1298828125, + "entropy_loss": -0.044189453125, + "epoch": 0.3472, + "grad_norm": 0.7110932412268542, + "k1_kl": 0.1298828125, + "k3_kl": 0.09326171875, + "kimi_kl": 0.310546875, + "learning_rate": 3.264e-07, + "loss": 0.0038, + "ppl": 0.01806640625, + "reward": 0.9978905320167542, + "reward_std": 0.001218197401612997, + "rewards/perpo_ocr_edit_distance_reward": 0.9978905320167542, "step": 1736, "temperature": 0.9 }, { - "advantages": -4.1042059302220935e-06, - "completion_length": 432.5, - "delta_ref_entropy_loss": 0.083984375, - "delta_ref_ppl": -0.0703125, - "entropy_loss": -0.102294921875, - "epoch": 0.6948, - "grad_norm": 8.96147935232918, - "k1_kl": 0.070068359375, - "k3_kl": 0.043212890625, - "kimi_kl": 0.128662109375, - "learning_rate": 1.526e-07, - "loss": 0.0017, - "ppl": 0.0631103515625, - "reward": 0.9798799157142639, - "reward_std": 0.0027992811519652605, - "rewards/perpo_ocr_edit_distance_reward": 0.9798799753189087, + "advantages": -1.498631149843277e-06, + "completion_length": 570.0, + "delta_ref_entropy_loss": 0.0966796875, + "delta_ref_ppl": -0.107421875, + "entropy_loss": -0.0703125, + "epoch": 0.3474, + "grad_norm": 1.3113606212610753, + "k1_kl": 0.107421875, + "k3_kl": 0.064453125, + "kimi_kl": 0.18359375, + "learning_rate": 3.2629999999999995e-07, + "loss": 0.0026, + "ppl": 0.0269775390625, + "reward": 0.9404317140579224, + "reward_std": 0.028126701712608337, + "rewards/perpo_ocr_edit_distance_reward": 0.9404317736625671, "step": 1737, "temperature": 0.9 }, { - "advantages": -0.00013138567533133028, - "completion_length": 1023.5, - "delta_ref_entropy_loss": 0.03955078125, - "delta_ref_ppl": -0.0263671875, - "entropy_loss": -0.0626220703125, - "epoch": 0.6952, - "grad_norm": 2.5208987419809885, - "k1_kl": 0.0263671875, - "k3_kl": 0.016571044921875, - "kimi_kl": 0.030517578125, - "learning_rate": 1.524e-07, - "loss": 0.0008, - "ppl": 0.03485107421875, - "reward": 0.9769879579544067, - "reward_std": 0.0015922842794680037, - "rewards/perpo_ocr_edit_distance_reward": 0.9769879877567291, + "advantages": -4.569122029352002e-05, + "completion_length": 902.0, + "delta_ref_entropy_loss": 0.043701171875, + "delta_ref_ppl": -0.037109375, + "entropy_loss": -0.026123046875, + "epoch": 0.3476, + "grad_norm": 87.85365477886073, + "k1_kl": 0.037353515625, + "k3_kl": 0.09814453125, + "kimi_kl": 0.051513671875, + "learning_rate": 3.262e-07, + "loss": 0.004, + "ppl": 0.013916015625, + "reward": 0.9984567165374756, + "reward_std": 0.0010180216049775481, + "rewards/perpo_ocr_edit_distance_reward": 0.9984568357467651, "step": 1738, "temperature": 0.9 }, { - "advantages": -3.970520992879756e-05, - "completion_length": 435.0, - "delta_ref_entropy_loss": 0.0439453125, - "delta_ref_ppl": -0.0321044921875, - "entropy_loss": -0.03436279296875, - "epoch": 0.6956, - "grad_norm": 1.1782053763102198, - "k1_kl": 0.0321044921875, - "k3_kl": 0.01873779296875, - "kimi_kl": 0.04296875, - "learning_rate": 1.522e-07, - "loss": 0.0008, - "ppl": 0.01971435546875, - "reward": 0.9962319135665894, - "reward_std": 0.0014843817916698754, - "rewards/perpo_ocr_edit_distance_reward": 0.9962320029735565, + "advantages": -5.2911898819729686e-05, + "completion_length": 884.0, + "delta_ref_entropy_loss": 0.033447265625, + "delta_ref_ppl": -0.04833984375, + "entropy_loss": -0.03125, + "epoch": 0.3478, + "grad_norm": 1.6715587679135127, + "k1_kl": 0.048583984375, + "k3_kl": 0.031005859375, + "kimi_kl": 0.07958984375, + "learning_rate": 3.261e-07, + "loss": 0.0013, + "ppl": 0.013671875, + "reward": 0.9705610871315002, + "reward_std": 0.0013481122441589832, + "rewards/perpo_ocr_edit_distance_reward": 0.9705612063407898, "step": 1739, "temperature": 0.9 }, { - "advantages": -0.00036148088838672265, - "completion_length": 252.5, - "delta_ref_entropy_loss": 0.02777099609375, - "delta_ref_ppl": -0.0196533203125, - "entropy_loss": -0.02099609375, - "epoch": 0.696, - "grad_norm": 0.25032142523596584, - "k1_kl": 0.0196533203125, - "k3_kl": 0.010711669921875, - "kimi_kl": 0.017608642578125, - "learning_rate": 1.5199999999999998e-07, - "loss": 0.0008, - "ppl": 0.009521484375, - "reward": 0.9363976120948792, - "reward_std": 0.0001848233223427087, - "rewards/perpo_ocr_edit_distance_reward": 0.9363977015018463, + "advantages": -6.23890373390168e-05, + "completion_length": 758.0, + "delta_ref_entropy_loss": 0.09375, + "delta_ref_ppl": -0.0859375, + "entropy_loss": -0.07275390625, + "epoch": 0.348, + "grad_norm": 0.5114219483739748, + "k1_kl": 0.0859375, + "k3_kl": 0.04296875, + "kimi_kl": 0.10302734375, + "learning_rate": 3.26e-07, + "loss": 0.0018, + "ppl": 0.0341796875, + "reward": 0.9726084470748901, + "reward_std": 0.0009920275770127773, + "rewards/perpo_ocr_edit_distance_reward": 0.9726085662841797, "step": 1740, "temperature": 0.9 }, { - "advantages": -7.750732720523956e-06, - "completion_length": 938.0, - "delta_ref_entropy_loss": 0.051666259765625, - "delta_ref_ppl": -0.0423736572265625, - "entropy_loss": -0.051788330078125, - "epoch": 0.6964, - "grad_norm": 0.7434799136060553, - "k1_kl": 0.0423736572265625, - "k3_kl": 0.0302734375, - "kimi_kl": 0.0784912109375, - "learning_rate": 1.518e-07, - "loss": 0.0012, - "ppl": 0.032958984375, - "reward": 0.9587377905845642, - "reward_std": 0.005345599493011832, - "rewards/perpo_ocr_edit_distance_reward": 0.9587378799915314, + "advantages": 5.364418484532507e-06, + "completion_length": 678.0, + "delta_ref_entropy_loss": 0.047607421875, + "delta_ref_ppl": -0.064453125, + "entropy_loss": -0.040771484375, + "epoch": 0.3482, + "grad_norm": 0.6330168405262954, + "k1_kl": 0.064453125, + "k3_kl": 0.03955078125, + "kimi_kl": 0.09716796875, + "learning_rate": 3.259e-07, + "loss": 0.0016, + "ppl": 0.01483154296875, + "reward": 0.9910391569137573, + "reward_std": 0.004656338598579168, + "rewards/perpo_ocr_edit_distance_reward": 0.9910391569137573, "step": 1741, "temperature": 0.9 }, { - "advantages": -6.811959707420101e-08, - "completion_length": 236.0, - "delta_ref_entropy_loss": 0.05810546875, - "delta_ref_ppl": -0.04278564453125, - "entropy_loss": -0.08648681640625, - "epoch": 0.6968, - "grad_norm": 1.0855161968162637, - "k1_kl": 0.04266357421875, - "k3_kl": 0.0235595703125, - "kimi_kl": 0.0433349609375, - "learning_rate": 1.516e-07, - "loss": 0.0009, - "ppl": 0.05355072021484375, - "reward": 0.7223847806453705, - "reward_std": 0.03179687634110451, - "rewards/perpo_ocr_edit_distance_reward": 0.7223848104476929, + "advantages": -6.895406113471836e-05, + "completion_length": 786.0, + "delta_ref_entropy_loss": 0.04443359375, + "delta_ref_ppl": -0.06787109375, + "entropy_loss": -0.042724609375, + "epoch": 0.3484, + "grad_norm": 0.8171798743054342, + "k1_kl": 0.068359375, + "k3_kl": 0.0439453125, + "kimi_kl": 0.1142578125, + "learning_rate": 3.2579999999999997e-07, + "loss": 0.0018, + "ppl": 0.0189208984375, + "reward": 0.9933487176895142, + "reward_std": 0.0008881751564331353, + "rewards/perpo_ocr_edit_distance_reward": 0.9933487772941589, "step": 1742, "temperature": 0.9 }, { - "advantages": 2.1840845874976367e-05, - "completion_length": 200.0, - "delta_ref_entropy_loss": 0.0535888671875, - "delta_ref_ppl": -0.036712646484375, - "entropy_loss": -0.031982421875, - "epoch": 0.6972, - "grad_norm": 0.7002805378614894, - "k1_kl": 0.03668212890625, - "k3_kl": 0.02178955078125, - "kimi_kl": 0.061920166015625, - "learning_rate": 1.514e-07, - "loss": 0.0008, - "ppl": 0.016754150390625, - "reward": 0.9993523955345154, - "reward_std": 0.0002422954567009583, - "rewards/perpo_ocr_edit_distance_reward": 0.9993523955345154, + "advantages": -5.005087223253213e-05, + "completion_length": 1082.0, + "delta_ref_entropy_loss": 0.06494140625, + "delta_ref_ppl": -0.0478515625, + "entropy_loss": -0.068359375, + "epoch": 0.3486, + "grad_norm": 0.5210626248843129, + "k1_kl": 0.0478515625, + "k3_kl": 0.021484375, + "kimi_kl": 0.037841796875, + "learning_rate": 3.2569999999999996e-07, + "loss": 0.0009, + "ppl": 0.0260009765625, + "reward": 0.7890831232070923, + "reward_std": 0.0005806046538054943, + "rewards/perpo_ocr_edit_distance_reward": 0.7890831828117371, "step": 1743, "temperature": 0.9 }, { - "advantages": -9.127174624712353e-05, - "completion_length": 594.5, - "delta_ref_entropy_loss": 0.04443359375, - "delta_ref_ppl": -0.0230712890625, - "entropy_loss": -0.029083251953125, - "epoch": 0.6976, - "grad_norm": 0.464901947955334, - "k1_kl": 0.0230712890625, - "k3_kl": 0.012725830078125, - "kimi_kl": 0.03338623046875, - "learning_rate": 1.512e-07, - "loss": 0.0006, - "ppl": 0.0142822265625, - "reward": 0.994100421667099, - "reward_std": 0.0003607722173910588, - "rewards/perpo_ocr_edit_distance_reward": 0.9941004514694214, + "advantages": -0.00010919571650447324, + "completion_length": 573.0, + "delta_ref_entropy_loss": 0.046875, + "delta_ref_ppl": -0.059814453125, + "entropy_loss": -0.03466796875, + "epoch": 0.3488, + "grad_norm": 0.3517666458342899, + "k1_kl": 0.059814453125, + "k3_kl": 0.040771484375, + "kimi_kl": 0.19921875, + "learning_rate": 3.256e-07, + "loss": 0.0017, + "ppl": 0.01519775390625, + "reward": 0.9672998785972595, + "reward_std": 0.0005238393787294626, + "rewards/perpo_ocr_edit_distance_reward": 0.9672999382019043, "step": 1744, "temperature": 0.9 }, { - "advantages": -5.653926564264111e-06, - "completion_length": 345.0, - "delta_ref_entropy_loss": 0.0599365234375, - "delta_ref_ppl": -0.10546875, - "entropy_loss": -0.06005859375, - "epoch": 0.698, - "grad_norm": 1.0125711709028244, - "k1_kl": 0.105712890625, - "k3_kl": 0.0684814453125, - "kimi_kl": 0.16357421875, - "learning_rate": 1.51e-07, - "loss": 0.0027, - "ppl": 0.024658203125, - "reward": 0.9522635042667389, - "reward_std": 0.0010790934320539236, - "rewards/perpo_ocr_edit_distance_reward": 0.9522635340690613, + "advantages": -6.949901580810547e-05, + "completion_length": 375.0, + "delta_ref_entropy_loss": 0.06640625, + "delta_ref_ppl": -0.09326171875, + "entropy_loss": -0.0301513671875, + "epoch": 0.349, + "grad_norm": 0.562442313579017, + "k1_kl": 0.09326171875, + "k3_kl": 0.06396484375, + "kimi_kl": 0.220703125, + "learning_rate": 3.255e-07, + "loss": 0.0026, + "ppl": 0.01239013671875, + "reward": 0.9943578243255615, + "reward_std": 0.0002676564909052104, + "rewards/perpo_ocr_edit_distance_reward": 0.9943578243255615, "step": 1745, "temperature": 0.9 }, { - "advantages": -1.8562590184956207e-06, - "completion_length": 928.5, - "delta_ref_entropy_loss": 0.0433349609375, - "delta_ref_ppl": -0.03839111328125, - "entropy_loss": -0.02325439453125, - "epoch": 0.6984, - "grad_norm": 333.67297308003305, - "k1_kl": 0.0384521484375, - "k3_kl": 0.294158935546875, - "kimi_kl": 0.07470703125, - "learning_rate": 1.5079999999999997e-07, - "loss": 0.0118, - "ppl": 0.012439727783203125, - "reward": 0.9865103363990784, - "reward_std": 0.0045397630892694, - "rewards/perpo_ocr_edit_distance_reward": 0.9865103363990784, + "advantages": -1.1001315215253271e-05, + "completion_length": 60.0, + "delta_ref_entropy_loss": 0.12060546875, + "delta_ref_ppl": -0.6953125, + "entropy_loss": -0.09521484375, + "epoch": 0.3492, + "grad_norm": 3.9605275247597302, + "k1_kl": 0.6953125, + "k3_kl": 0.58984375, + "kimi_kl": 3.28125, + "learning_rate": 3.254e-07, + "loss": 0.0236, + "ppl": 0.044189453125, + "reward": 0.9657434821128845, + "reward_std": 0.007632791064679623, + "rewards/perpo_ocr_edit_distance_reward": 0.9657435417175293, "step": 1746, "temperature": 0.9 }, { - "advantages": -5.813156167278066e-05, - "completion_length": 789.0, - "delta_ref_entropy_loss": 0.02301025390625, - "delta_ref_ppl": -0.0223388671875, - "entropy_loss": -0.015228271484375, - "epoch": 0.6988, - "grad_norm": 0.2507655528321931, - "k1_kl": 0.0223388671875, - "k3_kl": 0.0142822265625, - "kimi_kl": 0.037750244140625, - "learning_rate": 1.506e-07, - "loss": 0.0006, - "ppl": 0.0064544677734375, - "reward": 0.9995707869529724, - "reward_std": 0.0002429926535114646, - "rewards/perpo_ocr_edit_distance_reward": 0.9995708465576172, + "advantages": 7.40800601306546e-07, + "completion_length": 458.0, + "delta_ref_entropy_loss": 0.057373046875, + "delta_ref_ppl": -0.07373046875, + "entropy_loss": -0.060302734375, + "epoch": 0.3494, + "grad_norm": 0.6371995645529669, + "k1_kl": 0.07373046875, + "k3_kl": 0.048095703125, + "kimi_kl": 0.1474609375, + "learning_rate": 3.253e-07, + "loss": 0.0019, + "ppl": 0.0185546875, + "reward": 0.9695342779159546, + "reward_std": 0.01140252873301506, + "rewards/perpo_ocr_edit_distance_reward": 0.9695342183113098, "step": 1747, "temperature": 0.9 }, { - "advantages": -9.188907915813616e-05, - "completion_length": 726.0, - "delta_ref_entropy_loss": 0.02545166015625, - "delta_ref_ppl": -0.0114593505859375, - "entropy_loss": -0.01739501953125, - "epoch": 0.6992, - "grad_norm": 0.45631189297472263, - "k1_kl": 0.0114898681640625, - "k3_kl": 0.006866455078125, - "kimi_kl": 0.0205230712890625, - "learning_rate": 1.504e-07, - "loss": 0.0004, - "ppl": 0.00830078125, - "reward": 0.9997096657752991, - "reward_std": 0.00038694422983098775, - "rewards/perpo_ocr_edit_distance_reward": 0.9997097551822662, + "advantages": -1.9056456949329004e-05, + "completion_length": 1619.0, + "delta_ref_entropy_loss": 0.0322265625, + "delta_ref_ppl": -0.0400390625, + "entropy_loss": -0.06298828125, + "epoch": 0.3496, + "grad_norm": 1.7776846649875107, + "k1_kl": 0.0400390625, + "k3_kl": 0.0279541015625, + "kimi_kl": 0.053955078125, + "learning_rate": 3.252e-07, + "loss": 0.0011, + "ppl": 0.035400390625, + "reward": 0.9866816997528076, + "reward_std": 0.0016864367062225938, + "rewards/perpo_ocr_edit_distance_reward": 0.9866817593574524, "step": 1748, "temperature": 0.9 }, { - "advantages": -3.1394619327329565e-05, - "completion_length": 668.5, - "delta_ref_entropy_loss": 0.02496337890625, - "delta_ref_ppl": -0.021240234375, - "entropy_loss": -0.02593994140625, - "epoch": 0.6996, - "grad_norm": 0.7192436779186601, - "k1_kl": 0.0211181640625, - "k3_kl": 0.013580322265625, - "kimi_kl": 0.03424072265625, - "learning_rate": 1.5019999999999998e-07, - "loss": 0.0006, - "ppl": 0.0135498046875, - "reward": 0.9987439513206482, - "reward_std": 0.00058914435794577, - "rewards/perpo_ocr_edit_distance_reward": 0.9987439513206482, + "advantages": -7.353510591201484e-05, + "completion_length": 548.0, + "delta_ref_entropy_loss": 0.04931640625, + "delta_ref_ppl": -0.0625, + "entropy_loss": -0.02734375, + "epoch": 0.3498, + "grad_norm": 0.3597859744233446, + "k1_kl": 0.06298828125, + "k3_kl": 0.038818359375, + "kimi_kl": 0.166015625, + "learning_rate": 3.2509999999999997e-07, + "loss": 0.0016, + "ppl": 0.007781982421875, + "reward": 0.9949002861976624, + "reward_std": 0.0015210550045594573, + "rewards/perpo_ocr_edit_distance_reward": 0.9949003458023071, "step": 1749, "temperature": 0.9 }, { - "advantages": -0.00020847576837468296, - "completion_length": 1607.0, - "delta_ref_entropy_loss": 0.00769805908203125, - "delta_ref_ppl": -0.017303466796875, - "entropy_loss": -0.076446533203125, - "epoch": 0.7, - "grad_norm": 1.4311068902618849, - "k1_kl": 0.0172119140625, - "k3_kl": 0.012664794921875, - "kimi_kl": 0.02227783203125, - "learning_rate": 1.5e-07, - "loss": 0.0007, - "ppl": 0.043792724609375, - "reward": 0.7810423076152802, - "reward_std": 0.030388042934646364, - "rewards/perpo_ocr_edit_distance_reward": 0.7810424268245697, + "advantages": -3.9611546526430175e-05, + "completion_length": 807.0, + "delta_ref_entropy_loss": 0.06494140625, + "delta_ref_ppl": -0.059814453125, + "entropy_loss": -0.06396484375, + "epoch": 0.35, + "grad_norm": 1.4477309523282662, + "k1_kl": 0.06005859375, + "k3_kl": 0.03173828125, + "kimi_kl": 0.07373046875, + "learning_rate": 3.25e-07, + "loss": 0.0013, + "ppl": 0.0284423828125, + "reward": 0.9537633657455444, + "reward_std": 0.0014054011553525925, + "rewards/perpo_ocr_edit_distance_reward": 0.953763484954834, "step": 1750, "temperature": 0.9 }, { - "advantages": -0.0003095354359174962, - "completion_length": 309.5, - "delta_ref_entropy_loss": 0.0401611328125, - "delta_ref_ppl": -0.0638427734375, - "entropy_loss": -0.03155517578125, - "epoch": 0.7004, - "grad_norm": 0.4231447547218387, - "k1_kl": 0.0638427734375, - "k3_kl": 0.0457763671875, - "kimi_kl": 0.220703125, - "learning_rate": 1.4979999999999998e-07, - "loss": 0.0021, - "ppl": 0.0169219970703125, - "reward": 0.9967201352119446, - "reward_std": 0.00013493951701093465, - "rewards/perpo_ocr_edit_distance_reward": 0.9967201948165894, + "advantages": -1.748970680637285e-05, + "completion_length": 227.0, + "delta_ref_entropy_loss": 0.0654296875, + "delta_ref_ppl": -0.12890625, + "entropy_loss": -0.06201171875, + "epoch": 0.3502, + "grad_norm": 1.0120583546960613, + "k1_kl": 0.12890625, + "k3_kl": 0.09130859375, + "kimi_kl": 0.2890625, + "learning_rate": 3.249e-07, + "loss": 0.0037, + "ppl": 0.029052734375, + "reward": 0.9827425479888916, + "reward_std": 0.003305083606392145, + "rewards/perpo_ocr_edit_distance_reward": 0.9827426075935364, "step": 1751, "temperature": 0.9 }, { - "advantages": -0.00011575648659345461, - "completion_length": 592.5, - "delta_ref_entropy_loss": 0.069580078125, - "delta_ref_ppl": -0.0538330078125, - "entropy_loss": -0.059326171875, - "epoch": 0.7008, - "grad_norm": 0.5969811699971391, - "k1_kl": 0.053955078125, - "k3_kl": 0.03314208984375, - "kimi_kl": 0.090087890625, - "learning_rate": 1.4960000000000002e-07, - "loss": 0.0014, - "ppl": 0.03131103515625, - "reward": 0.9108870327472687, - "reward_std": 0.0013096452021272853, - "rewards/perpo_ocr_edit_distance_reward": 0.9108870625495911, + "advantages": -2.2104808522271924e-05, + "completion_length": 476.0, + "delta_ref_entropy_loss": 0.087890625, + "delta_ref_ppl": -0.0986328125, + "entropy_loss": -0.08740234375, + "epoch": 0.3504, + "grad_norm": 2.1420061753553035, + "k1_kl": 0.0986328125, + "k3_kl": 0.0576171875, + "kimi_kl": 0.1640625, + "learning_rate": 3.2479999999999994e-07, + "loss": 0.0023, + "ppl": 0.04345703125, + "reward": 0.975656270980835, + "reward_std": 0.0014419059734791517, + "rewards/perpo_ocr_edit_distance_reward": 0.9756563305854797, "step": 1752, "temperature": 0.9 }, { - "advantages": -0.0001053991081789718, - "completion_length": 986.0, - "delta_ref_entropy_loss": 0.011505126953125, - "delta_ref_ppl": -0.0099029541015625, - "entropy_loss": -0.021453857421875, - "epoch": 0.7012, - "grad_norm": 0.8781774966622976, - "k1_kl": 0.0099029541015625, - "k3_kl": 0.00567626953125, - "kimi_kl": 0.010101318359375, - "learning_rate": 1.494e-07, - "loss": 0.0003, - "ppl": 0.011749267578125, - "reward": 0.9991890490055084, - "reward_std": 0.0005127522017573938, - "rewards/perpo_ocr_edit_distance_reward": 0.9991890788078308, + "advantages": -4.998275471734814e-06, + "completion_length": 300.0, + "delta_ref_entropy_loss": 0.09033203125, + "delta_ref_ppl": -0.1572265625, + "entropy_loss": -0.0732421875, + "epoch": 0.3506, + "grad_norm": 0.9950016503383694, + "k1_kl": 0.1572265625, + "k3_kl": 0.11376953125, + "kimi_kl": 0.42578125, + "learning_rate": 3.247e-07, + "loss": 0.0046, + "ppl": 0.03271484375, + "reward": 0.8528677821159363, + "reward_std": 0.0016097304178401828, + "rewards/perpo_ocr_edit_distance_reward": 0.852867841720581, "step": 1753, "temperature": 0.9 }, { - "advantages": -9.270654118154198e-06, - "completion_length": 787.0, - "delta_ref_entropy_loss": 0.018798828125, - "delta_ref_ppl": -0.02117919921875, - "entropy_loss": -0.015777587890625, - "epoch": 0.7016, - "grad_norm": 0.23867372981701046, - "k1_kl": 0.02117919921875, - "k3_kl": 0.01470947265625, - "kimi_kl": 0.047119140625, - "learning_rate": 1.4919999999999999e-07, - "loss": 0.0006, - "ppl": 0.0065155029296875, - "reward": 0.9990888833999634, - "reward_std": 0.00015154538414208218, - "rewards/perpo_ocr_edit_distance_reward": 0.9990889430046082, + "advantages": -1.406669707648689e-05, + "completion_length": 571.0, + "delta_ref_entropy_loss": 0.036865234375, + "delta_ref_ppl": -0.04345703125, + "entropy_loss": -0.0247802734375, + "epoch": 0.3508, + "grad_norm": 0.6317327580966047, + "k1_kl": 0.043701171875, + "k3_kl": 0.0322265625, + "kimi_kl": 0.08251953125, + "learning_rate": 3.246e-07, + "loss": 0.0013, + "ppl": 0.013671875, + "reward": 0.9872367978096008, + "reward_std": 0.0011093771317973733, + "rewards/perpo_ocr_edit_distance_reward": 0.987236738204956, "step": 1754, "temperature": 0.9 }, { - "advantages": -1.0899135531872162e-06, - "completion_length": 637.5, - "delta_ref_entropy_loss": 0.07281494140625, - "delta_ref_ppl": -0.04718017578125, - "entropy_loss": -0.10552978515625, - "epoch": 0.702, - "grad_norm": 0.8702600984051105, - "k1_kl": 0.04718017578125, - "k3_kl": 0.0247802734375, - "kimi_kl": 0.043243408203125, - "learning_rate": 1.49e-07, - "loss": 0.001, - "ppl": 0.059326171875, - "reward": 0.9151479303836823, - "reward_std": 0.0038154786452651024, - "rewards/perpo_ocr_edit_distance_reward": 0.9151479601860046, + "advantages": -2.1287374693201855e-05, + "completion_length": 236.0, + "delta_ref_entropy_loss": 0.0279541015625, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.0257568359375, + "epoch": 0.351, + "grad_norm": 0.8625894507910671, + "k1_kl": 0.0654296875, + "k3_kl": 0.04833984375, + "kimi_kl": 0.158203125, + "learning_rate": 3.245e-07, + "loss": 0.002, + "ppl": 0.00762939453125, + "reward": 0.8900355100631714, + "reward_std": 0.0011014726478606462, + "rewards/perpo_ocr_edit_distance_reward": 0.8900355100631714, "step": 1755, "temperature": 0.9 }, { - "advantages": -0.00029947928021556436, - "completion_length": 1229.5, - "delta_ref_entropy_loss": 0.03265380859375, - "delta_ref_ppl": -0.025146484375, - "entropy_loss": -0.04168701171875, - "epoch": 0.7024, - "grad_norm": 1.1419038167243631, - "k1_kl": 0.025146484375, - "k3_kl": 0.015594482421875, - "kimi_kl": 0.040283203125, - "learning_rate": 1.4879999999999998e-07, + "advantages": -5.633490582113154e-05, + "completion_length": 706.0, + "delta_ref_entropy_loss": 0.019775390625, + "delta_ref_ppl": -0.03564453125, + "entropy_loss": -0.051513671875, + "epoch": 0.3512, + "grad_norm": 0.7985427409852294, + "k1_kl": 0.03564453125, + "k3_kl": 0.0208740234375, + "kimi_kl": 0.047607421875, + "learning_rate": 3.244e-07, "loss": 0.0009, - "ppl": 0.0260009765625, - "reward": 0.9288579821586609, - "reward_std": 0.004340732004493475, - "rewards/perpo_ocr_edit_distance_reward": 0.9288580417633057, + "ppl": 0.018310546875, + "reward": 0.9583299160003662, + "reward_std": 0.0012606506934389472, + "rewards/perpo_ocr_edit_distance_reward": 0.9583300352096558, "step": 1756, "temperature": 0.9 }, { - "advantages": -4.934413300361484e-05, - "completion_length": 571.0, - "delta_ref_entropy_loss": 0.0372314453125, - "delta_ref_ppl": -0.023681640625, - "entropy_loss": -0.028900146484375, - "epoch": 0.7028, - "grad_norm": 0.978722653514296, - "k1_kl": 0.02362060546875, - "k3_kl": 0.014312744140625, - "kimi_kl": 0.03753662109375, - "learning_rate": 1.486e-07, - "loss": 0.0006, - "ppl": 0.0139007568359375, - "reward": 0.9973749220371246, - "reward_std": 0.0006789041799493134, - "rewards/perpo_ocr_edit_distance_reward": 0.9973750114440918, + "advantages": 7.142339745769277e-05, + "completion_length": 708.0, + "delta_ref_entropy_loss": 0.061279296875, + "delta_ref_ppl": -0.059326171875, + "entropy_loss": -0.0322265625, + "epoch": 0.3514, + "grad_norm": 0.5081009054237975, + "k1_kl": 0.05908203125, + "k3_kl": 0.032958984375, + "kimi_kl": 0.1005859375, + "learning_rate": 3.2429999999999996e-07, + "loss": 0.0012, + "ppl": 0.01177978515625, + "reward": 0.9875487089157104, + "reward_std": 0.00037686314317397773, + "rewards/perpo_ocr_edit_distance_reward": 0.9875487089157104, "step": 1757, "temperature": 0.9 }, { - "advantages": -2.157262497348711e-05, - "completion_length": 920.5, - "delta_ref_entropy_loss": 0.029296875, - "delta_ref_ppl": -0.032989501953125, - "entropy_loss": -0.03131103515625, - "epoch": 0.7032, - "grad_norm": 0.8393709824133092, - "k1_kl": 0.032989501953125, - "k3_kl": 0.0216217041015625, - "kimi_kl": 0.067352294921875, - "learning_rate": 1.484e-07, - "loss": 0.0009, - "ppl": 0.016265869140625, - "reward": 0.9895982444286346, - "reward_std": 0.0007915902242530137, - "rewards/perpo_ocr_edit_distance_reward": 0.989598274230957, + "advantages": -1.6178404393940582e-07, + "completion_length": 489.0, + "delta_ref_entropy_loss": -0.1474609375, + "delta_ref_ppl": -0.050048828125, + "entropy_loss": -0.439453125, + "epoch": 0.3516, + "grad_norm": 5.020615906360562, + "k1_kl": 0.0498046875, + "k3_kl": 0.0517578125, + "kimi_kl": 0.1572265625, + "learning_rate": 3.2419999999999995e-07, + "loss": 0.0021, + "ppl": 0.1953125, + "reward": 0.41654711961746216, + "reward_std": 0.21024668216705322, + "rewards/perpo_ocr_edit_distance_reward": 0.41654714941978455, "step": 1758, "temperature": 0.9 }, { - "advantages": -3.150531426854286e-07, - "completion_length": 779.0, - "delta_ref_entropy_loss": 0.04119873046875, - "delta_ref_ppl": -0.03363037109375, - "entropy_loss": -0.05401611328125, - "epoch": 0.7036, - "grad_norm": 0.8999081261189203, - "k1_kl": 0.03350830078125, - "k3_kl": 0.021514892578125, - "kimi_kl": 0.0390625, - "learning_rate": 1.482e-07, - "loss": 0.0009, - "ppl": 0.02734375, - "reward": 0.8936998546123505, - "reward_std": 0.030124499928206205, - "rewards/perpo_ocr_edit_distance_reward": 0.8936999142169952, + "advantages": -2.0589148334693164e-05, + "completion_length": 539.0, + "delta_ref_entropy_loss": 0.07470703125, + "delta_ref_ppl": -0.0849609375, + "entropy_loss": -0.023681640625, + "epoch": 0.3518, + "grad_norm": 0.8532130877976045, + "k1_kl": 0.0849609375, + "k3_kl": 0.049560546875, + "kimi_kl": 0.140625, + "learning_rate": 3.241e-07, + "loss": 0.002, + "ppl": 0.0079345703125, + "reward": 0.978783130645752, + "reward_std": 0.001966494135558605, + "rewards/perpo_ocr_edit_distance_reward": 0.978783130645752, "step": 1759, "temperature": 0.9 }, { - "advantages": -2.882310582208447e-06, - "completion_length": 686.0, - "delta_ref_entropy_loss": 0.0445556640625, - "delta_ref_ppl": -0.03436279296875, - "entropy_loss": -0.0352783203125, - "epoch": 0.704, - "grad_norm": 0.436190512293543, - "k1_kl": 0.0343017578125, - "k3_kl": 0.0196533203125, - "kimi_kl": 0.042724609375, - "learning_rate": 1.4799999999999998e-07, - "loss": 0.0008, - "ppl": 0.018310546875, - "reward": 0.9766467213630676, - "reward_std": 0.0004918248305330053, - "rewards/perpo_ocr_edit_distance_reward": 0.97664675116539, + "advantages": -5.283526115817949e-05, + "completion_length": 276.0, + "delta_ref_entropy_loss": 0.1416015625, + "delta_ref_ppl": -0.134765625, + "entropy_loss": -0.07373046875, + "epoch": 0.352, + "grad_norm": 0.579820764616168, + "k1_kl": 0.134765625, + "k3_kl": 0.080078125, + "kimi_kl": 0.232421875, + "learning_rate": 3.24e-07, + "loss": 0.0032, + "ppl": 0.023193359375, + "reward": 0.9126183390617371, + "reward_std": 0.0005446099676191807, + "rewards/perpo_ocr_edit_distance_reward": 0.9126183986663818, "step": 1760, "temperature": 0.9 }, { - "advantages": -6.518619920825586e-05, - "completion_length": 580.0, - "delta_ref_entropy_loss": 0.08349609375, - "delta_ref_ppl": -0.046875, - "entropy_loss": -0.0557861328125, - "epoch": 0.7044, - "grad_norm": 0.8822560010578482, - "k1_kl": 0.046875, - "k3_kl": 0.023712158203125, - "kimi_kl": 0.05126953125, - "learning_rate": 1.4779999999999999e-07, - "loss": 0.001, - "ppl": 0.0302581787109375, - "reward": 0.8966382145881653, - "reward_std": 0.004414776834892109, - "rewards/perpo_ocr_edit_distance_reward": 0.8966382443904877, + "advantages": -1.647642784519121e-05, + "completion_length": 518.0, + "delta_ref_entropy_loss": 0.10986328125, + "delta_ref_ppl": -0.0869140625, + "entropy_loss": -0.138671875, + "epoch": 0.3522, + "grad_norm": 1.417835727909865, + "k1_kl": 0.0869140625, + "k3_kl": 0.04833984375, + "kimi_kl": 0.1201171875, + "learning_rate": 3.239e-07, + "loss": 0.002, + "ppl": 0.07470703125, + "reward": 0.9561247229576111, + "reward_std": 0.0030015311203897, + "rewards/perpo_ocr_edit_distance_reward": 0.9561248421669006, "step": 1761, "temperature": 0.9 }, { - "advantages": -7.169587661337573e-06, - "completion_length": 777.0, - "delta_ref_entropy_loss": 0.03369140625, - "delta_ref_ppl": -0.0177001953125, - "entropy_loss": -0.0322113037109375, - "epoch": 0.7048, - "grad_norm": 0.7667333961868019, - "k1_kl": 0.0177001953125, - "k3_kl": 0.009521484375, - "kimi_kl": 0.023529052734375, - "learning_rate": 1.476e-07, - "loss": 0.0004, - "ppl": 0.01657867431640625, - "reward": 0.9332817196846008, - "reward_std": 0.0005438092048279941, - "rewards/perpo_ocr_edit_distance_reward": 0.9332817494869232, + "advantages": -1.0762896636151709e-05, + "completion_length": 1388.0, + "delta_ref_entropy_loss": 0.0703125, + "delta_ref_ppl": -0.06494140625, + "entropy_loss": -0.044921875, + "epoch": 0.3524, + "grad_norm": 0.8091171589511166, + "k1_kl": 0.06494140625, + "k3_kl": 0.036865234375, + "kimi_kl": 0.0869140625, + "learning_rate": 3.238e-07, + "loss": 0.0015, + "ppl": 0.01953125, + "reward": 0.9918472170829773, + "reward_std": 0.0006890713702887297, + "rewards/perpo_ocr_edit_distance_reward": 0.9918472170829773, "step": 1762, "temperature": 0.9 }, { - "advantages": -9.374959518027026e-06, - "completion_length": 389.5, - "delta_ref_entropy_loss": 0.0316162109375, - "delta_ref_ppl": -0.03045654296875, - "entropy_loss": -0.021148681640625, - "epoch": 0.7052, - "grad_norm": 0.5966398932058473, - "k1_kl": 0.03045654296875, - "k3_kl": 0.01830291748046875, - "kimi_kl": 0.0639495849609375, - "learning_rate": 1.474e-07, - "loss": 0.0007, - "ppl": 0.01043701171875, - "reward": 0.9956267774105072, - "reward_std": 0.0008577675325796008, - "rewards/perpo_ocr_edit_distance_reward": 0.9956268072128296, + "advantages": -4.277910556993447e-05, + "completion_length": 475.0, + "delta_ref_entropy_loss": 0.12109375, + "delta_ref_ppl": -0.130859375, + "entropy_loss": -0.12060546875, + "epoch": 0.3526, + "grad_norm": 2.019851849879241, + "k1_kl": 0.130859375, + "k3_kl": 0.07861328125, + "kimi_kl": 0.2353515625, + "learning_rate": 3.2369999999999997e-07, + "loss": 0.0032, + "ppl": 0.062255859375, + "reward": 0.9214470386505127, + "reward_std": 0.0014918617671355605, + "rewards/perpo_ocr_edit_distance_reward": 0.9214470982551575, "step": 1763, "temperature": 0.9 }, { - "advantages": -4.293237452657195e-05, - "completion_length": 886.0, - "delta_ref_entropy_loss": 0.03216552734375, - "delta_ref_ppl": -0.0245361328125, - "entropy_loss": -0.027008056640625, - "epoch": 0.7056, - "grad_norm": 0.9667441359266787, - "k1_kl": 0.0245361328125, - "k3_kl": 0.015472412109375, - "kimi_kl": 0.0413818359375, - "learning_rate": 1.472e-07, - "loss": 0.0007, - "ppl": 0.011474609375, - "reward": 0.9991016983985901, - "reward_std": 0.0007111283339327201, - "rewards/perpo_ocr_edit_distance_reward": 0.9991017282009125, + "advantages": -1.08225012809271e-05, + "completion_length": 75.0, + "delta_ref_entropy_loss": 0.1259765625, + "delta_ref_ppl": -0.5, + "entropy_loss": -0.1005859375, + "epoch": 0.3528, + "grad_norm": 2.664050331495851, + "k1_kl": 0.5, + "k3_kl": 0.40234375, + "kimi_kl": 1.7421875, + "learning_rate": 3.2359999999999996e-07, + "loss": 0.0161, + "ppl": 0.0361328125, + "reward": 0.828138530254364, + "reward_std": 0.001478636055253446, + "rewards/perpo_ocr_edit_distance_reward": 0.828138530254364, "step": 1764, "temperature": 0.9 }, { - "advantages": -6.971615107431717e-06, - "completion_length": 1076.5, - "delta_ref_entropy_loss": 0.0086669921875, - "delta_ref_ppl": -0.03955078125, - "entropy_loss": -0.156494140625, - "epoch": 0.706, - "grad_norm": 7.355433647322053, - "k1_kl": 0.03955078125, - "k3_kl": 0.0279541015625, - "kimi_kl": 0.0523681640625, - "learning_rate": 1.4699999999999998e-07, + "advantages": -7.169587661337573e-06, + "completion_length": 1212.0, + "delta_ref_entropy_loss": 0.025146484375, + "delta_ref_ppl": -0.03857421875, + "entropy_loss": -0.029296875, + "epoch": 0.353, + "grad_norm": 0.4981503143920768, + "k1_kl": 0.03857421875, + "k3_kl": 0.026123046875, + "kimi_kl": 0.0654296875, + "learning_rate": 3.235e-07, "loss": 0.0011, - "ppl": 0.097412109375, - "reward": 0.8081908226013184, - "reward_std": 0.011671514133922756, - "rewards/perpo_ocr_edit_distance_reward": 0.8081908822059631, + "ppl": 0.0125732421875, + "reward": 0.9936109781265259, + "reward_std": 0.0034674988128244877, + "rewards/perpo_ocr_edit_distance_reward": 0.9936109781265259, "step": 1765, "temperature": 0.9 }, { - "advantages": -0.00034617100754985586, - "completion_length": 218.0, - "delta_ref_entropy_loss": 0.033447265625, - "delta_ref_ppl": -0.082275390625, - "entropy_loss": -0.0284423828125, - "epoch": 0.7064, - "grad_norm": 0.5439307707323505, - "k1_kl": 0.082244873046875, - "k3_kl": 0.065277099609375, - "kimi_kl": 0.26422119140625, - "learning_rate": 1.4680000000000002e-07, - "loss": 0.0029, - "ppl": 0.01708984375, - "reward": 0.997972697019577, - "reward_std": 0.00021533860126510262, - "rewards/perpo_ocr_edit_distance_reward": 0.9979728162288666, + "advantages": -2.259867687826045e-05, + "completion_length": 72.0, + "delta_ref_entropy_loss": 0.11669921875, + "delta_ref_ppl": -0.5390625, + "entropy_loss": -0.11474609375, + "epoch": 0.3532, + "grad_norm": 2.5641999153093535, + "k1_kl": 0.5390625, + "k3_kl": 0.4453125, + "kimi_kl": 2.046875, + "learning_rate": 3.234e-07, + "loss": 0.0178, + "ppl": 0.05322265625, + "reward": 0.9855245351791382, + "reward_std": 0.004799327347427607, + "rewards/perpo_ocr_edit_distance_reward": 0.9855246543884277, "step": 1766, "temperature": 0.9 }, { - "advantages": -0.0003014143026121019, - "completion_length": 398.5, - "delta_ref_entropy_loss": 0.076904296875, - "delta_ref_ppl": -0.0908203125, - "entropy_loss": -0.0653076171875, - "epoch": 0.7068, - "grad_norm": 0.6448491913100131, - "k1_kl": 0.0908203125, - "k3_kl": 0.06256103515625, - "kimi_kl": 0.2161865234375, - "learning_rate": 1.466e-07, - "loss": 0.0028, - "ppl": 0.032958984375, - "reward": 0.9801296889781952, - "reward_std": 0.001835519797168672, - "rewards/perpo_ocr_edit_distance_reward": 0.9801297187805176, + "advantages": -5.971534119453281e-05, + "completion_length": 1151.0, + "delta_ref_entropy_loss": 0.0242919921875, + "delta_ref_ppl": -0.031494140625, + "entropy_loss": -0.0458984375, + "epoch": 0.3534, + "grad_norm": 0.6720539796914857, + "k1_kl": 0.031494140625, + "k3_kl": 0.0233154296875, + "kimi_kl": 0.0673828125, + "learning_rate": 3.2329999999999994e-07, + "loss": 0.001, + "ppl": 0.0218505859375, + "reward": 0.9955483078956604, + "reward_std": 0.0008981941500678658, + "rewards/perpo_ocr_edit_distance_reward": 0.99554842710495, "step": 1767, "temperature": 0.9 }, { - "advantages": -0.00013405085246631643, - "completion_length": 351.5, - "delta_ref_entropy_loss": 0.0770263671875, - "delta_ref_ppl": -0.059326171875, - "entropy_loss": -0.0782470703125, - "epoch": 0.7072, - "grad_norm": 0.8391576333340026, - "k1_kl": 0.05908203125, - "k3_kl": 0.03509521484375, - "kimi_kl": 0.081298828125, - "learning_rate": 1.464e-07, - "loss": 0.0015, - "ppl": 0.0329742431640625, - "reward": 0.9702970385551453, - "reward_std": 0.001359609654173255, - "rewards/perpo_ocr_edit_distance_reward": 0.9702971279621124, + "advantages": -3.354464570293203e-05, + "completion_length": 701.0, + "delta_ref_entropy_loss": 0.08154296875, + "delta_ref_ppl": -0.078125, + "entropy_loss": -0.06494140625, + "epoch": 0.3536, + "grad_norm": 0.47098779120331413, + "k1_kl": 0.078125, + "k3_kl": 0.0458984375, + "kimi_kl": 0.1494140625, + "learning_rate": 3.232e-07, + "loss": 0.0019, + "ppl": 0.0228271484375, + "reward": 0.9226727485656738, + "reward_std": 0.0009158267639577389, + "rewards/perpo_ocr_edit_distance_reward": 0.9226727485656738, "step": 1768, "temperature": 0.9 }, { - "advantages": -6.258487701416016e-07, - "completion_length": 374.5, - "delta_ref_entropy_loss": 0.0267333984375, - "delta_ref_ppl": -0.018310546875, - "entropy_loss": -0.022979736328125, - "epoch": 0.7076, - "grad_norm": 0.6092854308322859, - "k1_kl": 0.018310546875, - "k3_kl": 0.00909423828125, - "kimi_kl": 0.018310546875, - "learning_rate": 1.462e-07, - "loss": 0.0004, - "ppl": 0.011077880859375, - "reward": 0.9083682894706726, - "reward_std": 0.020312508568167686, - "rewards/perpo_ocr_edit_distance_reward": 0.908368319272995, + "advantages": -1.3858080819773022e-05, + "completion_length": 303.0, + "delta_ref_entropy_loss": 0.0654296875, + "delta_ref_ppl": -0.12890625, + "entropy_loss": -0.032470703125, + "epoch": 0.3538, + "grad_norm": 2.376379699795487, + "k1_kl": 0.12890625, + "k3_kl": 0.09423828125, + "kimi_kl": 0.318359375, + "learning_rate": 3.231e-07, + "loss": 0.0038, + "ppl": 0.0140380859375, + "reward": 0.9960054755210876, + "reward_std": 0.0017437792848795652, + "rewards/perpo_ocr_edit_distance_reward": 0.9960054159164429, "step": 1769, "temperature": 0.9 }, { - "advantages": -8.141143644024851e-05, - "completion_length": 562.0, - "delta_ref_entropy_loss": 0.0789794921875, - "delta_ref_ppl": -0.062744140625, - "entropy_loss": -0.0693359375, - "epoch": 0.708, - "grad_norm": 0.9858246469926565, - "k1_kl": 0.062744140625, - "k3_kl": 0.03436279296875, - "kimi_kl": 0.0770263671875, - "learning_rate": 1.4599999999999998e-07, - "loss": 0.0015, - "ppl": 0.035400390625, - "reward": 0.9044781923294067, - "reward_std": 0.0020928840385749936, - "rewards/perpo_ocr_edit_distance_reward": 0.9044782221317291, + "advantages": 2.0776476503669983e-06, + "completion_length": 193.0, + "delta_ref_entropy_loss": -0.0301513671875, + "delta_ref_ppl": -0.2060546875, + "entropy_loss": -0.90234375, + "epoch": 0.354, + "grad_norm": 4.198346585503305, + "k1_kl": 0.2060546875, + "k3_kl": 0.1884765625, + "kimi_kl": 0.59375, + "learning_rate": 3.23e-07, + "loss": 0.0075, + "ppl": 0.431640625, + "reward": 0.36620470881462097, + "reward_std": 0.008119810372591019, + "rewards/perpo_ocr_edit_distance_reward": 0.3662046790122986, "step": 1770, "temperature": 0.9 }, { - "advantages": -0.00013577513163909316, - "completion_length": 494.0, - "delta_ref_entropy_loss": 0.02630615234375, - "delta_ref_ppl": -0.01397705078125, - "entropy_loss": -0.010528564453125, - "epoch": 0.7084, - "grad_norm": 0.44168626213440676, - "k1_kl": 0.014007568359375, - "k3_kl": 0.006805419921875, - "kimi_kl": 0.0126953125, - "learning_rate": 1.458e-07, - "loss": 0.0004, - "ppl": 0.0050201416015625, - "reward": 0.9995866119861603, - "reward_std": 0.0003381042697583325, - "rewards/perpo_ocr_edit_distance_reward": 0.9995867013931274, + "advantages": 1.7540796761750244e-06, + "completion_length": 166.0, + "delta_ref_entropy_loss": 0.1416015625, + "delta_ref_ppl": -0.271484375, + "entropy_loss": -0.2021484375, + "epoch": 0.3542, + "grad_norm": 3.0990785021679588, + "k1_kl": 0.271484375, + "k3_kl": 0.19921875, + "kimi_kl": 0.81640625, + "learning_rate": 3.229e-07, + "loss": 0.008, + "ppl": 0.083984375, + "reward": 0.932305634021759, + "reward_std": 0.014554728753864765, + "rewards/perpo_ocr_edit_distance_reward": 0.932305634021759, "step": 1771, "temperature": 0.9 }, { - "advantages": -0.00010854858101083664, - "completion_length": 509.0, - "delta_ref_entropy_loss": 0.05419921875, - "delta_ref_ppl": -0.093231201171875, - "entropy_loss": -0.039794921875, - "epoch": 0.7088, - "grad_norm": 6.527660800235525, - "k1_kl": 0.093231201171875, - "k3_kl": 0.0668182373046875, - "kimi_kl": 0.1673583984375, - "learning_rate": 1.456e-07, - "loss": 0.0028, - "ppl": 0.021026611328125, - "reward": 0.9992304742336273, - "reward_std": 0.0012318406006670557, - "rewards/perpo_ocr_edit_distance_reward": 0.9992305636405945, - "step": 1772, - "temperature": 0.9 - }, - { - "advantages": -4.904610899636452e-06, - "completion_length": 562.0, - "delta_ref_entropy_loss": 0.024169921875, - "delta_ref_ppl": -0.030517578125, - "entropy_loss": -0.02044677734375, - "epoch": 0.7092, - "grad_norm": 0.8296544112731116, - "k1_kl": 0.030517578125, - "k3_kl": 0.01995849609375, - "kimi_kl": 0.05731201171875, - "learning_rate": 1.454e-07, - "loss": 0.0008, - "ppl": 0.0085906982421875, - "reward": 0.9978793561458588, - "reward_std": 0.004872492732829414, - "rewards/perpo_ocr_edit_distance_reward": 0.9978794157505035, + "advantages": -3.213116360711865e-05, + "completion_length": 495.0, + "delta_ref_entropy_loss": 0.06298828125, + "delta_ref_ppl": -0.0673828125, + "entropy_loss": -0.048583984375, + "epoch": 0.3544, + "grad_norm": 0.663099846805473, + "k1_kl": 0.0673828125, + "k3_kl": 0.04052734375, + "kimi_kl": 0.125, + "learning_rate": 3.2279999999999995e-07, + "loss": 0.0016, + "ppl": 0.020751953125, + "reward": 0.9602033495903015, + "reward_std": 0.0012250851141288877, + "rewards/perpo_ocr_edit_distance_reward": 0.9602033495903015, + "step": 1772, + "temperature": 0.9 + }, + { + "advantages": -2.1457672119140625e-06, + "completion_length": 739.0, + "delta_ref_entropy_loss": 0.0546875, + "delta_ref_ppl": -0.07568359375, + "entropy_loss": -0.0185546875, + "epoch": 0.3546, + "grad_norm": 0.33789951885936953, + "k1_kl": 0.07568359375, + "k3_kl": 0.048583984375, + "kimi_kl": 0.2197265625, + "learning_rate": 3.227e-07, + "loss": 0.0019, + "ppl": 0.006866455078125, + "reward": 0.9061277508735657, + "reward_std": 0.01567249372601509, + "rewards/perpo_ocr_edit_distance_reward": 0.9061278104782104, "step": 1773, "temperature": 0.9 }, { - "advantages": -2.379715624556411e-05, - "completion_length": 645.0, - "delta_ref_entropy_loss": 0.02911376953125, - "delta_ref_ppl": -0.019134521484375, - "entropy_loss": -0.012603759765625, - "epoch": 0.7096, - "grad_norm": 0.16381078396079393, - "k1_kl": 0.0191650390625, - "k3_kl": 0.01190185546875, - "kimi_kl": 0.041778564453125, - "learning_rate": 1.4519999999999998e-07, - "loss": 0.0005, - "ppl": 0.0054779052734375, - "reward": 0.999924510717392, - "reward_std": 0.0001288909843424335, - "rewards/perpo_ocr_edit_distance_reward": 0.9999245405197144, + "advantages": -6.386212135112146e-06, + "completion_length": 203.0, + "delta_ref_entropy_loss": 0.099609375, + "delta_ref_ppl": -0.11181640625, + "entropy_loss": -0.1357421875, + "epoch": 0.3548, + "grad_norm": 4.120803795662803, + "k1_kl": 0.11181640625, + "k3_kl": 0.07080078125, + "kimi_kl": 0.181640625, + "learning_rate": 3.226e-07, + "loss": 0.0028, + "ppl": 0.064453125, + "reward": 0.9671919345855713, + "reward_std": 0.00655573233962059, + "rewards/perpo_ocr_edit_distance_reward": 0.9671919941902161, "step": 1774, "temperature": 0.9 }, { - "advantages": -8.446830165098618e-06, - "completion_length": 483.0, - "delta_ref_entropy_loss": 0.05633544921875, - "delta_ref_ppl": -0.06158447265625, - "entropy_loss": -0.261962890625, - "epoch": 0.71, - "grad_norm": 2.501767345889335, - "k1_kl": 0.061279296875, - "k3_kl": 0.03619384765625, - "kimi_kl": 0.06640625, - "learning_rate": 1.45e-07, - "loss": 0.0015, - "ppl": 0.129974365234375, - "reward": 0.8638532757759094, - "reward_std": 0.02484732068842277, - "rewards/perpo_ocr_edit_distance_reward": 0.8638533055782318, + "advantages": 4.087175966560608e-07, + "completion_length": 1150.0, + "delta_ref_entropy_loss": 0.07470703125, + "delta_ref_ppl": -0.0673828125, + "entropy_loss": -0.0556640625, + "epoch": 0.355, + "grad_norm": 23.43323053205543, + "k1_kl": 0.0673828125, + "k3_kl": 0.050048828125, + "kimi_kl": 0.080078125, + "learning_rate": 3.225e-07, + "loss": 0.002, + "ppl": 0.0301513671875, + "reward": 0.9793970584869385, + "reward_std": 0.020021779462695122, + "rewards/perpo_ocr_edit_distance_reward": 0.9793971180915833, "step": 1775, "temperature": 0.9 }, { - "advantages": -0.00010976621615554905, - "completion_length": 594.0, - "delta_ref_entropy_loss": 0.026702880859375, - "delta_ref_ppl": -0.02130126953125, - "entropy_loss": -0.01947021484375, - "epoch": 0.7104, - "grad_norm": 0.31171636723046675, - "k1_kl": 0.0213623046875, - "k3_kl": 0.01239013671875, - "kimi_kl": 0.03857421875, - "learning_rate": 1.448e-07, - "loss": 0.0006, - "ppl": 0.009490966796875, - "reward": 0.9968625009059906, - "reward_std": 0.00046796394599368796, - "rewards/perpo_ocr_edit_distance_reward": 0.9968626201152802, + "advantages": -0.00011185237963218242, + "completion_length": 741.0, + "delta_ref_entropy_loss": 0.0654296875, + "delta_ref_ppl": -0.1103515625, + "entropy_loss": -0.0634765625, + "epoch": 0.3552, + "grad_norm": 1.6744204808483336, + "k1_kl": 0.1103515625, + "k3_kl": 0.07177734375, + "kimi_kl": 0.255859375, + "learning_rate": 3.2240000000000003e-07, + "loss": 0.003, + "ppl": 0.03173828125, + "reward": 0.8362368941307068, + "reward_std": 0.0008133860537782311, + "rewards/perpo_ocr_edit_distance_reward": 0.8362370133399963, "step": 1776, "temperature": 0.9 }, { - "advantages": -2.6643278033589013e-05, - "completion_length": 301.0, - "delta_ref_entropy_loss": 0.04443359375, - "delta_ref_ppl": -0.0499267578125, - "entropy_loss": -0.02532958984375, - "epoch": 0.7108, - "grad_norm": 0.7816117353407463, - "k1_kl": 0.050048828125, - "k3_kl": 0.0316162109375, - "kimi_kl": 0.086669921875, - "learning_rate": 1.446e-07, - "loss": 0.0013, - "ppl": 0.01080322265625, - "reward": 0.9638443291187286, - "reward_std": 0.015079989389050752, - "rewards/perpo_ocr_edit_distance_reward": 0.963844358921051, + "advantages": -1.9601413441705517e-05, + "completion_length": 358.0, + "delta_ref_entropy_loss": 0.08251953125, + "delta_ref_ppl": -0.0986328125, + "entropy_loss": -0.08935546875, + "epoch": 0.3554, + "grad_norm": 1.4561352140718102, + "k1_kl": 0.0986328125, + "k3_kl": 0.0625, + "kimi_kl": 0.244140625, + "learning_rate": 3.2229999999999997e-07, + "loss": 0.0025, + "ppl": 0.039306640625, + "reward": 0.9870032072067261, + "reward_std": 0.004250712227076292, + "rewards/perpo_ocr_edit_distance_reward": 0.9870033860206604, "step": 1777, "temperature": 0.9 }, { - "advantages": 6.641660661443893e-07, - "completion_length": 611.0, - "delta_ref_entropy_loss": 0.04022216796875, - "delta_ref_ppl": -0.02545166015625, - "entropy_loss": -0.043609619140625, - "epoch": 0.7112, - "grad_norm": 1.1431110345716446, - "k1_kl": 0.02545166015625, - "k3_kl": 0.015411376953125, - "kimi_kl": 0.02984619140625, - "learning_rate": 1.444e-07, - "loss": 0.0006, - "ppl": 0.0223236083984375, - "reward": 0.9044269025325775, - "reward_std": 0.03869749465957284, - "rewards/perpo_ocr_edit_distance_reward": 0.9044269323348999, + "advantages": -6.2414578678726684e-06, + "completion_length": 284.0, + "delta_ref_entropy_loss": 0.042236328125, + "delta_ref_ppl": -0.08544921875, + "entropy_loss": -0.046142578125, + "epoch": 0.3556, + "grad_norm": 0.9257778925908899, + "k1_kl": 0.08544921875, + "k3_kl": 0.057861328125, + "kimi_kl": 0.173828125, + "learning_rate": 3.2219999999999996e-07, + "loss": 0.0023, + "ppl": 0.016845703125, + "reward": 0.9836695194244385, + "reward_std": 0.002624162472784519, + "rewards/perpo_ocr_edit_distance_reward": 0.9836695790290833, "step": 1778, "temperature": 0.9 }, { - "advantages": -9.565694563207217e-05, - "completion_length": 440.5, - "delta_ref_entropy_loss": 0.0399169921875, - "delta_ref_ppl": -0.03485107421875, - "entropy_loss": -0.02325439453125, - "epoch": 0.7116, - "grad_norm": 0.41865975065553807, - "k1_kl": 0.03485107421875, - "k3_kl": 0.0225830078125, - "kimi_kl": 0.07952880859375, - "learning_rate": 1.4419999999999998e-07, - "loss": 0.001, - "ppl": 0.01104736328125, - "reward": 0.9982727766036987, - "reward_std": 0.0009761866531334817, - "rewards/perpo_ocr_edit_distance_reward": 0.9982728958129883, + "advantages": -4.782847099704668e-05, + "completion_length": 488.0, + "delta_ref_entropy_loss": 0.046875, + "delta_ref_ppl": -0.05615234375, + "entropy_loss": -0.0263671875, + "epoch": 0.3558, + "grad_norm": 1.0993309644625708, + "k1_kl": 0.056396484375, + "k3_kl": 0.0306396484375, + "kimi_kl": 0.07275390625, + "learning_rate": 3.221e-07, + "loss": 0.0013, + "ppl": 0.01031494140625, + "reward": 0.9677598476409912, + "reward_std": 0.0006122957565821707, + "rewards/perpo_ocr_edit_distance_reward": 0.967759907245636, "step": 1779, "temperature": 0.9 }, { - "advantages": -0.0002381099184276536, - "completion_length": 869.0, - "delta_ref_entropy_loss": 0.01953125, - "delta_ref_ppl": -0.014251708984375, - "entropy_loss": -0.0167999267578125, - "epoch": 0.712, - "grad_norm": 0.36592695400851594, - "k1_kl": 0.0141754150390625, - "k3_kl": 0.009918212890625, - "kimi_kl": 0.026214599609375, - "learning_rate": 1.44e-07, - "loss": 0.0006, - "ppl": 0.008056640625, - "reward": 0.9997391700744629, - "reward_std": 0.00029384923982433975, - "rewards/perpo_ocr_edit_distance_reward": 0.9997392892837524, + "advantages": -0.0005960464477539062, + "completion_length": 37.0, + "delta_ref_entropy_loss": 0.19921875, + "delta_ref_ppl": -0.890625, + "entropy_loss": -0.119140625, + "epoch": 0.356, + "grad_norm": 0.16264045363798527, + "k1_kl": 0.88671875, + "k3_kl": 0.7421875, + "kimi_kl": 3.515625, + "learning_rate": 3.22e-07, + "loss": 0.0303, + "ppl": 0.0238037109375, + "reward": 0.9381442666053772, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9381443858146667, "step": 1780, "temperature": 0.9 }, { - "advantages": -1.610176968824817e-05, - "completion_length": 630.5, - "delta_ref_entropy_loss": 0.04888916015625, - "delta_ref_ppl": -0.04071044921875, - "entropy_loss": -0.0721435546875, - "epoch": 0.7124, - "grad_norm": 2.0140186766181016, - "k1_kl": 0.04071044921875, - "k3_kl": 0.0244140625, - "kimi_kl": 0.064208984375, - "learning_rate": 1.438e-07, - "loss": 0.001, - "ppl": 0.03948974609375, - "reward": 0.9245077073574066, - "reward_std": 0.0032006314722821116, - "rewards/perpo_ocr_edit_distance_reward": 0.9245077669620514, + "advantages": -7.82183269620873e-05, + "completion_length": 475.0, + "delta_ref_entropy_loss": 0.07373046875, + "delta_ref_ppl": -0.0859375, + "entropy_loss": -0.03564453125, + "epoch": 0.3562, + "grad_norm": 0.5758871788787211, + "k1_kl": 0.0859375, + "k3_kl": 0.057373046875, + "kimi_kl": 0.1689453125, + "learning_rate": 3.219e-07, + "loss": 0.0024, + "ppl": 0.01708984375, + "reward": 0.9767181873321533, + "reward_std": 0.00055297976359725, + "rewards/perpo_ocr_edit_distance_reward": 0.9767182469367981, "step": 1781, "temperature": 0.9 }, { - "advantages": -3.916025161743164e-05, - "completion_length": 664.5, - "delta_ref_entropy_loss": 0.030029296875, - "delta_ref_ppl": -0.02435302734375, - "entropy_loss": -0.024871826171875, - "epoch": 0.7128, - "grad_norm": 0.2771888124738373, - "k1_kl": 0.02447509765625, - "k3_kl": 0.0155181884765625, - "kimi_kl": 0.067718505859375, - "learning_rate": 1.436e-07, - "loss": 0.0007, - "ppl": 0.01031494140625, - "reward": 0.9872027039527893, - "reward_std": 0.00016739605052862316, - "rewards/perpo_ocr_edit_distance_reward": 0.9872027039527893, + "advantages": -1.0388238479208667e-05, + "completion_length": 22.0, + "delta_ref_entropy_loss": -0.14453125, + "delta_ref_ppl": -1.3828125, + "entropy_loss": -0.24609375, + "epoch": 0.3564, + "grad_norm": 7.177330142145808, + "k1_kl": 1.3828125, + "k3_kl": 1.3046875, + "kimi_kl": 11.0, + "learning_rate": 3.218e-07, + "loss": 0.0521, + "ppl": 0.05712890625, + "reward": 0.38070690631866455, + "reward_std": 0.0019482779316604137, + "rewards/perpo_ocr_edit_distance_reward": 0.38070693612098694, "step": 1782, "temperature": 0.9 }, { - "advantages": -7.28432651158073e-05, - "completion_length": 554.0, - "delta_ref_entropy_loss": 0.0389404296875, - "delta_ref_ppl": -0.0338134765625, - "entropy_loss": -0.017913818359375, - "epoch": 0.7132, - "grad_norm": 0.2694799231740446, - "k1_kl": 0.0338134765625, - "k3_kl": 0.01996612548828125, - "kimi_kl": 0.05157470703125, - "learning_rate": 1.434e-07, - "loss": 0.0009, - "ppl": 0.007904052734375, - "reward": 0.9949590265750885, - "reward_std": 0.00023530122416559607, - "rewards/perpo_ocr_edit_distance_reward": 0.9949590563774109, + "advantages": -8.787427759671118e-06, + "completion_length": 1754.0, + "delta_ref_entropy_loss": 0.0625, + "delta_ref_ppl": -0.076171875, + "entropy_loss": -0.1337890625, + "epoch": 0.3566, + "grad_norm": 3.764354925737936, + "k1_kl": 0.07568359375, + "k3_kl": 0.0498046875, + "kimi_kl": 0.10205078125, + "learning_rate": 3.217e-07, + "loss": 0.002, + "ppl": 0.07421875, + "reward": 0.9455156922340393, + "reward_std": 0.0047539458610117435, + "rewards/perpo_ocr_edit_distance_reward": 0.9455156922340393, "step": 1783, "temperature": 0.9 }, { - "advantages": -8.727823228582565e-07, - "completion_length": 452.0, - "delta_ref_entropy_loss": 0.0870361328125, - "delta_ref_ppl": -0.06976318359375, - "entropy_loss": -0.1158447265625, - "epoch": 0.7136, - "grad_norm": 1.2632718285803086, - "k1_kl": 0.06976318359375, - "k3_kl": 0.0463409423828125, - "kimi_kl": 0.123870849609375, - "learning_rate": 1.4319999999999999e-07, - "loss": 0.0018, - "ppl": 0.0720977783203125, - "reward": 0.9491227865219116, - "reward_std": 0.002388917375355959, - "rewards/perpo_ocr_edit_distance_reward": 0.949122816324234, + "advantages": 0.0, + "completion_length": 555.0, + "delta_ref_entropy_loss": 0.0301513671875, + "delta_ref_ppl": -0.05908203125, + "entropy_loss": -0.018798828125, + "epoch": 0.3568, + "grad_norm": 0.32051764586952136, + "k1_kl": 0.058837890625, + "k3_kl": 0.042724609375, + "kimi_kl": 0.1572265625, + "learning_rate": 3.2159999999999997e-07, + "loss": 0.0017, + "ppl": 0.007568359375, + "reward": 0.9965317249298096, + "reward_std": 0.00022249565517995507, + "rewards/perpo_ocr_edit_distance_reward": 0.9965317845344543, "step": 1784, "temperature": 0.9 }, { - "advantages": -2.5119102247117553e-06, - "completion_length": 466.0, - "delta_ref_entropy_loss": 0.070465087890625, - "delta_ref_ppl": -0.049774169921875, - "entropy_loss": -0.056365966796875, - "epoch": 0.714, - "grad_norm": 0.68917609841738, - "k1_kl": 0.04998779296875, - "k3_kl": 0.02655029296875, - "kimi_kl": 0.060791015625, - "learning_rate": 1.4299999999999997e-07, - "loss": 0.0011, - "ppl": 0.027599334716796875, - "reward": 0.6316017359495163, - "reward_std": 0.001224428415298462, - "rewards/perpo_ocr_edit_distance_reward": 0.6316017359495163, + "advantages": -1.2091228200006299e-05, + "completion_length": 872.0, + "delta_ref_entropy_loss": 0.11279296875, + "delta_ref_ppl": -0.07958984375, + "entropy_loss": -0.1943359375, + "epoch": 0.357, + "grad_norm": 1.8035747323786888, + "k1_kl": 0.0791015625, + "k3_kl": 0.04248046875, + "kimi_kl": 0.0810546875, + "learning_rate": 3.215e-07, + "loss": 0.0017, + "ppl": 0.1044921875, + "reward": 0.8476283550262451, + "reward_std": 0.002011496340855956, + "rewards/perpo_ocr_edit_distance_reward": 0.8476283550262451, "step": 1785, "temperature": 0.9 }, { - "advantages": -5.275011517369421e-06, - "completion_length": 513.5, - "delta_ref_entropy_loss": 0.0594482421875, - "delta_ref_ppl": -0.03582763671875, - "entropy_loss": -0.04254150390625, - "epoch": 0.7144, - "grad_norm": 1.5585966807130562, - "k1_kl": 0.03582763671875, - "k3_kl": 0.01849365234375, - "kimi_kl": 0.03985595703125, - "learning_rate": 1.428e-07, - "loss": 0.0007, - "ppl": 0.0199432373046875, - "reward": 0.992162436246872, - "reward_std": 0.0019643427804112434, - "rewards/perpo_ocr_edit_distance_reward": 0.9921624660491943, + "advantages": -2.7247838829680404e-07, + "completion_length": 894.0, + "delta_ref_entropy_loss": 0.0703125, + "delta_ref_ppl": -0.111328125, + "entropy_loss": -0.08251953125, + "epoch": 0.3572, + "grad_norm": 1.082070684146692, + "k1_kl": 0.111328125, + "k3_kl": 0.0703125, + "kimi_kl": 0.17578125, + "learning_rate": 3.214e-07, + "loss": 0.0028, + "ppl": 0.035888671875, + "reward": 0.8554861545562744, + "reward_std": 0.2827467918395996, + "rewards/perpo_ocr_edit_distance_reward": 0.8554862141609192, "step": 1786, "temperature": 0.9 }, { - "advantages": -2.0750933146018724e-05, - "completion_length": 227.0, - "delta_ref_entropy_loss": 0.072509765625, - "delta_ref_ppl": -0.0777587890625, - "entropy_loss": -0.0567626953125, - "epoch": 0.7148, - "grad_norm": 0.8262478363579293, - "k1_kl": 0.0780029296875, - "k3_kl": 0.05322265625, - "kimi_kl": 0.1591796875, - "learning_rate": 1.426e-07, - "loss": 0.0021, - "ppl": 0.028961181640625, - "reward": 0.86977818608284, - "reward_std": 0.04129579081200063, - "rewards/perpo_ocr_edit_distance_reward": 0.8697782158851624, + "advantages": 2.009528088819934e-06, + "completion_length": 490.0, + "delta_ref_entropy_loss": 0.11767578125, + "delta_ref_ppl": -0.11669921875, + "entropy_loss": -0.1376953125, + "epoch": 0.3574, + "grad_norm": 1.1431069023451887, + "k1_kl": 0.1171875, + "k3_kl": 0.07470703125, + "kimi_kl": 0.2734375, + "learning_rate": 3.2129999999999995e-07, + "loss": 0.003, + "ppl": 0.0703125, + "reward": 0.7157820463180542, + "reward_std": 0.00415937602519989, + "rewards/perpo_ocr_edit_distance_reward": 0.7157820463180542, "step": 1787, "temperature": 0.9 }, { - "advantages": -2.5280885893153027e-05, - "completion_length": 530.0, - "delta_ref_entropy_loss": 0.0390625, - "delta_ref_ppl": -0.04443359375, - "entropy_loss": -0.0281982421875, - "epoch": 0.7152, - "grad_norm": 0.5185343062824305, - "k1_kl": 0.04443359375, - "k3_kl": 0.02740478515625, - "kimi_kl": 0.066650390625, - "learning_rate": 1.424e-07, - "loss": 0.0011, - "ppl": 0.01419830322265625, - "reward": 0.9890543818473816, - "reward_std": 0.0010446759406477213, - "rewards/perpo_ocr_edit_distance_reward": 0.9890544414520264, + "advantages": -1.7762184143066406e-05, + "completion_length": 371.0, + "delta_ref_entropy_loss": 0.0771484375, + "delta_ref_ppl": -0.12255859375, + "entropy_loss": -0.0537109375, + "epoch": 0.3576, + "grad_norm": 1.734678116054521, + "k1_kl": 0.12255859375, + "k3_kl": 0.0751953125, + "kimi_kl": 0.251953125, + "learning_rate": 3.212e-07, + "loss": 0.003, + "ppl": 0.025146484375, + "reward": 0.990585446357727, + "reward_std": 0.0037334575317800045, + "rewards/perpo_ocr_edit_distance_reward": 0.9905855655670166, "step": 1788, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 556.5, - "delta_ref_entropy_loss": 0.03619384765625, - "delta_ref_ppl": -0.044921875, - "entropy_loss": -0.015411376953125, - "epoch": 0.7156, - "grad_norm": 0.021626357787325395, - "k1_kl": 0.044921875, - "k3_kl": 0.031341552734375, - "kimi_kl": 0.16522216796875, - "learning_rate": 1.422e-07, - "loss": 0.0012, - "ppl": 0.00518798828125, - "reward": 0.9857650995254517, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.985765129327774, + "advantages": -1.5429088307428174e-05, + "completion_length": 1870.0, + "delta_ref_entropy_loss": 0.0262451171875, + "delta_ref_ppl": -0.03125, + "entropy_loss": -0.0791015625, + "epoch": 0.3578, + "grad_norm": 1.4863429960764247, + "k1_kl": 0.03125, + "k3_kl": 0.033203125, + "kimi_kl": 0.052490234375, + "learning_rate": 3.211e-07, + "loss": 0.0013, + "ppl": 0.043212890625, + "reward": 0.9235067963600159, + "reward_std": 0.0037578395567834377, + "rewards/perpo_ocr_edit_distance_reward": 0.9235068559646606, "step": 1789, "temperature": 0.9 }, { - "advantages": 5.449567765936081e-07, - "completion_length": 697.5, - "delta_ref_entropy_loss": 0.03240966796875, - "delta_ref_ppl": -0.0535888671875, - "entropy_loss": -0.04351806640625, - "epoch": 0.716, - "grad_norm": 1.0265035909513374, - "k1_kl": 0.0535888671875, - "k3_kl": 0.0369873046875, - "kimi_kl": 0.117431640625, - "learning_rate": 1.4199999999999997e-07, - "loss": 0.0015, - "ppl": 0.02197265625, - "reward": 0.9943577647209167, - "reward_std": 0.00793004030128941, - "rewards/perpo_ocr_edit_distance_reward": 0.9943578243255615, + "advantages": -8.644376794109121e-05, + "completion_length": 607.0, + "delta_ref_entropy_loss": 0.08447265625, + "delta_ref_ppl": -0.08642578125, + "entropy_loss": -0.036865234375, + "epoch": 0.358, + "grad_norm": 0.816575916121008, + "k1_kl": 0.0869140625, + "k3_kl": 0.05322265625, + "kimi_kl": 0.1748046875, + "learning_rate": 3.21e-07, + "loss": 0.0022, + "ppl": 0.0145263671875, + "reward": 0.9865632057189941, + "reward_std": 0.0008850778685882688, + "rewards/perpo_ocr_edit_distance_reward": 0.9865633249282837, "step": 1790, "temperature": 0.9 }, { - "advantages": -9.724924166221172e-05, - "completion_length": 628.5, - "delta_ref_entropy_loss": 0.02886962890625, - "delta_ref_ppl": -0.027679443359375, - "entropy_loss": -0.01971435546875, - "epoch": 0.7164, - "grad_norm": 0.27996360440205287, - "k1_kl": 0.027801513671875, - "k3_kl": 0.017486572265625, - "kimi_kl": 0.08087158203125, - "learning_rate": 1.418e-07, - "loss": 0.0008, - "ppl": 0.010284423828125, - "reward": 0.9999395906925201, - "reward_std": 0.00010321942681912333, - "rewards/perpo_ocr_edit_distance_reward": 0.9999395906925201, + "advantages": -6.130763949840912e-07, + "completion_length": 341.0, + "delta_ref_entropy_loss": 0.060302734375, + "delta_ref_ppl": -0.115234375, + "entropy_loss": -0.0712890625, + "epoch": 0.3582, + "grad_norm": 1.437106704007762, + "k1_kl": 0.11572265625, + "k3_kl": 0.0751953125, + "kimi_kl": 0.232421875, + "learning_rate": 3.209e-07, + "loss": 0.003, + "ppl": 0.0269775390625, + "reward": 0.7832837700843811, + "reward_std": 0.11016441881656647, + "rewards/perpo_ocr_edit_distance_reward": 0.7832838296890259, "step": 1791, "temperature": 0.9 }, { - "advantages": -0.0002981424331665039, - "completion_length": 818.0, - "delta_ref_entropy_loss": 0.0616455078125, - "delta_ref_ppl": -0.033935546875, - "entropy_loss": -0.0921630859375, - "epoch": 0.7168, - "grad_norm": 1.304448676957616, - "k1_kl": 0.0340576171875, - "k3_kl": 0.0185546875, - "kimi_kl": 0.0408935546875, - "learning_rate": 1.416e-07, - "loss": 0.001, - "ppl": 0.04807281494140625, - "reward": 0.8403545320034027, - "reward_std": 0.034753765910863876, - "rewards/perpo_ocr_edit_distance_reward": 0.8403545916080475, + "advantages": -1.1103494216513354e-05, + "completion_length": 237.0, + "delta_ref_entropy_loss": 0.0888671875, + "delta_ref_ppl": -0.15234375, + "entropy_loss": -0.056884765625, + "epoch": 0.3584, + "grad_norm": 1.3922866925877588, + "k1_kl": 0.15234375, + "k3_kl": 0.09765625, + "kimi_kl": 0.275390625, + "learning_rate": 3.2079999999999996e-07, + "loss": 0.0039, + "ppl": 0.018798828125, + "reward": 0.9874375462532043, + "reward_std": 0.002200466813519597, + "rewards/perpo_ocr_edit_distance_reward": 0.9874376058578491, "step": 1792, "temperature": 0.9 }, { - "advantages": -2.188342023146106e-06, - "completion_length": 299.5, - "delta_ref_entropy_loss": 0.1041259765625, - "delta_ref_ppl": -0.0931396484375, - "entropy_loss": -0.06494140625, - "epoch": 0.7172, - "grad_norm": 0.843865950731209, - "k1_kl": 0.0931396484375, - "k3_kl": 0.05596923828125, - "kimi_kl": 0.16595458984375, - "learning_rate": 1.4139999999999998e-07, - "loss": 0.0022, - "ppl": 0.0302581787109375, - "reward": 0.9554131925106049, - "reward_std": 0.0018902817973867059, - "rewards/perpo_ocr_edit_distance_reward": 0.9554132223129272, + "advantages": -1.9073486328125e-06, + "completion_length": 1594.0, + "delta_ref_entropy_loss": 0.035400390625, + "delta_ref_ppl": -0.03515625, + "entropy_loss": -0.04345703125, + "epoch": 0.3586, + "grad_norm": 0.7138343688580644, + "k1_kl": 0.035400390625, + "k3_kl": 0.0233154296875, + "kimi_kl": 0.05712890625, + "learning_rate": 3.2069999999999995e-07, + "loss": 0.0009, + "ppl": 0.0201416015625, + "reward": 0.9916064143180847, + "reward_std": 0.004340378101915121, + "rewards/perpo_ocr_edit_distance_reward": 0.9916064739227295, "step": 1793, "temperature": 0.9 }, { - "advantages": -1.6799995137262158e-05, - "completion_length": 446.5, - "delta_ref_entropy_loss": 0.0445556640625, - "delta_ref_ppl": -0.0455322265625, - "entropy_loss": -0.04608154296875, - "epoch": 0.7176, - "grad_norm": 0.954820152781197, - "k1_kl": 0.0455322265625, - "k3_kl": 0.030517578125, - "kimi_kl": 0.09814453125, - "learning_rate": 1.412e-07, - "loss": 0.0012, - "ppl": 0.0288238525390625, - "reward": 0.9422902464866638, - "reward_std": 0.0007101610535755754, - "rewards/perpo_ocr_edit_distance_reward": 0.9422902464866638, + "advantages": -4.333683682489209e-05, + "completion_length": 663.0, + "delta_ref_entropy_loss": 0.06103515625, + "delta_ref_ppl": -0.08544921875, + "entropy_loss": -0.03759765625, + "epoch": 0.3588, + "grad_norm": 0.5570880766026179, + "k1_kl": 0.0859375, + "k3_kl": 0.0537109375, + "kimi_kl": 0.150390625, + "learning_rate": 3.206e-07, + "loss": 0.0022, + "ppl": 0.0142822265625, + "reward": 0.9789755344390869, + "reward_std": 0.0008823653915897012, + "rewards/perpo_ocr_edit_distance_reward": 0.9789756536483765, "step": 1794, "temperature": 0.9 }, { - "advantages": -1.660415136939264e-06, - "completion_length": 119.0, - "delta_ref_entropy_loss": 0.049072265625, - "delta_ref_ppl": -0.0848388671875, - "entropy_loss": -0.04180908203125, - "epoch": 0.718, - "grad_norm": 1.0873060857422328, - "k1_kl": 0.0849609375, - "k3_kl": 0.0640869140625, - "kimi_kl": 0.31787109375, - "learning_rate": 1.4099999999999998e-07, - "loss": 0.0026, - "ppl": 0.019012451171875, - "reward": 0.8932118117809296, - "reward_std": 0.015500348061323166, - "rewards/perpo_ocr_edit_distance_reward": 0.8932118713855743, + "advantages": -9.41753460210748e-05, + "completion_length": 596.0, + "delta_ref_entropy_loss": 0.05078125, + "delta_ref_ppl": -0.076171875, + "entropy_loss": -0.0279541015625, + "epoch": 0.359, + "grad_norm": 0.43340307386505317, + "k1_kl": 0.076171875, + "k3_kl": 0.05126953125, + "kimi_kl": 0.1787109375, + "learning_rate": 3.205e-07, + "loss": 0.0022, + "ppl": 0.01068115234375, + "reward": 0.9952016472816467, + "reward_std": 0.000532967911567539, + "rewards/perpo_ocr_edit_distance_reward": 0.9952017068862915, "step": 1795, "temperature": 0.9 }, { - "advantages": -5.756106133958383e-06, - "completion_length": 365.5, - "delta_ref_entropy_loss": 0.044189453125, - "delta_ref_ppl": -0.03466796875, - "entropy_loss": -0.02264404296875, - "epoch": 0.7184, - "grad_norm": 0.732509351423514, - "k1_kl": 0.03466796875, - "k3_kl": 0.0234832763671875, - "kimi_kl": 0.138885498046875, - "learning_rate": 1.408e-07, - "loss": 0.0009, - "ppl": 0.0103759765625, - "reward": 0.9814167022705078, - "reward_std": 0.018048899015411735, - "rewards/perpo_ocr_edit_distance_reward": 0.9814167022705078, + "advantages": -5.568776941800024e-06, + "completion_length": 1474.0, + "delta_ref_entropy_loss": 0.041015625, + "delta_ref_ppl": -0.038330078125, + "entropy_loss": -0.062255859375, + "epoch": 0.3592, + "grad_norm": 1.2744466499008493, + "k1_kl": 0.038330078125, + "k3_kl": 0.0238037109375, + "kimi_kl": 0.052978515625, + "learning_rate": 3.204e-07, + "loss": 0.001, + "ppl": 0.033447265625, + "reward": 0.99046391248703, + "reward_std": 0.0014321013586595654, + "rewards/perpo_ocr_edit_distance_reward": 0.9904639720916748, "step": 1796, "temperature": 0.9 }, { - "advantages": 5.705015837520477e-07, - "completion_length": 1153.5, - "delta_ref_entropy_loss": 0.06243896484375, - "delta_ref_ppl": -0.06170654296875, - "entropy_loss": -0.100341796875, - "epoch": 0.7188, - "grad_norm": 167.9859203080598, - "k1_kl": 0.061767578125, - "k3_kl": 0.3349609375, - "kimi_kl": 0.12060546875, - "learning_rate": 1.406e-07, - "loss": 0.0134, - "ppl": 0.0577392578125, - "reward": 0.9369872510433197, - "reward_std": 0.004359005601145327, - "rewards/perpo_ocr_edit_distance_reward": 0.9369872212409973, + "advantages": -7.679632835788652e-05, + "completion_length": 626.0, + "delta_ref_entropy_loss": 0.064453125, + "delta_ref_ppl": -0.05908203125, + "entropy_loss": -0.033203125, + "epoch": 0.3594, + "grad_norm": 0.2346146088667268, + "k1_kl": 0.05908203125, + "k3_kl": 0.032958984375, + "kimi_kl": 0.115234375, + "learning_rate": 3.203e-07, + "loss": 0.0014, + "ppl": 0.0107421875, + "reward": 0.9934484958648682, + "reward_std": 0.00023267371580004692, + "rewards/perpo_ocr_edit_distance_reward": 0.9934485554695129, "step": 1797, "temperature": 0.9 }, { - "advantages": -8.557524324714905e-07, - "completion_length": 426.0, - "delta_ref_entropy_loss": 0.0787353515625, - "delta_ref_ppl": -0.0572509765625, - "entropy_loss": -0.1011962890625, - "epoch": 0.7192, - "grad_norm": 1.1528854923369203, - "k1_kl": 0.0574951171875, - "k3_kl": 0.0357666015625, - "kimi_kl": 0.0819091796875, - "learning_rate": 1.4039999999999999e-07, - "loss": 0.0014, - "ppl": 0.054931640625, - "reward": 0.9280487895011902, - "reward_std": 0.010454364295583218, - "rewards/perpo_ocr_edit_distance_reward": 0.9280488193035126, + "advantages": -1.0682004358386621e-05, + "completion_length": 223.0, + "delta_ref_entropy_loss": 0.07763671875, + "delta_ref_ppl": -0.134765625, + "entropy_loss": -0.050048828125, + "epoch": 0.3596, + "grad_norm": 2.6757564516051793, + "k1_kl": 0.134765625, + "k3_kl": 0.091796875, + "kimi_kl": 0.30078125, + "learning_rate": 3.2019999999999997e-07, + "loss": 0.0037, + "ppl": 0.0185546875, + "reward": 0.9854722023010254, + "reward_std": 0.003886482445523143, + "rewards/perpo_ocr_edit_distance_reward": 0.9854722023010254, "step": 1798, "temperature": 0.9 }, { - "advantages": -0.00030927998705010395, - "completion_length": 563.0, - "delta_ref_entropy_loss": 0.02630615234375, - "delta_ref_ppl": -0.0218505859375, - "entropy_loss": -0.024322509765625, - "epoch": 0.7196, - "grad_norm": 0.28629089352770437, - "k1_kl": 0.021820068359375, - "k3_kl": 0.0124053955078125, - "kimi_kl": 0.031219482421875, - "learning_rate": 1.402e-07, - "loss": 0.0008, - "ppl": 0.010986328125, - "reward": 0.9959614276885986, - "reward_std": 0.0003282254619989544, - "rewards/perpo_ocr_edit_distance_reward": 0.995961457490921, + "advantages": -4.717282081401208e-06, + "completion_length": 592.0, + "delta_ref_entropy_loss": 0.109375, + "delta_ref_ppl": -0.107421875, + "entropy_loss": -0.1396484375, + "epoch": 0.3598, + "grad_norm": 1.6713962610659572, + "k1_kl": 0.107421875, + "k3_kl": 0.0654296875, + "kimi_kl": 0.1689453125, + "learning_rate": 3.2009999999999996e-07, + "loss": 0.0026, + "ppl": 0.064453125, + "reward": 0.9073569178581238, + "reward_std": 0.007147093769162893, + "rewards/perpo_ocr_edit_distance_reward": 0.9073569774627686, "step": 1799, "temperature": 0.9 }, { - "advantages": -0.0003336582885822281, - "completion_length": 733.5, - "delta_ref_entropy_loss": 0.0416259765625, - "delta_ref_ppl": -0.0576171875, - "entropy_loss": -0.0303955078125, - "epoch": 0.72, - "grad_norm": 0.9508650571351825, - "k1_kl": 0.0576171875, - "k3_kl": 0.0421142578125, - "kimi_kl": 0.1943359375, - "learning_rate": 1.4e-07, - "loss": 0.002, - "ppl": 0.016510009765625, - "reward": 0.994610995054245, - "reward_std": 0.0004278872802387923, - "rewards/perpo_ocr_edit_distance_reward": 0.9946110546588898, + "advantages": 0.0, + "completion_length": 513.0, + "delta_ref_entropy_loss": 0.05517578125, + "delta_ref_ppl": -0.07373046875, + "entropy_loss": -0.031982421875, + "epoch": 0.36, + "grad_norm": 0.37634481990166124, + "k1_kl": 0.07373046875, + "k3_kl": 0.04736328125, + "kimi_kl": 0.1474609375, + "learning_rate": 3.2e-07, + "loss": 0.0019, + "ppl": 0.012451171875, + "reward": 0.9941208362579346, + "reward_std": 0.0009257376077584922, + "rewards/perpo_ocr_edit_distance_reward": 0.9941207766532898, "step": 1800, "temperature": 0.9 }, { - "advantages": -4.789659286075221e-06, - "completion_length": 743.0, - "delta_ref_entropy_loss": 0.08447265625, - "delta_ref_ppl": -0.09716796875, - "entropy_loss": -0.1705322265625, - "epoch": 0.7204, - "grad_norm": 1.3323016931248586, - "k1_kl": 0.09716796875, - "k3_kl": 0.0616455078125, - "kimi_kl": 0.146240234375, - "learning_rate": 1.398e-07, - "loss": 0.0025, - "ppl": 0.10076904296875, - "reward": 0.6917832493782043, - "reward_std": 0.01458457432454452, - "rewards/perpo_ocr_edit_distance_reward": 0.6917832642793655, + "advantages": -0.00014845814439468086, + "completion_length": 632.0, + "delta_ref_entropy_loss": 0.037841796875, + "delta_ref_ppl": -0.05078125, + "entropy_loss": -0.019775390625, + "epoch": 0.3602, + "grad_norm": 0.3526145194518417, + "k1_kl": 0.05078125, + "k3_kl": 0.031005859375, + "kimi_kl": 0.07470703125, + "learning_rate": 3.199e-07, + "loss": 0.0014, + "ppl": 0.006011962890625, + "reward": 0.9992663860321045, + "reward_std": 0.00012940836313646287, + "rewards/perpo_ocr_edit_distance_reward": 0.9992664456367493, "step": 1801, "temperature": 0.9 }, { - "advantages": -1.1035374882339966e-05, - "completion_length": 627.0, - "delta_ref_entropy_loss": 0.043212890625, - "delta_ref_ppl": -0.0599365234375, - "entropy_loss": -0.025390625, - "epoch": 0.7208, - "grad_norm": 0.8095778872830965, - "k1_kl": 0.0599365234375, - "k3_kl": 0.0418701171875, - "kimi_kl": 0.131103515625, - "learning_rate": 1.396e-07, - "loss": 0.0017, - "ppl": 0.014892578125, - "reward": 0.9984642267227173, - "reward_std": 0.0009453832317376509, - "rewards/perpo_ocr_edit_distance_reward": 0.9984642565250397, + "advantages": -0.00012458223500289023, + "completion_length": 286.0, + "delta_ref_entropy_loss": 0.050048828125, + "delta_ref_ppl": -0.09521484375, + "entropy_loss": -0.0283203125, + "epoch": 0.3604, + "grad_norm": 0.49150882585268885, + "k1_kl": 0.09521484375, + "k3_kl": 0.06787109375, + "kimi_kl": 0.23828125, + "learning_rate": 3.1979999999999994e-07, + "loss": 0.0028, + "ppl": 0.012939453125, + "reward": 0.9796866178512573, + "reward_std": 0.000993560068309307, + "rewards/perpo_ocr_edit_distance_reward": 0.9796867966651917, "step": 1802, "temperature": 0.9 }, { - "advantages": -2.2121839037936297e-05, - "completion_length": 982.5, - "delta_ref_entropy_loss": 0.03900146484375, - "delta_ref_ppl": -0.02960205078125, - "entropy_loss": -0.03472900390625, - "epoch": 0.7212, - "grad_norm": 1.1755893777171689, - "k1_kl": 0.029541015625, - "k3_kl": 0.017974853515625, - "kimi_kl": 0.06280517578125, - "learning_rate": 1.394e-07, - "loss": 0.0007, - "ppl": 0.018310546875, - "reward": 0.9723029434680939, - "reward_std": 0.001423564099241048, - "rewards/perpo_ocr_edit_distance_reward": 0.9723030030727386, + "advantages": -2.622604597490863e-06, + "completion_length": 163.0, + "delta_ref_entropy_loss": 0.03271484375, + "delta_ref_ppl": -0.2021484375, + "entropy_loss": -0.041015625, + "epoch": 0.3606, + "grad_norm": 2.223132875509278, + "k1_kl": 0.2021484375, + "k3_kl": 0.16015625, + "kimi_kl": 0.9921875, + "learning_rate": 3.197e-07, + "loss": 0.0064, + "ppl": 0.018798828125, + "reward": 0.9973675012588501, + "reward_std": 0.003152152756229043, + "rewards/perpo_ocr_edit_distance_reward": 0.9973676204681396, "step": 1803, "temperature": 0.9 }, { - "advantages": -3.980739035736036e-06, - "completion_length": 1279.5, - "delta_ref_entropy_loss": 0.03814697265625, - "delta_ref_ppl": -0.03961181640625, - "entropy_loss": -0.0589599609375, - "epoch": 0.7216, - "grad_norm": 1.6217336407862937, - "k1_kl": 0.03961181640625, - "k3_kl": 0.02655029296875, - "kimi_kl": 0.088623046875, - "learning_rate": 1.3919999999999998e-07, - "loss": 0.0011, - "ppl": 0.0328369140625, - "reward": 0.9831305742263794, - "reward_std": 0.004699953395174816, - "rewards/perpo_ocr_edit_distance_reward": 0.9831306338310242, + "advantages": -3.167561317241052e-06, + "completion_length": 1578.0, + "delta_ref_entropy_loss": 0.054931640625, + "delta_ref_ppl": -0.060302734375, + "entropy_loss": -0.0576171875, + "epoch": 0.3608, + "grad_norm": 2.833922661107834, + "k1_kl": 0.060302734375, + "k3_kl": 0.034423828125, + "kimi_kl": 0.0673828125, + "learning_rate": 3.196e-07, + "loss": 0.0014, + "ppl": 0.0390625, + "reward": 0.9894648194313049, + "reward_std": 0.0025891794357448816, + "rewards/perpo_ocr_edit_distance_reward": 0.9894648194313049, "step": 1804, "temperature": 0.9 }, { - "advantages": -8.77295242389664e-05, - "completion_length": 871.0, - "delta_ref_entropy_loss": 0.0611572265625, - "delta_ref_ppl": -0.03192138671875, - "entropy_loss": -0.05731201171875, - "epoch": 0.722, - "grad_norm": 0.6932657692938459, - "k1_kl": 0.03192138671875, - "k3_kl": 0.0216217041015625, - "kimi_kl": 0.0268707275390625, - "learning_rate": 1.3900000000000001e-07, - "loss": 0.001, - "ppl": 0.032135009765625, - "reward": 0.9842630624771118, - "reward_std": 0.0008155888499459252, - "rewards/perpo_ocr_edit_distance_reward": 0.9842631220817566, + "advantages": -6.267002845561365e-06, + "completion_length": 1292.0, + "delta_ref_entropy_loss": 0.06640625, + "delta_ref_ppl": -0.06884765625, + "entropy_loss": -0.0703125, + "epoch": 0.361, + "grad_norm": 10.622898771786465, + "k1_kl": 0.06884765625, + "k3_kl": 0.03759765625, + "kimi_kl": 0.0810546875, + "learning_rate": 3.1949999999999997e-07, + "loss": 0.0015, + "ppl": 0.032470703125, + "reward": 0.9622538089752197, + "reward_std": 0.002616980578750372, + "rewards/perpo_ocr_edit_distance_reward": 0.9622538685798645, "step": 1805, "temperature": 0.9 }, { - "advantages": -4.512497616815381e-05, - "completion_length": 482.0, - "delta_ref_entropy_loss": 0.03997802734375, - "delta_ref_ppl": -0.0415496826171875, - "entropy_loss": -0.0210113525390625, - "epoch": 0.7224, - "grad_norm": 0.540358326025956, - "k1_kl": 0.0413055419921875, - "k3_kl": 0.02666473388671875, - "kimi_kl": 0.09412002563476562, - "learning_rate": 1.388e-07, - "loss": 0.0011, - "ppl": 0.0084075927734375, - "reward": 0.9978089034557343, - "reward_std": 0.00018583332712296396, - "rewards/perpo_ocr_edit_distance_reward": 0.997808963060379, + "advantages": -2.7315958504914306e-05, + "completion_length": 623.0, + "delta_ref_entropy_loss": 0.078125, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.064453125, + "epoch": 0.3612, + "grad_norm": 0.8869076193340245, + "k1_kl": 0.07080078125, + "k3_kl": 0.04638671875, + "kimi_kl": 0.1279296875, + "learning_rate": 3.194e-07, + "loss": 0.0019, + "ppl": 0.029296875, + "reward": 0.7097783088684082, + "reward_std": 0.004266632720828056, + "rewards/perpo_ocr_edit_distance_reward": 0.7097784280776978, "step": 1806, "temperature": 0.9 }, { - "advantages": -1.513958068244392e-05, - "completion_length": 338.5, - "delta_ref_entropy_loss": 0.045654296875, - "delta_ref_ppl": -0.0311279296875, - "entropy_loss": -0.019378662109375, - "epoch": 0.7228, - "grad_norm": 0.5655536926006921, - "k1_kl": 0.0311279296875, - "k3_kl": 0.016326904296875, - "kimi_kl": 0.041748046875, - "learning_rate": 1.3859999999999998e-07, - "loss": 0.0007, - "ppl": 0.009128570556640625, - "reward": 0.9961817562580109, - "reward_std": 0.0006524784839712083, - "rewards/perpo_ocr_edit_distance_reward": 0.996181845664978, + "advantages": -3.3408403396606445e-05, + "completion_length": 932.0, + "delta_ref_entropy_loss": 0.055908203125, + "delta_ref_ppl": -0.05078125, + "entropy_loss": -0.033447265625, + "epoch": 0.3614, + "grad_norm": 0.617549343791736, + "k1_kl": 0.05078125, + "k3_kl": 0.030517578125, + "kimi_kl": 0.10107421875, + "learning_rate": 3.1929999999999995e-07, + "loss": 0.0013, + "ppl": 0.0146484375, + "reward": 0.9931415915489197, + "reward_std": 0.0006645434186793864, + "rewards/perpo_ocr_edit_distance_reward": 0.9931417107582092, "step": 1807, "temperature": 0.9 }, { - "advantages": -0.00010486586324987002, - "completion_length": 859.5, - "delta_ref_entropy_loss": 0.04107666015625, - "delta_ref_ppl": -0.020416259765625, - "entropy_loss": -0.0220947265625, - "epoch": 0.7232, - "grad_norm": 0.7935965987924323, - "k1_kl": 0.020355224609375, - "k3_kl": 0.0084228515625, - "kimi_kl": 0.01708984375, - "learning_rate": 1.384e-07, - "loss": 0.0004, - "ppl": 0.008819580078125, - "reward": 0.9979167282581329, - "reward_std": 0.0003919462615158409, - "rewards/perpo_ocr_edit_distance_reward": 0.9979168176651001, + "advantages": -4.485675526666455e-05, + "completion_length": 522.0, + "delta_ref_entropy_loss": 0.06884765625, + "delta_ref_ppl": -0.06396484375, + "entropy_loss": -0.036865234375, + "epoch": 0.3616, + "grad_norm": 0.6337191096207834, + "k1_kl": 0.064453125, + "k3_kl": 0.03466796875, + "kimi_kl": 0.0810546875, + "learning_rate": 3.1919999999999995e-07, + "loss": 0.0014, + "ppl": 0.0185546875, + "reward": 0.8328558802604675, + "reward_std": 0.0006590558332391083, + "rewards/perpo_ocr_edit_distance_reward": 0.8328559398651123, "step": 1808, "temperature": 0.9 }, { - "advantages": -4.155295428631689e-06, - "completion_length": 1346.5, - "delta_ref_entropy_loss": 0.0462646484375, - "delta_ref_ppl": -0.0341796875, - "entropy_loss": -0.11572265625, - "epoch": 0.7236, - "grad_norm": 21.1398177110496, - "k1_kl": 0.0341796875, - "k3_kl": 0.242431640625, - "kimi_kl": 0.074951171875, - "learning_rate": 1.3819999999999998e-07, - "loss": 0.0097, - "ppl": 0.0809326171875, - "reward": 0.9119563400745392, - "reward_std": 0.02756270277313888, - "rewards/perpo_ocr_edit_distance_reward": 0.911956399679184, + "advantages": -2.1670546630048193e-05, + "completion_length": 558.0, + "delta_ref_entropy_loss": 0.1318359375, + "delta_ref_ppl": -0.11083984375, + "entropy_loss": -0.1103515625, + "epoch": 0.3618, + "grad_norm": 1.77542042630062, + "k1_kl": 0.11083984375, + "k3_kl": 0.060302734375, + "kimi_kl": 0.158203125, + "learning_rate": 3.191e-07, + "loss": 0.0024, + "ppl": 0.057861328125, + "reward": 0.9668828248977661, + "reward_std": 0.0022565973922610283, + "rewards/perpo_ocr_edit_distance_reward": 0.9668828248977661, "step": 1809, "temperature": 0.9 }, { - "advantages": -8.710793167665543e-06, - "completion_length": 678.5, - "delta_ref_entropy_loss": 0.068115234375, - "delta_ref_ppl": -0.0653076171875, - "entropy_loss": -0.06207275390625, - "epoch": 0.724, - "grad_norm": 1.5314095965597685, - "k1_kl": 0.0653076171875, - "k3_kl": 0.042999267578125, - "kimi_kl": 0.19708251953125, - "learning_rate": 1.3800000000000002e-07, - "loss": 0.0017, - "ppl": 0.030487060546875, - "reward": 0.9827486872673035, - "reward_std": 0.005660416325554252, - "rewards/perpo_ocr_edit_distance_reward": 0.9827486872673035, + "advantages": -1.533542490506079e-05, + "completion_length": 730.0, + "delta_ref_entropy_loss": 0.0294189453125, + "delta_ref_ppl": -0.04248046875, + "entropy_loss": -0.0252685546875, + "epoch": 0.362, + "grad_norm": 0.5974937542922597, + "k1_kl": 0.04248046875, + "k3_kl": 0.0269775390625, + "kimi_kl": 0.080078125, + "learning_rate": 3.19e-07, + "loss": 0.0011, + "ppl": 0.00982666015625, + "reward": 0.9985707998275757, + "reward_std": 0.0010113816242665052, + "rewards/perpo_ocr_edit_distance_reward": 0.9985707998275757, "step": 1810, "temperature": 0.9 }, { - "advantages": -2.99045032079448e-05, - "completion_length": 681.0, - "delta_ref_entropy_loss": 0.04248046875, - "delta_ref_ppl": -0.03289794921875, - "entropy_loss": -0.037841796875, - "epoch": 0.7244, - "grad_norm": 0.5926351763151629, - "k1_kl": 0.03277587890625, - "k3_kl": 0.0189208984375, - "kimi_kl": 0.066253662109375, - "learning_rate": 1.378e-07, - "loss": 0.0008, - "ppl": 0.017852783203125, - "reward": 0.9811784625053406, - "reward_std": 0.0016150331357493997, - "rewards/perpo_ocr_edit_distance_reward": 0.9811785519123077, + "advantages": -5.858285476278979e-06, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.0205078125, + "delta_ref_ppl": -0.04833984375, + "entropy_loss": -0.212890625, + "epoch": 0.3622, + "grad_norm": 8.168547016459513, + "k1_kl": 0.048583984375, + "k3_kl": 0.07421875, + "kimi_kl": 0.1064453125, + "learning_rate": 3.189e-07, + "loss": 0.003, + "ppl": 0.1328125, + "reward": 0.729342520236969, + "reward_std": 0.010146496817469597, + "rewards/perpo_ocr_edit_distance_reward": 0.7293425798416138, "step": 1811, "temperature": 0.9 }, { - "advantages": -1.9797258232756576e-07, - "completion_length": 617.0, - "delta_ref_entropy_loss": 0.071868896484375, - "delta_ref_ppl": -0.0526123046875, - "entropy_loss": -0.173583984375, - "epoch": 0.7248, - "grad_norm": 1.5461732258527638, - "k1_kl": 0.0526123046875, - "k3_kl": 0.031890869140625, - "kimi_kl": 0.07513427734375, - "learning_rate": 1.376e-07, - "loss": 0.0013, - "ppl": 0.102020263671875, - "reward": 0.7980761826038361, - "reward_std": 0.011133307736599818, - "rewards/perpo_ocr_edit_distance_reward": 0.7980762124061584, + "advantages": 8.514949634275126e-09, + "completion_length": 885.0, + "delta_ref_entropy_loss": 0.039794921875, + "delta_ref_ppl": -0.039306640625, + "entropy_loss": -0.0208740234375, + "epoch": 0.3624, + "grad_norm": 0.38590996307224973, + "k1_kl": 0.039306640625, + "k3_kl": 0.0203857421875, + "kimi_kl": 0.050537109375, + "learning_rate": 3.1879999999999997e-07, + "loss": 0.0008, + "ppl": 0.0062255859375, + "reward": 0.9992840886116028, + "reward_std": 0.0002678516320884228, + "rewards/perpo_ocr_edit_distance_reward": 0.9992840886116028, "step": 1812, "temperature": 0.9 }, { - "advantages": -8.666089706821367e-05, - "completion_length": 677.0, - "delta_ref_entropy_loss": 0.0142822265625, - "delta_ref_ppl": -0.02105712890625, - "entropy_loss": -0.014434814453125, - "epoch": 0.7252, - "grad_norm": 0.1902645808598986, - "k1_kl": 0.0210418701171875, - "k3_kl": 0.01581573486328125, - "kimi_kl": 0.07525634765625, - "learning_rate": 1.374e-07, - "loss": 0.0007, - "ppl": 0.0065155029296875, - "reward": 0.998795360326767, - "reward_std": 9.739045344758779e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9987953901290894, + "advantages": -6.641660661443893e-07, + "completion_length": 8.0, + "delta_ref_entropy_loss": -0.1513671875, + "delta_ref_ppl": -3.96875, + "entropy_loss": -0.70703125, + "epoch": 0.3626, + "grad_norm": 39.302971583109226, + "k1_kl": 3.96875, + "k3_kl": 3.578125, + "kimi_kl": 23.125, + "learning_rate": 3.1869999999999996e-07, + "loss": 0.1433, + "ppl": 0.2109375, + "reward": 0.20523136854171753, + "reward_std": 0.015970328822731972, + "rewards/perpo_ocr_edit_distance_reward": 0.20523139834403992, "step": 1813, "temperature": 0.9 }, { - "advantages": -2.3058482838678174e-05, - "completion_length": 502.0, - "delta_ref_entropy_loss": 0.0374755859375, - "delta_ref_ppl": -0.029541015625, - "entropy_loss": -0.02691650390625, - "epoch": 0.7256, - "grad_norm": 0.35737282707327017, - "k1_kl": 0.02960205078125, - "k3_kl": 0.01763916015625, - "kimi_kl": 0.04852294921875, - "learning_rate": 1.3719999999999998e-07, - "loss": 0.0007, - "ppl": 0.013519287109375, - "reward": 0.994624137878418, - "reward_std": 0.0003193381999153644, - "rewards/perpo_ocr_edit_distance_reward": 0.9946241676807404, + "advantages": -1.3053418115305249e-05, + "completion_length": 779.0, + "delta_ref_entropy_loss": 0.02490234375, + "delta_ref_ppl": -0.03369140625, + "entropy_loss": -0.0189208984375, + "epoch": 0.3628, + "grad_norm": 0.2965728270396239, + "k1_kl": 0.03369140625, + "k3_kl": 0.022216796875, + "kimi_kl": 0.052001953125, + "learning_rate": 3.1859999999999995e-07, + "loss": 0.0009, + "ppl": 0.007049560546875, + "reward": 0.9779336452484131, + "reward_std": 0.005112688057124615, + "rewards/perpo_ocr_edit_distance_reward": 0.9779337644577026, "step": 1814, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 399.5, - "delta_ref_entropy_loss": 0.04241943359375, - "delta_ref_ppl": -0.0916748046875, - "entropy_loss": -0.181640625, - "epoch": 0.726, - "grad_norm": 3.938327524033014, - "k1_kl": 0.0916748046875, - "k3_kl": 0.0714111328125, - "kimi_kl": 0.2659912109375, - "learning_rate": 1.37e-07, - "loss": 0.0029, - "ppl": 0.083984375, - "reward": 0.5296949148178101, - "reward_std": 0.197188600897789, - "rewards/perpo_ocr_edit_distance_reward": 0.5296949446201324, + "advantages": -2.7077539925812744e-05, + "completion_length": 255.0, + "delta_ref_entropy_loss": 0.07275390625, + "delta_ref_ppl": -0.158203125, + "entropy_loss": -0.1376953125, + "epoch": 0.363, + "grad_norm": 2.224311262621401, + "k1_kl": 0.158203125, + "k3_kl": 0.11865234375, + "kimi_kl": 0.365234375, + "learning_rate": 3.185e-07, + "loss": 0.0048, + "ppl": 0.0625, + "reward": 0.9161316156387329, + "reward_std": 0.002101149410009384, + "rewards/perpo_ocr_edit_distance_reward": 0.9161316752433777, "step": 1815, "temperature": 0.9 }, { - "advantages": -9.570803376846015e-05, - "completion_length": 471.0, - "delta_ref_entropy_loss": 0.0374755859375, - "delta_ref_ppl": -0.0404052734375, - "entropy_loss": -0.0179443359375, - "epoch": 0.7264, - "grad_norm": 0.4323441483474118, - "k1_kl": 0.04052734375, - "k3_kl": 0.02740478515625, - "kimi_kl": 0.080810546875, - "learning_rate": 1.368e-07, + "advantages": -3.87941108783707e-05, + "completion_length": 485.0, + "delta_ref_entropy_loss": 0.044677734375, + "delta_ref_ppl": -0.04833984375, + "entropy_loss": -0.02978515625, + "epoch": 0.3632, + "grad_norm": 0.5829795688947583, + "k1_kl": 0.048095703125, + "k3_kl": 0.0286865234375, + "kimi_kl": 0.0751953125, + "learning_rate": 3.184e-07, "loss": 0.0012, - "ppl": 0.0054931640625, - "reward": 0.99979567527771, - "reward_std": 0.0002547989788581617, - "rewards/perpo_ocr_edit_distance_reward": 0.9997957348823547, + "ppl": 0.0125732421875, + "reward": 0.9965102076530457, + "reward_std": 0.0007775543490424752, + "rewards/perpo_ocr_edit_distance_reward": 0.9965102672576904, "step": 1816, "temperature": 0.9 }, { - "advantages": -0.00015423341096720833, - "completion_length": 672.0, - "delta_ref_entropy_loss": 0.08203125, - "delta_ref_ppl": -0.057586669921875, - "entropy_loss": -0.080535888671875, - "epoch": 0.7268, - "grad_norm": 1.0206249684526731, - "k1_kl": 0.05780029296875, - "k3_kl": 0.03179931640625, - "kimi_kl": 0.0804443359375, - "learning_rate": 1.366e-07, - "loss": 0.0014, - "ppl": 0.0408935546875, - "reward": 0.9700088798999786, - "reward_std": 0.002534678984375205, - "rewards/perpo_ocr_edit_distance_reward": 0.970008909702301, + "advantages": -2.384185791015625e-07, + "completion_length": 1364.0, + "delta_ref_entropy_loss": 0.0625, + "delta_ref_ppl": -0.059326171875, + "entropy_loss": -0.140625, + "epoch": 0.3634, + "grad_norm": 15.023569319930822, + "k1_kl": 0.059326171875, + "k3_kl": 0.06591796875, + "kimi_kl": 0.07763671875, + "learning_rate": 3.1830000000000004e-07, + "loss": 0.0026, + "ppl": 0.0888671875, + "reward": 0.8301680088043213, + "reward_std": 0.1663942039012909, + "rewards/perpo_ocr_edit_distance_reward": 0.8301680684089661, "step": 1817, "temperature": 0.9 }, { - "advantages": -0.00015162144882197026, - "completion_length": 722.5, - "delta_ref_entropy_loss": 0.0445556640625, - "delta_ref_ppl": -0.02685546875, - "entropy_loss": -0.0380859375, - "epoch": 0.7272, - "grad_norm": 0.786149050386072, - "k1_kl": 0.02685546875, - "k3_kl": 0.014404296875, - "kimi_kl": 0.03265380859375, - "learning_rate": 1.3639999999999998e-07, - "loss": 0.0007, - "ppl": 0.0169219970703125, - "reward": 0.950876921415329, - "reward_std": 0.0005479437822941691, - "rewards/perpo_ocr_edit_distance_reward": 0.9508769810199738, + "advantages": 0.0, + "completion_length": 220.0, + "delta_ref_entropy_loss": 0.1455078125, + "delta_ref_ppl": -0.2431640625, + "entropy_loss": -0.09326171875, + "epoch": 0.3636, + "grad_norm": 1.8157974325152006, + "k1_kl": 0.2431640625, + "k3_kl": 0.1748046875, + "kimi_kl": 0.67578125, + "learning_rate": 3.182e-07, + "loss": 0.007, + "ppl": 0.04248046875, + "reward": 0.26753246784210205, + "reward_std": 0.13383950293064117, + "rewards/perpo_ocr_edit_distance_reward": 0.26753246784210205, "step": 1818, "temperature": 0.9 }, { - "advantages": -0.00013835303707310231, - "completion_length": 670.0, - "delta_ref_entropy_loss": 0.0650634765625, - "delta_ref_ppl": -0.04254150390625, - "entropy_loss": -0.06134033203125, - "epoch": 0.7276, - "grad_norm": 0.87114044294988, - "k1_kl": 0.04241943359375, - "k3_kl": 0.0220947265625, - "kimi_kl": 0.0472412109375, - "learning_rate": 1.362e-07, - "loss": 0.001, - "ppl": 0.030181884765625, - "reward": 0.9466238021850586, - "reward_std": 0.0009133988423855044, - "rewards/perpo_ocr_edit_distance_reward": 0.9466238915920258, + "advantages": 4.257474817137563e-09, + "completion_length": 575.0, + "delta_ref_entropy_loss": 0.08642578125, + "delta_ref_ppl": -0.0966796875, + "entropy_loss": -0.054443359375, + "epoch": 0.3638, + "grad_norm": 0.9774395056156802, + "k1_kl": 0.09619140625, + "k3_kl": 0.06298828125, + "kimi_kl": 0.193359375, + "learning_rate": 3.1809999999999997e-07, + "loss": 0.0025, + "ppl": 0.0203857421875, + "reward": 0.9147111177444458, + "reward_std": 0.0011053712805733085, + "rewards/perpo_ocr_edit_distance_reward": 0.9147111773490906, "step": 1819, "temperature": 0.9 }, { - "advantages": -4.499299393501133e-05, - "completion_length": 425.5, - "delta_ref_entropy_loss": 0.0518798828125, - "delta_ref_ppl": -0.03558349609375, - "entropy_loss": -0.02691650390625, - "epoch": 0.728, - "grad_norm": 0.6650126585546993, - "k1_kl": 0.03570556640625, - "k3_kl": 0.0206298828125, - "kimi_kl": 0.0472412109375, - "learning_rate": 1.36e-07, - "loss": 0.0009, - "ppl": 0.013427734375, - "reward": 0.9931362867355347, - "reward_std": 0.0014041488466318697, - "rewards/perpo_ocr_edit_distance_reward": 0.9931363463401794, + "advantages": -8.030448952922598e-05, + "completion_length": 779.0, + "delta_ref_entropy_loss": 0.05126953125, + "delta_ref_ppl": -0.051513671875, + "entropy_loss": -0.033447265625, + "epoch": 0.364, + "grad_norm": 0.7089635619923517, + "k1_kl": 0.051513671875, + "k3_kl": 0.0281982421875, + "kimi_kl": 0.06689453125, + "learning_rate": 3.18e-07, + "loss": 0.0012, + "ppl": 0.0135498046875, + "reward": 0.9877042770385742, + "reward_std": 0.0006423771264962852, + "rewards/perpo_ocr_edit_distance_reward": 0.987704336643219, "step": 1820, "temperature": 0.9 }, { - "advantages": -1.0490417935216101e-05, - "completion_length": 1181.0, - "delta_ref_entropy_loss": 0.02020263671875, - "delta_ref_ppl": -0.01287841796875, - "entropy_loss": -0.03900146484375, - "epoch": 0.7284, - "grad_norm": 3.5875477212977773, - "k1_kl": 0.012908935546875, - "k3_kl": 0.0079498291015625, - "kimi_kl": 0.015289306640625, - "learning_rate": 1.3579999999999999e-07, - "loss": 0.0003, - "ppl": 0.019378662109375, - "reward": 0.9767659306526184, - "reward_std": 0.0015234072925522923, - "rewards/perpo_ocr_edit_distance_reward": 0.9767659604549408, + "advantages": -0.00012937614519614726, + "completion_length": 713.0, + "delta_ref_entropy_loss": 0.052490234375, + "delta_ref_ppl": -0.054931640625, + "entropy_loss": -0.031494140625, + "epoch": 0.3642, + "grad_norm": 0.2850716037024144, + "k1_kl": 0.054931640625, + "k3_kl": 0.0322265625, + "kimi_kl": 0.0986328125, + "learning_rate": 3.179e-07, + "loss": 0.0014, + "ppl": 0.0120849609375, + "reward": 0.9986410140991211, + "reward_std": 0.0002949383924715221, + "rewards/perpo_ocr_edit_distance_reward": 0.9986410737037659, "step": 1821, "temperature": 0.9 }, { - "advantages": -8.864062408520113e-06, - "completion_length": 702.5, - "delta_ref_entropy_loss": 0.0633544921875, - "delta_ref_ppl": -0.0595703125, - "entropy_loss": -0.0615234375, - "epoch": 0.7288, - "grad_norm": 0.8853697837399743, - "k1_kl": 0.0595703125, - "k3_kl": 0.03900146484375, - "kimi_kl": 0.125, - "learning_rate": 1.356e-07, + "advantages": 4.717282081401208e-06, + "completion_length": 715.0, + "delta_ref_entropy_loss": 0.05712890625, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.040771484375, + "epoch": 0.3644, + "grad_norm": 0.7239561590189317, + "k1_kl": 0.0703125, + "k3_kl": 0.039794921875, + "kimi_kl": 0.134765625, + "learning_rate": 3.178e-07, "loss": 0.0016, - "ppl": 0.0345458984375, - "reward": 0.9811782240867615, - "reward_std": 0.0051822110544890165, - "rewards/perpo_ocr_edit_distance_reward": 0.9811782538890839, + "ppl": 0.015869140625, + "reward": 0.97160404920578, + "reward_std": 0.00533822737634182, + "rewards/perpo_ocr_edit_distance_reward": 0.9716041088104248, "step": 1822, "temperature": 0.9 }, { - "advantages": -0.00015243463440128835, - "completion_length": 498.5, - "delta_ref_entropy_loss": 0.07861328125, - "delta_ref_ppl": -0.0576171875, - "entropy_loss": -0.0985107421875, - "epoch": 0.7292, - "grad_norm": 1.0575295271466947, - "k1_kl": 0.0577392578125, - "k3_kl": 0.03509521484375, - "kimi_kl": 0.0875244140625, - "learning_rate": 1.3539999999999998e-07, - "loss": 0.0016, - "ppl": 0.0587158203125, - "reward": 0.7135764360427856, - "reward_std": 0.002607049980724696, - "rewards/perpo_ocr_edit_distance_reward": 0.7135765552520752, + "advantages": -0.00011417696077842265, + "completion_length": 423.0, + "delta_ref_entropy_loss": 0.044677734375, + "delta_ref_ppl": -0.05615234375, + "entropy_loss": -0.0289306640625, + "epoch": 0.3646, + "grad_norm": 0.4744688996592204, + "k1_kl": 0.05615234375, + "k3_kl": 0.034423828125, + "kimi_kl": 0.10302734375, + "learning_rate": 3.177e-07, + "loss": 0.0015, + "ppl": 0.01177978515625, + "reward": 0.997786819934845, + "reward_std": 0.0005710572586394846, + "rewards/perpo_ocr_edit_distance_reward": 0.9977869987487793, "step": 1823, "temperature": 0.9 }, { - "advantages": -3.074748331499677e-05, - "completion_length": 287.0, - "delta_ref_entropy_loss": 0.01019287109375, - "delta_ref_ppl": -0.131103515625, - "entropy_loss": -0.2122802734375, - "epoch": 0.7296, - "grad_norm": 2.8599331595544397, - "k1_kl": 0.131591796875, - "k3_kl": 0.11376953125, - "kimi_kl": 0.4755859375, - "learning_rate": 1.352e-07, - "loss": 0.0046, - "ppl": 0.11944580078125, - "reward": 0.8584681153297424, - "reward_std": 0.0470594278012868, - "rewards/perpo_ocr_edit_distance_reward": 0.8584681451320648, + "advantages": -4.543577233562246e-05, + "completion_length": 749.0, + "delta_ref_entropy_loss": 0.052734375, + "delta_ref_ppl": -0.061767578125, + "entropy_loss": -0.0751953125, + "epoch": 0.3648, + "grad_norm": 1.1951054125076834, + "k1_kl": 0.06201171875, + "k3_kl": 0.0419921875, + "kimi_kl": 0.1005859375, + "learning_rate": 3.176e-07, + "loss": 0.0017, + "ppl": 0.038818359375, + "reward": 0.9627139568328857, + "reward_std": 0.0012118825688958168, + "rewards/perpo_ocr_edit_distance_reward": 0.9627140164375305, "step": 1824, "temperature": 0.9 }, { - "advantages": -7.956368686512505e-05, - "completion_length": 607.0, - "delta_ref_entropy_loss": 0.10467529296875, - "delta_ref_ppl": -0.063720703125, - "entropy_loss": -0.156005859375, - "epoch": 0.73, - "grad_norm": 1.4469415796497682, - "k1_kl": 0.063720703125, - "k3_kl": 0.03472900390625, - "kimi_kl": 0.095458984375, - "learning_rate": 1.35e-07, - "loss": 0.0015, - "ppl": 0.0886688232421875, - "reward": 0.9182699918746948, - "reward_std": 0.003667146113002673, - "rewards/perpo_ocr_edit_distance_reward": 0.9182700216770172, + "advantages": 2.1542822651099414e-05, + "completion_length": 1995.0, + "delta_ref_entropy_loss": 0.0133056640625, + "delta_ref_ppl": -0.0184326171875, + "entropy_loss": -0.0390625, + "epoch": 0.365, + "grad_norm": 0.6958299262500581, + "k1_kl": 0.0184326171875, + "k3_kl": 0.01129150390625, + "kimi_kl": 0.0283203125, + "learning_rate": 3.175e-07, + "loss": 0.0004, + "ppl": 0.018310546875, + "reward": 0.9963492155075073, + "reward_std": 0.0010859897593036294, + "rewards/perpo_ocr_edit_distance_reward": 0.9963492155075073, "step": 1825, "temperature": 0.9 }, { - "advantages": -3.063678741455078e-05, - "completion_length": 375.5, - "delta_ref_entropy_loss": 0.0401611328125, - "delta_ref_ppl": -0.0537109375, - "entropy_loss": -0.025634765625, - "epoch": 0.7304, - "grad_norm": 0.38392717882750227, - "k1_kl": 0.053466796875, - "k3_kl": 0.0380859375, - "kimi_kl": 0.167724609375, - "learning_rate": 1.348e-07, - "loss": 0.0016, - "ppl": 0.0122528076171875, - "reward": 0.9988519847393036, - "reward_std": 0.0005061955307610333, - "rewards/perpo_ocr_edit_distance_reward": 0.9988520741462708, + "advantages": -5.3014075092505664e-05, + "completion_length": 921.0, + "delta_ref_entropy_loss": 0.04296875, + "delta_ref_ppl": -0.040283203125, + "entropy_loss": -0.040771484375, + "epoch": 0.3652, + "grad_norm": 0.5617673660816086, + "k1_kl": 0.040283203125, + "k3_kl": 0.0244140625, + "kimi_kl": 0.05078125, + "learning_rate": 3.174e-07, + "loss": 0.001, + "ppl": 0.017578125, + "reward": 0.9899464845657349, + "reward_std": 0.0007033594301901758, + "rewards/perpo_ocr_edit_distance_reward": 0.9899464845657349, "step": 1826, "temperature": 0.9 }, { - "advantages": -8.036622239160351e-05, - "completion_length": 1043.0, - "delta_ref_entropy_loss": 0.0279541015625, - "delta_ref_ppl": -0.02264404296875, - "entropy_loss": -0.02545166015625, - "epoch": 0.7308, - "grad_norm": 1.000688613097175, - "k1_kl": 0.02264404296875, - "k3_kl": 0.021026611328125, - "kimi_kl": 0.032470703125, - "learning_rate": 1.346e-07, - "loss": 0.0009, - "ppl": 0.012786865234375, - "reward": 0.998185783624649, - "reward_std": 0.0006418397533707321, - "rewards/perpo_ocr_edit_distance_reward": 0.9981858730316162, + "advantages": -9.800706720852759e-06, + "completion_length": 336.0, + "delta_ref_entropy_loss": 0.08740234375, + "delta_ref_ppl": -0.1171875, + "entropy_loss": -0.056640625, + "epoch": 0.3654, + "grad_norm": 1.7196892249727045, + "k1_kl": 0.1171875, + "k3_kl": 0.0810546875, + "kimi_kl": 0.33984375, + "learning_rate": 3.173e-07, + "loss": 0.0033, + "ppl": 0.0240478515625, + "reward": 0.9644991755485535, + "reward_std": 0.0025071119889616966, + "rewards/perpo_ocr_edit_distance_reward": 0.9644992351531982, "step": 1827, "temperature": 0.9 }, { - "advantages": -7.635780866621644e-05, - "completion_length": 217.5, - "delta_ref_entropy_loss": 0.048583984375, - "delta_ref_ppl": -0.079833984375, - "entropy_loss": -0.09228515625, - "epoch": 0.7312, - "grad_norm": 5.051633448929327, - "k1_kl": 0.07958984375, - "k3_kl": 0.0584716796875, - "kimi_kl": 0.179443359375, - "learning_rate": 1.3439999999999999e-07, - "loss": 0.0024, - "ppl": 0.05615234375, - "reward": 0.9784133434295654, - "reward_std": 0.00550589626072906, - "rewards/perpo_ocr_edit_distance_reward": 0.978413462638855, + "advantages": -2.014636993408203e-05, + "completion_length": 234.0, + "delta_ref_entropy_loss": 0.1435546875, + "delta_ref_ppl": -0.16796875, + "entropy_loss": -0.0732421875, + "epoch": 0.3656, + "grad_norm": 1.318111203797978, + "k1_kl": 0.16796875, + "k3_kl": 0.115234375, + "kimi_kl": 0.392578125, + "learning_rate": 3.1719999999999996e-07, + "loss": 0.0046, + "ppl": 0.0341796875, + "reward": 0.7889962196350098, + "reward_std": 0.001589312101714313, + "rewards/perpo_ocr_edit_distance_reward": 0.7889963388442993, "step": 1828, "temperature": 0.9 }, { - "advantages": -1.7540796761750244e-06, - "completion_length": 624.0, - "delta_ref_entropy_loss": 0.028076171875, - "delta_ref_ppl": -0.021026611328125, - "entropy_loss": -0.02386474609375, - "epoch": 0.7316, - "grad_norm": 0.49015537277807575, - "k1_kl": 0.02099609375, - "k3_kl": 0.0112152099609375, - "kimi_kl": 0.0310211181640625, - "learning_rate": 1.342e-07, - "loss": 0.0005, - "ppl": 0.0099945068359375, - "reward": 0.9974911510944366, - "reward_std": 0.00238541467115283, - "rewards/perpo_ocr_edit_distance_reward": 0.997491180896759, + "advantages": -3.705705967149697e-05, + "completion_length": 487.0, + "delta_ref_entropy_loss": 0.0517578125, + "delta_ref_ppl": -0.07177734375, + "entropy_loss": -0.035400390625, + "epoch": 0.3658, + "grad_norm": 0.6942862107314308, + "k1_kl": 0.07177734375, + "k3_kl": 0.04931640625, + "kimi_kl": 0.16796875, + "learning_rate": 3.171e-07, + "loss": 0.002, + "ppl": 0.0135498046875, + "reward": 0.9643839597702026, + "reward_std": 0.0010488773696124554, + "rewards/perpo_ocr_edit_distance_reward": 0.9643839597702026, "step": 1829, "temperature": 0.9 }, { - "advantages": -8.991787060708134e-06, - "completion_length": 305.5, - "delta_ref_entropy_loss": 0.04248046875, - "delta_ref_ppl": -0.03106689453125, - "entropy_loss": -0.03814697265625, - "epoch": 0.732, - "grad_norm": 0.9608855105266045, - "k1_kl": 0.03106689453125, - "k3_kl": 0.0172119140625, - "kimi_kl": 0.0380859375, - "learning_rate": 1.34e-07, - "loss": 0.0007, - "ppl": 0.021331787109375, - "reward": 0.9954698979854584, - "reward_std": 0.0013885994267184287, - "rewards/perpo_ocr_edit_distance_reward": 0.9954698979854584, + "advantages": -0.0001284394966205582, + "completion_length": 1233.0, + "delta_ref_entropy_loss": 0.0537109375, + "delta_ref_ppl": -0.056396484375, + "entropy_loss": -0.033935546875, + "epoch": 0.366, + "grad_norm": 0.5478875327192195, + "k1_kl": 0.05615234375, + "k3_kl": 0.029296875, + "kimi_kl": 0.07470703125, + "learning_rate": 3.17e-07, + "loss": 0.0013, + "ppl": 0.01409912109375, + "reward": 0.9944023489952087, + "reward_std": 0.0004965446423739195, + "rewards/perpo_ocr_edit_distance_reward": 0.9944024682044983, "step": 1830, "temperature": 0.9 }, { - "advantages": -2.644743290147744e-05, - "completion_length": 459.0, - "delta_ref_entropy_loss": 0.03680419921875, - "delta_ref_ppl": -0.0286865234375, - "entropy_loss": -0.04547119140625, - "epoch": 0.7324, - "grad_norm": 0.5563386762208965, - "k1_kl": 0.02880859375, - "k3_kl": 0.01715087890625, - "kimi_kl": 0.04248046875, - "learning_rate": 1.338e-07, - "loss": 0.0007, - "ppl": 0.0245361328125, - "reward": 0.9944553673267365, - "reward_std": 0.0005939902039244771, - "rewards/perpo_ocr_edit_distance_reward": 0.9944553673267365, + "advantages": -0.00011050701868953183, + "completion_length": 700.0, + "delta_ref_entropy_loss": 0.05517578125, + "delta_ref_ppl": -0.045166015625, + "entropy_loss": -0.01904296875, + "epoch": 0.3662, + "grad_norm": 0.8933855985827219, + "k1_kl": 0.045166015625, + "k3_kl": 0.0225830078125, + "kimi_kl": 0.053955078125, + "learning_rate": 3.169e-07, + "loss": 0.001, + "ppl": 0.005767822265625, + "reward": 0.9857687950134277, + "reward_std": 0.0004392504633869976, + "rewards/perpo_ocr_edit_distance_reward": 0.9857689142227173, "step": 1831, "temperature": 0.9 }, { - "advantages": -4.4639624320552684e-05, - "completion_length": 710.5, - "delta_ref_entropy_loss": 0.0478515625, - "delta_ref_ppl": -0.0404052734375, - "entropy_loss": -0.0965576171875, - "epoch": 0.7328, - "grad_norm": 1.138760891144222, - "k1_kl": 0.0401611328125, - "k3_kl": 0.0238037109375, - "kimi_kl": 0.0572509765625, - "learning_rate": 1.3359999999999998e-07, + "advantages": -7.212162017822266e-05, + "completion_length": 736.0, + "delta_ref_entropy_loss": 0.015869140625, + "delta_ref_ppl": -0.03369140625, + "entropy_loss": -0.020263671875, + "epoch": 0.3664, + "grad_norm": 0.3008015031828856, + "k1_kl": 0.03369140625, + "k3_kl": 0.0235595703125, + "kimi_kl": 0.0703125, + "learning_rate": 3.1680000000000003e-07, "loss": 0.001, - "ppl": 0.052581787109375, - "reward": 0.9613693356513977, - "reward_std": 0.005547756372834556, - "rewards/perpo_ocr_edit_distance_reward": 0.9613694250583649, + "ppl": 0.007293701171875, + "reward": 0.9687727093696594, + "reward_std": 0.0007262229919433594, + "rewards/perpo_ocr_edit_distance_reward": 0.9687727689743042, "step": 1832, "temperature": 0.9 }, { - "advantages": -2.5306429705551636e-05, - "completion_length": 248.5, - "delta_ref_entropy_loss": 0.07861328125, - "delta_ref_ppl": -0.123046875, - "entropy_loss": -0.098876953125, - "epoch": 0.7332, - "grad_norm": 2.2180021513337556, - "k1_kl": 0.122802734375, - "k3_kl": 0.0882568359375, - "kimi_kl": 0.38427734375, - "learning_rate": 1.334e-07, - "loss": 0.0035, - "ppl": 0.0474853515625, - "reward": 0.9080935716629028, - "reward_std": 0.08275662071537226, - "rewards/perpo_ocr_edit_distance_reward": 0.90809366106987, + "advantages": -2.043587983280304e-07, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.03515625, + "delta_ref_ppl": -0.0869140625, + "entropy_loss": -0.275390625, + "epoch": 0.3666, + "grad_norm": 1.6507941875329553, + "k1_kl": 0.0869140625, + "k3_kl": 0.060302734375, + "kimi_kl": 0.1591796875, + "learning_rate": 3.1669999999999997e-07, + "loss": 0.0024, + "ppl": 0.1513671875, + "reward": 0.6919714212417603, + "reward_std": 0.193868950009346, + "rewards/perpo_ocr_edit_distance_reward": 0.6919714212417603, "step": 1833, "temperature": 0.9 }, { - "advantages": -1.797505788658782e-05, - "completion_length": 359.5, - "delta_ref_entropy_loss": 0.063232421875, - "delta_ref_ppl": -0.105712890625, - "entropy_loss": -0.107666015625, - "epoch": 0.7336, - "grad_norm": 1.3666315639726854, - "k1_kl": 0.10546875, - "k3_kl": 0.0767822265625, - "kimi_kl": 0.24951171875, - "learning_rate": 1.332e-07, - "loss": 0.0031, - "ppl": 0.059814453125, - "reward": 0.7855993807315826, - "reward_std": 0.03359688649652526, - "rewards/perpo_ocr_edit_distance_reward": 0.7855994701385498, + "advantages": -3.123283386230469e-05, + "completion_length": 1170.0, + "delta_ref_entropy_loss": 0.0830078125, + "delta_ref_ppl": -0.09326171875, + "entropy_loss": -0.04638671875, + "epoch": 0.3668, + "grad_norm": 0.659354313864706, + "k1_kl": 0.09326171875, + "k3_kl": 0.049072265625, + "kimi_kl": 0.1240234375, + "learning_rate": 3.1659999999999996e-07, + "loss": 0.002, + "ppl": 0.0184326171875, + "reward": 0.9830120205879211, + "reward_std": 0.000990052823908627, + "rewards/perpo_ocr_edit_distance_reward": 0.9830120801925659, "step": 1834, "temperature": 0.9 }, { - "advantages": -1.813684320950415e-05, - "completion_length": 506.5, - "delta_ref_entropy_loss": 0.05322265625, - "delta_ref_ppl": -0.0513916015625, - "entropy_loss": -0.028564453125, - "epoch": 0.734, - "grad_norm": 0.5798906589798002, - "k1_kl": 0.0513916015625, - "k3_kl": 0.03533935546875, - "kimi_kl": 0.137939453125, - "learning_rate": 1.33e-07, - "loss": 0.0014, - "ppl": 0.015350341796875, - "reward": 0.9945214986801147, - "reward_std": 0.0015345430001616478, - "rewards/perpo_ocr_edit_distance_reward": 0.9945215582847595, + "advantages": -7.075923349475488e-06, + "completion_length": 1304.0, + "delta_ref_entropy_loss": 0.053955078125, + "delta_ref_ppl": -0.045654296875, + "entropy_loss": -0.08251953125, + "epoch": 0.367, + "grad_norm": 1.3128544534981146, + "k1_kl": 0.045654296875, + "k3_kl": 0.029052734375, + "kimi_kl": 0.06591796875, + "learning_rate": 3.165e-07, + "loss": 0.0012, + "ppl": 0.036865234375, + "reward": 0.8630843758583069, + "reward_std": 0.007121817208826542, + "rewards/perpo_ocr_edit_distance_reward": 0.8630843758583069, "step": 1835, "temperature": 0.9 }, { - "advantages": -3.0453717954515014e-05, - "completion_length": 927.0, - "delta_ref_entropy_loss": 0.0360107421875, - "delta_ref_ppl": -0.0357666015625, - "entropy_loss": -0.02740478515625, - "epoch": 0.7344, - "grad_norm": 0.438977479842526, - "k1_kl": 0.03594970703125, - "k3_kl": 0.02105712890625, - "kimi_kl": 0.06463623046875, - "learning_rate": 1.328e-07, - "loss": 0.0009, - "ppl": 0.0128173828125, - "reward": 0.9985293745994568, - "reward_std": 0.0005377347697503865, - "rewards/perpo_ocr_edit_distance_reward": 0.998529464006424, + "advantages": -2.4846622181939892e-05, + "completion_length": 467.0, + "delta_ref_entropy_loss": 0.0859375, + "delta_ref_ppl": -0.078125, + "entropy_loss": -0.03515625, + "epoch": 0.3672, + "grad_norm": 0.7852787085370302, + "k1_kl": 0.078125, + "k3_kl": 0.0458984375, + "kimi_kl": 0.140625, + "learning_rate": 3.164e-07, + "loss": 0.0019, + "ppl": 0.01385498046875, + "reward": 0.9917779564857483, + "reward_std": 0.002981649711728096, + "rewards/perpo_ocr_edit_distance_reward": 0.9917780756950378, "step": 1836, "temperature": 0.9 }, { - "advantages": -5.3067295766595635e-05, - "completion_length": 933.0, - "delta_ref_entropy_loss": 0.079833984375, - "delta_ref_ppl": -0.051483154296875, - "entropy_loss": -0.0806884765625, - "epoch": 0.7348, - "grad_norm": 0.6395926189260562, - "k1_kl": 0.05169677734375, - "k3_kl": 0.029266357421875, - "kimi_kl": 0.07159423828125, - "learning_rate": 1.3259999999999998e-07, - "loss": 0.0012, - "ppl": 0.0413818359375, - "reward": 0.9273010790348053, - "reward_std": 0.012245653357240371, - "rewards/perpo_ocr_edit_distance_reward": 0.9273011684417725, + "advantages": -4.4826952944276854e-05, + "completion_length": 427.0, + "delta_ref_entropy_loss": 0.07421875, + "delta_ref_ppl": -0.126953125, + "entropy_loss": -0.033935546875, + "epoch": 0.3674, + "grad_norm": 0.5031265804542933, + "k1_kl": 0.1259765625, + "k3_kl": 0.0927734375, + "kimi_kl": 0.36328125, + "learning_rate": 3.163e-07, + "loss": 0.0037, + "ppl": 0.01312255859375, + "reward": 0.9941219687461853, + "reward_std": 0.0012293810723349452, + "rewards/perpo_ocr_edit_distance_reward": 0.9941220879554749, "step": 1837, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 279.0, - "delta_ref_entropy_loss": 0.0330810546875, - "delta_ref_ppl": -0.033203125, - "entropy_loss": -0.01611328125, - "epoch": 0.7352, - "grad_norm": 0.020503522942204703, - "k1_kl": 0.0333251953125, - "k3_kl": 0.01953125, - "kimi_kl": 0.0474853515625, - "learning_rate": 1.324e-07, - "loss": 0.0011, - "ppl": 0.00768280029296875, - "reward": 0.9995429217815399, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9995429813861847, + "advantages": 1.647642784519121e-05, + "completion_length": 555.0, + "delta_ref_entropy_loss": 0.059326171875, + "delta_ref_ppl": -0.07861328125, + "entropy_loss": -0.027587890625, + "epoch": 0.3676, + "grad_norm": 0.37492959560645706, + "k1_kl": 0.07861328125, + "k3_kl": 0.05078125, + "kimi_kl": 0.189453125, + "learning_rate": 3.162e-07, + "loss": 0.002, + "ppl": 0.009521484375, + "reward": 0.9979919195175171, + "reward_std": 0.0004174317582510412, + "rewards/perpo_ocr_edit_distance_reward": 0.9979918599128723, "step": 1838, "temperature": 0.9 }, { - "advantages": -4.3247430767223705e-05, - "completion_length": 350.0, - "delta_ref_entropy_loss": 0.109375, - "delta_ref_ppl": -0.0867919921875, - "entropy_loss": -0.079345703125, - "epoch": 0.7356, - "grad_norm": 1.054735515742451, - "k1_kl": 0.0867919921875, - "k3_kl": 0.048828125, - "kimi_kl": 0.1416015625, - "learning_rate": 1.322e-07, - "loss": 0.002, - "ppl": 0.040283203125, - "reward": 0.9833684861660004, - "reward_std": 0.0022932779393158853, - "rewards/perpo_ocr_edit_distance_reward": 0.9833685755729675, + "advantages": -1.3308866073202807e-05, + "completion_length": 134.0, + "delta_ref_entropy_loss": 0.1103515625, + "delta_ref_ppl": -0.3203125, + "entropy_loss": -0.08251953125, + "epoch": 0.3678, + "grad_norm": 1.1004677921678023, + "k1_kl": 0.3203125, + "k3_kl": 0.26171875, + "kimi_kl": 1.3203125, + "learning_rate": 3.161e-07, + "loss": 0.0105, + "ppl": 0.030029296875, + "reward": 0.9859660863876343, + "reward_std": 0.0024581861216574907, + "rewards/perpo_ocr_edit_distance_reward": 0.9859662055969238, "step": 1839, "temperature": 0.9 }, { - "advantages": -1.7540796761750244e-06, - "completion_length": 601.0, - "delta_ref_entropy_loss": 0.052490234375, - "delta_ref_ppl": -0.0439453125, - "entropy_loss": -0.048095703125, - "epoch": 0.736, - "grad_norm": 0.6927151096335575, - "k1_kl": 0.0439453125, - "k3_kl": 0.0264892578125, - "kimi_kl": 0.08740234375, - "learning_rate": 1.32e-07, - "loss": 0.0011, - "ppl": 0.0262603759765625, - "reward": 0.9274786114692688, - "reward_std": 0.006025259383022785, - "rewards/perpo_ocr_edit_distance_reward": 0.9274786412715912, + "advantages": -2.384185791015625e-07, + "completion_length": 443.0, + "delta_ref_entropy_loss": 0.04931640625, + "delta_ref_ppl": -0.10302734375, + "entropy_loss": -0.059326171875, + "epoch": 0.368, + "grad_norm": 1.2420285459566518, + "k1_kl": 0.10302734375, + "k3_kl": 0.0771484375, + "kimi_kl": 0.265625, + "learning_rate": 3.1599999999999997e-07, + "loss": 0.0031, + "ppl": 0.0235595703125, + "reward": 0.8544146418571472, + "reward_std": 0.11236623674631119, + "rewards/perpo_ocr_edit_distance_reward": 0.854414701461792, "step": 1840, "temperature": 0.9 }, { - "advantages": -2.0521029000519775e-06, - "completion_length": 295.0, - "delta_ref_entropy_loss": 0.03155517578125, - "delta_ref_ppl": -0.0875244140625, - "entropy_loss": -0.0225830078125, - "epoch": 0.7364, - "grad_norm": 0.42146303992180284, - "k1_kl": 0.0870361328125, - "k3_kl": 0.067138671875, - "kimi_kl": 0.2451171875, - "learning_rate": 1.318e-07, - "loss": 0.0027, - "ppl": 0.0123443603515625, - "reward": 0.9966866672039032, - "reward_std": 0.0009926017373800278, - "rewards/perpo_ocr_edit_distance_reward": 0.9966866374015808, + "advantages": -2.002716064453125e-05, + "completion_length": 687.0, + "delta_ref_entropy_loss": 0.052734375, + "delta_ref_ppl": -0.068359375, + "entropy_loss": -0.047119140625, + "epoch": 0.3682, + "grad_norm": 0.5147896580784639, + "k1_kl": 0.068359375, + "k3_kl": 0.043701171875, + "kimi_kl": 0.16015625, + "learning_rate": 3.159e-07, + "loss": 0.0018, + "ppl": 0.0218505859375, + "reward": 0.9933673739433289, + "reward_std": 0.0007499310304410756, + "rewards/perpo_ocr_edit_distance_reward": 0.9933673739433289, "step": 1841, "temperature": 0.9 }, { - "advantages": -3.0653817795212035e-07, - "completion_length": 437.5, - "delta_ref_entropy_loss": 0.0401611328125, - "delta_ref_ppl": -0.0352783203125, - "entropy_loss": -0.05517578125, - "epoch": 0.7368, - "grad_norm": 0.744302768723546, - "k1_kl": 0.03515625, - "k3_kl": 0.02044677734375, - "kimi_kl": 0.0543212890625, - "learning_rate": 1.316e-07, - "loss": 0.0008, - "ppl": 0.0252685546875, - "reward": 0.9082345962524414, - "reward_std": 0.04563139204401523, - "rewards/perpo_ocr_edit_distance_reward": 0.9082345962524414, + "advantages": -3.848757387459045e-06, + "completion_length": 803.0, + "delta_ref_entropy_loss": 0.09619140625, + "delta_ref_ppl": -0.09912109375, + "entropy_loss": -0.22265625, + "epoch": 0.3684, + "grad_norm": 1.4625781998354521, + "k1_kl": 0.0986328125, + "k3_kl": 0.060791015625, + "kimi_kl": 0.18359375, + "learning_rate": 3.158e-07, + "loss": 0.0024, + "ppl": 0.1181640625, + "reward": 0.6337372660636902, + "reward_std": 0.006551329046487808, + "rewards/perpo_ocr_edit_distance_reward": 0.633737325668335, "step": 1842, "temperature": 0.9 }, { - "advantages": -5.065543700766284e-05, - "completion_length": 1035.5, - "delta_ref_entropy_loss": 0.017822265625, - "delta_ref_ppl": -0.01995849609375, - "entropy_loss": -0.0164794921875, - "epoch": 0.7372, - "grad_norm": 0.35546595793119895, - "k1_kl": 0.01995849609375, - "k3_kl": 0.0130615234375, - "kimi_kl": 0.042236328125, - "learning_rate": 1.3139999999999997e-07, - "loss": 0.0006, - "ppl": 0.006195068359375, - "reward": 0.9978407919406891, - "reward_std": 0.0033678443942335434, - "rewards/perpo_ocr_edit_distance_reward": 0.9978408217430115, + "advantages": -5.510875416803174e-05, + "completion_length": 1679.0, + "delta_ref_entropy_loss": 0.0233154296875, + "delta_ref_ppl": -0.046630859375, + "entropy_loss": -0.05908203125, + "epoch": 0.3686, + "grad_norm": 3.8254902951789984, + "k1_kl": 0.046630859375, + "k3_kl": 0.048095703125, + "kimi_kl": 0.099609375, + "learning_rate": 3.1569999999999995e-07, + "loss": 0.002, + "ppl": 0.03125, + "reward": 0.9874111413955688, + "reward_std": 0.001290691434405744, + "rewards/perpo_ocr_edit_distance_reward": 0.9874112606048584, "step": 1843, "temperature": 0.9 }, { - "advantages": -0.00010658162500476465, - "completion_length": 926.0, - "delta_ref_entropy_loss": 0.0501708984375, - "delta_ref_ppl": -0.03411865234375, - "entropy_loss": -0.0404052734375, - "epoch": 0.7376, - "grad_norm": 0.6998537914876519, - "k1_kl": 0.03411865234375, - "k3_kl": 0.0166015625, - "kimi_kl": 0.0322113037109375, - "learning_rate": 1.312e-07, - "loss": 0.0008, - "ppl": 0.020721435546875, - "reward": 0.9497824013233185, - "reward_std": 0.0008519227267242968, - "rewards/perpo_ocr_edit_distance_reward": 0.9497824907302856, + "advantages": -1.437323498976184e-05, + "completion_length": 360.0, + "delta_ref_entropy_loss": 0.09423828125, + "delta_ref_ppl": -0.10107421875, + "entropy_loss": -0.08056640625, + "epoch": 0.3688, + "grad_norm": 0.7826535156330061, + "k1_kl": 0.10107421875, + "k3_kl": 0.06494140625, + "kimi_kl": 0.236328125, + "learning_rate": 3.156e-07, + "loss": 0.0026, + "ppl": 0.0283203125, + "reward": 0.9149081110954285, + "reward_std": 0.004633691627532244, + "rewards/perpo_ocr_edit_distance_reward": 0.9149081707000732, "step": 1844, "temperature": 0.9 }, { - "advantages": -0.00013202429363445845, - "completion_length": 716.5, - "delta_ref_entropy_loss": 0.024993896484375, - "delta_ref_ppl": -0.0149993896484375, - "entropy_loss": -0.02081298828125, - "epoch": 0.738, - "grad_norm": 0.38773651650916025, - "k1_kl": 0.0150299072265625, - "k3_kl": 0.0093994140625, - "kimi_kl": 0.02239990234375, - "learning_rate": 1.31e-07, - "loss": 0.0005, - "ppl": 0.012542724609375, - "reward": 0.997813493013382, - "reward_std": 0.0001985787966987118, - "rewards/perpo_ocr_edit_distance_reward": 0.9978135228157043, + "advantages": -5.713531209039502e-05, + "completion_length": 400.0, + "delta_ref_entropy_loss": 0.055908203125, + "delta_ref_ppl": -0.08740234375, + "entropy_loss": -0.041259765625, + "epoch": 0.369, + "grad_norm": 0.6336015977190721, + "k1_kl": 0.0869140625, + "k3_kl": 0.056884765625, + "kimi_kl": 0.2138671875, + "learning_rate": 3.155e-07, + "loss": 0.0023, + "ppl": 0.01519775390625, + "reward": 0.9980553984642029, + "reward_std": 0.001390172285027802, + "rewards/perpo_ocr_edit_distance_reward": 0.9980555176734924, "step": 1845, "temperature": 0.9 }, { - "advantages": -0.00031192813639790984, - "completion_length": 720.5, - "delta_ref_entropy_loss": 0.045654296875, - "delta_ref_ppl": -0.04168701171875, - "entropy_loss": -0.07318115234375, - "epoch": 0.7384, - "grad_norm": 1419.6440134004722, - "k1_kl": 0.04150390625, - "k3_kl": 0.695556640625, - "kimi_kl": 0.1285400390625, - "learning_rate": 1.308e-07, - "loss": 0.0281, - "ppl": 0.042327880859375, - "reward": 0.9853411316871643, - "reward_std": 0.0017862451495602727, - "rewards/perpo_ocr_edit_distance_reward": 0.9853412210941315, + "advantages": 0.0, + "completion_length": 1009.0, + "delta_ref_entropy_loss": 0.06591796875, + "delta_ref_ppl": -0.0673828125, + "entropy_loss": -0.06298828125, + "epoch": 0.3692, + "grad_norm": 0.8452516276411152, + "k1_kl": 0.0673828125, + "k3_kl": 0.040283203125, + "kimi_kl": 0.10400390625, + "learning_rate": 3.154e-07, + "loss": 0.0016, + "ppl": 0.0296630859375, + "reward": 0.8742951154708862, + "reward_std": 0.0048532746732234955, + "rewards/perpo_ocr_edit_distance_reward": 0.8742951154708862, "step": 1846, "temperature": 0.9 }, { - "advantages": -4.090155880476232e-05, - "completion_length": 1230.0, - "delta_ref_entropy_loss": 0.03741455078125, - "delta_ref_ppl": -0.0283203125, - "entropy_loss": -0.0538330078125, - "epoch": 0.7388, - "grad_norm": 1.0469193683800737, - "k1_kl": 0.028167724609375, - "k3_kl": 0.02471923828125, - "kimi_kl": 0.04052734375, - "learning_rate": 1.306e-07, - "loss": 0.001, - "ppl": 0.0311279296875, - "reward": 0.8867986798286438, - "reward_std": 0.005446828581625596, - "rewards/perpo_ocr_edit_distance_reward": 0.8867987394332886, + "advantages": -2.4199487597797997e-05, + "completion_length": 203.0, + "delta_ref_entropy_loss": 0.1162109375, + "delta_ref_ppl": -0.1318359375, + "entropy_loss": -0.0341796875, + "epoch": 0.3694, + "grad_norm": 0.9050420639992295, + "k1_kl": 0.1328125, + "k3_kl": 0.08984375, + "kimi_kl": 0.31640625, + "learning_rate": 3.153e-07, + "loss": 0.0036, + "ppl": 0.01043701171875, + "reward": 0.976665735244751, + "reward_std": 0.0016588604776188731, + "rewards/perpo_ocr_edit_distance_reward": 0.9766657948493958, "step": 1847, "temperature": 0.9 }, { - "advantages": -2.6030201638604922e-05, - "completion_length": 452.0, - "delta_ref_entropy_loss": 0.107177734375, - "delta_ref_ppl": -0.067626953125, - "entropy_loss": -0.076171875, - "epoch": 0.7392, - "grad_norm": 0.8406263313654196, - "k1_kl": 0.06787109375, - "k3_kl": 0.035400390625, - "kimi_kl": 0.0882568359375, - "learning_rate": 1.3039999999999998e-07, - "loss": 0.0014, - "ppl": 0.040771484375, - "reward": 0.6229266822338104, - "reward_std": 0.003094519372098148, - "rewards/perpo_ocr_edit_distance_reward": 0.6229267716407776, + "advantages": -1.857961979112588e-05, + "completion_length": 242.0, + "delta_ref_entropy_loss": 0.08642578125, + "delta_ref_ppl": -0.2236328125, + "entropy_loss": -0.0595703125, + "epoch": 0.3696, + "grad_norm": 1.6795938554005059, + "k1_kl": 0.2236328125, + "k3_kl": 0.173828125, + "kimi_kl": 0.765625, + "learning_rate": 3.1519999999999996e-07, + "loss": 0.007, + "ppl": 0.021240234375, + "reward": 0.9863791465759277, + "reward_std": 0.003565703984349966, + "rewards/perpo_ocr_edit_distance_reward": 0.9863792657852173, "step": 1848, "temperature": 0.9 }, { - "advantages": -3.377880511834519e-05, - "completion_length": 609.0, - "delta_ref_entropy_loss": 0.02801513671875, - "delta_ref_ppl": -0.039306640625, - "entropy_loss": -0.041015625, - "epoch": 0.7396, - "grad_norm": 1.9090235657664052, - "k1_kl": 0.03948974609375, - "k3_kl": 0.027587890625, - "kimi_kl": 0.09906005859375, - "learning_rate": 1.3020000000000001e-07, - "loss": 0.0011, - "ppl": 0.020538330078125, - "reward": 0.9871274530887604, - "reward_std": 0.0026068525621667504, - "rewards/perpo_ocr_edit_distance_reward": 0.9871275424957275, + "advantages": -4.6713012125110254e-05, + "completion_length": 189.0, + "delta_ref_entropy_loss": 0.05419921875, + "delta_ref_ppl": -0.193359375, + "entropy_loss": -0.037353515625, + "epoch": 0.3698, + "grad_norm": 0.6155822342314051, + "k1_kl": 0.1923828125, + "k3_kl": 0.150390625, + "kimi_kl": 0.71875, + "learning_rate": 3.1509999999999996e-07, + "loss": 0.0061, + "ppl": 0.01312255859375, + "reward": 0.8942393064498901, + "reward_std": 0.0017230142839252949, + "rewards/perpo_ocr_edit_distance_reward": 0.8942394256591797, "step": 1849, "temperature": 0.9 }, { - "advantages": -0.000307227884150052, - "completion_length": 637.5, - "delta_ref_entropy_loss": 0.048095703125, - "delta_ref_ppl": -0.0413818359375, - "entropy_loss": -0.02740478515625, - "epoch": 0.74, - "grad_norm": 0.37508357225989397, - "k1_kl": 0.0413818359375, - "k3_kl": 0.02557373046875, - "kimi_kl": 0.10205078125, - "learning_rate": 1.3e-07, - "loss": 0.0013, - "ppl": 0.01239013671875, - "reward": 0.987925797700882, - "reward_std": 0.001336555345915258, - "rewards/perpo_ocr_edit_distance_reward": 0.9879258871078491, + "advantages": -9.196145356327179e-07, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.01123046875, + "delta_ref_ppl": -0.0322265625, + "entropy_loss": -0.10009765625, + "epoch": 0.37, + "grad_norm": 1.3887864467861417, + "k1_kl": 0.0322265625, + "k3_kl": 0.02685546875, + "kimi_kl": 0.052734375, + "learning_rate": 3.15e-07, + "loss": 0.0011, + "ppl": 0.060791015625, + "reward": 0.9329827427864075, + "reward_std": 0.009468413889408112, + "rewards/perpo_ocr_edit_distance_reward": 0.9329828023910522, "step": 1850, "temperature": 0.9 }, { - "advantages": 1.3206687199840417e-05, - "completion_length": 1090.0, - "delta_ref_entropy_loss": 0.0090484619140625, - "delta_ref_ppl": -0.018280029296875, - "entropy_loss": -0.060791015625, - "epoch": 0.7404, - "grad_norm": 0.7068223928924448, - "k1_kl": 0.018280029296875, - "k3_kl": 0.0162353515625, - "kimi_kl": 0.0391845703125, - "learning_rate": 1.2979999999999998e-07, - "loss": 0.0006, - "ppl": 0.026580810546875, - "reward": 0.8308094441890717, - "reward_std": 0.12635665165726095, - "rewards/perpo_ocr_edit_distance_reward": 0.8308095037937164, + "advantages": -1.481601202613092e-06, + "completion_length": 800.0, + "delta_ref_entropy_loss": 0.1591796875, + "delta_ref_ppl": -0.11376953125, + "entropy_loss": -0.291015625, + "epoch": 0.3702, + "grad_norm": 5.98948813689194, + "k1_kl": 0.11328125, + "k3_kl": 0.0625, + "kimi_kl": 0.1318359375, + "learning_rate": 3.149e-07, + "loss": 0.0025, + "ppl": 0.158203125, + "reward": 0.7741384506225586, + "reward_std": 0.04507104679942131, + "rewards/perpo_ocr_edit_distance_reward": 0.7741385102272034, "step": 1851, "temperature": 0.9 }, { - "advantages": -5.091939783596899e-06, - "completion_length": 726.0, - "delta_ref_entropy_loss": 0.06524658203125, - "delta_ref_ppl": -0.03656005859375, - "entropy_loss": -0.125732421875, - "epoch": 0.7408, - "grad_norm": 3.48991839654153, - "k1_kl": 0.03643798828125, - "k3_kl": 0.02142333984375, - "kimi_kl": 0.04833984375, - "learning_rate": 1.296e-07, - "loss": 0.0009, - "ppl": 0.07276153564453125, - "reward": 0.9531672298908234, - "reward_std": 0.0020439426880329847, - "rewards/perpo_ocr_edit_distance_reward": 0.9531672298908234, + "advantages": -3.661428490886465e-05, + "completion_length": 458.0, + "delta_ref_entropy_loss": 0.10205078125, + "delta_ref_ppl": -0.095703125, + "entropy_loss": -0.08349609375, + "epoch": 0.3704, + "grad_norm": 1.3958910376987743, + "k1_kl": 0.095703125, + "k3_kl": 0.053466796875, + "kimi_kl": 0.1572265625, + "learning_rate": 3.148e-07, + "loss": 0.0022, + "ppl": 0.037353515625, + "reward": 0.9617108702659607, + "reward_std": 0.002688922919332981, + "rewards/perpo_ocr_edit_distance_reward": 0.9617109298706055, "step": 1852, "temperature": 0.9 }, { - "advantages": -4.234484367771074e-05, - "completion_length": 403.0, - "delta_ref_entropy_loss": 0.05633544921875, - "delta_ref_ppl": -0.062835693359375, - "entropy_loss": -0.062530517578125, - "epoch": 0.7412, - "grad_norm": 1.4553443652148057, - "k1_kl": 0.062835693359375, - "k3_kl": 0.0355987548828125, - "kimi_kl": 0.0780029296875, - "learning_rate": 1.2939999999999998e-07, - "loss": 0.0015, - "ppl": 0.0342559814453125, - "reward": 0.9976869821548462, - "reward_std": 0.0020828600972890854, - "rewards/perpo_ocr_edit_distance_reward": 0.9976870119571686, + "advantages": -8.71930842549773e-06, + "completion_length": 1243.0, + "delta_ref_entropy_loss": 0.05029296875, + "delta_ref_ppl": -0.0439453125, + "entropy_loss": -0.10888671875, + "epoch": 0.3706, + "grad_norm": 1.8915293691547652, + "k1_kl": 0.0439453125, + "k3_kl": 0.031982421875, + "kimi_kl": 0.06396484375, + "learning_rate": 3.147e-07, + "loss": 0.0013, + "ppl": 0.059814453125, + "reward": 0.8167698979377747, + "reward_std": 0.0038037023041397333, + "rewards/perpo_ocr_edit_distance_reward": 0.8167698979377747, "step": 1853, "temperature": 0.9 }, { - "advantages": -0.00011347234874392598, - "completion_length": 854.0, - "delta_ref_entropy_loss": 0.071533203125, - "delta_ref_ppl": -0.06024169921875, - "entropy_loss": -0.09375, - "epoch": 0.7416, - "grad_norm": 2.238538561027868, - "k1_kl": 0.06024169921875, - "k3_kl": 0.04058837890625, - "kimi_kl": 0.1082763671875, - "learning_rate": 1.292e-07, + "advantages": -0.00012380737462081015, + "completion_length": 857.0, + "delta_ref_entropy_loss": 0.06103515625, + "delta_ref_ppl": -0.06884765625, + "entropy_loss": -0.03759765625, + "epoch": 0.3708, + "grad_norm": 0.3217419965940318, + "k1_kl": 0.06884765625, + "k3_kl": 0.0390625, + "kimi_kl": 0.11279296875, + "learning_rate": 3.1459999999999997e-07, "loss": 0.0017, - "ppl": 0.046875, - "reward": 0.978020578622818, - "reward_std": 0.022679150919429958, - "rewards/perpo_ocr_edit_distance_reward": 0.9780206680297852, + "ppl": 0.01263427734375, + "reward": 0.9985412359237671, + "reward_std": 0.0003813958610408008, + "rewards/perpo_ocr_edit_distance_reward": 0.9985413551330566, "step": 1854, "temperature": 0.9 }, { - "advantages": -3.653977910289541e-05, - "completion_length": 769.5, - "delta_ref_entropy_loss": 0.03765869140625, - "delta_ref_ppl": -0.023468017578125, - "entropy_loss": -0.0338134765625, - "epoch": 0.742, - "grad_norm": 0.43234964127436076, - "k1_kl": 0.023468017578125, - "k3_kl": 0.014129638671875, - "kimi_kl": 0.03472900390625, - "learning_rate": 1.29e-07, - "loss": 0.0006, - "ppl": 0.017974853515625, - "reward": 0.9629979729652405, - "reward_std": 0.0005909397732466459, - "rewards/perpo_ocr_edit_distance_reward": 0.9629980027675629, - "step": 1855, - "temperature": 0.9 + "advantages": -8.310590783366933e-05, + "completion_length": 442.0, + "delta_ref_entropy_loss": 0.057861328125, + "delta_ref_ppl": -0.08935546875, + "entropy_loss": -0.04443359375, + "epoch": 0.371, + "grad_norm": 0.7235079566223162, + "k1_kl": 0.08935546875, + "k3_kl": 0.0634765625, + "kimi_kl": 0.283203125, + "learning_rate": 3.1449999999999996e-07, + "loss": 0.0026, + "ppl": 0.01318359375, + "reward": 0.9946178197860718, + "reward_std": 0.0008221370517276227, + "rewards/perpo_ocr_edit_distance_reward": 0.9946179389953613, + "step": 1855, + "temperature": 0.9 }, { - "advantages": -0.00013893418326915707, - "completion_length": 571.0, - "delta_ref_entropy_loss": 0.025634765625, - "delta_ref_ppl": -0.015594482421875, - "entropy_loss": -0.03485107421875, - "epoch": 0.7424, - "grad_norm": 0.6688484340120071, - "k1_kl": 0.015625, - "k3_kl": 0.0079803466796875, - "kimi_kl": 0.015594482421875, - "learning_rate": 1.288e-07, - "loss": 0.0005, - "ppl": 0.017578125, - "reward": 0.9763106107711792, - "reward_std": 0.0019718042603926733, - "rewards/perpo_ocr_edit_distance_reward": 0.9763107597827911, + "advantages": 1.610177059774287e-05, + "completion_length": 456.0, + "delta_ref_entropy_loss": 0.052734375, + "delta_ref_ppl": -0.0771484375, + "entropy_loss": -0.034423828125, + "epoch": 0.3712, + "grad_norm": 1.5673593638082564, + "k1_kl": 0.0771484375, + "k3_kl": 0.048583984375, + "kimi_kl": 0.14453125, + "learning_rate": 3.144e-07, + "loss": 0.0019, + "ppl": 0.015869140625, + "reward": 0.7812899351119995, + "reward_std": 0.0009568908717483282, + "rewards/perpo_ocr_edit_distance_reward": 0.7812899351119995, "step": 1856, "temperature": 0.9 }, { - "advantages": -4.276633444533218e-05, - "completion_length": 526.0, - "delta_ref_entropy_loss": 0.072998046875, - "delta_ref_ppl": -0.0618896484375, - "entropy_loss": -0.0673828125, - "epoch": 0.7428, - "grad_norm": 0.7782798323734583, - "k1_kl": 0.0618896484375, - "k3_kl": 0.0362548828125, - "kimi_kl": 0.09765625, - "learning_rate": 1.2859999999999997e-07, - "loss": 0.0015, - "ppl": 0.03668212890625, - "reward": 0.46649499237537384, - "reward_std": 0.0008995593670988455, - "rewards/perpo_ocr_edit_distance_reward": 0.4664950370788574, + "advantages": -1.532690987460228e-07, + "completion_length": 1171.0, + "delta_ref_entropy_loss": 0.04443359375, + "delta_ref_ppl": -0.08935546875, + "entropy_loss": -0.1474609375, + "epoch": 0.3714, + "grad_norm": 2.9397692540034615, + "k1_kl": 0.08935546875, + "k3_kl": 0.06005859375, + "kimi_kl": 0.2041015625, + "learning_rate": 3.143e-07, + "loss": 0.0024, + "ppl": 0.08740234375, + "reward": 0.9381810426712036, + "reward_std": 0.10879244655370712, + "rewards/perpo_ocr_edit_distance_reward": 0.9381811022758484, "step": 1857, "temperature": 0.9 }, { - "advantages": -2.7375563149689697e-06, - "completion_length": 582.0, - "delta_ref_entropy_loss": 0.099853515625, - "delta_ref_ppl": -0.1044921875, - "entropy_loss": -0.114013671875, - "epoch": 0.7432, - "grad_norm": 1.0249168485294113, - "k1_kl": 0.104736328125, - "k3_kl": 0.0684814453125, - "kimi_kl": 0.20947265625, - "learning_rate": 1.2839999999999999e-07, - "loss": 0.0027, - "ppl": 0.0616455078125, - "reward": 0.860266387462616, - "reward_std": 0.013535349629819393, - "rewards/perpo_ocr_edit_distance_reward": 0.8602664470672607, + "advantages": -2.8576170734595507e-05, + "completion_length": 1058.0, + "delta_ref_entropy_loss": 0.043212890625, + "delta_ref_ppl": -0.058837890625, + "entropy_loss": -0.09130859375, + "epoch": 0.3716, + "grad_norm": 1.0041307542101983, + "k1_kl": 0.058837890625, + "k3_kl": 0.033935546875, + "kimi_kl": 0.07177734375, + "learning_rate": 3.1419999999999994e-07, + "loss": 0.0014, + "ppl": 0.044921875, + "reward": 0.9726189970970154, + "reward_std": 0.002285124734044075, + "rewards/perpo_ocr_edit_distance_reward": 0.9726191163063049, "step": 1858, "temperature": 0.9 }, { - "advantages": -2.0257064875295328e-05, - "completion_length": 524.5, - "delta_ref_entropy_loss": 0.08306884765625, - "delta_ref_ppl": -0.0736083984375, - "entropy_loss": -0.0655517578125, - "epoch": 0.7436, - "grad_norm": 1.0798670270165303, - "k1_kl": 0.0736083984375, - "k3_kl": 0.04534912109375, - "kimi_kl": 0.13037109375, - "learning_rate": 1.282e-07, - "loss": 0.0018, - "ppl": 0.0323486328125, - "reward": 0.8872566819190979, - "reward_std": 0.00745737663237378, - "rewards/perpo_ocr_edit_distance_reward": 0.8872567415237427, + "advantages": -4.713024463853799e-05, + "completion_length": 921.0, + "delta_ref_entropy_loss": 0.06787109375, + "delta_ref_ppl": -0.08203125, + "entropy_loss": -0.058349609375, + "epoch": 0.3718, + "grad_norm": 0.6271728437070859, + "k1_kl": 0.08203125, + "k3_kl": 0.05419921875, + "kimi_kl": 0.2060546875, + "learning_rate": 3.141e-07, + "loss": 0.0022, + "ppl": 0.024169921875, + "reward": 0.9239065647125244, + "reward_std": 0.0011645404156297445, + "rewards/perpo_ocr_edit_distance_reward": 0.9239066243171692, "step": 1859, "temperature": 0.9 }, { - "advantages": -6.42793538645492e-05, - "completion_length": 492.0, - "delta_ref_entropy_loss": 0.06396484375, - "delta_ref_ppl": -0.0526123046875, - "entropy_loss": -0.07421875, - "epoch": 0.744, - "grad_norm": 1.0494244012576064, - "k1_kl": 0.052978515625, - "k3_kl": 0.031982421875, - "kimi_kl": 0.078857421875, - "learning_rate": 1.28e-07, - "loss": 0.0013, - "ppl": 0.04046630859375, - "reward": 0.9797989726066589, - "reward_std": 0.0009863852028502151, - "rewards/perpo_ocr_edit_distance_reward": 0.9797990620136261, + "advantages": -9.98633258859627e-05, + "completion_length": 1105.0, + "delta_ref_entropy_loss": 0.060791015625, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.05712890625, + "epoch": 0.372, + "grad_norm": 0.6379029100776455, + "k1_kl": 0.06689453125, + "k3_kl": 0.037109375, + "kimi_kl": 0.08935546875, + "learning_rate": 3.14e-07, + "loss": 0.0016, + "ppl": 0.0255126953125, + "reward": 0.9900063872337341, + "reward_std": 0.0009231806034222245, + "rewards/perpo_ocr_edit_distance_reward": 0.9900065064430237, "step": 1860, "temperature": 0.9 }, { - "advantages": -6.098406916521526e-05, - "completion_length": 628.5, - "delta_ref_entropy_loss": 0.02679443359375, - "delta_ref_ppl": -0.06231689453125, - "entropy_loss": -0.0570068359375, - "epoch": 0.7444, - "grad_norm": 2.0201712127792173, - "k1_kl": 0.062255859375, - "k3_kl": 0.043731689453125, - "kimi_kl": 0.1112060546875, - "learning_rate": 1.278e-07, - "loss": 0.0018, - "ppl": 0.03021240234375, - "reward": 0.3899580240249634, - "reward_std": 0.027324113936629146, - "rewards/perpo_ocr_edit_distance_reward": 0.38995806872844696, + "advantages": 0.0, + "completion_length": 818.0, + "delta_ref_entropy_loss": 0.041259765625, + "delta_ref_ppl": -0.04541015625, + "entropy_loss": -0.01483154296875, + "epoch": 0.3722, + "grad_norm": 0.0056990535857457695, + "k1_kl": 0.045166015625, + "k3_kl": 0.025390625, + "kimi_kl": 0.0771484375, + "learning_rate": 3.1389999999999997e-07, + "loss": 0.001, + "ppl": 0.0029144287109375, + "reward": 0.9973747134208679, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9973747134208679, "step": 1861, "temperature": 0.9 }, { - "advantages": -0.0001881378047983162, - "completion_length": 463.5, - "delta_ref_entropy_loss": -0.00115966796875, - "delta_ref_ppl": -0.0301513671875, - "entropy_loss": -0.04559326171875, - "epoch": 0.7448, - "grad_norm": 0.22427931971640439, - "k1_kl": 0.030242919921875, - "k3_kl": 0.0186309814453125, - "kimi_kl": 0.03912353515625, - "learning_rate": 1.2759999999999998e-07, - "loss": 0.0009, - "ppl": 0.01171875, - "reward": 0.6571487635374069, - "reward_std": 0.00038533375482074916, - "rewards/perpo_ocr_edit_distance_reward": 0.6571488082408905, + "advantages": -2.6736940981209045e-06, + "completion_length": 1032.0, + "delta_ref_entropy_loss": 0.056396484375, + "delta_ref_ppl": -0.06884765625, + "entropy_loss": -0.087890625, + "epoch": 0.3724, + "grad_norm": 1.1851480112036417, + "k1_kl": 0.0693359375, + "k3_kl": 0.041259765625, + "kimi_kl": 0.10205078125, + "learning_rate": 3.138e-07, + "loss": 0.0017, + "ppl": 0.0400390625, + "reward": 0.9358436465263367, + "reward_std": 0.0030907164327800274, + "rewards/perpo_ocr_edit_distance_reward": 0.9358436465263367, "step": 1862, "temperature": 0.9 }, { - "advantages": -9.35750422286219e-05, - "completion_length": 432.0, - "delta_ref_entropy_loss": 0.0474853515625, - "delta_ref_ppl": -0.07513427734375, - "entropy_loss": -0.06756591796875, - "epoch": 0.7452, - "grad_norm": 1.116349230472527, - "k1_kl": 0.07513427734375, - "k3_kl": 0.054107666015625, - "kimi_kl": 0.2607421875, - "learning_rate": 1.2740000000000002e-07, - "loss": 0.0023, - "ppl": 0.038116455078125, - "reward": 0.9259108603000641, - "reward_std": 0.00102014365256764, - "rewards/perpo_ocr_edit_distance_reward": 0.9259109199047089, + "advantages": -1.4994826415204443e-05, + "completion_length": 590.0, + "delta_ref_entropy_loss": 0.08154296875, + "delta_ref_ppl": -0.0771484375, + "entropy_loss": -0.05908203125, + "epoch": 0.3726, + "grad_norm": 0.8894543003615821, + "k1_kl": 0.07763671875, + "k3_kl": 0.04296875, + "kimi_kl": 0.09814453125, + "learning_rate": 3.1369999999999996e-07, + "loss": 0.0017, + "ppl": 0.0250244140625, + "reward": 0.9773662090301514, + "reward_std": 0.0016036015003919601, + "rewards/perpo_ocr_edit_distance_reward": 0.9773662090301514, "step": 1863, "temperature": 0.9 }, { - "advantages": -1.2772424042850616e-06, - "completion_length": 246.5, - "delta_ref_entropy_loss": 0.0792236328125, - "delta_ref_ppl": -0.13134765625, - "entropy_loss": -0.0399169921875, - "epoch": 0.7456, - "grad_norm": 0.4146702834669479, - "k1_kl": 0.13134765625, - "k3_kl": 0.0989990234375, - "kimi_kl": 0.46435546875, - "learning_rate": 1.272e-07, - "loss": 0.004, - "ppl": 0.01873779296875, - "reward": 0.998639851808548, - "reward_std": 0.0016252199420705438, - "rewards/perpo_ocr_edit_distance_reward": 0.9986398816108704, + "advantages": -1.4475413934178505e-07, + "completion_length": 332.0, + "delta_ref_entropy_loss": 0.07275390625, + "delta_ref_ppl": -0.130859375, + "entropy_loss": -0.16015625, + "epoch": 0.3728, + "grad_norm": 4.0227791208367245, + "k1_kl": 0.130859375, + "k3_kl": 0.0947265625, + "kimi_kl": 0.265625, + "learning_rate": 3.1359999999999995e-07, + "loss": 0.0038, + "ppl": 0.07568359375, + "reward": 0.8276283144950867, + "reward_std": 0.055381014943122864, + "rewards/perpo_ocr_edit_distance_reward": 0.8276283144950867, "step": 1864, "temperature": 0.9 }, { - "advantages": -5.2579815019271336e-05, - "completion_length": 812.0, - "delta_ref_entropy_loss": 0.022796630859375, - "delta_ref_ppl": -0.0194854736328125, - "entropy_loss": -0.018646240234375, - "epoch": 0.746, - "grad_norm": 0.2732098373232913, - "k1_kl": 0.019500732421875, - "k3_kl": 0.01282501220703125, - "kimi_kl": 0.0434417724609375, - "learning_rate": 1.2699999999999999e-07, - "loss": 0.0006, - "ppl": 0.00921630859375, - "reward": 0.9988090693950653, - "reward_std": 0.0003741944092325866, - "rewards/perpo_ocr_edit_distance_reward": 0.9988091588020325, + "advantages": 3.065381861233618e-06, + "completion_length": 541.0, + "delta_ref_entropy_loss": 0.0927734375, + "delta_ref_ppl": -0.07470703125, + "entropy_loss": -0.1875, + "epoch": 0.373, + "grad_norm": 1.7529017010181376, + "k1_kl": 0.07421875, + "k3_kl": 0.04052734375, + "kimi_kl": 0.0849609375, + "learning_rate": 3.135e-07, + "loss": 0.0016, + "ppl": 0.11083984375, + "reward": 0.9421470761299133, + "reward_std": 0.002666410757228732, + "rewards/perpo_ocr_edit_distance_reward": 0.9421471357345581, "step": 1865, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 344.0, - "delta_ref_entropy_loss": 0.0362548828125, - "delta_ref_ppl": -0.02410888671875, - "entropy_loss": -0.02532958984375, - "epoch": 0.7464, - "grad_norm": 0.30596315780229366, - "k1_kl": 0.024169921875, - "k3_kl": 0.01092529296875, - "kimi_kl": 0.02227783203125, - "learning_rate": 1.268e-07, - "loss": 0.0004, - "ppl": 0.01171875, - "reward": 0.9957362115383148, - "reward_std": 0.00016411443357355893, - "rewards/perpo_ocr_edit_distance_reward": 0.9957362115383148, + "advantages": -1.7029899268550253e-08, + "completion_length": 693.0, + "delta_ref_entropy_loss": 0.0439453125, + "delta_ref_ppl": -0.09521484375, + "entropy_loss": -0.1552734375, + "epoch": 0.3732, + "grad_norm": 2.9913825350695444, + "k1_kl": 0.095703125, + "k3_kl": 0.0654296875, + "kimi_kl": 0.201171875, + "learning_rate": 3.134e-07, + "loss": 0.0026, + "ppl": 0.07666015625, + "reward": 0.900039792060852, + "reward_std": 0.20000489056110382, + "rewards/perpo_ocr_edit_distance_reward": 0.9000398516654968, "step": 1866, "temperature": 0.9 }, { - "advantages": -5.10896995820076e-07, - "completion_length": 675.0, - "delta_ref_entropy_loss": 0.0338134765625, - "delta_ref_ppl": -0.02777099609375, - "entropy_loss": -0.0245361328125, - "epoch": 0.7468, - "grad_norm": 0.5842011825119532, - "k1_kl": 0.02777099609375, - "k3_kl": 0.017364501953125, - "kimi_kl": 0.04339599609375, - "learning_rate": 1.2659999999999998e-07, - "loss": 0.0007, - "ppl": 0.0115814208984375, - "reward": 0.9652456939220428, - "reward_std": 0.033725716173648834, - "rewards/perpo_ocr_edit_distance_reward": 0.9652457237243652, + "advantages": -3.405979782655777e-07, + "completion_length": 492.0, + "delta_ref_entropy_loss": 0.1396484375, + "delta_ref_ppl": -0.1474609375, + "entropy_loss": -0.486328125, + "epoch": 0.3734, + "grad_norm": 4.366664555186716, + "k1_kl": 0.1474609375, + "k3_kl": 0.09765625, + "kimi_kl": 0.283203125, + "learning_rate": 3.1330000000000003e-07, + "loss": 0.0039, + "ppl": 0.26953125, + "reward": 0.3704127073287964, + "reward_std": 0.021759413182735443, + "rewards/perpo_ocr_edit_distance_reward": 0.3704127073287964, "step": 1867, "temperature": 0.9 }, { - "advantages": -3.120729213357265e-05, - "completion_length": 531.5, - "delta_ref_entropy_loss": 0.09228515625, - "delta_ref_ppl": -0.06396484375, - "entropy_loss": -0.2342529296875, - "epoch": 0.7472, - "grad_norm": 1.7005732898625956, - "k1_kl": 0.063720703125, - "k3_kl": 0.032928466796875, - "kimi_kl": 0.06085205078125, - "learning_rate": 1.264e-07, - "loss": 0.0013, - "ppl": 0.136199951171875, - "reward": 0.7748895883560181, - "reward_std": 0.08693155417859089, - "rewards/perpo_ocr_edit_distance_reward": 0.7748896479606628, + "advantages": -3.555843068170361e-05, + "completion_length": 457.0, + "delta_ref_entropy_loss": 0.0927734375, + "delta_ref_ppl": -0.07421875, + "entropy_loss": -0.043701171875, + "epoch": 0.3736, + "grad_norm": 0.9223871441995607, + "k1_kl": 0.07421875, + "k3_kl": 0.0419921875, + "kimi_kl": 0.11181640625, + "learning_rate": 3.1319999999999997e-07, + "loss": 0.0017, + "ppl": 0.0164794921875, + "reward": 0.9964625835418701, + "reward_std": 0.0018161758780479431, + "rewards/perpo_ocr_edit_distance_reward": 0.9964627027511597, "step": 1868, "temperature": 0.9 }, { - "advantages": -8.123474981402978e-05, - "completion_length": 651.5, - "delta_ref_entropy_loss": 0.025146484375, - "delta_ref_ppl": -0.025634765625, - "entropy_loss": -0.022216796875, - "epoch": 0.7476, - "grad_norm": 0.45247936259368565, - "k1_kl": 0.025634765625, - "k3_kl": 0.015167236328125, - "kimi_kl": 0.03521728515625, - "learning_rate": 1.262e-07, - "loss": 0.0007, - "ppl": 0.010040283203125, - "reward": 0.9996594786643982, - "reward_std": 0.00032643570739310235, - "rewards/perpo_ocr_edit_distance_reward": 0.9996595084667206, + "advantages": 1.7029899268550253e-08, + "completion_length": 513.0, + "delta_ref_entropy_loss": 0.08544921875, + "delta_ref_ppl": -0.0693359375, + "entropy_loss": -0.052001953125, + "epoch": 0.3738, + "grad_norm": 2.031431365615866, + "k1_kl": 0.0693359375, + "k3_kl": 0.0390625, + "kimi_kl": 0.12255859375, + "learning_rate": 3.1309999999999996e-07, + "loss": 0.0016, + "ppl": 0.01806640625, + "reward": 0.9593032598495483, + "reward_std": 0.008106415160000324, + "rewards/perpo_ocr_edit_distance_reward": 0.9593032598495483, "step": 1869, "temperature": 0.9 }, { - "advantages": -0.00017029048467520624, - "completion_length": 582.5, - "delta_ref_entropy_loss": 0.052001953125, - "delta_ref_ppl": -0.0390625, - "entropy_loss": -0.03668212890625, - "epoch": 0.748, - "grad_norm": 0.5249832931282212, - "k1_kl": 0.0389404296875, - "k3_kl": 0.020263671875, - "kimi_kl": 0.04522705078125, - "learning_rate": 1.26e-07, - "loss": 0.001, - "ppl": 0.0164794921875, - "reward": 0.9839682281017303, - "reward_std": 0.00027342253451934084, - "rewards/perpo_ocr_edit_distance_reward": 0.9839682877063751, + "advantages": -0.00011793205339927226, + "completion_length": 581.0, + "delta_ref_entropy_loss": 0.0498046875, + "delta_ref_ppl": -0.048828125, + "entropy_loss": -0.0306396484375, + "epoch": 0.374, + "grad_norm": 0.3850506271408749, + "k1_kl": 0.048828125, + "k3_kl": 0.0269775390625, + "kimi_kl": 0.0810546875, + "learning_rate": 3.13e-07, + "loss": 0.0012, + "ppl": 0.01055908203125, + "reward": 0.9969549179077148, + "reward_std": 0.0005497950478456914, + "rewards/perpo_ocr_edit_distance_reward": 0.9969549775123596, "step": 1870, "temperature": 0.9 }, { - "advantages": -6.609729780393536e-05, - "completion_length": 863.0, - "delta_ref_entropy_loss": 0.02392578125, - "delta_ref_ppl": -0.020263671875, - "entropy_loss": -0.0445556640625, - "epoch": 0.7484, - "grad_norm": 0.7838780980623462, - "k1_kl": 0.02032470703125, - "k3_kl": 0.01495361328125, - "kimi_kl": 0.036865234375, - "learning_rate": 1.258e-07, - "loss": 0.0007, - "ppl": 0.02728271484375, - "reward": 0.9846844673156738, - "reward_std": 0.00721076266199816, - "rewards/perpo_ocr_edit_distance_reward": 0.9846845269203186, + "advantages": -4.528250065050088e-05, + "completion_length": 164.0, + "delta_ref_entropy_loss": 0.072265625, + "delta_ref_ppl": -0.2236328125, + "entropy_loss": -0.03271484375, + "epoch": 0.3742, + "grad_norm": 0.8929159878433537, + "k1_kl": 0.2236328125, + "k3_kl": 0.173828125, + "kimi_kl": 0.76953125, + "learning_rate": 3.129e-07, + "loss": 0.007, + "ppl": 0.0106201171875, + "reward": 0.9424890875816345, + "reward_std": 0.0012172696879133582, + "rewards/perpo_ocr_edit_distance_reward": 0.9424891471862793, "step": 1871, "temperature": 0.9 }, { - "advantages": -7.3909759521484375e-06, - "completion_length": 561.5, - "delta_ref_entropy_loss": 0.03436279296875, - "delta_ref_ppl": -0.031646728515625, - "entropy_loss": -0.0511474609375, - "epoch": 0.7488, - "grad_norm": 0.8865077151467897, - "k1_kl": 0.031646728515625, - "k3_kl": 0.019775390625, - "kimi_kl": 0.04974365234375, - "learning_rate": 1.2559999999999999e-07, - "loss": 0.0008, - "ppl": 0.027679443359375, - "reward": 0.9260706007480621, - "reward_std": 0.0020388730335980654, - "rewards/perpo_ocr_edit_distance_reward": 0.9260706603527069, + "advantages": 2.0776476503669983e-06, + "completion_length": 918.0, + "delta_ref_entropy_loss": 0.043701171875, + "delta_ref_ppl": -0.033203125, + "entropy_loss": -0.10107421875, + "epoch": 0.3744, + "grad_norm": 1.1090992800380144, + "k1_kl": 0.033203125, + "k3_kl": 0.018310546875, + "kimi_kl": 0.0299072265625, + "learning_rate": 3.128e-07, + "loss": 0.0007, + "ppl": 0.045166015625, + "reward": 0.9728691577911377, + "reward_std": 0.00397381279617548, + "rewards/perpo_ocr_edit_distance_reward": 0.9728692173957825, "step": 1872, "temperature": 0.9 }, { - "advantages": -1.27724248955019e-07, - "completion_length": 723.5, - "delta_ref_entropy_loss": 0.03326416015625, - "delta_ref_ppl": -0.0416259765625, - "entropy_loss": -0.12664794921875, - "epoch": 0.7492, - "grad_norm": 2.1236908903450815, - "k1_kl": 0.0416259765625, - "k3_kl": 0.03118896484375, - "kimi_kl": 0.0772705078125, - "learning_rate": 1.254e-07, - "loss": 0.0012, - "ppl": 0.06568145751953125, - "reward": 0.8598544299602509, - "reward_std": 0.08724671602249146, - "rewards/perpo_ocr_edit_distance_reward": 0.8598544597625732, + "advantages": -5.430834789876826e-05, + "completion_length": 947.0, + "delta_ref_entropy_loss": 0.0927734375, + "delta_ref_ppl": -0.10693359375, + "entropy_loss": -0.06787109375, + "epoch": 0.3746, + "grad_norm": 1.6484877961168194, + "k1_kl": 0.10693359375, + "k3_kl": 0.0673828125, + "kimi_kl": 0.185546875, + "learning_rate": 3.127e-07, + "loss": 0.0028, + "ppl": 0.034423828125, + "reward": 0.9729628562927246, + "reward_std": 0.0008408822468481958, + "rewards/perpo_ocr_edit_distance_reward": 0.9729629755020142, "step": 1873, "temperature": 0.9 }, { - "advantages": -6.603769517266045e-05, - "completion_length": 627.5, - "delta_ref_entropy_loss": 0.0289306640625, - "delta_ref_ppl": -0.02825927734375, - "entropy_loss": -0.022491455078125, - "epoch": 0.7496, - "grad_norm": 0.3283309088966474, - "k1_kl": 0.02825927734375, - "k3_kl": 0.0184326171875, - "kimi_kl": 0.058349609375, - "learning_rate": 1.252e-07, - "loss": 0.0008, - "ppl": 0.01123046875, - "reward": 0.99774569272995, - "reward_std": 0.000683081685565412, - "rewards/perpo_ocr_edit_distance_reward": 0.9977457523345947, + "advantages": -2.0461424355744384e-05, + "completion_length": 560.0, + "delta_ref_entropy_loss": 0.06982421875, + "delta_ref_ppl": -0.07275390625, + "entropy_loss": -0.0361328125, + "epoch": 0.3748, + "grad_norm": 0.8530201527906724, + "k1_kl": 0.07275390625, + "k3_kl": 0.04345703125, + "kimi_kl": 0.146484375, + "learning_rate": 3.126e-07, + "loss": 0.0018, + "ppl": 0.0177001953125, + "reward": 0.9932112693786621, + "reward_std": 0.0036480615381151438, + "rewards/perpo_ocr_edit_distance_reward": 0.9932113885879517, "step": 1874, "temperature": 0.9 }, { - "advantages": -1.6655241779517382e-05, - "completion_length": 443.0, - "delta_ref_entropy_loss": 0.04345703125, - "delta_ref_ppl": -0.0743408203125, - "entropy_loss": -0.03759765625, - "epoch": 0.75, - "grad_norm": 0.5269720926420973, - "k1_kl": 0.0743408203125, - "k3_kl": 0.05279541015625, - "kimi_kl": 0.1895751953125, - "learning_rate": 1.25e-07, - "loss": 0.0021, - "ppl": 0.01654052734375, - "reward": 0.9997289180755615, - "reward_std": 0.0007172087789513171, - "rewards/perpo_ocr_edit_distance_reward": 0.9997289478778839, + "advantages": -1.8102782632922754e-05, + "completion_length": 342.0, + "delta_ref_entropy_loss": 0.040283203125, + "delta_ref_ppl": -0.12109375, + "entropy_loss": -0.040771484375, + "epoch": 0.375, + "grad_norm": 0.7378083013828464, + "k1_kl": 0.12158203125, + "k3_kl": 0.0927734375, + "kimi_kl": 0.4375, + "learning_rate": 3.1249999999999997e-07, + "loss": 0.0037, + "ppl": 0.01171875, + "reward": 0.9735934138298035, + "reward_std": 0.0017802617512643337, + "rewards/perpo_ocr_edit_distance_reward": 0.9735934734344482, "step": 1875, "temperature": 0.9 }, { - "advantages": -1.2316874745010864e-05, - "completion_length": 744.0, - "delta_ref_entropy_loss": 0.0523681640625, - "delta_ref_ppl": -0.0302734375, - "entropy_loss": -0.0692138671875, - "epoch": 0.7504, - "grad_norm": 1.1508719827294924, - "k1_kl": 0.030029296875, - "k3_kl": 0.0179443359375, - "kimi_kl": 0.025634765625, - "learning_rate": 1.2479999999999998e-07, - "loss": 0.0007, - "ppl": 0.04266357421875, - "reward": 0.964069128036499, - "reward_std": 0.00046847897465340793, - "rewards/perpo_ocr_edit_distance_reward": 0.964069128036499, + "advantages": -1.7029899268550253e-08, + "completion_length": 836.0, + "delta_ref_entropy_loss": 0.023681640625, + "delta_ref_ppl": -0.034423828125, + "entropy_loss": -0.08251953125, + "epoch": 0.3752, + "grad_norm": 0.9295451721037645, + "k1_kl": 0.03466796875, + "k3_kl": 0.019287109375, + "kimi_kl": 0.04638671875, + "learning_rate": 3.124e-07, + "loss": 0.0008, + "ppl": 0.043701171875, + "reward": 0.8406412601470947, + "reward_std": 0.001924793585203588, + "rewards/perpo_ocr_edit_distance_reward": 0.84064120054245, "step": 1876, "temperature": 0.9 }, { - "advantages": 6.61185822536936e-06, - "completion_length": 329.5, - "delta_ref_entropy_loss": 0.046630859375, - "delta_ref_ppl": -0.106201171875, - "entropy_loss": -0.072021484375, - "epoch": 0.7508, - "grad_norm": 0.5188349398070652, - "k1_kl": 0.106201171875, - "k3_kl": 0.08135986328125, - "kimi_kl": 0.403564453125, - "learning_rate": 1.246e-07, - "loss": 0.0033, - "ppl": 0.02813720703125, - "reward": 0.9974991083145142, - "reward_std": 0.000271923461696133, - "rewards/perpo_ocr_edit_distance_reward": 0.9974991381168365, + "advantages": -5.449567652249243e-06, + "completion_length": 1023.0, + "delta_ref_entropy_loss": 0.064453125, + "delta_ref_ppl": -0.062255859375, + "entropy_loss": -0.2041015625, + "epoch": 0.3754, + "grad_norm": 1.9092518202500064, + "k1_kl": 0.06201171875, + "k3_kl": 0.0390625, + "kimi_kl": 0.0927734375, + "learning_rate": 3.123e-07, + "loss": 0.0016, + "ppl": 0.1103515625, + "reward": 0.9401159882545471, + "reward_std": 0.0030057125259190798, + "rewards/perpo_ocr_edit_distance_reward": 0.9401159882545471, "step": 1877, "temperature": 0.9 }, { - "advantages": 1.5126807738852222e-05, - "completion_length": 756.0, - "delta_ref_entropy_loss": 0.0140380859375, - "delta_ref_ppl": -0.01263427734375, - "entropy_loss": -0.01458740234375, - "epoch": 0.7512, - "grad_norm": 0.20650425349319698, - "k1_kl": 0.01263427734375, - "k3_kl": 0.0075531005859375, - "kimi_kl": 0.01934814453125, - "learning_rate": 1.244e-07, - "loss": 0.0003, - "ppl": 0.0067291259765625, - "reward": 0.9931108057498932, - "reward_std": 0.00023154140217229724, - "rewards/perpo_ocr_edit_distance_reward": 0.9931108355522156, + "advantages": -1.558235817356035e-05, + "completion_length": 210.0, + "delta_ref_entropy_loss": 0.09033203125, + "delta_ref_ppl": -0.1748046875, + "entropy_loss": -0.1396484375, + "epoch": 0.3756, + "grad_norm": 2.5688678223096564, + "k1_kl": 0.1748046875, + "k3_kl": 0.125, + "kimi_kl": 0.443359375, + "learning_rate": 3.1219999999999995e-07, + "loss": 0.005, + "ppl": 0.064453125, + "reward": 0.9849912524223328, + "reward_std": 0.0026312777772545815, + "rewards/perpo_ocr_edit_distance_reward": 0.9849912524223328, "step": 1878, "temperature": 0.9 }, { - "advantages": -7.265380600074423e-05, - "completion_length": 463.5, - "delta_ref_entropy_loss": 0.039794921875, - "delta_ref_ppl": -0.0262451171875, - "entropy_loss": -0.04046630859375, - "epoch": 0.7516, - "grad_norm": 0.9297113642191652, - "k1_kl": 0.02630615234375, - "k3_kl": 0.013153076171875, - "kimi_kl": 0.029052734375, - "learning_rate": 1.242e-07, - "loss": 0.0006, - "ppl": 0.017364501953125, - "reward": 0.998251885175705, - "reward_std": 0.0024520251317881048, - "rewards/perpo_ocr_edit_distance_reward": 0.9982519149780273, + "advantages": -6.215913117557648e-07, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.017578125, + "delta_ref_ppl": -0.043701171875, + "entropy_loss": -0.169921875, + "epoch": 0.3758, + "grad_norm": 7.231047781293077, + "k1_kl": 0.04345703125, + "k3_kl": 0.060546875, + "kimi_kl": 0.09375, + "learning_rate": 3.121e-07, + "loss": 0.0024, + "ppl": 0.10400390625, + "reward": 0.8173754811286926, + "reward_std": 0.01390167884528637, + "rewards/perpo_ocr_edit_distance_reward": 0.8173755407333374, "step": 1879, "temperature": 0.9 }, { - "advantages": -2.7183977181266528e-05, - "completion_length": 543.5, - "delta_ref_entropy_loss": 0.083251953125, - "delta_ref_ppl": -0.0599365234375, - "entropy_loss": -0.109375, - "epoch": 0.752, - "grad_norm": 1.0353898908248689, - "k1_kl": 0.0601806640625, - "k3_kl": 0.03216552734375, - "kimi_kl": 0.077880859375, - "learning_rate": 1.24e-07, - "loss": 0.0013, - "ppl": 0.0582275390625, - "reward": 0.8962894082069397, - "reward_std": 0.002426664636004716, - "rewards/perpo_ocr_edit_distance_reward": 0.8962895274162292, + "advantages": -1.6442367268609814e-05, + "completion_length": 383.0, + "delta_ref_entropy_loss": 0.0673828125, + "delta_ref_ppl": -0.1337890625, + "entropy_loss": -0.059814453125, + "epoch": 0.376, + "grad_norm": 1.2075736633284615, + "k1_kl": 0.1337890625, + "k3_kl": 0.0986328125, + "kimi_kl": 0.3359375, + "learning_rate": 3.12e-07, + "loss": 0.004, + "ppl": 0.024658203125, + "reward": 0.9943972229957581, + "reward_std": 0.0024894247762858868, + "rewards/perpo_ocr_edit_distance_reward": 0.9943973422050476, "step": 1880, "temperature": 0.9 }, { - "advantages": -3.2974141504382715e-05, - "completion_length": 881.0, - "delta_ref_entropy_loss": 0.02740478515625, - "delta_ref_ppl": -0.015380859375, - "entropy_loss": -0.02764892578125, - "epoch": 0.7524, - "grad_norm": 0.41893296206308744, - "k1_kl": 0.015289306640625, - "k3_kl": 0.0140380859375, - "kimi_kl": 0.016448974609375, - "learning_rate": 1.2379999999999998e-07, - "loss": 0.0006, - "ppl": 0.0140380859375, - "reward": 0.9981165826320648, - "reward_std": 0.00041800342296482995, - "rewards/perpo_ocr_edit_distance_reward": 0.9981165826320648, + "advantages": -7.934230234241113e-05, + "completion_length": 619.0, + "delta_ref_entropy_loss": 0.06396484375, + "delta_ref_ppl": -0.10009765625, + "entropy_loss": -0.033935546875, + "epoch": 0.3762, + "grad_norm": 0.32800111909079344, + "k1_kl": 0.10009765625, + "k3_kl": 0.0634765625, + "kimi_kl": 0.236328125, + "learning_rate": 3.119e-07, + "loss": 0.0026, + "ppl": 0.0091552734375, + "reward": 0.9717054963111877, + "reward_std": 0.0003293700283393264, + "rewards/perpo_ocr_edit_distance_reward": 0.9717056155204773, "step": 1881, "temperature": 0.9 }, { - "advantages": -0.0001081228319890215, - "completion_length": 550.0, - "delta_ref_entropy_loss": 0.1121826171875, - "delta_ref_ppl": -0.07952880859375, - "entropy_loss": -0.16339111328125, - "epoch": 0.7528, - "grad_norm": 3.6396719703916087, - "k1_kl": 0.07958984375, - "k3_kl": 0.04693603515625, - "kimi_kl": 0.10302734375, - "learning_rate": 1.236e-07, - "loss": 0.002, - "ppl": 0.093536376953125, - "reward": 0.9658679366111755, - "reward_std": 0.0048684812791179866, - "rewards/perpo_ocr_edit_distance_reward": 0.9658680558204651, + "advantages": 6.207398200785974e-06, + "completion_length": 1518.0, + "delta_ref_entropy_loss": 0.019775390625, + "delta_ref_ppl": -0.039794921875, + "entropy_loss": -0.033935546875, + "epoch": 0.3764, + "grad_norm": 0.6036428409815953, + "k1_kl": 0.039794921875, + "k3_kl": 0.0263671875, + "kimi_kl": 0.068359375, + "learning_rate": 3.118e-07, + "loss": 0.001, + "ppl": 0.0152587890625, + "reward": 0.9957455992698669, + "reward_std": 0.0026377979665994644, + "rewards/perpo_ocr_edit_distance_reward": 0.9957455396652222, "step": 1882, "temperature": 0.9 }, { - "advantages": -0.00010469130938872695, - "completion_length": 825.0, - "delta_ref_entropy_loss": 0.02081298828125, - "delta_ref_ppl": -0.03045654296875, - "entropy_loss": -0.01580810546875, - "epoch": 0.7532, - "grad_norm": 0.15905355367044216, - "k1_kl": 0.0303955078125, - "k3_kl": 0.0213623046875, - "kimi_kl": 0.09716796875, - "learning_rate": 1.2339999999999998e-07, - "loss": 0.001, - "ppl": 0.007232666015625, - "reward": 0.9979692101478577, - "reward_std": 5.166709524928592e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.99796923995018, + "advantages": -3.405979782655777e-07, + "completion_length": 1402.0, + "delta_ref_entropy_loss": 0.07958984375, + "delta_ref_ppl": -0.09033203125, + "entropy_loss": -0.423828125, + "epoch": 0.3766, + "grad_norm": 2.0227173010281825, + "k1_kl": 0.08984375, + "k3_kl": 0.0673828125, + "kimi_kl": 0.1298828125, + "learning_rate": 3.1169999999999997e-07, + "loss": 0.0027, + "ppl": 0.2275390625, + "reward": 0.663733184337616, + "reward_std": 0.0868520513176918, + "rewards/perpo_ocr_edit_distance_reward": 0.6637332439422607, "step": 1883, "temperature": 0.9 }, { - "advantages": -3.13296914100647e-05, - "completion_length": 782.0, - "delta_ref_entropy_loss": 0.12255859375, - "delta_ref_ppl": -0.0823974609375, - "entropy_loss": -0.16461181640625, - "epoch": 0.7536, - "grad_norm": 1.6610351057288661, - "k1_kl": 0.0823974609375, - "k3_kl": 0.046875, - "kimi_kl": 0.126220703125, - "learning_rate": 1.232e-07, - "loss": 0.0019, - "ppl": 0.089935302734375, - "reward": 0.731018602848053, - "reward_std": 0.008003590570297092, - "rewards/perpo_ocr_edit_distance_reward": 0.7310186475515366, + "advantages": -3.30890943587292e-05, + "completion_length": 444.0, + "delta_ref_entropy_loss": 0.044677734375, + "delta_ref_ppl": -0.0859375, + "entropy_loss": -0.0272216796875, + "epoch": 0.3768, + "grad_norm": 0.5495932347506427, + "k1_kl": 0.0859375, + "k3_kl": 0.061279296875, + "kimi_kl": 0.234375, + "learning_rate": 3.1159999999999996e-07, + "loss": 0.0025, + "ppl": 0.01220703125, + "reward": 0.9965948462486267, + "reward_std": 0.0009291361784562469, + "rewards/perpo_ocr_edit_distance_reward": 0.9965949058532715, "step": 1884, "temperature": 0.9 }, { - "advantages": -5.120890818943735e-05, - "completion_length": 715.5, - "delta_ref_entropy_loss": 0.02996826171875, - "delta_ref_ppl": -0.02935791015625, - "entropy_loss": -0.02581787109375, - "epoch": 0.754, - "grad_norm": 0.46291637625272286, - "k1_kl": 0.02947998046875, - "k3_kl": 0.017364501953125, - "kimi_kl": 0.0391845703125, - "learning_rate": 1.23e-07, - "loss": 0.0007, - "ppl": 0.012908935546875, - "reward": 0.9970579445362091, - "reward_std": 0.0007625235011801124, - "rewards/perpo_ocr_edit_distance_reward": 0.9970580041408539, + "advantages": -2.7937549020862207e-05, + "completion_length": 363.0, + "delta_ref_entropy_loss": 0.0361328125, + "delta_ref_ppl": -0.080078125, + "entropy_loss": -0.041748046875, + "epoch": 0.377, + "grad_norm": 0.4566273749092968, + "k1_kl": 0.080078125, + "k3_kl": 0.05615234375, + "kimi_kl": 0.2021484375, + "learning_rate": 3.115e-07, + "loss": 0.0023, + "ppl": 0.01422119140625, + "reward": 0.9995559453964233, + "reward_std": 0.0008147502667270601, + "rewards/perpo_ocr_edit_distance_reward": 0.9995560646057129, "step": 1885, "temperature": 0.9 }, { - "advantages": -3.104550705756992e-05, - "completion_length": 552.5, - "delta_ref_entropy_loss": 0.018829345703125, - "delta_ref_ppl": -0.0245361328125, - "entropy_loss": -0.013946533203125, - "epoch": 0.7544, - "grad_norm": 0.35911857852699075, - "k1_kl": 0.0245208740234375, - "k3_kl": 0.0182342529296875, - "kimi_kl": 0.094879150390625, - "learning_rate": 1.228e-07, - "loss": 0.0008, - "ppl": 0.0066986083984375, - "reward": 0.9991963803768158, - "reward_std": 0.0010022492278949358, - "rewards/perpo_ocr_edit_distance_reward": 0.9991964399814606, + "advantages": -0.0005960464477539062, + "completion_length": 639.0, + "delta_ref_entropy_loss": 0.0400390625, + "delta_ref_ppl": -0.03955078125, + "entropy_loss": -0.0198974609375, + "epoch": 0.3772, + "grad_norm": 0.01852497150079271, + "k1_kl": 0.039306640625, + "k3_kl": 0.021728515625, + "kimi_kl": 0.06005859375, + "learning_rate": 3.114e-07, + "loss": 0.0015, + "ppl": 0.006622314453125, + "reward": 0.9983221292495728, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9983221888542175, "step": 1886, "temperature": 0.9 }, { - "advantages": -6.250398928386858e-05, - "completion_length": 469.5, - "delta_ref_entropy_loss": 0.056640625, - "delta_ref_ppl": -0.0445556640625, - "entropy_loss": -0.07080078125, - "epoch": 0.7548, - "grad_norm": 0.9108238395795648, - "k1_kl": 0.04449462890625, - "k3_kl": 0.02685546875, - "kimi_kl": 0.05902099609375, - "learning_rate": 1.226e-07, - "loss": 0.0011, - "ppl": 0.04095458984375, - "reward": 0.908354789018631, - "reward_std": 0.002190261315263342, - "rewards/perpo_ocr_edit_distance_reward": 0.9083549082279205, + "advantages": -4.959106809110381e-05, + "completion_length": 619.0, + "delta_ref_entropy_loss": 0.036865234375, + "delta_ref_ppl": -0.0537109375, + "entropy_loss": -0.0218505859375, + "epoch": 0.3774, + "grad_norm": 0.4207264779611121, + "k1_kl": 0.0537109375, + "k3_kl": 0.03857421875, + "kimi_kl": 0.1513671875, + "learning_rate": 3.113e-07, + "loss": 0.0016, + "ppl": 0.0107421875, + "reward": 0.9951124787330627, + "reward_std": 0.0011016404023393989, + "rewards/perpo_ocr_edit_distance_reward": 0.9951125383377075, "step": 1887, "temperature": 0.9 }, { - "advantages": -7.175122300395742e-05, - "completion_length": 821.5, - "delta_ref_entropy_loss": 0.017608642578125, - "delta_ref_ppl": -0.01434326171875, - "entropy_loss": -0.009674072265625, - "epoch": 0.7552, - "grad_norm": 0.19599924783695538, - "k1_kl": 0.01434326171875, - "k3_kl": 0.00844573974609375, - "kimi_kl": 0.0216522216796875, - "learning_rate": 1.2239999999999998e-07, - "loss": 0.0004, - "ppl": 0.00360870361328125, - "reward": 0.9999449551105499, - "reward_std": 6.867560296086594e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9999449551105499, + "advantages": -5.817413693876006e-05, + "completion_length": 308.0, + "delta_ref_entropy_loss": 0.06591796875, + "delta_ref_ppl": -0.130859375, + "entropy_loss": -0.045166015625, + "epoch": 0.3776, + "grad_norm": 1.019937284972248, + "k1_kl": 0.130859375, + "k3_kl": 0.0927734375, + "kimi_kl": 0.37109375, + "learning_rate": 3.112e-07, + "loss": 0.0038, + "ppl": 0.018310546875, + "reward": 0.9562277793884277, + "reward_std": 0.0007780642481520772, + "rewards/perpo_ocr_edit_distance_reward": 0.9562279582023621, "step": 1888, "temperature": 0.9 }, { - "advantages": -2.6289906600140966e-05, - "completion_length": 751.0, - "delta_ref_entropy_loss": 0.058837890625, - "delta_ref_ppl": -0.026611328125, - "entropy_loss": -0.0618896484375, - "epoch": 0.7556, - "grad_norm": 1.2620330054644584, - "k1_kl": 0.026611328125, - "k3_kl": 0.01220703125, - "kimi_kl": 0.0208740234375, - "learning_rate": 1.222e-07, - "loss": 0.0005, - "ppl": 0.0374755859375, - "reward": 0.8646766245365143, - "reward_std": 0.0019334153039380908, - "rewards/perpo_ocr_edit_distance_reward": 0.8646766841411591, + "advantages": -1.2491431334638037e-05, + "completion_length": 736.0, + "delta_ref_entropy_loss": 0.038818359375, + "delta_ref_ppl": -0.042236328125, + "entropy_loss": -0.044189453125, + "epoch": 0.3778, + "grad_norm": 0.4402111515616817, + "k1_kl": 0.042236328125, + "k3_kl": 0.024169921875, + "kimi_kl": 0.06298828125, + "learning_rate": 3.1109999999999997e-07, + "loss": 0.001, + "ppl": 0.017333984375, + "reward": 0.8498727083206177, + "reward_std": 0.000580707797780633, + "rewards/perpo_ocr_edit_distance_reward": 0.8498727679252625, "step": 1889, "temperature": 0.9 }, { - "advantages": 5.219663954392217e-06, - "completion_length": 905.5, - "delta_ref_entropy_loss": 0.0306396484375, - "delta_ref_ppl": -0.02862548828125, - "entropy_loss": -0.046630859375, - "epoch": 0.756, - "grad_norm": 0.8866171998927694, - "k1_kl": 0.02850341796875, - "k3_kl": 0.0172119140625, - "kimi_kl": 0.042633056640625, - "learning_rate": 1.2199999999999998e-07, - "loss": 0.0007, - "ppl": 0.0229949951171875, - "reward": 0.7667783796787262, - "reward_std": 0.13343758415430784, - "rewards/perpo_ocr_edit_distance_reward": 0.766778439283371, + "advantages": -1.5309879017877392e-05, + "completion_length": 259.0, + "delta_ref_entropy_loss": 0.09375, + "delta_ref_ppl": -0.1630859375, + "entropy_loss": -0.08251953125, + "epoch": 0.378, + "grad_norm": 0.9914029134439263, + "k1_kl": 0.1630859375, + "k3_kl": 0.10888671875, + "kimi_kl": 0.349609375, + "learning_rate": 3.1099999999999997e-07, + "loss": 0.0044, + "ppl": 0.03271484375, + "reward": 0.9630818367004395, + "reward_std": 0.0021237025503069162, + "rewards/perpo_ocr_edit_distance_reward": 0.9630818367004395, "step": 1890, "temperature": 0.9 }, { - "advantages": -8.259075548266992e-05, - "completion_length": 491.5, - "delta_ref_entropy_loss": 0.042236328125, - "delta_ref_ppl": -0.032470703125, - "entropy_loss": -0.014892578125, - "epoch": 0.7564, - "grad_norm": 0.21356921829954614, - "k1_kl": 0.032470703125, - "k3_kl": 0.0194091796875, - "kimi_kl": 0.0657958984375, - "learning_rate": 1.218e-07, - "loss": 0.0009, - "ppl": 0.00447845458984375, - "reward": 0.9966546297073364, - "reward_std": 0.0002858587831724435, - "rewards/perpo_ocr_edit_distance_reward": 0.9966546893119812, + "advantages": -4.606587935995776e-06, + "completion_length": 661.0, + "delta_ref_entropy_loss": 0.05859375, + "delta_ref_ppl": -0.0654296875, + "entropy_loss": -0.07177734375, + "epoch": 0.3782, + "grad_norm": 0.7728761128915276, + "k1_kl": 0.0654296875, + "k3_kl": 0.04345703125, + "kimi_kl": 0.16796875, + "learning_rate": 3.109e-07, + "loss": 0.0017, + "ppl": 0.031982421875, + "reward": 0.9238377809524536, + "reward_std": 0.010999550111591816, + "rewards/perpo_ocr_edit_distance_reward": 0.9238378405570984, "step": 1891, "temperature": 0.9 }, { - "advantages": -5.907671948079951e-05, - "completion_length": 652.0, - "delta_ref_entropy_loss": 0.069091796875, - "delta_ref_ppl": -0.0533447265625, - "entropy_loss": -0.059814453125, - "epoch": 0.7568, - "grad_norm": 0.7136041352962338, - "k1_kl": 0.0533447265625, - "k3_kl": 0.03302001953125, - "kimi_kl": 0.110595703125, - "learning_rate": 1.216e-07, - "loss": 0.0014, - "ppl": 0.030609130859375, - "reward": 0.9921411573886871, - "reward_std": 0.001260677119717002, - "rewards/perpo_ocr_edit_distance_reward": 0.9921412765979767, + "advantages": -0.0005960464477539062, + "completion_length": 703.0, + "delta_ref_entropy_loss": 0.033203125, + "delta_ref_ppl": -0.034423828125, + "entropy_loss": -0.01519775390625, + "epoch": 0.3784, + "grad_norm": 0.006008702442198046, + "k1_kl": 0.034423828125, + "k3_kl": 0.017578125, + "kimi_kl": 0.044677734375, + "learning_rate": 3.108e-07, + "loss": 0.0013, + "ppl": 0.0028228759765625, + "reward": 0.9960596561431885, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9960597157478333, "step": 1892, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 176.0, - "delta_ref_entropy_loss": 0.0240936279296875, - "delta_ref_ppl": -0.03244781494140625, - "entropy_loss": -0.01824951171875, - "epoch": 0.7572, - "grad_norm": 0.012347867325106857, - "k1_kl": 0.0324554443359375, - "k3_kl": 0.019969940185546875, - "kimi_kl": 0.047840118408203125, - "learning_rate": 1.214e-07, - "loss": 0.0008, - "ppl": 0.009765625, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": -8.736338713788427e-06, + "completion_length": 574.0, + "delta_ref_entropy_loss": 0.1318359375, + "delta_ref_ppl": -0.1357421875, + "entropy_loss": -0.173828125, + "epoch": 0.3786, + "grad_norm": 1.8489182138424343, + "k1_kl": 0.1357421875, + "k3_kl": 0.0859375, + "kimi_kl": 0.287109375, + "learning_rate": 3.1069999999999994e-07, + "loss": 0.0035, + "ppl": 0.09130859375, + "reward": 0.8483790159225464, + "reward_std": 0.00282135047018528, + "rewards/perpo_ocr_edit_distance_reward": 0.8483790755271912, "step": 1893, "temperature": 0.9 }, { - "advantages": -5.272882663120981e-05, - "completion_length": 483.5, - "delta_ref_entropy_loss": 0.037353515625, - "delta_ref_ppl": -0.047698974609375, - "entropy_loss": -0.03558349609375, - "epoch": 0.7576, - "grad_norm": 1.8252788562036972, - "k1_kl": 0.047698974609375, - "k3_kl": 0.03778076171875, - "kimi_kl": 0.12213134765625, - "learning_rate": 1.212e-07, - "loss": 0.0016, - "ppl": 0.0223388671875, - "reward": 0.99625563621521, - "reward_std": 0.001546087849419564, - "rewards/perpo_ocr_edit_distance_reward": 0.9962556958198547, + "advantages": -1.1094979527115356e-05, + "completion_length": 846.0, + "delta_ref_entropy_loss": 0.0732421875, + "delta_ref_ppl": -0.053955078125, + "entropy_loss": -0.1884765625, + "epoch": 0.3788, + "grad_norm": 6.298376100455271, + "k1_kl": 0.0537109375, + "k3_kl": 0.033935546875, + "kimi_kl": 0.0673828125, + "learning_rate": 3.106e-07, + "loss": 0.0014, + "ppl": 0.10107421875, + "reward": 0.9342471957206726, + "reward_std": 0.004511591512709856, + "rewards/perpo_ocr_edit_distance_reward": 0.9342473149299622, "step": 1894, "temperature": 0.9 }, { - "advantages": 1.242331161677157e-05, - "completion_length": 1106.0, - "delta_ref_entropy_loss": 0.019809722900390625, - "delta_ref_ppl": -0.03887939453125, - "entropy_loss": -0.086761474609375, - "epoch": 0.758, - "grad_norm": 1.8031068392365095, - "k1_kl": 0.0389404296875, - "k3_kl": 0.02606201171875, - "kimi_kl": 0.0777587890625, - "learning_rate": 1.2099999999999998e-07, - "loss": 0.001, - "ppl": 0.0429840087890625, - "reward": 0.8751202821731567, - "reward_std": 0.05523370820446871, - "rewards/perpo_ocr_edit_distance_reward": 0.8751202523708344, - "step": 1895, - "temperature": 0.9 - }, - { - "advantages": -6.114585266914219e-05, - "completion_length": 315.5, - "delta_ref_entropy_loss": 0.0699462890625, - "delta_ref_ppl": -0.0528564453125, - "entropy_loss": -0.0435791015625, - "epoch": 0.7584, - "grad_norm": 0.26591244889860466, - "k1_kl": 0.052978515625, - "k3_kl": 0.02978515625, - "kimi_kl": 0.0963134765625, - "learning_rate": 1.208e-07, - "loss": 0.0013, - "ppl": 0.015777587890625, - "reward": 0.9320046603679657, - "reward_std": 0.00029812162392772734, - "rewards/perpo_ocr_edit_distance_reward": 0.9320046901702881, + "advantages": -0.00014867953723296523, + "completion_length": 661.0, + "delta_ref_entropy_loss": 0.04345703125, + "delta_ref_ppl": -0.0615234375, + "entropy_loss": -0.0152587890625, + "epoch": 0.379, + "grad_norm": 0.18347169573900593, + "k1_kl": 0.061279296875, + "k3_kl": 0.04052734375, + "kimi_kl": 0.1591796875, + "learning_rate": 3.105e-07, + "loss": 0.0018, + "ppl": 0.003448486328125, + "reward": 0.9902398586273193, + "reward_std": 0.00024361687246710062, + "rewards/perpo_ocr_edit_distance_reward": 0.9902399182319641, + "step": 1895, + "temperature": 0.9 + }, + { + "advantages": -9.843281986832153e-06, + "completion_length": 702.0, + "delta_ref_entropy_loss": 0.09033203125, + "delta_ref_ppl": -0.099609375, + "entropy_loss": -0.109375, + "epoch": 0.3792, + "grad_norm": 1.0466356043081813, + "k1_kl": 0.09912109375, + "k3_kl": 0.056640625, + "kimi_kl": 0.142578125, + "learning_rate": 3.104e-07, + "loss": 0.0023, + "ppl": 0.04736328125, + "reward": 0.9307001829147339, + "reward_std": 0.0024917384143918753, + "rewards/perpo_ocr_edit_distance_reward": 0.9307001829147339, "step": 1896, "temperature": 0.9 }, { - "advantages": -2.443790468653617e-06, - "completion_length": 657.5, - "delta_ref_entropy_loss": 0.047119140625, - "delta_ref_ppl": -0.0445556640625, - "entropy_loss": -0.03082275390625, - "epoch": 0.7588, - "grad_norm": 0.6074384012451901, - "k1_kl": 0.0445556640625, - "k3_kl": 0.027313232421875, - "kimi_kl": 0.05859375, - "learning_rate": 1.2059999999999998e-07, - "loss": 0.0011, - "ppl": 0.0137939453125, - "reward": 0.9612533748149872, - "reward_std": 0.003609794919611886, - "rewards/perpo_ocr_edit_distance_reward": 0.961253434419632, + "advantages": -9.788785973796621e-05, + "completion_length": 1165.0, + "delta_ref_entropy_loss": 0.033935546875, + "delta_ref_ppl": -0.03857421875, + "entropy_loss": -0.03515625, + "epoch": 0.3794, + "grad_norm": 0.4331231821481683, + "k1_kl": 0.038818359375, + "k3_kl": 0.02099609375, + "kimi_kl": 0.0576171875, + "learning_rate": 3.103e-07, + "loss": 0.0009, + "ppl": 0.01177978515625, + "reward": 0.9949173331260681, + "reward_std": 0.0006827549077570438, + "rewards/perpo_ocr_edit_distance_reward": 0.9949173927307129, "step": 1897, "temperature": 0.9 }, { - "advantages": -0.00042961325380019844, - "completion_length": 413.5, - "delta_ref_entropy_loss": 0.048583984375, - "delta_ref_ppl": -0.03448486328125, - "entropy_loss": -0.03704833984375, - "epoch": 0.7592, - "grad_norm": 0.26011565850382284, - "k1_kl": 0.03460693359375, - "k3_kl": 0.01885986328125, - "kimi_kl": 0.0533447265625, - "learning_rate": 1.204e-07, - "loss": 0.0012, - "ppl": 0.015350341796875, - "reward": 0.997925728559494, - "reward_std": 9.559964382788166e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9979258179664612, + "advantages": 0.0, + "completion_length": 294.0, + "delta_ref_entropy_loss": 0.08740234375, + "delta_ref_ppl": -0.11376953125, + "entropy_loss": -0.0693359375, + "epoch": 0.3796, + "grad_norm": 0.5516271823916707, + "k1_kl": 0.11376953125, + "k3_kl": 0.07666015625, + "kimi_kl": 0.255859375, + "learning_rate": 3.1019999999999996e-07, + "loss": 0.0031, + "ppl": 0.02734375, + "reward": 0.9805309176445007, + "reward_std": 0.0017699050949886441, + "rewards/perpo_ocr_edit_distance_reward": 0.9805309772491455, "step": 1898, "temperature": 0.9 }, { - "advantages": -0.0005960464477539062, - "completion_length": 143.0, - "delta_ref_entropy_loss": 0.0550537109375, - "delta_ref_ppl": -0.05108642578125, - "entropy_loss": -0.025146484375, - "epoch": 0.7596, - "grad_norm": 0.03608022784319604, - "k1_kl": 0.05084228515625, - "k3_kl": 0.031097412109375, - "kimi_kl": 0.081787109375, - "learning_rate": 1.202e-07, - "loss": 0.0018, - "ppl": 0.012420654296875, - "reward": 0.9303143620491028, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.93031445145607, + "advantages": -7.0163187047000974e-06, + "completion_length": 322.0, + "delta_ref_entropy_loss": 0.04052734375, + "delta_ref_ppl": -0.09033203125, + "entropy_loss": -0.08154296875, + "epoch": 0.3798, + "grad_norm": 1.4153154824187848, + "k1_kl": 0.0908203125, + "k3_kl": 0.060302734375, + "kimi_kl": 0.1513671875, + "learning_rate": 3.1009999999999995e-07, + "loss": 0.0024, + "ppl": 0.034912109375, + "reward": 0.974590539932251, + "reward_std": 0.0059754373505711555, + "rewards/perpo_ocr_edit_distance_reward": 0.9745905995368958, "step": 1899, "temperature": 0.9 }, { - "advantages": -0.00031819513969821855, - "completion_length": 114.0, - "delta_ref_entropy_loss": 0.083251953125, - "delta_ref_ppl": -0.08642578125, - "entropy_loss": -0.0772705078125, - "epoch": 0.76, - "grad_norm": 1.0999396954123495, - "k1_kl": 0.08642578125, - "k3_kl": 0.0604248046875, - "kimi_kl": 0.13671875, - "learning_rate": 1.2e-07, - "loss": 0.0027, - "ppl": 0.04779052734375, - "reward": 0.9892154335975647, - "reward_std": 0.0002665477222763002, - "rewards/perpo_ocr_edit_distance_reward": 0.9892155230045319, + "advantages": -2.6753972633741796e-05, + "completion_length": 579.0, + "delta_ref_entropy_loss": 0.05517578125, + "delta_ref_ppl": -0.06005859375, + "entropy_loss": -0.03515625, + "epoch": 0.38, + "grad_norm": 0.6571107083808295, + "k1_kl": 0.060302734375, + "k3_kl": 0.037841796875, + "kimi_kl": 0.1279296875, + "learning_rate": 3.1e-07, + "loss": 0.0015, + "ppl": 0.0159912109375, + "reward": 0.997096598148346, + "reward_std": 0.0018101329915225506, + "rewards/perpo_ocr_edit_distance_reward": 0.9970966577529907, "step": 1900, "temperature": 0.9 }, { - "advantages": -0.0003059080672755954, - "completion_length": 660.0, - "delta_ref_entropy_loss": 0.027374267578125, - "delta_ref_ppl": -0.048919677734375, - "entropy_loss": -0.03955078125, - "epoch": 0.7604, - "grad_norm": 0.3805512068800977, - "k1_kl": 0.049407958984375, - "k3_kl": 0.03412628173828125, - "kimi_kl": 0.1238250732421875, - "learning_rate": 1.198e-07, - "loss": 0.0017, - "ppl": 0.0167083740234375, - "reward": 0.9692490696907043, - "reward_std": 0.0007598224328830838, - "rewards/perpo_ocr_edit_distance_reward": 0.9692491292953491, + "advantages": -8.596693078288808e-05, + "completion_length": 710.0, + "delta_ref_entropy_loss": 0.033935546875, + "delta_ref_ppl": -0.0390625, + "entropy_loss": -0.0301513671875, + "epoch": 0.3802, + "grad_norm": 0.43881495706581514, + "k1_kl": 0.0390625, + "k3_kl": 0.022705078125, + "kimi_kl": 0.064453125, + "learning_rate": 3.099e-07, + "loss": 0.001, + "ppl": 0.01104736328125, + "reward": 0.9963583946228027, + "reward_std": 0.0007913660374470055, + "rewards/perpo_ocr_edit_distance_reward": 0.9963584542274475, "step": 1901, "temperature": 0.9 }, { - "advantages": -1.6685043062292237e-05, - "completion_length": 477.5, - "delta_ref_entropy_loss": 0.0662841796875, - "delta_ref_ppl": -0.058837890625, - "entropy_loss": -0.05517578125, - "epoch": 0.7608, - "grad_norm": 0.7236900365392792, - "k1_kl": 0.058837890625, - "k3_kl": 0.0360107421875, - "kimi_kl": 0.12451171875, - "learning_rate": 1.1959999999999999e-07, - "loss": 0.0015, - "ppl": 0.02386474609375, - "reward": 0.8578585088253021, - "reward_std": 0.004138056363444775, - "rewards/perpo_ocr_edit_distance_reward": 0.8578585088253021, + "advantages": 8.319106200360693e-06, + "completion_length": 640.0, + "delta_ref_entropy_loss": 0.02978515625, + "delta_ref_ppl": -0.049072265625, + "entropy_loss": -0.02783203125, + "epoch": 0.3804, + "grad_norm": 0.8823826310769229, + "k1_kl": 0.049072265625, + "k3_kl": 0.0341796875, + "kimi_kl": 0.09521484375, + "learning_rate": 3.098e-07, + "loss": 0.0014, + "ppl": 0.01361083984375, + "reward": 0.9929278492927551, + "reward_std": 0.002969433320686221, + "rewards/perpo_ocr_edit_distance_reward": 0.9929278492927551, "step": 1902, "temperature": 0.9 }, { - "advantages": -5.132811776320523e-05, - "completion_length": 358.0, - "delta_ref_entropy_loss": 0.05902099609375, - "delta_ref_ppl": -0.12890625, - "entropy_loss": -0.1636962890625, - "epoch": 0.7612, - "grad_norm": 14.014869837262195, - "k1_kl": 0.12890625, - "k3_kl": 0.132568359375, - "kimi_kl": 0.284423828125, - "learning_rate": 1.194e-07, - "loss": 0.0054, - "ppl": 0.0838775634765625, - "reward": 0.6142570227384567, - "reward_std": 0.012831169915443752, - "rewards/perpo_ocr_edit_distance_reward": 0.6142570674419403, + "advantages": -2.4122851755237207e-05, + "completion_length": 1322.0, + "delta_ref_entropy_loss": 0.03515625, + "delta_ref_ppl": -0.039794921875, + "entropy_loss": -0.0244140625, + "epoch": 0.3806, + "grad_norm": 0.3196140767736689, + "k1_kl": 0.039794921875, + "k3_kl": 0.0234375, + "kimi_kl": 0.0732421875, + "learning_rate": 3.097e-07, + "loss": 0.001, + "ppl": 0.009765625, + "reward": 0.9863913059234619, + "reward_std": 0.0013112403685227036, + "rewards/perpo_ocr_edit_distance_reward": 0.9863913655281067, "step": 1903, "temperature": 0.9 }, { - "advantages": -0.00012545075514935888, - "completion_length": 574.0, - "delta_ref_entropy_loss": 0.0255126953125, - "delta_ref_ppl": -0.027587890625, - "entropy_loss": -0.02044677734375, - "epoch": 0.7616, - "grad_norm": 0.19038264703222318, - "k1_kl": 0.02764892578125, - "k3_kl": 0.016357421875, - "kimi_kl": 0.0390625, - "learning_rate": 1.192e-07, - "loss": 0.0008, - "ppl": 0.0086822509765625, - "reward": 0.5288310758769512, - "reward_std": 0.0001501355436630547, - "rewards/perpo_ocr_edit_distance_reward": 0.5288311094045639, + "advantages": -5.104712181491777e-05, + "completion_length": 584.0, + "delta_ref_entropy_loss": 0.072265625, + "delta_ref_ppl": -0.0751953125, + "entropy_loss": -0.07080078125, + "epoch": 0.3808, + "grad_norm": 3.8036410786787362, + "k1_kl": 0.0751953125, + "k3_kl": 0.046630859375, + "kimi_kl": 0.12109375, + "learning_rate": 3.0959999999999997e-07, + "loss": 0.0019, + "ppl": 0.029296875, + "reward": 0.9829868674278259, + "reward_std": 0.0007337824208661914, + "rewards/perpo_ocr_edit_distance_reward": 0.9829869270324707, "step": 1904, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 440.0, - "delta_ref_entropy_loss": 0.0277099609375, - "delta_ref_ppl": -0.02667236328125, - "entropy_loss": -0.01739501953125, - "epoch": 0.762, - "grad_norm": 0.012908798763672054, - "k1_kl": 0.0267333984375, - "k3_kl": 0.017486572265625, - "kimi_kl": 0.051025390625, - "learning_rate": 1.19e-07, - "loss": 0.001, - "ppl": 0.0064239501953125, - "reward": 0.995983898639679, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9959839582443237, + "advantages": -5.143029738974292e-06, + "completion_length": 377.0, + "delta_ref_entropy_loss": 0.059326171875, + "delta_ref_ppl": -0.0986328125, + "entropy_loss": -0.033447265625, + "epoch": 0.381, + "grad_norm": 1.1329699575511591, + "k1_kl": 0.0986328125, + "k3_kl": 0.06982421875, + "kimi_kl": 0.3203125, + "learning_rate": 3.0949999999999996e-07, + "loss": 0.0028, + "ppl": 0.0128173828125, + "reward": 0.9975243806838989, + "reward_std": 0.003202368039637804, + "rewards/perpo_ocr_edit_distance_reward": 0.9975244402885437, "step": 1905, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 142.5, - "delta_ref_entropy_loss": 0.0347900390625, - "delta_ref_ppl": -0.08740234375, - "entropy_loss": -0.034423828125, - "epoch": 0.7624, - "grad_norm": 0.0728654184740881, - "k1_kl": 0.087158203125, - "k3_kl": 0.0633544921875, - "kimi_kl": 0.16357421875, - "learning_rate": 1.1879999999999999e-07, - "loss": 0.0025, - "ppl": 0.02044677734375, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": -2.043587983280304e-07, + "completion_length": 49.0, + "delta_ref_entropy_loss": 0.10107421875, + "delta_ref_ppl": -0.474609375, + "entropy_loss": -0.1640625, + "epoch": 0.3812, + "grad_norm": 7.198027695653508, + "k1_kl": 0.474609375, + "k3_kl": 0.40234375, + "kimi_kl": 1.4765625, + "learning_rate": 3.094e-07, + "loss": 0.0161, + "ppl": 0.083984375, + "reward": 0.36854350566864014, + "reward_std": 0.13199792802333832, + "rewards/perpo_ocr_edit_distance_reward": 0.3685435354709625, "step": 1906, "temperature": 0.9 }, { - "advantages": -6.594828521944152e-06, - "completion_length": 920.0, - "delta_ref_entropy_loss": 0.13134765625, - "delta_ref_ppl": -0.3048095703125, - "entropy_loss": -0.214599609375, - "epoch": 0.7628, - "grad_norm": 4.984245969936164, - "k1_kl": 0.3046875, - "k3_kl": 0.2479248046875, - "kimi_kl": 0.824462890625, - "learning_rate": 1.1859999999999999e-07, - "loss": 0.0099, - "ppl": 0.1319580078125, - "reward": 0.5874134376645088, - "reward_std": 0.015943756967317313, - "rewards/perpo_ocr_edit_distance_reward": 0.5874135047197342, + "advantages": 1.4194421055435669e-05, + "completion_length": 431.0, + "delta_ref_entropy_loss": 0.0341796875, + "delta_ref_ppl": -0.052734375, + "entropy_loss": -0.023193359375, + "epoch": 0.3814, + "grad_norm": 0.38461454623016283, + "k1_kl": 0.052734375, + "k3_kl": 0.034423828125, + "kimi_kl": 0.1171875, + "learning_rate": 3.093e-07, + "loss": 0.0014, + "ppl": 0.00677490234375, + "reward": 0.9972189664840698, + "reward_std": 0.00049993657739833, + "rewards/perpo_ocr_edit_distance_reward": 0.9972189664840698, "step": 1907, "temperature": 0.9 }, { - "advantages": -2.2177186110639013e-05, - "completion_length": 537.0, - "delta_ref_entropy_loss": 0.0615234375, - "delta_ref_ppl": -0.0535888671875, - "entropy_loss": -0.05096435546875, - "epoch": 0.7632, - "grad_norm": 2.9303014630643953, - "k1_kl": 0.05322265625, - "k3_kl": 0.04400634765625, - "kimi_kl": 0.085693359375, - "learning_rate": 1.184e-07, - "loss": 0.0018, - "ppl": 0.029144287109375, - "reward": 0.968087911605835, - "reward_std": 0.0005395032858359627, - "rewards/perpo_ocr_edit_distance_reward": 0.968087911605835, + "advantages": -4.13400812249165e-05, + "completion_length": 452.0, + "delta_ref_entropy_loss": 0.0380859375, + "delta_ref_ppl": -0.07080078125, + "entropy_loss": -0.020751953125, + "epoch": 0.3816, + "grad_norm": 0.4450108921069925, + "k1_kl": 0.0703125, + "k3_kl": 0.046875, + "kimi_kl": 0.1806640625, + "learning_rate": 3.0919999999999994e-07, + "loss": 0.0019, + "ppl": 0.006256103515625, + "reward": 0.9987691044807434, + "reward_std": 0.0007237410172820091, + "rewards/perpo_ocr_edit_distance_reward": 0.9987691044807434, "step": 1908, "temperature": 0.9 }, { - "advantages": -2.681676960492041e-05, - "completion_length": 928.5, - "delta_ref_entropy_loss": 0.04254150390625, - "delta_ref_ppl": -0.0391845703125, - "entropy_loss": -0.04205322265625, - "epoch": 0.7636, - "grad_norm": 1.0851811204967998, - "k1_kl": 0.0391845703125, - "k3_kl": 0.024871826171875, - "kimi_kl": 0.0782470703125, - "learning_rate": 1.182e-07, - "loss": 0.001, - "ppl": 0.023040771484375, - "reward": 0.9864234924316406, - "reward_std": 0.0007726751791778952, - "rewards/perpo_ocr_edit_distance_reward": 0.9864234924316406, + "advantages": -2.4250575734185986e-05, + "completion_length": 415.0, + "delta_ref_entropy_loss": 0.107421875, + "delta_ref_ppl": -0.115234375, + "entropy_loss": -0.059814453125, + "epoch": 0.3818, + "grad_norm": 1.1732686789980016, + "k1_kl": 0.115234375, + "k3_kl": 0.0771484375, + "kimi_kl": 0.275390625, + "learning_rate": 3.091e-07, + "loss": 0.0031, + "ppl": 0.0262451171875, + "reward": 0.9883065819740295, + "reward_std": 0.0023575434461236, + "rewards/perpo_ocr_edit_distance_reward": 0.9883067607879639, "step": 1909, "temperature": 0.9 }, { - "advantages": -1.9158636632710113e-07, - "completion_length": 116.5, - "delta_ref_entropy_loss": 0.0562744140625, - "delta_ref_ppl": -0.100341796875, - "entropy_loss": -0.075927734375, - "epoch": 0.764, - "grad_norm": 0.658440946621952, - "k1_kl": 0.100341796875, - "k3_kl": 0.069091796875, - "kimi_kl": 0.20556640625, - "learning_rate": 1.1799999999999998e-07, - "loss": 0.0028, - "ppl": 0.0411376953125, - "reward": 0.8266008794307709, - "reward_std": 0.12419027835130692, - "rewards/perpo_ocr_edit_distance_reward": 0.8266009092330933, + "advantages": -7.084438038873486e-06, + "completion_length": 246.0, + "delta_ref_entropy_loss": 0.1298828125, + "delta_ref_ppl": -0.1689453125, + "entropy_loss": -0.1220703125, + "epoch": 0.382, + "grad_norm": 2.050987255494155, + "k1_kl": 0.169921875, + "k3_kl": 0.1123046875, + "kimi_kl": 0.427734375, + "learning_rate": 3.09e-07, + "loss": 0.0045, + "ppl": 0.05224609375, + "reward": 0.8342649340629578, + "reward_std": 0.0023155841045081615, + "rewards/perpo_ocr_edit_distance_reward": 0.8342649340629578, "step": 1910, "temperature": 0.9 }, { - "advantages": -2.5174448502696123e-05, - "completion_length": 350.5, - "delta_ref_entropy_loss": 0.043212890625, - "delta_ref_ppl": -0.10919189453125, - "entropy_loss": -0.178955078125, - "epoch": 0.7644, - "grad_norm": 2.955818461934196, - "k1_kl": 0.10919189453125, - "k3_kl": 0.10260009765625, - "kimi_kl": 0.32305908203125, - "learning_rate": 1.178e-07, - "loss": 0.0041, - "ppl": 0.08367919921875, - "reward": 0.6683859080076218, - "reward_std": 0.06243192916736007, - "rewards/perpo_ocr_edit_distance_reward": 0.6683859527111053, + "advantages": 4.257474817137563e-09, + "completion_length": 497.0, + "delta_ref_entropy_loss": 0.07275390625, + "delta_ref_ppl": -0.08984375, + "entropy_loss": -0.033203125, + "epoch": 0.3822, + "grad_norm": 0.9844190950224375, + "k1_kl": 0.08984375, + "k3_kl": 0.0556640625, + "kimi_kl": 0.1357421875, + "learning_rate": 3.089e-07, + "loss": 0.0022, + "ppl": 0.0146484375, + "reward": 0.991836667060852, + "reward_std": 0.000725613150279969, + "rewards/perpo_ocr_edit_distance_reward": 0.9918367266654968, "step": 1911, "temperature": 0.9 }, { - "advantages": -0.00011705500946845859, - "completion_length": 262.5, - "delta_ref_entropy_loss": 0.0799560546875, - "delta_ref_ppl": -0.0748291015625, - "entropy_loss": -0.03924560546875, - "epoch": 0.7648, - "grad_norm": 0.21313780085772382, - "k1_kl": 0.0748291015625, - "k3_kl": 0.04681396484375, - "kimi_kl": 0.1326904296875, - "learning_rate": 1.176e-07, - "loss": 0.002, - "ppl": 0.014129638671875, - "reward": 0.9998335540294647, - "reward_std": 0.0001136896971729584, - "rewards/perpo_ocr_edit_distance_reward": 0.9998336136341095, + "advantages": -2.5970595743274316e-05, + "completion_length": 1122.0, + "delta_ref_entropy_loss": 0.0625, + "delta_ref_ppl": -0.05615234375, + "entropy_loss": -0.0537109375, + "epoch": 0.3824, + "grad_norm": 10.242441292353652, + "k1_kl": 0.05615234375, + "k3_kl": 0.044677734375, + "kimi_kl": 0.07275390625, + "learning_rate": 3.088e-07, + "loss": 0.0018, + "ppl": 0.0267333984375, + "reward": 0.9835037589073181, + "reward_std": 0.0008830220904201269, + "rewards/perpo_ocr_edit_distance_reward": 0.9835038185119629, "step": 1912, "temperature": 0.9 }, { - "advantages": -1.6433853033959167e-06, - "completion_length": 479.0, - "delta_ref_entropy_loss": 0.0511474609375, - "delta_ref_ppl": -0.05908203125, - "entropy_loss": -0.06591796875, - "epoch": 0.7652, - "grad_norm": 1.1600407517692408, - "k1_kl": 0.058837890625, - "k3_kl": 0.04058837890625, - "kimi_kl": 0.1119384765625, - "learning_rate": 1.1739999999999999e-07, - "loss": 0.0016, - "ppl": 0.032073974609375, - "reward": 0.970046728849411, - "reward_std": 0.0077425641939044, - "rewards/perpo_ocr_edit_distance_reward": 0.9700467586517334, + "advantages": 3.3038004403351806e-06, + "completion_length": 121.0, + "delta_ref_entropy_loss": 0.0595703125, + "delta_ref_ppl": -0.2412109375, + "entropy_loss": -0.05322265625, + "epoch": 0.3826, + "grad_norm": 2.105568136339155, + "k1_kl": 0.2421875, + "k3_kl": 0.1982421875, + "kimi_kl": 0.89453125, + "learning_rate": 3.0869999999999995e-07, + "loss": 0.0079, + "ppl": 0.0185546875, + "reward": 0.9777325987815857, + "reward_std": 0.002463454147800803, + "rewards/perpo_ocr_edit_distance_reward": 0.9777325391769409, "step": 1913, "temperature": 0.9 }, { - "advantages": -3.849608765449375e-05, - "completion_length": 782.0, - "delta_ref_entropy_loss": 0.01666259765625, - "delta_ref_ppl": -0.01348876953125, - "entropy_loss": -0.020294189453125, - "epoch": 0.7656, - "grad_norm": 0.24805277280327273, - "k1_kl": 0.01348876953125, - "k3_kl": 0.00817108154296875, - "kimi_kl": 0.018524169921875, - "learning_rate": 1.1719999999999999e-07, - "loss": 0.0004, - "ppl": 0.010040283203125, - "reward": 0.9986945688724518, - "reward_std": 0.00041066321136895567, - "rewards/perpo_ocr_edit_distance_reward": 0.998694658279419, + "advantages": -1.5667506886529736e-05, + "completion_length": 1303.0, + "delta_ref_entropy_loss": 0.017822265625, + "delta_ref_ppl": -0.04638671875, + "entropy_loss": -0.0517578125, + "epoch": 0.3828, + "grad_norm": 0.6724144186294909, + "k1_kl": 0.04638671875, + "k3_kl": 0.033447265625, + "kimi_kl": 0.09423828125, + "learning_rate": 3.086e-07, + "loss": 0.0014, + "ppl": 0.0263671875, + "reward": 0.9848781228065491, + "reward_std": 0.003158960724249482, + "rewards/perpo_ocr_edit_distance_reward": 0.9848781824111938, "step": 1914, "temperature": 0.9 }, { - "advantages": -0.00011279753607595921, - "completion_length": 563.0, + "advantages": -3.528594970703125e-05, + "completion_length": 491.0, "delta_ref_entropy_loss": 0.034912109375, - "delta_ref_ppl": -0.01544189453125, - "entropy_loss": -0.029052734375, - "epoch": 0.766, - "grad_norm": 0.65233232174045, - "k1_kl": 0.015472412109375, - "k3_kl": 0.0073394775390625, - "kimi_kl": 0.01312255859375, - "learning_rate": 1.17e-07, - "loss": 0.0004, - "ppl": 0.014495849609375, - "reward": 0.9981307089328766, - "reward_std": 0.0020516696822596714, - "rewards/perpo_ocr_edit_distance_reward": 0.9981307685375214, + "delta_ref_ppl": -0.064453125, + "entropy_loss": -0.0233154296875, + "epoch": 0.383, + "grad_norm": 0.31674371906720944, + "k1_kl": 0.064453125, + "k3_kl": 0.048095703125, + "kimi_kl": 0.185546875, + "learning_rate": 3.085e-07, + "loss": 0.002, + "ppl": 0.007110595703125, + "reward": 0.9899351000785828, + "reward_std": 0.00038251394289545715, + "rewards/perpo_ocr_edit_distance_reward": 0.9899351000785828, "step": 1915, "temperature": 0.9 }, { - "advantages": -4.4001003516314086e-06, - "completion_length": 712.5, - "delta_ref_entropy_loss": 0.02459716796875, - "delta_ref_ppl": -0.03509521484375, - "entropy_loss": -0.0302734375, - "epoch": 0.7664, - "grad_norm": 0.2874245921912131, - "k1_kl": 0.03509521484375, - "k3_kl": 0.02496337890625, - "kimi_kl": 0.08349609375, - "learning_rate": 1.168e-07, - "loss": 0.001, - "ppl": 0.01568603515625, - "reward": 0.9982300102710724, - "reward_std": 0.0004334552795626223, - "rewards/perpo_ocr_edit_distance_reward": 0.9982300102710724, + "advantages": 1.4645713235950097e-05, + "completion_length": 470.0, + "delta_ref_entropy_loss": 0.0634765625, + "delta_ref_ppl": -0.0732421875, + "entropy_loss": -0.0279541015625, + "epoch": 0.3832, + "grad_norm": 2.9929740690337128, + "k1_kl": 0.07373046875, + "k3_kl": 0.04638671875, + "kimi_kl": 0.1484375, + "learning_rate": 3.084e-07, + "loss": 0.0018, + "ppl": 0.01007080078125, + "reward": 0.9972714781761169, + "reward_std": 0.00048127229092642665, + "rewards/perpo_ocr_edit_distance_reward": 0.9972714185714722, "step": 1916, "temperature": 0.9 }, { - "advantages": -3.763607884366138e-05, - "completion_length": 550.0, - "delta_ref_entropy_loss": 0.07318115234375, - "delta_ref_ppl": -0.05877685546875, - "entropy_loss": -0.10003662109375, - "epoch": 0.7668, - "grad_norm": 1.2036514399709386, - "k1_kl": 0.05859375, - "k3_kl": 0.03692626953125, - "kimi_kl": 0.09033203125, - "learning_rate": 1.1659999999999999e-07, - "loss": 0.0015, - "ppl": 0.051971435546875, - "reward": 0.9127131104469299, - "reward_std": 0.0028324707236606628, - "rewards/perpo_ocr_edit_distance_reward": 0.9127130806446075, + "advantages": -1.3777188542007934e-05, + "completion_length": 44.0, + "delta_ref_entropy_loss": 0.0732421875, + "delta_ref_ppl": -0.62109375, + "entropy_loss": -0.134765625, + "epoch": 0.3834, + "grad_norm": 8.619861221150405, + "k1_kl": 0.6171875, + "k3_kl": 0.53125, + "kimi_kl": 3.21875, + "learning_rate": 3.0830000000000003e-07, + "loss": 0.0212, + "ppl": 0.039306640625, + "reward": 0.8579409718513489, + "reward_std": 0.002986140316352248, + "rewards/perpo_ocr_edit_distance_reward": 0.8579409718513489, "step": 1917, "temperature": 0.9 }, { - "advantages": -1.9090516275355185e-05, - "completion_length": 736.0, - "delta_ref_entropy_loss": 0.052978515625, - "delta_ref_ppl": -0.0439453125, - "entropy_loss": -0.0726318359375, - "epoch": 0.7672, - "grad_norm": 1.050662995601741, - "k1_kl": 0.044189453125, - "k3_kl": 0.0263671875, - "kimi_kl": 0.06207275390625, - "learning_rate": 1.164e-07, - "loss": 0.0011, - "ppl": 0.040283203125, - "reward": 0.954380452632904, - "reward_std": 0.01696469628950581, - "rewards/perpo_ocr_edit_distance_reward": 0.9543804824352264, + "advantages": -1.7796244719647802e-05, + "completion_length": 1401.0, + "delta_ref_entropy_loss": 0.037841796875, + "delta_ref_ppl": -0.040771484375, + "entropy_loss": -0.06494140625, + "epoch": 0.3836, + "grad_norm": 0.6609709069900105, + "k1_kl": 0.040771484375, + "k3_kl": 0.023193359375, + "kimi_kl": 0.04638671875, + "learning_rate": 3.0819999999999997e-07, + "loss": 0.0009, + "ppl": 0.0286865234375, + "reward": 0.9800453782081604, + "reward_std": 0.00133510900195688, + "rewards/perpo_ocr_edit_distance_reward": 0.9800453782081604, "step": 1918, "temperature": 0.9 }, { - "advantages": -1.8818038824974792e-06, - "completion_length": 702.5, - "delta_ref_entropy_loss": 0.03546142578125, - "delta_ref_ppl": -0.035888671875, - "entropy_loss": -0.03143310546875, - "epoch": 0.7676, - "grad_norm": 0.4144238254503072, - "k1_kl": 0.03582763671875, - "k3_kl": 0.0250244140625, - "kimi_kl": 0.0947265625, - "learning_rate": 1.162e-07, - "loss": 0.001, - "ppl": 0.0150909423828125, - "reward": 0.9886184930801392, - "reward_std": 0.0033453423529863358, - "rewards/perpo_ocr_edit_distance_reward": 0.9886185228824615, + "advantages": -3.143719368381426e-05, + "completion_length": 612.0, + "delta_ref_entropy_loss": 0.048583984375, + "delta_ref_ppl": -0.06494140625, + "entropy_loss": -0.039794921875, + "epoch": 0.3838, + "grad_norm": 1.0472701568439737, + "k1_kl": 0.064453125, + "k3_kl": 0.04296875, + "kimi_kl": 0.15625, + "learning_rate": 3.0809999999999996e-07, + "loss": 0.0017, + "ppl": 0.020263671875, + "reward": 0.9951126575469971, + "reward_std": 0.0007128069992177188, + "rewards/perpo_ocr_edit_distance_reward": 0.9951126575469971, "step": 1919, "temperature": 0.9 }, { - "advantages": -8.991786671685986e-06, - "completion_length": 607.0, - "delta_ref_entropy_loss": 0.04840087890625, - "delta_ref_ppl": -0.026214599609375, - "entropy_loss": -0.05364990234375, - "epoch": 0.768, - "grad_norm": 0.731531863089334, - "k1_kl": 0.026214599609375, - "k3_kl": 0.013153076171875, - "kimi_kl": 0.02716064453125, - "learning_rate": 1.16e-07, - "loss": 0.0005, - "ppl": 0.025852203369140625, - "reward": 0.8537512421607971, - "reward_std": 0.0006600788328796625, - "rewards/perpo_ocr_edit_distance_reward": 0.8537512421607971, + "advantages": -3.662279777927324e-05, + "completion_length": 1691.0, + "delta_ref_entropy_loss": 0.0235595703125, + "delta_ref_ppl": -0.03515625, + "entropy_loss": -0.039306640625, + "epoch": 0.384, + "grad_norm": 0.4347821685247371, + "k1_kl": 0.03515625, + "k3_kl": 0.0247802734375, + "kimi_kl": 0.056396484375, + "learning_rate": 3.08e-07, + "loss": 0.001, + "ppl": 0.0206298828125, + "reward": 0.9948769211769104, + "reward_std": 0.001294929999858141, + "rewards/perpo_ocr_edit_distance_reward": 0.9948769807815552, "step": 1920, "temperature": 0.9 }, { - "advantages": -1.7029899890275146e-07, - "completion_length": 335.0, - "delta_ref_entropy_loss": 0.05841064453125, - "delta_ref_ppl": -0.0966796875, - "entropy_loss": -0.160736083984375, - "epoch": 0.7684, - "grad_norm": 2.3053758804607947, - "k1_kl": 0.09619140625, - "k3_kl": 0.0771484375, - "kimi_kl": 0.33935546875, - "learning_rate": 1.1579999999999999e-07, - "loss": 0.0031, - "ppl": 0.091796875, - "reward": 0.7479954063892365, - "reward_std": 0.02403931214939803, - "rewards/perpo_ocr_edit_distance_reward": 0.7479954212903976, + "advantages": -6.112030678195879e-05, + "completion_length": 742.0, + "delta_ref_entropy_loss": 0.0712890625, + "delta_ref_ppl": -0.08447265625, + "entropy_loss": -0.056640625, + "epoch": 0.3842, + "grad_norm": 0.7352482766606937, + "k1_kl": 0.08447265625, + "k3_kl": 0.05078125, + "kimi_kl": 0.126953125, + "learning_rate": 3.079e-07, + "loss": 0.0021, + "ppl": 0.0233154296875, + "reward": 0.9878690242767334, + "reward_std": 0.0007358892471529543, + "rewards/perpo_ocr_edit_distance_reward": 0.9878690838813782, "step": 1921, "temperature": 0.9 }, { - "advantages": -8.5877523815725e-05, - "completion_length": 1157.5, - "delta_ref_entropy_loss": 0.02557373046875, - "delta_ref_ppl": -0.0163726806640625, - "entropy_loss": -0.02813720703125, - "epoch": 0.7688, - "grad_norm": 0.40394158510900907, - "k1_kl": 0.016448974609375, - "k3_kl": 0.008453369140625, - "kimi_kl": 0.01617431640625, - "learning_rate": 1.1559999999999999e-07, - "loss": 0.0004, - "ppl": 0.0139007568359375, - "reward": 0.9965089559555054, - "reward_std": 0.00048462653649039567, - "rewards/perpo_ocr_edit_distance_reward": 0.9965089857578278, + "advantages": -1.2551035979413427e-05, + "completion_length": 591.0, + "delta_ref_entropy_loss": 0.08251953125, + "delta_ref_ppl": -0.1005859375, + "entropy_loss": -0.1015625, + "epoch": 0.3844, + "grad_norm": 1.9046837391086724, + "k1_kl": 0.1005859375, + "k3_kl": 0.06396484375, + "kimi_kl": 0.1953125, + "learning_rate": 3.078e-07, + "loss": 0.0026, + "ppl": 0.052734375, + "reward": 0.9807553887367249, + "reward_std": 0.001934137544594705, + "rewards/perpo_ocr_edit_distance_reward": 0.9807553887367249, "step": 1922, "temperature": 0.9 }, { - "advantages": -0.00021964312054478796, - "completion_length": 759.0, - "delta_ref_entropy_loss": 0.018798828125, - "delta_ref_ppl": -0.016632080078125, - "entropy_loss": -0.03155517578125, - "epoch": 0.7692, - "grad_norm": 0.37049778665086186, - "k1_kl": 0.01666259765625, - "k3_kl": 0.00958251953125, - "kimi_kl": 0.017852783203125, - "learning_rate": 1.154e-07, - "loss": 0.0006, - "ppl": 0.0159912109375, - "reward": 0.9990044832229614, - "reward_std": 0.000393913833249826, - "rewards/perpo_ocr_edit_distance_reward": 0.9990045428276062, + "advantages": -7.82012939453125e-05, + "completion_length": 606.0, + "delta_ref_entropy_loss": 0.0439453125, + "delta_ref_ppl": -0.046875, + "entropy_loss": -0.040283203125, + "epoch": 0.3846, + "grad_norm": 0.6005811373753771, + "k1_kl": 0.046630859375, + "k3_kl": 0.0281982421875, + "kimi_kl": 0.08984375, + "learning_rate": 3.077e-07, + "loss": 0.0012, + "ppl": 0.01470947265625, + "reward": 0.9857334494590759, + "reward_std": 0.001206903369165957, + "rewards/perpo_ocr_edit_distance_reward": 0.9857335686683655, "step": 1923, "temperature": 0.9 }, { - "advantages": -6.463698446168564e-05, - "completion_length": 939.5, - "delta_ref_entropy_loss": 0.0224609375, - "delta_ref_ppl": -0.01336669921875, - "entropy_loss": -0.04302978515625, - "epoch": 0.7696, - "grad_norm": 0.7945298393762962, - "k1_kl": 0.0132904052734375, - "k3_kl": 0.007205963134765625, - "kimi_kl": 0.0163116455078125, - "learning_rate": 1.1519999999999999e-07, - "loss": 0.0004, - "ppl": 0.01837158203125, - "reward": 0.9952009320259094, - "reward_std": 0.0006739890013705008, - "rewards/perpo_ocr_edit_distance_reward": 0.9952009916305542, + "advantages": -5.911078187637031e-05, + "completion_length": 357.0, + "delta_ref_entropy_loss": 0.049072265625, + "delta_ref_ppl": -0.08544921875, + "entropy_loss": -0.02734375, + "epoch": 0.3848, + "grad_norm": 0.6422590835332327, + "k1_kl": 0.08544921875, + "k3_kl": 0.060546875, + "kimi_kl": 0.294921875, + "learning_rate": 3.076e-07, + "loss": 0.0025, + "ppl": 0.01104736328125, + "reward": 0.9966917037963867, + "reward_std": 0.0011960240080952644, + "rewards/perpo_ocr_edit_distance_reward": 0.9966918230056763, "step": 1924, "temperature": 0.9 }, { - "advantages": -0.00013280341272547957, - "completion_length": 429.5, - "delta_ref_entropy_loss": 0.0645751953125, - "delta_ref_ppl": -0.1407470703125, - "entropy_loss": -0.1063232421875, - "epoch": 0.77, - "grad_norm": 3.0058770180253127, - "k1_kl": 0.141845703125, - "k3_kl": 0.10491943359375, - "kimi_kl": 0.427978515625, - "learning_rate": 1.15e-07, - "loss": 0.0043, - "ppl": 0.052001953125, - "reward": 0.967744380235672, - "reward_std": 0.0020618847192963585, - "rewards/perpo_ocr_edit_distance_reward": 0.9677444696426392, + "advantages": -4.420961704454385e-05, + "completion_length": 1616.0, + "delta_ref_entropy_loss": 0.00933837890625, + "delta_ref_ppl": -0.0203857421875, + "entropy_loss": -0.032470703125, + "epoch": 0.385, + "grad_norm": 0.46021521794164255, + "k1_kl": 0.0203857421875, + "k3_kl": 0.0164794921875, + "kimi_kl": 0.04150390625, + "learning_rate": 3.0749999999999997e-07, + "loss": 0.0007, + "ppl": 0.01434326171875, + "reward": 0.9943327903747559, + "reward_std": 0.0012484804028645158, + "rewards/perpo_ocr_edit_distance_reward": 0.9943329095840454, "step": 1925, "temperature": 0.9 }, { - "advantages": -1.731089287204668e-05, - "completion_length": 850.0, - "delta_ref_entropy_loss": 0.020751953125, - "delta_ref_ppl": -0.019500732421875, - "entropy_loss": -0.024658203125, - "epoch": 0.7704, - "grad_norm": 0.2650533075814715, - "k1_kl": 0.01959228515625, - "k3_kl": 0.01214599609375, - "kimi_kl": 0.028076171875, - "learning_rate": 1.148e-07, - "loss": 0.0005, - "ppl": 0.014373779296875, - "reward": 0.9987299740314484, - "reward_std": 0.0004419363394845277, - "rewards/perpo_ocr_edit_distance_reward": 0.9987300038337708, + "advantages": -3.040688534383662e-05, + "completion_length": 1117.0, + "delta_ref_entropy_loss": 0.017822265625, + "delta_ref_ppl": -0.02587890625, + "entropy_loss": -0.02099609375, + "epoch": 0.3852, + "grad_norm": 0.4225890463913919, + "k1_kl": 0.0260009765625, + "k3_kl": 0.0179443359375, + "kimi_kl": 0.046142578125, + "learning_rate": 3.074e-07, + "loss": 0.0007, + "ppl": 0.007781982421875, + "reward": 0.9938972592353821, + "reward_std": 0.002700836630538106, + "rewards/perpo_ocr_edit_distance_reward": 0.9938973188400269, "step": 1926, "temperature": 0.9 }, { - "advantages": -1.3174242667446379e-05, - "completion_length": 396.5, - "delta_ref_entropy_loss": 0.0762939453125, - "delta_ref_ppl": -0.0711669921875, - "entropy_loss": -0.085205078125, - "epoch": 0.7708, - "grad_norm": 1.1638014546730184, - "k1_kl": 0.0712890625, - "k3_kl": 0.040771484375, - "kimi_kl": 0.09283447265625, - "learning_rate": 1.146e-07, - "loss": 0.0016, - "ppl": 0.04498291015625, - "reward": 0.981639564037323, - "reward_std": 0.0005951714119873941, - "rewards/perpo_ocr_edit_distance_reward": 0.9816396236419678, + "advantages": -1.4594623280572705e-05, + "completion_length": 340.0, + "delta_ref_entropy_loss": 0.06201171875, + "delta_ref_ppl": -0.109375, + "entropy_loss": -0.054931640625, + "epoch": 0.3854, + "grad_norm": 1.1624403114044357, + "k1_kl": 0.109375, + "k3_kl": 0.07568359375, + "kimi_kl": 0.267578125, + "learning_rate": 3.073e-07, + "loss": 0.003, + "ppl": 0.0230712890625, + "reward": 0.9956602454185486, + "reward_std": 0.0016511420253664255, + "rewards/perpo_ocr_edit_distance_reward": 0.9956603050231934, "step": 1927, "temperature": 0.9 }, { - "advantages": -0.0001177915526113793, - "completion_length": 606.0, - "delta_ref_entropy_loss": 0.0264892578125, - "delta_ref_ppl": -0.02618408203125, - "entropy_loss": -0.0264892578125, - "epoch": 0.7712, - "grad_norm": 0.40752586428427967, - "k1_kl": 0.02630615234375, - "k3_kl": 0.016845703125, - "kimi_kl": 0.0618896484375, - "learning_rate": 1.1439999999999999e-07, - "loss": 0.0008, - "ppl": 0.01080322265625, - "reward": 0.992299497127533, - "reward_std": 0.001712503843009472, - "rewards/perpo_ocr_edit_distance_reward": 0.9922995567321777, + "advantages": -1.021793991640152e-07, + "completion_length": 1211.0, + "delta_ref_entropy_loss": 0.047607421875, + "delta_ref_ppl": -0.060546875, + "entropy_loss": -0.11767578125, + "epoch": 0.3856, + "grad_norm": 1.398924601012395, + "k1_kl": 0.060791015625, + "k3_kl": 0.033935546875, + "kimi_kl": 0.07373046875, + "learning_rate": 3.0719999999999995e-07, + "loss": 0.0014, + "ppl": 0.058349609375, + "reward": 0.780745804309845, + "reward_std": 0.215884268283844, + "rewards/perpo_ocr_edit_distance_reward": 0.7807458639144897, "step": 1928, "temperature": 0.9 }, { - "advantages": -7.878031050267964e-05, - "completion_length": 490.0, - "delta_ref_entropy_loss": 0.0333251953125, - "delta_ref_ppl": -0.0264892578125, - "entropy_loss": -0.0545654296875, - "epoch": 0.7716, - "grad_norm": 1.0329568792378139, - "k1_kl": 0.0264892578125, - "k3_kl": 0.01654052734375, - "kimi_kl": 0.03399658203125, - "learning_rate": 1.1419999999999999e-07, - "loss": 0.0007, - "ppl": 0.02545166015625, - "reward": 0.8754940330982208, - "reward_std": 0.004714188180514611, - "rewards/perpo_ocr_edit_distance_reward": 0.8754940330982208, + "advantages": 0.0, + "completion_length": 902.0, + "delta_ref_entropy_loss": 0.11376953125, + "delta_ref_ppl": -0.11181640625, + "entropy_loss": -0.15234375, + "epoch": 0.3858, + "grad_norm": 3.706193640222785, + "k1_kl": 0.111328125, + "k3_kl": 0.07568359375, + "kimi_kl": 0.2021484375, + "learning_rate": 3.071e-07, + "loss": 0.003, + "ppl": 0.08154296875, + "reward": 0.9386246204376221, + "reward_std": 0.0032085622660815716, + "rewards/perpo_ocr_edit_distance_reward": 0.9386246800422668, "step": 1929, "temperature": 0.9 }, { - "advantages": -0.00011090722182416357, - "completion_length": 961.0, - "delta_ref_entropy_loss": 0.016510009765625, - "delta_ref_ppl": -0.015411376953125, - "entropy_loss": -0.0177001953125, - "epoch": 0.772, - "grad_norm": 0.39746211883582694, - "k1_kl": 0.015411376953125, - "k3_kl": 0.0091705322265625, - "kimi_kl": 0.024688720703125, - "learning_rate": 1.14e-07, - "loss": 0.0005, - "ppl": 0.0082855224609375, - "reward": 0.9993813037872314, - "reward_std": 0.0005198328653932549, - "rewards/perpo_ocr_edit_distance_reward": 0.9993813931941986, + "advantages": -9.294918709201738e-05, + "completion_length": 482.0, + "delta_ref_entropy_loss": 0.0908203125, + "delta_ref_ppl": -0.08056640625, + "entropy_loss": -0.040283203125, + "epoch": 0.386, + "grad_norm": 0.6121657087666522, + "k1_kl": 0.08056640625, + "k3_kl": 0.044189453125, + "kimi_kl": 0.11083984375, + "learning_rate": 3.07e-07, + "loss": 0.0019, + "ppl": 0.010986328125, + "reward": 0.9908565282821655, + "reward_std": 0.000907709589228034, + "rewards/perpo_ocr_edit_distance_reward": 0.9908566474914551, "step": 1930, "temperature": 0.9 }, { - "advantages": -4.9084427246270934e-05, - "completion_length": 677.0, - "delta_ref_entropy_loss": 0.05804443359375, - "delta_ref_ppl": -0.05029296875, - "entropy_loss": -0.05987548828125, - "epoch": 0.7724, - "grad_norm": 0.6538566020877083, - "k1_kl": 0.04998779296875, - "k3_kl": 0.02947998046875, - "kimi_kl": 0.07086181640625, - "learning_rate": 1.1379999999999999e-07, - "loss": 0.0012, - "ppl": 0.03082275390625, - "reward": 0.9426999092102051, - "reward_std": 0.0021356164215831086, - "rewards/perpo_ocr_edit_distance_reward": 0.9426999986171722, + "advantages": -8.433205948676914e-05, + "completion_length": 788.0, + "delta_ref_entropy_loss": 0.056640625, + "delta_ref_ppl": -0.061767578125, + "entropy_loss": -0.035400390625, + "epoch": 0.3862, + "grad_norm": 0.3299278024311092, + "k1_kl": 0.06201171875, + "k3_kl": 0.036865234375, + "kimi_kl": 0.1181640625, + "learning_rate": 3.069e-07, + "loss": 0.0016, + "ppl": 0.01324462890625, + "reward": 0.9947469234466553, + "reward_std": 0.0006067080539651215, + "rewards/perpo_ocr_edit_distance_reward": 0.9947469234466553, "step": 1931, "temperature": 0.9 }, { - "advantages": -2.8525080324470764e-06, - "completion_length": 209.0, - "delta_ref_entropy_loss": 0.1416015625, - "delta_ref_ppl": -0.15478515625, - "entropy_loss": -0.1337890625, - "epoch": 0.7728, - "grad_norm": 1.0818329859728928, - "k1_kl": 0.1552734375, - "k3_kl": 0.0916748046875, - "kimi_kl": 0.21044921875, - "learning_rate": 1.136e-07, - "loss": 0.0037, - "ppl": 0.07958984375, - "reward": 0.940568596124649, - "reward_std": 0.0036943932063877583, - "rewards/perpo_ocr_edit_distance_reward": 0.9405686259269714, + "advantages": -1.1171613550686743e-05, + "completion_length": 260.0, + "delta_ref_entropy_loss": 0.0966796875, + "delta_ref_ppl": -0.2255859375, + "entropy_loss": -0.150390625, + "epoch": 0.3864, + "grad_norm": 2.200106420332435, + "k1_kl": 0.2255859375, + "k3_kl": 0.1708984375, + "kimi_kl": 0.6328125, + "learning_rate": 3.068e-07, + "loss": 0.0068, + "ppl": 0.068359375, + "reward": 0.6367918848991394, + "reward_std": 0.005232573486864567, + "rewards/perpo_ocr_edit_distance_reward": 0.6367919445037842, "step": 1932, "temperature": 0.9 }, { - "advantages": -3.2280173854815075e-05, - "completion_length": 408.5, - "delta_ref_entropy_loss": 0.03399658203125, - "delta_ref_ppl": -0.0413818359375, - "entropy_loss": -0.02630615234375, - "epoch": 0.7732, - "grad_norm": 0.6449023245539495, - "k1_kl": 0.0413818359375, - "k3_kl": 0.02886962890625, - "kimi_kl": 0.106689453125, - "learning_rate": 1.134e-07, - "loss": 0.0012, - "ppl": 0.013641357421875, - "reward": 0.9984441995620728, - "reward_std": 0.0007487257535103709, - "rewards/perpo_ocr_edit_distance_reward": 0.9984442591667175, + "advantages": -6.984813080634922e-05, + "completion_length": 532.0, + "delta_ref_entropy_loss": 0.0751953125, + "delta_ref_ppl": -0.091796875, + "entropy_loss": -0.059814453125, + "epoch": 0.3866, + "grad_norm": 1.35797224561711, + "k1_kl": 0.09130859375, + "k3_kl": 0.052001953125, + "kimi_kl": 0.1337890625, + "learning_rate": 3.0669999999999996e-07, + "loss": 0.0022, + "ppl": 0.0283203125, + "reward": 0.9926697015762329, + "reward_std": 0.0009972741827368736, + "rewards/perpo_ocr_edit_distance_reward": 0.9926697015762329, "step": 1933, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 433.5, - "delta_ref_entropy_loss": 0.0302734375, - "delta_ref_ppl": -0.035888671875, - "entropy_loss": -0.014739990234375, - "epoch": 0.7736, - "grad_norm": 0.019387202555016615, - "k1_kl": 0.0360107421875, - "k3_kl": 0.02813720703125, - "kimi_kl": 0.15234375, - "learning_rate": 1.132e-07, - "loss": 0.0011, - "ppl": 0.0056304931640625, - "reward": 0.8495049774646759, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.8495049774646759, - "step": 1934, - "temperature": 0.9 - }, - { - "advantages": -4.938670826959424e-06, - "completion_length": 416.5, - "delta_ref_entropy_loss": 0.0384521484375, - "delta_ref_ppl": -0.041900634765625, - "entropy_loss": -0.086669921875, - "epoch": 0.774, - "grad_norm": 0.9301813290364562, - "k1_kl": 0.041900634765625, - "k3_kl": 0.0276641845703125, - "kimi_kl": 0.05316162109375, - "learning_rate": 1.1299999999999999e-07, - "loss": 0.0011, - "ppl": 0.04522705078125, - "reward": 0.9427144229412079, - "reward_std": 0.0038349272217601538, - "rewards/perpo_ocr_edit_distance_reward": 0.9427144527435303, + "advantages": -0.0001584036072017625, + "completion_length": 543.0, + "delta_ref_entropy_loss": 0.056396484375, + "delta_ref_ppl": -0.08056640625, + "entropy_loss": -0.030517578125, + "epoch": 0.3868, + "grad_norm": 0.42512325214169455, + "k1_kl": 0.08056640625, + "k3_kl": 0.05029296875, + "kimi_kl": 0.171875, + "learning_rate": 3.0659999999999995e-07, + "loss": 0.0022, + "ppl": 0.00958251953125, + "reward": 0.9959450364112854, + "reward_std": 0.000330086681060493, + "rewards/perpo_ocr_edit_distance_reward": 0.995945155620575, + "step": 1934, + "temperature": 0.9 + }, + { + "advantages": -1.0865075637411792e-05, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.028564453125, + "delta_ref_ppl": -0.035888671875, + "entropy_loss": -0.1240234375, + "epoch": 0.387, + "grad_norm": 5.688251994991708, + "k1_kl": 0.035888671875, + "k3_kl": 0.11865234375, + "kimi_kl": 0.076171875, + "learning_rate": 3.065e-07, + "loss": 0.0047, + "ppl": 0.083984375, + "reward": 0.8240993618965149, + "reward_std": 0.003040329786017537, + "rewards/perpo_ocr_edit_distance_reward": 0.8240994215011597, "step": 1935, "temperature": 0.9 }, { - "advantages": -1.24105395116203e-06, - "completion_length": 1309.5, - "delta_ref_entropy_loss": 0.03125, - "delta_ref_ppl": -0.028076171875, - "entropy_loss": -0.0478515625, - "epoch": 0.7744, - "grad_norm": 0.92072720715717, - "k1_kl": 0.02813720703125, - "k3_kl": 0.02008056640625, - "kimi_kl": 0.049560546875, - "learning_rate": 1.1279999999999999e-07, - "loss": 0.0008, - "ppl": 0.0335693359375, - "reward": 0.5489441901445389, - "reward_std": 0.008531932020559907, - "rewards/perpo_ocr_edit_distance_reward": 0.5489441677927971, + "advantages": -7.986171112861484e-05, + "completion_length": 1147.0, + "delta_ref_entropy_loss": 0.035888671875, + "delta_ref_ppl": -0.028564453125, + "entropy_loss": -0.0272216796875, + "epoch": 0.3872, + "grad_norm": 0.44421990890528845, + "k1_kl": 0.028564453125, + "k3_kl": 0.0126953125, + "kimi_kl": 0.03369140625, + "learning_rate": 3.064e-07, + "loss": 0.0006, + "ppl": 0.00799560546875, + "reward": 0.9983597993850708, + "reward_std": 0.00043303731945343316, + "rewards/perpo_ocr_edit_distance_reward": 0.9983599185943604, "step": 1936, "temperature": 0.9 }, { - "advantages": -0.0001138278457801789, - "completion_length": 631.5, - "delta_ref_entropy_loss": 0.02203369140625, - "delta_ref_ppl": -0.017181396484375, - "entropy_loss": -0.022125244140625, - "epoch": 0.7748, - "grad_norm": 0.9947462370440294, - "k1_kl": 0.017181396484375, - "k3_kl": 0.01027679443359375, - "kimi_kl": 0.0335540771484375, - "learning_rate": 1.126e-07, - "loss": 0.0005, - "ppl": 0.00982666015625, - "reward": 0.9974296391010284, - "reward_std": 0.0005748766707256436, - "rewards/perpo_ocr_edit_distance_reward": 0.9974297285079956, + "advantages": -5.011899338569492e-05, + "completion_length": 932.0, + "delta_ref_entropy_loss": 0.02294921875, + "delta_ref_ppl": -0.032958984375, + "entropy_loss": -0.024169921875, + "epoch": 0.3874, + "grad_norm": 0.35410553885157997, + "k1_kl": 0.032958984375, + "k3_kl": 0.0220947265625, + "kimi_kl": 0.06005859375, + "learning_rate": 3.063e-07, + "loss": 0.0009, + "ppl": 0.00872802734375, + "reward": 0.9954267144203186, + "reward_std": 0.0007495195604860783, + "rewards/perpo_ocr_edit_distance_reward": 0.9954267740249634, "step": 1937, "temperature": 0.9 }, { - "advantages": -4.696846372098662e-05, - "completion_length": 775.5, - "delta_ref_entropy_loss": 0.036376953125, - "delta_ref_ppl": -0.0250244140625, - "entropy_loss": -0.02093505859375, - "epoch": 0.7752, - "grad_norm": 0.40615737568321686, - "k1_kl": 0.0250244140625, - "k3_kl": 0.012359619140625, - "kimi_kl": 0.02569580078125, - "learning_rate": 1.124e-07, - "loss": 0.0005, - "ppl": 0.01025390625, - "reward": 0.9974993467330933, - "reward_std": 8.595049439463764e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.997499406337738, + "advantages": -1.9993101886939257e-05, + "completion_length": 455.0, + "delta_ref_entropy_loss": 0.06884765625, + "delta_ref_ppl": -0.0947265625, + "entropy_loss": -0.050048828125, + "epoch": 0.3876, + "grad_norm": 1.2841813766286865, + "k1_kl": 0.0947265625, + "k3_kl": 0.068359375, + "kimi_kl": 0.21875, + "learning_rate": 3.0620000000000003e-07, + "loss": 0.0028, + "ppl": 0.0250244140625, + "reward": 0.9689712524414062, + "reward_std": 0.0020313069690018892, + "rewards/perpo_ocr_edit_distance_reward": 0.968971312046051, "step": 1938, "temperature": 0.9 }, { - "advantages": -8.821487426757812e-06, - "completion_length": 793.5, - "delta_ref_entropy_loss": 0.031890869140625, - "delta_ref_ppl": -0.030029296875, - "entropy_loss": -0.040435791015625, - "epoch": 0.7756, - "grad_norm": 0.566557347596995, - "k1_kl": 0.02996826171875, - "k3_kl": 0.020751953125, - "kimi_kl": 0.0648193359375, - "learning_rate": 1.1219999999999999e-07, - "loss": 0.0008, - "ppl": 0.023120880126953125, - "reward": 0.9910880327224731, - "reward_std": 0.0006739071104675531, - "rewards/perpo_ocr_edit_distance_reward": 0.9910880625247955, + "advantages": -1.444135432393523e-05, + "completion_length": 832.0, + "delta_ref_entropy_loss": 0.038818359375, + "delta_ref_ppl": -0.036376953125, + "entropy_loss": -0.02197265625, + "epoch": 0.3878, + "grad_norm": 0.3198643866483991, + "k1_kl": 0.036376953125, + "k3_kl": 0.021728515625, + "kimi_kl": 0.06396484375, + "learning_rate": 3.0609999999999997e-07, + "loss": 0.0009, + "ppl": 0.0064697265625, + "reward": 0.9944890737533569, + "reward_std": 0.0004888560506515205, + "rewards/perpo_ocr_edit_distance_reward": 0.9944890737533569, "step": 1939, "temperature": 0.9 }, { - "advantages": -2.1202225610750247e-06, - "completion_length": 787.5, - "delta_ref_entropy_loss": 0.05340576171875, - "delta_ref_ppl": -0.05230712890625, - "entropy_loss": -0.054595947265625, - "epoch": 0.776, - "grad_norm": 1.2947557280256776, - "k1_kl": 0.05230712890625, - "k3_kl": 0.033416748046875, - "kimi_kl": 0.083984375, - "learning_rate": 1.12e-07, - "loss": 0.0013, - "ppl": 0.032958984375, - "reward": 0.9198419153690338, - "reward_std": 0.09788050223141909, - "rewards/perpo_ocr_edit_distance_reward": 0.9198419749736786, + "advantages": -0.00020970618061255664, + "completion_length": 1021.0, + "delta_ref_entropy_loss": 0.053466796875, + "delta_ref_ppl": -0.049560546875, + "entropy_loss": -0.033447265625, + "epoch": 0.388, + "grad_norm": 1.8370690193077597, + "k1_kl": 0.049560546875, + "k3_kl": 0.047119140625, + "kimi_kl": 0.0966796875, + "learning_rate": 3.0599999999999996e-07, + "loss": 0.0021, + "ppl": 0.0185546875, + "reward": 0.98968505859375, + "reward_std": 0.0003872454399242997, + "rewards/perpo_ocr_edit_distance_reward": 0.9896851181983948, "step": 1940, "temperature": 0.9 }, { - "advantages": -6.568432036146987e-05, - "completion_length": 671.0, - "delta_ref_entropy_loss": 0.058837890625, - "delta_ref_ppl": -0.06982421875, - "entropy_loss": -0.0750732421875, - "epoch": 0.7764, - "grad_norm": 0.8187284269500311, - "k1_kl": 0.07000732421875, - "k3_kl": 0.048095703125, - "kimi_kl": 0.1641845703125, - "learning_rate": 1.118e-07, - "loss": 0.002, - "ppl": 0.041748046875, - "reward": 0.8482154011726379, - "reward_std": 0.0012080353044439107, - "rewards/perpo_ocr_edit_distance_reward": 0.8482154905796051, + "advantages": -7.976804772624746e-05, + "completion_length": 623.0, + "delta_ref_entropy_loss": 0.1240234375, + "delta_ref_ppl": -0.11181640625, + "entropy_loss": -0.103515625, + "epoch": 0.3882, + "grad_norm": 1.0351278785262807, + "k1_kl": 0.111328125, + "k3_kl": 0.07080078125, + "kimi_kl": 0.251953125, + "learning_rate": 3.059e-07, + "loss": 0.0029, + "ppl": 0.046142578125, + "reward": 0.9360666871070862, + "reward_std": 0.0005403736722655594, + "rewards/perpo_ocr_edit_distance_reward": 0.936066746711731, "step": 1941, "temperature": 0.9 }, { - "advantages": -5.071929990663193e-05, - "completion_length": 938.5, - "delta_ref_entropy_loss": 0.02166748046875, - "delta_ref_ppl": -0.019683837890625, - "entropy_loss": -0.06439208984375, - "epoch": 0.7768, - "grad_norm": 0.5594795413296245, - "k1_kl": 0.01971435546875, - "k3_kl": 0.013336181640625, - "kimi_kl": 0.02484130859375, - "learning_rate": 1.116e-07, - "loss": 0.0006, - "ppl": 0.03558349609375, - "reward": 0.9981676340103149, - "reward_std": 0.0004432389250723645, - "rewards/perpo_ocr_edit_distance_reward": 0.9981676936149597, + "advantages": -4.054819146404043e-05, + "completion_length": 688.0, + "delta_ref_entropy_loss": 0.058349609375, + "delta_ref_ppl": -0.057861328125, + "entropy_loss": -0.037109375, + "epoch": 0.3884, + "grad_norm": 0.4957640846980769, + "k1_kl": 0.057861328125, + "k3_kl": 0.0291748046875, + "kimi_kl": 0.078125, + "learning_rate": 3.058e-07, + "loss": 0.0012, + "ppl": 0.01318359375, + "reward": 0.99627685546875, + "reward_std": 0.0007397125009447336, + "rewards/perpo_ocr_edit_distance_reward": 0.9962769150733948, "step": 1942, "temperature": 0.9 }, { - "advantages": -3.328068123664707e-05, - "completion_length": 308.0, - "delta_ref_entropy_loss": 0.0836181640625, - "delta_ref_ppl": -0.1282958984375, - "entropy_loss": -0.06988525390625, - "epoch": 0.7772, - "grad_norm": 0.23481478820862037, - "k1_kl": 0.1282958984375, - "k3_kl": 0.1008148193359375, - "kimi_kl": 0.441070556640625, - "learning_rate": 1.1139999999999999e-07, - "loss": 0.0041, - "ppl": 0.036468505859375, - "reward": 0.9999705255031586, - "reward_std": 7.792868564138189e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.999970555305481, + "advantages": -0.0005960464477539062, + "completion_length": 357.0, + "delta_ref_entropy_loss": 0.0546875, + "delta_ref_ppl": -0.06005859375, + "entropy_loss": -0.0224609375, + "epoch": 0.3886, + "grad_norm": 0.010456716761376338, + "k1_kl": 0.06005859375, + "k3_kl": 0.03369140625, + "kimi_kl": 0.08984375, + "learning_rate": 3.057e-07, + "loss": 0.0019, + "ppl": 0.0042724609375, + "reward": 0.9934640526771545, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9934641122817993, "step": 1943, "temperature": 0.9 }, { - "advantages": -0.00018074257661737647, - "completion_length": 518.5, - "delta_ref_entropy_loss": 0.034423828125, - "delta_ref_ppl": -0.025634765625, - "entropy_loss": -0.0343017578125, - "epoch": 0.7776, - "grad_norm": 0.46055379630662335, - "k1_kl": 0.025634765625, - "k3_kl": 0.0136871337890625, - "kimi_kl": 0.0382080078125, - "learning_rate": 1.1119999999999999e-07, - "loss": 0.0007, - "ppl": 0.0174560546875, - "reward": 0.9267257750034332, - "reward_std": 0.014743178217031527, - "rewards/perpo_ocr_edit_distance_reward": 0.9267258048057556, + "advantages": -0.0005960464477539062, + "completion_length": 673.0, + "delta_ref_entropy_loss": 0.0250244140625, + "delta_ref_ppl": -0.039306640625, + "entropy_loss": -0.0181884765625, + "epoch": 0.3888, + "grad_norm": 0.027332087524300004, + "k1_kl": 0.039306640625, + "k3_kl": 0.027587890625, + "kimi_kl": 0.11865234375, + "learning_rate": 3.056e-07, + "loss": 0.0017, + "ppl": 0.0064697265625, + "reward": 0.9988980889320374, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9988981485366821, "step": 1944, "temperature": 0.9 }, { - "advantages": -2.8857163535178643e-05, - "completion_length": 701.0, - "delta_ref_entropy_loss": 0.03436279296875, - "delta_ref_ppl": -0.0306396484375, - "entropy_loss": -0.08087158203125, - "epoch": 0.778, - "grad_norm": 0.6361455559671998, - "k1_kl": 0.03057861328125, - "k3_kl": 0.018463134765625, - "kimi_kl": 0.0535888671875, - "learning_rate": 1.11e-07, + "advantages": -4.7990255552576855e-05, + "completion_length": 1535.0, + "delta_ref_entropy_loss": 0.0181884765625, + "delta_ref_ppl": -0.02978515625, + "entropy_loss": -0.035888671875, + "epoch": 0.389, + "grad_norm": 0.6700048880665284, + "k1_kl": 0.0299072265625, + "k3_kl": 0.0198974609375, + "kimi_kl": 0.04736328125, + "learning_rate": 3.055e-07, "loss": 0.0008, - "ppl": 0.0360107421875, - "reward": 0.9233072400093079, - "reward_std": 0.06486466956266668, - "rewards/perpo_ocr_edit_distance_reward": 0.9233072400093079, + "ppl": 0.01531982421875, + "reward": 0.9967939257621765, + "reward_std": 0.0013195544015616179, + "rewards/perpo_ocr_edit_distance_reward": 0.9967939853668213, "step": 1945, "temperature": 0.9 }, { - "advantages": -7.85078350418189e-06, - "completion_length": 484.5, - "delta_ref_entropy_loss": 0.0777587890625, - "delta_ref_ppl": -0.071533203125, - "entropy_loss": -0.09619140625, - "epoch": 0.7784, - "grad_norm": 1.8540164738804987, - "k1_kl": 0.071533203125, - "k3_kl": 0.0474853515625, - "kimi_kl": 0.115478515625, - "learning_rate": 1.1079999999999999e-07, - "loss": 0.0019, - "ppl": 0.062255859375, - "reward": 0.9696846008300781, - "reward_std": 0.0057995261158794165, - "rewards/perpo_ocr_edit_distance_reward": 0.9696846604347229, + "advantages": -9.97952065517893e-06, + "completion_length": 789.0, + "delta_ref_entropy_loss": 0.05859375, + "delta_ref_ppl": -0.07421875, + "entropy_loss": -0.039306640625, + "epoch": 0.3892, + "grad_norm": 0.7198345787406555, + "k1_kl": 0.07421875, + "k3_kl": 0.045654296875, + "kimi_kl": 0.1640625, + "learning_rate": 3.0539999999999997e-07, + "loss": 0.0018, + "ppl": 0.0179443359375, + "reward": 0.9867172241210938, + "reward_std": 0.0007528560818172991, + "rewards/perpo_ocr_edit_distance_reward": 0.9867172837257385, "step": 1946, "temperature": 0.9 }, { - "advantages": -2.0759447124873986e-05, - "completion_length": 657.0, - "delta_ref_entropy_loss": 0.04443359375, - "delta_ref_ppl": -0.0445556640625, - "entropy_loss": -0.08026123046875, - "epoch": 0.7788, - "grad_norm": 1.0413746422147625, - "k1_kl": 0.044677734375, - "k3_kl": 0.03155517578125, - "kimi_kl": 0.0986328125, - "learning_rate": 1.106e-07, - "loss": 0.0013, - "ppl": 0.0423583984375, - "reward": 0.965702474117279, - "reward_std": 0.0024394334759563208, - "rewards/perpo_ocr_edit_distance_reward": 0.9657025039196014, + "advantages": -1.021793991640152e-06, + "completion_length": 852.0, + "delta_ref_entropy_loss": 0.1044921875, + "delta_ref_ppl": -0.10595703125, + "entropy_loss": -0.07666015625, + "epoch": 0.3894, + "grad_norm": 0.9808975913686362, + "k1_kl": 0.1064453125, + "k3_kl": 0.064453125, + "kimi_kl": 0.166015625, + "learning_rate": 3.053e-07, + "loss": 0.0026, + "ppl": 0.0299072265625, + "reward": 0.3303276300430298, + "reward_std": 0.004109231289476156, + "rewards/perpo_ocr_edit_distance_reward": 0.3303276598453522, "step": 1947, "temperature": 0.9 }, { - "advantages": -0.00011046443978557363, - "completion_length": 482.5, - "delta_ref_entropy_loss": 0.02679443359375, - "delta_ref_ppl": -0.02081298828125, - "entropy_loss": -0.02606201171875, - "epoch": 0.7792, - "grad_norm": 0.39690948718331037, - "k1_kl": 0.0208740234375, - "k3_kl": 0.01116943359375, - "kimi_kl": 0.025634765625, - "learning_rate": 1.104e-07, - "loss": 0.0006, - "ppl": 0.0120849609375, - "reward": 0.9978224039077759, - "reward_std": 0.00032509486482013017, - "rewards/perpo_ocr_edit_distance_reward": 0.997822493314743, + "advantages": -2.588544703030493e-06, + "completion_length": 276.0, + "delta_ref_entropy_loss": 0.0615234375, + "delta_ref_ppl": -0.126953125, + "entropy_loss": -0.054931640625, + "epoch": 0.3896, + "grad_norm": 1.4288221137902342, + "k1_kl": 0.126953125, + "k3_kl": 0.08935546875, + "kimi_kl": 0.291015625, + "learning_rate": 3.052e-07, + "loss": 0.0036, + "ppl": 0.02392578125, + "reward": 0.033160898834466934, + "reward_std": 0.001754709635861218, + "rewards/perpo_ocr_edit_distance_reward": 0.03316090255975723, "step": 1948, "temperature": 0.9 }, { - "advantages": -6.986516154938727e-06, - "completion_length": 846.5, - "delta_ref_entropy_loss": 0.080322265625, - "delta_ref_ppl": -0.05987548828125, - "entropy_loss": -0.0919189453125, - "epoch": 0.7796, - "grad_norm": 1.1604340358241454, - "k1_kl": 0.06011962890625, - "k3_kl": 0.03204345703125, - "kimi_kl": 0.068359375, - "learning_rate": 1.1020000000000001e-07, - "loss": 0.0013, - "ppl": 0.05340576171875, - "reward": 0.9235097765922546, - "reward_std": 0.005247113178484142, - "rewards/perpo_ocr_edit_distance_reward": 0.9235098361968994, + "advantages": -7.493155749216385e-07, + "completion_length": 1125.0, + "delta_ref_entropy_loss": 0.028564453125, + "delta_ref_ppl": -0.06103515625, + "entropy_loss": -0.2099609375, + "epoch": 0.3898, + "grad_norm": 3.793735461007662, + "k1_kl": 0.06103515625, + "k3_kl": 0.044677734375, + "kimi_kl": 0.0888671875, + "learning_rate": 3.0509999999999995e-07, + "loss": 0.0018, + "ppl": 0.10986328125, + "reward": 0.7017677426338196, + "reward_std": 0.05745299533009529, + "rewards/perpo_ocr_edit_distance_reward": 0.7017678022384644, "step": 1949, "temperature": 0.9 }, { - "advantages": -0.0003126689371129032, - "completion_length": 641.0, - "delta_ref_entropy_loss": 0.02069091796875, - "delta_ref_ppl": -0.0562744140625, - "entropy_loss": -0.022857666015625, - "epoch": 0.78, - "grad_norm": 0.2542109483323442, - "k1_kl": 0.056243896484375, - "k3_kl": 0.03948974609375, - "kimi_kl": 0.104736328125, - "learning_rate": 1.0999999999999999e-07, - "loss": 0.0019, - "ppl": 0.00995635986328125, - "reward": 0.9534127414226532, - "reward_std": 0.00024070817744359374, - "rewards/perpo_ocr_edit_distance_reward": 0.9534128308296204, + "advantages": 6.914139248692663e-06, + "completion_length": 1335.0, + "delta_ref_entropy_loss": 0.0234375, + "delta_ref_ppl": -0.041259765625, + "entropy_loss": -0.057373046875, + "epoch": 0.39, + "grad_norm": 0.5405441372068815, + "k1_kl": 0.041259765625, + "k3_kl": 0.030517578125, + "kimi_kl": 0.07470703125, + "learning_rate": 3.05e-07, + "loss": 0.0012, + "ppl": 0.0255126953125, + "reward": 0.9358956813812256, + "reward_std": 0.004825289361178875, + "rewards/perpo_ocr_edit_distance_reward": 0.9358956813812256, "step": 1950, "temperature": 0.9 }, { - "advantages": -3.402573929633945e-05, - "completion_length": 512.5, - "delta_ref_entropy_loss": 0.05511474609375, - "delta_ref_ppl": -0.041259765625, - "entropy_loss": -0.04473876953125, - "epoch": 0.7804, - "grad_norm": 0.6920460407284373, - "k1_kl": 0.0413818359375, - "k3_kl": 0.0262451171875, - "kimi_kl": 0.06451416015625, - "learning_rate": 1.0979999999999999e-07, - "loss": 0.0011, - "ppl": 0.0243377685546875, - "reward": 0.9805576801300049, - "reward_std": 0.003042541560716927, - "rewards/perpo_ocr_edit_distance_reward": 0.980557769536972, + "advantages": -3.711666431627236e-05, + "completion_length": 1463.0, + "delta_ref_entropy_loss": 0.080078125, + "delta_ref_ppl": -0.06298828125, + "entropy_loss": -0.1591796875, + "epoch": 0.3902, + "grad_norm": 1.172672703224631, + "k1_kl": 0.06298828125, + "k3_kl": 0.051513671875, + "kimi_kl": 0.0654296875, + "learning_rate": 3.049e-07, + "loss": 0.0021, + "ppl": 0.08251953125, + "reward": 0.9275985956192017, + "reward_std": 0.0021934076212346554, + "rewards/perpo_ocr_edit_distance_reward": 0.9275987148284912, "step": 1951, "temperature": 0.9 }, { - "advantages": 2.0165528823667955e-05, - "completion_length": 917.0, - "delta_ref_entropy_loss": 0.0234375, - "delta_ref_ppl": -0.0189208984375, - "entropy_loss": -0.0560302734375, - "epoch": 0.7808, - "grad_norm": 1.7605776165397864, - "k1_kl": 0.01885986328125, - "k3_kl": 0.013580322265625, - "kimi_kl": 0.020599365234375, - "learning_rate": 1.096e-07, - "loss": 0.0005, - "ppl": 0.025634765625, - "reward": 0.857001394033432, - "reward_std": 0.06237657048041001, - "rewards/perpo_ocr_edit_distance_reward": 0.8570014536380768, + "advantages": -6.335122634482104e-06, + "completion_length": 551.0, + "delta_ref_entropy_loss": 0.11279296875, + "delta_ref_ppl": -0.12451171875, + "entropy_loss": -0.125, + "epoch": 0.3904, + "grad_norm": 2.1906252572691134, + "k1_kl": 0.12451171875, + "k3_kl": 0.07861328125, + "kimi_kl": 0.26953125, + "learning_rate": 3.048e-07, + "loss": 0.0032, + "ppl": 0.05712890625, + "reward": 0.9038394689559937, + "reward_std": 0.013387775048613548, + "rewards/perpo_ocr_edit_distance_reward": 0.9038395881652832, "step": 1952, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 823.5, - "delta_ref_entropy_loss": 0.0189208984375, - "delta_ref_ppl": -0.011871337890625, - "entropy_loss": -0.01898193359375, - "epoch": 0.7812, - "grad_norm": 0.2524922575268395, - "k1_kl": 0.0118408203125, - "k3_kl": 0.0062103271484375, - "kimi_kl": 0.0148468017578125, - "learning_rate": 1.0939999999999999e-07, - "loss": 0.0002, - "ppl": 0.0084075927734375, - "reward": 0.9992530941963196, - "reward_std": 0.001063487259671092, - "rewards/perpo_ocr_edit_distance_reward": 0.9992530941963196, + "advantages": -1.3589859918283764e-05, + "completion_length": 268.0, + "delta_ref_entropy_loss": 0.10302734375, + "delta_ref_ppl": -0.1318359375, + "entropy_loss": -0.083984375, + "epoch": 0.3906, + "grad_norm": 1.8314991521708583, + "k1_kl": 0.1318359375, + "k3_kl": 0.08984375, + "kimi_kl": 0.302734375, + "learning_rate": 3.047e-07, + "loss": 0.0036, + "ppl": 0.033935546875, + "reward": 0.9739863872528076, + "reward_std": 0.0024114460684359074, + "rewards/perpo_ocr_edit_distance_reward": 0.9739863872528076, "step": 1953, "temperature": 0.9 }, { - "advantages": 1.3662236597156152e-05, - "completion_length": 644.5, - "delta_ref_entropy_loss": 0.02764892578125, - "delta_ref_ppl": -0.032470703125, - "entropy_loss": -0.0316162109375, - "epoch": 0.7816, - "grad_norm": 0.3852558863864151, - "k1_kl": 0.032470703125, - "k3_kl": 0.0230712890625, - "kimi_kl": 0.075927734375, - "learning_rate": 1.092e-07, - "loss": 0.0009, - "ppl": 0.017333984375, - "reward": 0.9685279130935669, - "reward_std": 0.00041743527981452644, - "rewards/perpo_ocr_edit_distance_reward": 0.9685279428958893, + "advantages": 3.3038004403351806e-06, + "completion_length": 244.0, + "delta_ref_entropy_loss": 0.08251953125, + "delta_ref_ppl": -0.193359375, + "entropy_loss": -0.060302734375, + "epoch": 0.3908, + "grad_norm": 2.5958935835196124, + "k1_kl": 0.193359375, + "k3_kl": 0.140625, + "kimi_kl": 0.55859375, + "learning_rate": 3.0459999999999996e-07, + "loss": 0.0056, + "ppl": 0.0228271484375, + "reward": 0.9814456105232239, + "reward_std": 0.002495962893590331, + "rewards/perpo_ocr_edit_distance_reward": 0.9814456701278687, "step": 1954, "temperature": 0.9 }, { - "advantages": 1.021793991640152e-07, - "completion_length": 314.0, - "delta_ref_entropy_loss": -0.0379638671875, - "delta_ref_ppl": -0.219970703125, - "entropy_loss": -0.29779052734375, - "epoch": 0.782, - "grad_norm": 8.058512813225187, - "k1_kl": 0.219970703125, - "k3_kl": 0.190185546875, - "kimi_kl": 0.6513671875, - "learning_rate": 1.09e-07, - "loss": 0.0076, - "ppl": 0.14635467529296875, - "reward": 0.7917743921279907, - "reward_std": 0.04789400100708008, - "rewards/perpo_ocr_edit_distance_reward": 0.7917743921279907, + "advantages": -1.730237818264868e-05, + "completion_length": 412.0, + "delta_ref_entropy_loss": 0.15625, + "delta_ref_ppl": -0.1328125, + "entropy_loss": -0.1015625, + "epoch": 0.391, + "grad_norm": 1.13734090692488, + "k1_kl": 0.1318359375, + "k3_kl": 0.07470703125, + "kimi_kl": 0.1484375, + "learning_rate": 3.0449999999999995e-07, + "loss": 0.003, + "ppl": 0.0458984375, + "reward": 0.9317507147789001, + "reward_std": 0.003345129080116749, + "rewards/perpo_ocr_edit_distance_reward": 0.9317507743835449, "step": 1955, "temperature": 0.9 }, { - "advantages": -3.0125891044008313e-05, - "completion_length": 419.0, - "delta_ref_entropy_loss": 0.024641036987304688, - "delta_ref_ppl": -0.0823974609375, - "entropy_loss": -0.0616455078125, - "epoch": 0.7824, - "grad_norm": 1.9302653775537741, - "k1_kl": 0.0823974609375, - "k3_kl": 0.06475830078125, - "kimi_kl": 0.252197265625, - "learning_rate": 1.088e-07, - "loss": 0.0026, - "ppl": 0.034912109375, - "reward": 0.9106855392456055, - "reward_std": 0.010358025378081948, - "rewards/perpo_ocr_edit_distance_reward": 0.910685658454895, + "advantages": 2.1287373641598606e-08, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.06396484375, + "delta_ref_ppl": -0.06298828125, + "entropy_loss": -0.2109375, + "epoch": 0.3912, + "grad_norm": 2.019484034922337, + "k1_kl": 0.06298828125, + "k3_kl": 0.048583984375, + "kimi_kl": 0.109375, + "learning_rate": 3.044e-07, + "loss": 0.0019, + "ppl": 0.11376953125, + "reward": 0.518511176109314, + "reward_std": 0.2404264658689499, + "rewards/perpo_ocr_edit_distance_reward": 0.5185112357139587, "step": 1956, "temperature": 0.9 }, { - "advantages": -8.514949456639442e-08, - "completion_length": 1220.0, - "delta_ref_entropy_loss": 0.00164794921875, - "delta_ref_ppl": -0.0244140625, - "entropy_loss": -0.122955322265625, - "epoch": 0.7828, - "grad_norm": 1.4806134077385142, - "k1_kl": 0.02459716796875, - "k3_kl": 0.017822265625, - "kimi_kl": 0.0440673828125, - "learning_rate": 1.0859999999999999e-07, - "loss": 0.0007, - "ppl": 0.0555877685546875, - "reward": 0.702429011464119, - "reward_std": 0.07779783010482788, - "rewards/perpo_ocr_edit_distance_reward": 0.7024290263652802, + "advantages": -3.0181239708326757e-05, + "completion_length": 433.0, + "delta_ref_entropy_loss": 0.02978515625, + "delta_ref_ppl": -0.103515625, + "entropy_loss": -0.034423828125, + "epoch": 0.3914, + "grad_norm": 0.8070660596280542, + "k1_kl": 0.103515625, + "k3_kl": 0.0830078125, + "kimi_kl": 0.373046875, + "learning_rate": 3.043e-07, + "loss": 0.0033, + "ppl": 0.013671875, + "reward": 0.9903217554092407, + "reward_std": 0.003003461519256234, + "rewards/perpo_ocr_edit_distance_reward": 0.9903218746185303, "step": 1957, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 458.0, - "delta_ref_entropy_loss": 0.0274658203125, - "delta_ref_ppl": -0.023193359375, - "entropy_loss": -0.01531982421875, - "epoch": 0.7832, - "grad_norm": 0.11312333940382574, - "k1_kl": 0.0233154296875, - "k3_kl": 0.0137939453125, - "kimi_kl": 0.03094482421875, - "learning_rate": 1.0839999999999999e-07, - "loss": 0.0006, - "ppl": 0.0066680908203125, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": -6.251675949897617e-05, + "completion_length": 668.0, + "delta_ref_entropy_loss": 0.07861328125, + "delta_ref_ppl": -0.080078125, + "entropy_loss": -0.057373046875, + "epoch": 0.3916, + "grad_norm": 0.6637346814055943, + "k1_kl": 0.080078125, + "k3_kl": 0.045654296875, + "kimi_kl": 0.12158203125, + "learning_rate": 3.0420000000000004e-07, + "loss": 0.0019, + "ppl": 0.0198974609375, + "reward": 0.9703369140625, + "reward_std": 0.0015346024883911014, + "rewards/perpo_ocr_edit_distance_reward": 0.9703369736671448, "step": 1958, "temperature": 0.9 }, { - "advantages": -9.604863180356915e-06, - "completion_length": 410.0, - "delta_ref_entropy_loss": 0.0621337890625, - "delta_ref_ppl": -0.0770263671875, - "entropy_loss": -0.0556640625, - "epoch": 0.7836, - "grad_norm": 0.8394369176110666, - "k1_kl": 0.0770263671875, - "k3_kl": 0.047210693359375, - "kimi_kl": 0.1483154296875, - "learning_rate": 1.082e-07, - "loss": 0.0019, - "ppl": 0.02496337890625, - "reward": 0.9962136447429657, - "reward_std": 0.003230139147490263, - "rewards/perpo_ocr_edit_distance_reward": 0.9962137043476105, + "advantages": -1.3623919130623108e-06, + "completion_length": 1543.0, + "delta_ref_entropy_loss": 0.025146484375, + "delta_ref_ppl": -0.04638671875, + "entropy_loss": -0.1455078125, + "epoch": 0.3918, + "grad_norm": 4.2361226999780595, + "k1_kl": 0.046630859375, + "k3_kl": 0.037109375, + "kimi_kl": 0.06787109375, + "learning_rate": 3.041e-07, + "loss": 0.0015, + "ppl": 0.06787109375, + "reward": 0.8830709457397461, + "reward_std": 0.03755251318216324, + "rewards/perpo_ocr_edit_distance_reward": 0.8830710053443909, "step": 1959, "temperature": 0.9 }, { - "advantages": -2.5136131398539874e-05, - "completion_length": 370.5, - "delta_ref_entropy_loss": 0.043212890625, - "delta_ref_ppl": -0.021453857421875, - "entropy_loss": -0.03369140625, - "epoch": 0.784, - "grad_norm": 0.6858022867370517, - "k1_kl": 0.02142333984375, - "k3_kl": 0.0103912353515625, - "kimi_kl": 0.01861572265625, - "learning_rate": 1.0799999999999999e-07, - "loss": 0.0004, - "ppl": 0.01513671875, - "reward": 0.9955715835094452, - "reward_std": 0.000903660838957876, - "rewards/perpo_ocr_edit_distance_reward": 0.99557164311409, + "advantages": -5.3984781516192015e-06, + "completion_length": 928.0, + "delta_ref_entropy_loss": 0.047119140625, + "delta_ref_ppl": -0.049072265625, + "entropy_loss": -0.032958984375, + "epoch": 0.392, + "grad_norm": 2.665110184650244, + "k1_kl": 0.048828125, + "k3_kl": 0.0257568359375, + "kimi_kl": 0.05615234375, + "learning_rate": 3.0399999999999997e-07, + "loss": 0.001, + "ppl": 0.01300048828125, + "reward": 0.9837785363197327, + "reward_std": 0.006206360179930925, + "rewards/perpo_ocr_edit_distance_reward": 0.9837785959243774, "step": 1960, "temperature": 0.9 }, { - "advantages": -5.143455200595781e-05, - "completion_length": 321.5, - "delta_ref_entropy_loss": 0.03924560546875, - "delta_ref_ppl": -0.0686187744140625, - "entropy_loss": -0.02069091796875, - "epoch": 0.7844, - "grad_norm": 0.2391109814483104, - "k1_kl": 0.0686187744140625, - "k3_kl": 0.04979705810546875, - "kimi_kl": 0.179412841796875, - "learning_rate": 1.078e-07, - "loss": 0.002, - "ppl": 0.0110626220703125, - "reward": 0.9999406635761261, - "reward_std": 0.00015697158232796937, - "rewards/perpo_ocr_edit_distance_reward": 0.9999406933784485, + "advantages": -1.5241760138451355e-06, + "completion_length": 495.0, + "delta_ref_entropy_loss": 0.07421875, + "delta_ref_ppl": -0.1064453125, + "entropy_loss": -0.0771484375, + "epoch": 0.3922, + "grad_norm": 0.9926221021471439, + "k1_kl": 0.1064453125, + "k3_kl": 0.0693359375, + "kimi_kl": 0.2255859375, + "learning_rate": 3.039e-07, + "loss": 0.0028, + "ppl": 0.03173828125, + "reward": 0.0437302440404892, + "reward_std": 0.0006083775078877807, + "rewards/perpo_ocr_edit_distance_reward": 0.0437302440404892, "step": 1961, "temperature": 0.9 }, { - "advantages": -1.2419053746270947e-05, - "completion_length": 584.5, - "delta_ref_entropy_loss": 0.024322509765625, - "delta_ref_ppl": -0.019989013671875, - "entropy_loss": -0.0220947265625, - "epoch": 0.7848, - "grad_norm": 0.3128912901127348, - "k1_kl": 0.019989013671875, - "k3_kl": 0.01290130615234375, - "kimi_kl": 0.0357513427734375, - "learning_rate": 1.076e-07, - "loss": 0.0005, - "ppl": 0.01165771484375, - "reward": 0.9991377592086792, - "reward_std": 0.00029288831865414977, - "rewards/perpo_ocr_edit_distance_reward": 0.9991377592086792, + "advantages": -2.043587983280304e-06, + "completion_length": 275.0, + "delta_ref_entropy_loss": 0.087890625, + "delta_ref_ppl": -0.1474609375, + "entropy_loss": -0.1005859375, + "epoch": 0.3924, + "grad_norm": 2.041379109644253, + "k1_kl": 0.1474609375, + "k3_kl": 0.09814453125, + "kimi_kl": 0.27734375, + "learning_rate": 3.038e-07, + "loss": 0.0039, + "ppl": 0.041259765625, + "reward": 0.6616640686988831, + "reward_std": 0.0040601929649710655, + "rewards/perpo_ocr_edit_distance_reward": 0.6616640090942383, "step": 1962, "temperature": 0.9 }, { - "advantages": -6.364925020818646e-06, - "completion_length": 831.5, - "delta_ref_entropy_loss": 0.0870361328125, - "delta_ref_ppl": -0.063720703125, - "entropy_loss": -0.14404296875, - "epoch": 0.7852, - "grad_norm": 1.4995569017773456, - "k1_kl": 0.0638427734375, - "k3_kl": 0.037109375, - "kimi_kl": 0.082275390625, - "learning_rate": 1.074e-07, - "loss": 0.0015, - "ppl": 0.077880859375, - "reward": 0.8288230001926422, - "reward_std": 0.08257393736857921, - "rewards/perpo_ocr_edit_distance_reward": 0.828823059797287, + "advantages": -9.921619493979961e-05, + "completion_length": 482.0, + "delta_ref_entropy_loss": 0.051025390625, + "delta_ref_ppl": -0.09228515625, + "entropy_loss": -0.03076171875, + "epoch": 0.3926, + "grad_norm": 0.663665047127206, + "k1_kl": 0.0927734375, + "k3_kl": 0.06298828125, + "kimi_kl": 0.251953125, + "learning_rate": 3.037e-07, + "loss": 0.0026, + "ppl": 0.0142822265625, + "reward": 0.9964643120765686, + "reward_std": 0.0005008835578337312, + "rewards/perpo_ocr_edit_distance_reward": 0.9964643716812134, "step": 1963, "temperature": 0.9 }, { - "advantages": -8.514949456639442e-08, - "completion_length": 486.0, - "delta_ref_entropy_loss": 0.03460693359375, - "delta_ref_ppl": -0.0269775390625, - "entropy_loss": -0.0302734375, - "epoch": 0.7856, - "grad_norm": 1.1450990419412346, - "k1_kl": 0.027099609375, - "k3_kl": 0.017852783203125, - "kimi_kl": 0.048065185546875, - "learning_rate": 1.072e-07, - "loss": 0.0007, - "ppl": 0.013946533203125, - "reward": 0.7227963358163834, - "reward_std": 0.059486087411642075, - "rewards/perpo_ocr_edit_distance_reward": 0.7227963656187057, + "advantages": -1.65360324899666e-05, + "completion_length": 550.0, + "delta_ref_entropy_loss": 0.0927734375, + "delta_ref_ppl": -0.146484375, + "entropy_loss": -0.08447265625, + "epoch": 0.3928, + "grad_norm": 0.92611512815606, + "k1_kl": 0.146484375, + "k3_kl": 0.09423828125, + "kimi_kl": 0.337890625, + "learning_rate": 3.036e-07, + "loss": 0.0038, + "ppl": 0.04345703125, + "reward": 0.9533191323280334, + "reward_std": 0.002472636057063937, + "rewards/perpo_ocr_edit_distance_reward": 0.9533191919326782, "step": 1964, "temperature": 0.9 }, { - "advantages": -0.0002988747188510388, - "completion_length": 445.5, - "delta_ref_entropy_loss": 0.01025390625, - "delta_ref_ppl": -0.1925048828125, - "entropy_loss": -0.24951171875, - "epoch": 0.786, - "grad_norm": 1.3487528277225147, - "k1_kl": 0.1923828125, - "k3_kl": 0.15673828125, - "kimi_kl": 0.515869140625, - "learning_rate": 1.0699999999999999e-07, - "loss": 0.0065, - "ppl": 0.1627197265625, - "reward": 0.6827544420957565, - "reward_std": 0.014838422648608685, - "rewards/perpo_ocr_edit_distance_reward": 0.6827545166015625, + "advantages": -4.776886726176599e-06, + "completion_length": 28.0, + "delta_ref_entropy_loss": 0.220703125, + "delta_ref_ppl": -1.2109375, + "entropy_loss": -0.322265625, + "epoch": 0.393, + "grad_norm": 15.74564173272069, + "k1_kl": 1.2109375, + "k3_kl": 1.0, + "kimi_kl": 4.1875, + "learning_rate": 3.035e-07, + "loss": 0.04, + "ppl": 0.15234375, + "reward": 0.5193877816200256, + "reward_std": 0.01685992442071438, + "rewards/perpo_ocr_edit_distance_reward": 0.5193878412246704, "step": 1965, "temperature": 0.9 }, { - "advantages": -5.2707536042362335e-06, - "completion_length": 252.0, - "delta_ref_entropy_loss": 0.150146484375, - "delta_ref_ppl": -0.140625, - "entropy_loss": -0.17041015625, - "epoch": 0.7864, - "grad_norm": 3.0823028738022855, - "k1_kl": 0.14013671875, - "k3_kl": 0.082763671875, - "kimi_kl": 0.22021484375, - "learning_rate": 1.068e-07, - "loss": 0.0033, - "ppl": 0.08148193359375, - "reward": 0.5700075477361679, - "reward_std": 0.0030458662658929825, - "rewards/perpo_ocr_edit_distance_reward": 0.5700076073408127, + "advantages": -5.338873506843811e-06, + "completion_length": 812.0, + "delta_ref_entropy_loss": 0.08447265625, + "delta_ref_ppl": -0.11083984375, + "entropy_loss": -0.462890625, + "epoch": 0.3932, + "grad_norm": 2.332828006434374, + "k1_kl": 0.11181640625, + "k3_kl": 0.0693359375, + "kimi_kl": 0.1298828125, + "learning_rate": 3.034e-07, + "loss": 0.0028, + "ppl": 0.283203125, + "reward": 0.8726784586906433, + "reward_std": 0.015842486172914505, + "rewards/perpo_ocr_edit_distance_reward": 0.8726785182952881, "step": 1966, "temperature": 0.9 }, { - "advantages": -6.091594877943862e-05, - "completion_length": 320.5, - "delta_ref_entropy_loss": 0.055419921875, - "delta_ref_ppl": -0.05517578125, - "entropy_loss": -0.03851318359375, - "epoch": 0.7868, - "grad_norm": 0.8536724980371471, - "k1_kl": 0.05517578125, - "k3_kl": 0.03753662109375, - "kimi_kl": 0.14599609375, - "learning_rate": 1.066e-07, - "loss": 0.0016, - "ppl": 0.020355224609375, - "reward": 0.8054209053516388, - "reward_std": 0.00038142989797051996, - "rewards/perpo_ocr_edit_distance_reward": 0.8054209649562836, + "advantages": -1.532690930616809e-06, + "completion_length": 530.0, + "delta_ref_entropy_loss": 0.1513671875, + "delta_ref_ppl": -0.138671875, + "entropy_loss": -0.328125, + "epoch": 0.3934, + "grad_norm": 2.462783916077667, + "k1_kl": 0.138671875, + "k3_kl": 0.08154296875, + "kimi_kl": 0.2392578125, + "learning_rate": 3.033e-07, + "loss": 0.0033, + "ppl": 0.166015625, + "reward": 0.68324214220047, + "reward_std": 0.02232346124947071, + "rewards/perpo_ocr_edit_distance_reward": 0.6832422018051147, "step": 1967, "temperature": 0.9 }, { - "advantages": -6.241458095246344e-06, - "completion_length": 802.0, - "delta_ref_entropy_loss": 0.08544921875, - "delta_ref_ppl": -0.056396484375, - "entropy_loss": -0.1170654296875, - "epoch": 0.7872, - "grad_norm": 1.4854715434206218, - "k1_kl": 0.05615234375, - "k3_kl": 0.0313720703125, - "kimi_kl": 0.0628662109375, - "learning_rate": 1.0639999999999999e-07, + "advantages": -4.13315647165291e-05, + "completion_length": 698.0, + "delta_ref_entropy_loss": 0.04638671875, + "delta_ref_ppl": -0.052734375, + "entropy_loss": -0.043212890625, + "epoch": 0.3936, + "grad_norm": 0.5054939978132342, + "k1_kl": 0.052490234375, + "k3_kl": 0.03173828125, + "kimi_kl": 0.095703125, + "learning_rate": 3.032e-07, "loss": 0.0013, - "ppl": 0.0703887939453125, - "reward": 0.8485352396965027, - "reward_std": 0.011569882743060589, - "rewards/perpo_ocr_edit_distance_reward": 0.8485353291034698, + "ppl": 0.0159912109375, + "reward": 0.9872745871543884, + "reward_std": 0.0011362743098288774, + "rewards/perpo_ocr_edit_distance_reward": 0.987274706363678, "step": 1968, "temperature": 0.9 }, { - "advantages": -6.982684681133833e-05, - "completion_length": 721.0, - "delta_ref_entropy_loss": 0.02978515625, - "delta_ref_ppl": -0.0242919921875, - "entropy_loss": -0.0303955078125, - "epoch": 0.7876, - "grad_norm": 0.7753236057512821, - "k1_kl": 0.02435302734375, - "k3_kl": 0.0142822265625, - "kimi_kl": 0.02874755859375, - "learning_rate": 1.062e-07, - "loss": 0.0006, - "ppl": 0.01617431640625, - "reward": 0.9919362366199493, - "reward_std": 0.0005338966293493286, - "rewards/perpo_ocr_edit_distance_reward": 0.9919363558292389, + "advantages": -2.3756708742439514e-06, + "completion_length": 1170.0, + "delta_ref_entropy_loss": 0.047119140625, + "delta_ref_ppl": -0.064453125, + "entropy_loss": -0.11083984375, + "epoch": 0.3938, + "grad_norm": 2.2512930848885184, + "k1_kl": 0.064453125, + "k3_kl": 0.09375, + "kimi_kl": 0.11865234375, + "learning_rate": 3.0309999999999995e-07, + "loss": 0.0038, + "ppl": 0.061279296875, + "reward": 0.886417031288147, + "reward_std": 0.017744487151503563, + "rewards/perpo_ocr_edit_distance_reward": 0.8864171504974365, "step": 1969, "temperature": 0.9 }, { - "advantages": -0.00011681553132802946, - "completion_length": 709.5, - "delta_ref_entropy_loss": 0.034423828125, - "delta_ref_ppl": -0.0296630859375, - "entropy_loss": -0.025238037109375, - "epoch": 0.788, - "grad_norm": 0.7599449799847403, - "k1_kl": 0.029541015625, - "k3_kl": 0.017547607421875, - "kimi_kl": 0.0472412109375, - "learning_rate": 1.06e-07, - "loss": 0.0008, - "ppl": 0.0129852294921875, - "reward": 0.9960339367389679, - "reward_std": 0.0009600100966054015, - "rewards/perpo_ocr_edit_distance_reward": 0.9960340559482574, + "advantages": -0.00018889564671553671, + "completion_length": 801.0, + "delta_ref_entropy_loss": 0.03857421875, + "delta_ref_ppl": -0.041015625, + "entropy_loss": -0.01953125, + "epoch": 0.394, + "grad_norm": 0.29692101272310706, + "k1_kl": 0.041259765625, + "k3_kl": 0.0240478515625, + "kimi_kl": 0.0751953125, + "learning_rate": 3.03e-07, + "loss": 0.0011, + "ppl": 0.00677490234375, + "reward": 0.9924289584159851, + "reward_std": 0.0002605725603643805, + "rewards/perpo_ocr_edit_distance_reward": 0.9924290180206299, "step": 1970, "temperature": 0.9 }, { - "advantages": -3.0947584491514135e-05, - "completion_length": 781.5, - "delta_ref_entropy_loss": 0.0953369140625, - "delta_ref_ppl": -0.06494140625, - "entropy_loss": -0.12939453125, - "epoch": 0.7884, - "grad_norm": 1.2091157135317838, - "k1_kl": 0.06494140625, - "k3_kl": 0.0399169921875, - "kimi_kl": 0.083984375, - "learning_rate": 1.058e-07, - "loss": 0.0016, - "ppl": 0.07073974609375, - "reward": 0.9177019596099854, - "reward_std": 0.0018914714164566249, - "rewards/perpo_ocr_edit_distance_reward": 0.9177020192146301, + "advantages": -0.00014342581562232226, + "completion_length": 633.0, + "delta_ref_entropy_loss": 0.0712890625, + "delta_ref_ppl": -0.080078125, + "entropy_loss": -0.0281982421875, + "epoch": 0.3942, + "grad_norm": 0.3479664532259167, + "k1_kl": 0.080078125, + "k3_kl": 0.046875, + "kimi_kl": 0.1484375, + "learning_rate": 3.029e-07, + "loss": 0.002, + "ppl": 0.00927734375, + "reward": 0.6776975989341736, + "reward_std": 0.0007309939246624708, + "rewards/perpo_ocr_edit_distance_reward": 0.6776977777481079, "step": 1971, "temperature": 0.9 }, { - "advantages": -1.4347689557325793e-05, - "completion_length": 602.0, - "delta_ref_entropy_loss": 0.0821533203125, - "delta_ref_ppl": -0.06884765625, - "entropy_loss": -0.0816650390625, - "epoch": 0.7888, - "grad_norm": 0.8950331170004794, - "k1_kl": 0.0687255859375, - "k3_kl": 0.04095458984375, - "kimi_kl": 0.11376953125, - "learning_rate": 1.0559999999999999e-07, - "loss": 0.0017, - "ppl": 0.04437255859375, - "reward": 0.9864784181118011, - "reward_std": 0.0013799594598822296, - "rewards/perpo_ocr_edit_distance_reward": 0.9864784479141235, + "advantages": 1.0413783456897363e-05, + "completion_length": 485.0, + "delta_ref_entropy_loss": 0.040283203125, + "delta_ref_ppl": -0.1005859375, + "entropy_loss": -0.033935546875, + "epoch": 0.3944, + "grad_norm": 0.7745218271556303, + "k1_kl": 0.1005859375, + "k3_kl": 0.07666015625, + "kimi_kl": 0.34375, + "learning_rate": 3.028e-07, + "loss": 0.0031, + "ppl": 0.01348876953125, + "reward": 0.7862852811813354, + "reward_std": 0.0015334896743297577, + "rewards/perpo_ocr_edit_distance_reward": 0.7862852811813354, "step": 1972, "temperature": 0.9 }, { - "advantages": -0.0003035749709852098, - "completion_length": 392.0, - "delta_ref_entropy_loss": 0.058349609375, - "delta_ref_ppl": -0.0469970703125, - "entropy_loss": -0.10772705078125, - "epoch": 0.7892, - "grad_norm": 0.9329694400782732, - "k1_kl": 0.0469970703125, - "k3_kl": 0.02728271484375, - "kimi_kl": 0.05712890625, - "learning_rate": 1.0539999999999999e-07, - "loss": 0.0014, - "ppl": 0.061920166015625, - "reward": 0.957028329372406, - "reward_std": 0.004570415709167719, - "rewards/perpo_ocr_edit_distance_reward": 0.9570284187793732, + "advantages": -3.405979782655777e-07, + "completion_length": 110.0, + "delta_ref_entropy_loss": 0.1181640625, + "delta_ref_ppl": -0.60546875, + "entropy_loss": -0.28125, + "epoch": 0.3946, + "grad_norm": 13.051502199391216, + "k1_kl": 0.609375, + "k3_kl": 0.5, + "kimi_kl": 2.203125, + "learning_rate": 3.0270000000000003e-07, + "loss": 0.02, + "ppl": 0.1044921875, + "reward": 0.727034866809845, + "reward_std": 0.10539624094963074, + "rewards/perpo_ocr_edit_distance_reward": 0.7270349264144897, "step": 1973, "temperature": 0.9 }, { - "advantages": -5.1200393500039354e-05, - "completion_length": 283.5, - "delta_ref_entropy_loss": 0.066650390625, - "delta_ref_ppl": -0.059326171875, - "entropy_loss": -0.0416259765625, - "epoch": 0.7896, - "grad_norm": 0.574333912694004, - "k1_kl": 0.059326171875, - "k3_kl": 0.03497314453125, - "kimi_kl": 0.089111328125, - "learning_rate": 1.052e-07, - "loss": 0.0014, - "ppl": 0.0233154296875, - "reward": 0.99798583984375, - "reward_std": 0.00024104301701299846, - "rewards/perpo_ocr_edit_distance_reward": 0.99798583984375, + "advantages": -6.358113023452461e-05, + "completion_length": 1236.0, + "delta_ref_entropy_loss": 0.0230712890625, + "delta_ref_ppl": -0.0230712890625, + "entropy_loss": -0.0189208984375, + "epoch": 0.3948, + "grad_norm": 0.10386013320679802, + "k1_kl": 0.0230712890625, + "k3_kl": 0.01507568359375, + "kimi_kl": 0.052490234375, + "learning_rate": 3.0259999999999997e-07, + "loss": 0.0007, + "ppl": 0.005126953125, + "reward": 0.9935197234153748, + "reward_std": 0.0007034270092844963, + "rewards/perpo_ocr_edit_distance_reward": 0.9935197830200195, "step": 1974, "temperature": 0.9 }, { - "advantages": -0.00030393259885386215, - "completion_length": 534.0, - "delta_ref_entropy_loss": 0.05609130859375, - "delta_ref_ppl": -0.042266845703125, - "entropy_loss": -0.048583984375, - "epoch": 0.79, - "grad_norm": 0.6252423463053047, - "k1_kl": 0.042266845703125, - "k3_kl": 0.024749755859375, - "kimi_kl": 0.053741455078125, - "learning_rate": 1.0499999999999999e-07, - "loss": 0.0013, - "ppl": 0.02996826171875, - "reward": 0.9766765534877777, - "reward_std": 0.0021140221506357193, - "rewards/perpo_ocr_edit_distance_reward": 0.9766766428947449, - "step": 1975, - "temperature": 0.9 - }, - { - "advantages": -1.0664974070095923e-05, - "completion_length": 591.5, - "delta_ref_entropy_loss": 0.03558349609375, - "delta_ref_ppl": -0.0325927734375, - "entropy_loss": -0.0181884765625, - "epoch": 0.7904, - "grad_norm": 0.25986194841875127, - "k1_kl": 0.0325927734375, - "k3_kl": 0.021331787109375, - "kimi_kl": 0.0853271484375, - "learning_rate": 1.048e-07, - "loss": 0.0009, - "ppl": 0.006927490234375, - "reward": 0.9981070756912231, - "reward_std": 0.0009469696087762713, - "rewards/perpo_ocr_edit_distance_reward": 0.9981071054935455, + "advantages": -4.0697203075978905e-05, + "completion_length": 528.0, + "delta_ref_entropy_loss": 0.06689453125, + "delta_ref_ppl": -0.115234375, + "entropy_loss": -0.037353515625, + "epoch": 0.395, + "grad_norm": 0.40348087752146194, + "k1_kl": 0.115234375, + "k3_kl": 0.0810546875, + "kimi_kl": 0.333984375, + "learning_rate": 3.0249999999999996e-07, + "loss": 0.0033, + "ppl": 0.01422119140625, + "reward": 0.9976107478141785, + "reward_std": 0.0009461567969992757, + "rewards/perpo_ocr_edit_distance_reward": 0.9976108074188232, + "step": 1975, + "temperature": 0.9 + }, + { + "advantages": -1.709801836113911e-05, + "completion_length": 198.0, + "delta_ref_entropy_loss": 0.062255859375, + "delta_ref_ppl": -0.20703125, + "entropy_loss": -0.05322265625, + "epoch": 0.3952, + "grad_norm": 1.3837562710704405, + "k1_kl": 0.2060546875, + "k3_kl": 0.1689453125, + "kimi_kl": 0.70703125, + "learning_rate": 3.024e-07, + "loss": 0.0068, + "ppl": 0.026123046875, + "reward": 0.9766483306884766, + "reward_std": 0.002389471745118499, + "rewards/perpo_ocr_edit_distance_reward": 0.9766483902931213, "step": 1976, "temperature": 0.9 }, { - "advantages": -7.833753556951706e-07, - "completion_length": 593.5, - "delta_ref_entropy_loss": 0.06170654296875, - "delta_ref_ppl": -0.04852294921875, - "entropy_loss": -0.11083984375, - "epoch": 0.7908, - "grad_norm": 1.156546495380483, - "k1_kl": 0.04852294921875, - "k3_kl": 0.026763916015625, - "kimi_kl": 0.0589599609375, - "learning_rate": 1.046e-07, - "loss": 0.0011, - "ppl": 0.061065673828125, - "reward": 0.9176516532897949, - "reward_std": 0.013918635435402393, - "rewards/perpo_ocr_edit_distance_reward": 0.9176516830921173, + "advantages": -9.230205250787549e-06, + "completion_length": 394.0, + "delta_ref_entropy_loss": 0.078125, + "delta_ref_ppl": -0.11328125, + "entropy_loss": -0.07763671875, + "epoch": 0.3954, + "grad_norm": 1.8456390944803494, + "k1_kl": 0.11328125, + "k3_kl": 0.07177734375, + "kimi_kl": 0.208984375, + "learning_rate": 3.023e-07, + "loss": 0.0029, + "ppl": 0.033447265625, + "reward": 0.9204429984092712, + "reward_std": 0.007275120820850134, + "rewards/perpo_ocr_edit_distance_reward": 0.9204431176185608, "step": 1977, "temperature": 0.9 }, { - "advantages": -0.0003003392901064217, - "completion_length": 624.0, - "delta_ref_entropy_loss": 0.0860595703125, - "delta_ref_ppl": -0.058349609375, - "entropy_loss": -0.07171630859375, - "epoch": 0.7912, - "grad_norm": 1.0416807695524606, - "k1_kl": 0.058349609375, - "k3_kl": 0.03875732421875, - "kimi_kl": 0.0966796875, - "learning_rate": 1.0440000000000001e-07, - "loss": 0.0018, - "ppl": 0.039031982421875, - "reward": 0.5840404778718948, - "reward_std": 0.0008634412079118192, - "rewards/perpo_ocr_edit_distance_reward": 0.5840405374765396, + "advantages": -3.2356808787881164e-06, + "completion_length": 1558.0, + "delta_ref_entropy_loss": 0.021484375, + "delta_ref_ppl": -0.0361328125, + "entropy_loss": -0.052001953125, + "epoch": 0.3956, + "grad_norm": 1.0746713164783372, + "k1_kl": 0.0361328125, + "k3_kl": 0.0262451171875, + "kimi_kl": 0.055419921875, + "learning_rate": 3.022e-07, + "loss": 0.001, + "ppl": 0.0225830078125, + "reward": 0.9943076968193054, + "reward_std": 0.005215195938944817, + "rewards/perpo_ocr_edit_distance_reward": 0.9943076968193054, "step": 1978, "temperature": 0.9 }, { - "advantages": -6.249972948069171e-06, - "completion_length": 523.5, - "delta_ref_entropy_loss": 0.0458984375, - "delta_ref_ppl": -0.15985107421875, - "entropy_loss": -0.1885986328125, - "epoch": 0.7916, - "grad_norm": 6.633416678798999, - "k1_kl": 0.16082763671875, - "k3_kl": 0.12255859375, - "kimi_kl": 0.435791015625, - "learning_rate": 1.0419999999999999e-07, - "loss": 0.0049, - "ppl": 0.113037109375, - "reward": 0.7450448274612427, - "reward_std": 0.09891320986207575, - "rewards/perpo_ocr_edit_distance_reward": 0.7450448572635651, + "advantages": -5.9638707170961425e-05, + "completion_length": 659.0, + "delta_ref_entropy_loss": 0.06591796875, + "delta_ref_ppl": -0.08642578125, + "entropy_loss": -0.09912109375, + "epoch": 0.3958, + "grad_norm": 1.1549482525800185, + "k1_kl": 0.08642578125, + "k3_kl": 0.0517578125, + "kimi_kl": 0.1328125, + "learning_rate": 3.021e-07, + "loss": 0.0021, + "ppl": 0.05078125, + "reward": 0.9837822914123535, + "reward_std": 0.0014700022293254733, + "rewards/perpo_ocr_edit_distance_reward": 0.9837824106216431, "step": 1979, "temperature": 0.9 }, { - "advantages": -9.011371366796084e-05, - "completion_length": 772.0, - "delta_ref_entropy_loss": 0.033935546875, - "delta_ref_ppl": -0.0284423828125, - "entropy_loss": -0.03094482421875, - "epoch": 0.792, - "grad_norm": 0.275429326574841, - "k1_kl": 0.0284423828125, - "k3_kl": 0.01666259765625, - "kimi_kl": 0.04229736328125, - "learning_rate": 1.0399999999999999e-07, - "loss": 0.0008, - "ppl": 0.012969970703125, - "reward": 0.9984179139137268, - "reward_std": 0.0002308668990735896, - "rewards/perpo_ocr_edit_distance_reward": 0.9984179735183716, + "advantages": -3.8657872210023925e-06, + "completion_length": 664.0, + "delta_ref_entropy_loss": 0.072265625, + "delta_ref_ppl": -0.0791015625, + "entropy_loss": -0.125, + "epoch": 0.396, + "grad_norm": 1.2411053805433312, + "k1_kl": 0.0791015625, + "k3_kl": 0.052001953125, + "kimi_kl": 0.1630859375, + "learning_rate": 3.02e-07, + "loss": 0.0021, + "ppl": 0.056396484375, + "reward": 0.8479271531105042, + "reward_std": 0.004315142519772053, + "rewards/perpo_ocr_edit_distance_reward": 0.8479272127151489, "step": 1980, "temperature": 0.9 }, { - "advantages": -0.00010539804861764424, - "completion_length": 759.0, - "delta_ref_entropy_loss": 0.06689453125, - "delta_ref_ppl": -0.04150390625, - "entropy_loss": -0.0699462890625, - "epoch": 0.7924, - "grad_norm": 1.3600764320710168, - "k1_kl": 0.04150390625, - "k3_kl": 0.023681640625, - "kimi_kl": 0.0572509765625, - "learning_rate": 1.038e-07, - "loss": 0.0011, - "ppl": 0.03857421875, - "reward": 0.9603928327560425, - "reward_std": 0.000740308481908869, - "rewards/perpo_ocr_edit_distance_reward": 0.9603928923606873, + "advantages": -1.0039125299954321e-05, + "completion_length": 495.0, + "delta_ref_entropy_loss": 0.038818359375, + "delta_ref_ppl": -0.0634765625, + "entropy_loss": -0.0390625, + "epoch": 0.3962, + "grad_norm": 1.0770183684545498, + "k1_kl": 0.06298828125, + "k3_kl": 0.041015625, + "kimi_kl": 0.134765625, + "learning_rate": 3.0189999999999997e-07, + "loss": 0.0017, + "ppl": 0.01318359375, + "reward": 0.9690936803817749, + "reward_std": 0.009225589223206043, + "rewards/perpo_ocr_edit_distance_reward": 0.9690937995910645, "step": 1981, "temperature": 0.9 }, { - "advantages": -2.692852774544008e-06, - "completion_length": 442.0, - "delta_ref_entropy_loss": 0.083984375, - "delta_ref_ppl": -0.062744140625, - "entropy_loss": -0.1064453125, - "epoch": 0.7928, - "grad_norm": 1.671053288442309, - "k1_kl": 0.062744140625, - "k3_kl": 0.03533935546875, - "kimi_kl": 0.091552734375, - "learning_rate": 1.0359999999999999e-07, - "loss": 0.0014, - "ppl": 0.052734375, - "reward": 0.8806120157241821, - "reward_std": 0.0748423698823899, - "rewards/perpo_ocr_edit_distance_reward": 0.8806120753288269, + "advantages": -9.613378097128589e-06, + "completion_length": 142.0, + "delta_ref_entropy_loss": 0.150390625, + "delta_ref_ppl": -0.2333984375, + "entropy_loss": -0.0634765625, + "epoch": 0.3964, + "grad_norm": 2.4323283900258854, + "k1_kl": 0.232421875, + "k3_kl": 0.1708984375, + "kimi_kl": 0.55859375, + "learning_rate": 3.018e-07, + "loss": 0.0068, + "ppl": 0.0284423828125, + "reward": 0.9595909714698792, + "reward_std": 0.0025564273819327354, + "rewards/perpo_ocr_edit_distance_reward": 0.9595909714698792, "step": 1982, "temperature": 0.9 }, { - "advantages": -0.00012307082579354756, - "completion_length": 491.5, - "delta_ref_entropy_loss": 0.04974365234375, - "delta_ref_ppl": -0.0277099609375, - "entropy_loss": -0.033233642578125, - "epoch": 0.7932, - "grad_norm": 0.5964158178642543, - "k1_kl": 0.0277099609375, - "k3_kl": 0.01207733154296875, - "kimi_kl": 0.021820068359375, - "learning_rate": 1.034e-07, - "loss": 0.0006, - "ppl": 0.01336669921875, - "reward": 0.9503071010112762, - "reward_std": 0.0005186930065974593, - "rewards/perpo_ocr_edit_distance_reward": 0.950307160615921, + "advantages": -6.0541293350979686e-05, + "completion_length": 784.0, + "delta_ref_entropy_loss": 0.0284423828125, + "delta_ref_ppl": -0.052001953125, + "entropy_loss": -0.035400390625, + "epoch": 0.3966, + "grad_norm": 0.4927729632509572, + "k1_kl": 0.052001953125, + "k3_kl": 0.03466796875, + "kimi_kl": 0.10595703125, + "learning_rate": 3.017e-07, + "loss": 0.0014, + "ppl": 0.01275634765625, + "reward": 0.964454174041748, + "reward_std": 0.0008846409618854523, + "rewards/perpo_ocr_edit_distance_reward": 0.9644542932510376, "step": 1983, "temperature": 0.9 }, { - "advantages": -0.00014058394299354404, - "completion_length": 1241.5, - "delta_ref_entropy_loss": 0.017822265625, - "delta_ref_ppl": -0.01007080078125, - "entropy_loss": -0.020263671875, - "epoch": 0.7936, - "grad_norm": 0.2279587618550105, - "k1_kl": 0.010101318359375, - "k3_kl": 0.00482177734375, - "kimi_kl": 0.00910186767578125, - "learning_rate": 1.032e-07, - "loss": 0.0003, - "ppl": 0.00982666015625, - "reward": 0.9981891512870789, - "reward_std": 0.00018777559307636693, - "rewards/perpo_ocr_edit_distance_reward": 0.9981892704963684, + "advantages": -1.0286059477948584e-05, + "completion_length": 577.0, + "delta_ref_entropy_loss": 0.0361328125, + "delta_ref_ppl": -0.051025390625, + "entropy_loss": -0.0291748046875, + "epoch": 0.3968, + "grad_norm": 2.419524428238621, + "k1_kl": 0.051025390625, + "k3_kl": 0.03564453125, + "kimi_kl": 0.0986328125, + "learning_rate": 3.0159999999999995e-07, + "loss": 0.0014, + "ppl": 0.0135498046875, + "reward": 0.9816145300865173, + "reward_std": 0.006507350131869316, + "rewards/perpo_ocr_edit_distance_reward": 0.9816145896911621, "step": 1984, "temperature": 0.9 }, { - "advantages": -2.5119102247117553e-06, - "completion_length": 524.0, - "delta_ref_entropy_loss": 0.02545166015625, - "delta_ref_ppl": -0.022705078125, - "entropy_loss": -0.0174713134765625, - "epoch": 0.794, - "grad_norm": 0.3089811695212916, - "k1_kl": 0.02276611328125, - "k3_kl": 0.01446533203125, - "kimi_kl": 0.0401611328125, - "learning_rate": 1.03e-07, - "loss": 0.0006, - "ppl": 0.00785064697265625, - "reward": 0.996510237455368, - "reward_std": 0.004176529590040445, - "rewards/perpo_ocr_edit_distance_reward": 0.9965102672576904, + "advantages": -7.520403596572578e-05, + "completion_length": 797.0, + "delta_ref_entropy_loss": 0.044677734375, + "delta_ref_ppl": -0.052490234375, + "entropy_loss": -0.0458984375, + "epoch": 0.397, + "grad_norm": 0.5325164866601552, + "k1_kl": 0.052490234375, + "k3_kl": 0.031982421875, + "kimi_kl": 0.09521484375, + "learning_rate": 3.015e-07, + "loss": 0.0014, + "ppl": 0.01806640625, + "reward": 0.9938146471977234, + "reward_std": 0.0010318398708477616, + "rewards/perpo_ocr_edit_distance_reward": 0.9938147664070129, "step": 1985, "temperature": 0.9 }, { - "advantages": -9.962491276382934e-07, - "completion_length": 620.0, - "delta_ref_entropy_loss": 0.0814208984375, - "delta_ref_ppl": -0.04534912109375, - "entropy_loss": -0.1448974609375, - "epoch": 0.7944, - "grad_norm": 1.3441157882540997, - "k1_kl": 0.04534912109375, - "k3_kl": 0.021392822265625, - "kimi_kl": 0.0360107421875, - "learning_rate": 1.028e-07, - "loss": 0.0009, - "ppl": 0.08331298828125, - "reward": 0.9533669054508209, - "reward_std": 0.00698124198243022, - "rewards/perpo_ocr_edit_distance_reward": 0.9533669352531433, + "advantages": -2.061043596768286e-05, + "completion_length": 561.0, + "delta_ref_entropy_loss": 0.1484375, + "delta_ref_ppl": -0.1474609375, + "entropy_loss": -0.265625, + "epoch": 0.3972, + "grad_norm": 3.3770926872817855, + "k1_kl": 0.1484375, + "k3_kl": 0.09521484375, + "kimi_kl": 0.302734375, + "learning_rate": 3.014e-07, + "loss": 0.0038, + "ppl": 0.1474609375, + "reward": 0.7620008587837219, + "reward_std": 0.002379282843321562, + "rewards/perpo_ocr_edit_distance_reward": 0.7620009183883667, "step": 1986, "temperature": 0.9 }, { - "advantages": -3.4979412703251e-05, - "completion_length": 531.5, - "delta_ref_entropy_loss": 0.0316162109375, - "delta_ref_ppl": -0.043701171875, - "entropy_loss": -0.02496337890625, - "epoch": 0.7948, - "grad_norm": 0.9471221574969512, - "k1_kl": 0.0438232421875, - "k3_kl": 0.03125, - "kimi_kl": 0.10693359375, - "learning_rate": 1.0259999999999999e-07, - "loss": 0.0013, - "ppl": 0.012786865234375, - "reward": 0.9987568259239197, - "reward_std": 0.0005830297595821321, - "rewards/perpo_ocr_edit_distance_reward": 0.9987568557262421, + "advantages": -8.65459514898248e-05, + "completion_length": 534.0, + "delta_ref_entropy_loss": 0.04345703125, + "delta_ref_ppl": -0.054931640625, + "entropy_loss": -0.0245361328125, + "epoch": 0.3974, + "grad_norm": 0.33500164801935534, + "k1_kl": 0.054931640625, + "k3_kl": 0.0322265625, + "kimi_kl": 0.08154296875, + "learning_rate": 3.013e-07, + "loss": 0.0014, + "ppl": 0.0076904296875, + "reward": 0.9978857040405273, + "reward_std": 0.00019519204215612262, + "rewards/perpo_ocr_edit_distance_reward": 0.9978857636451721, "step": 1987, "temperature": 0.9 }, { - "advantages": -0.0001049637867254205, - "completion_length": 318.5, - "delta_ref_entropy_loss": 0.0513916015625, - "delta_ref_ppl": -0.039031982421875, - "entropy_loss": -0.0518798828125, - "epoch": 0.7952, - "grad_norm": 0.7813787867321639, - "k1_kl": 0.039031982421875, - "k3_kl": 0.0240325927734375, - "kimi_kl": 0.051239013671875, - "learning_rate": 1.024e-07, - "loss": 0.0011, - "ppl": 0.031402587890625, - "reward": 0.9730173945426941, - "reward_std": 0.000233923303312622, - "rewards/perpo_ocr_edit_distance_reward": 0.9730174541473389, + "advantages": -9.151867561740801e-05, + "completion_length": 533.0, + "delta_ref_entropy_loss": 0.03515625, + "delta_ref_ppl": -0.0849609375, + "entropy_loss": -0.025634765625, + "epoch": 0.3976, + "grad_norm": 0.4742220667642273, + "k1_kl": 0.0849609375, + "k3_kl": 0.064453125, + "kimi_kl": 0.251953125, + "learning_rate": 3.012e-07, + "loss": 0.0027, + "ppl": 0.011474609375, + "reward": 0.9979691505432129, + "reward_std": 0.0006442957674153149, + "rewards/perpo_ocr_edit_distance_reward": 0.9979692697525024, "step": 1988, "temperature": 0.9 }, { - "advantages": -0.00029811688832381833, - "completion_length": 352.0, - "delta_ref_entropy_loss": 0.07958984375, - "delta_ref_ppl": -0.2001953125, - "entropy_loss": -0.1060791015625, - "epoch": 0.7956, - "grad_norm": 4.843880451497463, - "k1_kl": 0.201171875, - "k3_kl": 0.14764404296875, - "kimi_kl": 0.6129150390625, - "learning_rate": 1.0219999999999999e-07, - "loss": 0.0062, - "ppl": 0.0519561767578125, - "reward": 0.7232269197702408, - "reward_std": 0.08031129837036133, - "rewards/perpo_ocr_edit_distance_reward": 0.7232269942760468, + "advantages": -6.130763949840912e-07, + "completion_length": 1294.0, + "delta_ref_entropy_loss": 0.11865234375, + "delta_ref_ppl": -0.1064453125, + "entropy_loss": -0.25, + "epoch": 0.3978, + "grad_norm": 14.154253102571099, + "k1_kl": 0.1064453125, + "k3_kl": 0.07177734375, + "kimi_kl": 0.18359375, + "learning_rate": 3.0109999999999996e-07, + "loss": 0.0029, + "ppl": 0.13671875, + "reward": 0.4362000823020935, + "reward_std": 0.020592572167515755, + "rewards/perpo_ocr_edit_distance_reward": 0.4362000823020935, "step": 1989, "temperature": 0.9 }, { - "advantages": -0.00011241436732234433, - "completion_length": 591.0, - "delta_ref_entropy_loss": 0.040283203125, - "delta_ref_ppl": -0.0286865234375, - "entropy_loss": -0.0335693359375, - "epoch": 0.796, - "grad_norm": 0.6693925462830721, - "k1_kl": 0.02874755859375, - "k3_kl": 0.017364501953125, - "kimi_kl": 0.0433349609375, - "learning_rate": 1.0199999999999999e-07, - "loss": 0.0008, - "ppl": 0.01702880859375, - "reward": 0.9994970262050629, - "reward_std": 0.0004236603854224086, - "rewards/perpo_ocr_edit_distance_reward": 0.9994970858097076, + "advantages": -1.8051692904919037e-06, + "completion_length": 777.0, + "delta_ref_entropy_loss": 0.0198974609375, + "delta_ref_ppl": -0.034912109375, + "entropy_loss": -0.026123046875, + "epoch": 0.398, + "grad_norm": 0.29362242889280976, + "k1_kl": 0.034912109375, + "k3_kl": 0.023681640625, + "kimi_kl": 0.0703125, + "learning_rate": 3.0099999999999996e-07, + "loss": 0.0009, + "ppl": 0.0078125, + "reward": 0.020364860072731972, + "reward_std": 0.0003414281236473471, + "rewards/perpo_ocr_edit_distance_reward": 0.02036486379802227, "step": 1990, "temperature": 0.9 }, { - "advantages": -8.259501100837952e-07, - "completion_length": 594.0, - "delta_ref_entropy_loss": 0.08154296875, - "delta_ref_ppl": -0.05218505859375, - "entropy_loss": -0.11602783203125, - "epoch": 0.7964, - "grad_norm": 1.7642212147410812, - "k1_kl": 0.05224609375, - "k3_kl": 0.028839111328125, - "kimi_kl": 0.06170654296875, - "learning_rate": 1.018e-07, - "loss": 0.0012, - "ppl": 0.06146240234375, - "reward": 0.6865994185209274, - "reward_std": 0.013100793556077406, - "rewards/perpo_ocr_edit_distance_reward": 0.6865994483232498, + "advantages": -5.020414391765371e-05, + "completion_length": 241.0, + "delta_ref_entropy_loss": 0.0751953125, + "delta_ref_ppl": -0.1728515625, + "entropy_loss": -0.0556640625, + "epoch": 0.3982, + "grad_norm": 0.4257898495262226, + "k1_kl": 0.1728515625, + "k3_kl": 0.1376953125, + "kimi_kl": 0.65625, + "learning_rate": 3.009e-07, + "loss": 0.0055, + "ppl": 0.01904296875, + "reward": 0.9939500093460083, + "reward_std": 0.0009182007052004337, + "rewards/perpo_ocr_edit_distance_reward": 0.9939500689506531, "step": 1991, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 312.0, - "delta_ref_entropy_loss": 0.0562744140625, - "delta_ref_ppl": -0.05328369140625, - "entropy_loss": -0.0343017578125, - "epoch": 0.7968, - "grad_norm": 0.6266757031507004, - "k1_kl": 0.05328369140625, - "k3_kl": 0.03021240234375, - "kimi_kl": 0.08380126953125, - "learning_rate": 1.016e-07, - "loss": 0.0012, - "ppl": 0.01715087890625, - "reward": 0.997308760881424, - "reward_std": 0.0005598243587883189, - "rewards/perpo_ocr_edit_distance_reward": 0.997308760881424, + "advantages": -1.8664770323084667e-05, + "completion_length": 667.0, + "delta_ref_entropy_loss": 0.038818359375, + "delta_ref_ppl": -0.058349609375, + "entropy_loss": -0.028564453125, + "epoch": 0.3984, + "grad_norm": 0.6957629442731013, + "k1_kl": 0.05859375, + "k3_kl": 0.038330078125, + "kimi_kl": 0.1240234375, + "learning_rate": 3.008e-07, + "loss": 0.0016, + "ppl": 0.01025390625, + "reward": 0.9957916736602783, + "reward_std": 0.0012691117590293288, + "rewards/perpo_ocr_edit_distance_reward": 0.9957916736602783, "step": 1992, "temperature": 0.9 }, { - "advantages": -4.114317198400386e-05, - "completion_length": 540.0, - "delta_ref_entropy_loss": 0.03167724609375, - "delta_ref_ppl": -0.038330078125, - "entropy_loss": -0.028076171875, - "epoch": 0.7972, - "grad_norm": 0.9763419147088279, - "k1_kl": 0.038330078125, - "k3_kl": 0.026214599609375, - "kimi_kl": 0.09222412109375, - "learning_rate": 1.014e-07, - "loss": 0.0011, - "ppl": 0.012603759765625, - "reward": 0.9977065324783325, - "reward_std": 0.000260436674579978, - "rewards/perpo_ocr_edit_distance_reward": 0.9977065622806549, + "advantages": -6.244651012821123e-05, + "completion_length": 985.0, + "delta_ref_entropy_loss": 0.0546875, + "delta_ref_ppl": -0.061767578125, + "entropy_loss": -0.03759765625, + "epoch": 0.3986, + "grad_norm": 1.5165601153277268, + "k1_kl": 0.0615234375, + "k3_kl": 0.037841796875, + "kimi_kl": 0.1171875, + "learning_rate": 3.007e-07, + "loss": 0.0016, + "ppl": 0.0169677734375, + "reward": 0.9983106851577759, + "reward_std": 0.000854342826642096, + "rewards/perpo_ocr_edit_distance_reward": 0.9983108043670654, "step": 1993, "temperature": 0.9 }, { - "advantages": -4.19787011196604e-06, - "completion_length": 237.5, - "delta_ref_entropy_loss": 0.09759521484375, - "delta_ref_ppl": -0.101806640625, - "entropy_loss": -0.084716796875, - "epoch": 0.7976, - "grad_norm": 0.9490913061657201, - "k1_kl": 0.1019287109375, - "k3_kl": 0.0675048828125, - "kimi_kl": 0.22998046875, - "learning_rate": 1.0119999999999999e-07, - "loss": 0.0027, - "ppl": 0.0386962890625, - "reward": 0.8531578779220581, - "reward_std": 0.001969292527064681, - "rewards/perpo_ocr_edit_distance_reward": 0.8531579077243805, + "advantages": -0.0001081057998817414, + "completion_length": 627.0, + "delta_ref_entropy_loss": 0.0615234375, + "delta_ref_ppl": -0.08544921875, + "entropy_loss": -0.0576171875, + "epoch": 0.3988, + "grad_norm": 0.6177283642276772, + "k1_kl": 0.08544921875, + "k3_kl": 0.051025390625, + "kimi_kl": 0.1474609375, + "learning_rate": 3.006e-07, + "loss": 0.0021, + "ppl": 0.022705078125, + "reward": 0.9973573088645935, + "reward_std": 0.0005299681215547025, + "rewards/perpo_ocr_edit_distance_reward": 0.9973574280738831, "step": 1994, "temperature": 0.9 }, { - "advantages": -3.583942452678457e-05, - "completion_length": 460.5, - "delta_ref_entropy_loss": 0.06427001953125, - "delta_ref_ppl": -0.067138671875, - "entropy_loss": -0.05389404296875, - "epoch": 0.798, - "grad_norm": 1.0071245835755593, - "k1_kl": 0.0672607421875, - "k3_kl": 0.04107666015625, - "kimi_kl": 0.12939453125, - "learning_rate": 1.01e-07, - "loss": 0.0017, - "ppl": 0.030548095703125, - "reward": 0.9074390232563019, - "reward_std": 0.00048444457934238017, - "rewards/perpo_ocr_edit_distance_reward": 0.9074390828609467, + "advantages": 1.788139485370266e-07, + "completion_length": 501.0, + "delta_ref_entropy_loss": 0.07373046875, + "delta_ref_ppl": -0.1318359375, + "entropy_loss": -0.248046875, + "epoch": 0.399, + "grad_norm": 2.4562830218614407, + "k1_kl": 0.1318359375, + "k3_kl": 0.099609375, + "kimi_kl": 0.259765625, + "learning_rate": 3.0049999999999997e-07, + "loss": 0.004, + "ppl": 0.12353515625, + "reward": 0.812397837638855, + "reward_std": 0.0923379585146904, + "rewards/perpo_ocr_edit_distance_reward": 0.812397837638855, "step": 1995, "temperature": 0.9 }, { - "advantages": -0.0003078494755754946, - "completion_length": 277.0, - "delta_ref_entropy_loss": 0.02728271484375, - "delta_ref_ppl": -0.0457763671875, - "entropy_loss": -0.02593994140625, - "epoch": 0.7984, - "grad_norm": 0.42384409735705325, - "k1_kl": 0.0457763671875, - "k3_kl": 0.0355224609375, - "kimi_kl": 0.16552734375, - "learning_rate": 1.008e-07, - "loss": 0.0017, - "ppl": 0.0123291015625, - "reward": 0.9968658983707428, - "reward_std": 0.00038363883504644036, - "rewards/perpo_ocr_edit_distance_reward": 0.9968659281730652, + "advantages": -8.242471267294604e-06, + "completion_length": 695.0, + "delta_ref_entropy_loss": 0.0201416015625, + "delta_ref_ppl": -0.04638671875, + "entropy_loss": -0.0228271484375, + "epoch": 0.3992, + "grad_norm": 0.4909288053230153, + "k1_kl": 0.04638671875, + "k3_kl": 0.0361328125, + "kimi_kl": 0.12451171875, + "learning_rate": 3.0039999999999996e-07, + "loss": 0.0015, + "ppl": 0.00970458984375, + "reward": 0.9843855500221252, + "reward_std": 0.007122008129954338, + "rewards/perpo_ocr_edit_distance_reward": 0.98438560962677, "step": 1996, "temperature": 0.9 }, { - "advantages": -0.00010748421118478291, - "completion_length": 847.5, - "delta_ref_entropy_loss": 0.02899169921875, - "delta_ref_ppl": -0.033843994140625, - "entropy_loss": -0.0357666015625, - "epoch": 0.7988, - "grad_norm": 0.4594799119946911, - "k1_kl": 0.033935546875, - "k3_kl": 0.021209716796875, - "kimi_kl": 0.05859375, - "learning_rate": 1.0059999999999999e-07, - "loss": 0.001, - "ppl": 0.0184326171875, - "reward": 0.9188083708286285, - "reward_std": 0.000414872556575574, - "rewards/perpo_ocr_edit_distance_reward": 0.9188084304332733, + "advantages": 2.1048956114100292e-05, + "completion_length": 586.0, + "delta_ref_entropy_loss": 0.046142578125, + "delta_ref_ppl": -0.060546875, + "entropy_loss": -0.03857421875, + "epoch": 0.3994, + "grad_norm": 0.34501445496902355, + "k1_kl": 0.060791015625, + "k3_kl": 0.040283203125, + "kimi_kl": 0.11572265625, + "learning_rate": 3.003e-07, + "loss": 0.0016, + "ppl": 0.01513671875, + "reward": 0.9931455254554749, + "reward_std": 0.0007088527781888843, + "rewards/perpo_ocr_edit_distance_reward": 0.9931455850601196, "step": 1997, "temperature": 0.9 }, { - "advantages": -4.938671054333099e-07, - "completion_length": 537.5, - "delta_ref_entropy_loss": 0.02874755859375, - "delta_ref_ppl": -0.0367431640625, - "entropy_loss": -0.03326416015625, - "epoch": 0.7992, - "grad_norm": 0.6409000500093002, - "k1_kl": 0.036865234375, - "k3_kl": 0.0230712890625, - "kimi_kl": 0.06298828125, - "learning_rate": 1.004e-07, - "loss": 0.0009, - "ppl": 0.0165252685546875, - "reward": 0.9743164777755737, - "reward_std": 0.03400583565235138, - "rewards/perpo_ocr_edit_distance_reward": 0.9743165373802185, + "advantages": -4.89779886265751e-05, + "completion_length": 467.0, + "delta_ref_entropy_loss": 0.0947265625, + "delta_ref_ppl": -0.1220703125, + "entropy_loss": -0.0693359375, + "epoch": 0.3996, + "grad_norm": 1.0000131802226735, + "k1_kl": 0.12255859375, + "k3_kl": 0.076171875, + "kimi_kl": 0.236328125, + "learning_rate": 3.002e-07, + "loss": 0.0031, + "ppl": 0.03466796875, + "reward": 0.9739784598350525, + "reward_std": 0.0016379912849515676, + "rewards/perpo_ocr_edit_distance_reward": 0.9739785194396973, "step": 1998, "temperature": 0.9 }, { - "advantages": 3.62736864190083e-06, - "completion_length": 640.0, - "delta_ref_entropy_loss": 0.0399169921875, - "delta_ref_ppl": -0.03399658203125, - "entropy_loss": -0.0401611328125, - "epoch": 0.7996, - "grad_norm": 0.47284121061527185, - "k1_kl": 0.03411865234375, - "k3_kl": 0.02032470703125, - "kimi_kl": 0.0584716796875, - "learning_rate": 1.002e-07, - "loss": 0.0008, - "ppl": 0.0201873779296875, - "reward": 0.9920363128185272, - "reward_std": 0.001709614647552371, - "rewards/perpo_ocr_edit_distance_reward": 0.9920362830162048, + "advantages": -1.021793991640152e-07, + "completion_length": 89.0, + "delta_ref_entropy_loss": -0.07275390625, + "delta_ref_ppl": -0.455078125, + "entropy_loss": -0.703125, + "epoch": 0.3998, + "grad_norm": 6.63128092080664, + "k1_kl": 0.455078125, + "k3_kl": 0.39453125, + "kimi_kl": 1.921875, + "learning_rate": 3.0009999999999994e-07, + "loss": 0.0158, + "ppl": 0.31640625, + "reward": 0.3150799572467804, + "reward_std": 0.03667693957686424, + "rewards/perpo_ocr_edit_distance_reward": 0.3150799870491028, "step": 1999, "temperature": 0.9 }, { - "advantages": -3.478356899844215e-06, - "completion_length": 376.0, - "delta_ref_entropy_loss": 0.050537109375, - "delta_ref_ppl": -0.0435791015625, - "entropy_loss": -0.0484619140625, - "epoch": 0.8, - "grad_norm": 1.9110562833889524, - "k1_kl": 0.0435791015625, - "k3_kl": 0.0313720703125, - "kimi_kl": 0.08056640625, - "learning_rate": 1e-07, - "loss": 0.0013, - "ppl": 0.02691650390625, - "reward": 0.9424599409103394, - "reward_std": 0.0032140802213689312, - "rewards/perpo_ocr_edit_distance_reward": 0.9424599707126617, + "advantages": -2.0223005776642822e-05, + "completion_length": 797.0, + "delta_ref_entropy_loss": 0.091796875, + "delta_ref_ppl": -0.0869140625, + "entropy_loss": -0.08984375, + "epoch": 0.4, + "grad_norm": 0.9776757557247728, + "k1_kl": 0.0869140625, + "k3_kl": 0.05126953125, + "kimi_kl": 0.1474609375, + "learning_rate": 3e-07, + "loss": 0.0021, + "ppl": 0.044189453125, + "reward": 0.9738742113113403, + "reward_std": 0.0032706984784454107, + "rewards/perpo_ocr_edit_distance_reward": 0.9738742709159851, "step": 2000, "temperature": 0.9 }, { - "advantages": -0.00010584507981548086, - "completion_length": 479.5, - "delta_ref_entropy_loss": 0.02423095703125, - "delta_ref_ppl": -0.02935791015625, - "entropy_loss": -0.0137939453125, - "epoch": 0.8004, - "grad_norm": 0.2368705467729364, - "k1_kl": 0.02935791015625, - "k3_kl": 0.02130126953125, - "kimi_kl": 0.08935546875, - "learning_rate": 9.98e-08, - "loss": 0.001, - "ppl": 0.00555419921875, - "reward": 0.9998185038566589, - "reward_std": 0.0004047029942739755, - "rewards/perpo_ocr_edit_distance_reward": 0.9998186230659485, + "advantages": -1.9354480173205957e-05, + "completion_length": 379.0, + "delta_ref_entropy_loss": 0.042724609375, + "delta_ref_ppl": -0.08642578125, + "entropy_loss": -0.083984375, + "epoch": 0.4002, + "grad_norm": 1.0334236709494369, + "k1_kl": 0.08642578125, + "k3_kl": 0.060302734375, + "kimi_kl": 0.216796875, + "learning_rate": 2.999e-07, + "loss": 0.0024, + "ppl": 0.043212890625, + "reward": 0.7680412530899048, + "reward_std": 0.0034191918093711138, + "rewards/perpo_ocr_edit_distance_reward": 0.7680413126945496, "step": 2001, "temperature": 0.9 }, { - "advantages": -6.704671432089526e-05, - "completion_length": 430.5, - "delta_ref_entropy_loss": 0.0513916015625, - "delta_ref_ppl": -0.0303955078125, - "entropy_loss": -0.04376220703125, - "epoch": 0.8008, - "grad_norm": 1.1680453552410857, - "k1_kl": 0.0303955078125, - "k3_kl": 0.017059326171875, - "kimi_kl": 0.03857421875, - "learning_rate": 9.959999999999999e-08, - "loss": 0.0007, - "ppl": 0.0201416015625, - "reward": 0.9847603440284729, - "reward_std": 0.0007811693358235061, - "rewards/perpo_ocr_edit_distance_reward": 0.9847604334354401, + "advantages": -8.9066370492219e-06, + "completion_length": 317.0, + "delta_ref_entropy_loss": 0.032470703125, + "delta_ref_ppl": -0.109375, + "entropy_loss": -0.039794921875, + "epoch": 0.4004, + "grad_norm": 1.077636677996274, + "k1_kl": 0.109375, + "k3_kl": 0.08203125, + "kimi_kl": 0.33984375, + "learning_rate": 2.9979999999999997e-07, + "loss": 0.0033, + "ppl": 0.0179443359375, + "reward": 0.9902350306510925, + "reward_std": 0.0018136667786166072, + "rewards/perpo_ocr_edit_distance_reward": 0.9902350902557373, "step": 2002, "temperature": 0.9 }, { - "advantages": -8.422136306762695e-05, - "completion_length": 439.5, - "delta_ref_entropy_loss": 0.065185546875, - "delta_ref_ppl": -0.064697265625, - "entropy_loss": -0.04443359375, - "epoch": 0.8012, - "grad_norm": 0.8862825608286411, - "k1_kl": 0.064697265625, - "k3_kl": 0.04296875, - "kimi_kl": 0.13818359375, - "learning_rate": 9.94e-08, - "loss": 0.0018, - "ppl": 0.02435302734375, - "reward": 0.9941634237766266, - "reward_std": 0.0012973123812116683, - "rewards/perpo_ocr_edit_distance_reward": 0.9941635131835938, + "advantages": -8.361680374946445e-05, + "completion_length": 447.0, + "delta_ref_entropy_loss": 0.10986328125, + "delta_ref_ppl": -0.1103515625, + "entropy_loss": -0.04833984375, + "epoch": 0.4006, + "grad_norm": 0.6390613304057136, + "k1_kl": 0.1103515625, + "k3_kl": 0.06884765625, + "kimi_kl": 0.205078125, + "learning_rate": 2.997e-07, + "loss": 0.0028, + "ppl": 0.0206298828125, + "reward": 0.9869739413261414, + "reward_std": 0.00040907642687670887, + "rewards/perpo_ocr_edit_distance_reward": 0.9869740009307861, "step": 2003, "temperature": 0.9 }, { - "advantages": -0.0001036780258800718, - "completion_length": 278.0, - "delta_ref_entropy_loss": 0.0413818359375, - "delta_ref_ppl": -0.068359375, - "entropy_loss": -0.0406494140625, - "epoch": 0.8016, - "grad_norm": 0.8342955384669505, - "k1_kl": 0.068359375, - "k3_kl": 0.05242919921875, - "kimi_kl": 0.27978515625, - "learning_rate": 9.919999999999999e-08, - "loss": 0.0022, - "ppl": 0.018157958984375, - "reward": 0.9988439679145813, - "reward_std": 0.0011533724318724126, - "rewards/perpo_ocr_edit_distance_reward": 0.9988440573215485, + "advantages": -3.8129943277454004e-05, + "completion_length": 1217.0, + "delta_ref_entropy_loss": 0.02978515625, + "delta_ref_ppl": -0.048095703125, + "entropy_loss": -0.05615234375, + "epoch": 0.4008, + "grad_norm": 1.5228295539776635, + "k1_kl": 0.048095703125, + "k3_kl": 0.03857421875, + "kimi_kl": 0.0791015625, + "learning_rate": 2.9959999999999996e-07, + "loss": 0.0016, + "ppl": 0.02490234375, + "reward": 0.8544089198112488, + "reward_std": 0.0003467313072178513, + "rewards/perpo_ocr_edit_distance_reward": 0.8544089794158936, "step": 2004, "temperature": 0.9 }, { - "advantages": -0.0001327821230461268, - "completion_length": 751.5, - "delta_ref_entropy_loss": 0.029052734375, - "delta_ref_ppl": -0.037353515625, - "entropy_loss": -0.04205322265625, - "epoch": 0.802, - "grad_norm": 0.7516102749710581, - "k1_kl": 0.03729248046875, - "k3_kl": 0.02606201171875, - "kimi_kl": 0.125, - "learning_rate": 9.9e-08, - "loss": 0.0012, - "ppl": 0.020263671875, - "reward": 0.9176482856273651, - "reward_std": 0.006537909866892733, - "rewards/perpo_ocr_edit_distance_reward": 0.9176483750343323, + "advantages": -3.2731466490076855e-05, + "completion_length": 206.0, + "delta_ref_entropy_loss": 0.07763671875, + "delta_ref_ppl": -0.1806640625, + "entropy_loss": -0.1787109375, + "epoch": 0.401, + "grad_norm": 2.1406115341386656, + "k1_kl": 0.1796875, + "k3_kl": 0.1396484375, + "kimi_kl": 0.515625, + "learning_rate": 2.9949999999999995e-07, + "loss": 0.0056, + "ppl": 0.078125, + "reward": 0.9813530445098877, + "reward_std": 0.0012003120500594378, + "rewards/perpo_ocr_edit_distance_reward": 0.9813531041145325, "step": 2005, "temperature": 0.9 }, { - "advantages": -0.00032006417131924536, - "completion_length": 360.5, - "delta_ref_entropy_loss": 0.03033447265625, - "delta_ref_ppl": -0.0411376953125, - "entropy_loss": -0.03045654296875, - "epoch": 0.8024, - "grad_norm": 0.8497011471235094, - "k1_kl": 0.041259765625, - "k3_kl": 0.02734375, - "kimi_kl": 0.076904296875, - "learning_rate": 9.88e-08, - "loss": 0.0014, - "ppl": 0.011810302734375, - "reward": 0.998778373003006, - "reward_std": 0.0006265390547923744, - "rewards/perpo_ocr_edit_distance_reward": 0.9987784922122955, + "advantages": -8.855548003339209e-06, + "completion_length": 403.0, + "delta_ref_entropy_loss": 0.1748046875, + "delta_ref_ppl": -0.169921875, + "entropy_loss": -0.271484375, + "epoch": 0.4012, + "grad_norm": 1.7619757257313808, + "k1_kl": 0.169921875, + "k3_kl": 0.10791015625, + "kimi_kl": 0.345703125, + "learning_rate": 2.994e-07, + "loss": 0.0043, + "ppl": 0.142578125, + "reward": 0.7770119309425354, + "reward_std": 0.0037458264268934727, + "rewards/perpo_ocr_edit_distance_reward": 0.7770119309425354, "step": 2006, "temperature": 0.9 }, { - "advantages": -4.274504635759513e-05, - "completion_length": 615.5, - "delta_ref_entropy_loss": 0.05157470703125, - "delta_ref_ppl": -0.03912353515625, - "entropy_loss": -0.06591796875, - "epoch": 0.8028, - "grad_norm": 0.9999174506015119, - "k1_kl": 0.03924560546875, - "k3_kl": 0.0244140625, - "kimi_kl": 0.0638427734375, - "learning_rate": 9.859999999999998e-08, - "loss": 0.001, - "ppl": 0.036865234375, - "reward": 0.9522558450698853, - "reward_std": 0.00742726594035048, - "rewards/perpo_ocr_edit_distance_reward": 0.95225590467453, + "advantages": 3.4059798537100505e-08, + "completion_length": 30.0, + "delta_ref_entropy_loss": 0.0751953125, + "delta_ref_ppl": -1.0234375, + "entropy_loss": -0.103515625, + "epoch": 0.4014, + "grad_norm": 7.4851593597432, + "k1_kl": 1.0234375, + "k3_kl": 0.88671875, + "kimi_kl": 6.0625, + "learning_rate": 2.993e-07, + "loss": 0.0355, + "ppl": 0.042236328125, + "reward": 0.670495331287384, + "reward_std": 0.002686054678633809, + "rewards/perpo_ocr_edit_distance_reward": 0.670495331287384, "step": 2007, "temperature": 0.9 }, { - "advantages": -2.9981138141010888e-05, - "completion_length": 759.0, - "delta_ref_entropy_loss": 0.02252197265625, - "delta_ref_ppl": -0.018951416015625, - "entropy_loss": -0.0216064453125, - "epoch": 0.8032, - "grad_norm": 0.5032280817901351, - "k1_kl": 0.0189208984375, - "k3_kl": 0.011322021484375, - "kimi_kl": 0.02557373046875, - "learning_rate": 9.84e-08, - "loss": 0.0005, - "ppl": 0.01171875, - "reward": 0.9963952302932739, - "reward_std": 0.0005889948806725442, - "rewards/perpo_ocr_edit_distance_reward": 0.9963952898979187, + "advantages": -2.1585397917078808e-05, + "completion_length": 594.0, + "delta_ref_entropy_loss": 0.04638671875, + "delta_ref_ppl": -0.047607421875, + "entropy_loss": -0.0247802734375, + "epoch": 0.4016, + "grad_norm": 0.21409381666279118, + "k1_kl": 0.047607421875, + "k3_kl": 0.0234375, + "kimi_kl": 0.05419921875, + "learning_rate": 2.9920000000000003e-07, + "loss": 0.001, + "ppl": 0.00616455078125, + "reward": 0.990433394908905, + "reward_std": 0.0002943362051155418, + "rewards/perpo_ocr_edit_distance_reward": 0.9904334545135498, "step": 2008, "temperature": 0.9 }, { - "advantages": -5.10896995820076e-08, - "completion_length": 198.0, - "delta_ref_entropy_loss": 0.0604248046875, - "delta_ref_ppl": -0.222900390625, - "entropy_loss": -0.193359375, - "epoch": 0.8036, - "grad_norm": 5.732187398938887, - "k1_kl": 0.222900390625, - "k3_kl": 0.255126953125, - "kimi_kl": 0.60107421875, - "learning_rate": 9.819999999999999e-08, - "loss": 0.0102, - "ppl": 0.1322021484375, - "reward": 0.641560822725296, - "reward_std": 0.17528555763419718, - "rewards/perpo_ocr_edit_distance_reward": 0.6415608525276184, + "advantages": -4.988057480659336e-05, + "completion_length": 408.0, + "delta_ref_entropy_loss": 0.09912109375, + "delta_ref_ppl": -0.1201171875, + "entropy_loss": -0.05224609375, + "epoch": 0.4018, + "grad_norm": 0.6587733456671656, + "k1_kl": 0.1201171875, + "k3_kl": 0.08203125, + "kimi_kl": 0.333984375, + "learning_rate": 2.9909999999999997e-07, + "loss": 0.0033, + "ppl": 0.0228271484375, + "reward": 0.9870433807373047, + "reward_std": 0.0012653331505134702, + "rewards/perpo_ocr_edit_distance_reward": 0.9870434999465942, "step": 2009, "temperature": 0.9 }, { - "advantages": -6.173338476855861e-05, - "completion_length": 654.0, - "delta_ref_entropy_loss": 0.10302734375, - "delta_ref_ppl": -0.0706787109375, - "entropy_loss": -0.0750732421875, - "epoch": 0.804, - "grad_norm": 2.1744553480871036, - "k1_kl": 0.0706787109375, - "k3_kl": 0.0400390625, - "kimi_kl": 0.121826171875, - "learning_rate": 9.8e-08, - "loss": 0.0017, - "ppl": 0.0389404296875, - "reward": 0.8838213980197906, - "reward_std": 0.005763098466559313, - "rewards/perpo_ocr_edit_distance_reward": 0.883821427822113, + "advantages": -5.624124241876416e-05, + "completion_length": 309.0, + "delta_ref_entropy_loss": 0.0576171875, + "delta_ref_ppl": -0.10400390625, + "entropy_loss": -0.056884765625, + "epoch": 0.402, + "grad_norm": 0.9051923453766099, + "k1_kl": 0.10400390625, + "k3_kl": 0.064453125, + "kimi_kl": 0.17578125, + "learning_rate": 2.9899999999999996e-07, + "loss": 0.0026, + "ppl": 0.017822265625, + "reward": 0.9842061400413513, + "reward_std": 0.00095984066138044, + "rewards/perpo_ocr_edit_distance_reward": 0.9842061996459961, "step": 2010, "temperature": 0.9 }, { - "advantages": -2.614089589769719e-05, - "completion_length": 831.0, - "delta_ref_entropy_loss": 0.0498046875, - "delta_ref_ppl": -0.031982421875, - "entropy_loss": -0.05517578125, - "epoch": 0.8044, - "grad_norm": 0.7261779393922608, - "k1_kl": 0.031982421875, - "k3_kl": 0.017730712890625, - "kimi_kl": 0.056732177734375, - "learning_rate": 9.779999999999999e-08, - "loss": 0.0007, - "ppl": 0.0274658203125, - "reward": 0.9945079982280731, - "reward_std": 0.0013983473763801157, - "rewards/perpo_ocr_edit_distance_reward": 0.9945080578327179, + "advantages": -3.921985626220703e-05, + "completion_length": 462.0, + "delta_ref_entropy_loss": 0.064453125, + "delta_ref_ppl": -0.0927734375, + "entropy_loss": -0.040771484375, + "epoch": 0.4022, + "grad_norm": 0.5967826158772495, + "k1_kl": 0.0927734375, + "k3_kl": 0.06689453125, + "kimi_kl": 0.2431640625, + "learning_rate": 2.989e-07, + "loss": 0.0027, + "ppl": 0.013671875, + "reward": 0.9919948577880859, + "reward_std": 0.0009855038952082396, + "rewards/perpo_ocr_edit_distance_reward": 0.9919949173927307, "step": 2011, "temperature": 0.9 }, { - "advantages": -0.00010442734128446318, - "completion_length": 564.5, - "delta_ref_entropy_loss": 0.0467529296875, - "delta_ref_ppl": -0.044189453125, - "entropy_loss": -0.028564453125, - "epoch": 0.8048, - "grad_norm": 0.4136773148887002, - "k1_kl": 0.04412841796875, - "k3_kl": 0.02789306640625, - "kimi_kl": 0.1427001953125, - "learning_rate": 9.76e-08, - "loss": 0.0012, - "ppl": 0.0128631591796875, - "reward": 0.8141804039478302, - "reward_std": 0.0007320124859688804, - "rewards/perpo_ocr_edit_distance_reward": 0.814180463552475, + "advantages": -8.821487426757812e-06, + "completion_length": 483.0, + "delta_ref_entropy_loss": 0.0849609375, + "delta_ref_ppl": -0.1025390625, + "entropy_loss": -0.07763671875, + "epoch": 0.4024, + "grad_norm": 1.4673718176830948, + "k1_kl": 0.1025390625, + "k3_kl": 0.0703125, + "kimi_kl": 0.267578125, + "learning_rate": 2.988e-07, + "loss": 0.0028, + "ppl": 0.038818359375, + "reward": 0.9870967864990234, + "reward_std": 0.005699628964066505, + "rewards/perpo_ocr_edit_distance_reward": 0.987096905708313, "step": 2012, "temperature": 0.9 }, { - "advantages": -8.07642982181278e-05, - "completion_length": 415.5, - "delta_ref_entropy_loss": 0.0413818359375, - "delta_ref_ppl": -0.02593994140625, - "entropy_loss": -0.0362548828125, - "epoch": 0.8052, - "grad_norm": 0.6249278317926431, - "k1_kl": 0.02593994140625, - "k3_kl": 0.016082763671875, - "kimi_kl": 0.028106689453125, - "learning_rate": 9.74e-08, - "loss": 0.0007, - "ppl": 0.01922607421875, - "reward": 0.9863996505737305, - "reward_std": 0.0002905469882534817, - "rewards/perpo_ocr_edit_distance_reward": 0.9863996803760529, + "advantages": -0.0002220017631771043, + "completion_length": 446.0, + "delta_ref_entropy_loss": 0.060302734375, + "delta_ref_ppl": -0.0859375, + "entropy_loss": -0.06884765625, + "epoch": 0.4026, + "grad_norm": 0.7400610653609911, + "k1_kl": 0.0859375, + "k3_kl": 0.054443359375, + "kimi_kl": 0.1552734375, + "learning_rate": 2.987e-07, + "loss": 0.0024, + "ppl": 0.030029296875, + "reward": 0.8460637331008911, + "reward_std": 0.0003986002702731639, + "rewards/perpo_ocr_edit_distance_reward": 0.8460637927055359, "step": 2013, "temperature": 0.9 }, { - "advantages": -1.9052200173064193e-07, - "completion_length": 261.0, - "delta_ref_entropy_loss": 0.07098388671875, - "delta_ref_ppl": -0.1641845703125, - "entropy_loss": -0.2696533203125, - "epoch": 0.8056, - "grad_norm": 2.5313531073539752, - "k1_kl": 0.16424560546875, - "k3_kl": 0.11761474609375, - "kimi_kl": 0.358642578125, - "learning_rate": 9.72e-08, - "loss": 0.0047, - "ppl": 0.151763916015625, - "reward": 0.6903844773769379, - "reward_std": 0.01665976084768772, - "rewards/perpo_ocr_edit_distance_reward": 0.6903844773769379, + "advantages": -3.5762786865234375e-05, + "completion_length": 428.0, + "delta_ref_entropy_loss": 0.0908203125, + "delta_ref_ppl": -0.07275390625, + "entropy_loss": -0.06689453125, + "epoch": 0.4028, + "grad_norm": 0.841354881090035, + "k1_kl": 0.07275390625, + "k3_kl": 0.039794921875, + "kimi_kl": 0.095703125, + "learning_rate": 2.986e-07, + "loss": 0.0016, + "ppl": 0.029296875, + "reward": 0.934725821018219, + "reward_std": 0.0008527125464752316, + "rewards/perpo_ocr_edit_distance_reward": 0.9347258806228638, "step": 2014, "temperature": 0.9 }, { - "advantages": -1.1333397651469568e-05, - "completion_length": 1164.0, - "delta_ref_entropy_loss": 0.025787353515625, - "delta_ref_ppl": -0.015777587890625, - "entropy_loss": -0.03326416015625, - "epoch": 0.806, - "grad_norm": 0.8191746975646397, - "k1_kl": 0.01580810546875, - "k3_kl": 0.00970458984375, - "kimi_kl": 0.019775390625, - "learning_rate": 9.7e-08, - "loss": 0.0004, - "ppl": 0.0189208984375, - "reward": 0.9799254238605499, - "reward_std": 0.002528747951146215, - "rewards/perpo_ocr_edit_distance_reward": 0.9799255132675171, + "advantages": -3.620556526584551e-05, + "completion_length": 358.0, + "delta_ref_entropy_loss": 0.04150390625, + "delta_ref_ppl": -0.050537109375, + "entropy_loss": -0.04736328125, + "epoch": 0.403, + "grad_norm": 0.5237128358057125, + "k1_kl": 0.050537109375, + "k3_kl": 0.030029296875, + "kimi_kl": 0.056640625, + "learning_rate": 2.985e-07, + "loss": 0.0012, + "ppl": 0.016357421875, + "reward": 0.9825341105461121, + "reward_std": 0.0006055683479644358, + "rewards/perpo_ocr_edit_distance_reward": 0.9825341701507568, "step": 2015, "temperature": 0.9 }, { - "advantages": 1.6440239733128692e-05, - "completion_length": 301.5, - "delta_ref_entropy_loss": 0.0540771484375, - "delta_ref_ppl": -0.0667724609375, - "entropy_loss": -0.06427001953125, - "epoch": 0.8064, - "grad_norm": 1.394414552977161, - "k1_kl": 0.0670166015625, - "k3_kl": 0.04766845703125, - "kimi_kl": 0.149658203125, - "learning_rate": 9.679999999999999e-08, - "loss": 0.0019, - "ppl": 0.037017822265625, - "reward": 0.8974906802177429, - "reward_std": 0.004521691793343052, - "rewards/perpo_ocr_edit_distance_reward": 0.8974907100200653, + "advantages": 3.065381974920456e-07, + "completion_length": 267.0, + "delta_ref_entropy_loss": 0.07861328125, + "delta_ref_ppl": -0.2119140625, + "entropy_loss": -0.0771484375, + "epoch": 0.4032, + "grad_norm": 2.2665832095234193, + "k1_kl": 0.2119140625, + "k3_kl": 0.162109375, + "kimi_kl": 0.6953125, + "learning_rate": 2.9839999999999997e-07, + "loss": 0.0065, + "ppl": 0.0322265625, + "reward": 0.83315509557724, + "reward_std": 0.05853274464607239, + "rewards/perpo_ocr_edit_distance_reward": 0.83315509557724, "step": 2016, "temperature": 0.9 }, { - "advantages": -0.0003832791553577408, - "completion_length": 569.5, - "delta_ref_entropy_loss": 0.025390625, - "delta_ref_ppl": -0.035186767578125, - "entropy_loss": -0.01751708984375, - "epoch": 0.8068, - "grad_norm": 0.13070406704719864, - "k1_kl": 0.03515625, - "k3_kl": 0.023590087890625, - "kimi_kl": 0.06610107421875, - "learning_rate": 9.66e-08, - "loss": 0.0013, - "ppl": 0.0076141357421875, - "reward": 0.9978639483451843, - "reward_std": 4.987795909983106e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9978640377521515, - "step": 2017, - "temperature": 0.9 - }, + "advantages": -3.2356808787881164e-07, + "completion_length": 1139.0, + "delta_ref_entropy_loss": 0.05859375, + "delta_ref_ppl": -0.0966796875, + "entropy_loss": -0.1591796875, + "epoch": 0.4034, + "grad_norm": 3.108858271877651, + "k1_kl": 0.0966796875, + "k3_kl": 0.076171875, + "kimi_kl": 0.2158203125, + "learning_rate": 2.983e-07, + "loss": 0.003, + "ppl": 0.07861328125, + "reward": 0.26965591311454773, + "reward_std": 0.04885866120457649, + "rewards/perpo_ocr_edit_distance_reward": 0.2696559429168701, + "step": 2017, + "temperature": 0.9 + }, { - "advantages": -5.501934720086865e-05, - "completion_length": 569.5, - "delta_ref_entropy_loss": 0.02197265625, - "delta_ref_ppl": -0.0501708984375, - "entropy_loss": -0.02679443359375, - "epoch": 0.8072, - "grad_norm": 0.35540754912818434, - "k1_kl": 0.0501708984375, - "k3_kl": 0.03692626953125, - "kimi_kl": 0.16650390625, - "learning_rate": 9.639999999999999e-08, - "loss": 0.0015, - "ppl": 0.012786865234375, - "reward": 0.9998452365398407, - "reward_std": 0.0001821498735807836, - "rewards/perpo_ocr_edit_distance_reward": 0.9998452663421631, + "advantages": 1.8732889373040962e-07, + "completion_length": 538.0, + "delta_ref_entropy_loss": 0.1298828125, + "delta_ref_ppl": -0.123046875, + "entropy_loss": -0.380859375, + "epoch": 0.4036, + "grad_norm": 2.525147708777536, + "k1_kl": 0.12353515625, + "k3_kl": 0.072265625, + "kimi_kl": 0.1474609375, + "learning_rate": 2.982e-07, + "loss": 0.0029, + "ppl": 0.2109375, + "reward": 0.7186895608901978, + "reward_std": 0.08734344691038132, + "rewards/perpo_ocr_edit_distance_reward": 0.7186895608901978, "step": 2018, "temperature": 0.9 }, { - "advantages": -3.5256147384643555e-05, - "completion_length": 977.5, - "delta_ref_entropy_loss": 0.062744140625, - "delta_ref_ppl": -0.039947509765625, - "entropy_loss": -0.06805419921875, - "epoch": 0.8076, - "grad_norm": 0.8617597510848458, - "k1_kl": 0.040191650390625, - "k3_kl": 0.01873779296875, - "kimi_kl": 0.0363616943359375, - "learning_rate": 9.619999999999999e-08, - "loss": 0.0008, - "ppl": 0.03515625, - "reward": 0.9531338214874268, - "reward_std": 0.0028915059738210402, - "rewards/perpo_ocr_edit_distance_reward": 0.9531338810920715, + "advantages": -5.619866897177417e-06, + "completion_length": 685.0, + "delta_ref_entropy_loss": 0.0546875, + "delta_ref_ppl": -0.0712890625, + "entropy_loss": -0.046142578125, + "epoch": 0.4038, + "grad_norm": 0.44303885139641447, + "k1_kl": 0.0712890625, + "k3_kl": 0.04345703125, + "kimi_kl": 0.1396484375, + "learning_rate": 2.9809999999999995e-07, + "loss": 0.0017, + "ppl": 0.0206298828125, + "reward": 0.9822162985801697, + "reward_std": 0.00898059830069542, + "rewards/perpo_ocr_edit_distance_reward": 0.9822163581848145, "step": 2019, "temperature": 0.9 }, { - "advantages": -0.00010695415403461084, - "completion_length": 414.0, - "delta_ref_entropy_loss": 0.03717041015625, - "delta_ref_ppl": -0.03509521484375, - "entropy_loss": -0.0811767578125, - "epoch": 0.808, - "grad_norm": 2.083951492597076, - "k1_kl": 0.03509521484375, - "k3_kl": 0.022613525390625, - "kimi_kl": 0.0384521484375, - "learning_rate": 9.6e-08, - "loss": 0.001, - "ppl": 0.0478515625, - "reward": 0.9451569616794586, - "reward_std": 0.005383259092923254, - "rewards/perpo_ocr_edit_distance_reward": 0.9451570510864258, + "advantages": -8.991786671685986e-06, + "completion_length": 112.0, + "delta_ref_entropy_loss": 0.10302734375, + "delta_ref_ppl": -0.2890625, + "entropy_loss": -0.10302734375, + "epoch": 0.404, + "grad_norm": 2.2919827369251076, + "k1_kl": 0.2890625, + "k3_kl": 0.22265625, + "kimi_kl": 0.796875, + "learning_rate": 2.98e-07, + "loss": 0.0089, + "ppl": 0.04931640625, + "reward": 0.9451953768730164, + "reward_std": 0.004635958466678858, + "rewards/perpo_ocr_edit_distance_reward": 0.9451954960823059, "step": 2020, "temperature": 0.9 }, { - "advantages": -7.552760507678613e-06, - "completion_length": 871.5, - "delta_ref_entropy_loss": 0.03594970703125, - "delta_ref_ppl": -0.02471923828125, - "entropy_loss": -0.0406494140625, - "epoch": 0.8084, - "grad_norm": 0.533179421360148, - "k1_kl": 0.02471923828125, - "k3_kl": 0.01458740234375, - "kimi_kl": 0.02825927734375, - "learning_rate": 9.58e-08, - "loss": 0.0006, - "ppl": 0.02008056640625, - "reward": 0.9967202842235565, - "reward_std": 0.0008007512369658798, - "rewards/perpo_ocr_edit_distance_reward": 0.9967202842235565, + "advantages": -2.786091499729082e-05, + "completion_length": 515.0, + "delta_ref_entropy_loss": 0.0927734375, + "delta_ref_ppl": -0.0927734375, + "entropy_loss": -0.06591796875, + "epoch": 0.4042, + "grad_norm": 0.7815033645955755, + "k1_kl": 0.09326171875, + "k3_kl": 0.051513671875, + "kimi_kl": 0.1533203125, + "learning_rate": 2.979e-07, + "loss": 0.0021, + "ppl": 0.0264892578125, + "reward": 0.9397084712982178, + "reward_std": 0.0005116484244354069, + "rewards/perpo_ocr_edit_distance_reward": 0.9397084712982178, "step": 2021, "temperature": 0.9 }, { - "advantages": -1.3879367543268017e-05, - "completion_length": 319.0, - "delta_ref_entropy_loss": 0.0560302734375, - "delta_ref_ppl": -0.0452880859375, - "entropy_loss": -0.0423583984375, - "epoch": 0.8088, - "grad_norm": 0.9148870004330212, - "k1_kl": 0.04522705078125, - "k3_kl": 0.024444580078125, - "kimi_kl": 0.055572509765625, - "learning_rate": 9.56e-08, - "loss": 0.001, - "ppl": 0.025390625, - "reward": 0.9992609024047852, - "reward_std": 0.0007952586747705936, - "rewards/perpo_ocr_edit_distance_reward": 0.9992609620094299, + "advantages": -0.00011953286593779922, + "completion_length": 585.0, + "delta_ref_entropy_loss": 0.048583984375, + "delta_ref_ppl": -0.04345703125, + "entropy_loss": -0.030029296875, + "epoch": 0.4044, + "grad_norm": 0.669386588304797, + "k1_kl": 0.043212890625, + "k3_kl": 0.026611328125, + "kimi_kl": 0.07275390625, + "learning_rate": 2.978e-07, + "loss": 0.0012, + "ppl": 0.0101318359375, + "reward": 0.9965004920959473, + "reward_std": 0.00032749338424764574, + "rewards/perpo_ocr_edit_distance_reward": 0.996500551700592, "step": 2022, "temperature": 0.9 }, { - "advantages": -2.720526498478648e-05, - "completion_length": 670.0, - "delta_ref_entropy_loss": 0.036285400390625, - "delta_ref_ppl": -0.165924072265625, - "entropy_loss": -0.1439208984375, - "epoch": 0.8092, - "grad_norm": 3.9835936372284437, - "k1_kl": 0.165924072265625, - "k3_kl": 0.1218109130859375, - "kimi_kl": 0.446319580078125, - "learning_rate": 9.54e-08, - "loss": 0.0049, - "ppl": 0.073944091796875, - "reward": 0.6713815182447433, - "reward_std": 0.008355451573152095, - "rewards/perpo_ocr_edit_distance_reward": 0.6713815629482269, + "advantages": -3.0313219667732483e-06, + "completion_length": 483.0, + "delta_ref_entropy_loss": 0.0830078125, + "delta_ref_ppl": -0.10888671875, + "entropy_loss": -0.06787109375, + "epoch": 0.4046, + "grad_norm": 1.0494291052344074, + "k1_kl": 0.10888671875, + "k3_kl": 0.072265625, + "kimi_kl": 0.20703125, + "learning_rate": 2.977e-07, + "loss": 0.0029, + "ppl": 0.030517578125, + "reward": 0.9921674728393555, + "reward_std": 0.002724200952798128, + "rewards/perpo_ocr_edit_distance_reward": 0.9921675324440002, "step": 2023, "temperature": 0.9 }, { - "advantages": -1.0596855645417236e-05, - "completion_length": 817.0, - "delta_ref_entropy_loss": 0.02349853515625, - "delta_ref_ppl": -0.025634765625, - "entropy_loss": -0.02117919921875, - "epoch": 0.8096, - "grad_norm": 0.38581594867727265, - "k1_kl": 0.025634765625, - "k3_kl": 0.015716552734375, - "kimi_kl": 0.042724609375, - "learning_rate": 9.52e-08, - "loss": 0.0006, - "ppl": 0.009490966796875, - "reward": 0.9980645179748535, - "reward_std": 0.00013262775973998941, - "rewards/perpo_ocr_edit_distance_reward": 0.9980645477771759, + "advantages": -6.132892303867266e-05, + "completion_length": 1380.0, + "delta_ref_entropy_loss": 0.06005859375, + "delta_ref_ppl": -0.07421875, + "entropy_loss": -0.07568359375, + "epoch": 0.4048, + "grad_norm": 2.5189502171889235, + "k1_kl": 0.07421875, + "k3_kl": 0.04638671875, + "kimi_kl": 0.12890625, + "learning_rate": 2.9759999999999996e-07, + "loss": 0.0019, + "ppl": 0.033203125, + "reward": 0.9715747237205505, + "reward_std": 0.0015657370677217841, + "rewards/perpo_ocr_edit_distance_reward": 0.9715748429298401, "step": 2024, "temperature": 0.9 }, { - "advantages": -3.312315357106854e-05, - "completion_length": 809.0, - "delta_ref_entropy_loss": 0.04962158203125, - "delta_ref_ppl": -0.03271484375, - "entropy_loss": -0.0560302734375, - "epoch": 0.81, - "grad_norm": 0.6378510819445473, - "k1_kl": 0.03271484375, - "k3_kl": 0.0185699462890625, - "kimi_kl": 0.04803466796875, - "learning_rate": 9.499999999999999e-08, - "loss": 0.0008, - "ppl": 0.027069091796875, - "reward": 0.9920419752597809, - "reward_std": 0.0014827594059170224, - "rewards/perpo_ocr_edit_distance_reward": 0.9920420050621033, + "advantages": -1.958438360816217e-06, + "completion_length": 400.0, + "delta_ref_entropy_loss": 0.1572265625, + "delta_ref_ppl": -0.13671875, + "entropy_loss": -0.294921875, + "epoch": 0.405, + "grad_norm": 1.993075195925252, + "k1_kl": 0.13671875, + "k3_kl": 0.0830078125, + "kimi_kl": 0.2275390625, + "learning_rate": 2.9749999999999996e-07, + "loss": 0.0033, + "ppl": 0.1572265625, + "reward": 0.839285671710968, + "reward_std": 0.00850186962634325, + "rewards/perpo_ocr_edit_distance_reward": 0.8392857313156128, "step": 2025, "temperature": 0.9 }, { - "advantages": -0.0003178417682647705, - "completion_length": 437.0, - "delta_ref_entropy_loss": 0.0374755859375, - "delta_ref_ppl": -0.027099609375, - "entropy_loss": -0.01580810546875, - "epoch": 0.8104, - "grad_norm": 0.40186211323462745, - "k1_kl": 0.0269775390625, - "k3_kl": 0.015167236328125, - "kimi_kl": 0.037994384765625, - "learning_rate": 9.479999999999999e-08, - "loss": 0.0009, - "ppl": 0.00475311279296875, - "reward": 0.9846284687519073, - "reward_std": 0.0009162161150015891, - "rewards/perpo_ocr_edit_distance_reward": 0.9846285581588745, + "advantages": 8.514949634275126e-09, + "completion_length": 423.0, + "delta_ref_entropy_loss": 0.032470703125, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.0247802734375, + "epoch": 0.4052, + "grad_norm": 0.36870453589456603, + "k1_kl": 0.0703125, + "k3_kl": 0.0517578125, + "kimi_kl": 0.1787109375, + "learning_rate": 2.974e-07, + "loss": 0.0021, + "ppl": 0.0093994140625, + "reward": 0.9935596585273743, + "reward_std": 0.0010326693300157785, + "rewards/perpo_ocr_edit_distance_reward": 0.993559718132019, "step": 2026, "temperature": 0.9 }, { - "advantages": -2.0529543689917773e-05, - "completion_length": 794.5, - "delta_ref_entropy_loss": 0.060791015625, - "delta_ref_ppl": -0.0416259765625, - "entropy_loss": -0.0888671875, - "epoch": 0.8108, - "grad_norm": 0.9514080243094098, - "k1_kl": 0.0416259765625, - "k3_kl": 0.0233154296875, - "kimi_kl": 0.063232421875, - "learning_rate": 9.46e-08, - "loss": 0.001, - "ppl": 0.0498046875, - "reward": 0.9588336944580078, - "reward_std": 0.0030477476539090276, - "rewards/perpo_ocr_edit_distance_reward": 0.958833783864975, + "advantages": -4.570267265080474e-05, + "completion_length": 221.0, + "delta_ref_entropy_loss": 0.07568359375, + "delta_ref_ppl": -0.1171875, + "entropy_loss": -0.076171875, + "epoch": 0.4054, + "grad_norm": 1.474891312047118, + "k1_kl": 0.11669921875, + "k3_kl": 0.08544921875, + "kimi_kl": 0.3359375, + "learning_rate": 2.973e-07, + "loss": 0.0035, + "ppl": 0.033203125, + "reward": 0.8592283129692078, + "reward_std": 0.001204059342853725, + "rewards/perpo_ocr_edit_distance_reward": 0.8592284321784973, "step": 2027, "temperature": 0.9 }, { - "advantages": -1.687663071958667e-05, - "completion_length": 906.0, - "delta_ref_entropy_loss": 0.057861328125, - "delta_ref_ppl": -0.072265625, - "entropy_loss": -0.07177734375, - "epoch": 0.8112, - "grad_norm": 3.4211689114784556, - "k1_kl": 0.0721435546875, - "k3_kl": 0.05645751953125, - "kimi_kl": 0.2427978515625, - "learning_rate": 9.44e-08, - "loss": 0.0023, - "ppl": 0.0406494140625, - "reward": 0.7883727252483368, - "reward_std": 0.24126426654402167, - "rewards/perpo_ocr_edit_distance_reward": 0.7883727550506592, + "advantages": -1.611028528714087e-05, + "completion_length": 620.0, + "delta_ref_entropy_loss": 0.1650390625, + "delta_ref_ppl": -0.10498046875, + "entropy_loss": -0.185546875, + "epoch": 0.4056, + "grad_norm": 1.5070313359142213, + "k1_kl": 0.10546875, + "k3_kl": 0.060302734375, + "kimi_kl": 0.1357421875, + "learning_rate": 2.972e-07, + "loss": 0.0024, + "ppl": 0.091796875, + "reward": 0.71964031457901, + "reward_std": 0.003074516309425235, + "rewards/perpo_ocr_edit_distance_reward": 0.7196404337882996, "step": 2028, "temperature": 0.9 }, { - "advantages": -0.0002945661542526068, - "completion_length": 623.0, - "delta_ref_entropy_loss": 0.045654296875, - "delta_ref_ppl": -0.03082275390625, - "entropy_loss": -0.037353515625, - "epoch": 0.8116, - "grad_norm": 0.741034464777094, - "k1_kl": 0.03082275390625, - "k3_kl": 0.020111083984375, - "kimi_kl": 0.0462646484375, - "learning_rate": 9.42e-08, - "loss": 0.0011, - "ppl": 0.01995849609375, - "reward": 0.9919966757297516, - "reward_std": 0.0005671234684996307, - "rewards/perpo_ocr_edit_distance_reward": 0.991996705532074, + "advantages": -6.866455078125e-05, + "completion_length": 148.0, + "delta_ref_entropy_loss": 0.09326171875, + "delta_ref_ppl": -0.216796875, + "entropy_loss": -0.06298828125, + "epoch": 0.4058, + "grad_norm": 1.012487366603724, + "k1_kl": 0.216796875, + "k3_kl": 0.1591796875, + "kimi_kl": 0.61328125, + "learning_rate": 2.971e-07, + "loss": 0.0064, + "ppl": 0.0177001953125, + "reward": 0.9992436766624451, + "reward_std": 0.0013880346668884158, + "rewards/perpo_ocr_edit_distance_reward": 0.9992437958717346, "step": 2029, "temperature": 0.9 }, { - "advantages": 4.6406474609739234e-06, - "completion_length": 318.0, - "delta_ref_entropy_loss": 0.05517578125, - "delta_ref_ppl": -0.131103515625, - "entropy_loss": -0.1353759765625, - "epoch": 0.812, - "grad_norm": 2.3076794720394473, - "k1_kl": 0.1307373046875, - "k3_kl": 0.09423828125, - "kimi_kl": 0.32421875, - "learning_rate": 9.4e-08, - "loss": 0.0038, - "ppl": 0.07171630859375, - "reward": 0.6578247398138046, - "reward_std": 0.019601719017373398, - "rewards/perpo_ocr_edit_distance_reward": 0.6578247547149658, + "advantages": -6.147793556010583e-06, + "completion_length": 920.0, + "delta_ref_entropy_loss": 0.0810546875, + "delta_ref_ppl": -0.10546875, + "entropy_loss": -0.06298828125, + "epoch": 0.406, + "grad_norm": 2.299154814432748, + "k1_kl": 0.10498046875, + "k3_kl": 0.064453125, + "kimi_kl": 0.1845703125, + "learning_rate": 2.9699999999999997e-07, + "loss": 0.0026, + "ppl": 0.0284423828125, + "reward": 0.9220043420791626, + "reward_std": 0.009590903297066689, + "rewards/perpo_ocr_edit_distance_reward": 0.9220044612884521, "step": 2030, "temperature": 0.9 }, { - "advantages": -2.3652400813034546e-05, - "completion_length": 1050.5, - "delta_ref_entropy_loss": 0.03619384765625, - "delta_ref_ppl": -0.03662109375, - "entropy_loss": -0.06463623046875, - "epoch": 0.8124, - "grad_norm": 0.9952433136084143, - "k1_kl": 0.03662109375, - "k3_kl": 0.02545166015625, - "kimi_kl": 0.0794677734375, - "learning_rate": 9.379999999999999e-08, - "loss": 0.001, - "ppl": 0.035552978515625, - "reward": 0.9387972354888916, - "reward_std": 0.020794058742467314, - "rewards/perpo_ocr_edit_distance_reward": 0.9387972950935364, + "advantages": -1.3623919414840202e-07, + "completion_length": 99.0, + "delta_ref_entropy_loss": 0.0390625, + "delta_ref_ppl": -0.47265625, + "entropy_loss": -0.21484375, + "epoch": 0.4062, + "grad_norm": 6.442827733182088, + "k1_kl": 0.474609375, + "k3_kl": 0.38671875, + "kimi_kl": 2.109375, + "learning_rate": 2.9689999999999997e-07, + "loss": 0.0155, + "ppl": 0.09130859375, + "reward": 0.4264069199562073, + "reward_std": 0.11247166246175766, + "rewards/perpo_ocr_edit_distance_reward": 0.42640694975852966, "step": 2031, "temperature": 0.9 }, { - "advantages": -6.249973239391693e-06, - "completion_length": 1221.0, - "delta_ref_entropy_loss": 0.0360107421875, - "delta_ref_ppl": -0.0413818359375, - "entropy_loss": -0.464599609375, - "epoch": 0.8128, - "grad_norm": 1032.6532079149522, - "k1_kl": 0.0413818359375, - "k3_kl": 14.45294189453125, - "kimi_kl": 0.10009765625, - "learning_rate": 9.36e-08, - "loss": 0.5807, - "ppl": 0.284423828125, - "reward": 0.6235131919384003, - "reward_std": 0.012655185535550117, - "rewards/perpo_ocr_edit_distance_reward": 0.6235132217407227, + "advantages": -4.669598274631426e-05, + "completion_length": 163.0, + "delta_ref_entropy_loss": 0.07568359375, + "delta_ref_ppl": -0.150390625, + "entropy_loss": -0.07421875, + "epoch": 0.4064, + "grad_norm": 1.4917163024089772, + "k1_kl": 0.1513671875, + "k3_kl": 0.1103515625, + "kimi_kl": 0.439453125, + "learning_rate": 2.968e-07, + "loss": 0.0045, + "ppl": 0.0260009765625, + "reward": 0.9695142507553101, + "reward_std": 0.0019059096230193973, + "rewards/perpo_ocr_edit_distance_reward": 0.9695143699645996, "step": 2032, "temperature": 0.9 }, { - "advantages": -1.087146165446029e-05, - "completion_length": 688.0, - "delta_ref_entropy_loss": 0.087890625, - "delta_ref_ppl": -0.06915283203125, - "entropy_loss": -0.085693359375, - "epoch": 0.8132, - "grad_norm": 1.5634641943763175, - "k1_kl": 0.069091796875, - "k3_kl": 0.04400634765625, - "kimi_kl": 0.13427734375, - "learning_rate": 9.339999999999999e-08, - "loss": 0.0018, - "ppl": 0.0458984375, - "reward": 0.9716385006904602, - "reward_std": 0.005112607032060623, - "rewards/perpo_ocr_edit_distance_reward": 0.9716385304927826, + "advantages": -2.143212805094663e-05, + "completion_length": 577.0, + "delta_ref_entropy_loss": 0.07958984375, + "delta_ref_ppl": -0.0849609375, + "entropy_loss": -0.049560546875, + "epoch": 0.4066, + "grad_norm": 0.7714172860592361, + "k1_kl": 0.0849609375, + "k3_kl": 0.05078125, + "kimi_kl": 0.267578125, + "learning_rate": 2.967e-07, + "loss": 0.002, + "ppl": 0.0234375, + "reward": 0.978280246257782, + "reward_std": 0.0010921151842921972, + "rewards/perpo_ocr_edit_distance_reward": 0.9782803058624268, "step": 2033, "temperature": 0.9 }, { - "advantages": -3.4059798537100505e-08, - "completion_length": 240.0, - "delta_ref_entropy_loss": 0.0262451171875, - "delta_ref_ppl": -0.290771484375, - "entropy_loss": -0.11334228515625, - "epoch": 0.8136, - "grad_norm": 4.475683859818462, - "k1_kl": 0.29083251953125, - "k3_kl": 0.24664306640625, - "kimi_kl": 1.4290771484375, - "learning_rate": 9.32e-08, - "loss": 0.0099, - "ppl": 0.05963134765625, - "reward": 0.6858600676059723, - "reward_std": 0.08534926921129227, - "rewards/perpo_ocr_edit_distance_reward": 0.6858600676059723, + "advantages": -0.00010446140368003398, + "completion_length": 735.0, + "delta_ref_entropy_loss": 0.0263671875, + "delta_ref_ppl": -0.0400390625, + "entropy_loss": -0.04296875, + "epoch": 0.4068, + "grad_norm": 0.632840224102582, + "k1_kl": 0.0400390625, + "k3_kl": 0.0262451171875, + "kimi_kl": 0.08056640625, + "learning_rate": 2.9659999999999994e-07, + "loss": 0.0012, + "ppl": 0.02099609375, + "reward": 0.9913342595100403, + "reward_std": 0.001041454030200839, + "rewards/perpo_ocr_edit_distance_reward": 0.9913344383239746, "step": 2034, "temperature": 0.9 }, { - "advantages": -4.539319797913777e-05, - "completion_length": 687.5, - "delta_ref_entropy_loss": 0.05682373046875, - "delta_ref_ppl": -0.03802490234375, - "entropy_loss": -0.04742431640625, - "epoch": 0.814, - "grad_norm": 0.8289776768646714, - "k1_kl": 0.03802490234375, - "k3_kl": 0.01904296875, - "kimi_kl": 0.035186767578125, - "learning_rate": 9.3e-08, - "loss": 0.0008, - "ppl": 0.02349853515625, - "reward": 0.9852165579795837, - "reward_std": 0.0025465237558819354, - "rewards/perpo_ocr_edit_distance_reward": 0.9852166175842285, + "advantages": -1.1648450708889868e-05, + "completion_length": 139.0, + "delta_ref_entropy_loss": 0.05517578125, + "delta_ref_ppl": -0.123046875, + "entropy_loss": -0.11328125, + "epoch": 0.407, + "grad_norm": 3.104566855678081, + "k1_kl": 0.12255859375, + "k3_kl": 0.0849609375, + "kimi_kl": 0.265625, + "learning_rate": 2.965e-07, + "loss": 0.0034, + "ppl": 0.059326171875, + "reward": 0.9912933707237244, + "reward_std": 0.00502685084939003, + "rewards/perpo_ocr_edit_distance_reward": 0.9912934303283691, "step": 2035, "temperature": 0.9 }, { - "advantages": -7.841417436793563e-05, - "completion_length": 779.5, - "delta_ref_entropy_loss": 0.021728515625, - "delta_ref_ppl": -0.016754150390625, - "entropy_loss": -0.022216796875, - "epoch": 0.8144, - "grad_norm": 0.24779145424026974, - "k1_kl": 0.016845703125, - "k3_kl": 0.008514404296875, - "kimi_kl": 0.01806640625, - "learning_rate": 9.279999999999998e-08, - "loss": 0.0004, - "ppl": 0.00799560546875, - "reward": 0.9972639083862305, - "reward_std": 0.0011147709665237926, - "rewards/perpo_ocr_edit_distance_reward": 0.9972639381885529, + "advantages": -4.905462628812529e-05, + "completion_length": 1224.0, + "delta_ref_entropy_loss": 0.05322265625, + "delta_ref_ppl": -0.0556640625, + "entropy_loss": -0.06396484375, + "epoch": 0.4072, + "grad_norm": 1.7217067043465162, + "k1_kl": 0.0556640625, + "k3_kl": 0.041259765625, + "kimi_kl": 0.08251953125, + "learning_rate": 2.964e-07, + "loss": 0.0017, + "ppl": 0.031982421875, + "reward": 0.9723055958747864, + "reward_std": 0.0007677443791180849, + "rewards/perpo_ocr_edit_distance_reward": 0.9723056554794312, "step": 2036, "temperature": 0.9 }, { - "advantages": -9.230205250787549e-06, - "completion_length": 57.0, - "delta_ref_entropy_loss": 0.0631103515625, - "delta_ref_ppl": -0.1611328125, - "entropy_loss": -0.053466796875, - "epoch": 0.8148, - "grad_norm": 3.6606939984022198, - "k1_kl": 0.1611328125, - "k3_kl": 0.121826171875, - "kimi_kl": 0.4052734375, - "learning_rate": 9.26e-08, - "loss": 0.0049, - "ppl": 0.04132080078125, - "reward": 0.9983291625976562, - "reward_std": 0.0015629252884536982, - "rewards/perpo_ocr_edit_distance_reward": 0.9983291923999786, + "advantages": -6.624630714213708e-06, + "completion_length": 705.0, + "delta_ref_entropy_loss": 0.080078125, + "delta_ref_ppl": -0.08447265625, + "entropy_loss": -0.08740234375, + "epoch": 0.4074, + "grad_norm": 1.789874751847012, + "k1_kl": 0.08447265625, + "k3_kl": 0.05859375, + "kimi_kl": 0.1826171875, + "learning_rate": 2.9629999999999997e-07, + "loss": 0.0023, + "ppl": 0.0400390625, + "reward": 0.9166001081466675, + "reward_std": 0.019134406000375748, + "rewards/perpo_ocr_edit_distance_reward": 0.9166002869606018, "step": 2037, "temperature": 0.9 }, { - "advantages": -3.8006478092711404e-05, - "completion_length": 331.5, - "delta_ref_entropy_loss": 0.094970703125, - "delta_ref_ppl": -0.10107421875, - "entropy_loss": -0.119140625, - "epoch": 0.8152, - "grad_norm": 1.8812194080074924, - "k1_kl": 0.10107421875, - "k3_kl": 0.0657958984375, - "kimi_kl": 0.19970703125, - "learning_rate": 9.24e-08, - "loss": 0.0027, - "ppl": 0.063446044921875, - "reward": 0.8855560421943665, - "reward_std": 0.03376499531441368, - "rewards/perpo_ocr_edit_distance_reward": 0.8855561017990112, + "advantages": -8.96624187589623e-05, + "completion_length": 704.0, + "delta_ref_entropy_loss": 0.0400390625, + "delta_ref_ppl": -0.042236328125, + "entropy_loss": -0.03466796875, + "epoch": 0.4076, + "grad_norm": 0.38089705533370033, + "k1_kl": 0.04248046875, + "k3_kl": 0.0244140625, + "kimi_kl": 0.06396484375, + "learning_rate": 2.962e-07, + "loss": 0.0011, + "ppl": 0.013427734375, + "reward": 0.9973742961883545, + "reward_std": 0.0004697496769949794, + "rewards/perpo_ocr_edit_distance_reward": 0.9973743557929993, "step": 2038, "temperature": 0.9 }, { - "advantages": -3.156066168230609e-05, - "completion_length": 663.0, - "delta_ref_entropy_loss": 0.0306396484375, - "delta_ref_ppl": -0.0330810546875, - "entropy_loss": -0.03302001953125, - "epoch": 0.8156, - "grad_norm": 0.6300362596890325, - "k1_kl": 0.0330810546875, - "k3_kl": 0.02069091796875, - "kimi_kl": 0.0513916015625, - "learning_rate": 9.22e-08, - "loss": 0.0009, - "ppl": 0.017608642578125, - "reward": 0.9970899820327759, - "reward_std": 0.0009280231315642595, - "rewards/perpo_ocr_edit_distance_reward": 0.9970900118350983, + "advantages": -1.8392290712654358e-06, + "completion_length": 530.0, + "delta_ref_entropy_loss": 0.0712890625, + "delta_ref_ppl": -0.10986328125, + "entropy_loss": -0.1708984375, + "epoch": 0.4078, + "grad_norm": 1.5295984025600016, + "k1_kl": 0.1103515625, + "k3_kl": 0.07421875, + "kimi_kl": 0.23046875, + "learning_rate": 2.9609999999999996e-07, + "loss": 0.003, + "ppl": 0.09912109375, + "reward": 0.7891985774040222, + "reward_std": 0.018281932920217514, + "rewards/perpo_ocr_edit_distance_reward": 0.789198637008667, "step": 2039, "temperature": 0.9 }, { - "advantages": -6.650175691902405e-06, - "completion_length": 368.5, - "delta_ref_entropy_loss": 0.0772705078125, - "delta_ref_ppl": -0.19219970703125, - "entropy_loss": -0.05413818359375, - "epoch": 0.816, - "grad_norm": 0.3726339135274581, - "k1_kl": 0.19317626953125, - "k3_kl": 0.15618896484375, - "kimi_kl": 0.74951171875, - "learning_rate": 9.199999999999999e-08, - "loss": 0.0063, - "ppl": 0.024658203125, - "reward": 0.9992820918560028, - "reward_std": 0.0005895323702134192, - "rewards/perpo_ocr_edit_distance_reward": 0.9992821216583252, + "advantages": -8.71930842549773e-06, + "completion_length": 522.0, + "delta_ref_entropy_loss": 0.06640625, + "delta_ref_ppl": -0.07080078125, + "entropy_loss": -0.09130859375, + "epoch": 0.408, + "grad_norm": 0.9302596608463696, + "k1_kl": 0.07080078125, + "k3_kl": 0.040283203125, + "kimi_kl": 0.095703125, + "learning_rate": 2.9599999999999995e-07, + "loss": 0.0016, + "ppl": 0.03466796875, + "reward": 0.9761413335800171, + "reward_std": 0.008659754879772663, + "rewards/perpo_ocr_edit_distance_reward": 0.9761414527893066, "step": 2040, "temperature": 0.9 }, { - "advantages": -3.789152671629381e-06, - "completion_length": 1242.0, - "delta_ref_entropy_loss": 0.11328125, - "delta_ref_ppl": -0.079833984375, - "entropy_loss": -0.210693359375, - "epoch": 0.8164, - "grad_norm": 445.53963117053274, - "k1_kl": 0.079833984375, - "k3_kl": 1.267578125, - "kimi_kl": 0.14599609375, - "learning_rate": 9.18e-08, - "loss": 0.0508, - "ppl": 0.137451171875, - "reward": 0.8273315727710724, - "reward_std": 0.018192239571362734, - "rewards/perpo_ocr_edit_distance_reward": 0.8273316025733948, + "advantages": -9.332385161542334e-06, + "completion_length": 86.0, + "delta_ref_entropy_loss": 0.1318359375, + "delta_ref_ppl": -0.30078125, + "entropy_loss": -0.1572265625, + "epoch": 0.4082, + "grad_norm": 4.687295314144904, + "k1_kl": 0.30078125, + "k3_kl": 0.232421875, + "kimi_kl": 0.76953125, + "learning_rate": 2.959e-07, + "loss": 0.0093, + "ppl": 0.078125, + "reward": 0.8799057602882385, + "reward_std": 0.007218916434794664, + "rewards/perpo_ocr_edit_distance_reward": 0.8799058794975281, "step": 2041, "temperature": 0.9 }, { - "advantages": -2.8269632821320556e-06, - "completion_length": 465.0, - "delta_ref_entropy_loss": 0.0457763671875, - "delta_ref_ppl": -0.03619384765625, - "entropy_loss": -0.03167724609375, - "epoch": 0.8168, - "grad_norm": 0.766669801229321, - "k1_kl": 0.03619384765625, - "k3_kl": 0.02203369140625, - "kimi_kl": 0.05535888671875, - "learning_rate": 9.16e-08, - "loss": 0.0009, - "ppl": 0.0169677734375, - "reward": 0.9967489242553711, - "reward_std": 0.0052289473824203014, - "rewards/perpo_ocr_edit_distance_reward": 0.9967489838600159, + "advantages": -1.9873892597388476e-05, + "completion_length": 866.0, + "delta_ref_entropy_loss": 0.049072265625, + "delta_ref_ppl": -0.061767578125, + "entropy_loss": -0.031494140625, + "epoch": 0.4084, + "grad_norm": 4.5325503125866335, + "k1_kl": 0.061767578125, + "k3_kl": 0.037109375, + "kimi_kl": 0.10791015625, + "learning_rate": 2.958e-07, + "loss": 0.0015, + "ppl": 0.01123046875, + "reward": 0.9637416005134583, + "reward_std": 0.00032851900323294103, + "rewards/perpo_ocr_edit_distance_reward": 0.963741660118103, "step": 2042, "temperature": 0.9 }, { - "advantages": -6.6787008108804e-05, - "completion_length": 388.5, - "delta_ref_entropy_loss": 0.06146240234375, - "delta_ref_ppl": -0.04571533203125, - "entropy_loss": -0.041778564453125, - "epoch": 0.8172, - "grad_norm": 0.6512745876551541, - "k1_kl": 0.04571533203125, - "k3_kl": 0.027435302734375, - "kimi_kl": 0.0748291015625, - "learning_rate": 9.139999999999998e-08, - "loss": 0.0012, - "ppl": 0.0245208740234375, - "reward": 0.9960980415344238, - "reward_std": 0.0005671085964422673, - "rewards/perpo_ocr_edit_distance_reward": 0.996098130941391, + "advantages": -4.087175966560608e-07, + "completion_length": 362.0, + "delta_ref_entropy_loss": 0.05810546875, + "delta_ref_ppl": -0.3125, + "entropy_loss": -0.1787109375, + "epoch": 0.4086, + "grad_norm": 5.971218536668798, + "k1_kl": 0.310546875, + "k3_kl": 0.251953125, + "kimi_kl": 1.0625, + "learning_rate": 2.957e-07, + "loss": 0.0101, + "ppl": 0.0673828125, + "reward": 0.4966284930706024, + "reward_std": 0.11386234313249588, + "rewards/perpo_ocr_edit_distance_reward": 0.4966285526752472, "step": 2043, "temperature": 0.9 }, { - "advantages": -1.7029898913278885e-07, - "completion_length": 476.0, - "delta_ref_entropy_loss": 0.04052734375, - "delta_ref_ppl": -0.03826904296875, - "entropy_loss": -0.2071533203125, - "epoch": 0.8176, - "grad_norm": 1.1381769525170211, - "k1_kl": 0.0379638671875, - "k3_kl": 0.02545166015625, - "kimi_kl": 0.0479736328125, - "learning_rate": 9.12e-08, - "loss": 0.001, - "ppl": 0.12115478515625, - "reward": 0.5254295915365219, - "reward_std": 0.09837426622834755, - "rewards/perpo_ocr_edit_distance_reward": 0.5254296213388443, + "advantages": -1.985260496439878e-05, + "completion_length": 137.0, + "delta_ref_entropy_loss": 0.06982421875, + "delta_ref_ppl": -0.3046875, + "entropy_loss": -0.09375, + "epoch": 0.4088, + "grad_norm": 2.4385622328067758, + "k1_kl": 0.3046875, + "k3_kl": 0.25390625, + "kimi_kl": 1.3046875, + "learning_rate": 2.9559999999999997e-07, + "loss": 0.0101, + "ppl": 0.04638671875, + "reward": 0.5956594347953796, + "reward_std": 0.002472205553203821, + "rewards/perpo_ocr_edit_distance_reward": 0.5956595540046692, "step": 2044, "temperature": 0.9 }, { - "advantages": -7.50677936594002e-05, - "completion_length": 568.0, - "delta_ref_entropy_loss": 0.030517578125, - "delta_ref_ppl": -0.0242919921875, - "entropy_loss": -0.03704833984375, - "epoch": 0.818, - "grad_norm": 1.9738433468845, - "k1_kl": 0.024322509765625, - "k3_kl": 0.016021728515625, - "kimi_kl": 0.055450439453125, - "learning_rate": 9.1e-08, - "loss": 0.0007, - "ppl": 0.021148681640625, - "reward": 0.9986673593521118, - "reward_std": 0.00037059594615129754, - "rewards/perpo_ocr_edit_distance_reward": 0.9986674189567566, + "advantages": -8.048330346355215e-05, + "completion_length": 1016.0, + "delta_ref_entropy_loss": 0.04736328125, + "delta_ref_ppl": -0.09130859375, + "entropy_loss": -0.047119140625, + "epoch": 0.409, + "grad_norm": 1.051380877116806, + "k1_kl": 0.09130859375, + "k3_kl": 0.062255859375, + "kimi_kl": 0.1484375, + "learning_rate": 2.9549999999999997e-07, + "loss": 0.0026, + "ppl": 0.01904296875, + "reward": 0.9944128394126892, + "reward_std": 0.0004290442739147693, + "rewards/perpo_ocr_edit_distance_reward": 0.994412899017334, "step": 2045, "temperature": 0.9 }, { - "advantages": -1.0728836770113048e-06, - "completion_length": 650.0, - "delta_ref_entropy_loss": 0.04296875, - "delta_ref_ppl": -0.35675048828125, - "entropy_loss": -0.421875, - "epoch": 0.8184, - "grad_norm": 23.103421429437688, - "k1_kl": 0.35675048828125, - "k3_kl": 0.339508056640625, - "kimi_kl": 1.065673828125, - "learning_rate": 9.08e-08, - "loss": 0.0135, - "ppl": 0.252197265625, - "reward": 0.6034414917230606, - "reward_std": 0.08977279718965292, - "rewards/perpo_ocr_edit_distance_reward": 0.6034415662288666, + "advantages": -5.653926564264111e-06, + "completion_length": 676.0, + "delta_ref_entropy_loss": 0.04931640625, + "delta_ref_ppl": -0.056396484375, + "entropy_loss": -0.047119140625, + "epoch": 0.4092, + "grad_norm": 0.593015942544484, + "k1_kl": 0.056396484375, + "k3_kl": 0.036865234375, + "kimi_kl": 0.10986328125, + "learning_rate": 2.9539999999999996e-07, + "loss": 0.0015, + "ppl": 0.019775390625, + "reward": 0.9876490831375122, + "reward_std": 0.0014053435297682881, + "rewards/perpo_ocr_edit_distance_reward": 0.987649142742157, "step": 2046, "temperature": 0.9 }, { - "advantages": -5.449567765936081e-07, - "completion_length": 347.0, - "delta_ref_entropy_loss": 0.07080078125, - "delta_ref_ppl": -0.05401611328125, - "entropy_loss": -0.041748046875, - "epoch": 0.8188, - "grad_norm": 0.5864544462636961, - "k1_kl": 0.05401611328125, - "k3_kl": 0.03204345703125, - "kimi_kl": 0.08447265625, - "learning_rate": 9.059999999999999e-08, - "loss": 0.0013, - "ppl": 0.020050048828125, - "reward": 0.834617018699646, - "reward_std": 0.003823899431154132, - "rewards/perpo_ocr_edit_distance_reward": 0.8346170485019684, + "advantages": -5.4614887631032616e-05, + "completion_length": 478.0, + "delta_ref_entropy_loss": 0.054931640625, + "delta_ref_ppl": -0.08056640625, + "entropy_loss": -0.054443359375, + "epoch": 0.4094, + "grad_norm": 1.0871423456547544, + "k1_kl": 0.08056640625, + "k3_kl": 0.053955078125, + "kimi_kl": 0.162109375, + "learning_rate": 2.953e-07, + "loss": 0.0022, + "ppl": 0.0247802734375, + "reward": 0.9951537847518921, + "reward_std": 0.0014592339284718037, + "rewards/perpo_ocr_edit_distance_reward": 0.9951539039611816, "step": 2047, "temperature": 0.9 }, { - "advantages": -1.3768673397862585e-05, - "completion_length": 465.5, - "delta_ref_entropy_loss": 0.043701171875, - "delta_ref_ppl": -0.031982421875, - "entropy_loss": -0.0360107421875, - "epoch": 0.8192, - "grad_norm": 1.042574935293207, - "k1_kl": 0.031982421875, - "k3_kl": 0.019775390625, - "kimi_kl": 0.041015625, - "learning_rate": 9.039999999999999e-08, - "loss": 0.0008, - "ppl": 0.017822265625, - "reward": 0.9701904356479645, - "reward_std": 0.003052702435525134, - "rewards/perpo_ocr_edit_distance_reward": 0.9701905250549316, + "advantages": -1.985686321859248e-05, + "completion_length": 60.0, + "delta_ref_entropy_loss": 0.09765625, + "delta_ref_ppl": -0.58203125, + "entropy_loss": -0.1044921875, + "epoch": 0.4096, + "grad_norm": 1.8327313319027312, + "k1_kl": 0.58203125, + "k3_kl": 0.486328125, + "kimi_kl": 2.15625, + "learning_rate": 2.952e-07, + "loss": 0.0194, + "ppl": 0.039794921875, + "reward": 0.9835164546966553, + "reward_std": 0.0016152170719578862, + "rewards/perpo_ocr_edit_distance_reward": 0.9835165143013, "step": 2048, "temperature": 0.9 }, { - "advantages": -9.312800102634355e-05, - "completion_length": 697.5, - "delta_ref_entropy_loss": 0.03289794921875, - "delta_ref_ppl": -0.0223388671875, - "entropy_loss": -0.015625, - "epoch": 0.8196, - "grad_norm": 0.06605691737443033, - "k1_kl": 0.0224609375, - "k3_kl": 0.0125885009765625, - "kimi_kl": 0.042266845703125, - "learning_rate": 9.02e-08, - "loss": 0.0006, - "ppl": 0.005340576171875, - "reward": 0.971113532781601, - "reward_std": 4.143355181440711e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9711135625839233, + "advantages": -0.0002276940067531541, + "completion_length": 834.0, + "delta_ref_entropy_loss": 0.040771484375, + "delta_ref_ppl": -0.044677734375, + "entropy_loss": -0.026123046875, + "epoch": 0.4098, + "grad_norm": 0.31507478822810453, + "k1_kl": 0.044677734375, + "k3_kl": 0.0238037109375, + "kimi_kl": 0.05517578125, + "learning_rate": 2.9509999999999994e-07, + "loss": 0.0012, + "ppl": 0.00860595703125, + "reward": 0.9984070062637329, + "reward_std": 0.00023659082944504917, + "rewards/perpo_ocr_edit_distance_reward": 0.9984070062637329, "step": 2049, "temperature": 0.9 }, { - "advantages": -1.7694064808893017e-05, - "completion_length": 316.0, - "delta_ref_entropy_loss": 0.0772705078125, - "delta_ref_ppl": -0.06695556640625, - "entropy_loss": -0.0423583984375, - "epoch": 0.82, - "grad_norm": 1.9673780212519545, - "k1_kl": 0.06671142578125, - "k3_kl": 0.037078857421875, - "kimi_kl": 0.08135986328125, - "learning_rate": 9e-08, - "loss": 0.0015, - "ppl": 0.0214996337890625, - "reward": 0.9993641376495361, - "reward_std": 0.0007930597057566047, - "rewards/perpo_ocr_edit_distance_reward": 0.9993641674518585, + "advantages": 1.3215201761340722e-05, + "completion_length": 140.0, + "delta_ref_entropy_loss": 0.126953125, + "delta_ref_ppl": -0.28125, + "entropy_loss": -0.08544921875, + "epoch": 0.41, + "grad_norm": 1.6902764709828355, + "k1_kl": 0.28125, + "k3_kl": 0.208984375, + "kimi_kl": 0.7578125, + "learning_rate": 2.95e-07, + "loss": 0.0084, + "ppl": 0.026611328125, + "reward": 0.9885057806968689, + "reward_std": 0.002473177621141076, + "rewards/perpo_ocr_edit_distance_reward": 0.9885057210922241, "step": 2050, "temperature": 0.9 }, { - "advantages": 3.818954994017076e-06, - "completion_length": 782.0, - "delta_ref_entropy_loss": 0.08447265625, - "delta_ref_ppl": -0.0650634765625, - "entropy_loss": -0.248291015625, - "epoch": 0.8204, - "grad_norm": 4.240057458306955, - "k1_kl": 0.0650634765625, - "k3_kl": 0.0391845703125, - "kimi_kl": 0.064697265625, - "learning_rate": 8.98e-08, - "loss": 0.0016, - "ppl": 0.141845703125, - "reward": 0.684650182723999, - "reward_std": 0.04609453375451267, - "rewards/perpo_ocr_edit_distance_reward": 0.684650182723999, + "advantages": -7.851635018596426e-05, + "completion_length": 239.0, + "delta_ref_entropy_loss": 0.064453125, + "delta_ref_ppl": -0.109375, + "entropy_loss": -0.0625, + "epoch": 0.4102, + "grad_norm": 0.9434126936216043, + "k1_kl": 0.109375, + "k3_kl": 0.08251953125, + "kimi_kl": 0.30859375, + "learning_rate": 2.949e-07, + "loss": 0.0034, + "ppl": 0.0238037109375, + "reward": 0.8873724341392517, + "reward_std": 0.0012010493082925677, + "rewards/perpo_ocr_edit_distance_reward": 0.8873725533485413, "step": 2051, "temperature": 0.9 }, { - "advantages": 1.5556812286376953e-05, - "completion_length": 552.5, - "delta_ref_entropy_loss": 0.04156494140625, - "delta_ref_ppl": -0.03265380859375, - "entropy_loss": -0.0244140625, - "epoch": 0.8208, - "grad_norm": 0.30712432015902985, - "k1_kl": 0.032623291015625, - "k3_kl": 0.0180816650390625, - "kimi_kl": 0.053741455078125, - "learning_rate": 8.96e-08, - "loss": 0.0007, - "ppl": 0.01025390625, - "reward": 0.9947294294834137, - "reward_std": 0.00036047337925992906, - "rewards/perpo_ocr_edit_distance_reward": 0.9947294592857361, + "advantages": -1.7898424630402587e-05, + "completion_length": 642.0, + "delta_ref_entropy_loss": 0.045166015625, + "delta_ref_ppl": -0.052978515625, + "entropy_loss": -0.032958984375, + "epoch": 0.4104, + "grad_norm": 0.7203448276225879, + "k1_kl": 0.05322265625, + "k3_kl": 0.0322265625, + "kimi_kl": 0.10302734375, + "learning_rate": 2.948e-07, + "loss": 0.0013, + "ppl": 0.01141357421875, + "reward": 0.9982839822769165, + "reward_std": 0.0018022130243480206, + "rewards/perpo_ocr_edit_distance_reward": 0.9982839822769165, "step": 2052, "temperature": 0.9 }, { - "advantages": -1.7072474065571441e-06, - "completion_length": 740.5, - "delta_ref_entropy_loss": 0.0328369140625, - "delta_ref_ppl": -0.04229736328125, - "entropy_loss": -0.03533935546875, - "epoch": 0.8212, - "grad_norm": 0.5400955968892831, - "k1_kl": 0.0423583984375, - "k3_kl": 0.02777099609375, - "kimi_kl": 0.0745849609375, - "learning_rate": 8.939999999999999e-08, - "loss": 0.0011, - "ppl": 0.016845703125, - "reward": 0.9711326956748962, - "reward_std": 0.0074570681899785995, - "rewards/perpo_ocr_edit_distance_reward": 0.9711326956748962, + "advantages": 1.958438360816217e-06, + "completion_length": 778.0, + "delta_ref_entropy_loss": 0.0703125, + "delta_ref_ppl": -0.06640625, + "entropy_loss": -0.2158203125, + "epoch": 0.4106, + "grad_norm": 3.100670569532931, + "k1_kl": 0.06591796875, + "k3_kl": 0.037841796875, + "kimi_kl": 0.0654296875, + "learning_rate": 2.947e-07, + "loss": 0.0015, + "ppl": 0.1201171875, + "reward": 0.8018574714660645, + "reward_std": 0.025894831866025925, + "rewards/perpo_ocr_edit_distance_reward": 0.8018574118614197, "step": 2053, "temperature": 0.9 }, { - "advantages": -6.377697445714148e-06, - "completion_length": 513.5, - "delta_ref_entropy_loss": 0.049774169921875, - "delta_ref_ppl": -0.04254150390625, - "entropy_loss": -0.0498046875, - "epoch": 0.8216, - "grad_norm": 0.5380297275365981, - "k1_kl": 0.04254150390625, - "k3_kl": 0.024169921875, - "kimi_kl": 0.06512451171875, - "learning_rate": 8.919999999999999e-08, - "loss": 0.001, - "ppl": 0.027435302734375, - "reward": 0.9923565089702606, - "reward_std": 0.0022876712027937174, - "rewards/perpo_ocr_edit_distance_reward": 0.992356538772583, + "advantages": -4.139116936130449e-05, + "completion_length": 380.0, + "delta_ref_entropy_loss": 0.09716796875, + "delta_ref_ppl": -0.130859375, + "entropy_loss": -0.076171875, + "epoch": 0.4108, + "grad_norm": 1.5330063879590266, + "k1_kl": 0.130859375, + "k3_kl": 0.09228515625, + "kimi_kl": 0.361328125, + "learning_rate": 2.9459999999999995e-07, + "loss": 0.0037, + "ppl": 0.0390625, + "reward": 0.9923428893089294, + "reward_std": 0.002162352902814746, + "rewards/perpo_ocr_edit_distance_reward": 0.992343008518219, "step": 2054, "temperature": 0.9 }, { - "advantages": -2.2138868871479644e-07, - "completion_length": 1296.5, - "delta_ref_entropy_loss": 0.028167724609375, - "delta_ref_ppl": -0.025634765625, - "entropy_loss": -0.02447509765625, - "epoch": 0.822, - "grad_norm": 0.3433334399812419, - "k1_kl": 0.025604248046875, - "k3_kl": 0.0135650634765625, - "kimi_kl": 0.028564453125, - "learning_rate": 8.899999999999999e-08, - "loss": 0.0005, - "ppl": 0.011505126953125, - "reward": 0.9087785184383392, - "reward_std": 0.01741454191505909, - "rewards/perpo_ocr_edit_distance_reward": 0.9087785184383392, + "advantages": -4.051412906846963e-05, + "completion_length": 521.0, + "delta_ref_entropy_loss": 0.061767578125, + "delta_ref_ppl": -0.0908203125, + "entropy_loss": -0.0419921875, + "epoch": 0.411, + "grad_norm": 0.4845766759252714, + "k1_kl": 0.0908203125, + "k3_kl": 0.061767578125, + "kimi_kl": 0.1904296875, + "learning_rate": 2.945e-07, + "loss": 0.0025, + "ppl": 0.016845703125, + "reward": 0.9245792627334595, + "reward_std": 0.000950892164837569, + "rewards/perpo_ocr_edit_distance_reward": 0.924579381942749, "step": 2055, "temperature": 0.9 }, { - "advantages": -6.948199086309614e-06, - "completion_length": 1409.0, - "delta_ref_entropy_loss": 0.044921875, - "delta_ref_ppl": -0.0325927734375, - "entropy_loss": -0.06494140625, - "epoch": 0.8224, - "grad_norm": 3.3004056549801346, - "k1_kl": 0.032470703125, - "k3_kl": 0.02880859375, - "kimi_kl": 0.0528564453125, - "learning_rate": 8.88e-08, - "loss": 0.0012, - "ppl": 0.03424072265625, - "reward": 0.9884126782417297, - "reward_std": 0.006197925191372633, - "rewards/perpo_ocr_edit_distance_reward": 0.9884127378463745, + "advantages": -1.4475413991021924e-05, + "completion_length": 632.0, + "delta_ref_entropy_loss": 0.08544921875, + "delta_ref_ppl": -0.06787109375, + "entropy_loss": -0.126953125, + "epoch": 0.4112, + "grad_norm": 1.1116874435204291, + "k1_kl": 0.06787109375, + "k3_kl": 0.033935546875, + "kimi_kl": 0.0732421875, + "learning_rate": 2.944e-07, + "loss": 0.0014, + "ppl": 0.0595703125, + "reward": 0.8906802535057068, + "reward_std": 0.004608322400599718, + "rewards/perpo_ocr_edit_distance_reward": 0.8906803727149963, "step": 2056, "temperature": 0.9 }, { - "advantages": -2.8397357709764037e-05, - "completion_length": 442.5, - "delta_ref_entropy_loss": 0.0302734375, - "delta_ref_ppl": -0.028167724609375, - "entropy_loss": -0.024139404296875, - "epoch": 0.8228, - "grad_norm": 0.5654329836000772, - "k1_kl": 0.02813720703125, - "k3_kl": 0.0171966552734375, - "kimi_kl": 0.0510711669921875, - "learning_rate": 8.86e-08, - "loss": 0.0007, - "ppl": 0.0114898681640625, - "reward": 0.9998258948326111, - "reward_std": 0.0004605263384291902, - "rewards/perpo_ocr_edit_distance_reward": 0.9998259544372559, + "advantages": -4.274504590284778e-06, + "completion_length": 32.0, + "delta_ref_entropy_loss": 0.1162109375, + "delta_ref_ppl": -0.98828125, + "entropy_loss": -0.388671875, + "epoch": 0.4114, + "grad_norm": 9.51141873770428, + "k1_kl": 0.9921875, + "k3_kl": 0.796875, + "kimi_kl": 2.890625, + "learning_rate": 2.943e-07, + "loss": 0.032, + "ppl": 0.1689453125, + "reward": 0.7942176461219788, + "reward_std": 0.017809858545660973, + "rewards/perpo_ocr_edit_distance_reward": 0.7942177653312683, "step": 2057, "temperature": 0.9 }, { - "advantages": -4.8450063331983984e-05, - "completion_length": 790.5, - "delta_ref_entropy_loss": 0.03619384765625, - "delta_ref_ppl": -0.02398681640625, - "entropy_loss": -0.03094482421875, - "epoch": 0.8232, - "grad_norm": 0.5547294487997448, - "k1_kl": 0.0240478515625, - "k3_kl": 0.01214599609375, - "kimi_kl": 0.02532958984375, - "learning_rate": 8.84e-08, - "loss": 0.0005, - "ppl": 0.0150146484375, - "reward": 0.9980610311031342, - "reward_std": 0.0006230151775525883, - "rewards/perpo_ocr_edit_distance_reward": 0.9980610609054565, + "advantages": -2.081053753499873e-05, + "completion_length": 976.0, + "delta_ref_entropy_loss": 0.021240234375, + "delta_ref_ppl": -0.03466796875, + "entropy_loss": -0.036376953125, + "epoch": 0.4116, + "grad_norm": 3.093132271983301, + "k1_kl": 0.03466796875, + "k3_kl": 0.0216064453125, + "kimi_kl": 0.054443359375, + "learning_rate": 2.9420000000000003e-07, + "loss": 0.0009, + "ppl": 0.014404296875, + "reward": 0.9970719814300537, + "reward_std": 0.0007183279958553612, + "rewards/perpo_ocr_edit_distance_reward": 0.9970719814300537, "step": 2058, "temperature": 0.9 }, { - "advantages": -8.698020974406973e-05, - "completion_length": 766.5, - "delta_ref_entropy_loss": 0.04052734375, - "delta_ref_ppl": -0.029296875, - "entropy_loss": -0.041748046875, - "epoch": 0.8236, - "grad_norm": 0.9358365864981387, - "k1_kl": 0.02935791015625, - "k3_kl": 0.0176239013671875, - "kimi_kl": 0.03985595703125, - "learning_rate": 8.82e-08, - "loss": 0.0008, - "ppl": 0.022613525390625, - "reward": 0.9901512265205383, - "reward_std": 0.000733597727958113, - "rewards/perpo_ocr_edit_distance_reward": 0.9901512861251831, + "advantages": -4.664489460992627e-05, + "completion_length": 459.0, + "delta_ref_entropy_loss": 0.03955078125, + "delta_ref_ppl": -0.107421875, + "entropy_loss": -0.052978515625, + "epoch": 0.4118, + "grad_norm": 0.7451405267355259, + "k1_kl": 0.10693359375, + "k3_kl": 0.0849609375, + "kimi_kl": 0.30859375, + "learning_rate": 2.9409999999999997e-07, + "loss": 0.0034, + "ppl": 0.02099609375, + "reward": 0.9922580122947693, + "reward_std": 0.0011778968619182706, + "rewards/perpo_ocr_edit_distance_reward": 0.9922581315040588, "step": 2059, "temperature": 0.9 }, { - "advantages": 4.19787011196604e-06, - "completion_length": 447.0, - "delta_ref_entropy_loss": 0.0374755859375, - "delta_ref_ppl": -0.0328369140625, - "entropy_loss": -0.0279541015625, - "epoch": 0.824, - "grad_norm": 1.2136487738989037, - "k1_kl": 0.03277587890625, - "k3_kl": 0.02044677734375, - "kimi_kl": 0.06298828125, - "learning_rate": 8.8e-08, - "loss": 0.0008, - "ppl": 0.01251220703125, - "reward": 0.9980307221412659, - "reward_std": 0.0020039245719090104, - "rewards/perpo_ocr_edit_distance_reward": 0.9980307221412659, + "advantages": -9.996550943469629e-06, + "completion_length": 561.0, + "delta_ref_entropy_loss": 0.1435546875, + "delta_ref_ppl": -0.10546875, + "entropy_loss": -0.1533203125, + "epoch": 0.412, + "grad_norm": 1.9810905876302993, + "k1_kl": 0.10546875, + "k3_kl": 0.05615234375, + "kimi_kl": 0.1201171875, + "learning_rate": 2.9399999999999996e-07, + "loss": 0.0023, + "ppl": 0.078125, + "reward": 0.933876097202301, + "reward_std": 0.007557978387922049, + "rewards/perpo_ocr_edit_distance_reward": 0.9338761568069458, "step": 2060, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 1208.5, - "delta_ref_entropy_loss": 0.018798828125, - "delta_ref_ppl": -0.016204833984375, - "entropy_loss": -0.033233642578125, - "epoch": 0.8244, - "grad_norm": 1.1746901147568682, - "k1_kl": 0.01617431640625, - "k3_kl": 0.009490966796875, - "kimi_kl": 0.02117919921875, - "learning_rate": 8.78e-08, - "loss": 0.0004, - "ppl": 0.0182342529296875, - "reward": 0.9212477505207062, - "reward_std": 0.0012685616966336966, - "rewards/perpo_ocr_edit_distance_reward": 0.9212477803230286, + "advantages": -0.0001316411216976121, + "completion_length": 759.0, + "delta_ref_entropy_loss": 0.048583984375, + "delta_ref_ppl": -0.06298828125, + "entropy_loss": -0.045166015625, + "epoch": 0.4122, + "grad_norm": 0.6558898955490717, + "k1_kl": 0.06298828125, + "k3_kl": 0.03759765625, + "kimi_kl": 0.1044921875, + "learning_rate": 2.939e-07, + "loss": 0.0016, + "ppl": 0.0201416015625, + "reward": 0.9922201633453369, + "reward_std": 0.0005468166782520711, + "rewards/perpo_ocr_edit_distance_reward": 0.9922202229499817, "step": 2061, "temperature": 0.9 }, { - "advantages": -4.736014852824155e-05, - "completion_length": 1001.0, - "delta_ref_entropy_loss": 0.028564453125, - "delta_ref_ppl": -0.019775390625, - "entropy_loss": -0.02520751953125, - "epoch": 0.8248, - "grad_norm": 0.3352397927734963, - "k1_kl": 0.0197601318359375, - "k3_kl": 0.010345458984375, - "kimi_kl": 0.0259857177734375, - "learning_rate": 8.759999999999999e-08, - "loss": 0.0005, - "ppl": 0.0114898681640625, - "reward": 0.9874989688396454, - "reward_std": 0.0006219234492164105, - "rewards/perpo_ocr_edit_distance_reward": 0.9874989688396454, + "advantages": 1.1239733339607483e-06, + "completion_length": 473.0, + "delta_ref_entropy_loss": 0.1630859375, + "delta_ref_ppl": -0.13671875, + "entropy_loss": -0.25390625, + "epoch": 0.4124, + "grad_norm": 3.448915172881708, + "k1_kl": 0.1357421875, + "k3_kl": 0.0849609375, + "kimi_kl": 0.2333984375, + "learning_rate": 2.938e-07, + "loss": 0.0034, + "ppl": 0.1298828125, + "reward": 0.6967301368713379, + "reward_std": 0.022184064611792564, + "rewards/perpo_ocr_edit_distance_reward": 0.6967300772666931, "step": 2062, "temperature": 0.9 }, { - "advantages": -1.2048653559304512e-06, - "completion_length": 658.0, - "delta_ref_entropy_loss": 0.025634765625, - "delta_ref_ppl": -0.02685546875, - "entropy_loss": -0.0546875, - "epoch": 0.8252, - "grad_norm": 0.6720895901481557, - "k1_kl": 0.02685546875, - "k3_kl": 0.01751708984375, - "kimi_kl": 0.042236328125, - "learning_rate": 8.74e-08, - "loss": 0.0007, - "ppl": 0.026947021484375, - "reward": 0.9083586931228638, - "reward_std": 0.046246071346104145, - "rewards/perpo_ocr_edit_distance_reward": 0.9083587229251862, + "advantages": -1.3283321322887787e-06, + "completion_length": 558.0, + "delta_ref_entropy_loss": 0.15625, + "delta_ref_ppl": -0.12451171875, + "entropy_loss": -0.267578125, + "epoch": 0.4126, + "grad_norm": 2.8830477374203176, + "k1_kl": 0.12451171875, + "k3_kl": 0.0771484375, + "kimi_kl": 0.189453125, + "learning_rate": 2.937e-07, + "loss": 0.0031, + "ppl": 0.140625, + "reward": 0.898095428943634, + "reward_std": 0.019045397639274597, + "rewards/perpo_ocr_edit_distance_reward": 0.898095428943634, "step": 2063, "temperature": 0.9 }, { - "advantages": -2.4544342522858642e-05, - "completion_length": 392.5, - "delta_ref_entropy_loss": 0.0771484375, - "delta_ref_ppl": -0.04345703125, - "entropy_loss": -0.04327392578125, - "epoch": 0.8256, - "grad_norm": 0.46690065636701766, - "k1_kl": 0.04345703125, - "k3_kl": 0.020751953125, - "kimi_kl": 0.045166015625, - "learning_rate": 8.72e-08, - "loss": 0.0009, - "ppl": 0.019439697265625, - "reward": 0.986665666103363, - "reward_std": 0.00047034883755259216, - "rewards/perpo_ocr_edit_distance_reward": 0.9866657257080078, + "advantages": -4.887580871582031e-06, + "completion_length": 483.0, + "delta_ref_entropy_loss": 0.06494140625, + "delta_ref_ppl": -0.0966796875, + "entropy_loss": -0.05517578125, + "epoch": 0.4128, + "grad_norm": 0.8784515924519937, + "k1_kl": 0.09619140625, + "k3_kl": 0.06005859375, + "kimi_kl": 0.2109375, + "learning_rate": 2.9360000000000003e-07, + "loss": 0.0024, + "ppl": 0.023681640625, + "reward": 0.996658444404602, + "reward_std": 0.0016402191249653697, + "rewards/perpo_ocr_edit_distance_reward": 0.9966585040092468, "step": 2064, "temperature": 0.9 }, { - "advantages": -0.0003006202834967553, - "completion_length": 916.5, - "delta_ref_entropy_loss": 0.088958740234375, - "delta_ref_ppl": -0.0537567138671875, - "entropy_loss": -0.087493896484375, - "epoch": 0.826, - "grad_norm": 1.1145926796031658, - "k1_kl": 0.05377197265625, - "k3_kl": 0.025787353515625, - "kimi_kl": 0.04550933837890625, - "learning_rate": 8.699999999999998e-08, - "loss": 0.0013, - "ppl": 0.04807281494140625, - "reward": 0.8485775291919708, - "reward_std": 0.004873857833445072, - "rewards/perpo_ocr_edit_distance_reward": 0.848577618598938, + "advantages": -2.7435167794465087e-05, + "completion_length": 437.0, + "delta_ref_entropy_loss": 0.10009765625, + "delta_ref_ppl": -0.1279296875, + "entropy_loss": -0.072265625, + "epoch": 0.413, + "grad_norm": 0.849684938614266, + "k1_kl": 0.126953125, + "k3_kl": 0.0810546875, + "kimi_kl": 0.263671875, + "learning_rate": 2.935e-07, + "loss": 0.0033, + "ppl": 0.0299072265625, + "reward": 0.9399086236953735, + "reward_std": 0.0011415502522140741, + "rewards/perpo_ocr_edit_distance_reward": 0.9399086833000183, "step": 2065, "temperature": 0.9 }, { - "advantages": -2.7441554266260937e-05, - "completion_length": 736.5, - "delta_ref_entropy_loss": 0.0242919921875, - "delta_ref_ppl": -0.013916015625, - "entropy_loss": -0.016693115234375, - "epoch": 0.8264, - "grad_norm": 0.22935131255329197, - "k1_kl": 0.013885498046875, - "k3_kl": 0.006072998046875, - "kimi_kl": 0.0107879638671875, - "learning_rate": 8.68e-08, - "loss": 0.0003, - "ppl": 0.00528717041015625, - "reward": 0.9898318648338318, - "reward_std": 0.00010510873835301027, - "rewards/perpo_ocr_edit_distance_reward": 0.9898318648338318, + "advantages": -3.065381974920456e-07, + "completion_length": 243.0, + "delta_ref_entropy_loss": 0.053466796875, + "delta_ref_ppl": -0.1376953125, + "entropy_loss": -0.1455078125, + "epoch": 0.4132, + "grad_norm": 3.3513280080066723, + "k1_kl": 0.13671875, + "k3_kl": 0.1005859375, + "kimi_kl": 0.328125, + "learning_rate": 2.9339999999999997e-07, + "loss": 0.004, + "ppl": 0.0634765625, + "reward": 0.8235629200935364, + "reward_std": 0.08800975233316422, + "rewards/perpo_ocr_edit_distance_reward": 0.8235629200935364, "step": 2066, "temperature": 0.9 }, { - "advantages": -6.396855894763576e-06, - "completion_length": 567.5, - "delta_ref_entropy_loss": 0.06862068176269531, - "delta_ref_ppl": -0.04339599609375, - "entropy_loss": -0.156982421875, - "epoch": 0.8268, - "grad_norm": 6.635451425782696, - "k1_kl": 0.0435791015625, - "k3_kl": 0.03955078125, - "kimi_kl": 0.0491943359375, - "learning_rate": 8.66e-08, - "loss": 0.0016, - "ppl": 0.0787353515625, - "reward": 0.8213653564453125, - "reward_std": 0.15021011140197515, - "rewards/perpo_ocr_edit_distance_reward": 0.821365475654602, + "advantages": -7.765633927192539e-05, + "completion_length": 183.0, + "delta_ref_entropy_loss": 0.09912109375, + "delta_ref_ppl": -0.2490234375, + "entropy_loss": -0.0771484375, + "epoch": 0.4134, + "grad_norm": 0.9311675687572608, + "k1_kl": 0.2490234375, + "k3_kl": 0.19140625, + "kimi_kl": 0.89453125, + "learning_rate": 2.933e-07, + "loss": 0.0078, + "ppl": 0.03076171875, + "reward": 0.9189666509628296, + "reward_std": 0.0012160517508164048, + "rewards/perpo_ocr_edit_distance_reward": 0.9189667701721191, "step": 2067, "temperature": 0.9 }, { - "advantages": -4.520487073023105e-05, - "completion_length": 377.5, - "delta_ref_entropy_loss": 0.070556640625, - "delta_ref_ppl": -0.081298828125, - "entropy_loss": -0.083740234375, - "epoch": 0.8272, - "grad_norm": 1.4497510000968783, - "k1_kl": 0.081298828125, - "k3_kl": 0.051025390625, - "kimi_kl": 0.1162109375, - "learning_rate": 8.64e-08, - "loss": 0.0021, - "ppl": 0.0472412109375, - "reward": 0.9893408119678497, - "reward_std": 0.001745167130138725, - "rewards/perpo_ocr_edit_distance_reward": 0.9893409013748169, + "advantages": -6.322350145637756e-06, + "completion_length": 312.0, + "delta_ref_entropy_loss": 0.0537109375, + "delta_ref_ppl": -0.1318359375, + "entropy_loss": -0.056396484375, + "epoch": 0.4136, + "grad_norm": 1.5814029397138916, + "k1_kl": 0.1318359375, + "k3_kl": 0.0908203125, + "kimi_kl": 0.326171875, + "learning_rate": 2.932e-07, + "loss": 0.0036, + "ppl": 0.0198974609375, + "reward": 0.6786220073699951, + "reward_std": 0.0012485916959121823, + "rewards/perpo_ocr_edit_distance_reward": 0.6786220073699951, "step": 2068, "temperature": 0.9 }, { - "advantages": -1.4115657535285209e-05, - "completion_length": 356.5, - "delta_ref_entropy_loss": 0.0028076171875, - "delta_ref_ppl": -0.0555419921875, - "entropy_loss": -0.18896484375, - "epoch": 0.8276, - "grad_norm": 2.3080561980342136, - "k1_kl": 0.0555419921875, - "k3_kl": 0.0426025390625, - "kimi_kl": 0.106689453125, - "learning_rate": 8.619999999999999e-08, - "loss": 0.0017, - "ppl": 0.08599853515625, - "reward": 0.6012142151594162, - "reward_std": 0.05455325054936111, - "rewards/perpo_ocr_edit_distance_reward": 0.601214274764061, + "advantages": -1.4594623280572705e-05, + "completion_length": 1275.0, + "delta_ref_entropy_loss": 0.04150390625, + "delta_ref_ppl": -0.04443359375, + "entropy_loss": -0.06298828125, + "epoch": 0.4138, + "grad_norm": 1.0040307779442492, + "k1_kl": 0.04443359375, + "k3_kl": 0.0277099609375, + "kimi_kl": 0.078125, + "learning_rate": 2.931e-07, + "loss": 0.0011, + "ppl": 0.02734375, + "reward": 0.9751997590065002, + "reward_std": 0.00281914952211082, + "rewards/perpo_ocr_edit_distance_reward": 0.975199818611145, "step": 2069, "temperature": 0.9 }, { - "advantages": -0.00014583128722733818, - "completion_length": 852.0, - "delta_ref_entropy_loss": 0.0357666015625, - "delta_ref_ppl": -0.01824951171875, - "entropy_loss": -0.020477294921875, - "epoch": 0.828, - "grad_norm": 0.7151615501492083, - "k1_kl": 0.018218994140625, - "k3_kl": 0.0088043212890625, - "kimi_kl": 0.0159912109375, - "learning_rate": 8.599999999999999e-08, - "loss": 0.0005, - "ppl": 0.00860595703125, - "reward": 0.9760664105415344, - "reward_std": 0.001430437754606828, - "rewards/perpo_ocr_edit_distance_reward": 0.9760664999485016, + "advantages": 8.923667337512597e-06, + "completion_length": 205.0, + "delta_ref_entropy_loss": 0.08154296875, + "delta_ref_ppl": -0.203125, + "entropy_loss": -0.0546875, + "epoch": 0.414, + "grad_norm": 0.8977871003854877, + "k1_kl": 0.2041015625, + "k3_kl": 0.154296875, + "kimi_kl": 0.62109375, + "learning_rate": 2.93e-07, + "loss": 0.0062, + "ppl": 0.0194091796875, + "reward": 0.998153567314148, + "reward_std": 0.0018062116578221321, + "rewards/perpo_ocr_edit_distance_reward": 0.998153567314148, "step": 2070, "temperature": 0.9 }, { - "advantages": -6.392172613089997e-05, - "completion_length": 1335.0, - "delta_ref_entropy_loss": 0.0132598876953125, - "delta_ref_ppl": -0.038970947265625, - "entropy_loss": -0.15008544921875, - "epoch": 0.8284, - "grad_norm": 5.969195709466675, - "k1_kl": 0.038970947265625, - "k3_kl": 0.0243072509765625, - "kimi_kl": 0.055389404296875, - "learning_rate": 8.58e-08, - "loss": 0.001, - "ppl": 0.06646728515625, - "reward": 0.8230670690536499, - "reward_std": 0.1260655305813998, - "rewards/perpo_ocr_edit_distance_reward": 0.8230671286582947, + "advantages": 7.101467872416833e-06, + "completion_length": 476.0, + "delta_ref_entropy_loss": 0.095703125, + "delta_ref_ppl": -0.09033203125, + "entropy_loss": -0.10302734375, + "epoch": 0.4142, + "grad_norm": 1.2636600147322785, + "k1_kl": 0.0908203125, + "k3_kl": 0.0556640625, + "kimi_kl": 0.1357421875, + "learning_rate": 2.929e-07, + "loss": 0.0022, + "ppl": 0.056396484375, + "reward": 0.8341968655586243, + "reward_std": 0.0022978081833571196, + "rewards/perpo_ocr_edit_distance_reward": 0.8341968655586243, "step": 2071, "temperature": 0.9 }, { - "advantages": -3.595863381633535e-05, - "completion_length": 483.5, - "delta_ref_entropy_loss": 0.0235595703125, - "delta_ref_ppl": -0.023162841796875, - "entropy_loss": -0.02142333984375, - "epoch": 0.8288, - "grad_norm": 0.3163037686282708, - "k1_kl": 0.023284912109375, - "k3_kl": 0.0146331787109375, - "kimi_kl": 0.07147216796875, - "learning_rate": 8.559999999999999e-08, - "loss": 0.0006, - "ppl": 0.00921630859375, - "reward": 0.9962218403816223, - "reward_std": 0.00048302338109351695, - "rewards/perpo_ocr_edit_distance_reward": 0.9962218999862671, + "advantages": -2.54341539402958e-05, + "completion_length": 887.0, + "delta_ref_entropy_loss": 0.056640625, + "delta_ref_ppl": -0.055908203125, + "entropy_loss": -0.038330078125, + "epoch": 0.4144, + "grad_norm": 0.5328276954373034, + "k1_kl": 0.055908203125, + "k3_kl": 0.03271484375, + "kimi_kl": 0.0859375, + "learning_rate": 2.928e-07, + "loss": 0.0013, + "ppl": 0.01434326171875, + "reward": 0.9911863803863525, + "reward_std": 0.000569766154512763, + "rewards/perpo_ocr_edit_distance_reward": 0.9911864399909973, "step": 2072, "temperature": 0.9 }, { - "advantages": -0.00035126720467815176, - "completion_length": 910.0, - "delta_ref_entropy_loss": 0.02783203125, - "delta_ref_ppl": -0.0189208984375, - "entropy_loss": -0.014556884765625, - "epoch": 0.8292, - "grad_norm": 0.27363582595582664, - "k1_kl": 0.01885986328125, - "k3_kl": 0.00970458984375, - "kimi_kl": 0.029205322265625, - "learning_rate": 8.54e-08, - "loss": 0.0007, - "ppl": 0.00540924072265625, - "reward": 0.992007702589035, - "reward_std": 0.0003098614397458732, - "rewards/perpo_ocr_edit_distance_reward": 0.9920077621936798, + "advantages": -4.087175966560608e-07, + "completion_length": 1635.0, + "delta_ref_entropy_loss": 0.006744384765625, + "delta_ref_ppl": -0.0203857421875, + "entropy_loss": -0.04541015625, + "epoch": 0.4146, + "grad_norm": 4.59021266086318, + "k1_kl": 0.0203857421875, + "k3_kl": 0.01422119140625, + "kimi_kl": 0.039306640625, + "learning_rate": 2.927e-07, + "loss": 0.0006, + "ppl": 0.0218505859375, + "reward": 0.8593540191650391, + "reward_std": 0.04318206384778023, + "rewards/perpo_ocr_edit_distance_reward": 0.8593540787696838, "step": 2073, "temperature": 0.9 }, { - "advantages": -4.257474817137563e-09, - "completion_length": 664.0, - "delta_ref_entropy_loss": 0.03515625, - "delta_ref_ppl": -0.022125244140625, - "entropy_loss": -0.043243408203125, - "epoch": 0.8296, - "grad_norm": 0.6581484273598022, - "k1_kl": 0.022125244140625, - "k3_kl": 0.0110015869140625, - "kimi_kl": 0.01824951171875, - "learning_rate": 8.52e-08, - "loss": 0.0004, - "ppl": 0.021514892578125, - "reward": 0.9949627220630646, - "reward_std": 0.0007991667371243238, - "rewards/perpo_ocr_edit_distance_reward": 0.9949627220630646, + "advantages": -3.2041756639955565e-05, + "completion_length": 381.0, + "delta_ref_entropy_loss": 0.0537109375, + "delta_ref_ppl": -0.119140625, + "entropy_loss": -0.060791015625, + "epoch": 0.4148, + "grad_norm": 0.8856907785808094, + "k1_kl": 0.119140625, + "k3_kl": 0.0869140625, + "kimi_kl": 0.349609375, + "learning_rate": 2.926e-07, + "loss": 0.0035, + "ppl": 0.0281982421875, + "reward": 0.9860377907752991, + "reward_std": 0.0014941396657377481, + "rewards/perpo_ocr_edit_distance_reward": 0.9860378503799438, "step": 2074, "temperature": 0.9 }, { - "advantages": -3.2365323150429504e-05, - "completion_length": 668.5, - "delta_ref_entropy_loss": 0.046142578125, - "delta_ref_ppl": -0.050537109375, - "entropy_loss": -0.0560302734375, - "epoch": 0.83, - "grad_norm": 1.4560157679382473, - "k1_kl": 0.05078125, - "k3_kl": 0.03118896484375, - "kimi_kl": 0.077880859375, - "learning_rate": 8.500000000000001e-08, - "loss": 0.0013, - "ppl": 0.0289154052734375, - "reward": 0.9829352796077728, - "reward_std": 0.027923691901378334, - "rewards/perpo_ocr_edit_distance_reward": 0.9829353094100952, + "advantages": -1.4747893146704882e-05, + "completion_length": 373.0, + "delta_ref_entropy_loss": 0.0264892578125, + "delta_ref_ppl": -0.09033203125, + "entropy_loss": -0.0419921875, + "epoch": 0.415, + "grad_norm": 0.5433175286693527, + "k1_kl": 0.09033203125, + "k3_kl": 0.0703125, + "kimi_kl": 0.25, + "learning_rate": 2.9249999999999995e-07, + "loss": 0.0028, + "ppl": 0.0167236328125, + "reward": 0.981943666934967, + "reward_std": 0.0010537835769355297, + "rewards/perpo_ocr_edit_distance_reward": 0.9819437265396118, "step": 2075, "temperature": 0.9 }, { - "advantages": -0.000136762862894102, - "completion_length": 947.0, - "delta_ref_entropy_loss": 0.02862548828125, - "delta_ref_ppl": -0.04058837890625, - "entropy_loss": -0.02978515625, - "epoch": 0.8304, - "grad_norm": 0.48112431566822167, - "k1_kl": 0.04052734375, - "k3_kl": 0.028961181640625, - "kimi_kl": 0.0985107421875, - "learning_rate": 8.479999999999999e-08, - "loss": 0.0013, - "ppl": 0.013946533203125, - "reward": 0.9001407027244568, - "reward_std": 0.0004717980118584819, - "rewards/perpo_ocr_edit_distance_reward": 0.900140792131424, + "advantages": -1.8732889373040962e-07, + "completion_length": 1844.0, + "delta_ref_entropy_loss": -0.0135498046875, + "delta_ref_ppl": -0.049560546875, + "entropy_loss": -0.3046875, + "epoch": 0.4152, + "grad_norm": 9.086813194951787, + "k1_kl": 0.04931640625, + "k3_kl": 0.1181640625, + "kimi_kl": 0.1171875, + "learning_rate": 2.924e-07, + "loss": 0.0047, + "ppl": 0.1767578125, + "reward": 0.6240507364273071, + "reward_std": 0.2426110804080963, + "rewards/perpo_ocr_edit_distance_reward": 0.6240507960319519, "step": 2076, "temperature": 0.9 }, { - "advantages": -0.00034452975160093047, - "completion_length": 489.5, - "delta_ref_entropy_loss": 0.0556640625, - "delta_ref_ppl": -0.04376220703125, - "entropy_loss": -0.02716064453125, - "epoch": 0.8308, - "grad_norm": 0.17408145990539178, - "k1_kl": 0.04376220703125, - "k3_kl": 0.024566650390625, - "kimi_kl": 0.0675048828125, - "learning_rate": 8.459999999999999e-08, - "loss": 0.0013, - "ppl": 0.0114288330078125, - "reward": 0.9843803346157074, - "reward_std": 0.0003619005437940359, - "rewards/perpo_ocr_edit_distance_reward": 0.9843803942203522, + "advantages": 6.267002845561365e-06, + "completion_length": 1440.0, + "delta_ref_entropy_loss": 0.052001953125, + "delta_ref_ppl": -0.06298828125, + "entropy_loss": -0.07421875, + "epoch": 0.4154, + "grad_norm": 1.951292981281766, + "k1_kl": 0.06298828125, + "k3_kl": 0.043701171875, + "kimi_kl": 0.0908203125, + "learning_rate": 2.923e-07, + "loss": 0.0017, + "ppl": 0.042724609375, + "reward": 0.9705765247344971, + "reward_std": 0.003958553075790405, + "rewards/perpo_ocr_edit_distance_reward": 0.9705765843391418, "step": 2077, "temperature": 0.9 }, { - "advantages": -8.691102812008467e-05, - "completion_length": 994.5, - "delta_ref_entropy_loss": 0.01910400390625, - "delta_ref_ppl": -0.017425537109375, - "entropy_loss": -0.02496337890625, - "epoch": 0.8312, - "grad_norm": 0.3636126773057349, - "k1_kl": 0.017425537109375, - "k3_kl": 0.011871337890625, - "kimi_kl": 0.0272216796875, - "learning_rate": 8.44e-08, - "loss": 0.0006, - "ppl": 0.012054443359375, - "reward": 0.9985424280166626, - "reward_std": 0.0004285344039089978, - "rewards/perpo_ocr_edit_distance_reward": 0.998542457818985, + "advantages": -1.886061340883316e-06, + "completion_length": 152.0, + "delta_ref_entropy_loss": 0.04638671875, + "delta_ref_ppl": -0.1748046875, + "entropy_loss": -0.11669921875, + "epoch": 0.4156, + "grad_norm": 1.5850340300829955, + "k1_kl": 0.1748046875, + "k3_kl": 0.1357421875, + "kimi_kl": 0.45703125, + "learning_rate": 2.922e-07, + "loss": 0.0054, + "ppl": 0.06591796875, + "reward": 0.9679608941078186, + "reward_std": 0.004442164208739996, + "rewards/perpo_ocr_edit_distance_reward": 0.9679609537124634, "step": 2078, "temperature": 0.9 }, { - "advantages": -5.34313085154281e-06, - "completion_length": 348.5, - "delta_ref_entropy_loss": 0.074462890625, - "delta_ref_ppl": -0.058349609375, - "entropy_loss": -0.0523681640625, - "epoch": 0.8316, - "grad_norm": 0.9258367893537429, - "k1_kl": 0.058349609375, - "k3_kl": 0.03466796875, - "kimi_kl": 0.080810546875, - "learning_rate": 8.42e-08, - "loss": 0.0014, - "ppl": 0.029296875, - "reward": 0.9881503283977509, - "reward_std": 0.001936985761858523, - "rewards/perpo_ocr_edit_distance_reward": 0.9881503582000732, + "advantages": -8.783170414972119e-06, + "completion_length": 1088.0, + "delta_ref_entropy_loss": 0.04296875, + "delta_ref_ppl": -0.052734375, + "entropy_loss": -0.0634765625, + "epoch": 0.4158, + "grad_norm": 0.8844469576310753, + "k1_kl": 0.052734375, + "k3_kl": 0.0322265625, + "kimi_kl": 0.07958984375, + "learning_rate": 2.9210000000000003e-07, + "loss": 0.0013, + "ppl": 0.02978515625, + "reward": 0.9623395800590515, + "reward_std": 0.000869069539476186, + "rewards/perpo_ocr_edit_distance_reward": 0.9623395800590515, "step": 2079, "temperature": 0.9 }, { - "advantages": -4.6815194764349144e-05, - "completion_length": 570.5, - "delta_ref_entropy_loss": 0.0233154296875, - "delta_ref_ppl": -0.02972412109375, - "entropy_loss": -0.021087646484375, - "epoch": 0.832, - "grad_norm": 0.47211354105440995, - "k1_kl": 0.0296630859375, - "k3_kl": 0.0213623046875, - "kimi_kl": 0.0743408203125, - "learning_rate": 8.4e-08, - "loss": 0.0009, - "ppl": 0.01080322265625, - "reward": 0.9905738234519958, - "reward_std": 0.002513281855499372, - "rewards/perpo_ocr_edit_distance_reward": 0.9905739426612854, + "advantages": -2.6089805032825097e-05, + "completion_length": 413.0, + "delta_ref_entropy_loss": 0.0517578125, + "delta_ref_ppl": -0.087890625, + "entropy_loss": -0.038330078125, + "epoch": 0.416, + "grad_norm": 1.020241600915004, + "k1_kl": 0.08837890625, + "k3_kl": 0.06591796875, + "kimi_kl": 0.216796875, + "learning_rate": 2.9199999999999997e-07, + "loss": 0.0027, + "ppl": 0.017333984375, + "reward": 0.9874160885810852, + "reward_std": 0.0015316576464101672, + "rewards/perpo_ocr_edit_distance_reward": 0.98741614818573, "step": 2080, "temperature": 0.9 }, { - "advantages": -1.2261527899681823e-06, - "completion_length": 245.0, - "delta_ref_entropy_loss": 0.055908203125, - "delta_ref_ppl": -0.062744140625, - "entropy_loss": -0.048828125, - "epoch": 0.8324, - "grad_norm": 1.0715177755628558, - "k1_kl": 0.062744140625, - "k3_kl": 0.0374755859375, - "kimi_kl": 0.115234375, - "learning_rate": 8.38e-08, - "loss": 0.0015, - "ppl": 0.02459716796875, - "reward": 0.9828571379184723, - "reward_std": 0.008552351035177708, - "rewards/perpo_ocr_edit_distance_reward": 0.9828571677207947, + "advantages": -0.0001024178127408959, + "completion_length": 529.0, + "delta_ref_entropy_loss": 0.038818359375, + "delta_ref_ppl": -0.09716796875, + "entropy_loss": -0.039306640625, + "epoch": 0.4162, + "grad_norm": 2.063144524214251, + "k1_kl": 0.09716796875, + "k3_kl": 0.07421875, + "kimi_kl": 0.306640625, + "learning_rate": 2.9189999999999996e-07, + "loss": 0.0031, + "ppl": 0.0189208984375, + "reward": 0.9868366122245789, + "reward_std": 0.0005650205421261489, + "rewards/perpo_ocr_edit_distance_reward": 0.9868367314338684, "step": 2081, "temperature": 0.9 }, { - "advantages": -4.1374141005690035e-05, - "completion_length": 551.5, - "delta_ref_entropy_loss": 0.02862548828125, - "delta_ref_ppl": -0.02972412109375, - "entropy_loss": -0.03411865234375, - "epoch": 0.8328, - "grad_norm": 0.851979139471632, - "k1_kl": 0.02972412109375, - "k3_kl": 0.0177001953125, - "kimi_kl": 0.04541015625, - "learning_rate": 8.36e-08, - "loss": 0.0007, - "ppl": 0.015716552734375, - "reward": 0.9969160258769989, - "reward_std": 0.005777559053967707, - "rewards/perpo_ocr_edit_distance_reward": 0.9969160556793213, + "advantages": 0.0, + "completion_length": 852.0, + "delta_ref_entropy_loss": 0.025146484375, + "delta_ref_ppl": -0.0400390625, + "entropy_loss": -0.0245361328125, + "epoch": 0.4164, + "grad_norm": 0.51487084421956, + "k1_kl": 0.0400390625, + "k3_kl": 0.025390625, + "kimi_kl": 0.07373046875, + "learning_rate": 2.918e-07, + "loss": 0.001, + "ppl": 0.01116943359375, + "reward": 0.9960576891899109, + "reward_std": 0.0016871134284883738, + "rewards/perpo_ocr_edit_distance_reward": 0.9960577487945557, "step": 2082, "temperature": 0.9 }, { - "advantages": -1.2048649296048097e-06, - "completion_length": 417.0, - "delta_ref_entropy_loss": 0.0338134765625, - "delta_ref_ppl": -0.0467529296875, - "entropy_loss": -0.0264892578125, - "epoch": 0.8332, - "grad_norm": 0.628962397693668, - "k1_kl": 0.04681396484375, - "k3_kl": 0.033935546875, - "kimi_kl": 0.19818115234375, - "learning_rate": 8.339999999999999e-08, - "loss": 0.0014, - "ppl": 0.01092529296875, - "reward": 0.9937002062797546, - "reward_std": 0.002047724963631481, - "rewards/perpo_ocr_edit_distance_reward": 0.993700236082077, + "advantages": -1.9686563973664306e-05, + "completion_length": 409.0, + "delta_ref_entropy_loss": 0.06494140625, + "delta_ref_ppl": -0.103515625, + "entropy_loss": -0.095703125, + "epoch": 0.4166, + "grad_norm": 1.7988719955890782, + "k1_kl": 0.103515625, + "k3_kl": 0.0693359375, + "kimi_kl": 0.2060546875, + "learning_rate": 2.917e-07, + "loss": 0.0028, + "ppl": 0.050048828125, + "reward": 0.9776977300643921, + "reward_std": 0.0020615416578948498, + "rewards/perpo_ocr_edit_distance_reward": 0.9776977300643921, "step": 2083, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 670.0, - "delta_ref_entropy_loss": 0.022796630859375, - "delta_ref_ppl": -0.027587890625, - "entropy_loss": -0.014556884765625, - "epoch": 0.8336, - "grad_norm": 0.015341012318100325, - "k1_kl": 0.027587890625, - "k3_kl": 0.0202178955078125, - "kimi_kl": 0.1087188720703125, - "learning_rate": 8.319999999999999e-08, - "loss": 0.0008, - "ppl": 0.006134033203125, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": -1.97206227312563e-05, + "completion_length": 657.0, + "delta_ref_entropy_loss": 0.0177001953125, + "delta_ref_ppl": -0.047607421875, + "entropy_loss": -0.032958984375, + "epoch": 0.4168, + "grad_norm": 0.5824753263320747, + "k1_kl": 0.047607421875, + "k3_kl": 0.03369140625, + "kimi_kl": 0.10986328125, + "learning_rate": 2.916e-07, + "loss": 0.0014, + "ppl": 0.01165771484375, + "reward": 0.9894697070121765, + "reward_std": 0.001194671611301601, + "rewards/perpo_ocr_edit_distance_reward": 0.9894697070121765, "step": 2084, "temperature": 0.9 }, { - "advantages": -8.01256737759104e-06, - "completion_length": 387.0, - "delta_ref_entropy_loss": 0.0252685546875, - "delta_ref_ppl": -0.03173828125, - "entropy_loss": -0.0205078125, - "epoch": 0.834, - "grad_norm": 0.7451792531361854, - "k1_kl": 0.0318603515625, - "k3_kl": 0.0224456787109375, - "kimi_kl": 0.0836334228515625, - "learning_rate": 8.3e-08, - "loss": 0.0009, - "ppl": 0.0107269287109375, - "reward": 0.9961734712123871, - "reward_std": 0.0018109465017914772, - "rewards/perpo_ocr_edit_distance_reward": 0.9961735010147095, + "advantages": -4.087175966560608e-07, + "completion_length": 1210.0, + "delta_ref_entropy_loss": 0.05712890625, + "delta_ref_ppl": -0.07470703125, + "entropy_loss": -0.359375, + "epoch": 0.417, + "grad_norm": 2.5132008836024635, + "k1_kl": 0.0751953125, + "k3_kl": 0.052490234375, + "kimi_kl": 0.099609375, + "learning_rate": 2.915e-07, + "loss": 0.0021, + "ppl": 0.1943359375, + "reward": 0.4571285545825958, + "reward_std": 0.04119659960269928, + "rewards/perpo_ocr_edit_distance_reward": 0.4571285843849182, "step": 2085, "temperature": 0.9 }, { - "advantages": -5.814859105157666e-05, - "completion_length": 256.0, - "delta_ref_entropy_loss": 0.060791015625, - "delta_ref_ppl": -0.16357421875, - "entropy_loss": -0.0540771484375, - "epoch": 0.8344, - "grad_norm": 0.4361009180092178, - "k1_kl": 0.16357421875, - "k3_kl": 0.132080078125, - "kimi_kl": 0.55810546875, - "learning_rate": 8.28e-08, - "loss": 0.0053, - "ppl": 0.0203857421875, - "reward": 0.9941354095935822, - "reward_std": 0.00016963857342489064, - "rewards/perpo_ocr_edit_distance_reward": 0.9941354393959045, + "advantages": -6.811959565311554e-07, + "completion_length": 174.0, + "delta_ref_entropy_loss": 0.03515625, + "delta_ref_ppl": -0.251953125, + "entropy_loss": -0.123046875, + "epoch": 0.4172, + "grad_norm": 1.8722692058544173, + "k1_kl": 0.25390625, + "k3_kl": 0.2255859375, + "kimi_kl": 1.125, + "learning_rate": 2.914e-07, + "loss": 0.0091, + "ppl": 0.046875, + "reward": 0.7940403819084167, + "reward_std": 0.024051140993833542, + "rewards/perpo_ocr_edit_distance_reward": 0.794040322303772, "step": 2086, "temperature": 0.9 }, { - "advantages": -2.1093659597681835e-05, - "completion_length": 1531.0, - "delta_ref_entropy_loss": 0.0225830078125, - "delta_ref_ppl": -0.01275634765625, - "entropy_loss": -0.0408935546875, - "epoch": 0.8348, - "grad_norm": 0.8464357952494616, - "k1_kl": 0.0128173828125, - "k3_kl": 0.00983428955078125, - "kimi_kl": 0.0146942138671875, - "learning_rate": 8.26e-08, - "loss": 0.0004, - "ppl": 0.025970458984375, - "reward": 0.9942847788333893, - "reward_std": 0.0017718230374157429, - "rewards/perpo_ocr_edit_distance_reward": 0.9942848682403564, + "advantages": -0.00019712108769454062, + "completion_length": 529.0, + "delta_ref_entropy_loss": 0.054443359375, + "delta_ref_ppl": -0.05712890625, + "entropy_loss": -0.02587890625, + "epoch": 0.4174, + "grad_norm": 0.3712785535097127, + "k1_kl": 0.05712890625, + "k3_kl": 0.037353515625, + "kimi_kl": 0.11962890625, + "learning_rate": 2.9129999999999997e-07, + "loss": 0.0017, + "ppl": 0.009765625, + "reward": 0.9974126815795898, + "reward_std": 0.00037513489951379597, + "rewards/perpo_ocr_edit_distance_reward": 0.9974127411842346, "step": 2087, "temperature": 0.9 }, { - "advantages": -3.177779164786898e-05, - "completion_length": 804.5, - "delta_ref_entropy_loss": 0.0672607421875, - "delta_ref_ppl": -0.07025146484375, - "entropy_loss": -0.150146484375, - "epoch": 0.8352, - "grad_norm": 0.9634816897347295, - "k1_kl": 0.07025146484375, - "k3_kl": 0.045562744140625, - "kimi_kl": 0.150482177734375, - "learning_rate": 8.24e-08, - "loss": 0.0019, - "ppl": 0.08453369140625, - "reward": 0.8931174874305725, - "reward_std": 0.07772053085500374, - "rewards/perpo_ocr_edit_distance_reward": 0.8931175470352173, + "advantages": -2.1287374693201855e-05, + "completion_length": 771.0, + "delta_ref_entropy_loss": 0.06201171875, + "delta_ref_ppl": -0.06787109375, + "entropy_loss": -0.059814453125, + "epoch": 0.4176, + "grad_norm": 1.078308495523395, + "k1_kl": 0.06787109375, + "k3_kl": 0.043212890625, + "kimi_kl": 0.134765625, + "learning_rate": 2.912e-07, + "loss": 0.0017, + "ppl": 0.02880859375, + "reward": 0.9752163887023926, + "reward_std": 0.004302170127630234, + "rewards/perpo_ocr_edit_distance_reward": 0.9752165079116821, "step": 2088, "temperature": 0.9 }, { - "advantages": -6.258487883314956e-05, - "completion_length": 1270.5, - "delta_ref_entropy_loss": 0.03302001953125, - "delta_ref_ppl": -0.021697998046875, - "entropy_loss": -0.03900146484375, - "epoch": 0.8356, - "grad_norm": 0.543953975234979, - "k1_kl": 0.021697998046875, - "k3_kl": 0.013153076171875, - "kimi_kl": 0.02593994140625, - "learning_rate": 8.22e-08, - "loss": 0.0006, - "ppl": 0.019195556640625, - "reward": 0.9944805204868317, - "reward_std": 0.0012254425964783877, - "rewards/perpo_ocr_edit_distance_reward": 0.9944806098937988, + "advantages": -8.617129424237646e-06, + "completion_length": 541.0, + "delta_ref_entropy_loss": 0.059814453125, + "delta_ref_ppl": -0.061279296875, + "entropy_loss": -0.109375, + "epoch": 0.4178, + "grad_norm": 1.6922260089493353, + "k1_kl": 0.0615234375, + "k3_kl": 0.038330078125, + "kimi_kl": 0.07421875, + "learning_rate": 2.911e-07, + "loss": 0.0015, + "ppl": 0.054443359375, + "reward": 0.9804428815841675, + "reward_std": 0.006801804061979055, + "rewards/perpo_ocr_edit_distance_reward": 0.980443000793457, "step": 2089, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 352.5, - "delta_ref_entropy_loss": 0.022735595703125, - "delta_ref_ppl": -0.01519775390625, - "entropy_loss": -0.022216796875, - "epoch": 0.836, - "grad_norm": 0.022068935791804665, - "k1_kl": 0.015228271484375, - "k3_kl": 0.00799560546875, - "kimi_kl": 0.014862060546875, - "learning_rate": 8.2e-08, - "loss": 0.0003, - "ppl": 0.010833740234375, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": -8.538791735190898e-05, + "completion_length": 527.0, + "delta_ref_entropy_loss": 0.06640625, + "delta_ref_ppl": -0.08251953125, + "entropy_loss": -0.0419921875, + "epoch": 0.418, + "grad_norm": 0.42698562497427656, + "k1_kl": 0.08251953125, + "k3_kl": 0.05419921875, + "kimi_kl": 0.1845703125, + "learning_rate": 2.9099999999999995e-07, + "loss": 0.0023, + "ppl": 0.01470947265625, + "reward": 0.9970284104347229, + "reward_std": 0.0007974233594723046, + "rewards/perpo_ocr_edit_distance_reward": 0.9970285296440125, "step": 2090, "temperature": 0.9 }, { - "advantages": -0.00014994826779002324, - "completion_length": 811.5, - "delta_ref_entropy_loss": 0.01947021484375, - "delta_ref_ppl": -0.01422119140625, - "entropy_loss": -0.01483154296875, - "epoch": 0.8364, - "grad_norm": 0.30235914421526516, - "k1_kl": 0.014251708984375, - "k3_kl": 0.008880615234375, - "kimi_kl": 0.02801513671875, - "learning_rate": 8.179999999999999e-08, - "loss": 0.0005, - "ppl": 0.00634765625, - "reward": 0.9998486638069153, - "reward_std": 0.00021658077457686886, - "rewards/perpo_ocr_edit_distance_reward": 0.9998487234115601, + "advantages": -4.4431006244849414e-05, + "completion_length": 459.0, + "delta_ref_entropy_loss": 0.08251953125, + "delta_ref_ppl": -0.123046875, + "entropy_loss": -0.05029296875, + "epoch": 0.4182, + "grad_norm": 5.831430625401992, + "k1_kl": 0.123046875, + "k3_kl": 0.07958984375, + "kimi_kl": 0.28515625, + "learning_rate": 2.909e-07, + "loss": 0.0032, + "ppl": 0.02685546875, + "reward": 0.9951910376548767, + "reward_std": 0.0016248654574155807, + "rewards/perpo_ocr_edit_distance_reward": 0.9951911568641663, "step": 2091, "temperature": 0.9 }, { - "advantages": 1.0899135531872162e-06, - "completion_length": 250.0, - "delta_ref_entropy_loss": 0.0693359375, - "delta_ref_ppl": -0.105224609375, - "entropy_loss": -0.06536865234375, - "epoch": 0.8368, - "grad_norm": 3.028883244589272, - "k1_kl": 0.105224609375, - "k3_kl": 0.07305908203125, - "kimi_kl": 0.211181640625, - "learning_rate": 8.16e-08, - "loss": 0.0029, - "ppl": 0.0423583984375, - "reward": 0.997579962015152, - "reward_std": 0.0019064624793827534, - "rewards/perpo_ocr_edit_distance_reward": 0.997579962015152, + "advantages": -0.0001259361015399918, + "completion_length": 313.0, + "delta_ref_entropy_loss": 0.061279296875, + "delta_ref_ppl": -0.11669921875, + "entropy_loss": -0.040771484375, + "epoch": 0.4184, + "grad_norm": 0.4999636446670949, + "k1_kl": 0.11669921875, + "k3_kl": 0.0810546875, + "kimi_kl": 0.24609375, + "learning_rate": 2.908e-07, + "loss": 0.0034, + "ppl": 0.0169677734375, + "reward": 0.9976012110710144, + "reward_std": 0.0003733491466846317, + "rewards/perpo_ocr_edit_distance_reward": 0.997601330280304, "step": 2092, "temperature": 0.9 }, { - "advantages": -2.2522041263073334e-06, - "completion_length": 318.0, - "delta_ref_entropy_loss": 0.0716552734375, - "delta_ref_ppl": -0.06866455078125, - "entropy_loss": -0.05255126953125, - "epoch": 0.8372, - "grad_norm": 2.3181212062901992, - "k1_kl": 0.06890869140625, - "k3_kl": 0.038421630859375, - "kimi_kl": 0.084259033203125, - "learning_rate": 8.14e-08, - "loss": 0.0015, - "ppl": 0.02728271484375, - "reward": 0.9576706886291504, - "reward_std": 0.024274411145597696, - "rewards/perpo_ocr_edit_distance_reward": 0.9576707780361176, + "advantages": -8.685248644724197e-07, + "completion_length": 1024.0, + "delta_ref_entropy_loss": 0.09814453125, + "delta_ref_ppl": -0.09521484375, + "entropy_loss": -0.1806640625, + "epoch": 0.4186, + "grad_norm": 1.7539123101684233, + "k1_kl": 0.09521484375, + "k3_kl": 0.06396484375, + "kimi_kl": 0.14453125, + "learning_rate": 2.907e-07, + "loss": 0.0026, + "ppl": 0.10595703125, + "reward": 0.8811758756637573, + "reward_std": 0.128823921084404, + "rewards/perpo_ocr_edit_distance_reward": 0.8811760544776917, "step": 2093, "temperature": 0.9 }, { - "advantages": -0.00015535525744780898, - "completion_length": 833.5, - "delta_ref_entropy_loss": 0.01995849609375, - "delta_ref_ppl": -0.024658203125, - "entropy_loss": -0.015869140625, - "epoch": 0.8376, - "grad_norm": 1.0670996191797582, - "k1_kl": 0.024658203125, - "k3_kl": 0.016571044921875, - "kimi_kl": 0.0604248046875, - "learning_rate": 8.119999999999999e-08, - "loss": 0.0008, - "ppl": 0.00787353515625, - "reward": 0.9958532452583313, - "reward_std": 0.00040866012568585575, - "rewards/perpo_ocr_edit_distance_reward": 0.9958533346652985, + "advantages": -8.818081550998613e-05, + "completion_length": 407.0, + "delta_ref_entropy_loss": 0.060546875, + "delta_ref_ppl": -0.07958984375, + "entropy_loss": -0.037109375, + "epoch": 0.4188, + "grad_norm": 0.5377739703995217, + "k1_kl": 0.07958984375, + "k3_kl": 0.051025390625, + "kimi_kl": 0.173828125, + "learning_rate": 2.906e-07, + "loss": 0.0021, + "ppl": 0.01416015625, + "reward": 0.9878934621810913, + "reward_std": 0.0003828690096270293, + "rewards/perpo_ocr_edit_distance_reward": 0.9878934621810913, "step": 2094, "temperature": 0.9 }, { - "advantages": -1.7268317606067285e-05, - "completion_length": 550.5, - "delta_ref_entropy_loss": 0.0587158203125, - "delta_ref_ppl": -0.06060791015625, - "entropy_loss": -0.0419921875, - "epoch": 0.838, - "grad_norm": 41.53520492579194, - "k1_kl": 0.06036376953125, - "k3_kl": 0.1650390625, - "kimi_kl": 0.122802734375, - "learning_rate": 8.1e-08, - "loss": 0.0066, - "ppl": 0.0205078125, - "reward": 0.9592563807964325, - "reward_std": 0.0025825555785559118, - "rewards/perpo_ocr_edit_distance_reward": 0.9592563807964325, + "advantages": -9.502683496975806e-06, + "completion_length": 701.0, + "delta_ref_entropy_loss": 0.06982421875, + "delta_ref_ppl": -0.09765625, + "entropy_loss": -0.10498046875, + "epoch": 0.419, + "grad_norm": 1.842199593867517, + "k1_kl": 0.09765625, + "k3_kl": 0.059326171875, + "kimi_kl": 0.15234375, + "learning_rate": 2.9049999999999996e-07, + "loss": 0.0024, + "ppl": 0.054443359375, + "reward": 0.917097806930542, + "reward_std": 0.004371924325823784, + "rewards/perpo_ocr_edit_distance_reward": 0.9170978665351868, "step": 2095, "temperature": 0.9 }, { - "advantages": -2.486365303866478e-05, - "completion_length": 608.5, - "delta_ref_entropy_loss": 0.07391357421875, - "delta_ref_ppl": -0.061279296875, - "entropy_loss": -0.0872802734375, - "epoch": 0.8384, - "grad_norm": 1.4866328975934695, - "k1_kl": 0.06146240234375, - "k3_kl": 0.03570556640625, - "kimi_kl": 0.0877685546875, - "learning_rate": 8.08e-08, - "loss": 0.0015, - "ppl": 0.04815673828125, - "reward": 0.7259735763072968, - "reward_std": 0.004107680812012404, - "rewards/perpo_ocr_edit_distance_reward": 0.7259736061096191, + "advantages": -1.2091228427379974e-06, + "completion_length": 962.0, + "delta_ref_entropy_loss": 0.00830078125, + "delta_ref_ppl": -0.0247802734375, + "entropy_loss": -0.033447265625, + "epoch": 0.4192, + "grad_norm": 0.4898489111958079, + "k1_kl": 0.0247802734375, + "k3_kl": 0.01708984375, + "kimi_kl": 0.048583984375, + "learning_rate": 2.9039999999999995e-07, + "loss": 0.0007, + "ppl": 0.0147705078125, + "reward": 0.9443686008453369, + "reward_std": 0.021192336454987526, + "rewards/perpo_ocr_edit_distance_reward": 0.9443686604499817, "step": 2096, "temperature": 0.9 }, { - "advantages": -8.974756951829477e-06, - "completion_length": 480.0, - "delta_ref_entropy_loss": 0.0650634765625, - "delta_ref_ppl": -0.0560302734375, - "entropy_loss": -0.060302734375, - "epoch": 0.8388, - "grad_norm": 0.9318774459424352, - "k1_kl": 0.05615234375, - "k3_kl": 0.036865234375, - "kimi_kl": 0.0968017578125, - "learning_rate": 8.060000000000001e-08, - "loss": 0.0015, - "ppl": 0.0341796875, - "reward": 0.9468319118022919, - "reward_std": 0.014081278117373586, - "rewards/perpo_ocr_edit_distance_reward": 0.9468319714069366, + "advantages": -1.7711095097183716e-06, + "completion_length": 329.0, + "delta_ref_entropy_loss": 0.0703125, + "delta_ref_ppl": -0.1416015625, + "entropy_loss": -0.126953125, + "epoch": 0.4194, + "grad_norm": 3.2444466831730714, + "k1_kl": 0.142578125, + "k3_kl": 0.0966796875, + "kimi_kl": 0.33203125, + "learning_rate": 2.903e-07, + "loss": 0.0039, + "ppl": 0.051025390625, + "reward": 0.9629145264625549, + "reward_std": 0.03838060423731804, + "rewards/perpo_ocr_edit_distance_reward": 0.9629145860671997, "step": 2097, "temperature": 0.9 }, { - "advantages": -5.5266282288357615e-05, - "completion_length": 823.5, - "delta_ref_entropy_loss": 0.0283203125, - "delta_ref_ppl": -0.020751953125, - "entropy_loss": -0.040771484375, - "epoch": 0.8392, - "grad_norm": 0.5466246644155263, - "k1_kl": 0.020751953125, - "k3_kl": 0.014556884765625, - "kimi_kl": 0.03009033203125, - "learning_rate": 8.039999999999999e-08, - "loss": 0.0006, - "ppl": 0.02117919921875, - "reward": 0.9762586653232574, - "reward_std": 0.011761978756112512, - "rewards/perpo_ocr_edit_distance_reward": 0.9762587249279022, + "advantages": -2.3433141905115917e-05, + "completion_length": 619.0, + "delta_ref_entropy_loss": 0.04638671875, + "delta_ref_ppl": -0.0634765625, + "entropy_loss": -0.039306640625, + "epoch": 0.4196, + "grad_norm": 1.192578649783152, + "k1_kl": 0.06298828125, + "k3_kl": 0.03857421875, + "kimi_kl": 0.1328125, + "learning_rate": 2.902e-07, + "loss": 0.0016, + "ppl": 0.0145263671875, + "reward": 0.9961723685264587, + "reward_std": 0.0020811432041227818, + "rewards/perpo_ocr_edit_distance_reward": 0.9961724281311035, "step": 2098, "temperature": 0.9 }, { - "advantages": -9.55377333866636e-06, - "completion_length": 554.0, - "delta_ref_entropy_loss": 0.072998046875, - "delta_ref_ppl": -0.070068359375, - "entropy_loss": -0.0865478515625, - "epoch": 0.8396, - "grad_norm": 1.1064370845041334, - "k1_kl": 0.070068359375, - "k3_kl": 0.0396728515625, - "kimi_kl": 0.084716796875, - "learning_rate": 8.019999999999999e-08, - "loss": 0.0016, - "ppl": 0.04827880859375, - "reward": 0.9663817584514618, - "reward_std": 0.004034694284200668, - "rewards/perpo_ocr_edit_distance_reward": 0.9663818180561066, + "advantages": -2.048696842393838e-05, + "completion_length": 909.0, + "delta_ref_entropy_loss": 0.0245361328125, + "delta_ref_ppl": -0.037109375, + "entropy_loss": -0.04541015625, + "epoch": 0.4198, + "grad_norm": 0.844655389326705, + "k1_kl": 0.037109375, + "k3_kl": 0.025146484375, + "kimi_kl": 0.06787109375, + "learning_rate": 2.9010000000000004e-07, + "loss": 0.001, + "ppl": 0.0205078125, + "reward": 0.9740661382675171, + "reward_std": 0.004057652782648802, + "rewards/perpo_ocr_edit_distance_reward": 0.9740662574768066, "step": 2099, "temperature": 0.9 }, { - "advantages": -9.707894059829414e-05, - "completion_length": 430.5, - "delta_ref_entropy_loss": 0.0400390625, - "delta_ref_ppl": -0.041259765625, - "entropy_loss": -0.0423583984375, - "epoch": 0.84, - "grad_norm": 0.20854394768020165, - "k1_kl": 0.040985107421875, - "k3_kl": 0.0227203369140625, - "kimi_kl": 0.04888916015625, - "learning_rate": 8e-08, - "loss": 0.001, - "ppl": 0.020416259765625, - "reward": 0.9998839199542999, - "reward_std": 0.00021312183525878936, - "rewards/perpo_ocr_edit_distance_reward": 0.9998839497566223, + "advantages": -8.445126877631992e-05, + "completion_length": 1478.0, + "delta_ref_entropy_loss": 0.035888671875, + "delta_ref_ppl": -0.03955078125, + "entropy_loss": -0.058837890625, + "epoch": 0.42, + "grad_norm": 0.8638216956081517, + "k1_kl": 0.03955078125, + "k3_kl": 0.0244140625, + "kimi_kl": 0.045654296875, + "learning_rate": 2.9e-07, + "loss": 0.0011, + "ppl": 0.027099609375, + "reward": 0.9887765049934387, + "reward_std": 0.0015135271241888404, + "rewards/perpo_ocr_edit_distance_reward": 0.988776683807373, "step": 2100, "temperature": 0.9 }, { - "advantages": -8.915152420740924e-06, - "completion_length": 929.5, - "delta_ref_entropy_loss": 0.09912109375, - "delta_ref_ppl": -0.058349609375, - "entropy_loss": -0.147216796875, - "epoch": 0.8404, - "grad_norm": 2.7446830875276222, - "k1_kl": 0.058349609375, - "k3_kl": 0.032470703125, - "kimi_kl": 0.063720703125, - "learning_rate": 7.979999999999999e-08, - "loss": 0.0013, - "ppl": 0.083984375, - "reward": 0.7860680222511292, - "reward_std": 0.006509797880426049, - "rewards/perpo_ocr_edit_distance_reward": 0.7860681116580963, + "advantages": -3.467287388048135e-05, + "completion_length": 571.0, + "delta_ref_entropy_loss": 0.044921875, + "delta_ref_ppl": -0.072265625, + "entropy_loss": -0.051025390625, + "epoch": 0.4202, + "grad_norm": 0.748514151322148, + "k1_kl": 0.072265625, + "k3_kl": 0.054931640625, + "kimi_kl": 0.158203125, + "learning_rate": 2.8989999999999997e-07, + "loss": 0.0022, + "ppl": 0.0247802734375, + "reward": 0.9914016723632812, + "reward_std": 0.0026030598673969507, + "rewards/perpo_ocr_edit_distance_reward": 0.9914017915725708, "step": 2101, "temperature": 0.9 }, { - "advantages": -3.6935723073838744e-05, - "completion_length": 1017.0, - "delta_ref_entropy_loss": 0.03460693359375, - "delta_ref_ppl": -0.033355712890625, - "entropy_loss": -0.037353515625, - "epoch": 0.8408, - "grad_norm": 0.8501593748680757, - "k1_kl": 0.033447265625, - "k3_kl": 0.023162841796875, - "kimi_kl": 0.08758544921875, - "learning_rate": 7.96e-08, - "loss": 0.001, - "ppl": 0.02008056640625, - "reward": 0.9965822696685791, - "reward_std": 0.0014548554900102317, - "rewards/perpo_ocr_edit_distance_reward": 0.9965823292732239, + "advantages": -2.9947077564429492e-05, + "completion_length": 678.0, + "delta_ref_entropy_loss": 0.052734375, + "delta_ref_ppl": -0.047119140625, + "entropy_loss": -0.041748046875, + "epoch": 0.4204, + "grad_norm": 0.6316772708383944, + "k1_kl": 0.047119140625, + "k3_kl": 0.0235595703125, + "kimi_kl": 0.052490234375, + "learning_rate": 2.898e-07, + "loss": 0.001, + "ppl": 0.016845703125, + "reward": 0.9957693815231323, + "reward_std": 0.0013211743207648396, + "rewards/perpo_ocr_edit_distance_reward": 0.9957695007324219, "step": 2102, "temperature": 0.9 }, { - "advantages": -2.230916749113021e-06, - "completion_length": 774.5, - "delta_ref_entropy_loss": 0.035888671875, - "delta_ref_ppl": -0.03564453125, - "entropy_loss": -0.0362548828125, - "epoch": 0.8412, - "grad_norm": 58.39610299438987, - "k1_kl": 0.03564453125, - "k3_kl": 0.26800537109375, - "kimi_kl": 0.054443359375, - "learning_rate": 7.94e-08, - "loss": 0.0108, - "ppl": 0.02044677734375, - "reward": 0.6372187733650208, - "reward_std": 0.02845894207712263, - "rewards/perpo_ocr_edit_distance_reward": 0.6372188180685043, + "advantages": -0.0005960464477539062, + "completion_length": 129.0, + "delta_ref_entropy_loss": 0.0224609375, + "delta_ref_ppl": -0.2119140625, + "entropy_loss": -0.037353515625, + "epoch": 0.4206, + "grad_norm": 0.03134574915090821, + "k1_kl": 0.2109375, + "k3_kl": 0.166015625, + "kimi_kl": 0.68359375, + "learning_rate": 2.897e-07, + "loss": 0.0072, + "ppl": 0.00885009765625, + "reward": 0.9708737730979919, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9708738327026367, "step": 2103, "temperature": 0.9 }, { - "advantages": -6.721701356582344e-05, - "completion_length": 817.0, - "delta_ref_entropy_loss": 0.0318603515625, - "delta_ref_ppl": -0.021240234375, - "entropy_loss": -0.0274658203125, - "epoch": 0.8416, - "grad_norm": 0.37423523715191576, - "k1_kl": 0.021240234375, - "k3_kl": 0.0108642578125, - "kimi_kl": 0.0291748046875, - "learning_rate": 7.920000000000001e-08, - "loss": 0.0005, - "ppl": 0.013031005859375, - "reward": 0.9880901277065277, - "reward_std": 0.0008951866184361279, - "rewards/perpo_ocr_edit_distance_reward": 0.9880902469158173, + "advantages": -9.510347445029765e-05, + "completion_length": 480.0, + "delta_ref_entropy_loss": 0.04736328125, + "delta_ref_ppl": -0.0947265625, + "entropy_loss": -0.0380859375, + "epoch": 0.4208, + "grad_norm": 0.7608600081862127, + "k1_kl": 0.0947265625, + "k3_kl": 0.06494140625, + "kimi_kl": 0.255859375, + "learning_rate": 2.896e-07, + "loss": 0.0027, + "ppl": 0.014892578125, + "reward": 0.9939616918563843, + "reward_std": 0.0009741600370034575, + "rewards/perpo_ocr_edit_distance_reward": 0.9939618110656738, "step": 2104, "temperature": 0.9 }, { - "advantages": -3.816400590039848e-05, - "completion_length": 582.0, - "delta_ref_entropy_loss": 0.01068115234375, - "delta_ref_ppl": -0.032135009765625, - "entropy_loss": -0.043701171875, - "epoch": 0.842, - "grad_norm": 0.4513863487105781, - "k1_kl": 0.03216552734375, - "k3_kl": 0.0257720947265625, - "kimi_kl": 0.1055908203125, - "learning_rate": 7.899999999999999e-08, - "loss": 0.0011, - "ppl": 0.019134521484375, - "reward": 0.9403283596038818, - "reward_std": 0.004605847643688321, - "rewards/perpo_ocr_edit_distance_reward": 0.9403284192085266, + "advantages": 0.0, + "completion_length": 4.0, + "delta_ref_entropy_loss": 0.10693359375, + "delta_ref_ppl": -0.77734375, + "entropy_loss": -0.455078125, + "epoch": 0.421, + "grad_norm": 87.06697301304357, + "k1_kl": 0.77734375, + "k3_kl": 1.0390625, + "kimi_kl": 3.453125, + "learning_rate": 2.895e-07, + "loss": 0.0416, + "ppl": 0.298828125, + "reward": 0.5851404666900635, + "reward_std": 0.25723350048065186, + "rewards/perpo_ocr_edit_distance_reward": 0.5851405262947083, "step": 2105, "temperature": 0.9 }, { - "advantages": -0.00032014932003221475, - "completion_length": 746.5, - "delta_ref_entropy_loss": 0.03704833984375, - "delta_ref_ppl": -0.0296173095703125, - "entropy_loss": -0.02313232421875, - "epoch": 0.8424, - "grad_norm": 0.10584809978520364, - "k1_kl": 0.0296173095703125, - "k3_kl": 0.0177001953125, - "kimi_kl": 0.0650177001953125, - "learning_rate": 7.879999999999999e-08, - "loss": 0.001, - "ppl": 0.00998687744140625, - "reward": 0.9894726574420929, - "reward_std": 4.622178676072508e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9894727170467377, + "advantages": -1.0354178812121972e-05, + "completion_length": 379.0, + "delta_ref_entropy_loss": 0.0703125, + "delta_ref_ppl": -0.11083984375, + "entropy_loss": -0.044189453125, + "epoch": 0.4212, + "grad_norm": 0.45714846483963045, + "k1_kl": 0.11083984375, + "k3_kl": 0.07666015625, + "kimi_kl": 0.296875, + "learning_rate": 2.894e-07, + "loss": 0.0031, + "ppl": 0.01904296875, + "reward": 0.9905723929405212, + "reward_std": 0.0007223636494018137, + "rewards/perpo_ocr_edit_distance_reward": 0.990572452545166, "step": 2106, "temperature": 0.9 }, { - "advantages": -0.0003343820571899414, - "completion_length": 819.0, - "delta_ref_entropy_loss": 0.01715087890625, - "delta_ref_ppl": -0.01666259765625, - "entropy_loss": -0.0094451904296875, - "epoch": 0.8428, - "grad_norm": 0.08133715259281434, - "k1_kl": 0.0167236328125, - "k3_kl": 0.01153564453125, - "kimi_kl": 0.05047607421875, - "learning_rate": 7.86e-08, - "loss": 0.0008, - "ppl": 0.00392913818359375, - "reward": 0.9896034002304077, - "reward_std": 0.00018419693515170366, - "rewards/perpo_ocr_edit_distance_reward": 0.9896034896373749, + "advantages": -7.893358088040259e-06, + "completion_length": 376.0, + "delta_ref_entropy_loss": 0.1640625, + "delta_ref_ppl": -0.1455078125, + "entropy_loss": -0.349609375, + "epoch": 0.4214, + "grad_norm": 2.965790693728969, + "k1_kl": 0.1455078125, + "k3_kl": 0.08544921875, + "kimi_kl": 0.22265625, + "learning_rate": 2.893e-07, + "loss": 0.0034, + "ppl": 0.1806640625, + "reward": 0.7616586089134216, + "reward_std": 0.00638223672285676, + "rewards/perpo_ocr_edit_distance_reward": 0.7616587281227112, "step": 2107, "temperature": 0.9 }, { - "advantages": -2.1606684640573803e-05, - "completion_length": 576.5, - "delta_ref_entropy_loss": 0.0377197265625, - "delta_ref_ppl": -0.0330810546875, - "entropy_loss": -0.040283203125, - "epoch": 0.8432, - "grad_norm": 0.6860696351428458, - "k1_kl": 0.033203125, - "k3_kl": 0.0194091796875, - "kimi_kl": 0.040283203125, - "learning_rate": 7.839999999999999e-08, - "loss": 0.0008, - "ppl": 0.02178955078125, - "reward": 0.956779807806015, - "reward_std": 0.0005121896683704108, - "rewards/perpo_ocr_edit_distance_reward": 0.9567798674106598, + "advantages": 5.59730215172749e-05, + "completion_length": 324.0, + "delta_ref_entropy_loss": 0.10498046875, + "delta_ref_ppl": -0.1123046875, + "entropy_loss": -0.0595703125, + "epoch": 0.4216, + "grad_norm": 0.6617827002501819, + "k1_kl": 0.11279296875, + "k3_kl": 0.06591796875, + "kimi_kl": 0.2021484375, + "learning_rate": 2.892e-07, + "loss": 0.0026, + "ppl": 0.0196533203125, + "reward": 0.9388499855995178, + "reward_std": 0.0003564191865734756, + "rewards/perpo_ocr_edit_distance_reward": 0.9388500452041626, "step": 2108, "temperature": 0.9 }, { - "advantages": -9.405187529409886e-05, - "completion_length": 835.0, - "delta_ref_entropy_loss": 0.0257568359375, - "delta_ref_ppl": -0.0194091796875, - "entropy_loss": -0.015045166015625, - "epoch": 0.8436, - "grad_norm": 0.4518218573140721, - "k1_kl": 0.01947021484375, - "k3_kl": 0.010894775390625, - "kimi_kl": 0.02276611328125, - "learning_rate": 7.82e-08, - "loss": 0.0005, - "ppl": 0.007781982421875, - "reward": 0.9986300468444824, - "reward_std": 0.00127128235908458, - "rewards/perpo_ocr_edit_distance_reward": 0.9986300766468048, + "advantages": -6.130763949840912e-07, + "completion_length": 639.0, + "delta_ref_entropy_loss": 0.08935546875, + "delta_ref_ppl": -0.0986328125, + "entropy_loss": -0.12060546875, + "epoch": 0.4218, + "grad_norm": 1.9055469921800803, + "k1_kl": 0.0986328125, + "k3_kl": 0.058837890625, + "kimi_kl": 0.15625, + "learning_rate": 2.891e-07, + "loss": 0.0024, + "ppl": 0.06201171875, + "reward": 0.8522504568099976, + "reward_std": 0.04217366874217987, + "rewards/perpo_ocr_edit_distance_reward": 0.8522505164146423, "step": 2109, "temperature": 0.9 }, { - "advantages": -5.4665978268531035e-06, - "completion_length": 263.5, - "delta_ref_entropy_loss": 0.0699462890625, - "delta_ref_ppl": -0.0631103515625, - "entropy_loss": -0.1405029296875, - "epoch": 0.844, - "grad_norm": 1.5983370586976022, - "k1_kl": 0.0631103515625, - "k3_kl": 0.04229736328125, - "kimi_kl": 0.0726318359375, - "learning_rate": 7.8e-08, - "loss": 0.0017, - "ppl": 0.08282470703125, - "reward": 0.9500779807567596, - "reward_std": 0.05563772842288017, - "rewards/perpo_ocr_edit_distance_reward": 0.9500780999660492, + "advantages": -4.938671054333099e-07, + "completion_length": 1456.0, + "delta_ref_entropy_loss": 0.028564453125, + "delta_ref_ppl": -0.031494140625, + "entropy_loss": -0.07470703125, + "epoch": 0.422, + "grad_norm": 0.9919729186915414, + "k1_kl": 0.031494140625, + "k3_kl": 0.0201416015625, + "kimi_kl": 0.03271484375, + "learning_rate": 2.8899999999999995e-07, + "loss": 0.0008, + "ppl": 0.0380859375, + "reward": 0.8962261080741882, + "reward_std": 0.06900068372488022, + "rewards/perpo_ocr_edit_distance_reward": 0.896226167678833, "step": 2110, "temperature": 0.9 }, { - "advantages": -6.61568992654793e-05, - "completion_length": 206.0, - "delta_ref_entropy_loss": -0.0068359375, - "delta_ref_ppl": -0.0496826171875, - "entropy_loss": -0.08447265625, - "epoch": 0.8444, - "grad_norm": 0.33912240228104207, - "k1_kl": 0.0498046875, - "k3_kl": 0.02777099609375, - "kimi_kl": 0.0909423828125, - "learning_rate": 7.78e-08, - "loss": 0.0012, - "ppl": 0.023040771484375, - "reward": 0.9950253665447235, - "reward_std": 0.0002717633615247905, - "rewards/perpo_ocr_edit_distance_reward": 0.9950253963470459, + "advantages": -1.532690987460228e-07, + "completion_length": 1255.0, + "delta_ref_entropy_loss": 0.0673828125, + "delta_ref_ppl": -0.08056640625, + "entropy_loss": -0.1357421875, + "epoch": 0.4222, + "grad_norm": 1.8744783906053202, + "k1_kl": 0.08056640625, + "k3_kl": 0.054443359375, + "kimi_kl": 0.15625, + "learning_rate": 2.889e-07, + "loss": 0.0022, + "ppl": 0.0693359375, + "reward": 0.7931094765663147, + "reward_std": 0.15132015943527222, + "rewards/perpo_ocr_edit_distance_reward": 0.7931095361709595, "step": 2111, "temperature": 0.9 }, { - "advantages": -8.161578989529517e-06, - "completion_length": 735.0, - "delta_ref_entropy_loss": 0.021728515625, - "delta_ref_ppl": -0.025634765625, - "entropy_loss": -0.021453857421875, - "epoch": 0.8448, - "grad_norm": 0.2858969099624391, - "k1_kl": 0.02581787109375, - "k3_kl": 0.018341064453125, - "kimi_kl": 0.077392578125, - "learning_rate": 7.76e-08, - "loss": 0.0007, - "ppl": 0.00968170166015625, - "reward": 0.9936873614788055, - "reward_std": 0.0015143499476835132, - "rewards/perpo_ocr_edit_distance_reward": 0.9936873912811279, + "advantages": -8.286748925456777e-05, + "completion_length": 296.0, + "delta_ref_entropy_loss": 0.06787109375, + "delta_ref_ppl": -0.1708984375, + "entropy_loss": -0.05908203125, + "epoch": 0.4224, + "grad_norm": 0.8499265942407854, + "k1_kl": 0.1708984375, + "k3_kl": 0.134765625, + "kimi_kl": 0.62890625, + "learning_rate": 2.888e-07, + "loss": 0.0055, + "ppl": 0.0245361328125, + "reward": 0.9527807235717773, + "reward_std": 0.0009274734184145927, + "rewards/perpo_ocr_edit_distance_reward": 0.9527809023857117, "step": 2112, "temperature": 0.9 }, { - "advantages": 8.174351478373865e-06, - "completion_length": 375.5, - "delta_ref_entropy_loss": 0.05023193359375, - "delta_ref_ppl": -0.0477294921875, - "entropy_loss": -0.048583984375, - "epoch": 0.8452, - "grad_norm": 0.8658572412589173, - "k1_kl": 0.0477294921875, - "k3_kl": 0.02880859375, - "kimi_kl": 0.06787109375, - "learning_rate": 7.739999999999999e-08, - "loss": 0.0011, - "ppl": 0.02203369140625, - "reward": 0.8366382122039795, - "reward_std": 0.0013898182078264654, - "rewards/perpo_ocr_edit_distance_reward": 0.8366382718086243, + "advantages": -2.6038715077447705e-05, + "completion_length": 953.0, + "delta_ref_entropy_loss": 0.0400390625, + "delta_ref_ppl": -0.046875, + "entropy_loss": -0.0615234375, + "epoch": 0.4226, + "grad_norm": 1.9040613035766727, + "k1_kl": 0.046875, + "k3_kl": 0.033935546875, + "kimi_kl": 0.08544921875, + "learning_rate": 2.887e-07, + "loss": 0.0014, + "ppl": 0.025390625, + "reward": 0.9741465449333191, + "reward_std": 0.0008815157343633473, + "rewards/perpo_ocr_edit_distance_reward": 0.9741466045379639, "step": 2113, "temperature": 0.9 }, { - "advantages": 8.514949634275126e-09, - "completion_length": 474.0, - "delta_ref_entropy_loss": 0.04547119140625, - "delta_ref_ppl": -0.04095458984375, - "entropy_loss": -0.03173828125, - "epoch": 0.8456, - "grad_norm": 0.24579264330421516, - "k1_kl": 0.04095458984375, - "k3_kl": 0.0240478515625, - "kimi_kl": 0.05145263671875, - "learning_rate": 7.72e-08, - "loss": 0.001, - "ppl": 0.019134521484375, - "reward": 0.9979936182498932, - "reward_std": 0.00019451188563834876, - "rewards/perpo_ocr_edit_distance_reward": 0.9979936480522156, + "advantages": -2.454008426866494e-05, + "completion_length": 146.0, + "delta_ref_entropy_loss": 0.058837890625, + "delta_ref_ppl": -0.154296875, + "entropy_loss": -0.06884765625, + "epoch": 0.4228, + "grad_norm": 1.2945911034311912, + "k1_kl": 0.154296875, + "k3_kl": 0.11083984375, + "kimi_kl": 0.376953125, + "learning_rate": 2.8860000000000003e-07, + "loss": 0.0045, + "ppl": 0.0322265625, + "reward": 0.8950847387313843, + "reward_std": 0.0033725674729794264, + "rewards/perpo_ocr_edit_distance_reward": 0.8950848579406738, "step": 2114, "temperature": 0.9 }, { - "advantages": -1.9243786937295226e-06, - "completion_length": 661.0, - "delta_ref_entropy_loss": 0.06298828125, - "delta_ref_ppl": -0.0518798828125, - "entropy_loss": -0.0753173828125, - "epoch": 0.846, - "grad_norm": 0.9118486448990073, - "k1_kl": 0.0517578125, - "k3_kl": 0.0284423828125, - "kimi_kl": 0.079833984375, - "learning_rate": 7.7e-08, - "loss": 0.0011, - "ppl": 0.040985107421875, - "reward": 0.9345281422138214, - "reward_std": 0.01037660613656044, - "rewards/perpo_ocr_edit_distance_reward": 0.9345281720161438, + "advantages": -1.839229116740171e-05, + "completion_length": 1215.0, + "delta_ref_entropy_loss": 0.0751953125, + "delta_ref_ppl": -0.08740234375, + "entropy_loss": -0.052490234375, + "epoch": 0.423, + "grad_norm": 1.3506422618085132, + "k1_kl": 0.08740234375, + "k3_kl": 0.0546875, + "kimi_kl": 0.12890625, + "learning_rate": 2.8849999999999997e-07, + "loss": 0.0022, + "ppl": 0.0244140625, + "reward": 0.9766569137573242, + "reward_std": 0.0008260478498414159, + "rewards/perpo_ocr_edit_distance_reward": 0.976656973361969, "step": 2115, "temperature": 0.9 }, { - "advantages": -4.534210731321764e-07, - "completion_length": 1786.5, - "delta_ref_entropy_loss": 0.01434326171875, - "delta_ref_ppl": -0.037353515625, - "entropy_loss": -0.093994140625, - "epoch": 0.8464, - "grad_norm": 1.3092155506811352, - "k1_kl": 0.03759765625, - "k3_kl": 0.0321197509765625, - "kimi_kl": 0.109893798828125, - "learning_rate": 7.679999999999999e-08, - "loss": 0.0013, - "ppl": 0.0399322509765625, - "reward": 0.6410740315914154, - "reward_std": 0.057857816107571125, - "rewards/perpo_ocr_edit_distance_reward": 0.641074076294899, + "advantages": -8.130925561999902e-05, + "completion_length": 805.0, + "delta_ref_entropy_loss": 0.048095703125, + "delta_ref_ppl": -0.041015625, + "entropy_loss": -0.034423828125, + "epoch": 0.4232, + "grad_norm": 0.780743626856752, + "k1_kl": 0.041015625, + "k3_kl": 0.0223388671875, + "kimi_kl": 0.048828125, + "learning_rate": 2.8839999999999996e-07, + "loss": 0.001, + "ppl": 0.0125732421875, + "reward": 0.9988961815834045, + "reward_std": 0.0005283859791234136, + "rewards/perpo_ocr_edit_distance_reward": 0.9988963007926941, "step": 2116, "temperature": 0.9 }, { - "advantages": 9.570803740643896e-06, - "completion_length": 474.0, - "delta_ref_entropy_loss": 0.0567626953125, - "delta_ref_ppl": -0.0579833984375, - "entropy_loss": -0.0552978515625, - "epoch": 0.8468, - "grad_norm": 1.5922692372778293, - "k1_kl": 0.0579833984375, - "k3_kl": 0.0404052734375, - "kimi_kl": 0.14404296875, - "learning_rate": 7.66e-08, - "loss": 0.0016, - "ppl": 0.02850341796875, - "reward": 0.9784836769104004, - "reward_std": 0.0009452399826841429, - "rewards/perpo_ocr_edit_distance_reward": 0.9784837067127228, + "advantages": -5.27926886206842e-07, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.046142578125, + "delta_ref_ppl": -0.05224609375, + "entropy_loss": -0.0859375, + "epoch": 0.4234, + "grad_norm": 1.0611024561289126, + "k1_kl": 0.05224609375, + "k3_kl": 0.033447265625, + "kimi_kl": 0.0888671875, + "learning_rate": 2.883e-07, + "loss": 0.0013, + "ppl": 0.04296875, + "reward": 0.8833885788917542, + "reward_std": 0.15872548520565033, + "rewards/perpo_ocr_edit_distance_reward": 0.8833886981010437, "step": 2117, "temperature": 0.9 }, { - "advantages": -0.00013885753492104413, - "completion_length": 918.0, - "delta_ref_entropy_loss": 0.0242919921875, - "delta_ref_ppl": -0.01959228515625, - "entropy_loss": -0.01812744140625, - "epoch": 0.8472, - "grad_norm": 0.23859840580104033, - "k1_kl": 0.01953125, - "k3_kl": 0.011962890625, - "kimi_kl": 0.03533935546875, - "learning_rate": 7.64e-08, - "loss": 0.0006, - "ppl": 0.0078887939453125, - "reward": 0.9903177917003632, - "reward_std": 0.0022075184897403233, - "rewards/perpo_ocr_edit_distance_reward": 0.9903178513050079, + "advantages": -4.557201100396924e-05, + "completion_length": 782.0, + "delta_ref_entropy_loss": 0.053466796875, + "delta_ref_ppl": -0.05322265625, + "entropy_loss": -0.033935546875, + "epoch": 0.4236, + "grad_norm": 0.3961306370571246, + "k1_kl": 0.05322265625, + "k3_kl": 0.0301513671875, + "kimi_kl": 0.08544921875, + "learning_rate": 2.882e-07, + "loss": 0.0013, + "ppl": 0.01080322265625, + "reward": 0.9265964031219482, + "reward_std": 0.00046070231474004686, + "rewards/perpo_ocr_edit_distance_reward": 0.926596462726593, "step": 2118, "temperature": 0.9 }, { - "advantages": -1.4905419448041357e-05, - "completion_length": 607.5, - "delta_ref_entropy_loss": 0.02606201171875, - "delta_ref_ppl": -0.01397705078125, - "entropy_loss": -0.01605224609375, - "epoch": 0.8476, - "grad_norm": 0.25903714953985374, - "k1_kl": 0.0140380859375, - "k3_kl": 0.0076446533203125, - "kimi_kl": 0.021759033203125, - "learning_rate": 7.62e-08, - "loss": 0.0003, - "ppl": 0.005859375, - "reward": 0.9990779757499695, - "reward_std": 0.00023568230972159654, - "rewards/perpo_ocr_edit_distance_reward": 0.9990780353546143, + "advantages": -6.699562072753906e-05, + "completion_length": 372.0, + "delta_ref_entropy_loss": 0.1376953125, + "delta_ref_ppl": -0.1728515625, + "entropy_loss": -0.142578125, + "epoch": 0.4238, + "grad_norm": 1.5356501951467534, + "k1_kl": 0.173828125, + "k3_kl": 0.11767578125, + "kimi_kl": 0.365234375, + "learning_rate": 2.881e-07, + "loss": 0.0048, + "ppl": 0.068359375, + "reward": 0.923737108707428, + "reward_std": 0.0011707345256581903, + "rewards/perpo_ocr_edit_distance_reward": 0.9237371683120728, "step": 2119, "temperature": 0.9 }, { - "advantages": -9.775162538971927e-06, - "completion_length": 668.0, - "delta_ref_entropy_loss": 0.068359375, - "delta_ref_ppl": -0.0657958984375, - "entropy_loss": -0.06005859375, - "epoch": 0.848, - "grad_norm": 1.5941840357787178, - "k1_kl": 0.0657958984375, - "k3_kl": 0.0447998046875, - "kimi_kl": 0.1051025390625, - "learning_rate": 7.599999999999999e-08, - "loss": 0.0018, - "ppl": 0.0330810546875, - "reward": 0.9616281390190125, - "reward_std": 0.053109577973373234, - "rewards/perpo_ocr_edit_distance_reward": 0.9616282284259796, + "advantages": -9.907143976306543e-06, + "completion_length": 703.0, + "delta_ref_entropy_loss": 0.036865234375, + "delta_ref_ppl": -0.03857421875, + "entropy_loss": -0.05859375, + "epoch": 0.424, + "grad_norm": 0.765054428774298, + "k1_kl": 0.038818359375, + "k3_kl": 0.0228271484375, + "kimi_kl": 0.05712890625, + "learning_rate": 2.88e-07, + "loss": 0.0009, + "ppl": 0.0255126953125, + "reward": 0.9870765805244446, + "reward_std": 0.00161832629237324, + "rewards/perpo_ocr_edit_distance_reward": 0.9870765209197998, "step": 2120, "temperature": 0.9 }, { - "advantages": -2.5659801394795068e-05, - "completion_length": 501.0, - "delta_ref_entropy_loss": 0.0303955078125, - "delta_ref_ppl": -0.0201263427734375, - "entropy_loss": -0.01947021484375, - "epoch": 0.8484, - "grad_norm": 0.6049956087112953, - "k1_kl": 0.0201263427734375, - "k3_kl": 0.011810302734375, - "kimi_kl": 0.0434112548828125, - "learning_rate": 7.58e-08, - "loss": 0.0005, - "ppl": 0.009002685546875, - "reward": 0.9753768146038055, - "reward_std": 0.0036875783116556704, - "rewards/perpo_ocr_edit_distance_reward": 0.9753769040107727, + "advantages": -1.4867102436255664e-05, + "completion_length": 1333.0, + "delta_ref_entropy_loss": 0.017822265625, + "delta_ref_ppl": -0.0341796875, + "entropy_loss": -0.05126953125, + "epoch": 0.4242, + "grad_norm": 0.6949542074043468, + "k1_kl": 0.0341796875, + "k3_kl": 0.024169921875, + "kimi_kl": 0.05712890625, + "learning_rate": 2.879e-07, + "loss": 0.001, + "ppl": 0.021240234375, + "reward": 0.9873902797698975, + "reward_std": 0.003908843733370304, + "rewards/perpo_ocr_edit_distance_reward": 0.987390398979187, "step": 2121, "temperature": 0.9 }, { - "advantages": -8.942399927036604e-05, - "completion_length": 789.5, - "delta_ref_entropy_loss": 0.0892333984375, - "delta_ref_ppl": -0.05877685546875, - "entropy_loss": -0.10601806640625, - "epoch": 0.8488, - "grad_norm": 1.296002403012925, - "k1_kl": 0.058837890625, - "k3_kl": 0.0435791015625, - "kimi_kl": 0.1123046875, - "learning_rate": 7.56e-08, - "loss": 0.0018, - "ppl": 0.06317138671875, - "reward": 0.8757535219192505, - "reward_std": 0.0023403211052936967, - "rewards/perpo_ocr_edit_distance_reward": 0.8757535815238953, + "advantages": -1.132488341681892e-05, + "completion_length": 65.0, + "delta_ref_entropy_loss": 0.0458984375, + "delta_ref_ppl": -0.29296875, + "entropy_loss": -0.06787109375, + "epoch": 0.4244, + "grad_norm": 2.4472217962609215, + "k1_kl": 0.29296875, + "k3_kl": 0.2353515625, + "kimi_kl": 0.8515625, + "learning_rate": 2.8779999999999997e-07, + "loss": 0.0094, + "ppl": 0.02001953125, + "reward": 0.9835164546966553, + "reward_std": 0.0029074351768940687, + "rewards/perpo_ocr_edit_distance_reward": 0.9835165143013, "step": 2122, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 590.5, - "delta_ref_entropy_loss": 0.020477294921875, - "delta_ref_ppl": -0.016326904296875, - "entropy_loss": -0.019775390625, - "epoch": 0.8492, - "grad_norm": 0.011356117072898336, - "k1_kl": 0.016265869140625, - "k3_kl": 0.008697509765625, - "kimi_kl": 0.01922607421875, - "learning_rate": 7.539999999999999e-08, - "loss": 0.0003, - "ppl": 0.0081787109375, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": -1.7221485904883593e-05, + "completion_length": 1198.0, + "delta_ref_entropy_loss": 0.0166015625, + "delta_ref_ppl": -0.019775390625, + "entropy_loss": -0.026611328125, + "epoch": 0.4246, + "grad_norm": 0.5750609728764509, + "k1_kl": 0.019775390625, + "k3_kl": 0.01239013671875, + "kimi_kl": 0.0277099609375, + "learning_rate": 2.877e-07, + "loss": 0.0005, + "ppl": 0.0096435546875, + "reward": 0.9696862101554871, + "reward_std": 0.004843039903789759, + "rewards/perpo_ocr_edit_distance_reward": 0.9696862697601318, "step": 2123, "temperature": 0.9 }, { - "advantages": -1.0175365332543151e-06, - "completion_length": 746.0, - "delta_ref_entropy_loss": 0.009674072265625, - "delta_ref_ppl": -0.02783203125, - "entropy_loss": -0.02032470703125, - "epoch": 0.8496, - "grad_norm": 1.7495665448240016, + "advantages": -2.1168165403651074e-05, + "completion_length": 1051.0, + "delta_ref_entropy_loss": 0.0205078125, + "delta_ref_ppl": -0.0277099609375, + "entropy_loss": -0.0267333984375, + "epoch": 0.4248, + "grad_norm": 0.4537698768423944, "k1_kl": 0.0277099609375, - "k3_kl": 0.02008056640625, - "kimi_kl": 0.06591796875, - "learning_rate": 7.52e-08, + "k3_kl": 0.018310546875, + "kimi_kl": 0.04296875, + "learning_rate": 2.876e-07, "loss": 0.0008, - "ppl": 0.00995635986328125, - "reward": 0.9619064331054688, - "reward_std": 0.02292993851006031, - "rewards/perpo_ocr_edit_distance_reward": 0.9619064629077911, + "ppl": 0.0106201171875, + "reward": 0.9932004809379578, + "reward_std": 0.0007037679315544665, + "rewards/perpo_ocr_edit_distance_reward": 0.9932005405426025, "step": 2124, "temperature": 0.9 }, { - "advantages": -1.4356205213061912e-05, - "completion_length": 694.5, - "delta_ref_entropy_loss": 0.080810546875, - "delta_ref_ppl": -0.068115234375, - "entropy_loss": -0.097900390625, - "epoch": 0.85, - "grad_norm": 2.5520482987178363, - "k1_kl": 0.068115234375, - "k3_kl": 0.054443359375, - "kimi_kl": 0.090576171875, - "learning_rate": 7.5e-08, - "loss": 0.0022, - "ppl": 0.0546875, - "reward": 0.9546394646167755, - "reward_std": 0.004674340598285198, - "rewards/perpo_ocr_edit_distance_reward": 0.9546395540237427, + "advantages": -4.0190563595388085e-05, + "completion_length": 758.0, + "delta_ref_entropy_loss": 0.03662109375, + "delta_ref_ppl": -0.035400390625, + "entropy_loss": -0.0267333984375, + "epoch": 0.425, + "grad_norm": 0.665527852524985, + "k1_kl": 0.035400390625, + "k3_kl": 0.0194091796875, + "kimi_kl": 0.050537109375, + "learning_rate": 2.8749999999999995e-07, + "loss": 0.0008, + "ppl": 0.01141357421875, + "reward": 0.9986637234687805, + "reward_std": 0.0005356679321266711, + "rewards/perpo_ocr_edit_distance_reward": 0.9986637830734253, "step": 2125, "temperature": 0.9 }, { - "advantages": -1.3538769962906372e-06, - "completion_length": 482.0, - "delta_ref_entropy_loss": 0.0584716796875, - "delta_ref_ppl": -0.104736328125, - "entropy_loss": -0.1375732421875, - "epoch": 0.8504, - "grad_norm": 2.207338947169995, - "k1_kl": 0.105224609375, - "k3_kl": 0.078857421875, - "kimi_kl": 0.2685546875, - "learning_rate": 7.480000000000001e-08, - "loss": 0.0032, - "ppl": 0.074310302734375, - "reward": 0.6036549210548401, - "reward_std": 0.010732761467806995, - "rewards/perpo_ocr_edit_distance_reward": 0.6036549806594849, + "advantages": 1.2602125707417144e-06, + "completion_length": 286.0, + "delta_ref_entropy_loss": 0.04638671875, + "delta_ref_ppl": -0.1357421875, + "entropy_loss": -0.07177734375, + "epoch": 0.4252, + "grad_norm": 1.7690130305429312, + "k1_kl": 0.1357421875, + "k3_kl": 0.09716796875, + "kimi_kl": 0.337890625, + "learning_rate": 2.874e-07, + "loss": 0.0039, + "ppl": 0.0274658203125, + "reward": 0.9450768232345581, + "reward_std": 0.0065421657636761665, + "rewards/perpo_ocr_edit_distance_reward": 0.9450768232345581, "step": 2126, "temperature": 0.9 }, { - "advantages": -3.0006682209204882e-05, - "completion_length": 583.0, - "delta_ref_entropy_loss": 0.02362060546875, - "delta_ref_ppl": -0.011444091796875, - "entropy_loss": -0.02044677734375, - "epoch": 0.8508, - "grad_norm": 0.20005179097596457, - "k1_kl": 0.011474609375, - "k3_kl": 0.00408172607421875, - "kimi_kl": 0.007160186767578125, - "learning_rate": 7.459999999999999e-08, - "loss": 0.0002, - "ppl": 0.00884246826171875, - "reward": 0.9997500777244568, - "reward_std": 0.00023375342425424606, - "rewards/perpo_ocr_edit_distance_reward": 0.9997500777244568, + "advantages": -5.517687441169983e-06, + "completion_length": 526.0, + "delta_ref_entropy_loss": 0.055908203125, + "delta_ref_ppl": -0.10693359375, + "entropy_loss": -0.2314453125, + "epoch": 0.4254, + "grad_norm": 2.2102247286762142, + "k1_kl": 0.10693359375, + "k3_kl": 0.07470703125, + "kimi_kl": 0.2138671875, + "learning_rate": 2.873e-07, + "loss": 0.003, + "ppl": 0.1064453125, + "reward": 0.8545536994934082, + "reward_std": 0.015371494926512241, + "rewards/perpo_ocr_edit_distance_reward": 0.854553759098053, "step": 2127, "temperature": 0.9 }, { - "advantages": -1.1597361663007177e-05, - "completion_length": 295.0, - "delta_ref_entropy_loss": 0.1138916015625, - "delta_ref_ppl": -0.07696533203125, - "entropy_loss": -0.07794189453125, - "epoch": 0.8512, - "grad_norm": 0.9113987071338228, - "k1_kl": 0.07745361328125, - "k3_kl": 0.04339599609375, - "kimi_kl": 0.13873291015625, - "learning_rate": 7.439999999999999e-08, - "loss": 0.0017, - "ppl": 0.04510498046875, - "reward": 0.9007495641708374, - "reward_std": 0.001765443419571966, - "rewards/perpo_ocr_edit_distance_reward": 0.9007496535778046, + "advantages": -6.951604882488027e-05, + "completion_length": 602.0, + "delta_ref_entropy_loss": 0.041259765625, + "delta_ref_ppl": -0.0908203125, + "entropy_loss": -0.05517578125, + "epoch": 0.4256, + "grad_norm": 0.9363113267008699, + "k1_kl": 0.0908203125, + "k3_kl": 0.060546875, + "kimi_kl": 0.19921875, + "learning_rate": 2.872e-07, + "loss": 0.0025, + "ppl": 0.02587890625, + "reward": 0.9931333065032959, + "reward_std": 0.0006347867893055081, + "rewards/perpo_ocr_edit_distance_reward": 0.9931333661079407, "step": 2128, "temperature": 0.9 }, { - "advantages": -2.2729594093107153e-05, - "completion_length": 603.0, - "delta_ref_entropy_loss": 0.07275390625, - "delta_ref_ppl": -0.0675048828125, - "entropy_loss": -0.0660400390625, - "epoch": 0.8516, - "grad_norm": 0.8393169600178002, - "k1_kl": 0.0675048828125, - "k3_kl": 0.0401611328125, - "kimi_kl": 0.1036376953125, - "learning_rate": 7.42e-08, - "loss": 0.0016, - "ppl": 0.03741455078125, - "reward": 0.9638667702674866, - "reward_std": 0.0007446374365827069, - "rewards/perpo_ocr_edit_distance_reward": 0.9638668596744537, + "advantages": -2.1610941985272802e-05, + "completion_length": 455.0, + "delta_ref_entropy_loss": 0.033447265625, + "delta_ref_ppl": -0.08056640625, + "entropy_loss": -0.0673828125, + "epoch": 0.4258, + "grad_norm": 0.7042591236630833, + "k1_kl": 0.08056640625, + "k3_kl": 0.0546875, + "kimi_kl": 0.142578125, + "learning_rate": 2.871e-07, + "loss": 0.0022, + "ppl": 0.029296875, + "reward": 0.9855055212974548, + "reward_std": 0.0030525082256644964, + "rewards/perpo_ocr_edit_distance_reward": 0.9855056405067444, "step": 2129, "temperature": 0.9 }, { - "advantages": -1.7642975635112634e-05, - "completion_length": 708.0, - "delta_ref_entropy_loss": 0.001678466796875, - "delta_ref_ppl": -0.04339599609375, - "entropy_loss": -0.11480712890625, - "epoch": 0.852, - "grad_norm": 0.9838424743244545, - "k1_kl": 0.04339599609375, - "k3_kl": 0.034088134765625, - "kimi_kl": 0.098388671875, - "learning_rate": 7.399999999999999e-08, - "loss": 0.0014, - "ppl": 0.05767822265625, - "reward": 0.7773874402046204, - "reward_std": 0.07487348496215418, - "rewards/perpo_ocr_edit_distance_reward": 0.7773874998092651, + "advantages": -1.5429088307428174e-05, + "completion_length": 111.0, + "delta_ref_entropy_loss": 0.0625, + "delta_ref_ppl": -0.314453125, + "entropy_loss": -0.09033203125, + "epoch": 0.426, + "grad_norm": 3.0076074315597316, + "k1_kl": 0.31640625, + "k3_kl": 0.255859375, + "kimi_kl": 1.109375, + "learning_rate": 2.8699999999999996e-07, + "loss": 0.0102, + "ppl": 0.050048828125, + "reward": 0.9792147278785706, + "reward_std": 0.003771338379010558, + "rewards/perpo_ocr_edit_distance_reward": 0.9792148470878601, "step": 2130, "temperature": 0.9 }, { - "advantages": -3.277829910075525e-05, - "completion_length": 451.5, - "delta_ref_entropy_loss": 0.05975341796875, - "delta_ref_ppl": -0.04608154296875, - "entropy_loss": -0.05682373046875, - "epoch": 0.8524, - "grad_norm": 0.7254942012056125, - "k1_kl": 0.04638671875, - "k3_kl": 0.02374267578125, - "kimi_kl": 0.052978515625, - "learning_rate": 7.38e-08, - "loss": 0.001, - "ppl": 0.02685546875, - "reward": 0.9815663397312164, - "reward_std": 0.0010663565626600757, - "rewards/perpo_ocr_edit_distance_reward": 0.9815663695335388, + "advantages": -3.9202826883411035e-05, + "completion_length": 667.0, + "delta_ref_entropy_loss": 0.07080078125, + "delta_ref_ppl": -0.08154296875, + "entropy_loss": -0.060791015625, + "epoch": 0.4262, + "grad_norm": 0.5842779291449497, + "k1_kl": 0.08154296875, + "k3_kl": 0.042236328125, + "kimi_kl": 0.103515625, + "learning_rate": 2.8689999999999996e-07, + "loss": 0.0017, + "ppl": 0.0257568359375, + "reward": 0.9793302416801453, + "reward_std": 0.0012045669136568904, + "rewards/perpo_ocr_edit_distance_reward": 0.9793302416801453, "step": 2131, "temperature": 0.9 }, { - "advantages": -0.00021570283934124745, - "completion_length": 822.0, - "delta_ref_entropy_loss": 0.02923583984375, - "delta_ref_ppl": -0.013916015625, - "entropy_loss": -0.025146484375, - "epoch": 0.8528, - "grad_norm": 0.5679158714549054, - "k1_kl": 0.013916015625, - "k3_kl": 0.00502777099609375, - "kimi_kl": 0.007293701171875, - "learning_rate": 7.36e-08, - "loss": 0.0004, - "ppl": 0.0118255615234375, - "reward": 0.9990254640579224, - "reward_std": 0.0004425220104167238, - "rewards/perpo_ocr_edit_distance_reward": 0.9990255236625671, + "advantages": -1.793248338799458e-05, + "completion_length": 465.0, + "delta_ref_entropy_loss": 0.054931640625, + "delta_ref_ppl": -0.07470703125, + "entropy_loss": -0.11083984375, + "epoch": 0.4264, + "grad_norm": 1.2282028647713539, + "k1_kl": 0.07470703125, + "k3_kl": 0.047607421875, + "kimi_kl": 0.099609375, + "learning_rate": 2.868e-07, + "loss": 0.0019, + "ppl": 0.041748046875, + "reward": 0.9731178283691406, + "reward_std": 0.005121630150824785, + "rewards/perpo_ocr_edit_distance_reward": 0.9731178879737854, "step": 2132, "temperature": 0.9 }, { - "advantages": -3.604378071031533e-05, - "completion_length": 1141.0, - "delta_ref_entropy_loss": 0.018310546875, - "delta_ref_ppl": -0.01361083984375, - "entropy_loss": -0.016143798828125, - "epoch": 0.8532, - "grad_norm": 0.35151869667736746, - "k1_kl": 0.01361083984375, - "k3_kl": 0.0069580078125, - "kimi_kl": 0.017333984375, - "learning_rate": 7.340000000000001e-08, - "loss": 0.0003, - "ppl": 0.007080078125, - "reward": 0.9979455769062042, - "reward_std": 0.00042257190216332674, - "rewards/perpo_ocr_edit_distance_reward": 0.9979456663131714, + "advantages": -6.688493158435449e-05, + "completion_length": 334.0, + "delta_ref_entropy_loss": 0.056884765625, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.030029296875, + "epoch": 0.4266, + "grad_norm": 1.2306231298116228, + "k1_kl": 0.0703125, + "k3_kl": 0.0439453125, + "kimi_kl": 0.12255859375, + "learning_rate": 2.867e-07, + "loss": 0.0018, + "ppl": 0.0159912109375, + "reward": 0.9952245354652405, + "reward_std": 0.0007913819281384349, + "rewards/perpo_ocr_edit_distance_reward": 0.9952245950698853, "step": 2133, "temperature": 0.9 }, { - "advantages": -0.0002980359963018486, - "completion_length": 501.5, - "delta_ref_entropy_loss": 0.03204345703125, - "delta_ref_ppl": -0.02032470703125, - "entropy_loss": -0.01373291015625, - "epoch": 0.8536, - "grad_norm": 0.10980773175134662, - "k1_kl": 0.0203857421875, - "k3_kl": 0.010955810546875, - "kimi_kl": 0.020538330078125, - "learning_rate": 7.32e-08, - "loss": 0.0007, - "ppl": 0.0048065185546875, - "reward": 0.9995847642421722, - "reward_std": 0.00031159407808445394, - "rewards/perpo_ocr_edit_distance_reward": 0.9995847940444946, + "advantages": -7.799693776178174e-06, + "completion_length": 597.0, + "delta_ref_entropy_loss": 0.0498046875, + "delta_ref_ppl": -0.054931640625, + "entropy_loss": -0.052978515625, + "epoch": 0.4268, + "grad_norm": 2.831366077471488, + "k1_kl": 0.0546875, + "k3_kl": 0.034912109375, + "kimi_kl": 0.07958984375, + "learning_rate": 2.866e-07, + "loss": 0.0014, + "ppl": 0.0247802734375, + "reward": 0.9281237721443176, + "reward_std": 0.0031648569274693727, + "rewards/perpo_ocr_edit_distance_reward": 0.9281237721443176, "step": 2134, "temperature": 0.9 }, { - "advantages": -0.00012427356341504492, - "completion_length": 859.0, - "delta_ref_entropy_loss": 0.03076171875, - "delta_ref_ppl": -0.021514892578125, - "entropy_loss": -0.032745361328125, - "epoch": 0.854, - "grad_norm": 3.605594041692613, - "k1_kl": 0.021514892578125, - "k3_kl": 0.012451171875, - "kimi_kl": 0.032867431640625, - "learning_rate": 7.299999999999999e-08, - "loss": 0.0006, - "ppl": 0.015899658203125, - "reward": 0.9966096580028534, - "reward_std": 0.0010084295208798721, - "rewards/perpo_ocr_edit_distance_reward": 0.9966097176074982, + "advantages": -9.758132364368066e-06, + "completion_length": 323.0, + "delta_ref_entropy_loss": 0.07666015625, + "delta_ref_ppl": -0.10546875, + "entropy_loss": -0.0615234375, + "epoch": 0.427, + "grad_norm": 0.8162039823343981, + "k1_kl": 0.10595703125, + "k3_kl": 0.07080078125, + "kimi_kl": 0.2060546875, + "learning_rate": 2.865e-07, + "loss": 0.0028, + "ppl": 0.025634765625, + "reward": 0.3413011431694031, + "reward_std": 0.0007702477159909904, + "rewards/perpo_ocr_edit_distance_reward": 0.34130117297172546, "step": 2135, "temperature": 0.9 }, { - "advantages": -3.901550007867627e-05, - "completion_length": 597.0, - "delta_ref_entropy_loss": 0.058837890625, - "delta_ref_ppl": -0.072021484375, - "entropy_loss": -0.062255859375, - "epoch": 0.8544, - "grad_norm": 3.29599489188868, - "k1_kl": 0.072265625, - "k3_kl": 0.04638671875, - "kimi_kl": 0.15185546875, - "learning_rate": 7.28e-08, - "loss": 0.0019, - "ppl": 0.03509521484375, - "reward": 0.9790986478328705, - "reward_std": 0.0012095484416931868, - "rewards/perpo_ocr_edit_distance_reward": 0.9790986776351929, + "advantages": -1.8562590184956207e-06, + "completion_length": 37.0, + "delta_ref_entropy_loss": 0.078125, + "delta_ref_ppl": -1.078125, + "entropy_loss": -0.296875, + "epoch": 0.4272, + "grad_norm": 6.119034972496158, + "k1_kl": 1.078125, + "k3_kl": 0.9296875, + "kimi_kl": 4.375, + "learning_rate": 2.8639999999999997e-07, + "loss": 0.0372, + "ppl": 0.1376953125, + "reward": 0.9474393725395203, + "reward_std": 0.00920662097632885, + "rewards/perpo_ocr_edit_distance_reward": 0.9474393725395203, "step": 2136, "temperature": 0.9 }, { - "advantages": -1.4262540389609057e-05, - "completion_length": 802.0, - "delta_ref_entropy_loss": 0.03265380859375, - "delta_ref_ppl": -0.0313720703125, - "entropy_loss": -0.033935546875, - "epoch": 0.8548, - "grad_norm": 1.354580413564431, - "k1_kl": 0.03131103515625, - "k3_kl": 0.03472900390625, - "kimi_kl": 0.0635986328125, - "learning_rate": 7.259999999999999e-08, - "loss": 0.0014, - "ppl": 0.0237884521484375, - "reward": 0.9971979260444641, - "reward_std": 0.0006962197367101908, - "rewards/perpo_ocr_edit_distance_reward": 0.9971979856491089, + "advantages": 1.6399793821619824e-05, + "completion_length": 404.0, + "delta_ref_entropy_loss": 0.0595703125, + "delta_ref_ppl": -0.07080078125, + "entropy_loss": -0.0380859375, + "epoch": 0.4274, + "grad_norm": 1.034653940044695, + "k1_kl": 0.07080078125, + "k3_kl": 0.048095703125, + "kimi_kl": 0.1689453125, + "learning_rate": 2.8629999999999996e-07, + "loss": 0.0019, + "ppl": 0.0145263671875, + "reward": 0.9925497770309448, + "reward_std": 0.00197599153034389, + "rewards/perpo_ocr_edit_distance_reward": 0.9925497770309448, "step": 2137, "temperature": 0.9 }, { - "advantages": 4.6108450533211e-06, - "completion_length": 1041.5, - "delta_ref_entropy_loss": 0.0430908203125, - "delta_ref_ppl": -0.0487060546875, - "entropy_loss": -0.0438232421875, - "epoch": 0.8552, - "grad_norm": 0.6209172878129028, - "k1_kl": 0.048583984375, - "k3_kl": 0.031005859375, - "kimi_kl": 0.0938720703125, - "learning_rate": 7.24e-08, - "loss": 0.0012, - "ppl": 0.02099609375, - "reward": 0.6648494601249695, - "reward_std": 0.002468604623572901, - "rewards/perpo_ocr_edit_distance_reward": 0.6648494452238083, + "advantages": -1.6655241779517382e-05, + "completion_length": 529.0, + "delta_ref_entropy_loss": 0.07666015625, + "delta_ref_ppl": -0.10791015625, + "entropy_loss": -0.1201171875, + "epoch": 0.4276, + "grad_norm": 1.4338527630592066, + "k1_kl": 0.10791015625, + "k3_kl": 0.07177734375, + "kimi_kl": 0.2314453125, + "learning_rate": 2.862e-07, + "loss": 0.0029, + "ppl": 0.0537109375, + "reward": 0.974819540977478, + "reward_std": 0.003987874370068312, + "rewards/perpo_ocr_edit_distance_reward": 0.9748196601867676, "step": 2138, "temperature": 0.9 }, { - "advantages": -3.720181588917626e-05, - "completion_length": 318.0, - "delta_ref_entropy_loss": 0.03631591796875, - "delta_ref_ppl": -0.052978515625, - "entropy_loss": -0.0787353515625, - "epoch": 0.8556, - "grad_norm": 2.2517354024445115, - "k1_kl": 0.052978515625, - "k3_kl": 0.03814697265625, - "kimi_kl": 0.134033203125, - "learning_rate": 7.22e-08, - "loss": 0.0016, - "ppl": 0.03375244140625, - "reward": 0.8456396758556366, - "reward_std": 0.0776689030462876, - "rewards/perpo_ocr_edit_distance_reward": 0.8456397354602814, + "advantages": -0.00012958049774169922, + "completion_length": 597.0, + "delta_ref_entropy_loss": 0.07861328125, + "delta_ref_ppl": -0.0869140625, + "entropy_loss": -0.046875, + "epoch": 0.4278, + "grad_norm": 0.7573611163707845, + "k1_kl": 0.0869140625, + "k3_kl": 0.049560546875, + "kimi_kl": 0.1435546875, + "learning_rate": 2.861e-07, + "loss": 0.0021, + "ppl": 0.0216064453125, + "reward": 0.9835237860679626, + "reward_std": 0.0007542914245277643, + "rewards/perpo_ocr_edit_distance_reward": 0.9835238456726074, "step": 2139, "temperature": 0.9 }, { - "advantages": -2.159391283385048e-05, - "completion_length": 907.0, - "delta_ref_entropy_loss": 0.03509521484375, - "delta_ref_ppl": -0.019287109375, - "entropy_loss": -0.07861328125, - "epoch": 0.856, - "grad_norm": 56098620.29377468, - "k1_kl": 0.019287109375, - "k3_kl": 135168.0060119629, - "kimi_kl": 0.1207275390625, - "learning_rate": 7.2e-08, - "loss": 5419.8662, - "ppl": 0.0535888671875, - "reward": 0.9662632346153259, - "reward_std": 0.0052452844101935625, - "rewards/perpo_ocr_edit_distance_reward": 0.9662632644176483, + "advantages": 7.433551218127832e-06, + "completion_length": 781.0, + "delta_ref_entropy_loss": 0.0400390625, + "delta_ref_ppl": -0.04833984375, + "entropy_loss": -0.037841796875, + "epoch": 0.428, + "grad_norm": 0.7248284417852611, + "k1_kl": 0.04833984375, + "k3_kl": 0.02978515625, + "kimi_kl": 0.083984375, + "learning_rate": 2.8599999999999994e-07, + "loss": 0.0012, + "ppl": 0.0167236328125, + "reward": 0.9958297610282898, + "reward_std": 0.0010418068850412965, + "rewards/perpo_ocr_edit_distance_reward": 0.9958297610282898, "step": 2140, "temperature": 0.9 }, { - "advantages": -2.2700856334267883e-05, - "completion_length": 597.0, - "delta_ref_entropy_loss": 0.12451171875, - "delta_ref_ppl": -0.071044921875, - "entropy_loss": -0.1572265625, - "epoch": 0.8564, - "grad_norm": 2.154622675319498, - "k1_kl": 0.071044921875, - "k3_kl": 0.0321044921875, - "kimi_kl": 0.0665283203125, - "learning_rate": 7.18e-08, - "loss": 0.0013, - "ppl": 0.08349609375, - "reward": 0.9309877455234528, - "reward_std": 0.00530119810719043, - "rewards/perpo_ocr_edit_distance_reward": 0.9309878349304199, + "advantages": -1.6825541024445556e-05, + "completion_length": 408.0, + "delta_ref_entropy_loss": 0.169921875, + "delta_ref_ppl": -0.1708984375, + "entropy_loss": -0.25390625, + "epoch": 0.4282, + "grad_norm": 1.9686209345707082, + "k1_kl": 0.1708984375, + "k3_kl": 0.10498046875, + "kimi_kl": 0.2412109375, + "learning_rate": 2.859e-07, + "loss": 0.0042, + "ppl": 0.130859375, + "reward": 0.9023911952972412, + "reward_std": 0.003445847425609827, + "rewards/perpo_ocr_edit_distance_reward": 0.9023913145065308, "step": 2141, "temperature": 0.9 }, { - "advantages": -2.2585904162042425e-05, - "completion_length": 414.5, - "delta_ref_entropy_loss": 0.119873046875, - "delta_ref_ppl": -0.10791015625, - "entropy_loss": -0.118408203125, - "epoch": 0.8568, - "grad_norm": 1.1743224190936556, - "k1_kl": 0.107666015625, - "k3_kl": 0.067626953125, - "kimi_kl": 0.18359375, - "learning_rate": 7.159999999999999e-08, - "loss": 0.0027, - "ppl": 0.06396484375, - "reward": 0.9586078226566315, - "reward_std": 0.0016123080276884139, - "rewards/perpo_ocr_edit_distance_reward": 0.9586078524589539, + "advantages": -1.5054431059979834e-05, + "completion_length": 970.0, + "delta_ref_entropy_loss": 0.0294189453125, + "delta_ref_ppl": -0.041259765625, + "entropy_loss": -0.029541015625, + "epoch": 0.4284, + "grad_norm": 0.4124651436552987, + "k1_kl": 0.041259765625, + "k3_kl": 0.0240478515625, + "kimi_kl": 0.0654296875, + "learning_rate": 2.858e-07, + "loss": 0.001, + "ppl": 0.0107421875, + "reward": 0.994800329208374, + "reward_std": 0.0027284426614642143, + "rewards/perpo_ocr_edit_distance_reward": 0.9948003888130188, "step": 2142, "temperature": 0.9 }, { - "advantages": -2.5323459340143017e-05, - "completion_length": 341.0, - "delta_ref_entropy_loss": 0.047607421875, - "delta_ref_ppl": -0.07635498046875, - "entropy_loss": -0.02581787109375, - "epoch": 0.8572, - "grad_norm": 0.7976731580545655, - "k1_kl": 0.07623291015625, - "k3_kl": 0.05609130859375, - "kimi_kl": 0.2969970703125, - "learning_rate": 7.14e-08, - "loss": 0.0023, - "ppl": 0.0135040283203125, - "reward": 0.9995478987693787, - "reward_std": 0.0006224596290849149, - "rewards/perpo_ocr_edit_distance_reward": 0.9995479583740234, + "advantages": -1.3726098586630542e-05, + "completion_length": 668.0, + "delta_ref_entropy_loss": 0.033203125, + "delta_ref_ppl": -0.046630859375, + "entropy_loss": -0.04833984375, + "epoch": 0.4286, + "grad_norm": 0.6732812273279466, + "k1_kl": 0.046630859375, + "k3_kl": 0.028564453125, + "kimi_kl": 0.08203125, + "learning_rate": 2.8569999999999997e-07, + "loss": 0.0012, + "ppl": 0.022216796875, + "reward": 0.9630219340324402, + "reward_std": 0.001141587970778346, + "rewards/perpo_ocr_edit_distance_reward": 0.963021993637085, "step": 2143, "temperature": 0.9 }, { - "advantages": -2.2930758859729394e-05, - "completion_length": 312.5, - "delta_ref_entropy_loss": 0.10107421875, - "delta_ref_ppl": -0.095703125, - "entropy_loss": -0.0904541015625, - "epoch": 0.8576, - "grad_norm": 0.7501036528464593, - "k1_kl": 0.095947265625, - "k3_kl": 0.056640625, - "kimi_kl": 0.14501953125, - "learning_rate": 7.12e-08, - "loss": 0.0023, - "ppl": 0.049102783203125, - "reward": 0.956026017665863, - "reward_std": 0.001063839066773653, - "rewards/perpo_ocr_edit_distance_reward": 0.9560261070728302, + "advantages": 1.021793991640152e-07, + "completion_length": 1289.0, + "delta_ref_entropy_loss": 0.06640625, + "delta_ref_ppl": -0.0595703125, + "entropy_loss": -0.12060546875, + "epoch": 0.4288, + "grad_norm": 46.80092924324846, + "k1_kl": 0.059814453125, + "k3_kl": 0.04443359375, + "kimi_kl": 0.10205078125, + "learning_rate": 2.856e-07, + "loss": 0.0018, + "ppl": 0.07275390625, + "reward": 0.9311103820800781, + "reward_std": 0.07442644238471985, + "rewards/perpo_ocr_edit_distance_reward": 0.9311104416847229, "step": 2144, "temperature": 0.9 }, { - "advantages": -2.571514755800308e-06, - "completion_length": 222.5, - "delta_ref_entropy_loss": 0.099853515625, - "delta_ref_ppl": -0.090087890625, - "entropy_loss": -0.08984375, - "epoch": 0.858, - "grad_norm": 1.725852868018472, - "k1_kl": 0.090087890625, - "k3_kl": 0.0518798828125, - "kimi_kl": 0.138671875, - "learning_rate": 7.099999999999999e-08, + "advantages": 2.0435878468560986e-05, + "completion_length": 464.0, + "delta_ref_entropy_loss": 0.08447265625, + "delta_ref_ppl": -0.08837890625, + "entropy_loss": -0.057373046875, + "epoch": 0.429, + "grad_norm": 0.4913137490614285, + "k1_kl": 0.08837890625, + "k3_kl": 0.053466796875, + "kimi_kl": 0.154296875, + "learning_rate": 2.8549999999999996e-07, "loss": 0.0021, - "ppl": 0.0467529296875, - "reward": 0.8250018358230591, - "reward_std": 0.03626819257624447, - "rewards/perpo_ocr_edit_distance_reward": 0.8250018656253815, + "ppl": 0.0206298828125, + "reward": 0.9951742887496948, + "reward_std": 0.001150849973782897, + "rewards/perpo_ocr_edit_distance_reward": 0.9951742887496948, "step": 2145, "temperature": 0.9 }, { - "advantages": -6.76959753036499e-05, - "completion_length": 618.5, - "delta_ref_entropy_loss": 0.040283203125, - "delta_ref_ppl": -0.118682861328125, - "entropy_loss": -0.03436279296875, - "epoch": 0.8584, - "grad_norm": 0.2316976874284732, - "k1_kl": 0.118682861328125, - "k3_kl": 0.09326171875, - "kimi_kl": 0.365692138671875, - "learning_rate": 7.08e-08, - "loss": 0.0038, - "ppl": 0.01690673828125, - "reward": 0.9991258084774017, - "reward_std": 0.00017012203170452267, - "rewards/perpo_ocr_edit_distance_reward": 0.9991258382797241, + "advantages": -1.1171613550686743e-05, + "completion_length": 577.0, + "delta_ref_entropy_loss": 0.06494140625, + "delta_ref_ppl": -0.09912109375, + "entropy_loss": -0.1103515625, + "epoch": 0.4292, + "grad_norm": 1.3147530384758908, + "k1_kl": 0.0986328125, + "k3_kl": 0.05859375, + "kimi_kl": 0.1669921875, + "learning_rate": 2.8539999999999995e-07, + "loss": 0.0023, + "ppl": 0.046142578125, + "reward": 0.9566397070884705, + "reward_std": 0.0021875861566513777, + "rewards/perpo_ocr_edit_distance_reward": 0.9566397666931152, "step": 2146, "temperature": 0.9 }, { - "advantages": -3.1569174360868146e-05, - "completion_length": 880.5, - "delta_ref_entropy_loss": 0.03131103515625, - "delta_ref_ppl": -0.024658203125, - "entropy_loss": -0.029693603515625, - "epoch": 0.8588, - "grad_norm": 0.5419381327738043, - "k1_kl": 0.024658203125, - "k3_kl": 0.013824462890625, - "kimi_kl": 0.0303955078125, - "learning_rate": 7.06e-08, - "loss": 0.0006, - "ppl": 0.013916015625, - "reward": 0.9432817101478577, - "reward_std": 0.14657264834386297, - "rewards/perpo_ocr_edit_distance_reward": 0.9432817697525024, + "advantages": -4.320485459174961e-05, + "completion_length": 534.0, + "delta_ref_entropy_loss": 0.06494140625, + "delta_ref_ppl": -0.09716796875, + "entropy_loss": -0.060302734375, + "epoch": 0.4294, + "grad_norm": 0.7044637586838252, + "k1_kl": 0.09716796875, + "k3_kl": 0.06591796875, + "kimi_kl": 0.2021484375, + "learning_rate": 2.853e-07, + "loss": 0.0027, + "ppl": 0.029052734375, + "reward": 0.972443699836731, + "reward_std": 0.0010823190677911043, + "rewards/perpo_ocr_edit_distance_reward": 0.9724438190460205, "step": 2147, "temperature": 0.9 }, { - "advantages": -2.053805837931577e-05, - "completion_length": 395.5, - "delta_ref_entropy_loss": 0.0733642578125, - "delta_ref_ppl": -0.083740234375, - "entropy_loss": -0.036865234375, - "epoch": 0.8592, - "grad_norm": 2.620837117931846, - "k1_kl": 0.08428955078125, - "k3_kl": 0.05712890625, - "kimi_kl": 0.20654296875, - "learning_rate": 7.04e-08, - "loss": 0.0023, - "ppl": 0.014434814453125, - "reward": 0.9912886917591095, - "reward_std": 0.0011169976933160797, - "rewards/perpo_ocr_edit_distance_reward": 0.9912886917591095, + "advantages": -3.1607494747731835e-05, + "completion_length": 1186.0, + "delta_ref_entropy_loss": 0.0279541015625, + "delta_ref_ppl": -0.036865234375, + "entropy_loss": -0.0341796875, + "epoch": 0.4296, + "grad_norm": 0.4272427449149321, + "k1_kl": 0.036865234375, + "k3_kl": 0.02099609375, + "kimi_kl": 0.046630859375, + "learning_rate": 2.852e-07, + "loss": 0.0009, + "ppl": 0.01251220703125, + "reward": 0.9982635974884033, + "reward_std": 0.0009772289777174592, + "rewards/perpo_ocr_edit_distance_reward": 0.9982635974884033, "step": 2148, "temperature": 0.9 }, { - "advantages": -6.533095438499004e-05, - "completion_length": 267.5, - "delta_ref_entropy_loss": 0.0355224609375, - "delta_ref_ppl": -0.178955078125, - "entropy_loss": -0.073974609375, - "epoch": 0.8596, - "grad_norm": 0.30001102004462255, - "k1_kl": 0.17901611328125, - "k3_kl": 0.140838623046875, - "kimi_kl": 0.53265380859375, - "learning_rate": 7.019999999999999e-08, - "loss": 0.0057, - "ppl": 0.0382080078125, - "reward": 0.9951991438865662, - "reward_std": 0.000275874714134261, - "rewards/perpo_ocr_edit_distance_reward": 0.9951992034912109, + "advantages": -1.7029899268550253e-08, + "completion_length": 587.0, + "delta_ref_entropy_loss": 0.05908203125, + "delta_ref_ppl": -0.08154296875, + "entropy_loss": -0.0400390625, + "epoch": 0.4298, + "grad_norm": 0.3913042824299826, + "k1_kl": 0.08154296875, + "k3_kl": 0.048583984375, + "kimi_kl": 0.146484375, + "learning_rate": 2.8510000000000003e-07, + "loss": 0.0019, + "ppl": 0.0155029296875, + "reward": 0.7817045450210571, + "reward_std": 0.000609444105066359, + "rewards/perpo_ocr_edit_distance_reward": 0.7817045450210571, "step": 2149, "temperature": 0.9 }, { - "advantages": -1.7268318060814636e-05, - "completion_length": 298.0, - "delta_ref_entropy_loss": 0.107177734375, - "delta_ref_ppl": -0.11181640625, - "entropy_loss": -0.15087890625, - "epoch": 0.86, - "grad_norm": 1.7365478078670404, - "k1_kl": 0.111328125, - "k3_kl": 0.0731201171875, - "kimi_kl": 0.284423828125, - "learning_rate": 7e-08, - "loss": 0.0029, - "ppl": 0.0899658203125, - "reward": 0.8250581622123718, - "reward_std": 0.0049760539550334215, - "rewards/perpo_ocr_edit_distance_reward": 0.8250582814216614, + "advantages": -2.175569716200698e-05, + "completion_length": 757.0, + "delta_ref_entropy_loss": 0.037841796875, + "delta_ref_ppl": -0.054931640625, + "entropy_loss": -0.0279541015625, + "epoch": 0.43, + "grad_norm": 0.6045824519763833, + "k1_kl": 0.0546875, + "k3_kl": 0.034912109375, + "kimi_kl": 0.1015625, + "learning_rate": 2.8499999999999997e-07, + "loss": 0.0014, + "ppl": 0.0107421875, + "reward": 0.9844542145729065, + "reward_std": 0.0014677209546789527, + "rewards/perpo_ocr_edit_distance_reward": 0.9844542741775513, "step": 2150, "temperature": 0.9 }, { - "advantages": -5.937048763371422e-05, - "completion_length": 607.5, - "delta_ref_entropy_loss": 0.0709228515625, - "delta_ref_ppl": -0.05059814453125, - "entropy_loss": -0.05328369140625, - "epoch": 0.8604, - "grad_norm": 1.8367615285128815, - "k1_kl": 0.0504150390625, - "k3_kl": 0.039825439453125, - "kimi_kl": 0.05987548828125, - "learning_rate": 6.98e-08, - "loss": 0.0017, - "ppl": 0.030548095703125, - "reward": 0.9339708685874939, - "reward_std": 0.0012746556021738797, - "rewards/perpo_ocr_edit_distance_reward": 0.9339709281921387, + "advantages": -0.00012845653691329062, + "completion_length": 372.0, + "delta_ref_entropy_loss": 0.0311279296875, + "delta_ref_ppl": -0.06396484375, + "entropy_loss": -0.022216796875, + "epoch": 0.4302, + "grad_norm": 0.4156782181563176, + "k1_kl": 0.064453125, + "k3_kl": 0.048095703125, + "kimi_kl": 0.2021484375, + "learning_rate": 2.8489999999999996e-07, + "loss": 0.0021, + "ppl": 0.005523681640625, + "reward": 0.9964224100112915, + "reward_std": 0.0004302305751480162, + "rewards/perpo_ocr_edit_distance_reward": 0.996422529220581, "step": 2151, "temperature": 0.9 }, { - "advantages": -2.2794519736635266e-05, - "completion_length": 203.0, - "delta_ref_entropy_loss": 0.107177734375, - "delta_ref_ppl": -0.126220703125, - "entropy_loss": -0.09130859375, - "epoch": 0.8608, - "grad_norm": 4.428152617176973, - "k1_kl": 0.12548828125, - "k3_kl": 0.07373046875, - "kimi_kl": 0.18017578125, - "learning_rate": 6.959999999999999e-08, - "loss": 0.003, - "ppl": 0.04534912109375, - "reward": 0.9769049882888794, - "reward_std": 0.002408224099781364, - "rewards/perpo_ocr_edit_distance_reward": 0.9769050478935242, + "advantages": -3.950936661567539e-05, + "completion_length": 926.0, + "delta_ref_entropy_loss": 0.02001953125, + "delta_ref_ppl": -0.044677734375, + "entropy_loss": -0.059814453125, + "epoch": 0.4304, + "grad_norm": 0.7285725228004103, + "k1_kl": 0.044677734375, + "k3_kl": 0.031005859375, + "kimi_kl": 0.083984375, + "learning_rate": 2.848e-07, + "loss": 0.0013, + "ppl": 0.03173828125, + "reward": 0.9840182662010193, + "reward_std": 0.0016251287888735533, + "rewards/perpo_ocr_edit_distance_reward": 0.9840183258056641, "step": 2152, "temperature": 0.9 }, { - "advantages": -2.767358679989229e-07, - "completion_length": 411.0, - "delta_ref_entropy_loss": 0.0618896484375, - "delta_ref_ppl": -0.0611572265625, - "entropy_loss": -0.04095458984375, - "epoch": 0.8612, - "grad_norm": 1.4711817500317699, - "k1_kl": 0.0614013671875, - "k3_kl": 0.039306640625, - "kimi_kl": 0.12060546875, - "learning_rate": 6.94e-08, - "loss": 0.0016, - "ppl": 0.019073486328125, - "reward": 0.7254562973976135, - "reward_std": 0.03195439209230244, - "rewards/perpo_ocr_edit_distance_reward": 0.7254563271999359, + "advantages": -2.806527481880039e-05, + "completion_length": 695.0, + "delta_ref_entropy_loss": 0.033935546875, + "delta_ref_ppl": -0.049072265625, + "entropy_loss": -0.033447265625, + "epoch": 0.4306, + "grad_norm": 0.36760483733413585, + "k1_kl": 0.049072265625, + "k3_kl": 0.03173828125, + "kimi_kl": 0.09228515625, + "learning_rate": 2.847e-07, + "loss": 0.0013, + "ppl": 0.01190185546875, + "reward": 0.9919251203536987, + "reward_std": 0.002330339979380369, + "rewards/perpo_ocr_edit_distance_reward": 0.9919252395629883, "step": 2153, "temperature": 0.9 }, { - "advantages": -1.54674057171178e-05, - "completion_length": 876.0, - "delta_ref_entropy_loss": 0.0482177734375, - "delta_ref_ppl": -0.0355224609375, - "entropy_loss": -0.0440673828125, - "epoch": 0.8616, - "grad_norm": 0.6654380179859642, - "k1_kl": 0.035400390625, - "k3_kl": 0.01898193359375, - "kimi_kl": 0.05224609375, - "learning_rate": 6.92e-08, - "loss": 0.0008, - "ppl": 0.0225830078125, - "reward": 0.9275022745132446, - "reward_std": 0.005482407344970852, - "rewards/perpo_ocr_edit_distance_reward": 0.927502304315567, + "advantages": -1.679148044786416e-05, + "completion_length": 555.0, + "delta_ref_entropy_loss": 0.07080078125, + "delta_ref_ppl": -0.05126953125, + "entropy_loss": -0.1396484375, + "epoch": 0.4308, + "grad_norm": 2.1649173956604155, + "k1_kl": 0.051025390625, + "k3_kl": 0.0294189453125, + "kimi_kl": 0.06005859375, + "learning_rate": 2.846e-07, + "loss": 0.0012, + "ppl": 0.0673828125, + "reward": 0.5002557039260864, + "reward_std": 0.0006603420479223132, + "rewards/perpo_ocr_edit_distance_reward": 0.5002557635307312, "step": 2154, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 277.5, - "delta_ref_entropy_loss": 0.0390625, - "delta_ref_ppl": -0.016845703125, - "entropy_loss": -0.0166015625, - "epoch": 0.862, - "grad_norm": 0.0174876804229059, - "k1_kl": 0.0169677734375, - "k3_kl": 0.0065460205078125, - "kimi_kl": 0.0123138427734375, - "learning_rate": 6.900000000000001e-08, - "loss": 0.0003, - "ppl": 0.00467681884765625, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": -3.2612257200526074e-05, + "completion_length": 799.0, + "delta_ref_entropy_loss": 0.0322265625, + "delta_ref_ppl": -0.052978515625, + "entropy_loss": -0.035888671875, + "epoch": 0.431, + "grad_norm": 0.46968609389340404, + "k1_kl": 0.052978515625, + "k3_kl": 0.040283203125, + "kimi_kl": 0.1328125, + "learning_rate": 2.845e-07, + "loss": 0.0016, + "ppl": 0.01513671875, + "reward": 0.9908532500267029, + "reward_std": 0.001727648195810616, + "rewards/perpo_ocr_edit_distance_reward": 0.9908533692359924, "step": 2155, "temperature": 0.9 }, { - "advantages": 2.601742835395271e-05, - "completion_length": 419.0, - "delta_ref_entropy_loss": 0.047607421875, - "delta_ref_ppl": -0.05010986328125, - "entropy_loss": -0.047119140625, - "epoch": 0.8624, - "grad_norm": 0.5896354193683055, - "k1_kl": 0.04986572265625, - "k3_kl": 0.033203125, - "kimi_kl": 0.139495849609375, - "learning_rate": 6.88e-08, - "loss": 0.0013, - "ppl": 0.023681640625, - "reward": 0.9974522888660431, - "reward_std": 0.0007280269055627286, - "rewards/perpo_ocr_edit_distance_reward": 0.9974522888660431, + "advantages": -3.4059798537100505e-08, + "completion_length": 1147.0, + "delta_ref_entropy_loss": 0.031982421875, + "delta_ref_ppl": -0.158203125, + "entropy_loss": -0.458984375, + "epoch": 0.4312, + "grad_norm": 8.405640844830566, + "k1_kl": 0.1572265625, + "k3_kl": 0.150390625, + "kimi_kl": 0.337890625, + "learning_rate": 2.844e-07, + "loss": 0.006, + "ppl": 0.2265625, + "reward": 0.47906044125556946, + "reward_std": 0.30553850531578064, + "rewards/perpo_ocr_edit_distance_reward": 0.47906047105789185, "step": 2156, "temperature": 0.9 }, { - "advantages": -0.00029844897136399595, - "completion_length": 903.0, - "delta_ref_entropy_loss": 0.02777099609375, - "delta_ref_ppl": -0.02203369140625, - "entropy_loss": -0.03851318359375, - "epoch": 0.8628, - "grad_norm": 4.017346278086665, - "k1_kl": 0.02197265625, - "k3_kl": 0.0135498046875, - "kimi_kl": 0.03033447265625, - "learning_rate": 6.859999999999999e-08, - "loss": 0.0008, - "ppl": 0.01983642578125, - "reward": 0.6861927360296249, - "reward_std": 0.024115683510899544, - "rewards/perpo_ocr_edit_distance_reward": 0.6861928254365921, + "advantages": -0.00013818060688208789, + "completion_length": 569.0, + "delta_ref_entropy_loss": 0.041259765625, + "delta_ref_ppl": -0.048828125, + "entropy_loss": -0.0252685546875, + "epoch": 0.4314, + "grad_norm": 0.18459738481428611, + "k1_kl": 0.048828125, + "k3_kl": 0.02783203125, + "kimi_kl": 0.083984375, + "learning_rate": 2.8429999999999997e-07, + "loss": 0.0013, + "ppl": 0.0093994140625, + "reward": 0.9827794432640076, + "reward_std": 0.0003929099766537547, + "rewards/perpo_ocr_edit_distance_reward": 0.9827795028686523, "step": 2157, "temperature": 0.9 }, { - "advantages": -0.00012619581229955656, - "completion_length": 576.5, - "delta_ref_entropy_loss": 0.03399658203125, - "delta_ref_ppl": -0.03076171875, - "entropy_loss": -0.02850341796875, - "epoch": 0.8632, - "grad_norm": 0.3767176238519734, - "k1_kl": 0.03076171875, - "k3_kl": 0.019866943359375, - "kimi_kl": 0.08123779296875, - "learning_rate": 6.84e-08, - "loss": 0.0009, - "ppl": 0.0113677978515625, - "reward": 0.9851986169815063, - "reward_std": 0.0002334086166229099, - "rewards/perpo_ocr_edit_distance_reward": 0.9851986765861511, + "advantages": -2.043587983280304e-06, + "completion_length": 456.0, + "delta_ref_entropy_loss": 0.11865234375, + "delta_ref_ppl": -0.126953125, + "entropy_loss": -0.15625, + "epoch": 0.4316, + "grad_norm": 1.6685865115026526, + "k1_kl": 0.1279296875, + "k3_kl": 0.076171875, + "kimi_kl": 0.1875, + "learning_rate": 2.842e-07, + "loss": 0.003, + "ppl": 0.080078125, + "reward": 0.9535157084465027, + "reward_std": 0.01650400459766388, + "rewards/perpo_ocr_edit_distance_reward": 0.9535157680511475, "step": 2158, "temperature": 0.9 }, { - "advantages": -3.7806375985383056e-06, - "completion_length": 862.5, - "delta_ref_entropy_loss": 0.044342041015625, - "delta_ref_ppl": -0.039581298828125, - "entropy_loss": -0.036041259765625, - "epoch": 0.8636, - "grad_norm": 3.648027108158921, - "k1_kl": 0.039825439453125, - "k3_kl": 0.0564727783203125, - "kimi_kl": 0.0677490234375, - "learning_rate": 6.819999999999999e-08, - "loss": 0.0023, - "ppl": 0.019287109375, - "reward": 0.7836778461933136, - "reward_std": 0.005582986865192652, - "rewards/perpo_ocr_edit_distance_reward": 0.783677875995636, + "advantages": -3.392355938558467e-05, + "completion_length": 799.0, + "delta_ref_entropy_loss": 0.0257568359375, + "delta_ref_ppl": -0.046142578125, + "entropy_loss": -0.041748046875, + "epoch": 0.4318, + "grad_norm": 0.732463619877424, + "k1_kl": 0.046142578125, + "k3_kl": 0.031982421875, + "kimi_kl": 0.08251953125, + "learning_rate": 2.841e-07, + "loss": 0.0013, + "ppl": 0.0179443359375, + "reward": 0.9966434836387634, + "reward_std": 0.0009039691649377346, + "rewards/perpo_ocr_edit_distance_reward": 0.9966435432434082, "step": 2159, "temperature": 0.9 }, { - "advantages": -3.12796674393212e-05, - "completion_length": 516.5, - "delta_ref_entropy_loss": 0.0198974609375, - "delta_ref_ppl": -0.01971435546875, - "entropy_loss": -0.01934814453125, - "epoch": 0.864, - "grad_norm": 0.36583995205979847, - "k1_kl": 0.01971435546875, - "k3_kl": 0.011993408203125, - "kimi_kl": 0.02520751953125, - "learning_rate": 6.8e-08, - "loss": 0.0005, - "ppl": 0.0088043212890625, - "reward": 0.9895999431610107, - "reward_std": 0.016101223591249436, - "rewards/perpo_ocr_edit_distance_reward": 0.9896000325679779, + "advantages": 8.514949634275126e-09, + "completion_length": 108.0, + "delta_ref_entropy_loss": 0.0113525390625, + "delta_ref_ppl": -0.1923828125, + "entropy_loss": -0.055419921875, + "epoch": 0.432, + "grad_norm": 1.3444256701117072, + "k1_kl": 0.19140625, + "k3_kl": 0.1572265625, + "kimi_kl": 0.73046875, + "learning_rate": 2.8399999999999995e-07, + "loss": 0.0063, + "ppl": 0.02490234375, + "reward": 0.9975053071975708, + "reward_std": 0.0015068607171997428, + "rewards/perpo_ocr_edit_distance_reward": 0.9975053668022156, "step": 2160, "temperature": 0.9 }, { - "advantages": -0.0002172355270886328, - "completion_length": 825.0, - "delta_ref_entropy_loss": 0.05364990234375, - "delta_ref_ppl": -0.033447265625, - "entropy_loss": -0.04229736328125, - "epoch": 0.8644, - "grad_norm": 0.6636525706055889, - "k1_kl": 0.033477783203125, - "k3_kl": 0.018463134765625, - "kimi_kl": 0.04534912109375, - "learning_rate": 6.78e-08, - "loss": 0.001, - "ppl": 0.018402099609375, - "reward": 0.9820029735565186, - "reward_std": 0.002144313828466693, - "rewards/perpo_ocr_edit_distance_reward": 0.9820030331611633, + "advantages": -1.532690987460228e-07, + "completion_length": 214.0, + "delta_ref_entropy_loss": 0.0240478515625, + "delta_ref_ppl": -0.2197265625, + "entropy_loss": -0.1953125, + "epoch": 0.4322, + "grad_norm": 3.3779385009368847, + "k1_kl": 0.2197265625, + "k3_kl": 0.1728515625, + "kimi_kl": 0.71484375, + "learning_rate": 2.839e-07, + "loss": 0.0069, + "ppl": 0.08837890625, + "reward": 0.7138903737068176, + "reward_std": 0.12979809939861298, + "rewards/perpo_ocr_edit_distance_reward": 0.7138903737068176, "step": 2161, "temperature": 0.9 }, { - "advantages": -0.00031243051853380166, - "completion_length": 384.5, - "delta_ref_entropy_loss": 0.036865234375, - "delta_ref_ppl": -0.0316162109375, - "entropy_loss": -0.015838623046875, - "epoch": 0.8648, - "grad_norm": 0.5837082518280722, - "k1_kl": 0.0316162109375, - "k3_kl": 0.021820068359375, - "kimi_kl": 0.08221435546875, - "learning_rate": 6.76e-08, - "loss": 0.0012, - "ppl": 0.00601959228515625, - "reward": 0.9987977743148804, - "reward_std": 0.00039350774022750556, - "rewards/perpo_ocr_edit_distance_reward": 0.9987978041172028, + "advantages": -1.3623919585370459e-05, + "completion_length": 377.0, + "delta_ref_entropy_loss": 0.0859375, + "delta_ref_ppl": -0.1318359375, + "entropy_loss": -0.052734375, + "epoch": 0.4324, + "grad_norm": 0.7432468835215782, + "k1_kl": 0.1318359375, + "k3_kl": 0.08642578125, + "kimi_kl": 0.337890625, + "learning_rate": 2.838e-07, + "loss": 0.0035, + "ppl": 0.020263671875, + "reward": 0.9929491877555847, + "reward_std": 0.001772869611158967, + "rewards/perpo_ocr_edit_distance_reward": 0.9929491281509399, "step": 2162, "temperature": 0.9 }, { - "advantages": -4.802431817552133e-06, - "completion_length": 550.5, - "delta_ref_entropy_loss": 0.0543212890625, - "delta_ref_ppl": -0.0557861328125, - "entropy_loss": -0.141845703125, - "epoch": 0.8652, - "grad_norm": 1.3207462231770197, - "k1_kl": 0.0557861328125, - "k3_kl": 0.0367431640625, - "kimi_kl": 0.099853515625, - "learning_rate": 6.74e-08, - "loss": 0.0015, - "ppl": 0.0797119140625, - "reward": 0.8773854672908783, - "reward_std": 0.027203008998185396, - "rewards/perpo_ocr_edit_distance_reward": 0.8773855865001678, + "advantages": 4.1774343117140234e-05, + "completion_length": 670.0, + "delta_ref_entropy_loss": 0.039794921875, + "delta_ref_ppl": -0.05126953125, + "entropy_loss": -0.0230712890625, + "epoch": 0.4326, + "grad_norm": 0.3169352227054454, + "k1_kl": 0.05126953125, + "k3_kl": 0.027587890625, + "kimi_kl": 0.0673828125, + "learning_rate": 2.837e-07, + "loss": 0.0011, + "ppl": 0.0078125, + "reward": 0.9959554076194763, + "reward_std": 0.0007154308841563761, + "rewards/perpo_ocr_edit_distance_reward": 0.9959553480148315, "step": 2163, "temperature": 0.9 }, { - "advantages": -0.00011952860677411081, - "completion_length": 446.5, - "delta_ref_entropy_loss": 0.0252685546875, - "delta_ref_ppl": -0.018798828125, - "entropy_loss": -0.01947021484375, - "epoch": 0.8656, - "grad_norm": 0.46037403412522315, - "k1_kl": 0.018798828125, - "k3_kl": 0.010162353515625, - "kimi_kl": 0.0225830078125, - "learning_rate": 6.719999999999999e-08, - "loss": 0.0005, - "ppl": 0.008819580078125, - "reward": 0.9998643100261688, - "reward_std": 0.0002812077655107714, - "rewards/perpo_ocr_edit_distance_reward": 0.9998643100261688, + "advantages": 3.0211042030714452e-05, + "completion_length": 445.0, + "delta_ref_entropy_loss": 0.049072265625, + "delta_ref_ppl": -0.06982421875, + "entropy_loss": -0.04150390625, + "epoch": 0.4328, + "grad_norm": 0.7345249135980765, + "k1_kl": 0.06982421875, + "k3_kl": 0.046142578125, + "kimi_kl": 0.1572265625, + "learning_rate": 2.836e-07, + "loss": 0.0018, + "ppl": 0.01806640625, + "reward": 0.9942044615745544, + "reward_std": 0.0007451012497767806, + "rewards/perpo_ocr_edit_distance_reward": 0.9942044019699097, "step": 2164, "temperature": 0.9 }, { - "advantages": -7.577453652629629e-05, - "completion_length": 319.0, - "delta_ref_entropy_loss": 0.07012939453125, - "delta_ref_ppl": -0.4080810546875, - "entropy_loss": -0.1041259765625, - "epoch": 0.866, - "grad_norm": 0.41565063911590755, - "k1_kl": 0.4080810546875, - "k3_kl": 0.3399658203125, - "kimi_kl": 1.67431640625, - "learning_rate": 6.7e-08, - "loss": 0.0137, - "ppl": 0.044036865234375, - "reward": 0.6196449771523476, - "reward_std": 0.00014665971684735268, - "rewards/perpo_ocr_edit_distance_reward": 0.6196450367569923, + "advantages": -6.007084084558301e-05, + "completion_length": 1010.0, + "delta_ref_entropy_loss": 0.045166015625, + "delta_ref_ppl": -0.06640625, + "entropy_loss": -0.058349609375, + "epoch": 0.433, + "grad_norm": 3.661925501533777, + "k1_kl": 0.06689453125, + "k3_kl": 0.046142578125, + "kimi_kl": 0.12890625, + "learning_rate": 2.8349999999999996e-07, + "loss": 0.0019, + "ppl": 0.02978515625, + "reward": 0.9963082075119019, + "reward_std": 0.0011757742613554, + "rewards/perpo_ocr_edit_distance_reward": 0.9963083267211914, "step": 2165, "temperature": 0.9 }, { - "advantages": -2.680506105434688e-05, - "completion_length": 367.0, - "delta_ref_entropy_loss": 0.02923583984375, - "delta_ref_ppl": -0.0472412109375, - "entropy_loss": -0.03271484375, - "epoch": 0.8664, - "grad_norm": 0.9971705299227102, - "k1_kl": 0.04736328125, - "k3_kl": 0.038330078125, - "kimi_kl": 0.19677734375, - "learning_rate": 6.679999999999999e-08, + "advantages": -1.7649361325311475e-05, + "completion_length": 1147.0, + "delta_ref_entropy_loss": 0.05078125, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.048828125, + "epoch": 0.4332, + "grad_norm": 0.567665956543139, + "k1_kl": 0.06689453125, + "k3_kl": 0.039794921875, + "kimi_kl": 0.09765625, + "learning_rate": 2.8339999999999996e-07, "loss": 0.0016, - "ppl": 0.0167236328125, - "reward": 0.9908470511436462, - "reward_std": 0.011630409018835053, - "rewards/perpo_ocr_edit_distance_reward": 0.990847110748291, + "ppl": 0.020263671875, + "reward": 0.9790873527526855, + "reward_std": 0.002310709562152624, + "rewards/perpo_ocr_edit_distance_reward": 0.9790874123573303, "step": 2166, "temperature": 0.9 }, { - "advantages": -3.3634050851105712e-06, - "completion_length": 707.5, - "delta_ref_entropy_loss": 0.0274658203125, - "delta_ref_ppl": -0.046142578125, - "entropy_loss": -0.0257568359375, - "epoch": 0.8668, - "grad_norm": 0.479132776815766, - "k1_kl": 0.046142578125, - "k3_kl": 0.036834716796875, - "kimi_kl": 0.126220703125, - "learning_rate": 6.66e-08, - "loss": 0.0015, - "ppl": 0.014373779296875, - "reward": 0.9984830617904663, - "reward_std": 0.0007483865774702281, - "rewards/perpo_ocr_edit_distance_reward": 0.9984830915927887, + "advantages": -1.3692038919543847e-05, + "completion_length": 284.0, + "delta_ref_entropy_loss": 0.062255859375, + "delta_ref_ppl": -0.16796875, + "entropy_loss": -0.1279296875, + "epoch": 0.4334, + "grad_norm": 1.474042281710048, + "k1_kl": 0.16796875, + "k3_kl": 0.12109375, + "kimi_kl": 0.41015625, + "learning_rate": 2.833e-07, + "loss": 0.0049, + "ppl": 0.05615234375, + "reward": 0.9745688438415527, + "reward_std": 0.007368716411292553, + "rewards/perpo_ocr_edit_distance_reward": 0.9745689630508423, "step": 2167, "temperature": 0.9 }, { - "advantages": -0.0002983042172104433, - "completion_length": 557.0, - "delta_ref_entropy_loss": 0.0262451171875, - "delta_ref_ppl": -0.05023193359375, - "entropy_loss": -0.0989990234375, - "epoch": 0.8672, - "grad_norm": 1.406509940822295, - "k1_kl": 0.0501708984375, - "k3_kl": 0.03515625, - "kimi_kl": 0.0859375, - "learning_rate": 6.64e-08, - "loss": 0.0017, - "ppl": 0.0515899658203125, - "reward": 0.8228469491004944, - "reward_std": 0.046457622200250626, - "rewards/perpo_ocr_edit_distance_reward": 0.8228470087051392, + "advantages": -8.412770512222778e-06, + "completion_length": 842.0, + "delta_ref_entropy_loss": 0.060546875, + "delta_ref_ppl": -0.07470703125, + "entropy_loss": -0.140625, + "epoch": 0.4336, + "grad_norm": 2.3963517942009465, + "k1_kl": 0.07421875, + "k3_kl": 0.052001953125, + "kimi_kl": 0.10595703125, + "learning_rate": 2.832e-07, + "loss": 0.0021, + "ppl": 0.07861328125, + "reward": 0.7694135308265686, + "reward_std": 0.007981101050972939, + "rewards/perpo_ocr_edit_distance_reward": 0.7694135904312134, "step": 2168, "temperature": 0.9 }, { - "advantages": -0.00010449972199211288, - "completion_length": 521.5, - "delta_ref_entropy_loss": 0.043304443359375, - "delta_ref_ppl": -0.08453369140625, - "entropy_loss": -0.1429443359375, - "epoch": 0.8676, - "grad_norm": 3.448507897743662, - "k1_kl": 0.08453369140625, - "k3_kl": 0.09722900390625, - "kimi_kl": 0.191864013671875, - "learning_rate": 6.62e-08, - "loss": 0.004, - "ppl": 0.0847930908203125, - "reward": 0.5882942825555801, - "reward_std": 0.037459910578036215, - "rewards/perpo_ocr_edit_distance_reward": 0.5882943123579025, + "advantages": -4.845006515097339e-06, + "completion_length": 189.0, + "delta_ref_entropy_loss": 0.11083984375, + "delta_ref_ppl": -0.19921875, + "entropy_loss": -0.09326171875, + "epoch": 0.4338, + "grad_norm": 0.9779089003447987, + "k1_kl": 0.2001953125, + "k3_kl": 0.142578125, + "kimi_kl": 0.51171875, + "learning_rate": 2.831e-07, + "loss": 0.0057, + "ppl": 0.0390625, + "reward": 0.9792570471763611, + "reward_std": 0.0034283215645700693, + "rewards/perpo_ocr_edit_distance_reward": 0.9792571067810059, "step": 2169, "temperature": 0.9 }, { - "advantages": -7.151919089665171e-05, - "completion_length": 523.5, - "delta_ref_entropy_loss": 0.021881103515625, - "delta_ref_ppl": -0.019989013671875, - "entropy_loss": -0.02783203125, - "epoch": 0.868, - "grad_norm": 0.5738375386811485, - "k1_kl": 0.019989013671875, - "k3_kl": 0.012603759765625, - "kimi_kl": 0.03125, - "learning_rate": 6.6e-08, - "loss": 0.0006, - "ppl": 0.012603759765625, - "reward": 0.8853577673435211, - "reward_std": 0.0004286827752366662, - "rewards/perpo_ocr_edit_distance_reward": 0.8853578269481659, + "advantages": -1.745564645716513e-06, + "completion_length": 279.0, + "delta_ref_entropy_loss": 0.07861328125, + "delta_ref_ppl": -0.09326171875, + "entropy_loss": -0.1982421875, + "epoch": 0.434, + "grad_norm": 3.455529785447502, + "k1_kl": 0.09326171875, + "k3_kl": 0.060791015625, + "kimi_kl": 0.1318359375, + "learning_rate": 2.83e-07, + "loss": 0.0024, + "ppl": 0.0810546875, + "reward": 0.5772949457168579, + "reward_std": 0.029108118265867233, + "rewards/perpo_ocr_edit_distance_reward": 0.5772950053215027, "step": 2170, "temperature": 0.9 }, { - "advantages": -0.0005960464477539062, - "completion_length": 308.5, - "delta_ref_entropy_loss": 0.02435302734375, - "delta_ref_ppl": -0.03118896484375, - "entropy_loss": -0.01751708984375, - "epoch": 0.8684, - "grad_norm": 0.010042369632626382, - "k1_kl": 0.03106689453125, - "k3_kl": 0.017974853515625, - "kimi_kl": 0.037841796875, - "learning_rate": 6.58e-08, - "loss": 0.0013, - "ppl": 0.00399017333984375, - "reward": 0.9974380433559418, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9974381327629089, + "advantages": -7.578304916933121e-07, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.0128173828125, + "delta_ref_ppl": -0.02197265625, + "entropy_loss": -0.0625, + "epoch": 0.4342, + "grad_norm": 1.1704728096528179, + "k1_kl": 0.02197265625, + "k3_kl": 0.017333984375, + "kimi_kl": 0.0308837890625, + "learning_rate": 2.8289999999999997e-07, + "loss": 0.0007, + "ppl": 0.033203125, + "reward": 0.8914487957954407, + "reward_std": 0.022792084142565727, + "rewards/perpo_ocr_edit_distance_reward": 0.8914488554000854, "step": 2171, "temperature": 0.9 }, { - "advantages": -4.189355422568042e-06, - "completion_length": 357.5, - "delta_ref_entropy_loss": 0.0762939453125, - "delta_ref_ppl": -0.05072021484375, - "entropy_loss": -0.10321044921875, - "epoch": 0.8688, - "grad_norm": 0.9778878703691534, - "k1_kl": 0.0507354736328125, - "k3_kl": 0.034332275390625, - "kimi_kl": 0.08136749267578125, - "learning_rate": 6.56e-08, - "loss": 0.0014, - "ppl": 0.05938720703125, - "reward": 0.9193652272224426, - "reward_std": 0.0024893844965845346, - "rewards/perpo_ocr_edit_distance_reward": 0.9193652868270874, + "advantages": -4.029273986816406e-05, + "completion_length": 691.0, + "delta_ref_entropy_loss": 0.04296875, + "delta_ref_ppl": -0.046875, + "entropy_loss": -0.031494140625, + "epoch": 0.4344, + "grad_norm": 0.5906726739135256, + "k1_kl": 0.046875, + "k3_kl": 0.027587890625, + "kimi_kl": 0.08154296875, + "learning_rate": 2.8279999999999996e-07, + "loss": 0.0011, + "ppl": 0.013427734375, + "reward": 0.9934792518615723, + "reward_std": 0.0011681539472192526, + "rewards/perpo_ocr_edit_distance_reward": 0.993479311466217, "step": 2172, "temperature": 0.9 }, { - "advantages": -2.7750219942390686e-05, - "completion_length": 654.0, - "delta_ref_entropy_loss": 0.0418701171875, - "delta_ref_ppl": -0.0416259765625, - "entropy_loss": -0.03326416015625, - "epoch": 0.8692, - "grad_norm": 0.6719542951622355, - "k1_kl": 0.0416259765625, - "k3_kl": 0.0264892578125, - "kimi_kl": 0.0830078125, - "learning_rate": 6.54e-08, - "loss": 0.0011, - "ppl": 0.017333984375, - "reward": 0.9739047288894653, - "reward_std": 0.0017877223872346804, - "rewards/perpo_ocr_edit_distance_reward": 0.9739047884941101, + "advantages": -2.183233118557837e-05, + "completion_length": 833.0, + "delta_ref_entropy_loss": 0.099609375, + "delta_ref_ppl": -0.109375, + "entropy_loss": -0.2490234375, + "epoch": 0.4346, + "grad_norm": 1.7368628010735692, + "k1_kl": 0.109375, + "k3_kl": 0.0625, + "kimi_kl": 0.12890625, + "learning_rate": 2.827e-07, + "loss": 0.0025, + "ppl": 0.1337890625, + "reward": 0.6743168830871582, + "reward_std": 0.003021965268999338, + "rewards/perpo_ocr_edit_distance_reward": 0.6743170022964478, "step": 2173, "temperature": 0.9 }, { - "advantages": -3.616511873727468e-05, - "completion_length": 452.0, - "delta_ref_entropy_loss": 0.0787353515625, - "delta_ref_ppl": -0.0513916015625, - "entropy_loss": -0.06011962890625, - "epoch": 0.8696, - "grad_norm": 1.1633078201473797, - "k1_kl": 0.0516357421875, - "k3_kl": 0.027618408203125, - "kimi_kl": 0.063934326171875, - "learning_rate": 6.519999999999999e-08, - "loss": 0.0011, - "ppl": 0.0319671630859375, - "reward": 0.9787687659263611, - "reward_std": 0.0017862248787423596, - "rewards/perpo_ocr_edit_distance_reward": 0.9787688255310059, + "advantages": -5.313328529155115e-06, + "completion_length": 730.0, + "delta_ref_entropy_loss": 0.12890625, + "delta_ref_ppl": -0.10791015625, + "entropy_loss": -0.1943359375, + "epoch": 0.4348, + "grad_norm": 2.5703341870387, + "k1_kl": 0.10791015625, + "k3_kl": 0.06298828125, + "kimi_kl": 0.1435546875, + "learning_rate": 2.826e-07, + "loss": 0.0025, + "ppl": 0.0986328125, + "reward": 0.9117313027381897, + "reward_std": 0.00150083948392421, + "rewards/perpo_ocr_edit_distance_reward": 0.9117313623428345, "step": 2174, "temperature": 0.9 }, { - "advantages": -1.9584385881898925e-06, - "completion_length": 653.0, - "delta_ref_entropy_loss": 0.088134765625, - "delta_ref_ppl": -0.0693359375, - "entropy_loss": -0.1455078125, - "epoch": 0.87, - "grad_norm": 1.2215259363487463, - "k1_kl": 0.069091796875, - "k3_kl": 0.0362548828125, - "kimi_kl": 0.0693359375, - "learning_rate": 6.5e-08, - "loss": 0.0015, - "ppl": 0.08642578125, - "reward": 0.8979589939117432, - "reward_std": 0.004195212619379163, - "rewards/perpo_ocr_edit_distance_reward": 0.8979590535163879, + "advantages": -5.449567765936081e-07, + "completion_length": 1278.0, + "delta_ref_entropy_loss": -0.0012054443359375, + "delta_ref_ppl": -0.019287109375, + "entropy_loss": -0.014892578125, + "epoch": 0.435, + "grad_norm": 0.4420231831193762, + "k1_kl": 0.019287109375, + "k3_kl": 0.014892578125, + "kimi_kl": 0.05859375, + "learning_rate": 2.8249999999999994e-07, + "loss": 0.0006, + "ppl": 0.004852294921875, + "reward": 0.9621405601501465, + "reward_std": 0.04625962674617767, + "rewards/perpo_ocr_edit_distance_reward": 0.962140679359436, "step": 2175, "temperature": 0.9 }, { - "advantages": -7.201731516803811e-05, - "completion_length": 638.5, - "delta_ref_entropy_loss": 0.03619384765625, - "delta_ref_ppl": -0.031494140625, - "entropy_loss": -0.019317626953125, - "epoch": 0.8704, - "grad_norm": 0.3924692147942319, - "k1_kl": 0.0316162109375, - "k3_kl": 0.01904296875, - "kimi_kl": 0.07373046875, - "learning_rate": 6.48e-08, - "loss": 0.0008, - "ppl": 0.0084228515625, - "reward": 0.9960536062717438, - "reward_std": 0.0003687888092827052, - "rewards/perpo_ocr_edit_distance_reward": 0.9960536658763885, + "advantages": -5.687986231350806e-06, + "completion_length": 203.0, + "delta_ref_entropy_loss": 0.030517578125, + "delta_ref_ppl": -0.09228515625, + "entropy_loss": -0.06982421875, + "epoch": 0.4352, + "grad_norm": 1.5297746554875702, + "k1_kl": 0.09228515625, + "k3_kl": 0.068359375, + "kimi_kl": 0.2138671875, + "learning_rate": 2.824e-07, + "loss": 0.0027, + "ppl": 0.0247802734375, + "reward": 0.9812375903129578, + "reward_std": 0.007338220719248056, + "rewards/perpo_ocr_edit_distance_reward": 0.9812377095222473, "step": 2176, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 425.0, - "delta_ref_entropy_loss": 0.02276611328125, - "delta_ref_ppl": -0.02490234375, - "entropy_loss": -0.015777587890625, - "epoch": 0.8708, - "grad_norm": 0.015008710278708126, - "k1_kl": 0.02484130859375, - "k3_kl": 0.016510009765625, - "kimi_kl": 0.054931640625, - "learning_rate": 6.46e-08, - "loss": 0.001, - "ppl": 0.006103515625, - "reward": 0.9990719258785248, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9990719556808472, + "advantages": -7.663454653084045e-07, + "completion_length": 327.0, + "delta_ref_entropy_loss": 0.0654296875, + "delta_ref_ppl": -0.09375, + "entropy_loss": -0.06201171875, + "epoch": 0.4354, + "grad_norm": 0.9863792049812448, + "k1_kl": 0.09375, + "k3_kl": 0.06787109375, + "kimi_kl": 0.23046875, + "learning_rate": 2.823e-07, + "loss": 0.0027, + "ppl": 0.023193359375, + "reward": 0.9603686928749084, + "reward_std": 0.022467685863375664, + "rewards/perpo_ocr_edit_distance_reward": 0.9603687524795532, "step": 2177, "temperature": 0.9 }, { - "advantages": -5.484691973833833e-05, - "completion_length": 583.5, - "delta_ref_entropy_loss": 0.0379638671875, - "delta_ref_ppl": -0.03564453125, - "entropy_loss": -0.02349853515625, - "epoch": 0.8712, - "grad_norm": 0.31337023326277563, - "k1_kl": 0.03564453125, - "k3_kl": 0.022247314453125, - "kimi_kl": 0.07763671875, - "learning_rate": 6.44e-08, - "loss": 0.0009, - "ppl": 0.010772705078125, - "reward": 0.9962767660617828, - "reward_std": 0.0003069435406359844, - "rewards/perpo_ocr_edit_distance_reward": 0.9962767958641052, + "advantages": -4.45161567768082e-05, + "completion_length": 531.0, + "delta_ref_entropy_loss": 0.0576171875, + "delta_ref_ppl": -0.0849609375, + "entropy_loss": -0.03076171875, + "epoch": 0.4356, + "grad_norm": 0.4764699988359978, + "k1_kl": 0.0849609375, + "k3_kl": 0.050537109375, + "kimi_kl": 0.1328125, + "learning_rate": 2.8219999999999997e-07, + "loss": 0.0021, + "ppl": 0.00921630859375, + "reward": 0.9953814148902893, + "reward_std": 0.0010479306802153587, + "rewards/perpo_ocr_edit_distance_reward": 0.9953814744949341, "step": 2178, "temperature": 0.9 }, { - "advantages": -2.2888184503244702e-05, - "completion_length": 770.5, - "delta_ref_entropy_loss": 0.0645751953125, - "delta_ref_ppl": -0.06982421875, - "entropy_loss": -0.072265625, - "epoch": 0.8716, - "grad_norm": 0.8422624288862594, - "k1_kl": 0.06982421875, - "k3_kl": 0.0511474609375, - "kimi_kl": 0.166015625, - "learning_rate": 6.419999999999999e-08, - "loss": 0.0021, - "ppl": 0.03570556640625, - "reward": 0.9655944108963013, - "reward_std": 0.002852492092642933, - "rewards/perpo_ocr_edit_distance_reward": 0.9655944406986237, + "advantages": 2.1100046069477685e-05, + "completion_length": 501.0, + "delta_ref_entropy_loss": 0.056640625, + "delta_ref_ppl": -0.08154296875, + "entropy_loss": -0.03271484375, + "epoch": 0.4358, + "grad_norm": 0.19571602835374582, + "k1_kl": 0.08154296875, + "k3_kl": 0.0546875, + "kimi_kl": 0.1845703125, + "learning_rate": 2.821e-07, + "loss": 0.0022, + "ppl": 0.0103759765625, + "reward": 0.9957839846611023, + "reward_std": 0.0003033830726053566, + "rewards/perpo_ocr_edit_distance_reward": 0.9957839846611023, "step": 2179, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 369.0, - "delta_ref_entropy_loss": 0.0220947265625, - "delta_ref_ppl": -0.06268310546875, - "entropy_loss": -0.02392578125, - "epoch": 0.872, - "grad_norm": 0.015881499255742403, - "k1_kl": 0.06268310546875, - "k3_kl": 0.048797607421875, - "kimi_kl": 0.1875, - "learning_rate": 6.4e-08, - "loss": 0.0022, - "ppl": 0.0108489990234375, - "reward": 0.9994991719722748, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9994992017745972, + "advantages": -3.738062787306262e-06, + "completion_length": 658.0, + "delta_ref_entropy_loss": 0.0458984375, + "delta_ref_ppl": -0.06103515625, + "entropy_loss": -0.05078125, + "epoch": 0.436, + "grad_norm": 0.8028261845766015, + "k1_kl": 0.06103515625, + "k3_kl": 0.03857421875, + "kimi_kl": 0.10498046875, + "learning_rate": 2.8199999999999996e-07, + "loss": 0.0015, + "ppl": 0.0206298828125, + "reward": 0.9630382061004639, + "reward_std": 0.0021873272489756346, + "rewards/perpo_ocr_edit_distance_reward": 0.9630382061004639, "step": 2180, "temperature": 0.9 }, { - "advantages": 9.466494930165936e-06, - "completion_length": 577.5, - "delta_ref_entropy_loss": 0.06829833984375, - "delta_ref_ppl": -0.0430908203125, - "entropy_loss": -0.08160400390625, - "epoch": 0.8724, - "grad_norm": 1.3084473084510733, - "k1_kl": 0.04315185546875, - "k3_kl": 0.02435302734375, - "kimi_kl": 0.0628662109375, - "learning_rate": 6.379999999999999e-08, - "loss": 0.001, - "ppl": 0.04449462890625, - "reward": 0.8856253921985626, - "reward_std": 0.002572401877841912, - "rewards/perpo_ocr_edit_distance_reward": 0.8856254518032074, + "advantages": -2.1602427295874804e-05, + "completion_length": 832.0, + "delta_ref_entropy_loss": 0.06591796875, + "delta_ref_ppl": -0.05078125, + "entropy_loss": -0.076171875, + "epoch": 0.4362, + "grad_norm": 1.1263065437044792, + "k1_kl": 0.051025390625, + "k3_kl": 0.0279541015625, + "kimi_kl": 0.05908203125, + "learning_rate": 2.8189999999999995e-07, + "loss": 0.0011, + "ppl": 0.04052734375, + "reward": 0.9738408327102661, + "reward_std": 0.003446160349994898, + "rewards/perpo_ocr_edit_distance_reward": 0.9738408923149109, "step": 2181, "temperature": 0.9 }, { - "advantages": -1.4475413934178505e-07, - "completion_length": 662.5, - "delta_ref_entropy_loss": 0.0242919921875, - "delta_ref_ppl": -0.024688720703125, - "entropy_loss": -0.108673095703125, - "epoch": 0.8728, - "grad_norm": 1.576445390425759, - "k1_kl": 0.024688720703125, - "k3_kl": 0.0248260498046875, - "kimi_kl": 0.031036376953125, - "learning_rate": 6.36e-08, - "loss": 0.001, - "ppl": 0.054107666015625, - "reward": 0.8809939026832581, - "reward_std": 0.11790057271718979, - "rewards/perpo_ocr_edit_distance_reward": 0.8809939324855804, + "advantages": 1.697880907158833e-05, + "completion_length": 229.0, + "delta_ref_entropy_loss": 0.0419921875, + "delta_ref_ppl": -0.12255859375, + "entropy_loss": -0.029296875, + "epoch": 0.4364, + "grad_norm": 0.5616069313154334, + "k1_kl": 0.12255859375, + "k3_kl": 0.08984375, + "kimi_kl": 0.310546875, + "learning_rate": 2.818e-07, + "loss": 0.0036, + "ppl": 0.01007080078125, + "reward": 0.9917242527008057, + "reward_std": 0.0009033366804942489, + "rewards/perpo_ocr_edit_distance_reward": 0.9917243123054504, "step": 2182, "temperature": 0.9 }, { - "advantages": -8.402126331930049e-05, - "completion_length": 741.0, - "delta_ref_entropy_loss": 0.04266357421875, - "delta_ref_ppl": -0.028350830078125, - "entropy_loss": -0.03668212890625, - "epoch": 0.8732, - "grad_norm": 0.7750825393819184, - "k1_kl": 0.028228759765625, - "k3_kl": 0.01544189453125, - "kimi_kl": 0.04150390625, - "learning_rate": 6.34e-08, - "loss": 0.0007, - "ppl": 0.018951416015625, - "reward": 0.996950775384903, - "reward_std": 0.0007655788140255027, - "rewards/perpo_ocr_edit_distance_reward": 0.9969508647918701, + "advantages": -6.897109415149316e-06, + "completion_length": 943.0, + "delta_ref_entropy_loss": 0.031982421875, + "delta_ref_ppl": -0.041748046875, + "entropy_loss": -0.04541015625, + "epoch": 0.4366, + "grad_norm": 0.8201937399452582, + "k1_kl": 0.04150390625, + "k3_kl": 0.0263671875, + "kimi_kl": 0.07421875, + "learning_rate": 2.817e-07, + "loss": 0.0011, + "ppl": 0.0203857421875, + "reward": 0.9855549931526184, + "reward_std": 0.0023651071824133396, + "rewards/perpo_ocr_edit_distance_reward": 0.9855549931526184, "step": 2183, "temperature": 0.9 }, { - "advantages": -0.00033032894134521484, - "completion_length": 374.0, - "delta_ref_entropy_loss": 0.07958984375, - "delta_ref_ppl": -0.08642578125, - "entropy_loss": -0.046142578125, - "epoch": 0.8736, - "grad_norm": 0.5464159270514546, - "k1_kl": 0.086669921875, - "k3_kl": 0.0577392578125, - "kimi_kl": 0.2529296875, - "learning_rate": 6.32e-08, - "loss": 0.0026, - "ppl": 0.01495361328125, - "reward": 0.8461150228977203, - "reward_std": 0.00014766064123250544, - "rewards/perpo_ocr_edit_distance_reward": 0.8461150527000427, + "advantages": -3.2016209843277466e-06, + "completion_length": 468.0, + "delta_ref_entropy_loss": 0.1748046875, + "delta_ref_ppl": -0.150390625, + "entropy_loss": -0.39453125, + "epoch": 0.4368, + "grad_norm": 2.879462690521685, + "k1_kl": 0.1494140625, + "k3_kl": 0.08642578125, + "kimi_kl": 0.185546875, + "learning_rate": 2.816e-07, + "loss": 0.0035, + "ppl": 0.21484375, + "reward": 0.5024765729904175, + "reward_std": 0.01055814791470766, + "rewards/perpo_ocr_edit_distance_reward": 0.5024765729904175, "step": 2184, "temperature": 0.9 }, { - "advantages": -1.9082002609138726e-05, - "completion_length": 219.5, - "delta_ref_entropy_loss": 0.0501708984375, - "delta_ref_ppl": -0.105224609375, - "entropy_loss": -0.08837890625, - "epoch": 0.874, - "grad_norm": 1.6250509961523605, - "k1_kl": 0.10498046875, - "k3_kl": 0.072509765625, - "kimi_kl": 0.23779296875, - "learning_rate": 6.3e-08, - "loss": 0.0029, - "ppl": 0.04461669921875, - "reward": 0.9643819630146027, - "reward_std": 0.005439715692773461, - "rewards/perpo_ocr_edit_distance_reward": 0.9643820226192474, + "advantages": -2.537454975026776e-06, + "completion_length": 48.0, + "delta_ref_entropy_loss": 0.045166015625, + "delta_ref_ppl": -0.8359375, + "entropy_loss": -0.1875, + "epoch": 0.437, + "grad_norm": 3.158340178670047, + "k1_kl": 0.8359375, + "k3_kl": 0.71875, + "kimi_kl": 3.28125, + "learning_rate": 2.8149999999999997e-07, + "loss": 0.0288, + "ppl": 0.09765625, + "reward": 0.9209038615226746, + "reward_std": 0.0032618646509945393, + "rewards/perpo_ocr_edit_distance_reward": 0.9209039807319641, "step": 2185, "temperature": 0.9 }, { - "advantages": -5.100454927742248e-06, - "completion_length": 249.0, - "delta_ref_entropy_loss": 0.067626953125, - "delta_ref_ppl": -0.146728515625, - "entropy_loss": -0.087890625, - "epoch": 0.8744, - "grad_norm": 1.975541830225002, - "k1_kl": 0.146728515625, - "k3_kl": 0.1170654296875, - "kimi_kl": 0.4365234375, - "learning_rate": 6.279999999999999e-08, - "loss": 0.0047, - "ppl": 0.05615234375, - "reward": 0.9821159243583679, - "reward_std": 0.004096261225640774, - "rewards/perpo_ocr_edit_distance_reward": 0.9821160435676575, + "advantages": -4.938671054333099e-07, + "completion_length": 330.0, + "delta_ref_entropy_loss": 0.04052734375, + "delta_ref_ppl": -0.1328125, + "entropy_loss": -0.0634765625, + "epoch": 0.4372, + "grad_norm": 9.674099677736823, + "k1_kl": 0.1328125, + "k3_kl": 0.09619140625, + "kimi_kl": 0.30078125, + "learning_rate": 2.8139999999999997e-07, + "loss": 0.0038, + "ppl": 0.0255126953125, + "reward": 0.9468875527381897, + "reward_std": 0.12497220188379288, + "rewards/perpo_ocr_edit_distance_reward": 0.9468876123428345, "step": 2186, "temperature": 0.9 }, { - "advantages": -9.401356101079728e-05, - "completion_length": 1425.5, - "delta_ref_entropy_loss": 0.02044677734375, - "delta_ref_ppl": -0.023895263671875, - "entropy_loss": -0.0421142578125, - "epoch": 0.8748, - "grad_norm": 35.19470599768254, - "k1_kl": 0.02392578125, - "k3_kl": 0.146514892578125, - "kimi_kl": 0.0504150390625, - "learning_rate": 6.26e-08, - "loss": 0.0059, - "ppl": 0.023681640625, - "reward": 0.8737412393093109, - "reward_std": 0.0009618130861781538, - "rewards/perpo_ocr_edit_distance_reward": 0.8737412691116333, + "advantages": -1.2312617400311865e-05, + "completion_length": 480.0, + "delta_ref_entropy_loss": 0.08203125, + "delta_ref_ppl": -0.0986328125, + "entropy_loss": -0.068359375, + "epoch": 0.4374, + "grad_norm": 1.0710825266844142, + "k1_kl": 0.09814453125, + "k3_kl": 0.062255859375, + "kimi_kl": 0.2119140625, + "learning_rate": 2.8129999999999996e-07, + "loss": 0.0025, + "ppl": 0.0247802734375, + "reward": 0.5975944399833679, + "reward_std": 0.0005921583506278694, + "rewards/perpo_ocr_edit_distance_reward": 0.5975944399833679, "step": 2187, "temperature": 0.9 }, { - "advantages": -3.1079566298330974e-07, - "completion_length": 578.0, - "delta_ref_entropy_loss": 0.022674560546875, - "delta_ref_ppl": -0.083251953125, - "entropy_loss": -0.2283935546875, - "epoch": 0.8752, - "grad_norm": 1.6399690517643817, - "k1_kl": 0.083251953125, - "k3_kl": 0.06378173828125, - "kimi_kl": 0.2021484375, - "learning_rate": 6.239999999999999e-08, - "loss": 0.0026, - "ppl": 0.09844970703125, - "reward": 0.4813259243965149, - "reward_std": 0.22973361052572727, - "rewards/perpo_ocr_edit_distance_reward": 0.4813259094953537, + "advantages": -2.6481493478058837e-06, + "completion_length": 381.0, + "delta_ref_entropy_loss": 0.0712890625, + "delta_ref_ppl": -0.12109375, + "entropy_loss": -0.0830078125, + "epoch": 0.4376, + "grad_norm": 1.3199802968343328, + "k1_kl": 0.12109375, + "k3_kl": 0.08056640625, + "kimi_kl": 0.2373046875, + "learning_rate": 2.812e-07, + "loss": 0.0032, + "ppl": 0.036376953125, + "reward": 0.9702273011207581, + "reward_std": 0.006353865843266249, + "rewards/perpo_ocr_edit_distance_reward": 0.9702273011207581, "step": 2188, "temperature": 0.9 }, { - "advantages": -1.6714846424292773e-05, - "completion_length": 913.5, - "delta_ref_entropy_loss": 0.021148681640625, - "delta_ref_ppl": -0.0148468017578125, - "entropy_loss": -0.02130126953125, - "epoch": 0.8756, - "grad_norm": 0.977659692742832, - "k1_kl": 0.0147552490234375, - "k3_kl": 0.01812744140625, - "kimi_kl": 0.0228271484375, - "learning_rate": 6.22e-08, - "loss": 0.0007, - "ppl": 0.0223388671875, - "reward": 0.9828140735626221, - "reward_std": 0.00043214927427470684, - "rewards/perpo_ocr_edit_distance_reward": 0.9828141331672668, + "advantages": -1.1461122085165698e-05, + "completion_length": 775.0, + "delta_ref_entropy_loss": 0.025390625, + "delta_ref_ppl": -0.03564453125, + "entropy_loss": -0.042724609375, + "epoch": 0.4378, + "grad_norm": 0.9450747800321572, + "k1_kl": 0.03564453125, + "k3_kl": 0.0225830078125, + "kimi_kl": 0.054931640625, + "learning_rate": 2.811e-07, + "loss": 0.0009, + "ppl": 0.0191650390625, + "reward": 0.9631093740463257, + "reward_std": 0.0028714737854897976, + "rewards/perpo_ocr_edit_distance_reward": 0.9631093740463257, "step": 2189, "temperature": 0.9 }, { - "advantages": -2.442087497911416e-05, - "completion_length": 499.0, - "delta_ref_entropy_loss": 0.049560546875, - "delta_ref_ppl": -0.0498046875, - "entropy_loss": -0.03485107421875, - "epoch": 0.876, - "grad_norm": 0.27813691914714544, - "k1_kl": 0.050048828125, - "k3_kl": 0.030120849609375, - "kimi_kl": 0.07318115234375, - "learning_rate": 6.2e-08, - "loss": 0.0012, - "ppl": 0.01409912109375, - "reward": 0.9809648990631104, - "reward_std": 0.0007349118241108954, - "rewards/perpo_ocr_edit_distance_reward": 0.9809649288654327, + "advantages": -3.569466935005039e-05, + "completion_length": 807.0, + "delta_ref_entropy_loss": 0.0294189453125, + "delta_ref_ppl": -0.053466796875, + "entropy_loss": -0.030029296875, + "epoch": 0.438, + "grad_norm": 0.5069486393351633, + "k1_kl": 0.053466796875, + "k3_kl": 0.03564453125, + "kimi_kl": 0.091796875, + "learning_rate": 2.8100000000000004e-07, + "loss": 0.0015, + "ppl": 0.01202392578125, + "reward": 0.9934368133544922, + "reward_std": 0.0013320372672751546, + "rewards/perpo_ocr_edit_distance_reward": 0.9934369325637817, "step": 2190, "temperature": 0.9 }, { - "advantages": 5.108969673983665e-07, - "completion_length": 1079.0, - "delta_ref_entropy_loss": 0.0352783203125, - "delta_ref_ppl": -0.0545654296875, - "entropy_loss": -0.130615234375, - "epoch": 0.8764, - "grad_norm": 12.380801236923128, - "k1_kl": 0.054443359375, - "k3_kl": 0.0416259765625, - "kimi_kl": 0.096923828125, - "learning_rate": 6.18e-08, - "loss": 0.0017, - "ppl": 0.077392578125, - "reward": 0.93719482421875, - "reward_std": 0.023458599112927914, - "rewards/perpo_ocr_edit_distance_reward": 0.9371948838233948, + "advantages": -5.422320100478828e-05, + "completion_length": 524.0, + "delta_ref_entropy_loss": 0.0301513671875, + "delta_ref_ppl": -0.041259765625, + "entropy_loss": -0.0303955078125, + "epoch": 0.4382, + "grad_norm": 0.5505373678504992, + "k1_kl": 0.041259765625, + "k3_kl": 0.0262451171875, + "kimi_kl": 0.07373046875, + "learning_rate": 2.809e-07, + "loss": 0.0011, + "ppl": 0.01177978515625, + "reward": 0.9752640128135681, + "reward_std": 0.0014706631191074848, + "rewards/perpo_ocr_edit_distance_reward": 0.9752641320228577, "step": 2191, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 627.5, - "delta_ref_entropy_loss": 0.01849365234375, - "delta_ref_ppl": -0.01959228515625, - "entropy_loss": -0.011810302734375, - "epoch": 0.8768, - "grad_norm": 0.0037868366533731625, - "k1_kl": 0.019561767578125, - "k3_kl": 0.013824462890625, - "kimi_kl": 0.062744140625, - "learning_rate": 6.16e-08, - "loss": 0.0009, - "ppl": 0.003082275390625, - "reward": 0.9998039305210114, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9998039603233337, + "advantages": -8.737615280551836e-05, + "completion_length": 766.0, + "delta_ref_entropy_loss": 0.07080078125, + "delta_ref_ppl": -0.07470703125, + "entropy_loss": -0.053955078125, + "epoch": 0.4384, + "grad_norm": 0.5016386427369666, + "k1_kl": 0.07470703125, + "k3_kl": 0.046142578125, + "kimi_kl": 0.1416015625, + "learning_rate": 2.8079999999999997e-07, + "loss": 0.0019, + "ppl": 0.02197265625, + "reward": 0.9746665954589844, + "reward_std": 0.000582074630074203, + "rewards/perpo_ocr_edit_distance_reward": 0.9746666550636292, "step": 2192, "temperature": 0.9 }, { - "advantages": -1.9388540489551076e-05, - "completion_length": 581.5, - "delta_ref_entropy_loss": 0.03936767578125, - "delta_ref_ppl": -0.02191162109375, - "entropy_loss": -0.056640625, - "epoch": 0.8772, - "grad_norm": 1.8264400875897027, - "k1_kl": 0.02191162109375, - "k3_kl": 0.0109405517578125, - "kimi_kl": 0.015594482421875, - "learning_rate": 6.14e-08, - "loss": 0.0005, - "ppl": 0.02679443359375, - "reward": 0.985932856798172, - "reward_std": 0.0009815469529712573, - "rewards/perpo_ocr_edit_distance_reward": 0.9859329462051392, + "advantages": -6.100109749240801e-05, + "completion_length": 950.0, + "delta_ref_entropy_loss": 0.056396484375, + "delta_ref_ppl": -0.0810546875, + "entropy_loss": -0.05615234375, + "epoch": 0.4386, + "grad_norm": 0.49293957335446764, + "k1_kl": 0.0810546875, + "k3_kl": 0.05224609375, + "kimi_kl": 0.26171875, + "learning_rate": 2.807e-07, + "loss": 0.0021, + "ppl": 0.022705078125, + "reward": 0.9766529202461243, + "reward_std": 0.0010164397535845637, + "rewards/perpo_ocr_edit_distance_reward": 0.976652979850769, "step": 2193, "temperature": 0.9 }, { - "advantages": -7.649830422451487e-05, - "completion_length": 1164.5, - "delta_ref_entropy_loss": 0.061798095703125, - "delta_ref_ppl": -0.03497314453125, - "entropy_loss": -0.05419921875, - "epoch": 0.8776, - "grad_norm": 9.323622189531937, - "k1_kl": 0.03497314453125, - "k3_kl": 0.3646240234375, - "kimi_kl": 0.0726318359375, - "learning_rate": 6.119999999999999e-08, - "loss": 0.0146, - "ppl": 0.0343017578125, - "reward": 0.9708094596862793, - "reward_std": 0.0012214909947942942, - "rewards/perpo_ocr_edit_distance_reward": 0.9708095788955688, + "advantages": -4.1433744627283886e-05, + "completion_length": 49.0, + "delta_ref_entropy_loss": 0.1328125, + "delta_ref_ppl": -0.474609375, + "entropy_loss": -0.1357421875, + "epoch": 0.4388, + "grad_norm": 3.673603610754803, + "k1_kl": 0.470703125, + "k3_kl": 0.35546875, + "kimi_kl": 1.1640625, + "learning_rate": 2.806e-07, + "loss": 0.0142, + "ppl": 0.04296875, + "reward": 0.9591836333274841, + "reward_std": 0.002159773837774992, + "rewards/perpo_ocr_edit_distance_reward": 0.9591837525367737, "step": 2194, "temperature": 0.9 }, { - "advantages": -6.505421424662927e-06, - "completion_length": 432.5, - "delta_ref_entropy_loss": 0.0567626953125, - "delta_ref_ppl": -0.0447998046875, - "entropy_loss": -0.05047607421875, - "epoch": 0.878, - "grad_norm": 0.7126455783997186, - "k1_kl": 0.0447998046875, - "k3_kl": 0.02752685546875, - "kimi_kl": 0.076416015625, - "learning_rate": 6.099999999999999e-08, - "loss": 0.0011, - "ppl": 0.0260009765625, - "reward": 0.9715136289596558, - "reward_std": 0.0025717816315591335, - "rewards/perpo_ocr_edit_distance_reward": 0.9715136885643005, + "advantages": -5.27926886206842e-07, + "completion_length": 347.0, + "delta_ref_entropy_loss": 0.0615234375, + "delta_ref_ppl": -0.06787109375, + "entropy_loss": -0.10009765625, + "epoch": 0.439, + "grad_norm": 2.1278978693813992, + "k1_kl": 0.06787109375, + "k3_kl": 0.0478515625, + "kimi_kl": 0.11083984375, + "learning_rate": 2.805e-07, + "loss": 0.0019, + "ppl": 0.048583984375, + "reward": 0.8237096071243286, + "reward_std": 0.12990500032901764, + "rewards/perpo_ocr_edit_distance_reward": 0.8237096667289734, "step": 2195, "temperature": 0.9 }, { - "advantages": -4.01054148824187e-06, - "completion_length": 345.0, - "delta_ref_entropy_loss": 0.0499267578125, - "delta_ref_ppl": -0.0443115234375, - "entropy_loss": -0.0281982421875, - "epoch": 0.8784, - "grad_norm": 0.7298260370650499, - "k1_kl": 0.0443115234375, - "k3_kl": 0.0281982421875, - "kimi_kl": 0.0863037109375, - "learning_rate": 6.08e-08, - "loss": 0.0011, - "ppl": 0.014434814453125, - "reward": 0.6686491668224335, - "reward_std": 0.0004804954514838755, - "rewards/perpo_ocr_edit_distance_reward": 0.6686491817235947, + "advantages": 1.3147082427167334e-05, + "completion_length": 491.0, + "delta_ref_entropy_loss": 0.0947265625, + "delta_ref_ppl": -0.07568359375, + "entropy_loss": -0.0654296875, + "epoch": 0.4392, + "grad_norm": 0.9952420615259896, + "k1_kl": 0.07568359375, + "k3_kl": 0.0478515625, + "kimi_kl": 0.1552734375, + "learning_rate": 2.804e-07, + "loss": 0.0019, + "ppl": 0.031005859375, + "reward": 0.9814443588256836, + "reward_std": 0.0005475096986629069, + "rewards/perpo_ocr_edit_distance_reward": 0.9814444184303284, "step": 2196, "temperature": 0.9 }, { - "advantages": -0.00011078375382567174, - "completion_length": 562.5, - "delta_ref_entropy_loss": 0.073974609375, - "delta_ref_ppl": -0.0513916015625, - "entropy_loss": -0.0806884765625, - "epoch": 0.8788, - "grad_norm": 0.6781637065447442, - "k1_kl": 0.0513916015625, - "k3_kl": 0.0323486328125, - "kimi_kl": 0.097412109375, - "learning_rate": 6.06e-08, - "loss": 0.0014, - "ppl": 0.04766845703125, - "reward": 0.7586895227432251, - "reward_std": 0.0035933184262830764, - "rewards/perpo_ocr_edit_distance_reward": 0.7586895823478699, + "advantages": -4.83691728732083e-05, + "completion_length": 838.0, + "delta_ref_entropy_loss": 0.047607421875, + "delta_ref_ppl": -0.064453125, + "entropy_loss": -0.03955078125, + "epoch": 0.4394, + "grad_norm": 0.6420293831796835, + "k1_kl": 0.06494140625, + "k3_kl": 0.03564453125, + "kimi_kl": 0.07470703125, + "learning_rate": 2.803e-07, + "loss": 0.0015, + "ppl": 0.01446533203125, + "reward": 0.9968194365501404, + "reward_std": 0.0007802596082910895, + "rewards/perpo_ocr_edit_distance_reward": 0.9968195557594299, "step": 2197, "temperature": 0.9 }, { - "advantages": -1.8660512751011993e-05, - "completion_length": 964.5, - "delta_ref_entropy_loss": 0.01812744140625, - "delta_ref_ppl": -0.03204345703125, - "entropy_loss": -0.02593994140625, - "epoch": 0.8792, - "grad_norm": 0.7013112327515661, - "k1_kl": 0.03216552734375, - "k3_kl": 0.023895263671875, - "kimi_kl": 0.090576171875, - "learning_rate": 6.04e-08, - "loss": 0.001, - "ppl": 0.013336181640625, - "reward": 0.7569504380226135, - "reward_std": 0.007524011052737478, - "rewards/perpo_ocr_edit_distance_reward": 0.7569505274295807, + "advantages": -7.482086220989004e-05, + "completion_length": 775.0, + "delta_ref_entropy_loss": 0.03125, + "delta_ref_ppl": -0.053955078125, + "entropy_loss": -0.0203857421875, + "epoch": 0.4396, + "grad_norm": 0.2945449631346178, + "k1_kl": 0.0537109375, + "k3_kl": 0.033203125, + "kimi_kl": 0.08837890625, + "learning_rate": 2.802e-07, + "loss": 0.0014, + "ppl": 0.00628662109375, + "reward": 0.9991872310638428, + "reward_std": 0.0004690640198532492, + "rewards/perpo_ocr_edit_distance_reward": 0.9991872310638428, "step": 2198, "temperature": 0.9 }, { - "advantages": -0.00036463567812461406, - "completion_length": 451.0, - "delta_ref_entropy_loss": 0.0374755859375, - "delta_ref_ppl": -0.026275634765625, - "entropy_loss": -0.015533447265625, - "epoch": 0.8796, - "grad_norm": 0.30953733703102154, - "k1_kl": 0.026275634765625, - "k3_kl": 0.0143280029296875, - "kimi_kl": 0.05206298828125, - "learning_rate": 6.02e-08, - "loss": 0.0009, - "ppl": 0.00511932373046875, - "reward": 0.9973495006561279, - "reward_std": 0.0001737030252115801, - "rewards/perpo_ocr_edit_distance_reward": 0.9973495602607727, + "advantages": -5.900860287511023e-06, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.0174560546875, + "delta_ref_ppl": -0.046630859375, + "entropy_loss": -0.1982421875, + "epoch": 0.4398, + "grad_norm": 109.26284425317579, + "k1_kl": 0.046630859375, + "k3_kl": 0.6328125, + "kimi_kl": 0.12158203125, + "learning_rate": 2.8010000000000003e-07, + "loss": 0.0253, + "ppl": 0.12060546875, + "reward": 0.8129565715789795, + "reward_std": 0.002785723889246583, + "rewards/perpo_ocr_edit_distance_reward": 0.8129565715789795, "step": 2199, "temperature": 0.9 }, { - "advantages": -3.888777564498014e-05, - "completion_length": 1011.5, - "delta_ref_entropy_loss": 0.04010009765625, - "delta_ref_ppl": -0.03399658203125, - "entropy_loss": -0.05291748046875, - "epoch": 0.88, - "grad_norm": 0.8282956717156887, - "k1_kl": 0.03399658203125, - "k3_kl": 0.0201416015625, - "kimi_kl": 0.06561279296875, - "learning_rate": 6e-08, - "loss": 0.0008, - "ppl": 0.0299072265625, - "reward": 0.9549471437931061, - "reward_std": 0.0018423039582557976, - "rewards/perpo_ocr_edit_distance_reward": 0.9549472332000732, + "advantages": -7.976804772624746e-05, + "completion_length": 929.0, + "delta_ref_entropy_loss": 0.044921875, + "delta_ref_ppl": -0.046630859375, + "entropy_loss": -0.05126953125, + "epoch": 0.44, + "grad_norm": 0.5288127731243565, + "k1_kl": 0.04638671875, + "k3_kl": 0.0263671875, + "kimi_kl": 0.06884765625, + "learning_rate": 2.8e-07, + "loss": 0.0011, + "ppl": 0.0250244140625, + "reward": 0.9872890710830688, + "reward_std": 0.00032682542223483324, + "rewards/perpo_ocr_edit_distance_reward": 0.9872891306877136, "step": 2200, "temperature": 0.9 }, { - "advantages": -9.359632895211689e-05, - "completion_length": 581.5, - "delta_ref_entropy_loss": 0.0411376953125, - "delta_ref_ppl": -0.0460205078125, - "entropy_loss": -0.0322265625, - "epoch": 0.8804, - "grad_norm": 1.1689986729850743, - "k1_kl": 0.0458984375, - "k3_kl": 0.02978515625, - "kimi_kl": 0.087890625, - "learning_rate": 5.979999999999999e-08, - "loss": 0.0013, - "ppl": 0.0151824951171875, - "reward": 0.9900987446308136, - "reward_std": 0.0016489621048094705, - "rewards/perpo_ocr_edit_distance_reward": 0.9900988340377808, + "advantages": -1.4645713235950097e-05, + "completion_length": 682.0, + "delta_ref_entropy_loss": 0.078125, + "delta_ref_ppl": -0.09521484375, + "entropy_loss": -0.103515625, + "epoch": 0.4402, + "grad_norm": 1.419018205525322, + "k1_kl": 0.09521484375, + "k3_kl": 0.0546875, + "kimi_kl": 0.12353515625, + "learning_rate": 2.7989999999999996e-07, + "loss": 0.0022, + "ppl": 0.0478515625, + "reward": 0.9309760928153992, + "reward_std": 0.0010625497670844197, + "rewards/perpo_ocr_edit_distance_reward": 0.930976152420044, "step": 2201, "temperature": 0.9 }, { - "advantages": -0.00029850006103515625, - "completion_length": 284.5, - "delta_ref_entropy_loss": 0.0540771484375, - "delta_ref_ppl": -0.04669189453125, - "entropy_loss": -0.04150390625, - "epoch": 0.8808, - "grad_norm": 1.0042265295829889, - "k1_kl": 0.046875, - "k3_kl": 0.02960205078125, - "kimi_kl": 0.09326171875, - "learning_rate": 5.96e-08, - "loss": 0.0015, - "ppl": 0.01978302001953125, - "reward": 0.9873213469982147, - "reward_std": 0.022180262953042984, - "rewards/perpo_ocr_edit_distance_reward": 0.9873214364051819, + "advantages": -0.00013214349746704102, + "completion_length": 668.0, + "delta_ref_entropy_loss": 0.048095703125, + "delta_ref_ppl": -0.048828125, + "entropy_loss": -0.0306396484375, + "epoch": 0.4404, + "grad_norm": 0.44769480302172077, + "k1_kl": 0.048583984375, + "k3_kl": 0.029296875, + "kimi_kl": 0.0888671875, + "learning_rate": 2.798e-07, + "loss": 0.0013, + "ppl": 0.01214599609375, + "reward": 0.9897770285606384, + "reward_std": 0.00041537568904459476, + "rewards/perpo_ocr_edit_distance_reward": 0.989777147769928, "step": 2202, "temperature": 0.9 }, { - "advantages": -6.095001026551472e-05, - "completion_length": 686.5, - "delta_ref_entropy_loss": 0.084228515625, - "delta_ref_ppl": -0.054443359375, - "entropy_loss": -0.0908203125, - "epoch": 0.8812, - "grad_norm": 1.4894072971522747, - "k1_kl": 0.054443359375, - "k3_kl": 0.03045654296875, - "kimi_kl": 0.08203125, - "learning_rate": 5.9399999999999996e-08, - "loss": 0.0013, - "ppl": 0.05059814453125, - "reward": 0.9310539066791534, - "reward_std": 0.001409730626619421, - "rewards/perpo_ocr_edit_distance_reward": 0.9310539662837982, + "advantages": 0.0, + "completion_length": 305.0, + "delta_ref_entropy_loss": 0.04931640625, + "delta_ref_ppl": -0.08740234375, + "entropy_loss": -0.044677734375, + "epoch": 0.4406, + "grad_norm": 0.5507678607205813, + "k1_kl": 0.087890625, + "k3_kl": 0.0595703125, + "kimi_kl": 0.19140625, + "learning_rate": 2.797e-07, + "loss": 0.0024, + "ppl": 0.011962890625, + "reward": 0.8258928656578064, + "reward_std": 0.0022321443539112806, + "rewards/perpo_ocr_edit_distance_reward": 0.8258928656578064, "step": 2203, "temperature": 0.9 }, { - "advantages": -8.571834769099951e-05, - "completion_length": 432.0, - "delta_ref_entropy_loss": 0.0521240234375, - "delta_ref_ppl": -0.055419921875, - "entropy_loss": -0.0531005859375, - "epoch": 0.8816, - "grad_norm": 0.7012706250121644, - "k1_kl": 0.0555419921875, - "k3_kl": 0.0316162109375, - "kimi_kl": 0.0703125, - "learning_rate": 5.92e-08, - "loss": 0.0014, - "ppl": 0.02899169921875, - "reward": 0.9897597134113312, - "reward_std": 0.00043655448826029897, - "rewards/perpo_ocr_edit_distance_reward": 0.9897598028182983, + "advantages": -3.5217832191847265e-05, + "completion_length": 1077.0, + "delta_ref_entropy_loss": 0.038330078125, + "delta_ref_ppl": -0.04638671875, + "entropy_loss": -0.04052734375, + "epoch": 0.4408, + "grad_norm": 0.5002929816615346, + "k1_kl": 0.04638671875, + "k3_kl": 0.02734375, + "kimi_kl": 0.07421875, + "learning_rate": 2.796e-07, + "loss": 0.0011, + "ppl": 0.0140380859375, + "reward": 0.9762479066848755, + "reward_std": 0.0015928968787193298, + "rewards/perpo_ocr_edit_distance_reward": 0.9762479662895203, "step": 2204, "temperature": 0.9 }, { - "advantages": -3.322533348182333e-05, - "completion_length": 720.5, - "delta_ref_entropy_loss": 0.03106689453125, - "delta_ref_ppl": -0.01824951171875, - "entropy_loss": -0.03466796875, - "epoch": 0.882, - "grad_norm": 0.5381456814001774, - "k1_kl": 0.018310546875, - "k3_kl": 0.011474609375, - "kimi_kl": 0.02081298828125, - "learning_rate": 5.899999999999999e-08, - "loss": 0.0005, - "ppl": 0.018402099609375, - "reward": 0.9866533875465393, - "reward_std": 0.005440037348307669, - "rewards/perpo_ocr_edit_distance_reward": 0.9866533875465393, + "advantages": -5.338873506843811e-06, + "completion_length": 744.0, + "delta_ref_entropy_loss": 0.025390625, + "delta_ref_ppl": -0.04541015625, + "entropy_loss": -0.02392578125, + "epoch": 0.441, + "grad_norm": 0.5179642953429846, + "k1_kl": 0.04541015625, + "k3_kl": 0.030029296875, + "kimi_kl": 0.083984375, + "learning_rate": 2.7950000000000003e-07, + "loss": 0.0012, + "ppl": 0.00811767578125, + "reward": 0.9897356629371643, + "reward_std": 0.006270666606724262, + "rewards/perpo_ocr_edit_distance_reward": 0.9897356629371643, "step": 2205, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 485.5, - "delta_ref_entropy_loss": 0.03338623046875, - "delta_ref_ppl": -0.038238525390625, - "entropy_loss": -0.0240478515625, - "epoch": 0.8824, - "grad_norm": 0.018960043695637797, - "k1_kl": 0.038482666015625, - "k3_kl": 0.0235748291015625, - "kimi_kl": 0.070281982421875, - "learning_rate": 5.88e-08, - "loss": 0.0009, - "ppl": 0.0089263916015625, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": -9.67298274190398e-06, + "completion_length": 407.0, + "delta_ref_entropy_loss": 0.11865234375, + "delta_ref_ppl": -0.10009765625, + "entropy_loss": -0.107421875, + "epoch": 0.4412, + "grad_norm": 1.3201721583415185, + "k1_kl": 0.10009765625, + "k3_kl": 0.052001953125, + "kimi_kl": 0.1279296875, + "learning_rate": 2.794e-07, + "loss": 0.0021, + "ppl": 0.05322265625, + "reward": 0.9681753516197205, + "reward_std": 0.0016632216284051538, + "rewards/perpo_ocr_edit_distance_reward": 0.9681754112243652, "step": 2206, "temperature": 0.9 }, { - "advantages": 1.1341912795614917e-05, - "completion_length": 421.0, - "delta_ref_entropy_loss": 0.0263671875, - "delta_ref_ppl": -0.0330810546875, - "entropy_loss": -0.016387939453125, - "epoch": 0.8828, - "grad_norm": 0.366159095103917, - "k1_kl": 0.03302001953125, - "k3_kl": 0.0224609375, - "kimi_kl": 0.0745849609375, - "learning_rate": 5.8599999999999995e-08, - "loss": 0.0009, - "ppl": 0.0053253173828125, - "reward": 0.9952637255191803, - "reward_std": 0.0005127653130330145, - "rewards/perpo_ocr_edit_distance_reward": 0.9952637553215027, + "advantages": -1.444135432393523e-05, + "completion_length": 797.0, + "delta_ref_entropy_loss": 0.034912109375, + "delta_ref_ppl": -0.04931640625, + "entropy_loss": -0.058349609375, + "epoch": 0.4414, + "grad_norm": 0.81607406471361, + "k1_kl": 0.04931640625, + "k3_kl": 0.032470703125, + "kimi_kl": 0.0869140625, + "learning_rate": 2.7929999999999997e-07, + "loss": 0.0013, + "ppl": 0.0233154296875, + "reward": 0.9952980875968933, + "reward_std": 0.0022588572464883327, + "rewards/perpo_ocr_edit_distance_reward": 0.9952981472015381, "step": 2207, "temperature": 0.9 }, { - "advantages": -1.0554280379437841e-05, - "completion_length": 788.5, - "delta_ref_entropy_loss": 0.011871337890625, - "delta_ref_ppl": -0.026123046875, - "entropy_loss": -0.015960693359375, - "epoch": 0.8832, - "grad_norm": 0.17852188181950476, - "k1_kl": 0.0261077880859375, - "k3_kl": 0.020263671875, - "kimi_kl": 0.082275390625, - "learning_rate": 5.84e-08, - "loss": 0.0008, - "ppl": 0.00815582275390625, - "reward": 0.9994354248046875, - "reward_std": 0.0003537096199579537, - "rewards/perpo_ocr_edit_distance_reward": 0.9994354546070099, + "advantages": -3.201621075277217e-05, + "completion_length": 458.0, + "delta_ref_entropy_loss": 0.062255859375, + "delta_ref_ppl": -0.08984375, + "entropy_loss": -0.044189453125, + "epoch": 0.4416, + "grad_norm": 0.4475337230012263, + "k1_kl": 0.08984375, + "k3_kl": 0.057373046875, + "kimi_kl": 0.166015625, + "learning_rate": 2.792e-07, + "loss": 0.0023, + "ppl": 0.0185546875, + "reward": 0.9726762175559998, + "reward_std": 0.0012301841052249074, + "rewards/perpo_ocr_edit_distance_reward": 0.9726763367652893, "step": 2208, "temperature": 0.9 }, { - "advantages": -2.9802323808780784e-07, - "completion_length": 1118.0, - "delta_ref_entropy_loss": 0.08447265625, - "delta_ref_ppl": -0.0513916015625, - "entropy_loss": -0.26953125, - "epoch": 0.8836, - "grad_norm": 1.4534338351035152, - "k1_kl": 0.0511474609375, - "k3_kl": 0.0302734375, - "kimi_kl": 0.04241943359375, - "learning_rate": 5.82e-08, - "loss": 0.0012, - "ppl": 0.1572265625, - "reward": 0.8316347002983093, - "reward_std": 0.13150948286056519, - "rewards/perpo_ocr_edit_distance_reward": 0.8316347301006317, + "advantages": 1.7029899268550253e-08, + "completion_length": 2016.0, + "delta_ref_entropy_loss": 0.0098876953125, + "delta_ref_ppl": -0.033935546875, + "entropy_loss": -0.072265625, + "epoch": 0.4418, + "grad_norm": 800.3889418896545, + "k1_kl": 0.033935546875, + "k3_kl": 0.197265625, + "kimi_kl": 0.07421875, + "learning_rate": 2.791e-07, + "loss": 0.0079, + "ppl": 0.0400390625, + "reward": 0.9683296084403992, + "reward_std": 0.0070249782875180244, + "rewards/perpo_ocr_edit_distance_reward": 0.9683296084403992, "step": 2209, "temperature": 0.9 }, { - "advantages": -4.300049710082021e-06, - "completion_length": 702.0, - "delta_ref_entropy_loss": 0.05169677734375, - "delta_ref_ppl": -0.05889892578125, - "entropy_loss": -0.0606689453125, - "epoch": 0.884, - "grad_norm": 0.8736580306906262, - "k1_kl": 0.0589599609375, - "k3_kl": 0.040435791015625, - "kimi_kl": 0.1109619140625, - "learning_rate": 5.8e-08, - "loss": 0.0016, - "ppl": 0.0283203125, - "reward": 0.9235179424285889, - "reward_std": 0.10261414432898164, - "rewards/perpo_ocr_edit_distance_reward": 0.923518031835556, + "advantages": -1.7029899268550253e-08, + "completion_length": 989.0, + "delta_ref_entropy_loss": 0.01611328125, + "delta_ref_ppl": -0.029541015625, + "entropy_loss": -0.02880859375, + "epoch": 0.442, + "grad_norm": 0.3401165032742073, + "k1_kl": 0.0296630859375, + "k3_kl": 0.0201416015625, + "kimi_kl": 0.05126953125, + "learning_rate": 2.79e-07, + "loss": 0.0008, + "ppl": 0.01348876953125, + "reward": 0.9902911186218262, + "reward_std": 0.005072731990367174, + "rewards/perpo_ocr_edit_distance_reward": 0.9902911186218262, "step": 2210, "temperature": 0.9 }, { - "advantages": -0.0001771726856532041, - "completion_length": 754.5, - "delta_ref_entropy_loss": 0.02349853515625, - "delta_ref_ppl": -0.026092529296875, - "entropy_loss": -0.015289306640625, - "epoch": 0.8844, - "grad_norm": 0.3193705270691389, - "k1_kl": 0.025970458984375, - "k3_kl": 0.017822265625, - "kimi_kl": 0.05426025390625, - "learning_rate": 5.7799999999999995e-08, - "loss": 0.0009, - "ppl": 0.0055084228515625, - "reward": 0.994965523481369, - "reward_std": 0.0002647472865646705, - "rewards/perpo_ocr_edit_distance_reward": 0.9949656426906586, + "advantages": 2.835478153428994e-05, + "completion_length": 473.0, + "delta_ref_entropy_loss": 0.05517578125, + "delta_ref_ppl": -0.059814453125, + "entropy_loss": -0.033447265625, + "epoch": 0.4422, + "grad_norm": 0.35926231253417845, + "k1_kl": 0.059814453125, + "k3_kl": 0.03662109375, + "kimi_kl": 0.109375, + "learning_rate": 2.789e-07, + "loss": 0.0014, + "ppl": 0.00958251953125, + "reward": 0.9854212403297424, + "reward_std": 0.000500441703479737, + "rewards/perpo_ocr_edit_distance_reward": 0.9854212403297424, "step": 2211, "temperature": 0.9 }, { - "advantages": -1.268727487513388e-06, - "completion_length": 505.0, - "delta_ref_entropy_loss": 0.009929656982421875, - "delta_ref_ppl": -0.1031494140625, - "entropy_loss": -0.278076171875, - "epoch": 0.8848, - "grad_norm": 3.7610115383554743, - "k1_kl": 0.1031494140625, - "k3_kl": 0.0982666015625, - "kimi_kl": 0.2552490234375, - "learning_rate": 5.759999999999999e-08, - "loss": 0.0039, - "ppl": 0.14483642578125, - "reward": 0.7098934650421143, - "reward_std": 0.08373316004872322, - "rewards/perpo_ocr_edit_distance_reward": 0.7098935395479202, + "advantages": -0.00016843847697600722, + "completion_length": 452.0, + "delta_ref_entropy_loss": 0.04736328125, + "delta_ref_ppl": -0.0693359375, + "entropy_loss": -0.0302734375, + "epoch": 0.4424, + "grad_norm": 0.5521412796989221, + "k1_kl": 0.0693359375, + "k3_kl": 0.0390625, + "kimi_kl": 0.10400390625, + "learning_rate": 2.788e-07, + "loss": 0.0017, + "ppl": 0.00885009765625, + "reward": 0.9967856407165527, + "reward_std": 0.0002032963529927656, + "rewards/perpo_ocr_edit_distance_reward": 0.9967857599258423, "step": 2212, "temperature": 0.9 }, { - "advantages": -9.647437764215283e-06, - "completion_length": 644.5, - "delta_ref_entropy_loss": 0.064697265625, - "delta_ref_ppl": -0.0469970703125, - "entropy_loss": -0.056640625, - "epoch": 0.8852, - "grad_norm": 0.6849728810600347, - "k1_kl": 0.0469970703125, - "k3_kl": 0.02862548828125, - "kimi_kl": 0.08642578125, - "learning_rate": 5.74e-08, - "loss": 0.0012, - "ppl": 0.02996826171875, - "reward": 0.9876053929328918, - "reward_std": 0.0028376260306686163, - "rewards/perpo_ocr_edit_distance_reward": 0.9876054227352142, + "advantages": -7.263252336997539e-05, + "completion_length": 419.0, + "delta_ref_entropy_loss": 0.0458984375, + "delta_ref_ppl": -0.0712890625, + "entropy_loss": -0.055908203125, + "epoch": 0.4426, + "grad_norm": 0.8055012381194036, + "k1_kl": 0.07177734375, + "k3_kl": 0.04931640625, + "kimi_kl": 0.15625, + "learning_rate": 2.787e-07, + "loss": 0.002, + "ppl": 0.0238037109375, + "reward": 0.9935325980186462, + "reward_std": 0.0015417735558003187, + "rewards/perpo_ocr_edit_distance_reward": 0.993532657623291, "step": 2213, "temperature": 0.9 }, { - "advantages": -4.955700887876446e-06, - "completion_length": 647.0, - "delta_ref_entropy_loss": 0.1080322265625, - "delta_ref_ppl": -0.088623046875, - "entropy_loss": -0.2481689453125, - "epoch": 0.8856, - "grad_norm": 1.7602750330490171, - "k1_kl": 0.088623046875, - "k3_kl": 0.0570068359375, - "kimi_kl": 0.12451171875, - "learning_rate": 5.7199999999999996e-08, - "loss": 0.0023, - "ppl": 0.14129638671875, - "reward": 0.6838005781173706, - "reward_std": 0.010914287529885769, - "rewards/perpo_ocr_edit_distance_reward": 0.683800607919693, + "advantages": -3.9781843952368945e-05, + "completion_length": 519.0, + "delta_ref_entropy_loss": 0.0771484375, + "delta_ref_ppl": -0.11669921875, + "entropy_loss": -0.031494140625, + "epoch": 0.4428, + "grad_norm": 0.34219510795965247, + "k1_kl": 0.1162109375, + "k3_kl": 0.07763671875, + "kimi_kl": 0.259765625, + "learning_rate": 2.786e-07, + "loss": 0.0031, + "ppl": 0.00970458984375, + "reward": 0.9949333071708679, + "reward_std": 0.0005422187969088554, + "rewards/perpo_ocr_edit_distance_reward": 0.9949333071708679, "step": 2214, "temperature": 0.9 }, { - "advantages": -0.00014150569248716494, - "completion_length": 765.5, - "delta_ref_entropy_loss": 0.0220947265625, - "delta_ref_ppl": -0.016265869140625, - "entropy_loss": -0.02227783203125, - "epoch": 0.886, - "grad_norm": 0.5851771930737233, - "k1_kl": 0.016265869140625, - "k3_kl": 0.0099029541015625, - "kimi_kl": 0.021697998046875, - "learning_rate": 5.7e-08, - "loss": 0.0005, - "ppl": 0.0107421875, - "reward": 0.9995044767856598, - "reward_std": 0.00031148173002293333, - "rewards/perpo_ocr_edit_distance_reward": 0.9995044767856598, + "advantages": -1.1793204976129346e-05, + "completion_length": 1853.0, + "delta_ref_entropy_loss": 0.00994873046875, + "delta_ref_ppl": -0.0255126953125, + "entropy_loss": -0.05908203125, + "epoch": 0.443, + "grad_norm": 2.525068998589232, + "k1_kl": 0.025634765625, + "k3_kl": 0.0341796875, + "kimi_kl": 0.04931640625, + "learning_rate": 2.785e-07, + "loss": 0.0014, + "ppl": 0.03466796875, + "reward": 0.993610680103302, + "reward_std": 0.001343972748145461, + "rewards/perpo_ocr_edit_distance_reward": 0.993610680103302, "step": 2215, "temperature": 0.9 }, { - "advantages": -1.9116062048851745e-06, - "completion_length": 474.0, - "delta_ref_entropy_loss": 0.0550537109375, - "delta_ref_ppl": -0.04840087890625, - "entropy_loss": -0.129150390625, - "epoch": 0.8864, - "grad_norm": 1.3817229592659832, - "k1_kl": 0.04840087890625, - "k3_kl": 0.027557373046875, - "kimi_kl": 0.058349609375, - "learning_rate": 5.68e-08, - "loss": 0.0011, - "ppl": 0.07183837890625, - "reward": 0.5813513100147247, - "reward_std": 0.0016193416668102145, - "rewards/perpo_ocr_edit_distance_reward": 0.5813513174653053, + "advantages": 1.1367457773303613e-05, + "completion_length": 835.0, + "delta_ref_entropy_loss": 0.06884765625, + "delta_ref_ppl": -0.06640625, + "entropy_loss": -0.0732421875, + "epoch": 0.4432, + "grad_norm": 0.6159810177484495, + "k1_kl": 0.06640625, + "k3_kl": 0.0390625, + "kimi_kl": 0.111328125, + "learning_rate": 2.7839999999999995e-07, + "loss": 0.0016, + "ppl": 0.031005859375, + "reward": 0.9695432782173157, + "reward_std": 0.0006487749633379281, + "rewards/perpo_ocr_edit_distance_reward": 0.9695433378219604, "step": 2216, "temperature": 0.9 }, { - "advantages": -5.706719146303385e-05, - "completion_length": 432.0, - "delta_ref_entropy_loss": 0.0665283203125, - "delta_ref_ppl": -0.06829833984375, - "entropy_loss": -0.08184814453125, - "epoch": 0.8868, - "grad_norm": 0.7499596973279501, - "k1_kl": 0.068359375, - "k3_kl": 0.046417236328125, - "kimi_kl": 0.118896484375, - "learning_rate": 5.66e-08, - "loss": 0.0019, - "ppl": 0.04522705078125, - "reward": 0.8675780892372131, - "reward_std": 0.041343149001477286, - "rewards/perpo_ocr_edit_distance_reward": 0.8675781488418579, + "advantages": -2.568108720879536e-05, + "completion_length": 887.0, + "delta_ref_entropy_loss": 0.0213623046875, + "delta_ref_ppl": -0.034912109375, + "entropy_loss": -0.031494140625, + "epoch": 0.4434, + "grad_norm": 0.4125672863183292, + "k1_kl": 0.03515625, + "k3_kl": 0.0203857421875, + "kimi_kl": 0.046142578125, + "learning_rate": 2.783e-07, + "loss": 0.0008, + "ppl": 0.0140380859375, + "reward": 0.9911900758743286, + "reward_std": 0.0028791625518351793, + "rewards/perpo_ocr_edit_distance_reward": 0.9911901354789734, "step": 2217, "temperature": 0.9 }, { - "advantages": 2.0130406483076513e-05, - "completion_length": 975.0, - "delta_ref_entropy_loss": 0.04840087890625, - "delta_ref_ppl": -0.02386474609375, - "entropy_loss": -0.05767822265625, - "epoch": 0.8872, - "grad_norm": 0.6536583694576311, - "k1_kl": 0.02386474609375, - "k3_kl": 0.010711669921875, - "kimi_kl": 0.0163726806640625, - "learning_rate": 5.6399999999999995e-08, - "loss": 0.0004, - "ppl": 0.029388427734375, - "reward": 0.9817785322666168, - "reward_std": 0.0004953591487719677, - "rewards/perpo_ocr_edit_distance_reward": 0.9817785620689392, + "advantages": -0.000160242838319391, + "completion_length": 276.0, + "delta_ref_entropy_loss": 0.02587890625, + "delta_ref_ppl": -0.078125, + "entropy_loss": -0.021728515625, + "epoch": 0.4436, + "grad_norm": 0.6214974277310304, + "k1_kl": 0.078125, + "k3_kl": 0.057373046875, + "kimi_kl": 0.2197265625, + "learning_rate": 2.782e-07, + "loss": 0.0024, + "ppl": 0.00872802734375, + "reward": 0.9930819869041443, + "reward_std": 0.0005907366285100579, + "rewards/perpo_ocr_edit_distance_reward": 0.9930821061134338, "step": 2218, "temperature": 0.9 }, { - "advantages": -1.532690987460228e-07, - "completion_length": 309.5, - "delta_ref_entropy_loss": 0.10723876953125, - "delta_ref_ppl": -0.0701904296875, - "entropy_loss": -0.3548583984375, - "epoch": 0.8876, - "grad_norm": 3.173423726185914, - "k1_kl": 0.070556640625, - "k3_kl": 0.04095458984375, - "kimi_kl": 0.1064453125, - "learning_rate": 5.62e-08, - "loss": 0.0016, - "ppl": 0.192535400390625, - "reward": 0.6917087286710739, - "reward_std": 0.013895584270358086, - "rewards/perpo_ocr_edit_distance_reward": 0.6917087435722351, + "advantages": -5.8753153098223265e-06, + "completion_length": 1278.0, + "delta_ref_entropy_loss": 0.06298828125, + "delta_ref_ppl": -0.05029296875, + "entropy_loss": -0.05029296875, + "epoch": 0.4438, + "grad_norm": 17.657716216418375, + "k1_kl": 0.05029296875, + "k3_kl": 0.0673828125, + "kimi_kl": 0.06591796875, + "learning_rate": 2.781e-07, + "loss": 0.0027, + "ppl": 0.0361328125, + "reward": 0.9851732850074768, + "reward_std": 0.004243553150445223, + "rewards/perpo_ocr_edit_distance_reward": 0.9851732850074768, "step": 2219, "temperature": 0.9 }, { - "advantages": -0.0003625835743150674, - "completion_length": 609.0, - "delta_ref_entropy_loss": 0.0343017578125, - "delta_ref_ppl": -0.03912353515625, - "entropy_loss": -0.01824951171875, - "epoch": 0.888, - "grad_norm": 0.1164137356072985, - "k1_kl": 0.03912353515625, - "k3_kl": 0.02642822265625, - "kimi_kl": 0.0955810546875, - "learning_rate": 5.6e-08, - "loss": 0.0014, - "ppl": 0.0081787109375, - "reward": 0.9911371469497681, - "reward_std": 4.892519791610539e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9911372065544128, + "advantages": 8.174351933121216e-06, + "completion_length": 1190.0, + "delta_ref_entropy_loss": 0.014404296875, + "delta_ref_ppl": -0.0289306640625, + "entropy_loss": -0.025634765625, + "epoch": 0.444, + "grad_norm": 0.3103530508351417, + "k1_kl": 0.02880859375, + "k3_kl": 0.0191650390625, + "kimi_kl": 0.046142578125, + "learning_rate": 2.7800000000000003e-07, + "loss": 0.0008, + "ppl": 0.00927734375, + "reward": 0.9962303042411804, + "reward_std": 0.0019781466107815504, + "rewards/perpo_ocr_edit_distance_reward": 0.9962303042411804, "step": 2220, "temperature": 0.9 }, { - "advantages": 1.2393509678076953e-05, - "completion_length": 1004.5, - "delta_ref_entropy_loss": 0.04290771484375, - "delta_ref_ppl": -0.02642822265625, - "entropy_loss": -0.034027099609375, - "epoch": 0.8884, - "grad_norm": 0.37132356594011795, - "k1_kl": 0.02630615234375, - "k3_kl": 0.0150604248046875, - "kimi_kl": 0.03783416748046875, - "learning_rate": 5.58e-08, - "loss": 0.0006, - "ppl": 0.0167388916015625, - "reward": 0.9938940703868866, - "reward_std": 0.001301400570810074, - "rewards/perpo_ocr_edit_distance_reward": 0.9938940703868866, + "advantages": 0.0, + "completion_length": 737.0, + "delta_ref_entropy_loss": 0.09912109375, + "delta_ref_ppl": -0.10888671875, + "entropy_loss": -0.35546875, + "epoch": 0.4442, + "grad_norm": 2.3009325647460823, + "k1_kl": 0.10888671875, + "k3_kl": 0.07373046875, + "kimi_kl": 0.162109375, + "learning_rate": 2.7789999999999997e-07, + "loss": 0.003, + "ppl": 0.181640625, + "reward": 0.5532564520835876, + "reward_std": 0.03454234078526497, + "rewards/perpo_ocr_edit_distance_reward": 0.5532565116882324, "step": 2221, "temperature": 0.9 }, { - "advantages": -2.188767757616006e-05, - "completion_length": 518.5, - "delta_ref_entropy_loss": 0.0379638671875, - "delta_ref_ppl": -0.05242919921875, - "entropy_loss": -0.01983642578125, - "epoch": 0.8888, - "grad_norm": 0.31847807518101734, - "k1_kl": 0.05206298828125, - "k3_kl": 0.03521728515625, - "kimi_kl": 0.128662109375, - "learning_rate": 5.5599999999999995e-08, - "loss": 0.0014, - "ppl": 0.0082550048828125, - "reward": 0.9988120496273041, - "reward_std": 0.000241777379414998, - "rewards/perpo_ocr_edit_distance_reward": 0.9988120794296265, + "advantages": -6.186962127685547e-05, + "completion_length": 590.0, + "delta_ref_entropy_loss": 0.0242919921875, + "delta_ref_ppl": -0.03369140625, + "entropy_loss": -0.03076171875, + "epoch": 0.4444, + "grad_norm": 0.3638781900529808, + "k1_kl": 0.03369140625, + "k3_kl": 0.0233154296875, + "kimi_kl": 0.060791015625, + "learning_rate": 2.7779999999999996e-07, + "loss": 0.001, + "ppl": 0.010986328125, + "reward": 0.9844129681587219, + "reward_std": 0.00045031614718027413, + "rewards/perpo_ocr_edit_distance_reward": 0.9844130277633667, "step": 2222, "temperature": 0.9 }, { - "advantages": -5.1259998144814745e-05, - "completion_length": 901.0, - "delta_ref_entropy_loss": 0.0408935546875, - "delta_ref_ppl": -0.02392578125, - "entropy_loss": -0.052978515625, - "epoch": 0.8892, - "grad_norm": 0.8936383489534045, - "k1_kl": 0.0238037109375, - "k3_kl": 0.0214996337890625, - "kimi_kl": 0.033416748046875, - "learning_rate": 5.539999999999999e-08, - "loss": 0.0009, - "ppl": 0.028961181640625, - "reward": 0.9948022663593292, - "reward_std": 0.0009138325694948435, - "rewards/perpo_ocr_edit_distance_reward": 0.994802325963974, + "advantages": -5.8889392676064745e-05, + "completion_length": 507.0, + "delta_ref_entropy_loss": 0.05517578125, + "delta_ref_ppl": -0.08349609375, + "entropy_loss": -0.043701171875, + "epoch": 0.4446, + "grad_norm": 0.6219970455059474, + "k1_kl": 0.08349609375, + "k3_kl": 0.054931640625, + "kimi_kl": 0.1787109375, + "learning_rate": 2.777e-07, + "loss": 0.0023, + "ppl": 0.0196533203125, + "reward": 0.9982035160064697, + "reward_std": 0.0009118049056269228, + "rewards/perpo_ocr_edit_distance_reward": 0.9982035756111145, "step": 2223, "temperature": 0.9 }, { - "advantages": -0.00018521777747082524, - "completion_length": 583.5, - "delta_ref_entropy_loss": 0.02081298828125, - "delta_ref_ppl": -0.02252197265625, - "entropy_loss": -0.016998291015625, - "epoch": 0.8896, - "grad_norm": 0.18404498313124962, - "k1_kl": 0.0225830078125, - "k3_kl": 0.013458251953125, - "kimi_kl": 0.04437255859375, - "learning_rate": 5.52e-08, - "loss": 0.0007, - "ppl": 0.0069122314453125, - "reward": 0.9987487196922302, - "reward_std": 9.053388325952483e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9987487494945526, + "advantages": -0.00010861669579753652, + "completion_length": 360.0, + "delta_ref_entropy_loss": 0.06787109375, + "delta_ref_ppl": -0.09130859375, + "entropy_loss": -0.04345703125, + "epoch": 0.4448, + "grad_norm": 0.5853598826060569, + "k1_kl": 0.091796875, + "k3_kl": 0.05712890625, + "kimi_kl": 0.1552734375, + "learning_rate": 2.776e-07, + "loss": 0.0024, + "ppl": 0.013916015625, + "reward": 0.9854258894920349, + "reward_std": 0.0006838857661932707, + "rewards/perpo_ocr_edit_distance_reward": 0.9854260087013245, "step": 2224, "temperature": 0.9 }, { - "advantages": -4.836491513060537e-06, - "completion_length": 314.0, - "delta_ref_entropy_loss": 0.11865234375, - "delta_ref_ppl": -0.171875, - "entropy_loss": -0.089111328125, - "epoch": 0.89, - "grad_norm": 4.653126736344101, - "k1_kl": 0.171875, - "k3_kl": 0.11932373046875, - "kimi_kl": 0.4034423828125, - "learning_rate": 5.4999999999999996e-08, - "loss": 0.0048, - "ppl": 0.04345703125, - "reward": 0.9760538041591644, - "reward_std": 0.014154041768051684, - "rewards/perpo_ocr_edit_distance_reward": 0.9760538339614868, + "advantages": -6.450925866374746e-05, + "completion_length": 496.0, + "delta_ref_entropy_loss": 0.05810546875, + "delta_ref_ppl": -0.06396484375, + "entropy_loss": -0.041015625, + "epoch": 0.445, + "grad_norm": 0.6542965497648365, + "k1_kl": 0.0634765625, + "k3_kl": 0.042724609375, + "kimi_kl": 0.12451171875, + "learning_rate": 2.775e-07, + "loss": 0.0018, + "ppl": 0.0166015625, + "reward": 0.9924280643463135, + "reward_std": 0.0008243183838203549, + "rewards/perpo_ocr_edit_distance_reward": 0.9924281239509583, "step": 2225, "temperature": 0.9 }, { - "advantages": -5.37314599569072e-05, - "completion_length": 517.0, - "delta_ref_entropy_loss": 0.0361328125, - "delta_ref_ppl": -0.0377197265625, - "entropy_loss": -0.030517578125, - "epoch": 0.8904, - "grad_norm": 0.6073336979970899, - "k1_kl": 0.03759765625, - "k3_kl": 0.02349853515625, - "kimi_kl": 0.0606689453125, - "learning_rate": 5.48e-08, - "loss": 0.001, - "ppl": 0.0159912109375, - "reward": 0.9737575054168701, - "reward_std": 0.0009666447585914284, - "rewards/perpo_ocr_edit_distance_reward": 0.9737576246261597, + "advantages": -0.00016619478992652148, + "completion_length": 755.0, + "delta_ref_entropy_loss": 0.033447265625, + "delta_ref_ppl": -0.04736328125, + "entropy_loss": -0.031005859375, + "epoch": 0.4452, + "grad_norm": 0.20725141619352416, + "k1_kl": 0.04736328125, + "k3_kl": 0.0286865234375, + "kimi_kl": 0.09765625, + "learning_rate": 2.774e-07, + "loss": 0.0013, + "ppl": 0.00933837890625, + "reward": 0.9949378967285156, + "reward_std": 0.00030989787774160504, + "rewards/perpo_ocr_edit_distance_reward": 0.9949379563331604, "step": 2226, "temperature": 0.9 }, { - "advantages": -6.43815335479303e-05, - "completion_length": 603.5, - "delta_ref_entropy_loss": 0.0443115234375, - "delta_ref_ppl": -0.062652587890625, - "entropy_loss": -0.02264404296875, - "epoch": 0.8908, - "grad_norm": 0.4954040289468391, - "k1_kl": 0.062652587890625, - "k3_kl": 0.0444793701171875, - "kimi_kl": 0.158355712890625, - "learning_rate": 5.46e-08, - "loss": 0.0018, - "ppl": 0.0093536376953125, - "reward": 0.9832957088947296, - "reward_std": 0.003550518522388302, - "rewards/perpo_ocr_edit_distance_reward": 0.9832957684993744, + "advantages": -2.893379860324785e-05, + "completion_length": 391.0, + "delta_ref_entropy_loss": 0.04541015625, + "delta_ref_ppl": -0.044921875, + "entropy_loss": -0.0380859375, + "epoch": 0.4454, + "grad_norm": 0.4909957236965091, + "k1_kl": 0.044921875, + "k3_kl": 0.024658203125, + "kimi_kl": 0.0556640625, + "learning_rate": 2.773e-07, + "loss": 0.001, + "ppl": 0.011962890625, + "reward": 0.990261435508728, + "reward_std": 0.0007829507812857628, + "rewards/perpo_ocr_edit_distance_reward": 0.990261435508728, "step": 2227, "temperature": 0.9 }, { - "advantages": -3.9730756355993435e-05, - "completion_length": 380.0, - "delta_ref_entropy_loss": 0.02813720703125, - "delta_ref_ppl": -0.0355224609375, - "entropy_loss": -0.0413818359375, - "epoch": 0.8912, - "grad_norm": 0.9138582605852651, - "k1_kl": 0.03564453125, - "k3_kl": 0.02655029296875, - "kimi_kl": 0.075439453125, - "learning_rate": 5.44e-08, + "advantages": -4.076106415595859e-05, + "completion_length": 1208.0, + "delta_ref_entropy_loss": 0.060302734375, + "delta_ref_ppl": -0.05419921875, + "entropy_loss": -0.07470703125, + "epoch": 0.4456, + "grad_norm": 0.9500740452515865, + "k1_kl": 0.05419921875, + "k3_kl": 0.026611328125, + "kimi_kl": 0.0478515625, + "learning_rate": 2.7719999999999997e-07, "loss": 0.0011, - "ppl": 0.02239990234375, - "reward": 0.8422139883041382, - "reward_std": 0.04042809482780285, - "rewards/perpo_ocr_edit_distance_reward": 0.842214047908783, + "ppl": 0.03466796875, + "reward": 0.9865979552268982, + "reward_std": 0.0026152771897614002, + "rewards/perpo_ocr_edit_distance_reward": 0.9865980744361877, "step": 2228, "temperature": 0.9 }, { - "advantages": -9.414554415343446e-05, - "completion_length": 541.5, - "delta_ref_entropy_loss": 0.0509033203125, - "delta_ref_ppl": -0.039794921875, - "entropy_loss": -0.0478515625, - "epoch": 0.8916, - "grad_norm": 0.661346699450827, - "k1_kl": 0.03973388671875, - "k3_kl": 0.022491455078125, - "kimi_kl": 0.056396484375, - "learning_rate": 5.4199999999999996e-08, - "loss": 0.001, - "ppl": 0.02557373046875, - "reward": 0.9945186376571655, - "reward_std": 0.000712588858732488, - "rewards/perpo_ocr_edit_distance_reward": 0.9945187270641327, + "advantages": 2.1393810811787262e-07, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.005096435546875, + "delta_ref_ppl": -0.10498046875, + "entropy_loss": -0.2041015625, + "epoch": 0.4458, + "grad_norm": 1.5682678835150166, + "k1_kl": 0.10498046875, + "k3_kl": 0.0732421875, + "kimi_kl": 0.216796875, + "learning_rate": 2.771e-07, + "loss": 0.0029, + "ppl": 0.09375, + "reward": 0.6757492423057556, + "reward_std": 0.1462251842021942, + "rewards/perpo_ocr_edit_distance_reward": 0.6757492423057556, "step": 2229, "temperature": 0.9 }, { - "advantages": -3.214393655071035e-05, - "completion_length": 497.5, - "delta_ref_entropy_loss": 0.0435791015625, - "delta_ref_ppl": -0.0550537109375, - "entropy_loss": -0.01971435546875, - "epoch": 0.892, - "grad_norm": 0.4186539695285504, - "k1_kl": 0.0548095703125, - "k3_kl": 0.0386962890625, - "kimi_kl": 0.1591796875, - "learning_rate": 5.3999999999999994e-08, - "loss": 0.0016, - "ppl": 0.0069122314453125, - "reward": 0.999772697687149, - "reward_std": 0.00014867461868561804, - "rewards/perpo_ocr_edit_distance_reward": 0.9997727274894714, + "advantages": -3.722736073541455e-05, + "completion_length": 578.0, + "delta_ref_entropy_loss": 0.056396484375, + "delta_ref_ppl": -0.07373046875, + "entropy_loss": -0.041748046875, + "epoch": 0.446, + "grad_norm": 0.4182423877800695, + "k1_kl": 0.07421875, + "k3_kl": 0.046142578125, + "kimi_kl": 0.1259765625, + "learning_rate": 2.77e-07, + "loss": 0.0019, + "ppl": 0.01416015625, + "reward": 0.9916233420372009, + "reward_std": 0.0003576340968720615, + "rewards/perpo_ocr_edit_distance_reward": 0.9916234016418457, "step": 2230, "temperature": 0.9 }, { - "advantages": -1.4586108591174707e-05, - "completion_length": 323.5, - "delta_ref_entropy_loss": 0.036376953125, - "delta_ref_ppl": -0.04754638671875, - "entropy_loss": -0.04937744140625, - "epoch": 0.8924, - "grad_norm": 3.9883801402761447, - "k1_kl": 0.0477294921875, - "k3_kl": 0.02874755859375, - "kimi_kl": 0.0555419921875, - "learning_rate": 5.38e-08, - "loss": 0.0012, - "ppl": 0.026397705078125, - "reward": 0.8497690856456757, - "reward_std": 0.00559421197976917, - "rewards/perpo_ocr_edit_distance_reward": 0.8497691750526428, + "advantages": -3.87941108783707e-05, + "completion_length": 583.0, + "delta_ref_entropy_loss": 0.162109375, + "delta_ref_ppl": -0.115234375, + "entropy_loss": -0.220703125, + "epoch": 0.4462, + "grad_norm": 1.3949528134199862, + "k1_kl": 0.11572265625, + "k3_kl": 0.06396484375, + "kimi_kl": 0.1943359375, + "learning_rate": 2.7689999999999995e-07, + "loss": 0.0026, + "ppl": 0.11279296875, + "reward": 0.9488897323608398, + "reward_std": 0.0009972808184102178, + "rewards/perpo_ocr_edit_distance_reward": 0.9488898515701294, "step": 2231, "temperature": 0.9 }, { - "advantages": -2.671139691301505e-05, - "completion_length": 728.0, - "delta_ref_entropy_loss": 0.03546142578125, - "delta_ref_ppl": -0.03521728515625, - "entropy_loss": -0.03857421875, - "epoch": 0.8928, - "grad_norm": 1.8343843327010758, - "k1_kl": 0.03515625, - "k3_kl": 0.023040771484375, - "kimi_kl": 0.0760498046875, - "learning_rate": 5.36e-08, - "loss": 0.001, - "ppl": 0.019073486328125, - "reward": 0.9425566494464874, - "reward_std": 0.010401805106084794, - "rewards/perpo_ocr_edit_distance_reward": 0.9425567090511322, + "advantages": -0.0001672846992732957, + "completion_length": 388.0, + "delta_ref_entropy_loss": 0.076171875, + "delta_ref_ppl": -0.0810546875, + "entropy_loss": -0.038330078125, + "epoch": 0.4464, + "grad_norm": 0.4507350492223613, + "k1_kl": 0.08154296875, + "k3_kl": 0.048583984375, + "kimi_kl": 0.12890625, + "learning_rate": 2.768e-07, + "loss": 0.0021, + "ppl": 0.01483154296875, + "reward": 0.9856473207473755, + "reward_std": 0.00035804291837848723, + "rewards/perpo_ocr_edit_distance_reward": 0.9856473803520203, "step": 2232, "temperature": 0.9 }, { - "advantages": -6.793652573833242e-05, - "completion_length": 721.5, - "delta_ref_entropy_loss": 0.02386474609375, - "delta_ref_ppl": -0.01776123046875, - "entropy_loss": -0.01812744140625, - "epoch": 0.8932, - "grad_norm": 0.38287045079705073, - "k1_kl": 0.0177001953125, - "k3_kl": 0.009490966796875, - "kimi_kl": 0.02130126953125, - "learning_rate": 5.34e-08, - "loss": 0.0004, - "ppl": 0.0080718994140625, - "reward": 0.9993610382080078, - "reward_std": 0.0003027306011063047, - "rewards/perpo_ocr_edit_distance_reward": 0.9993610680103302, + "advantages": -1.5667506886529736e-05, + "completion_length": 508.0, + "delta_ref_entropy_loss": 0.1328125, + "delta_ref_ppl": -0.138671875, + "entropy_loss": -0.21484375, + "epoch": 0.4466, + "grad_norm": 1.5198382185708272, + "k1_kl": 0.138671875, + "k3_kl": 0.0859375, + "kimi_kl": 0.25390625, + "learning_rate": 2.767e-07, + "loss": 0.0034, + "ppl": 0.1064453125, + "reward": 0.9282181859016418, + "reward_std": 0.0037076359149068594, + "rewards/perpo_ocr_edit_distance_reward": 0.9282182455062866, "step": 2233, "temperature": 0.9 }, { - "advantages": -4.402228829469834e-06, - "completion_length": 664.5, - "delta_ref_entropy_loss": 0.030517578125, - "delta_ref_ppl": -0.01751708984375, - "entropy_loss": -0.018310546875, - "epoch": 0.8936, - "grad_norm": 0.5369272527937663, - "k1_kl": 0.017547607421875, - "k3_kl": 0.0086517333984375, - "kimi_kl": 0.019866943359375, - "learning_rate": 5.319999999999999e-08, - "loss": 0.0004, - "ppl": 0.0060882568359375, - "reward": 0.9971543550491333, - "reward_std": 0.00395691029552836, - "rewards/perpo_ocr_edit_distance_reward": 0.9971543848514557, + "advantages": -0.0003645675606094301, + "completion_length": 894.0, + "delta_ref_entropy_loss": 0.04931640625, + "delta_ref_ppl": -0.05908203125, + "entropy_loss": -0.0228271484375, + "epoch": 0.4468, + "grad_norm": 0.19948134916162302, + "k1_kl": 0.05908203125, + "k3_kl": 0.033203125, + "kimi_kl": 0.107421875, + "learning_rate": 2.766e-07, + "loss": 0.0017, + "ppl": 0.00738525390625, + "reward": 0.998447835445404, + "reward_std": 0.00020364572992548347, + "rewards/perpo_ocr_edit_distance_reward": 0.9984480142593384, "step": 2234, "temperature": 0.9 }, { - "advantages": 1.575265713427143e-07, - "completion_length": 323.0, - "delta_ref_entropy_loss": 0.016571044921875, - "delta_ref_ppl": -0.0557861328125, - "entropy_loss": -0.3157958984375, - "epoch": 0.894, - "grad_norm": 1.8442664270457103, - "k1_kl": 0.055419921875, - "k3_kl": 0.03692626953125, - "kimi_kl": 0.0670166015625, - "learning_rate": 5.3e-08, - "loss": 0.0015, - "ppl": 0.1778564453125, - "reward": 0.8158972263336182, - "reward_std": 0.02680368348956108, - "rewards/perpo_ocr_edit_distance_reward": 0.8158972263336182, + "advantages": -0.00014906270371284336, + "completion_length": 496.0, + "delta_ref_entropy_loss": 0.02294921875, + "delta_ref_ppl": -0.056884765625, + "entropy_loss": -0.035888671875, + "epoch": 0.447, + "grad_norm": 0.6390346852104881, + "k1_kl": 0.056884765625, + "k3_kl": 0.0390625, + "kimi_kl": 0.126953125, + "learning_rate": 2.765e-07, + "loss": 0.0017, + "ppl": 0.0118408203125, + "reward": 0.993887722492218, + "reward_std": 0.00024276922340504825, + "rewards/perpo_ocr_edit_distance_reward": 0.9938877820968628, "step": 2235, "temperature": 0.9 }, { - "advantages": -0.00033673218422336504, - "completion_length": 477.5, - "delta_ref_entropy_loss": 0.0238037109375, - "delta_ref_ppl": -0.0157470703125, - "entropy_loss": -0.018218994140625, - "epoch": 0.8944, - "grad_norm": 1.598570320419202, - "k1_kl": 0.0157623291015625, - "k3_kl": 0.00968170166015625, - "kimi_kl": 0.03179931640625, - "learning_rate": 5.2799999999999996e-08, - "loss": 0.0007, - "ppl": 0.008209228515625, - "reward": 0.9991146922111511, - "reward_std": 0.00022492768766824156, - "rewards/perpo_ocr_edit_distance_reward": 0.9991148114204407, + "advantages": -9.366444828629028e-07, + "completion_length": 665.0, + "delta_ref_entropy_loss": 0.035888671875, + "delta_ref_ppl": -0.040771484375, + "entropy_loss": -0.048583984375, + "epoch": 0.4472, + "grad_norm": 0.9001906646372401, + "k1_kl": 0.040771484375, + "k3_kl": 0.0264892578125, + "kimi_kl": 0.07666015625, + "learning_rate": 2.7639999999999996e-07, + "loss": 0.0011, + "ppl": 0.0201416015625, + "reward": 0.9794176816940308, + "reward_std": 0.00897081010043621, + "rewards/perpo_ocr_edit_distance_reward": 0.9794176816940308, "step": 2236, "temperature": 0.9 }, { - "advantages": -8.5192074607221e-05, - "completion_length": 802.5, - "delta_ref_entropy_loss": 0.04901123046875, - "delta_ref_ppl": -0.039794921875, - "entropy_loss": -0.03363037109375, - "epoch": 0.8948, - "grad_norm": 0.7026737334631079, - "k1_kl": 0.0399169921875, - "k3_kl": 0.02301025390625, - "kimi_kl": 0.0572509765625, - "learning_rate": 5.26e-08, - "loss": 0.001, - "ppl": 0.0169219970703125, - "reward": 0.9952463209629059, - "reward_std": 0.0013436598528642207, - "rewards/perpo_ocr_edit_distance_reward": 0.9952463805675507, + "advantages": 4.257474817137563e-09, + "completion_length": 624.0, + "delta_ref_entropy_loss": 0.0245361328125, + "delta_ref_ppl": -0.05126953125, + "entropy_loss": -0.037841796875, + "epoch": 0.4474, + "grad_norm": 1.1624834705985383, + "k1_kl": 0.05126953125, + "k3_kl": 0.0341796875, + "kimi_kl": 0.08984375, + "learning_rate": 2.7629999999999995e-07, + "loss": 0.0014, + "ppl": 0.01123046875, + "reward": 0.967968761920929, + "reward_std": 0.0020584836602211, + "rewards/perpo_ocr_edit_distance_reward": 0.9679687023162842, "step": 2237, "temperature": 0.9 }, { - "advantages": -3.2786813790153246e-05, - "completion_length": 562.0, - "delta_ref_entropy_loss": 0.0228271484375, - "delta_ref_ppl": -0.021759033203125, - "entropy_loss": -0.0255126953125, - "epoch": 0.8952, - "grad_norm": 0.5659730448320883, - "k1_kl": 0.021759033203125, - "k3_kl": 0.0119476318359375, - "kimi_kl": 0.02215576171875, - "learning_rate": 5.24e-08, - "loss": 0.0005, - "ppl": 0.013824462890625, - "reward": 0.9924950003623962, - "reward_std": 0.0009997697197832167, - "rewards/perpo_ocr_edit_distance_reward": 0.9924950301647186, + "advantages": -5.364418484532507e-06, + "completion_length": 1681.0, + "delta_ref_entropy_loss": 0.005767822265625, + "delta_ref_ppl": -0.01031494140625, + "entropy_loss": -0.0096435546875, + "epoch": 0.4476, + "grad_norm": 0.12152242175223679, + "k1_kl": 0.01031494140625, + "k3_kl": 0.00689697265625, + "kimi_kl": 0.0179443359375, + "learning_rate": 2.762e-07, + "loss": 0.0003, + "ppl": 0.0023040771484375, + "reward": 0.7291592955589294, + "reward_std": 0.017395326867699623, + "rewards/perpo_ocr_edit_distance_reward": 0.729159414768219, "step": 2238, "temperature": 0.9 }, { - "advantages": -3.2016209843277466e-06, - "completion_length": 667.5, - "delta_ref_entropy_loss": 0.0247802734375, - "delta_ref_ppl": -0.03179931640625, - "entropy_loss": -0.02362060546875, - "epoch": 0.8956, - "grad_norm": 0.3826049365041388, - "k1_kl": 0.03179931640625, - "k3_kl": 0.0210113525390625, - "kimi_kl": 0.067626953125, - "learning_rate": 5.2200000000000004e-08, - "loss": 0.0008, - "ppl": 0.0118865966796875, - "reward": 0.9949394166469574, - "reward_std": 0.000614631047938019, - "rewards/perpo_ocr_edit_distance_reward": 0.9949394166469574, + "advantages": -1.2397767022775952e-05, + "completion_length": 1476.0, + "delta_ref_entropy_loss": 0.10498046875, + "delta_ref_ppl": -0.09814453125, + "entropy_loss": -0.30078125, + "epoch": 0.4478, + "grad_norm": 5.253368737539609, + "k1_kl": 0.09814453125, + "k3_kl": 0.07275390625, + "kimi_kl": 0.162109375, + "learning_rate": 2.761e-07, + "loss": 0.0029, + "ppl": 0.1796875, + "reward": 0.8998385071754456, + "reward_std": 0.006075622979551554, + "rewards/perpo_ocr_edit_distance_reward": 0.8998386263847351, "step": 2239, "temperature": 0.9 }, { - "advantages": -2.8674093170533155e-05, - "completion_length": 794.5, - "delta_ref_entropy_loss": 0.036376953125, - "delta_ref_ppl": -0.021240234375, - "entropy_loss": -0.03936767578125, - "epoch": 0.896, - "grad_norm": 0.7399549899570451, - "k1_kl": 0.021240234375, - "k3_kl": 0.012847900390625, - "kimi_kl": 0.02667236328125, - "learning_rate": 5.1999999999999996e-08, - "loss": 0.0005, - "ppl": 0.020416259765625, - "reward": 0.9937289953231812, - "reward_std": 0.001102839276427403, - "rewards/perpo_ocr_edit_distance_reward": 0.9937290549278259, + "advantages": -6.29425048828125e-05, + "completion_length": 613.0, + "delta_ref_entropy_loss": 0.059814453125, + "delta_ref_ppl": -0.08984375, + "entropy_loss": -0.060546875, + "epoch": 0.448, + "grad_norm": 0.4752629430241387, + "k1_kl": 0.08935546875, + "k3_kl": 0.056396484375, + "kimi_kl": 0.1376953125, + "learning_rate": 2.7600000000000004e-07, + "loss": 0.0023, + "ppl": 0.0223388671875, + "reward": 0.912158727645874, + "reward_std": 0.0009824938606470823, + "rewards/perpo_ocr_edit_distance_reward": 0.9121588468551636, "step": 2240, "temperature": 0.9 }, { - "advantages": -5.986009779856971e-06, - "completion_length": 348.0, - "delta_ref_entropy_loss": 0.082763671875, - "delta_ref_ppl": -0.0625, - "entropy_loss": -0.0751953125, - "epoch": 0.8964, - "grad_norm": 1.160957955910967, - "k1_kl": 0.0625, - "k3_kl": 0.03363037109375, - "kimi_kl": 0.082794189453125, - "learning_rate": 5.1799999999999994e-08, - "loss": 0.0014, - "ppl": 0.0380859375, - "reward": 0.8958422243595123, - "reward_std": 0.002138318057404831, - "rewards/perpo_ocr_edit_distance_reward": 0.8958422839641571, + "advantages": -8.02108297648374e-06, + "completion_length": 530.0, + "delta_ref_entropy_loss": 0.06005859375, + "delta_ref_ppl": -0.08251953125, + "entropy_loss": -0.056396484375, + "epoch": 0.4482, + "grad_norm": 1.1664220725871053, + "k1_kl": 0.08251953125, + "k3_kl": 0.048828125, + "kimi_kl": 0.1357421875, + "learning_rate": 2.759e-07, + "loss": 0.002, + "ppl": 0.0194091796875, + "reward": 0.9729330539703369, + "reward_std": 0.008392059244215488, + "rewards/perpo_ocr_edit_distance_reward": 0.9729331135749817, "step": 2241, "temperature": 0.9 }, { - "advantages": -1.1435577107477002e-05, - "completion_length": 260.5, - "delta_ref_entropy_loss": 0.03411865234375, - "delta_ref_ppl": -0.04150390625, - "entropy_loss": -0.01898193359375, - "epoch": 0.8968, - "grad_norm": 0.7033953694175417, - "k1_kl": 0.04150390625, - "k3_kl": 0.02813720703125, - "kimi_kl": 0.0693359375, - "learning_rate": 5.16e-08, - "loss": 0.0011, - "ppl": 0.0072479248046875, - "reward": 0.967890739440918, - "reward_std": 0.000508698693010956, - "rewards/perpo_ocr_edit_distance_reward": 0.9678907990455627, + "advantages": -3.984996510553174e-05, + "completion_length": 585.0, + "delta_ref_entropy_loss": 0.08544921875, + "delta_ref_ppl": -0.09326171875, + "entropy_loss": -0.173828125, + "epoch": 0.4484, + "grad_norm": 1.7571997420734742, + "k1_kl": 0.09375, + "k3_kl": 0.06103515625, + "kimi_kl": 0.1630859375, + "learning_rate": 2.7579999999999997e-07, + "loss": 0.0025, + "ppl": 0.09033203125, + "reward": 0.9577308893203735, + "reward_std": 0.0022503137588500977, + "rewards/perpo_ocr_edit_distance_reward": 0.9577310085296631, "step": 2242, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 667.0, - "delta_ref_entropy_loss": 0.01898193359375, - "delta_ref_ppl": -0.0191650390625, - "entropy_loss": -0.01788330078125, - "epoch": 0.8972, - "grad_norm": 0.014101910512626577, - "k1_kl": 0.01910400390625, - "k3_kl": 0.01214599609375, - "kimi_kl": 0.0328369140625, - "learning_rate": 5.14e-08, - "loss": 0.0005, - "ppl": 0.008392333984375, - "reward": 0.9998436570167542, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9998436868190765, + "advantages": -2.8448446755646728e-05, + "completion_length": 982.0, + "delta_ref_entropy_loss": 0.05126953125, + "delta_ref_ppl": -0.056640625, + "entropy_loss": -0.08154296875, + "epoch": 0.4486, + "grad_norm": 94.93863587623763, + "k1_kl": 0.056396484375, + "k3_kl": 0.703125, + "kimi_kl": 0.189453125, + "learning_rate": 2.757e-07, + "loss": 0.0281, + "ppl": 0.076171875, + "reward": 0.9711415767669678, + "reward_std": 0.0007975740008987486, + "rewards/perpo_ocr_edit_distance_reward": 0.9711415767669678, "step": 2243, "temperature": 0.9 }, { - "advantages": -0.0003082454213654273, - "completion_length": 249.0, - "delta_ref_entropy_loss": 0.02496337890625, - "delta_ref_ppl": -0.0609130859375, - "entropy_loss": -0.033447265625, - "epoch": 0.8976, - "grad_norm": 0.9589699849119294, - "k1_kl": 0.0611572265625, - "k3_kl": 0.050750732421875, - "kimi_kl": 0.253173828125, - "learning_rate": 5.12e-08, - "loss": 0.0023, - "ppl": 0.01678466796875, - "reward": 0.9918411672115326, - "reward_std": 0.0014081160770729184, - "rewards/perpo_ocr_edit_distance_reward": 0.9918412268161774, + "advantages": -2.6370798877906054e-05, + "completion_length": 359.0, + "delta_ref_entropy_loss": 0.09716796875, + "delta_ref_ppl": -0.1552734375, + "entropy_loss": -0.19140625, + "epoch": 0.4488, + "grad_norm": 2.7550312752944683, + "k1_kl": 0.1552734375, + "k3_kl": 0.09765625, + "kimi_kl": 0.259765625, + "learning_rate": 2.756e-07, + "loss": 0.0039, + "ppl": 0.09423828125, + "reward": 0.9220007658004761, + "reward_std": 0.002159785246476531, + "rewards/perpo_ocr_edit_distance_reward": 0.9220008850097656, "step": 2244, "temperature": 0.9 }, { - "advantages": -6.650175497391331e-06, - "completion_length": 923.0, - "delta_ref_entropy_loss": 0.04339599609375, - "delta_ref_ppl": -0.0594482421875, - "entropy_loss": -0.033447265625, - "epoch": 0.898, - "grad_norm": 1.0770169423964466, - "k1_kl": 0.0594482421875, - "k3_kl": 0.04052734375, - "kimi_kl": 0.133544921875, - "learning_rate": 5.0999999999999993e-08, - "loss": 0.0016, - "ppl": 0.0179595947265625, - "reward": 0.9807671010494232, - "reward_std": 0.013234838377684355, - "rewards/perpo_ocr_edit_distance_reward": 0.9807671010494232, + "advantages": -1.4262540389609057e-05, + "completion_length": 113.0, + "delta_ref_entropy_loss": 0.0869140625, + "delta_ref_ppl": -0.404296875, + "entropy_loss": -0.1416015625, + "epoch": 0.449, + "grad_norm": 2.8311669272199667, + "k1_kl": 0.40234375, + "k3_kl": 0.326171875, + "kimi_kl": 1.578125, + "learning_rate": 2.755e-07, + "loss": 0.013, + "ppl": 0.052490234375, + "reward": 0.9658456444740295, + "reward_std": 0.004075435921549797, + "rewards/perpo_ocr_edit_distance_reward": 0.9658457040786743, "step": 2245, "temperature": 0.9 }, { - "advantages": -1.0128532526465506e-05, - "completion_length": 623.5, - "delta_ref_entropy_loss": 0.03839111328125, - "delta_ref_ppl": -0.06689453125, - "entropy_loss": -0.06475830078125, - "epoch": 0.8984, - "grad_norm": 1.106708610097807, - "k1_kl": 0.0672607421875, - "k3_kl": 0.047119140625, - "kimi_kl": 0.13427734375, - "learning_rate": 5.08e-08, - "loss": 0.0019, - "ppl": 0.0288543701171875, - "reward": 0.7948386967182159, - "reward_std": 0.07188083280925639, - "rewards/perpo_ocr_edit_distance_reward": 0.7948387265205383, + "advantages": -1.1239733794354834e-05, + "completion_length": 366.0, + "delta_ref_entropy_loss": 0.07421875, + "delta_ref_ppl": -0.130859375, + "entropy_loss": -0.0634765625, + "epoch": 0.4492, + "grad_norm": 0.9451706503194288, + "k1_kl": 0.130859375, + "k3_kl": 0.09521484375, + "kimi_kl": 0.365234375, + "learning_rate": 2.754e-07, + "loss": 0.0038, + "ppl": 0.024169921875, + "reward": 0.9856649041175842, + "reward_std": 0.00975959375500679, + "rewards/perpo_ocr_edit_distance_reward": 0.985664963722229, "step": 2246, "temperature": 0.9 }, { - "advantages": -2.6438917757332092e-06, - "completion_length": 202.0, - "delta_ref_entropy_loss": 0.114501953125, - "delta_ref_ppl": -0.2080078125, - "entropy_loss": -0.1146240234375, - "epoch": 0.8988, - "grad_norm": 1.072759850063981, - "k1_kl": 0.2080078125, - "k3_kl": 0.1468505859375, - "kimi_kl": 0.457763671875, - "learning_rate": 5.0599999999999996e-08, - "loss": 0.0059, - "ppl": 0.06787109375, - "reward": 0.7622068524360657, - "reward_std": 0.0031614487525075674, - "rewards/perpo_ocr_edit_distance_reward": 0.7622068822383881, + "advantages": 5.313328529155115e-06, + "completion_length": 362.0, + "delta_ref_entropy_loss": 0.1513671875, + "delta_ref_ppl": -0.125, + "entropy_loss": -0.1630859375, + "epoch": 0.4494, + "grad_norm": 1.51998701684692, + "k1_kl": 0.125, + "k3_kl": 0.06884765625, + "kimi_kl": 0.166015625, + "learning_rate": 2.753e-07, + "loss": 0.0028, + "ppl": 0.080078125, + "reward": 0.8439716100692749, + "reward_std": 0.0030914079397916794, + "rewards/perpo_ocr_edit_distance_reward": 0.8439716100692749, "step": 2247, "temperature": 0.9 }, { - "advantages": -0.0005960464477539062, - "completion_length": 301.0, - "delta_ref_entropy_loss": 0.0445556640625, - "delta_ref_ppl": -0.02874755859375, - "entropy_loss": -0.0301513671875, - "epoch": 0.8992, - "grad_norm": 0.03939640412531568, - "k1_kl": 0.02874755859375, - "k3_kl": 0.01385498046875, - "kimi_kl": 0.025146484375, - "learning_rate": 5.04e-08, + "advantages": -0.00011723382340278476, + "completion_length": 951.0, + "delta_ref_entropy_loss": 0.040283203125, + "delta_ref_ppl": -0.046875, + "entropy_loss": -0.03857421875, + "epoch": 0.4496, + "grad_norm": 0.9454162313923826, + "k1_kl": 0.046875, + "k3_kl": 0.027587890625, + "kimi_kl": 0.07421875, + "learning_rate": 2.752e-07, "loss": 0.0012, - "ppl": 0.015899658203125, - "reward": 0.9750617146492004, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9750617742538452, + "ppl": 0.015869140625, + "reward": 0.9947551488876343, + "reward_std": 0.0006263779359869659, + "rewards/perpo_ocr_edit_distance_reward": 0.9947552680969238, "step": 2248, "temperature": 0.9 }, { - "advantages": -4.589557852341386e-05, - "completion_length": 836.5, - "delta_ref_entropy_loss": 0.1065673828125, - "delta_ref_ppl": -0.0732421875, - "entropy_loss": -0.1346435546875, - "epoch": 0.8996, - "grad_norm": 1.7870719193442164, - "k1_kl": 0.073486328125, - "k3_kl": 0.041412353515625, - "kimi_kl": 0.1072998046875, - "learning_rate": 5.02e-08, - "loss": 0.0017, - "ppl": 0.07550048828125, - "reward": 0.861893355846405, - "reward_std": 0.006867081654490903, - "rewards/perpo_ocr_edit_distance_reward": 0.8618934154510498, + "advantages": 0.0, + "completion_length": 287.0, + "delta_ref_entropy_loss": 0.068359375, + "delta_ref_ppl": -0.09765625, + "entropy_loss": -0.056884765625, + "epoch": 0.4498, + "grad_norm": 0.9232111660059371, + "k1_kl": 0.09765625, + "k3_kl": 0.064453125, + "kimi_kl": 0.2158203125, + "learning_rate": 2.751e-07, + "loss": 0.0026, + "ppl": 0.0230712890625, + "reward": 0.984387218952179, + "reward_std": 0.0009014105889946222, + "rewards/perpo_ocr_edit_distance_reward": 0.984387218952179, "step": 2249, "temperature": 0.9 }, { - "advantages": -1.6936234065845213e-05, - "completion_length": 1190.5, - "delta_ref_entropy_loss": 0.06280517578125, - "delta_ref_ppl": -0.057098388671875, - "entropy_loss": -0.1923828125, - "epoch": 0.9, - "grad_norm": 1.8433836381338755, - "k1_kl": 0.056854248046875, - "k3_kl": 0.0372314453125, - "kimi_kl": 0.08099365234375, - "learning_rate": 5e-08, + "advantages": -3.1982151995180175e-05, + "completion_length": 939.0, + "delta_ref_entropy_loss": 0.0478515625, + "delta_ref_ppl": -0.05859375, + "entropy_loss": -0.02734375, + "epoch": 0.45, + "grad_norm": 0.3484246726395535, + "k1_kl": 0.058837890625, + "k3_kl": 0.037353515625, + "kimi_kl": 0.10791015625, + "learning_rate": 2.75e-07, "loss": 0.0015, - "ppl": 0.11578369140625, - "reward": 0.8311665058135986, - "reward_std": 0.013807762472424656, - "rewards/perpo_ocr_edit_distance_reward": 0.8311665058135986, + "ppl": 0.01239013671875, + "reward": 0.9924940466880798, + "reward_std": 0.0009643306257203221, + "rewards/perpo_ocr_edit_distance_reward": 0.9924941062927246, "step": 2250, "temperature": 0.9 }, { - "advantages": -3.7125180369912414e-06, - "completion_length": 515.0, - "delta_ref_entropy_loss": 0.02569580078125, - "delta_ref_ppl": -0.04931640625, - "entropy_loss": -0.0223388671875, - "epoch": 0.9004, - "grad_norm": 0.5406070263298193, - "k1_kl": 0.04931640625, - "k3_kl": 0.0372314453125, + "advantages": -1.5224730304908007e-05, + "completion_length": 867.0, + "delta_ref_entropy_loss": 0.09326171875, + "delta_ref_ppl": -0.09375, + "entropy_loss": -0.1259765625, + "epoch": 0.4502, + "grad_norm": 1.1256670837953864, + "k1_kl": 0.09375, + "k3_kl": 0.057373046875, "kimi_kl": 0.181640625, - "learning_rate": 4.9799999999999996e-08, - "loss": 0.0015, - "ppl": 0.010406494140625, - "reward": 0.9903448224067688, - "reward_std": 0.003957013133913279, - "rewards/perpo_ocr_edit_distance_reward": 0.9903448820114136, + "learning_rate": 2.7489999999999995e-07, + "loss": 0.0023, + "ppl": 0.059326171875, + "reward": 0.9425931572914124, + "reward_std": 0.00101967784576118, + "rewards/perpo_ocr_edit_distance_reward": 0.9425932168960571, "step": 2251, "temperature": 0.9 }, { - "advantages": -2.9129643223768653e-05, - "completion_length": 674.0, - "delta_ref_entropy_loss": 0.03759765625, - "delta_ref_ppl": -0.0408935546875, - "entropy_loss": -0.04620361328125, - "epoch": 0.9008, - "grad_norm": 1.8313778496078414, - "k1_kl": 0.0408935546875, - "k3_kl": 0.02557373046875, - "kimi_kl": 0.058837890625, - "learning_rate": 4.9599999999999994e-08, - "loss": 0.0011, - "ppl": 0.0262451171875, - "reward": 0.9890874028205872, - "reward_std": 0.02781349583528936, - "rewards/perpo_ocr_edit_distance_reward": 0.9890874922275543, + "advantages": -1.565047750773374e-05, + "completion_length": 228.0, + "delta_ref_entropy_loss": 0.03466796875, + "delta_ref_ppl": -0.134765625, + "entropy_loss": -0.06689453125, + "epoch": 0.4504, + "grad_norm": 2.1751001073293112, + "k1_kl": 0.134765625, + "k3_kl": 0.10302734375, + "kimi_kl": 0.3828125, + "learning_rate": 2.748e-07, + "loss": 0.0042, + "ppl": 0.03271484375, + "reward": 0.8306817412376404, + "reward_std": 0.003711347235366702, + "rewards/perpo_ocr_edit_distance_reward": 0.8306818604469299, "step": 2252, "temperature": 0.9 }, { - "advantages": -0.00016453437274321914, - "completion_length": 777.5, - "delta_ref_entropy_loss": 0.02752685546875, - "delta_ref_ppl": -0.02471923828125, - "entropy_loss": -0.017578125, - "epoch": 0.9012, - "grad_norm": 0.12179340919162619, - "k1_kl": 0.024658203125, - "k3_kl": 0.015594482421875, - "kimi_kl": 0.05859375, - "learning_rate": 4.94e-08, - "loss": 0.0008, - "ppl": 0.00620269775390625, - "reward": 0.999766618013382, - "reward_std": 7.938212365843356e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9997666776180267, + "advantages": -3.228017521905713e-05, + "completion_length": 570.0, + "delta_ref_entropy_loss": 0.08154296875, + "delta_ref_ppl": -0.11376953125, + "entropy_loss": -0.0888671875, + "epoch": 0.4506, + "grad_norm": 0.8048415477539086, + "k1_kl": 0.11376953125, + "k3_kl": 0.06884765625, + "kimi_kl": 0.244140625, + "learning_rate": 2.747e-07, + "loss": 0.0028, + "ppl": 0.035888671875, + "reward": 0.9814228415489197, + "reward_std": 0.0020098888780921698, + "rewards/perpo_ocr_edit_distance_reward": 0.9814229607582092, "step": 2253, "temperature": 0.9 }, { - "advantages": -1.419016371073667e-05, - "completion_length": 604.5, - "delta_ref_entropy_loss": 0.0438232421875, - "delta_ref_ppl": -0.02783203125, - "entropy_loss": -0.01947021484375, - "epoch": 0.9016, - "grad_norm": 0.27454652895778164, - "k1_kl": 0.02783203125, - "k3_kl": 0.01495361328125, - "kimi_kl": 0.03515625, - "learning_rate": 4.92e-08, - "loss": 0.0006, - "ppl": 0.0084686279296875, - "reward": 0.990364670753479, - "reward_std": 0.0013706237805308774, - "rewards/perpo_ocr_edit_distance_reward": 0.9903647601604462, + "advantages": -2.7758735541283386e-06, + "completion_length": 1044.0, + "delta_ref_entropy_loss": 0.06591796875, + "delta_ref_ppl": -0.0517578125, + "entropy_loss": -0.08837890625, + "epoch": 0.4508, + "grad_norm": 1.4426246095009045, + "k1_kl": 0.051513671875, + "k3_kl": 0.029541015625, + "kimi_kl": 0.058837890625, + "learning_rate": 2.746e-07, + "loss": 0.0012, + "ppl": 0.046875, + "reward": 0.9736816883087158, + "reward_std": 0.0029913787730038166, + "rewards/perpo_ocr_edit_distance_reward": 0.9736817479133606, "step": 2254, "temperature": 0.9 }, { - "advantages": -1.897556441132764e-05, - "completion_length": 860.5, - "delta_ref_entropy_loss": 0.0306396484375, - "delta_ref_ppl": -0.02069091796875, - "entropy_loss": -0.0516357421875, - "epoch": 0.902, - "grad_norm": 0.5833957103666523, - "k1_kl": 0.020751953125, - "k3_kl": 0.013763427734375, - "kimi_kl": 0.0233154296875, - "learning_rate": 4.9e-08, - "loss": 0.0006, - "ppl": 0.026123046875, - "reward": 0.847225695848465, - "reward_std": 0.03794529678998515, - "rewards/perpo_ocr_edit_distance_reward": 0.847225695848465, + "advantages": -4.356673889560625e-05, + "completion_length": 562.0, + "delta_ref_entropy_loss": 0.0224609375, + "delta_ref_ppl": -0.037109375, + "entropy_loss": -0.0284423828125, + "epoch": 0.451, + "grad_norm": 0.2616335766232105, + "k1_kl": 0.037109375, + "k3_kl": 0.02490234375, + "kimi_kl": 0.06640625, + "learning_rate": 2.7450000000000003e-07, + "loss": 0.001, + "ppl": 0.0093994140625, + "reward": 0.9980385899543762, + "reward_std": 0.00029095850186422467, + "rewards/perpo_ocr_edit_distance_reward": 0.9980385899543762, "step": 2255, "temperature": 0.9 }, { - "advantages": -9.70704263636435e-07, - "completion_length": 1092.0, - "delta_ref_entropy_loss": 0.042388916015625, - "delta_ref_ppl": -0.0341796875, - "entropy_loss": -0.12060546875, - "epoch": 0.9024, - "grad_norm": 1.8415519839404293, - "k1_kl": 0.03448486328125, - "k3_kl": 0.0244140625, - "kimi_kl": 0.0557861328125, - "learning_rate": 4.88e-08, - "loss": 0.001, - "ppl": 0.06951904296875, - "reward": 0.9032409191131592, - "reward_std": 0.07807342009618878, - "rewards/perpo_ocr_edit_distance_reward": 0.9032409489154816, + "advantages": -0.0001241650024894625, + "completion_length": 716.0, + "delta_ref_entropy_loss": 0.06494140625, + "delta_ref_ppl": -0.076171875, + "entropy_loss": -0.042724609375, + "epoch": 0.4512, + "grad_norm": 0.8619760968160048, + "k1_kl": 0.076171875, + "k3_kl": 0.040283203125, + "kimi_kl": 0.10205078125, + "learning_rate": 2.7439999999999997e-07, + "loss": 0.0017, + "ppl": 0.0211181640625, + "reward": 0.9957417845726013, + "reward_std": 0.000654103874694556, + "rewards/perpo_ocr_edit_distance_reward": 0.9957419037818909, "step": 2256, "temperature": 0.9 }, { - "advantages": -4.0722744870436145e-05, - "completion_length": 680.0, - "delta_ref_entropy_loss": 0.0491943359375, - "delta_ref_ppl": -0.04931640625, - "entropy_loss": -0.0382080078125, - "epoch": 0.9028, - "grad_norm": 1.0784786429697546, - "k1_kl": 0.04931640625, - "k3_kl": 0.03277587890625, - "kimi_kl": 0.15185546875, - "learning_rate": 4.86e-08, - "loss": 0.0014, - "ppl": 0.01629638671875, - "reward": 0.9885652661323547, - "reward_std": 0.0016991156444419175, - "rewards/perpo_ocr_edit_distance_reward": 0.9885653257369995, + "advantages": -2.55448497910038e-08, + "completion_length": 12.0, + "delta_ref_entropy_loss": -0.314453125, + "delta_ref_ppl": -2.046875, + "entropy_loss": -1.203125, + "epoch": 0.4514, + "grad_norm": 29.82351970511392, + "k1_kl": 2.0625, + "k3_kl": 1.8203125, + "kimi_kl": 8.1875, + "learning_rate": 2.7429999999999996e-07, + "loss": 0.0729, + "ppl": 0.48828125, + "reward": 0.3262763023376465, + "reward_std": 0.2232019603252411, + "rewards/perpo_ocr_edit_distance_reward": 0.3262763023376465, "step": 2257, "temperature": 0.9 }, { - "advantages": -1.2568065358209424e-05, - "completion_length": 950.0, - "delta_ref_entropy_loss": 0.0301513671875, - "delta_ref_ppl": -0.01885986328125, - "entropy_loss": -0.043182373046875, - "epoch": 0.9032, - "grad_norm": 0.7403339559306916, - "k1_kl": 0.0189208984375, - "k3_kl": 0.0113067626953125, - "kimi_kl": 0.0211639404296875, - "learning_rate": 4.8399999999999997e-08, - "loss": 0.0005, - "ppl": 0.021068572998046875, - "reward": 0.9735190570354462, - "reward_std": 0.0018160910112783313, - "rewards/perpo_ocr_edit_distance_reward": 0.9735190868377686, + "advantages": -3.412791920709424e-05, + "completion_length": 450.0, + "delta_ref_entropy_loss": 0.03125, + "delta_ref_ppl": -0.052978515625, + "entropy_loss": -0.041259765625, + "epoch": 0.4516, + "grad_norm": 1.0932345393325833, + "k1_kl": 0.052978515625, + "k3_kl": 0.031005859375, + "kimi_kl": 0.07373046875, + "learning_rate": 2.742e-07, + "loss": 0.0013, + "ppl": 0.01190185546875, + "reward": 0.9854164719581604, + "reward_std": 0.002145275240764022, + "rewards/perpo_ocr_edit_distance_reward": 0.9854165315628052, "step": 2258, "temperature": 0.9 }, { - "advantages": -8.69972438977129e-05, - "completion_length": 757.0, - "delta_ref_entropy_loss": 0.09808349609375, - "delta_ref_ppl": -0.05242919921875, - "entropy_loss": -0.199249267578125, - "epoch": 0.9036, - "grad_norm": 2.6475184167469017, - "k1_kl": 0.05218505859375, - "k3_kl": 0.026611328125, - "kimi_kl": 0.04461669921875, - "learning_rate": 4.8199999999999995e-08, - "loss": 0.0012, - "ppl": 0.1144866943359375, - "reward": 0.9065592586994171, - "reward_std": 0.004925502747937571, - "rewards/perpo_ocr_edit_distance_reward": 0.9065592586994171, + "advantages": -2.0435878468560986e-05, + "completion_length": 158.0, + "delta_ref_entropy_loss": 0.0311279296875, + "delta_ref_ppl": -0.16015625, + "entropy_loss": -0.12353515625, + "epoch": 0.4518, + "grad_norm": 1.9931814694682826, + "k1_kl": 0.16015625, + "k3_kl": 0.134765625, + "kimi_kl": 0.48828125, + "learning_rate": 2.741e-07, + "loss": 0.0054, + "ppl": 0.057373046875, + "reward": 0.9610705375671387, + "reward_std": 0.001986610936000943, + "rewards/perpo_ocr_edit_distance_reward": 0.9610705971717834, "step": 2259, "temperature": 0.9 }, { - "advantages": -7.657494370505447e-05, - "completion_length": 943.5, - "delta_ref_entropy_loss": 0.04052734375, - "delta_ref_ppl": -0.0306396484375, - "entropy_loss": -0.03167724609375, - "epoch": 0.904, - "grad_norm": 0.93990291745199, - "k1_kl": 0.0306396484375, - "k3_kl": 0.017303466796875, - "kimi_kl": 0.03857421875, - "learning_rate": 4.8e-08, - "loss": 0.0008, - "ppl": 0.0169677734375, - "reward": 0.9697721302509308, - "reward_std": 0.0015314003976527601, - "rewards/perpo_ocr_edit_distance_reward": 0.969772219657898, + "advantages": -0.00016097146726679057, + "completion_length": 524.0, + "delta_ref_entropy_loss": 0.051513671875, + "delta_ref_ppl": -0.052490234375, + "entropy_loss": -0.01544189453125, + "epoch": 0.452, + "grad_norm": 0.22070733331364795, + "k1_kl": 0.052490234375, + "k3_kl": 0.037109375, + "kimi_kl": 0.12890625, + "learning_rate": 2.74e-07, + "loss": 0.0016, + "ppl": 0.003997802734375, + "reward": 0.9939758777618408, + "reward_std": 0.00021736817143391818, + "rewards/perpo_ocr_edit_distance_reward": 0.9939759373664856, "step": 2260, "temperature": 0.9 }, { - "advantages": -6.875821522456249e-05, - "completion_length": 652.0, - "delta_ref_entropy_loss": 0.074462890625, - "delta_ref_ppl": -0.0498046875, - "entropy_loss": -0.14227294921875, - "epoch": 0.9044, - "grad_norm": 1.0177605304939354, - "k1_kl": 0.050048828125, - "k3_kl": 0.03082275390625, - "kimi_kl": 0.0711669921875, - "learning_rate": 4.78e-08, - "loss": 0.0013, - "ppl": 0.080078125, - "reward": 0.7208257764577866, - "reward_std": 0.02969479163584765, - "rewards/perpo_ocr_edit_distance_reward": 0.7208258360624313, + "advantages": -0.00012690681614913046, + "completion_length": 424.0, + "delta_ref_entropy_loss": 0.08056640625, + "delta_ref_ppl": -0.119140625, + "entropy_loss": -0.041748046875, + "epoch": 0.4522, + "grad_norm": 0.38623993644019256, + "k1_kl": 0.11962890625, + "k3_kl": 0.07373046875, + "kimi_kl": 0.265625, + "learning_rate": 2.739e-07, + "loss": 0.0031, + "ppl": 0.010986328125, + "reward": 0.989381730556488, + "reward_std": 0.0003696416097227484, + "rewards/perpo_ocr_edit_distance_reward": 0.9893818497657776, "step": 2261, "temperature": 0.9 }, { - "advantages": -0.0001222150749526918, - "completion_length": 681.0, - "delta_ref_entropy_loss": 0.03759765625, - "delta_ref_ppl": -0.03289794921875, - "entropy_loss": -0.0269775390625, - "epoch": 0.9048, - "grad_norm": 0.4441424567711991, - "k1_kl": 0.03302001953125, - "k3_kl": 0.019775390625, - "kimi_kl": 0.06884765625, - "learning_rate": 4.76e-08, - "loss": 0.0009, - "ppl": 0.011199951171875, - "reward": 0.9975730776786804, - "reward_std": 0.00028327215113677084, - "rewards/perpo_ocr_edit_distance_reward": 0.9975731074810028, + "advantages": -1.1495181752252392e-05, + "completion_length": 193.0, + "delta_ref_entropy_loss": 0.07666015625, + "delta_ref_ppl": -0.103515625, + "entropy_loss": -0.0517578125, + "epoch": 0.4524, + "grad_norm": 1.0477041664941231, + "k1_kl": 0.103515625, + "k3_kl": 0.0634765625, + "kimi_kl": 0.17578125, + "learning_rate": 2.738e-07, + "loss": 0.0025, + "ppl": 0.01470947265625, + "reward": 0.9781954884529114, + "reward_std": 0.001380323781631887, + "rewards/perpo_ocr_edit_distance_reward": 0.9781954884529114, "step": 2262, "temperature": 0.9 }, { - "advantages": -0.0002980317388265874, - "completion_length": 338.5, - "delta_ref_entropy_loss": 0.04248046875, - "delta_ref_ppl": -0.0382080078125, - "entropy_loss": -0.03082275390625, - "epoch": 0.9052, - "grad_norm": 0.3149147624393035, - "k1_kl": 0.0384521484375, - "k3_kl": 0.02142333984375, - "kimi_kl": 0.06201171875, - "learning_rate": 4.7399999999999994e-08, - "loss": 0.0012, - "ppl": 0.0148162841796875, - "reward": 0.9411387741565704, - "reward_std": 0.0001642967399675399, - "rewards/perpo_ocr_edit_distance_reward": 0.9411388337612152, + "advantages": -4.0650367736816406e-05, + "completion_length": 517.0, + "delta_ref_entropy_loss": 0.07763671875, + "delta_ref_ppl": -0.095703125, + "entropy_loss": -0.08544921875, + "epoch": 0.4526, + "grad_norm": 1.7429272272853769, + "k1_kl": 0.095703125, + "k3_kl": 0.05859375, + "kimi_kl": 0.1591796875, + "learning_rate": 2.7369999999999997e-07, + "loss": 0.0024, + "ppl": 0.043701171875, + "reward": 0.7197474837303162, + "reward_std": 0.0022042531054466963, + "rewards/perpo_ocr_edit_distance_reward": 0.7197476029396057, "step": 2263, "temperature": 0.9 }, { - "advantages": -6.777900125598535e-06, - "completion_length": 813.0, - "delta_ref_entropy_loss": 0.08758544921875, - "delta_ref_ppl": -0.043670654296875, - "entropy_loss": -0.11688232421875, - "epoch": 0.9056, - "grad_norm": 0.9265533502405192, - "k1_kl": 0.043426513671875, - "k3_kl": 0.02069091796875, - "kimi_kl": 0.03497314453125, - "learning_rate": 4.72e-08, - "loss": 0.0008, - "ppl": 0.0623016357421875, - "reward": 0.9000407457351685, - "reward_std": 0.002779674716293812, - "rewards/perpo_ocr_edit_distance_reward": 0.9000408351421356, + "advantages": -1.5667507113903412e-06, + "completion_length": 420.0, + "delta_ref_entropy_loss": 0.07177734375, + "delta_ref_ppl": -0.10107421875, + "entropy_loss": -0.05908203125, + "epoch": 0.4528, + "grad_norm": 1.0461919378051674, + "k1_kl": 0.10107421875, + "k3_kl": 0.06591796875, + "kimi_kl": 0.1630859375, + "learning_rate": 2.736e-07, + "loss": 0.0026, + "ppl": 0.027099609375, + "reward": 0.961090087890625, + "reward_std": 0.026873577386140823, + "rewards/perpo_ocr_edit_distance_reward": 0.961090087890625, "step": 2264, "temperature": 0.9 }, { - "advantages": -2.111707544827368e-06, - "completion_length": 280.0, - "delta_ref_entropy_loss": 0.075927734375, - "delta_ref_ppl": -0.047119140625, - "entropy_loss": -0.05291748046875, - "epoch": 0.906, - "grad_norm": 0.707730789721044, - "k1_kl": 0.047119140625, - "k3_kl": 0.02459716796875, - "kimi_kl": 0.060302734375, - "learning_rate": 4.7e-08, - "loss": 0.001, - "ppl": 0.023193359375, - "reward": 0.9947991967201233, - "reward_std": 0.00297123403288424, - "rewards/perpo_ocr_edit_distance_reward": 0.9947992265224457, + "advantages": -2.5357519916724414e-05, + "completion_length": 831.0, + "delta_ref_entropy_loss": 0.06982421875, + "delta_ref_ppl": -0.08984375, + "entropy_loss": -0.07421875, + "epoch": 0.453, + "grad_norm": 2.025713014299706, + "k1_kl": 0.08935546875, + "k3_kl": 0.052490234375, + "kimi_kl": 0.1162109375, + "learning_rate": 2.735e-07, + "loss": 0.0021, + "ppl": 0.033935546875, + "reward": 0.5271521210670471, + "reward_std": 0.0012423773296177387, + "rewards/perpo_ocr_edit_distance_reward": 0.5271521806716919, "step": 2265, "temperature": 0.9 }, { - "advantages": -1.7847334675025195e-05, - "completion_length": 565.0, - "delta_ref_entropy_loss": 0.02960205078125, - "delta_ref_ppl": -0.022918701171875, - "entropy_loss": -0.022003173828125, - "epoch": 0.9064, - "grad_norm": 0.4690381789482955, - "k1_kl": 0.023040771484375, - "k3_kl": 0.0137481689453125, - "kimi_kl": 0.03045654296875, - "learning_rate": 4.68e-08, - "loss": 0.0006, - "ppl": 0.00982666015625, - "reward": 0.9962963759899139, - "reward_std": 0.003138175467029214, - "rewards/perpo_ocr_edit_distance_reward": 0.9962964653968811, + "advantages": -1.6263553561657318e-06, + "completion_length": 434.0, + "delta_ref_entropy_loss": 0.119140625, + "delta_ref_ppl": -0.125, + "entropy_loss": -0.173828125, + "epoch": 0.4532, + "grad_norm": 1.7036379261204888, + "k1_kl": 0.125, + "k3_kl": 0.07763671875, + "kimi_kl": 0.205078125, + "learning_rate": 2.7339999999999995e-07, + "loss": 0.0031, + "ppl": 0.07421875, + "reward": 0.9129076600074768, + "reward_std": 0.02621147781610489, + "rewards/perpo_ocr_edit_distance_reward": 0.9129077196121216, "step": 2266, "temperature": 0.9 }, { - "advantages": -1.674890495451109e-05, - "completion_length": 517.0, - "delta_ref_entropy_loss": 0.080322265625, - "delta_ref_ppl": -0.0557861328125, - "entropy_loss": -0.0751953125, - "epoch": 0.9068, - "grad_norm": 0.7961137681033275, - "k1_kl": 0.0557861328125, - "k3_kl": 0.03314208984375, - "kimi_kl": 0.076416015625, - "learning_rate": 4.66e-08, - "loss": 0.0013, - "ppl": 0.0380859375, - "reward": 0.9699920117855072, - "reward_std": 0.001931265345774591, - "rewards/perpo_ocr_edit_distance_reward": 0.969992071390152, - "step": 2267, - "temperature": 0.9 + "advantages": -3.665685881060199e-06, + "completion_length": 468.0, + "delta_ref_entropy_loss": 0.0267333984375, + "delta_ref_ppl": -0.06787109375, + "entropy_loss": -0.025390625, + "epoch": 0.4534, + "grad_norm": 0.6114740959905036, + "k1_kl": 0.06787109375, + "k3_kl": 0.047119140625, + "kimi_kl": 0.1611328125, + "learning_rate": 2.733e-07, + "loss": 0.0019, + "ppl": 0.00909423828125, + "reward": 0.9952039122581482, + "reward_std": 0.002213329542428255, + "rewards/perpo_ocr_edit_distance_reward": 0.9952038526535034, + "step": 2267, + "temperature": 0.9 }, { - "advantages": -2.1900450519751757e-05, - "completion_length": 805.5, - "delta_ref_entropy_loss": 0.04901123046875, - "delta_ref_ppl": -0.0384521484375, - "entropy_loss": -0.043365478515625, - "epoch": 0.9072, - "grad_norm": 0.9150369036417625, - "k1_kl": 0.038330078125, - "k3_kl": 0.0224609375, - "kimi_kl": 0.0684814453125, - "learning_rate": 4.639999999999999e-08, - "loss": 0.0009, - "ppl": 0.0231170654296875, - "reward": 0.9885299503803253, - "reward_std": 0.002670868707355112, - "rewards/perpo_ocr_edit_distance_reward": 0.9885299801826477, + "advantages": -0.0005960464477539062, + "completion_length": 402.0, + "delta_ref_entropy_loss": 0.049560546875, + "delta_ref_ppl": -0.0771484375, + "entropy_loss": -0.025634765625, + "epoch": 0.4536, + "grad_norm": 0.07087888785582268, + "k1_kl": 0.0771484375, + "k3_kl": 0.045654296875, + "kimi_kl": 0.1015625, + "learning_rate": 2.732e-07, + "loss": 0.0024, + "ppl": 0.0091552734375, + "reward": 0.993220329284668, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9932203888893127, "step": 2268, "temperature": 0.9 }, { - "advantages": -1.5241759570017166e-05, - "completion_length": 268.5, - "delta_ref_entropy_loss": 0.058837890625, - "delta_ref_ppl": -0.1474609375, - "entropy_loss": -0.1015625, - "epoch": 0.9076, - "grad_norm": 2.515375187959843, - "k1_kl": 0.14697265625, - "k3_kl": 0.1109619140625, - "kimi_kl": 0.490234375, - "learning_rate": 4.62e-08, - "loss": 0.0045, - "ppl": 0.06103515625, - "reward": 0.918388158082962, - "reward_std": 0.07113773329183459, - "rewards/perpo_ocr_edit_distance_reward": 0.9183882474899292, + "advantages": -2.047845373454038e-05, + "completion_length": 536.0, + "delta_ref_entropy_loss": 0.0194091796875, + "delta_ref_ppl": -0.043212890625, + "entropy_loss": -0.041015625, + "epoch": 0.4538, + "grad_norm": 1.0239808673443047, + "k1_kl": 0.04296875, + "k3_kl": 0.03125, + "kimi_kl": 0.0947265625, + "learning_rate": 2.731e-07, + "loss": 0.0013, + "ppl": 0.015869140625, + "reward": 0.9921656847000122, + "reward_std": 0.0015624056104570627, + "rewards/perpo_ocr_edit_distance_reward": 0.992165744304657, "step": 2269, "temperature": 0.9 }, { - "advantages": -4.3128219431309844e-06, - "completion_length": 400.0, - "delta_ref_entropy_loss": 0.0648193359375, - "delta_ref_ppl": -0.068359375, - "entropy_loss": -0.135009765625, - "epoch": 0.908, - "grad_norm": 1.5663371774670534, - "k1_kl": 0.0682373046875, - "k3_kl": 0.0491943359375, - "kimi_kl": 0.11572265625, - "learning_rate": 4.5999999999999995e-08, - "loss": 0.002, - "ppl": 0.087432861328125, - "reward": 0.788530707359314, - "reward_std": 0.015426317695528269, - "rewards/perpo_ocr_edit_distance_reward": 0.7885307967662811, + "advantages": -1.3385501006268896e-05, + "completion_length": 220.0, + "delta_ref_entropy_loss": 0.07763671875, + "delta_ref_ppl": -0.138671875, + "entropy_loss": -0.11669921875, + "epoch": 0.454, + "grad_norm": 2.121038432019091, + "k1_kl": 0.138671875, + "k3_kl": 0.1103515625, + "kimi_kl": 0.298828125, + "learning_rate": 2.73e-07, + "loss": 0.0044, + "ppl": 0.06640625, + "reward": 0.9859874248504639, + "reward_std": 0.0037135297898203135, + "rewards/perpo_ocr_edit_distance_reward": 0.9859875440597534, "step": 2270, "temperature": 0.9 }, { - "advantages": 1.069477730197832e-05, - "completion_length": 571.0, - "delta_ref_entropy_loss": 0.052490234375, - "delta_ref_ppl": -0.0369873046875, - "entropy_loss": -0.03973388671875, - "epoch": 0.9084, - "grad_norm": 0.741724337747339, - "k1_kl": 0.0369873046875, - "k3_kl": 0.02178955078125, - "kimi_kl": 0.0518798828125, - "learning_rate": 4.58e-08, - "loss": 0.0009, - "ppl": 0.02069091796875, - "reward": 0.8488238751888275, - "reward_std": 0.0011703906930051744, - "rewards/perpo_ocr_edit_distance_reward": 0.8488239049911499, + "advantages": 1.2704304936050903e-05, + "completion_length": 575.0, + "delta_ref_entropy_loss": 0.07080078125, + "delta_ref_ppl": -0.08154296875, + "entropy_loss": -0.05615234375, + "epoch": 0.4542, + "grad_norm": 0.49198587645089575, + "k1_kl": 0.08154296875, + "k3_kl": 0.046142578125, + "kimi_kl": 0.1396484375, + "learning_rate": 2.7289999999999996e-07, + "loss": 0.0018, + "ppl": 0.0220947265625, + "reward": 0.9903976917266846, + "reward_std": 0.0005691255209967494, + "rewards/perpo_ocr_edit_distance_reward": 0.9903976321220398, "step": 2271, "temperature": 0.9 }, { - "advantages": 5.389963234847528e-06, - "completion_length": 340.0, - "delta_ref_entropy_loss": 0.0347900390625, - "delta_ref_ppl": -0.0498046875, - "entropy_loss": -0.0296630859375, - "epoch": 0.9088, - "grad_norm": 1.598329323413791, - "k1_kl": 0.0498046875, - "k3_kl": 0.02850341796875, - "kimi_kl": 0.05413818359375, - "learning_rate": 4.56e-08, - "loss": 0.0011, - "ppl": 0.0124664306640625, - "reward": 0.9948935210704803, - "reward_std": 0.0015319702215492725, - "rewards/perpo_ocr_edit_distance_reward": 0.9948936104774475, + "advantages": -0.00010758639109553769, + "completion_length": 733.0, + "delta_ref_entropy_loss": 0.054931640625, + "delta_ref_ppl": -0.05419921875, + "entropy_loss": -0.0439453125, + "epoch": 0.4544, + "grad_norm": 0.3942883653663182, + "k1_kl": 0.05419921875, + "k3_kl": 0.031982421875, + "kimi_kl": 0.09375, + "learning_rate": 2.7279999999999995e-07, + "loss": 0.0014, + "ppl": 0.017333984375, + "reward": 0.9681751132011414, + "reward_std": 0.000533111859112978, + "rewards/perpo_ocr_edit_distance_reward": 0.9681752324104309, "step": 2272, "temperature": 0.9 }, { - "advantages": -1.9924982126440227e-06, - "completion_length": 1220.0, - "delta_ref_entropy_loss": -0.0126953125, - "delta_ref_ppl": -0.019775390625, - "entropy_loss": -0.20263671875, - "epoch": 0.9092, - "grad_norm": 1.5090236827859473, - "k1_kl": 0.01971435546875, - "k3_kl": 0.01678466796875, - "kimi_kl": 0.0345458984375, - "learning_rate": 4.54e-08, - "loss": 0.0007, - "ppl": 0.126708984375, - "reward": 0.802677720785141, - "reward_std": 0.14173088362440467, - "rewards/perpo_ocr_edit_distance_reward": 0.8026778101921082, + "advantages": -2.997262345161289e-05, + "completion_length": 693.0, + "delta_ref_entropy_loss": 0.048095703125, + "delta_ref_ppl": -0.050537109375, + "entropy_loss": -0.06396484375, + "epoch": 0.4546, + "grad_norm": 0.7223697911797777, + "k1_kl": 0.05078125, + "k3_kl": 0.0299072265625, + "kimi_kl": 0.07177734375, + "learning_rate": 2.727e-07, + "loss": 0.0012, + "ppl": 0.0267333984375, + "reward": 0.863728940486908, + "reward_std": 0.0007521773222833872, + "rewards/perpo_ocr_edit_distance_reward": 0.8637290000915527, "step": 2273, "temperature": 0.9 }, { - "advantages": -0.00014896052744006738, - "completion_length": 574.0, - "delta_ref_entropy_loss": 0.01812744140625, - "delta_ref_ppl": -0.0089263916015625, - "entropy_loss": -0.02313232421875, - "epoch": 0.9096, - "grad_norm": 0.5295521252698262, - "k1_kl": 0.0089569091796875, - "k3_kl": 0.00450897216796875, - "kimi_kl": 0.00759124755859375, - "learning_rate": 4.5199999999999994e-08, - "loss": 0.0003, - "ppl": 0.01165771484375, - "reward": 0.999684602022171, - "reward_std": 0.0002771668223431334, - "rewards/perpo_ocr_edit_distance_reward": 0.9996846616268158, + "advantages": 1.3751643564319238e-05, + "completion_length": 317.0, + "delta_ref_entropy_loss": 0.037841796875, + "delta_ref_ppl": -0.10107421875, + "entropy_loss": -0.0341796875, + "epoch": 0.4548, + "grad_norm": 0.8565649095006689, + "k1_kl": 0.10107421875, + "k3_kl": 0.0712890625, + "kimi_kl": 0.263671875, + "learning_rate": 2.726e-07, + "loss": 0.0028, + "ppl": 0.011474609375, + "reward": 0.9842146635055542, + "reward_std": 0.0017574660014361143, + "rewards/perpo_ocr_edit_distance_reward": 0.9842146635055542, "step": 2274, "temperature": 0.9 }, { - "advantages": -3.360850467970522e-05, - "completion_length": 1384.0, - "delta_ref_entropy_loss": 0.0242919921875, - "delta_ref_ppl": -0.019378662109375, - "entropy_loss": -0.046142578125, - "epoch": 0.91, - "grad_norm": 3.5760983537330766, - "k1_kl": 0.01934814453125, - "k3_kl": 0.0135040283203125, - "kimi_kl": 0.036865234375, - "learning_rate": 4.5e-08, - "loss": 0.0006, - "ppl": 0.024383544921875, - "reward": 0.9532931447029114, - "reward_std": 0.013955237431218848, - "rewards/perpo_ocr_edit_distance_reward": 0.9532932043075562, + "advantages": -6.48839159111958e-06, + "completion_length": 81.0, + "delta_ref_entropy_loss": 0.03857421875, + "delta_ref_ppl": -0.484375, + "entropy_loss": -0.1259765625, + "epoch": 0.455, + "grad_norm": 3.3903708591817776, + "k1_kl": 0.484375, + "k3_kl": 0.400390625, + "kimi_kl": 1.9765625, + "learning_rate": 2.725e-07, + "loss": 0.016, + "ppl": 0.05517578125, + "reward": 0.9741495847702026, + "reward_std": 0.006461592856794596, + "rewards/perpo_ocr_edit_distance_reward": 0.9741497039794922, "step": 2275, "temperature": 0.9 }, { - "advantages": -4.3570998968789354e-05, - "completion_length": 320.5, - "delta_ref_entropy_loss": 0.03765869140625, - "delta_ref_ppl": -0.04443359375, - "entropy_loss": -0.03399658203125, - "epoch": 0.9104, - "grad_norm": 0.4882374370607488, - "k1_kl": 0.04443359375, - "k3_kl": 0.03033447265625, - "kimi_kl": 0.083251953125, - "learning_rate": 4.48e-08, - "loss": 0.0013, - "ppl": 0.0174560546875, - "reward": 0.9911482036113739, - "reward_std": 0.0002920237893704325, - "rewards/perpo_ocr_edit_distance_reward": 0.9911482334136963, + "advantages": -0.0005960464477539062, + "completion_length": 478.0, + "delta_ref_entropy_loss": 0.045166015625, + "delta_ref_ppl": -0.052978515625, + "entropy_loss": -0.0240478515625, + "epoch": 0.4552, + "grad_norm": 0.015650690144843556, + "k1_kl": 0.052978515625, + "k3_kl": 0.0361328125, + "kimi_kl": 0.142578125, + "learning_rate": 2.724e-07, + "loss": 0.002, + "ppl": 0.005523681640625, + "reward": 0.9953746795654297, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9953747391700745, "step": 2276, "temperature": 0.9 }, { - "advantages": -6.850276884051709e-05, - "completion_length": 566.5, - "delta_ref_entropy_loss": 0.0760498046875, - "delta_ref_ppl": -0.0562744140625, - "entropy_loss": -0.047119140625, - "epoch": 0.9108, - "grad_norm": 1.3810387228352605, - "k1_kl": 0.05615234375, - "k3_kl": 0.02777099609375, - "kimi_kl": 0.0667724609375, - "learning_rate": 4.4599999999999996e-08, - "loss": 0.0012, - "ppl": 0.02001953125, - "reward": 0.9671192765235901, - "reward_std": 0.00475443153118249, - "rewards/perpo_ocr_edit_distance_reward": 0.9671193063259125, + "advantages": 0.0, + "completion_length": 620.0, + "delta_ref_entropy_loss": 0.032470703125, + "delta_ref_ppl": -0.0439453125, + "entropy_loss": -0.01422119140625, + "epoch": 0.4554, + "grad_norm": 0.02850222427766021, + "k1_kl": 0.043701171875, + "k3_kl": 0.0263671875, + "kimi_kl": 0.0791015625, + "learning_rate": 2.7229999999999997e-07, + "loss": 0.0011, + "ppl": 0.0030975341796875, + "reward": 0.999651312828064, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.999651312828064, "step": 2277, "temperature": 0.9 }, { - "advantages": -7.964883533873035e-05, - "completion_length": 292.5, - "delta_ref_entropy_loss": 0.041259765625, - "delta_ref_ppl": -0.080322265625, - "entropy_loss": -0.055419921875, - "epoch": 0.9112, - "grad_norm": 1.3740607966246532, - "k1_kl": 0.080078125, - "k3_kl": 0.059326171875, - "kimi_kl": 0.23876953125, - "learning_rate": 4.44e-08, - "loss": 0.0024, - "ppl": 0.02978515625, - "reward": 0.9275204241275787, - "reward_std": 0.04109415860148147, - "rewards/perpo_ocr_edit_distance_reward": 0.9275205135345459, + "advantages": 1.021793991640152e-07, + "completion_length": 286.0, + "delta_ref_entropy_loss": -0.01470947265625, + "delta_ref_ppl": -0.2314453125, + "entropy_loss": -0.48828125, + "epoch": 0.4556, + "grad_norm": 5.748220056429929, + "k1_kl": 0.2314453125, + "k3_kl": 0.177734375, + "kimi_kl": 0.48828125, + "learning_rate": 2.7219999999999996e-07, + "loss": 0.0071, + "ppl": 0.259765625, + "reward": 0.7305366396903992, + "reward_std": 0.16323114931583405, + "rewards/perpo_ocr_edit_distance_reward": 0.7305366396903992, "step": 2278, "temperature": 0.9 }, { - "advantages": -0.00012859276830567978, - "completion_length": 790.5, - "delta_ref_entropy_loss": 0.0458984375, - "delta_ref_ppl": -0.02496337890625, - "entropy_loss": -0.0374755859375, - "epoch": 0.9116, - "grad_norm": 0.4487793011234075, - "k1_kl": 0.02496337890625, - "k3_kl": 0.01373291015625, - "kimi_kl": 0.02734375, - "learning_rate": 4.42e-08, - "loss": 0.0007, - "ppl": 0.0162811279296875, - "reward": 0.9747192561626434, - "reward_std": 0.0004196263325866312, - "rewards/perpo_ocr_edit_distance_reward": 0.9747192859649658, + "advantages": 4.19872158090584e-05, + "completion_length": 933.0, + "delta_ref_entropy_loss": 0.0250244140625, + "delta_ref_ppl": -0.044921875, + "entropy_loss": -0.034912109375, + "epoch": 0.4558, + "grad_norm": 0.3752077707865294, + "k1_kl": 0.044921875, + "k3_kl": 0.027587890625, + "kimi_kl": 0.06884765625, + "learning_rate": 2.721e-07, + "loss": 0.0011, + "ppl": 0.0135498046875, + "reward": 0.9912535548210144, + "reward_std": 0.0007114307372830808, + "rewards/perpo_ocr_edit_distance_reward": 0.9912536144256592, "step": 2279, "temperature": 0.9 }, { - "advantages": -0.00010284355812473223, - "completion_length": 681.5, - "delta_ref_entropy_loss": 0.028076171875, - "delta_ref_ppl": -0.02508544921875, - "entropy_loss": -0.02685546875, - "epoch": 0.912, - "grad_norm": 0.38212220775291655, - "k1_kl": 0.02508544921875, - "k3_kl": 0.01641845703125, - "kimi_kl": 0.058319091796875, - "learning_rate": 4.4e-08, - "loss": 0.0008, - "ppl": 0.01324462890625, - "reward": 0.9152026176452637, - "reward_std": 0.04712995622685412, - "rewards/perpo_ocr_edit_distance_reward": 0.9152026474475861, + "advantages": -2.3305417926167138e-05, + "completion_length": 829.0, + "delta_ref_entropy_loss": 0.08154296875, + "delta_ref_ppl": -0.0732421875, + "entropy_loss": -0.060546875, + "epoch": 0.456, + "grad_norm": 0.8176986837723856, + "k1_kl": 0.0732421875, + "k3_kl": 0.038330078125, + "kimi_kl": 0.1142578125, + "learning_rate": 2.72e-07, + "loss": 0.0016, + "ppl": 0.0255126953125, + "reward": 0.9908081889152527, + "reward_std": 0.0006307983421720564, + "rewards/perpo_ocr_edit_distance_reward": 0.9908082485198975, "step": 2280, "temperature": 0.9 }, { - "advantages": -4.6295783249661326e-05, - "completion_length": 678.0, - "delta_ref_entropy_loss": 0.03558349609375, - "delta_ref_ppl": -0.03363037109375, - "entropy_loss": -0.03021240234375, - "epoch": 0.9124, - "grad_norm": 0.5547956009466266, - "k1_kl": 0.03363037109375, - "k3_kl": 0.020263671875, - "kimi_kl": 0.0511474609375, - "learning_rate": 4.3799999999999995e-08, - "loss": 0.0009, - "ppl": 0.014739990234375, - "reward": 0.9983121752738953, - "reward_std": 0.000528254036908038, - "rewards/perpo_ocr_edit_distance_reward": 0.9983121454715729, + "advantages": -0.00010480199853191152, + "completion_length": 700.0, + "delta_ref_entropy_loss": 0.0311279296875, + "delta_ref_ppl": -0.033447265625, + "entropy_loss": -0.0218505859375, + "epoch": 0.4562, + "grad_norm": 0.2977666164499529, + "k1_kl": 0.033447265625, + "k3_kl": 0.023681640625, + "kimi_kl": 0.054931640625, + "learning_rate": 2.7189999999999994e-07, + "loss": 0.0011, + "ppl": 0.01055908203125, + "reward": 0.988461434841156, + "reward_std": 0.0007123997202143073, + "rewards/perpo_ocr_edit_distance_reward": 0.9884615540504456, "step": 2281, "temperature": 0.9 }, { - "advantages": -0.00011599915887927637, - "completion_length": 425.5, - "delta_ref_entropy_loss": 0.04931640625, - "delta_ref_ppl": -0.03314208984375, - "entropy_loss": -0.04071044921875, - "epoch": 0.9128, - "grad_norm": 1.3469697337800872, - "k1_kl": 0.03314208984375, - "k3_kl": 0.019012451171875, - "kimi_kl": 0.065826416015625, - "learning_rate": 4.36e-08, - "loss": 0.0009, - "ppl": 0.022552490234375, - "reward": 0.9927657246589661, - "reward_std": 0.0006161568890092894, - "rewards/perpo_ocr_edit_distance_reward": 0.9927658140659332, + "advantages": -3.0006682209204882e-05, + "completion_length": 479.0, + "delta_ref_entropy_loss": 0.04443359375, + "delta_ref_ppl": -0.06640625, + "entropy_loss": -0.037353515625, + "epoch": 0.4564, + "grad_norm": 0.4836669661482361, + "k1_kl": 0.06689453125, + "k3_kl": 0.043701171875, + "kimi_kl": 0.130859375, + "learning_rate": 2.718e-07, + "loss": 0.0018, + "ppl": 0.01251220703125, + "reward": 0.9953360557556152, + "reward_std": 0.0007513786549679935, + "rewards/perpo_ocr_edit_distance_reward": 0.9953361749649048, "step": 2282, "temperature": 0.9 }, { - "advantages": -2.2734915546607226e-05, - "completion_length": 243.0, - "delta_ref_entropy_loss": 0.065673828125, - "delta_ref_ppl": -0.04766845703125, - "entropy_loss": -0.051025390625, - "epoch": 0.9132, - "grad_norm": 0.8576035994510349, - "k1_kl": 0.04766845703125, - "k3_kl": 0.02801513671875, - "kimi_kl": 0.0823974609375, - "learning_rate": 4.34e-08, - "loss": 0.0011, - "ppl": 0.02426910400390625, - "reward": 0.9733283221721649, - "reward_std": 0.0006052798707969487, - "rewards/perpo_ocr_edit_distance_reward": 0.9733283817768097, + "advantages": -2.183233118557837e-05, + "completion_length": 167.0, + "delta_ref_entropy_loss": 0.06787109375, + "delta_ref_ppl": -0.1572265625, + "entropy_loss": -0.0595703125, + "epoch": 0.4566, + "grad_norm": 1.2810455673286572, + "k1_kl": 0.1572265625, + "k3_kl": 0.11474609375, + "kimi_kl": 0.4453125, + "learning_rate": 2.717e-07, + "loss": 0.0046, + "ppl": 0.017578125, + "reward": 0.965728759765625, + "reward_std": 0.004577403888106346, + "rewards/perpo_ocr_edit_distance_reward": 0.9657288193702698, "step": 2283, "temperature": 0.9 }, { - "advantages": -2.056360426649917e-05, - "completion_length": 203.5, - "delta_ref_entropy_loss": 0.06182861328125, - "delta_ref_ppl": -0.126220703125, - "entropy_loss": -0.0633544921875, - "epoch": 0.9136, - "grad_norm": 3.4532702518903458, - "k1_kl": 0.1265869140625, - "k3_kl": 0.1015625, - "kimi_kl": 0.57080078125, - "learning_rate": 4.32e-08, - "loss": 0.0041, - "ppl": 0.031097412109375, - "reward": 0.9976630806922913, - "reward_std": 0.0033082511217799038, - "rewards/perpo_ocr_edit_distance_reward": 0.9976631700992584, + "advantages": 1.4645713235950097e-05, + "completion_length": 165.0, + "delta_ref_entropy_loss": -0.01031494140625, + "delta_ref_ppl": -0.138671875, + "entropy_loss": -0.05126953125, + "epoch": 0.4568, + "grad_norm": 1.4477940767426511, + "k1_kl": 0.138671875, + "k3_kl": 0.11328125, + "kimi_kl": 0.59765625, + "learning_rate": 2.7159999999999997e-07, + "loss": 0.0045, + "ppl": 0.02294921875, + "reward": 0.9903344511985779, + "reward_std": 0.001644100178964436, + "rewards/perpo_ocr_edit_distance_reward": 0.9903343915939331, "step": 2284, "temperature": 0.9 }, { - "advantages": -1.1410032470848819e-05, - "completion_length": 571.0, - "delta_ref_entropy_loss": 0.0374755859375, - "delta_ref_ppl": -0.02789306640625, - "entropy_loss": -0.0400390625, - "epoch": 0.914, - "grad_norm": 2.4818441092599515, - "k1_kl": 0.02789306640625, - "k3_kl": 0.016937255859375, - "kimi_kl": 0.040771484375, - "learning_rate": 4.2999999999999995e-08, - "loss": 0.0007, - "ppl": 0.024017333984375, - "reward": 0.9863428175449371, - "reward_std": 0.00468395696952939, - "rewards/perpo_ocr_edit_distance_reward": 0.9863428771495819, + "advantages": -1.3623919414840202e-07, + "completion_length": 762.0, + "delta_ref_entropy_loss": 0.06201171875, + "delta_ref_ppl": -0.12353515625, + "entropy_loss": -0.283203125, + "epoch": 0.457, + "grad_norm": 2.1040204317587867, + "k1_kl": 0.12353515625, + "k3_kl": 0.0849609375, + "kimi_kl": 0.1865234375, + "learning_rate": 2.715e-07, + "loss": 0.0034, + "ppl": 0.142578125, + "reward": 0.7010442614555359, + "reward_std": 0.05828561633825302, + "rewards/perpo_ocr_edit_distance_reward": 0.7010442614555359, "step": 2285, "temperature": 0.9 }, { "advantages": 0.0, - "completion_length": 578.5, - "delta_ref_entropy_loss": 0.0208740234375, - "delta_ref_ppl": -0.01763916015625, - "entropy_loss": -0.013031005859375, - "epoch": 0.9144, - "grad_norm": 0.008030393429585244, - "k1_kl": 0.017578125, - "k3_kl": 0.0118255615234375, - "kimi_kl": 0.05615234375, - "learning_rate": 4.279999999999999e-08, - "loss": 0.0005, - "ppl": 0.00421905517578125, - "reward": 1.0, + "completion_length": 356.0, + "delta_ref_entropy_loss": 0.0301513671875, + "delta_ref_ppl": -0.09326171875, + "entropy_loss": -0.0211181640625, + "epoch": 0.4572, + "grad_norm": 0.012584428171790094, + "k1_kl": 0.09326171875, + "k3_kl": 0.07177734375, + "kimi_kl": 0.3046875, + "learning_rate": 2.7139999999999996e-07, + "loss": 0.0029, + "ppl": 0.004180908203125, + "reward": 0.790515661239624, "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "rewards/perpo_ocr_edit_distance_reward": 0.790515661239624, "step": 2286, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 579.0, - "delta_ref_entropy_loss": 0.01861572265625, - "delta_ref_ppl": -0.01934814453125, - "entropy_loss": -0.013397216796875, - "epoch": 0.9148, - "grad_norm": 0.006459157298671475, - "k1_kl": 0.0194091796875, - "k3_kl": 0.013214111328125, - "kimi_kl": 0.0504150390625, - "learning_rate": 4.26e-08, - "loss": 0.0005, - "ppl": 0.0048675537109375, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": -9.34984054765664e-05, + "completion_length": 951.0, + "delta_ref_entropy_loss": 0.0458984375, + "delta_ref_ppl": -0.05419921875, + "entropy_loss": -0.037841796875, + "epoch": 0.4574, + "grad_norm": 0.2487323023846639, + "k1_kl": 0.053955078125, + "k3_kl": 0.0322265625, + "kimi_kl": 0.09619140625, + "learning_rate": 2.7129999999999995e-07, + "loss": 0.0014, + "ppl": 0.0159912109375, + "reward": 0.9944815039634705, + "reward_std": 0.0005375570617616177, + "rewards/perpo_ocr_edit_distance_reward": 0.99448162317276, "step": 2287, "temperature": 0.9 }, { - "advantages": -4.287277079129126e-06, - "completion_length": 1241.0, - "delta_ref_entropy_loss": 0.018310546875, - "delta_ref_ppl": -0.014404296875, - "entropy_loss": -0.0279541015625, - "epoch": 0.9152, - "grad_norm": 0.560837797208004, - "k1_kl": 0.014434814453125, - "k3_kl": 0.008575439453125, - "kimi_kl": 0.019927978515625, - "learning_rate": 4.2399999999999996e-08, - "loss": 0.0003, - "ppl": 0.01434326171875, - "reward": 0.9952456951141357, - "reward_std": 0.0024354164488613605, - "rewards/perpo_ocr_edit_distance_reward": 0.9952457547187805, + "advantages": -1.6825541024445556e-05, + "completion_length": 902.0, + "delta_ref_entropy_loss": 0.04443359375, + "delta_ref_ppl": -0.053466796875, + "entropy_loss": -0.054931640625, + "epoch": 0.4576, + "grad_norm": 3.077851379218889, + "k1_kl": 0.05322265625, + "k3_kl": 0.0439453125, + "kimi_kl": 0.09375, + "learning_rate": 2.712e-07, + "loss": 0.0018, + "ppl": 0.026123046875, + "reward": 0.9651581048965454, + "reward_std": 0.0014177000848576427, + "rewards/perpo_ocr_edit_distance_reward": 0.9651581645011902, "step": 2288, "temperature": 0.9 }, { - "advantages": -8.586475451011211e-05, - "completion_length": 661.0, - "delta_ref_entropy_loss": 0.0274658203125, - "delta_ref_ppl": -0.0211181640625, - "entropy_loss": -0.015106201171875, - "epoch": 0.9156, - "grad_norm": 0.13230493245404532, - "k1_kl": 0.0211181640625, - "k3_kl": 0.01263427734375, - "kimi_kl": 0.03759765625, - "learning_rate": 4.22e-08, - "loss": 0.0006, - "ppl": 0.0047149658203125, - "reward": 0.9994609951972961, - "reward_std": 4.917019032291137e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9994609951972961, + "advantages": -8.044924470596015e-05, + "completion_length": 721.0, + "delta_ref_entropy_loss": 0.041015625, + "delta_ref_ppl": -0.051025390625, + "entropy_loss": -0.0238037109375, + "epoch": 0.4578, + "grad_norm": 0.1518461556599227, + "k1_kl": 0.05126953125, + "k3_kl": 0.0296630859375, + "kimi_kl": 0.10107421875, + "learning_rate": 2.711e-07, + "loss": 0.0013, + "ppl": 0.00634765625, + "reward": 0.9967072010040283, + "reward_std": 0.00011170162906637415, + "rewards/perpo_ocr_edit_distance_reward": 0.9967072010040283, "step": 2289, "temperature": 0.9 }, { - "advantages": -5.10896995820076e-08, - "completion_length": 297.5, - "delta_ref_entropy_loss": 0.0518798828125, - "delta_ref_ppl": -0.09326171875, - "entropy_loss": -0.0462646484375, - "epoch": 0.916, - "grad_norm": 1.0183964868302138, - "k1_kl": 0.0933837890625, - "k3_kl": 0.072021484375, - "kimi_kl": 0.326416015625, - "learning_rate": 4.2e-08, - "loss": 0.0029, - "ppl": 0.02099609375, - "reward": 0.7233161479234695, - "reward_std": 0.015975182875990868, - "rewards/perpo_ocr_edit_distance_reward": 0.7233161628246307, + "advantages": -8.834260370349512e-05, + "completion_length": 430.0, + "delta_ref_entropy_loss": 0.041748046875, + "delta_ref_ppl": -0.052734375, + "entropy_loss": -0.02587890625, + "epoch": 0.458, + "grad_norm": 0.2719934711513509, + "k1_kl": 0.052734375, + "k3_kl": 0.034912109375, + "kimi_kl": 0.10498046875, + "learning_rate": 2.7100000000000003e-07, + "loss": 0.0015, + "ppl": 0.0072021484375, + "reward": 0.9915629625320435, + "reward_std": 0.00018917504348792136, + "rewards/perpo_ocr_edit_distance_reward": 0.9915630221366882, "step": 2290, "temperature": 0.9 }, { - "advantages": -1.2772424362594847e-07, - "completion_length": 348.5, - "delta_ref_entropy_loss": 0.0616455078125, - "delta_ref_ppl": -0.0784912109375, - "entropy_loss": -0.07159423828125, - "epoch": 0.9164, - "grad_norm": 0.9294480911513907, - "k1_kl": 0.0787353515625, - "k3_kl": 0.05419921875, - "kimi_kl": 0.17333984375, - "learning_rate": 4.18e-08, - "loss": 0.0022, - "ppl": 0.03363037109375, - "reward": 0.9012682139873505, - "reward_std": 0.038569083757465705, - "rewards/perpo_ocr_edit_distance_reward": 0.9012682437896729, + "advantages": -1.2516975402832031e-06, + "completion_length": 452.0, + "delta_ref_entropy_loss": 0.048828125, + "delta_ref_ppl": -0.1259765625, + "entropy_loss": -0.06201171875, + "epoch": 0.4582, + "grad_norm": 1.1088945193838595, + "k1_kl": 0.1259765625, + "k3_kl": 0.0927734375, + "kimi_kl": 0.369140625, + "learning_rate": 2.7089999999999997e-07, + "loss": 0.0037, + "ppl": 0.0263671875, + "reward": 0.9386914372444153, + "reward_std": 0.013528315350413322, + "rewards/perpo_ocr_edit_distance_reward": 0.9386914968490601, "step": 2291, "temperature": 0.9 }, { - "advantages": -5.861691170139238e-05, - "completion_length": 787.5, - "delta_ref_entropy_loss": 0.029296875, - "delta_ref_ppl": -0.0262451171875, - "entropy_loss": -0.02386474609375, - "epoch": 0.9168, - "grad_norm": 0.6971833625793099, - "k1_kl": 0.02630615234375, - "k3_kl": 0.01702880859375, - "kimi_kl": 0.0579833984375, - "learning_rate": 4.1599999999999995e-08, - "loss": 0.0007, - "ppl": 0.011871337890625, - "reward": 0.999410480260849, - "reward_std": 0.0003130999975837767, - "rewards/perpo_ocr_edit_distance_reward": 0.9994105696678162, + "advantages": -8.659703780722339e-06, + "completion_length": 700.0, + "delta_ref_entropy_loss": 0.0380859375, + "delta_ref_ppl": -0.04736328125, + "entropy_loss": -0.03857421875, + "epoch": 0.4584, + "grad_norm": 1.045360125796215, + "k1_kl": 0.04736328125, + "k3_kl": 0.029052734375, + "kimi_kl": 0.06591796875, + "learning_rate": 2.7079999999999996e-07, + "loss": 0.0012, + "ppl": 0.0179443359375, + "reward": 0.9904534220695496, + "reward_std": 0.003838459961116314, + "rewards/perpo_ocr_edit_distance_reward": 0.9904534816741943, "step": 2292, "temperature": 0.9 }, { - "advantages": -5.36441826959333e-07, - "completion_length": 327.5, - "delta_ref_entropy_loss": 0.055908203125, - "delta_ref_ppl": -0.0684814453125, - "entropy_loss": -0.0980224609375, - "epoch": 0.9172, - "grad_norm": 2.2377958390449684, - "k1_kl": 0.068359375, - "k3_kl": 0.04364013671875, - "kimi_kl": 0.126708984375, - "learning_rate": 4.14e-08, - "loss": 0.0017, - "ppl": 0.04803466796875, - "reward": 0.8199580609798431, - "reward_std": 0.03103451238712296, - "rewards/perpo_ocr_edit_distance_reward": 0.8199581503868103, + "advantages": -6.113733888923889e-06, + "completion_length": 462.0, + "delta_ref_entropy_loss": 0.154296875, + "delta_ref_ppl": -0.1416015625, + "entropy_loss": -0.275390625, + "epoch": 0.4586, + "grad_norm": 2.350848027553164, + "k1_kl": 0.1416015625, + "k3_kl": 0.09130859375, + "kimi_kl": 0.19140625, + "learning_rate": 2.707e-07, + "loss": 0.0037, + "ppl": 0.1533203125, + "reward": 0.7842064499855042, + "reward_std": 0.006875635124742985, + "rewards/perpo_ocr_edit_distance_reward": 0.7842064499855042, "step": 2293, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 630.0, - "delta_ref_entropy_loss": 0.01873779296875, - "delta_ref_ppl": -0.018096923828125, - "entropy_loss": -0.018798828125, - "epoch": 0.9176, - "grad_norm": 0.012393465239687755, - "k1_kl": 0.018096923828125, - "k3_kl": 0.01165771484375, - "kimi_kl": 0.034423828125, - "learning_rate": 4.12e-08, - "loss": 0.0005, - "ppl": 0.0070953369140625, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": -1.5667507113903412e-06, + "completion_length": 1033.0, + "delta_ref_entropy_loss": 0.10693359375, + "delta_ref_ppl": -0.1123046875, + "entropy_loss": -0.35546875, + "epoch": 0.4588, + "grad_norm": 2.991918563098658, + "k1_kl": 0.11279296875, + "k3_kl": 0.0771484375, + "kimi_kl": 0.1650390625, + "learning_rate": 2.706e-07, + "loss": 0.0031, + "ppl": 0.2021484375, + "reward": 0.822670042514801, + "reward_std": 0.03185080736875534, + "rewards/perpo_ocr_edit_distance_reward": 0.8226701021194458, "step": 2294, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 552.5, - "delta_ref_entropy_loss": 0.037109375, - "delta_ref_ppl": -0.01513671875, - "entropy_loss": -0.011444091796875, - "epoch": 0.918, - "grad_norm": 0.01700429641291914, - "k1_kl": 0.015106201171875, - "k3_kl": 0.00524139404296875, - "kimi_kl": 0.0087432861328125, - "learning_rate": 4.1e-08, - "loss": 0.0002, - "ppl": 0.0030975341796875, - "reward": 0.9998849928379059, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9998850226402283, + "advantages": -3.5592489439295605e-05, + "completion_length": 673.0, + "delta_ref_entropy_loss": 0.035888671875, + "delta_ref_ppl": -0.034423828125, + "entropy_loss": -0.0279541015625, + "epoch": 0.459, + "grad_norm": 0.5529097868188249, + "k1_kl": 0.034423828125, + "k3_kl": 0.0184326171875, + "kimi_kl": 0.0498046875, + "learning_rate": 2.705e-07, + "loss": 0.0008, + "ppl": 0.01153564453125, + "reward": 0.9947143793106079, + "reward_std": 0.0015742580872029066, + "rewards/perpo_ocr_edit_distance_reward": 0.9947143793106079, "step": 2295, "temperature": 0.9 }, { - "advantages": -0.00017911196482600644, - "completion_length": 456.5, - "delta_ref_entropy_loss": 0.02154541015625, - "delta_ref_ppl": -0.031494140625, - "entropy_loss": -0.0224609375, - "epoch": 0.9184, - "grad_norm": 0.8369741774116966, - "k1_kl": 0.03143310546875, - "k3_kl": 0.02294921875, - "kimi_kl": 0.1104736328125, - "learning_rate": 4.08e-08, - "loss": 0.0011, - "ppl": 0.011566162109375, - "reward": 0.8982032835483551, - "reward_std": 0.0004320537991588935, - "rewards/perpo_ocr_edit_distance_reward": 0.8982033431529999, + "advantages": -0.00012385845184326172, + "completion_length": 632.0, + "delta_ref_entropy_loss": 0.055419921875, + "delta_ref_ppl": -0.051513671875, + "entropy_loss": -0.02490234375, + "epoch": 0.4592, + "grad_norm": 0.3028374523540428, + "k1_kl": 0.05126953125, + "k3_kl": 0.0281982421875, + "kimi_kl": 0.08349609375, + "learning_rate": 2.704e-07, + "loss": 0.0013, + "ppl": 0.008056640625, + "reward": 0.9967250823974609, + "reward_std": 0.00024373046471737325, + "rewards/perpo_ocr_edit_distance_reward": 0.9967251420021057, "step": 2296, "temperature": 0.9 }, { - "advantages": -3.3233848625968676e-05, - "completion_length": 674.0, - "delta_ref_entropy_loss": 0.02899169921875, - "delta_ref_ppl": -0.022430419921875, - "entropy_loss": -0.02044677734375, - "epoch": 0.9188, - "grad_norm": 1.0655707986470953, - "k1_kl": 0.022552490234375, - "k3_kl": 0.0141143798828125, - "kimi_kl": 0.041656494140625, - "learning_rate": 4.059999999999999e-08, - "loss": 0.0006, - "ppl": 0.0080718994140625, - "reward": 0.9985947906970978, - "reward_std": 0.0009091291576623917, - "rewards/perpo_ocr_edit_distance_reward": 0.9985947906970978, + "advantages": -2.7247838829680404e-07, + "completion_length": 77.0, + "delta_ref_entropy_loss": -0.09521484375, + "delta_ref_ppl": -0.40625, + "entropy_loss": -0.494140625, + "epoch": 0.4594, + "grad_norm": 5.618565474714622, + "k1_kl": 0.40625, + "k3_kl": 0.34375, + "kimi_kl": 1.1484375, + "learning_rate": 2.703e-07, + "loss": 0.0138, + "ppl": 0.1904296875, + "reward": 0.3622816503047943, + "reward_std": 0.06951741874217987, + "rewards/perpo_ocr_edit_distance_reward": 0.3622816205024719, "step": 2297, "temperature": 0.9 }, { - "advantages": -0.00030077355268076644, - "completion_length": 576.5, - "delta_ref_entropy_loss": 0.028076171875, - "delta_ref_ppl": -0.015960693359375, - "entropy_loss": -0.046875, - "epoch": 0.9192, - "grad_norm": 0.7028198810662054, - "k1_kl": 0.015869140625, - "k3_kl": 0.0078582763671875, - "kimi_kl": 0.013580322265625, - "learning_rate": 4.04e-08, - "loss": 0.0006, - "ppl": 0.0218048095703125, - "reward": 0.9866973757743835, - "reward_std": 0.005377743858844042, - "rewards/perpo_ocr_edit_distance_reward": 0.9866974949836731, + "advantages": -1.128230792346585e-06, + "completion_length": 1149.0, + "delta_ref_entropy_loss": -0.0101318359375, + "delta_ref_ppl": -0.07958984375, + "entropy_loss": -0.271484375, + "epoch": 0.4596, + "grad_norm": 10.391013678263192, + "k1_kl": 0.0791015625, + "k3_kl": 0.080078125, + "kimi_kl": 0.1416015625, + "learning_rate": 2.7019999999999997e-07, + "loss": 0.0032, + "ppl": 0.150390625, + "reward": 0.6487156748771667, + "reward_std": 0.08454062044620514, + "rewards/perpo_ocr_edit_distance_reward": 0.6487157344818115, "step": 2298, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 468.0, - "delta_ref_entropy_loss": 0.02593994140625, - "delta_ref_ppl": -0.02435302734375, - "entropy_loss": -0.01177978515625, - "epoch": 0.9196, - "grad_norm": 0.007300643213626281, - "k1_kl": 0.02435302734375, - "k3_kl": 0.015777587890625, - "kimi_kl": 0.0540771484375, - "learning_rate": 4.0199999999999996e-08, - "loss": 0.0006, - "ppl": 0.004180908203125, - "reward": 0.9986747205257416, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.998674750328064, + "advantages": -0.0002564532624091953, + "completion_length": 554.0, + "delta_ref_entropy_loss": 0.03759765625, + "delta_ref_ppl": -0.07177734375, + "entropy_loss": -0.02978515625, + "epoch": 0.4598, + "grad_norm": 0.39474544166166425, + "k1_kl": 0.07177734375, + "k3_kl": 0.04736328125, + "kimi_kl": 0.1474609375, + "learning_rate": 2.701e-07, + "loss": 0.0021, + "ppl": 0.0120849609375, + "reward": 0.7875956892967224, + "reward_std": 0.00016563848475925624, + "rewards/perpo_ocr_edit_distance_reward": 0.787595808506012, "step": 2299, "temperature": 0.9 }, { - "advantages": -7.691980033541768e-05, - "completion_length": 1144.0, - "delta_ref_entropy_loss": 0.0250244140625, - "delta_ref_ppl": -0.02764892578125, - "entropy_loss": -0.04052734375, - "epoch": 0.92, - "grad_norm": 0.7444486471341821, - "k1_kl": 0.02764892578125, - "k3_kl": 0.01617431640625, - "kimi_kl": 0.033203125, - "learning_rate": 4e-08, - "loss": 0.0007, - "ppl": 0.02203369140625, - "reward": 0.8559341430664062, - "reward_std": 0.01589988537307363, - "rewards/perpo_ocr_edit_distance_reward": 0.8559341430664062, + "advantages": -2.2990363390817947e-07, + "completion_length": 274.0, + "delta_ref_entropy_loss": -0.0927734375, + "delta_ref_ppl": -0.130859375, + "entropy_loss": -0.357421875, + "epoch": 0.46, + "grad_norm": 3.990028582614135, + "k1_kl": 0.1298828125, + "k3_kl": 0.11328125, + "kimi_kl": 0.33203125, + "learning_rate": 2.7e-07, + "loss": 0.0045, + "ppl": 0.1640625, + "reward": 0.3546563982963562, + "reward_std": 0.03688982129096985, + "rewards/perpo_ocr_edit_distance_reward": 0.3546564280986786, "step": 2300, "temperature": 0.9 }, { - "advantages": -3.855143495457014e-05, - "completion_length": 729.0, - "delta_ref_entropy_loss": 0.029296875, - "delta_ref_ppl": -0.01971435546875, - "entropy_loss": -0.01519775390625, - "epoch": 0.9204, - "grad_norm": 0.5088645236701319, - "k1_kl": 0.01959228515625, - "k3_kl": 0.01213836669921875, - "kimi_kl": 0.047149658203125, - "learning_rate": 3.98e-08, - "loss": 0.0005, - "ppl": 0.005615234375, - "reward": 0.9987855851650238, - "reward_std": 0.0010545804107096046, - "rewards/perpo_ocr_edit_distance_reward": 0.9987856447696686, + "advantages": -8.748259278945625e-05, + "completion_length": 548.0, + "delta_ref_entropy_loss": 0.061767578125, + "delta_ref_ppl": -0.0732421875, + "entropy_loss": -0.044189453125, + "epoch": 0.4602, + "grad_norm": 0.4777989579423474, + "k1_kl": 0.07275390625, + "k3_kl": 0.04052734375, + "kimi_kl": 0.126953125, + "learning_rate": 2.6989999999999995e-07, + "loss": 0.0017, + "ppl": 0.0179443359375, + "reward": 0.9968295097351074, + "reward_std": 0.0004839546454604715, + "rewards/perpo_ocr_edit_distance_reward": 0.9968295097351074, "step": 2301, "temperature": 0.9 }, { - "advantages": -3.715923958225176e-05, - "completion_length": 656.0, - "delta_ref_entropy_loss": 0.0302734375, - "delta_ref_ppl": -0.018890380859375, - "entropy_loss": -0.0191650390625, - "epoch": 0.9208, - "grad_norm": 0.21175628164027793, - "k1_kl": 0.018890380859375, - "k3_kl": 0.0101165771484375, - "kimi_kl": 0.02691650390625, - "learning_rate": 3.9600000000000004e-08, - "loss": 0.0004, - "ppl": 0.00701904296875, - "reward": 0.9996887743473053, - "reward_std": 0.0001790701353456825, - "rewards/perpo_ocr_edit_distance_reward": 0.9996887743473053, + "advantages": -1.4611653568863403e-05, + "completion_length": 440.0, + "delta_ref_entropy_loss": 0.051025390625, + "delta_ref_ppl": -0.1298828125, + "entropy_loss": -0.064453125, + "epoch": 0.4604, + "grad_norm": 2.2490466941088005, + "k1_kl": 0.1298828125, + "k3_kl": 0.0908203125, + "kimi_kl": 0.375, + "learning_rate": 2.698e-07, + "loss": 0.0036, + "ppl": 0.0289306640625, + "reward": 0.990824282169342, + "reward_std": 0.00048216618597507477, + "rewards/perpo_ocr_edit_distance_reward": 0.9908244013786316, "step": 2302, "temperature": 0.9 }, { - "advantages": -5.812943163618911e-05, - "completion_length": 543.0, - "delta_ref_entropy_loss": 0.029571533203125, - "delta_ref_ppl": -0.01953125, - "entropy_loss": -0.026153564453125, - "epoch": 0.9212, - "grad_norm": 1.3219561406454485, - "k1_kl": 0.01953125, - "k3_kl": 0.00919342041015625, - "kimi_kl": 0.0167694091796875, - "learning_rate": 3.9399999999999995e-08, - "loss": 0.0004, - "ppl": 0.0115203857421875, - "reward": 0.9994058012962341, - "reward_std": 0.0004976809432264417, - "rewards/perpo_ocr_edit_distance_reward": 0.9994059205055237, + "advantages": -1.616137524251826e-05, + "completion_length": 734.0, + "delta_ref_entropy_loss": 0.035400390625, + "delta_ref_ppl": -0.05419921875, + "entropy_loss": -0.06982421875, + "epoch": 0.4606, + "grad_norm": 0.6781360531347229, + "k1_kl": 0.05419921875, + "k3_kl": 0.033203125, + "kimi_kl": 0.06689453125, + "learning_rate": 2.697e-07, + "loss": 0.0013, + "ppl": 0.036865234375, + "reward": 0.9463388919830322, + "reward_std": 0.0030600798781961203, + "rewards/perpo_ocr_edit_distance_reward": 0.946338951587677, "step": 2303, "temperature": 0.9 }, { - "advantages": -0.0001106858308048686, - "completion_length": 589.5, - "delta_ref_entropy_loss": 0.0599365234375, - "delta_ref_ppl": -0.0347900390625, - "entropy_loss": -0.03631591796875, - "epoch": 0.9216, - "grad_norm": 0.5397081646188917, - "k1_kl": 0.03485107421875, - "k3_kl": 0.0173187255859375, - "kimi_kl": 0.03619384765625, - "learning_rate": 3.9199999999999994e-08, + "advantages": 2.952133036160376e-05, + "completion_length": 428.0, + "delta_ref_entropy_loss": 0.0286865234375, + "delta_ref_ppl": -0.036376953125, + "entropy_loss": -0.0205078125, + "epoch": 0.4608, + "grad_norm": 0.23327531141792313, + "k1_kl": 0.03662109375, + "k3_kl": 0.02099609375, + "kimi_kl": 0.05517578125, + "learning_rate": 2.696e-07, "loss": 0.0008, - "ppl": 0.017852783203125, - "reward": 0.9800652265548706, - "reward_std": 0.0009571538830641657, - "rewards/perpo_ocr_edit_distance_reward": 0.9800653457641602, + "ppl": 0.006317138671875, + "reward": 0.9938672184944153, + "reward_std": 0.0004772424581460655, + "rewards/perpo_ocr_edit_distance_reward": 0.9938672184944153, "step": 2304, "temperature": 0.9 }, { - "advantages": -4.193612767267041e-06, - "completion_length": 668.5, - "delta_ref_entropy_loss": 0.0518798828125, - "delta_ref_ppl": -0.0477294921875, - "entropy_loss": -0.057830810546875, - "epoch": 0.922, - "grad_norm": 0.7812628685798246, - "k1_kl": 0.0477294921875, - "k3_kl": 0.029510498046875, - "kimi_kl": 0.09033203125, - "learning_rate": 3.9e-08, - "loss": 0.0012, - "ppl": 0.032196044921875, - "reward": 0.9837463796138763, - "reward_std": 0.0014717938611283898, - "rewards/perpo_ocr_edit_distance_reward": 0.9837464094161987, + "advantages": -3.7465778746081924e-07, + "completion_length": 412.0, + "delta_ref_entropy_loss": 0.03515625, + "delta_ref_ppl": -0.08935546875, + "entropy_loss": -0.15234375, + "epoch": 0.461, + "grad_norm": 1.5443213793288388, + "k1_kl": 0.08935546875, + "k3_kl": 0.06591796875, + "kimi_kl": 0.2412109375, + "learning_rate": 2.695e-07, + "loss": 0.0026, + "ppl": 0.0703125, + "reward": 0.5470696687698364, + "reward_std": 0.07635150104761124, + "rewards/perpo_ocr_edit_distance_reward": 0.5470697283744812, "step": 2305, "temperature": 0.9 }, { - "advantages": -3.75168674509041e-05, - "completion_length": 544.0, - "delta_ref_entropy_loss": 0.02252197265625, - "delta_ref_ppl": -0.03533935546875, - "entropy_loss": -0.0177001953125, - "epoch": 0.9224, - "grad_norm": 0.2609682377589809, - "k1_kl": 0.03533935546875, - "k3_kl": 0.0280914306640625, - "kimi_kl": 0.15118408203125, - "learning_rate": 3.88e-08, - "loss": 0.0012, - "ppl": 0.008148193359375, - "reward": 0.9992292821407318, - "reward_std": 0.00027111911913380027, - "rewards/perpo_ocr_edit_distance_reward": 0.999229371547699, - "step": 2306, - "temperature": 0.9 + "advantages": -3.648230267572217e-05, + "completion_length": 679.0, + "delta_ref_entropy_loss": 0.06982421875, + "delta_ref_ppl": -0.09521484375, + "entropy_loss": -0.0634765625, + "epoch": 0.4612, + "grad_norm": 0.8562686481824034, + "k1_kl": 0.09521484375, + "k3_kl": 0.0615234375, + "kimi_kl": 0.1484375, + "learning_rate": 2.6939999999999996e-07, + "loss": 0.0025, + "ppl": 0.02392578125, + "reward": 0.6979607939720154, + "reward_std": 0.0015345066785812378, + "rewards/perpo_ocr_edit_distance_reward": 0.6979609131813049, + "step": 2306, + "temperature": 0.9 }, { - "advantages": -0.00010938304694718681, - "completion_length": 674.5, - "delta_ref_entropy_loss": 0.0308837890625, - "delta_ref_ppl": -0.02679443359375, - "entropy_loss": -0.0233154296875, - "epoch": 0.9228, - "grad_norm": 0.3365016733033871, - "k1_kl": 0.02679443359375, - "k3_kl": 0.01580810546875, - "kimi_kl": 0.0382080078125, - "learning_rate": 3.86e-08, - "loss": 0.0007, - "ppl": 0.011444091796875, - "reward": 0.9912149310112, - "reward_std": 0.0005898931703995913, - "rewards/perpo_ocr_edit_distance_reward": 0.9912149608135223, + "advantages": -0.00012301547394599766, + "completion_length": 458.0, + "delta_ref_entropy_loss": 0.050537109375, + "delta_ref_ppl": -0.0693359375, + "entropy_loss": -0.028076171875, + "epoch": 0.4614, + "grad_norm": 0.23182669482439305, + "k1_kl": 0.0693359375, + "k3_kl": 0.044677734375, + "kimi_kl": 0.1484375, + "learning_rate": 2.6929999999999996e-07, + "loss": 0.0019, + "ppl": 0.0093994140625, + "reward": 0.9980613589286804, + "reward_std": 0.00017687075887806714, + "rewards/perpo_ocr_edit_distance_reward": 0.9980614185333252, "step": 2307, "temperature": 0.9 }, { - "advantages": -1.3368470774821617e-06, - "completion_length": 810.0, - "delta_ref_entropy_loss": 0.0357818603515625, - "delta_ref_ppl": -0.084716796875, - "entropy_loss": -0.0626220703125, - "epoch": 0.9232, - "grad_norm": 1.1906691609839843, - "k1_kl": 0.084747314453125, - "k3_kl": 0.06756591796875, - "kimi_kl": 0.26617431640625, - "learning_rate": 3.839999999999999e-08, - "loss": 0.0027, - "ppl": 0.02783203125, - "reward": 0.6497255563735962, - "reward_std": 0.030892602168023586, - "rewards/perpo_ocr_edit_distance_reward": 0.6497256457805634, + "advantages": -1.8664770323084667e-05, + "completion_length": 153.0, + "delta_ref_entropy_loss": 0.08203125, + "delta_ref_ppl": -0.16015625, + "entropy_loss": -0.041748046875, + "epoch": 0.4616, + "grad_norm": 0.5418117827265201, + "k1_kl": 0.1591796875, + "k3_kl": 0.11376953125, + "kimi_kl": 0.361328125, + "learning_rate": 2.692e-07, + "loss": 0.0046, + "ppl": 0.0101318359375, + "reward": 0.9287250638008118, + "reward_std": 0.0008128260960802436, + "rewards/perpo_ocr_edit_distance_reward": 0.9287250638008118, "step": 2308, "temperature": 0.9 }, { - "advantages": 1.2904406503366772e-05, - "completion_length": 728.0, - "delta_ref_entropy_loss": 0.022369384765625, - "delta_ref_ppl": -0.019775390625, - "entropy_loss": -0.02093505859375, - "epoch": 0.9236, - "grad_norm": 0.32897581418438704, - "k1_kl": 0.0197601318359375, - "k3_kl": 0.01251220703125, - "kimi_kl": 0.0390167236328125, - "learning_rate": 3.82e-08, - "loss": 0.0005, - "ppl": 0.0091705322265625, - "reward": 0.999421238899231, - "reward_std": 0.0001852104760473594, - "rewards/perpo_ocr_edit_distance_reward": 0.9994212985038757, + "advantages": -4.243425064487383e-05, + "completion_length": 478.0, + "delta_ref_entropy_loss": 0.03271484375, + "delta_ref_ppl": -0.0849609375, + "entropy_loss": -0.052978515625, + "epoch": 0.4618, + "grad_norm": 1.116350237971387, + "k1_kl": 0.0849609375, + "k3_kl": 0.062255859375, + "kimi_kl": 0.1884765625, + "learning_rate": 2.691e-07, + "loss": 0.0025, + "ppl": 0.0189208984375, + "reward": 0.9958873987197876, + "reward_std": 0.001506103784777224, + "rewards/perpo_ocr_edit_distance_reward": 0.9958875179290771, "step": 2309, "temperature": 0.9 }, { - "advantages": -0.00047166859440039843, - "completion_length": 695.5, - "delta_ref_entropy_loss": 0.0264892578125, - "delta_ref_ppl": -0.0267333984375, - "entropy_loss": -0.029052734375, - "epoch": 0.924, - "grad_norm": 0.18206591403975894, - "k1_kl": 0.0267333984375, - "k3_kl": 0.0152435302734375, - "kimi_kl": 0.03656005859375, - "learning_rate": 3.7999999999999996e-08, - "loss": 0.0011, - "ppl": 0.01226806640625, - "reward": 0.9808274507522583, - "reward_std": 0.000133890935103409, - "rewards/perpo_ocr_edit_distance_reward": 0.9808275699615479, + "advantages": 2.050399962172378e-05, + "completion_length": 564.0, + "delta_ref_entropy_loss": 0.044677734375, + "delta_ref_ppl": -0.0380859375, + "entropy_loss": -0.03076171875, + "epoch": 0.462, + "grad_norm": 0.6380714726546882, + "k1_kl": 0.0380859375, + "k3_kl": 0.0201416015625, + "kimi_kl": 0.0498046875, + "learning_rate": 2.69e-07, + "loss": 0.0008, + "ppl": 0.00872802734375, + "reward": 0.9974225163459778, + "reward_std": 0.00114550965372473, + "rewards/perpo_ocr_edit_distance_reward": 0.9974225163459778, "step": 2310, "temperature": 0.9 }, { - "advantages": -5.236694050836377e-06, - "completion_length": 876.5, - "delta_ref_entropy_loss": 0.03399658203125, - "delta_ref_ppl": -0.02789306640625, - "entropy_loss": -0.0250244140625, - "epoch": 0.9244, - "grad_norm": 1.9657041500011754, - "k1_kl": 0.02783203125, - "k3_kl": 0.017059326171875, - "kimi_kl": 0.051025390625, - "learning_rate": 3.78e-08, - "loss": 0.0007, - "ppl": 0.01214599609375, - "reward": 0.9948410987854004, - "reward_std": 0.0015809352044016123, - "rewards/perpo_ocr_edit_distance_reward": 0.9948411583900452, + "advantages": 1.2261527899681823e-06, + "completion_length": 862.0, + "delta_ref_entropy_loss": 0.0703125, + "delta_ref_ppl": -0.08544921875, + "entropy_loss": -0.2216796875, + "epoch": 0.4622, + "grad_norm": 1.5723282563668264, + "k1_kl": 0.08544921875, + "k3_kl": 0.050537109375, + "kimi_kl": 0.1015625, + "learning_rate": 2.689e-07, + "loss": 0.002, + "ppl": 0.1015625, + "reward": 0.8023122549057007, + "reward_std": 0.00711727375164628, + "rewards/perpo_ocr_edit_distance_reward": 0.8023121953010559, "step": 2311, "temperature": 0.9 }, { - "advantages": -1.0899135958197803e-06, - "completion_length": 315.5, - "delta_ref_entropy_loss": 0.1083984375, - "delta_ref_ppl": -0.1513671875, - "entropy_loss": -0.300048828125, - "epoch": 0.9248, - "grad_norm": 5.834979800597704, - "k1_kl": 0.15185546875, - "k3_kl": 0.109130859375, - "kimi_kl": 0.255126953125, - "learning_rate": 3.76e-08, - "loss": 0.0044, - "ppl": 0.1624755859375, - "reward": 0.6210499852895737, - "reward_std": 0.046552170999348164, - "rewards/perpo_ocr_edit_distance_reward": 0.6210500150918961, + "advantages": -3.9611546526430175e-05, + "completion_length": 514.0, + "delta_ref_entropy_loss": 0.0478515625, + "delta_ref_ppl": -0.064453125, + "entropy_loss": -0.0311279296875, + "epoch": 0.4624, + "grad_norm": 0.4122980442667905, + "k1_kl": 0.064453125, + "k3_kl": 0.04052734375, + "kimi_kl": 0.11328125, + "learning_rate": 2.6879999999999997e-07, + "loss": 0.0017, + "ppl": 0.009521484375, + "reward": 0.9911927580833435, + "reward_std": 0.0009742606780491769, + "rewards/perpo_ocr_edit_distance_reward": 0.9911928772926331, "step": 2312, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 678.5, - "delta_ref_entropy_loss": 0.0205078125, - "delta_ref_ppl": -0.0094451904296875, - "entropy_loss": -0.011566162109375, - "epoch": 0.9252, - "grad_norm": 0.013015735485260306, - "k1_kl": 0.0094451904296875, - "k3_kl": 0.00374603271484375, - "kimi_kl": 0.00592041015625, - "learning_rate": 3.7400000000000004e-08, - "loss": 0.0004, - "ppl": 0.003360748291015625, - "reward": 0.9997448623180389, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9997449219226837, + "advantages": 1.2602125707417144e-06, + "completion_length": 57.0, + "delta_ref_entropy_loss": 0.0101318359375, + "delta_ref_ppl": -0.396484375, + "entropy_loss": -0.0830078125, + "epoch": 0.4626, + "grad_norm": 4.570915890293967, + "k1_kl": 0.396484375, + "k3_kl": 0.326171875, + "kimi_kl": 1.5703125, + "learning_rate": 2.6869999999999996e-07, + "loss": 0.013, + "ppl": 0.04345703125, + "reward": 0.9842146039009094, + "reward_std": 0.006712617352604866, + "rewards/perpo_ocr_edit_distance_reward": 0.9842146635055542, "step": 2313, "temperature": 0.9 }, { - "advantages": -0.0003038815093532321, - "completion_length": 352.5, - "delta_ref_entropy_loss": 0.0341796875, - "delta_ref_ppl": -0.0482177734375, - "entropy_loss": -0.02862548828125, - "epoch": 0.9256, - "grad_norm": 0.9155620885975182, - "k1_kl": 0.04833984375, - "k3_kl": 0.0330810546875, - "kimi_kl": 0.114990234375, - "learning_rate": 3.7199999999999996e-08, - "loss": 0.0016, - "ppl": 0.01678466796875, - "reward": 0.913887619972229, - "reward_std": 0.0014033347833901644, - "rewards/perpo_ocr_edit_distance_reward": 0.9138876795768738, + "advantages": -5.057880116510205e-05, + "completion_length": 666.0, + "delta_ref_entropy_loss": 0.041259765625, + "delta_ref_ppl": -0.0625, + "entropy_loss": -0.045654296875, + "epoch": 0.4628, + "grad_norm": 0.43721549699334805, + "k1_kl": 0.06298828125, + "k3_kl": 0.04052734375, + "kimi_kl": 0.10986328125, + "learning_rate": 2.686e-07, + "loss": 0.0017, + "ppl": 0.0162353515625, + "reward": 0.9906191229820251, + "reward_std": 0.0005731857963837683, + "rewards/perpo_ocr_edit_distance_reward": 0.9906191825866699, "step": 2314, "temperature": 0.9 }, { - "advantages": -0.00016141790183610283, - "completion_length": 597.5, - "delta_ref_entropy_loss": 0.02197265625, - "delta_ref_ppl": -0.0189208984375, - "entropy_loss": -0.01434326171875, - "epoch": 0.926, - "grad_norm": 0.5092080520930213, - "k1_kl": 0.0189208984375, - "k3_kl": 0.0123291015625, - "kimi_kl": 0.038330078125, - "learning_rate": 3.6999999999999994e-08, - "loss": 0.0007, - "ppl": 0.005340576171875, - "reward": 0.9972136914730072, - "reward_std": 0.00016925521777011454, - "rewards/perpo_ocr_edit_distance_reward": 0.997213751077652, + "advantages": -5.7986806496046484e-05, + "completion_length": 833.0, + "delta_ref_entropy_loss": 0.0341796875, + "delta_ref_ppl": -0.0576171875, + "entropy_loss": -0.03955078125, + "epoch": 0.463, + "grad_norm": 0.32964318798196246, + "k1_kl": 0.0576171875, + "k3_kl": 0.03662109375, + "kimi_kl": 0.10546875, + "learning_rate": 2.685e-07, + "loss": 0.0015, + "ppl": 0.015625, + "reward": 0.993301510810852, + "reward_std": 0.0004876042949035764, + "rewards/perpo_ocr_edit_distance_reward": 0.9933016300201416, "step": 2315, "temperature": 0.9 }, { - "advantages": -6.083080188545864e-05, - "completion_length": 768.5, - "delta_ref_entropy_loss": 0.02728271484375, - "delta_ref_ppl": -0.0206298828125, - "entropy_loss": -0.02239990234375, - "epoch": 0.9264, - "grad_norm": 0.4271285319614608, - "k1_kl": 0.0206298828125, - "k3_kl": 0.012237548828125, - "kimi_kl": 0.0360107421875, - "learning_rate": 3.68e-08, - "loss": 0.0006, - "ppl": 0.010833740234375, - "reward": 0.9992397427558899, - "reward_std": 0.00048239286115858704, - "rewards/perpo_ocr_edit_distance_reward": 0.9992398619651794, + "advantages": -6.505421424662927e-06, + "completion_length": 1547.0, + "delta_ref_entropy_loss": 0.01141357421875, + "delta_ref_ppl": -0.0213623046875, + "entropy_loss": -0.024658203125, + "epoch": 0.4632, + "grad_norm": 0.5428670427654876, + "k1_kl": 0.0213623046875, + "k3_kl": 0.01336669921875, + "kimi_kl": 0.0283203125, + "learning_rate": 2.684e-07, + "loss": 0.0005, + "ppl": 0.0091552734375, + "reward": 0.9885457754135132, + "reward_std": 0.003820219077169895, + "rewards/perpo_ocr_edit_distance_reward": 0.9885457754135132, "step": 2316, "temperature": 0.9 }, { - "advantages": -1.5939985781976418e-05, - "completion_length": 301.0, - "delta_ref_entropy_loss": 0.074951171875, - "delta_ref_ppl": -0.105712890625, - "entropy_loss": -0.111083984375, - "epoch": 0.9268, - "grad_norm": 1.453905664557656, - "k1_kl": 0.1058349609375, - "k3_kl": 0.0712890625, - "kimi_kl": 0.230224609375, - "learning_rate": 3.66e-08, - "loss": 0.0029, - "ppl": 0.0645751953125, - "reward": 0.9079250693321228, - "reward_std": 0.0023294567363336682, - "rewards/perpo_ocr_edit_distance_reward": 0.9079250991344452, + "advantages": -3.5881996154785156e-05, + "completion_length": 480.0, + "delta_ref_entropy_loss": 0.06201171875, + "delta_ref_ppl": -0.06982421875, + "entropy_loss": -0.0279541015625, + "epoch": 0.4634, + "grad_norm": 0.47098686108764787, + "k1_kl": 0.06982421875, + "k3_kl": 0.041259765625, + "kimi_kl": 0.11767578125, + "learning_rate": 2.683e-07, + "loss": 0.0017, + "ppl": 0.00958251953125, + "reward": 0.9974561333656311, + "reward_std": 0.0008494146750308573, + "rewards/perpo_ocr_edit_distance_reward": 0.9974561929702759, "step": 2317, "temperature": 0.9 }, { - "advantages": 7.706029720111474e-06, - "completion_length": 1346.5, - "delta_ref_entropy_loss": 0.0657958984375, - "delta_ref_ppl": -0.0592041015625, - "entropy_loss": -0.149658203125, - "epoch": 0.9272, - "grad_norm": 6.834960355863583, - "k1_kl": 0.0592041015625, - "k3_kl": 0.09710693359375, - "kimi_kl": 0.076171875, - "learning_rate": 3.64e-08, - "loss": 0.0039, - "ppl": 0.078857421875, - "reward": 0.8626129925251007, - "reward_std": 0.04148986516520381, - "rewards/perpo_ocr_edit_distance_reward": 0.8626130223274231, + "advantages": -2.3330962903855834e-06, + "completion_length": 73.0, + "delta_ref_entropy_loss": 0.0308837890625, + "delta_ref_ppl": -0.484375, + "entropy_loss": -0.1376953125, + "epoch": 0.4636, + "grad_norm": 4.956131864063537, + "k1_kl": 0.486328125, + "k3_kl": 0.421875, + "kimi_kl": 1.7734375, + "learning_rate": 2.682e-07, + "loss": 0.0169, + "ppl": 0.0517578125, + "reward": 0.9715301990509033, + "reward_std": 0.003558719763532281, + "rewards/perpo_ocr_edit_distance_reward": 0.9715302586555481, "step": 2318, "temperature": 0.9 }, { - "advantages": -1.2866088842322654e-05, - "completion_length": 1207.5, - "delta_ref_entropy_loss": 0.04656982421875, - "delta_ref_ppl": -0.03173828125, - "entropy_loss": -0.02880859375, - "epoch": 0.9276, - "grad_norm": 0.9749280929013338, - "k1_kl": 0.03173828125, - "k3_kl": 0.017730712890625, - "kimi_kl": 0.04095458984375, - "learning_rate": 3.62e-08, - "loss": 0.0007, - "ppl": 0.0142822265625, - "reward": 0.984545111656189, - "reward_std": 0.0017007815768010914, - "rewards/perpo_ocr_edit_distance_reward": 0.9845452010631561, + "advantages": -6.972040864638984e-05, + "completion_length": 387.0, + "delta_ref_entropy_loss": 0.08203125, + "delta_ref_ppl": -0.09228515625, + "entropy_loss": -0.06396484375, + "epoch": 0.4638, + "grad_norm": 1.1170856612968383, + "k1_kl": 0.09228515625, + "k3_kl": 0.05908203125, + "kimi_kl": 0.189453125, + "learning_rate": 2.6809999999999997e-07, + "loss": 0.0024, + "ppl": 0.02490234375, + "reward": 0.9914647936820984, + "reward_std": 0.0009992328705266118, + "rewards/perpo_ocr_edit_distance_reward": 0.9914648532867432, "step": 2319, "temperature": 0.9 }, { - "advantages": -1.6927719116210938e-05, - "completion_length": 591.0, - "delta_ref_entropy_loss": 0.0526123046875, - "delta_ref_ppl": -0.0369873046875, - "entropy_loss": -0.04071044921875, - "epoch": 0.928, - "grad_norm": 0.5974114151327581, - "k1_kl": 0.0369873046875, - "k3_kl": 0.02374267578125, - "kimi_kl": 0.065185546875, - "learning_rate": 3.6e-08, - "loss": 0.001, - "ppl": 0.0236053466796875, - "reward": 0.9805092215538025, - "reward_std": 0.0012069705408066511, - "rewards/perpo_ocr_edit_distance_reward": 0.9805092811584473, + "advantages": -1.9226756194257177e-05, + "completion_length": 546.0, + "delta_ref_entropy_loss": 0.057373046875, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.078125, + "epoch": 0.464, + "grad_norm": 1.4635996334079733, + "k1_kl": 0.0703125, + "k3_kl": 0.046630859375, + "kimi_kl": 0.158203125, + "learning_rate": 2.68e-07, + "loss": 0.0019, + "ppl": 0.0264892578125, + "reward": 0.972887396812439, + "reward_std": 0.002558942185714841, + "rewards/perpo_ocr_edit_distance_reward": 0.972887396812439, "step": 2320, "temperature": 0.9 }, { - "advantages": -9.212109216605313e-05, - "completion_length": 555.5, - "delta_ref_entropy_loss": 0.02490234375, - "delta_ref_ppl": -0.01873779296875, - "entropy_loss": -0.029296875, - "epoch": 0.9284, - "grad_norm": 0.3825795726922779, - "k1_kl": 0.018798828125, - "k3_kl": 0.010040283203125, - "kimi_kl": 0.02093505859375, - "learning_rate": 3.5799999999999996e-08, - "loss": 0.0005, - "ppl": 0.014556884765625, - "reward": 0.9996238946914673, - "reward_std": 0.00012171003512406742, - "rewards/perpo_ocr_edit_distance_reward": 0.9996239542961121, + "advantages": -2.5170191293000244e-05, + "completion_length": 702.0, + "delta_ref_entropy_loss": 0.1083984375, + "delta_ref_ppl": -0.11328125, + "entropy_loss": -0.1630859375, + "epoch": 0.4642, + "grad_norm": 2.451443051723837, + "k1_kl": 0.11376953125, + "k3_kl": 0.06640625, + "kimi_kl": 0.208984375, + "learning_rate": 2.679e-07, + "loss": 0.0027, + "ppl": 0.08837890625, + "reward": 0.8067601919174194, + "reward_std": 0.0019314008532091975, + "rewards/perpo_ocr_edit_distance_reward": 0.8067602515220642, "step": 2321, "temperature": 0.9 }, { - "advantages": 2.6753972633741796e-05, - "completion_length": 431.5, - "delta_ref_entropy_loss": 0.01800537109375, - "delta_ref_ppl": -0.015167236328125, - "entropy_loss": -0.014312744140625, - "epoch": 0.9288, - "grad_norm": 0.2454993318671991, - "k1_kl": 0.015167236328125, - "k3_kl": 0.0081787109375, - "kimi_kl": 0.01812744140625, - "learning_rate": 3.56e-08, - "loss": 0.0003, - "ppl": 0.0059967041015625, - "reward": 0.9997083246707916, - "reward_std": 0.0001091214653570205, - "rewards/perpo_ocr_edit_distance_reward": 0.9997083246707916, + "advantages": -1.169954066426726e-05, + "completion_length": 646.0, + "delta_ref_entropy_loss": 0.03955078125, + "delta_ref_ppl": -0.07421875, + "entropy_loss": -0.032470703125, + "epoch": 0.4644, + "grad_norm": 0.511813016292343, + "k1_kl": 0.07421875, + "k3_kl": 0.0478515625, + "kimi_kl": 0.1513671875, + "learning_rate": 2.6779999999999995e-07, + "loss": 0.0019, + "ppl": 0.01123046875, + "reward": 0.9868205189704895, + "reward_std": 0.0020809799898415804, + "rewards/perpo_ocr_edit_distance_reward": 0.9868205785751343, "step": 2322, "temperature": 0.9 }, { - "advantages": -0.00034168788624810986, - "completion_length": 708.5, - "delta_ref_entropy_loss": 0.014617919921875, - "delta_ref_ppl": -0.0277099609375, - "entropy_loss": -0.02203369140625, - "epoch": 0.9292, - "grad_norm": 0.18914328832876537, - "k1_kl": 0.027587890625, - "k3_kl": 0.0200958251953125, - "kimi_kl": 0.063323974609375, - "learning_rate": 3.54e-08, - "loss": 0.0011, - "ppl": 0.010589599609375, - "reward": 0.999184250831604, - "reward_std": 0.00014499310054816306, - "rewards/perpo_ocr_edit_distance_reward": 0.9991843700408936, + "advantages": -7.237707177409902e-05, + "completion_length": 546.0, + "delta_ref_entropy_loss": 0.052734375, + "delta_ref_ppl": -0.0634765625, + "entropy_loss": -0.040771484375, + "epoch": 0.4646, + "grad_norm": 0.686587604904649, + "k1_kl": 0.0634765625, + "k3_kl": 0.037841796875, + "kimi_kl": 0.09228515625, + "learning_rate": 2.677e-07, + "loss": 0.0016, + "ppl": 0.01611328125, + "reward": 0.9980912208557129, + "reward_std": 0.0007235347875393927, + "rewards/perpo_ocr_edit_distance_reward": 0.9980912804603577, "step": 2323, "temperature": 0.9 }, { - "advantages": -3.473248060448242e-05, - "completion_length": 662.0, - "delta_ref_entropy_loss": 0.03985595703125, - "delta_ref_ppl": -0.0299072265625, - "entropy_loss": -0.03662109375, - "epoch": 0.9296, - "grad_norm": 0.6551846339198331, - "k1_kl": 0.02996826171875, - "k3_kl": 0.016265869140625, - "kimi_kl": 0.033935546875, - "learning_rate": 3.52e-08, - "loss": 0.0007, - "ppl": 0.017333984375, - "reward": 0.9709843099117279, - "reward_std": 0.0003689247969305143, - "rewards/perpo_ocr_edit_distance_reward": 0.9709843695163727, + "advantages": -1.094171057047788e-05, + "completion_length": 472.0, + "delta_ref_entropy_loss": 0.024169921875, + "delta_ref_ppl": -0.04541015625, + "entropy_loss": -0.040283203125, + "epoch": 0.4648, + "grad_norm": 0.5233174938296148, + "k1_kl": 0.04541015625, + "k3_kl": 0.0302734375, + "kimi_kl": 0.08349609375, + "learning_rate": 2.676e-07, + "loss": 0.0012, + "ppl": 0.0130615234375, + "reward": 0.9880259037017822, + "reward_std": 0.00145931716542691, + "rewards/perpo_ocr_edit_distance_reward": 0.988025963306427, "step": 2324, "temperature": 0.9 }, { - "advantages": -1.4458384100635158e-05, - "completion_length": 1178.5, - "delta_ref_entropy_loss": 0.051513671875, - "delta_ref_ppl": -0.056640625, - "entropy_loss": -0.1669921875, - "epoch": 0.93, - "grad_norm": 1.3177229643378958, - "k1_kl": 0.056884765625, - "k3_kl": 0.03680419921875, - "kimi_kl": 0.125, - "learning_rate": 3.5e-08, - "loss": 0.0015, - "ppl": 0.098419189453125, - "reward": 0.8077676594257355, - "reward_std": 0.08193415036657825, - "rewards/perpo_ocr_edit_distance_reward": 0.8077677190303802, + "advantages": -2.963202405226184e-06, + "completion_length": 1507.0, + "delta_ref_entropy_loss": 0.01904296875, + "delta_ref_ppl": -0.0322265625, + "entropy_loss": -0.043701171875, + "epoch": 0.465, + "grad_norm": 1.4917490916668363, + "k1_kl": 0.0322265625, + "k3_kl": 0.0205078125, + "kimi_kl": 0.05224609375, + "learning_rate": 2.675e-07, + "loss": 0.0008, + "ppl": 0.0189208984375, + "reward": 0.990670382976532, + "reward_std": 0.0113621074706316, + "rewards/perpo_ocr_edit_distance_reward": 0.9906704425811768, "step": 2325, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 73.5, - "delta_ref_entropy_loss": 0.050048828125, - "delta_ref_ppl": -0.10546875, - "entropy_loss": -0.0362548828125, - "epoch": 0.9304, - "grad_norm": 0.03705623302322145, - "k1_kl": 0.105224609375, - "k3_kl": 0.0799560546875, - "kimi_kl": 0.2978515625, - "learning_rate": 3.4799999999999994e-08, - "loss": 0.0032, - "ppl": 0.014312744140625, - "reward": 0.997767835855484, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9977678656578064, + "advantages": -1.7608916095923632e-05, + "completion_length": 289.0, + "delta_ref_entropy_loss": 0.10986328125, + "delta_ref_ppl": -0.1484375, + "entropy_loss": -0.08203125, + "epoch": 0.4652, + "grad_norm": 0.9692763474835893, + "k1_kl": 0.1484375, + "k3_kl": 0.10107421875, + "kimi_kl": 0.359375, + "learning_rate": 2.674e-07, + "loss": 0.0041, + "ppl": 0.03271484375, + "reward": 0.9936107397079468, + "reward_std": 0.0057067652232944965, + "rewards/perpo_ocr_edit_distance_reward": 0.9936108589172363, "step": 2326, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 348.0, - "delta_ref_entropy_loss": 0.0384521484375, - "delta_ref_ppl": -0.02813720703125, - "entropy_loss": -0.012939453125, - "epoch": 0.9308, - "grad_norm": 0.01236819738040333, - "k1_kl": 0.0281982421875, - "k3_kl": 0.015167236328125, - "kimi_kl": 0.048553466796875, - "learning_rate": 3.46e-08, - "loss": 0.0006, - "ppl": 0.00385284423828125, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, + "advantages": -1.3453620795189636e-06, + "completion_length": 575.0, + "delta_ref_entropy_loss": 0.08056640625, + "delta_ref_ppl": -0.09716796875, + "entropy_loss": -0.1708984375, + "epoch": 0.4654, + "grad_norm": 2.5784989853642, + "k1_kl": 0.09716796875, + "k3_kl": 0.07470703125, + "kimi_kl": 0.1728515625, + "learning_rate": 2.6729999999999996e-07, + "loss": 0.003, + "ppl": 0.08251953125, + "reward": 0.8751821517944336, + "reward_std": 0.0062349457293748856, + "rewards/perpo_ocr_edit_distance_reward": 0.8751822113990784, "step": 2327, "temperature": 0.9 }, { - "advantages": -1.1835779602620278e-05, - "completion_length": 248.0, - "delta_ref_entropy_loss": 0.1025390625, - "delta_ref_ppl": -0.1566162109375, - "entropy_loss": -0.215087890625, - "epoch": 0.9312, - "grad_norm": 3.3858311720993295, - "k1_kl": 0.15673828125, - "k3_kl": 0.11004638671875, - "kimi_kl": 0.3663330078125, - "learning_rate": 3.44e-08, - "loss": 0.0044, - "ppl": 0.1055908203125, - "reward": 0.8383030593395233, - "reward_std": 0.07192129199393094, - "rewards/perpo_ocr_edit_distance_reward": 0.8383031189441681, + "advantages": -0.0001838803436839953, + "completion_length": 360.0, + "delta_ref_entropy_loss": 0.0279541015625, + "delta_ref_ppl": -0.052734375, + "entropy_loss": -0.0262451171875, + "epoch": 0.4656, + "grad_norm": 0.48723861020802184, + "k1_kl": 0.052978515625, + "k3_kl": 0.036376953125, + "kimi_kl": 0.134765625, + "learning_rate": 2.6719999999999996e-07, + "loss": 0.0016, + "ppl": 0.00604248046875, + "reward": 0.9977611303329468, + "reward_std": 0.0004556526255328208, + "rewards/perpo_ocr_edit_distance_reward": 0.9977612495422363, "step": 2328, "temperature": 0.9 }, { - "advantages": -0.0002864492789740325, - "completion_length": 712.0, - "delta_ref_entropy_loss": 0.03924560546875, - "delta_ref_ppl": -0.0562744140625, - "entropy_loss": -0.031494140625, - "epoch": 0.9316, - "grad_norm": 0.5430738085686642, - "k1_kl": 0.0565185546875, - "k3_kl": 0.0413818359375, - "kimi_kl": 0.107177734375, - "learning_rate": 3.42e-08, - "loss": 0.0019, - "ppl": 0.0228271484375, - "reward": 0.9821817278862, - "reward_std": 0.0003180571075063199, - "rewards/perpo_ocr_edit_distance_reward": 0.9821817576885223, + "advantages": 8.514949634275126e-09, + "completion_length": 547.0, + "delta_ref_entropy_loss": 0.0238037109375, + "delta_ref_ppl": -0.060302734375, + "entropy_loss": -0.0390625, + "epoch": 0.4658, + "grad_norm": 0.5540982034716725, + "k1_kl": 0.060302734375, + "k3_kl": 0.044189453125, + "kimi_kl": 0.1455078125, + "learning_rate": 2.671e-07, + "loss": 0.0018, + "ppl": 0.018310546875, + "reward": 0.996286153793335, + "reward_std": 0.0010471836430951953, + "rewards/perpo_ocr_edit_distance_reward": 0.9962862133979797, "step": 2329, "temperature": 0.9 }, { - "advantages": -4.599775820679497e-05, - "completion_length": 869.5, - "delta_ref_entropy_loss": 0.0374755859375, - "delta_ref_ppl": -0.029296875, - "entropy_loss": -0.02301025390625, - "epoch": 0.932, - "grad_norm": 0.6262490399815551, - "k1_kl": 0.02923583984375, - "k3_kl": 0.015960693359375, - "kimi_kl": 0.038330078125, - "learning_rate": 3.4e-08, - "loss": 0.0007, - "ppl": 0.009246826171875, - "reward": 0.9945530891418457, - "reward_std": 0.0007909060368547216, - "rewards/perpo_ocr_edit_distance_reward": 0.9945531785488129, + "advantages": -2.8269632821320556e-06, + "completion_length": 104.0, + "delta_ref_entropy_loss": 0.18359375, + "delta_ref_ppl": -0.421875, + "entropy_loss": -0.2294921875, + "epoch": 0.466, + "grad_norm": 3.464653609658724, + "k1_kl": 0.421875, + "k3_kl": 0.328125, + "kimi_kl": 1.3515625, + "learning_rate": 2.67e-07, + "loss": 0.0131, + "ppl": 0.10791015625, + "reward": 0.7244898080825806, + "reward_std": 0.0028880517929792404, + "rewards/perpo_ocr_edit_distance_reward": 0.7244898080825806, "step": 2330, "temperature": 0.9 }, { - "advantages": -0.00010083403425653614, - "completion_length": 444.5, - "delta_ref_entropy_loss": 0.069091796875, - "delta_ref_ppl": -0.087646484375, - "entropy_loss": -0.0352783203125, - "epoch": 0.9324, - "grad_norm": 3.4258632225373047, - "k1_kl": 0.087646484375, - "k3_kl": 0.05810546875, + "advantages": -2.918924656114541e-05, + "completion_length": 1153.0, + "delta_ref_entropy_loss": 0.036865234375, + "delta_ref_ppl": -0.06298828125, + "entropy_loss": -0.0458984375, + "epoch": 0.4662, + "grad_norm": 1999.1032538517236, + "k1_kl": 0.06298828125, + "k3_kl": 11.4375, "kimi_kl": 0.1826171875, - "learning_rate": 3.38e-08, - "loss": 0.0024, - "ppl": 0.01348876953125, - "reward": 0.6931841671466827, - "reward_std": 0.00771847547730431, - "rewards/perpo_ocr_edit_distance_reward": 0.6931842416524887, + "learning_rate": 2.669e-07, + "loss": 0.4591, + "ppl": 0.028076171875, + "reward": 0.9943429231643677, + "reward_std": 0.0013587812427431345, + "rewards/perpo_ocr_edit_distance_reward": 0.9943429827690125, "step": 2331, "temperature": 0.9 }, { - "advantages": -1.4368977304002328e-05, - "completion_length": 902.0, - "delta_ref_entropy_loss": 0.049072265625, - "delta_ref_ppl": -0.0615234375, - "entropy_loss": -0.029052734375, - "epoch": 0.9328, - "grad_norm": 0.6685128539904213, - "k1_kl": 0.06158447265625, - "k3_kl": 0.038970947265625, - "kimi_kl": 0.1107177734375, - "learning_rate": 3.3599999999999996e-08, - "loss": 0.0016, - "ppl": 0.0126953125, - "reward": 0.9652529060840607, - "reward_std": 0.02073861879762262, - "rewards/perpo_ocr_edit_distance_reward": 0.9652529954910278, + "advantages": -1.507146043877583e-05, + "completion_length": 118.0, + "delta_ref_entropy_loss": 0.057373046875, + "delta_ref_ppl": -0.296875, + "entropy_loss": -0.09033203125, + "epoch": 0.4664, + "grad_norm": 1.3160651258129672, + "k1_kl": 0.296875, + "k3_kl": 0.2451171875, + "kimi_kl": 1.078125, + "learning_rate": 2.668e-07, + "loss": 0.0098, + "ppl": 0.03759765625, + "reward": 0.9700934290885925, + "reward_std": 0.0021583193447440863, + "rewards/perpo_ocr_edit_distance_reward": 0.9700934886932373, "step": 2332, "temperature": 0.9 }, { - "advantages": -3.504753112792969e-05, - "completion_length": 749.0, - "delta_ref_entropy_loss": 0.05169677734375, - "delta_ref_ppl": -0.028564453125, - "entropy_loss": -0.04119873046875, - "epoch": 0.9332, - "grad_norm": 0.6931710406274337, - "k1_kl": 0.02838134765625, - "k3_kl": 0.0137939453125, - "kimi_kl": 0.037353515625, - "learning_rate": 3.3399999999999995e-08, - "loss": 0.0006, - "ppl": 0.02008056640625, - "reward": 0.9544307887554169, - "reward_std": 0.009485342248808593, - "rewards/perpo_ocr_edit_distance_reward": 0.954430878162384, + "advantages": -2.367156048421748e-05, + "completion_length": 598.0, + "delta_ref_entropy_loss": 0.04052734375, + "delta_ref_ppl": -0.051025390625, + "entropy_loss": -0.0230712890625, + "epoch": 0.4666, + "grad_norm": 0.25047736408010574, + "k1_kl": 0.051025390625, + "k3_kl": 0.0308837890625, + "kimi_kl": 0.0927734375, + "learning_rate": 2.6669999999999997e-07, + "loss": 0.0013, + "ppl": 0.006103515625, + "reward": 0.9988716840744019, + "reward_std": 0.0002596045669633895, + "rewards/perpo_ocr_edit_distance_reward": 0.9988716840744019, "step": 2333, "temperature": 0.9 }, { - "advantages": -0.00011776175233535469, - "completion_length": 579.0, - "delta_ref_entropy_loss": 0.02862548828125, - "delta_ref_ppl": -0.0213623046875, - "entropy_loss": -0.016998291015625, - "epoch": 0.9336, - "grad_norm": 0.2859596628575046, - "k1_kl": 0.021392822265625, - "k3_kl": 0.0125732421875, - "kimi_kl": 0.0281982421875, - "learning_rate": 3.32e-08, - "loss": 0.0006, - "ppl": 0.0075836181640625, - "reward": 0.9986758232116699, - "reward_std": 0.00022702432761434466, - "rewards/perpo_ocr_edit_distance_reward": 0.9986759424209595, + "advantages": -4.189355422568042e-06, + "completion_length": 516.0, + "delta_ref_entropy_loss": 0.036376953125, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.035400390625, + "epoch": 0.4668, + "grad_norm": 0.5574945408693772, + "k1_kl": 0.06689453125, + "k3_kl": 0.04638671875, + "kimi_kl": 0.138671875, + "learning_rate": 2.6659999999999997e-07, + "loss": 0.0019, + "ppl": 0.0101318359375, + "reward": 0.9855746626853943, + "reward_std": 0.008057557046413422, + "rewards/perpo_ocr_edit_distance_reward": 0.9855747222900391, "step": 2334, "temperature": 0.9 }, { - "advantages": -5.455528116726782e-05, - "completion_length": 376.5, - "delta_ref_entropy_loss": 0.0269775390625, - "delta_ref_ppl": -0.023101806640625, - "entropy_loss": -0.01837158203125, - "epoch": 0.934, - "grad_norm": 0.5754219094320099, - "k1_kl": 0.023284912109375, - "k3_kl": 0.0133209228515625, - "kimi_kl": 0.0274658203125, - "learning_rate": 3.3e-08, - "loss": 0.0006, - "ppl": 0.00848388671875, - "reward": 0.9983764290809631, - "reward_std": 0.0004892166034551337, - "rewards/perpo_ocr_edit_distance_reward": 0.9983765184879303, + "advantages": -4.853521386394277e-05, + "completion_length": 608.0, + "delta_ref_entropy_loss": 0.024169921875, + "delta_ref_ppl": -0.040771484375, + "entropy_loss": -0.0277099609375, + "epoch": 0.467, + "grad_norm": 0.3302898135513906, + "k1_kl": 0.040771484375, + "k3_kl": 0.026123046875, + "kimi_kl": 0.07177734375, + "learning_rate": 2.665e-07, + "loss": 0.0011, + "ppl": 0.0091552734375, + "reward": 0.9985533356666565, + "reward_std": 0.0006021165754646063, + "rewards/perpo_ocr_edit_distance_reward": 0.998553454875946, "step": 2335, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 334.0, - "delta_ref_entropy_loss": 0.0355224609375, - "delta_ref_ppl": -0.02685546875, - "entropy_loss": -0.01611328125, - "epoch": 0.9344, - "grad_norm": 0.015460884844543017, - "k1_kl": 0.0267333984375, - "k3_kl": 0.01470947265625, - "kimi_kl": 0.03814697265625, - "learning_rate": 3.28e-08, - "loss": 0.0009, - "ppl": 0.0051116943359375, - "reward": 0.9926818907260895, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9926819801330566, + "advantages": -1.106943454942666e-05, + "completion_length": 488.0, + "delta_ref_entropy_loss": 0.12109375, + "delta_ref_ppl": -0.12890625, + "entropy_loss": -0.337890625, + "epoch": 0.4672, + "grad_norm": 2.2311660423382316, + "k1_kl": 0.1279296875, + "k3_kl": 0.08251953125, + "kimi_kl": 0.2421875, + "learning_rate": 2.664e-07, + "loss": 0.0033, + "ppl": 0.1708984375, + "reward": 0.7434030771255493, + "reward_std": 0.004525256343185902, + "rewards/perpo_ocr_edit_distance_reward": 0.7434031367301941, "step": 2336, "temperature": 0.9 }, { - "advantages": -2.3990871085288745e-05, - "completion_length": 534.0, - "delta_ref_entropy_loss": 0.03228759765625, - "delta_ref_ppl": -0.03033447265625, - "entropy_loss": -0.01275634765625, - "epoch": 0.9348, - "grad_norm": 0.5570944965477422, - "k1_kl": 0.0303955078125, - "k3_kl": 0.020263671875, - "kimi_kl": 0.06884765625, - "learning_rate": 3.2599999999999994e-08, - "loss": 0.0008, - "ppl": 0.00514984130859375, - "reward": 0.9159931838512421, - "reward_std": 0.0996593604068039, - "rewards/perpo_ocr_edit_distance_reward": 0.9159932136535645, + "advantages": -1.246588635694934e-05, + "completion_length": 390.0, + "delta_ref_entropy_loss": 0.15625, + "delta_ref_ppl": -0.158203125, + "entropy_loss": -0.4765625, + "epoch": 0.4674, + "grad_norm": 3.075233761716117, + "k1_kl": 0.158203125, + "k3_kl": 0.099609375, + "kimi_kl": 0.1796875, + "learning_rate": 2.6629999999999994e-07, + "loss": 0.004, + "ppl": 0.240234375, + "reward": 0.7459227442741394, + "reward_std": 0.0046752747148275375, + "rewards/perpo_ocr_edit_distance_reward": 0.7459228038787842, "step": 2337, "temperature": 0.9 }, { - "advantages": -1.183578024210874e-05, - "completion_length": 720.0, - "delta_ref_entropy_loss": 0.0439453125, - "delta_ref_ppl": -0.027496337890625, - "entropy_loss": -0.05963134765625, - "epoch": 0.9352, - "grad_norm": 0.679187070415631, - "k1_kl": 0.027496337890625, - "k3_kl": 0.012847900390625, - "kimi_kl": 0.02447509765625, - "learning_rate": 3.24e-08, - "loss": 0.0005, - "ppl": 0.030670166015625, - "reward": 0.9452262222766876, - "reward_std": 0.0010288916528224945, - "rewards/perpo_ocr_edit_distance_reward": 0.94522625207901, + "advantages": -4.32389133493416e-05, + "completion_length": 920.0, + "delta_ref_entropy_loss": 0.0576171875, + "delta_ref_ppl": -0.06103515625, + "entropy_loss": -0.043212890625, + "epoch": 0.4676, + "grad_norm": 1.3932809772602, + "k1_kl": 0.06103515625, + "k3_kl": 0.0291748046875, + "kimi_kl": 0.061279296875, + "learning_rate": 2.662e-07, + "loss": 0.0012, + "ppl": 0.015625, + "reward": 0.9922528862953186, + "reward_std": 0.0014756890013813972, + "rewards/perpo_ocr_edit_distance_reward": 0.9922529458999634, "step": 2338, "temperature": 0.9 }, { - "advantages": -2.9844897653674707e-05, - "completion_length": 476.0, - "delta_ref_entropy_loss": 0.0323486328125, - "delta_ref_ppl": -0.02581787109375, - "entropy_loss": -0.017181396484375, - "epoch": 0.9356, - "grad_norm": 0.15635222740061006, - "k1_kl": 0.02587890625, - "k3_kl": 0.013946533203125, - "kimi_kl": 0.03277587890625, - "learning_rate": 3.22e-08, - "loss": 0.0006, - "ppl": 0.007415771484375, - "reward": 0.999627947807312, - "reward_std": 0.00016405459609813988, - "rewards/perpo_ocr_edit_distance_reward": 0.9996279776096344, + "advantages": -4.586151771945879e-05, + "completion_length": 535.0, + "delta_ref_entropy_loss": 0.048095703125, + "delta_ref_ppl": -0.07568359375, + "entropy_loss": -0.058349609375, + "epoch": 0.4678, + "grad_norm": 0.730432175422542, + "k1_kl": 0.0751953125, + "k3_kl": 0.048828125, + "kimi_kl": 0.12890625, + "learning_rate": 2.661e-07, + "loss": 0.002, + "ppl": 0.0294189453125, + "reward": 0.995972752571106, + "reward_std": 0.0011994745582342148, + "rewards/perpo_ocr_edit_distance_reward": 0.9959728121757507, "step": 2339, "temperature": 0.9 }, { - "advantages": -5.733115540351719e-05, - "completion_length": 351.5, - "delta_ref_entropy_loss": 0.03338623046875, - "delta_ref_ppl": -0.030059814453125, - "entropy_loss": -0.03143310546875, - "epoch": 0.936, - "grad_norm": 0.4846639139486272, - "k1_kl": 0.0301513671875, - "k3_kl": 0.01856231689453125, - "kimi_kl": 0.053253173828125, - "learning_rate": 3.2e-08, - "loss": 0.0008, - "ppl": 0.0137786865234375, - "reward": 0.917753666639328, - "reward_std": 0.00013565777044277638, - "rewards/perpo_ocr_edit_distance_reward": 0.9177537262439728, + "advantages": -0.00011222703324165195, + "completion_length": 1239.0, + "delta_ref_entropy_loss": 0.03759765625, + "delta_ref_ppl": -0.04541015625, + "entropy_loss": -0.055908203125, + "epoch": 0.468, + "grad_norm": 1.1767210608048981, + "k1_kl": 0.04541015625, + "k3_kl": 0.02490234375, + "kimi_kl": 0.0458984375, + "learning_rate": 2.66e-07, + "loss": 0.0011, + "ppl": 0.02734375, + "reward": 0.9957047700881958, + "reward_std": 0.000506889948155731, + "rewards/perpo_ocr_edit_distance_reward": 0.9957048892974854, "step": 2340, "temperature": 0.9 }, { - "advantages": -9.110995961236767e-06, - "completion_length": 94.5, - "delta_ref_entropy_loss": 0.078857421875, - "delta_ref_ppl": -0.100830078125, - "entropy_loss": -0.053955078125, - "epoch": 0.9364, - "grad_norm": 2.8485570929765354, - "k1_kl": 0.100830078125, - "k3_kl": 0.070068359375, - "kimi_kl": 0.222412109375, - "learning_rate": 3.18e-08, - "loss": 0.0028, - "ppl": 0.0328369140625, - "reward": 0.9716732501983643, - "reward_std": 0.002659092308022082, - "rewards/perpo_ocr_edit_distance_reward": 0.971673309803009, + "advantages": -5.0800190365407616e-05, + "completion_length": 704.0, + "delta_ref_entropy_loss": 0.03369140625, + "delta_ref_ppl": -0.048095703125, + "entropy_loss": -0.035400390625, + "epoch": 0.4682, + "grad_norm": 0.45747813826435985, + "k1_kl": 0.048095703125, + "k3_kl": 0.032470703125, + "kimi_kl": 0.11083984375, + "learning_rate": 2.659e-07, + "loss": 0.0014, + "ppl": 0.014404296875, + "reward": 0.9911112785339355, + "reward_std": 0.0007378667360171676, + "rewards/perpo_ocr_edit_distance_reward": 0.9911113381385803, "step": 2341, "temperature": 0.9 }, { - "advantages": -8.913875399230164e-05, - "completion_length": 339.5, - "delta_ref_entropy_loss": 0.0574951171875, - "delta_ref_ppl": -0.05078125, - "entropy_loss": -0.06805419921875, - "epoch": 0.9368, - "grad_norm": 1.9419628218723861, - "k1_kl": 0.0506591796875, - "k3_kl": 0.03045654296875, - "kimi_kl": 0.081298828125, - "learning_rate": 3.16e-08, - "loss": 0.0013, - "ppl": 0.035491943359375, - "reward": 0.9477042853832245, - "reward_std": 0.0015747371799079701, - "rewards/perpo_ocr_edit_distance_reward": 0.9477043449878693, + "advantages": -8.565188181819394e-05, + "completion_length": 755.0, + "delta_ref_entropy_loss": 0.051025390625, + "delta_ref_ppl": -0.06298828125, + "entropy_loss": -0.026611328125, + "epoch": 0.4684, + "grad_norm": 13.777723996258995, + "k1_kl": 0.06298828125, + "k3_kl": 0.09423828125, + "kimi_kl": 0.130859375, + "learning_rate": 2.6579999999999996e-07, + "loss": 0.0039, + "ppl": 0.014892578125, + "reward": 0.991858184337616, + "reward_std": 0.0005959077388979495, + "rewards/perpo_ocr_edit_distance_reward": 0.991858184337616, "step": 2342, "temperature": 0.9 }, { - "advantages": -3.2842159271240234e-05, - "completion_length": 247.5, - "delta_ref_entropy_loss": 0.0928955078125, - "delta_ref_ppl": -0.2093505859375, - "entropy_loss": -0.07354736328125, - "epoch": 0.9372, - "grad_norm": 0.5448363741799621, - "k1_kl": 0.2103271484375, - "k3_kl": 0.16387939453125, - "kimi_kl": 0.929443359375, - "learning_rate": 3.1399999999999997e-08, - "loss": 0.0066, - "ppl": 0.03594970703125, - "reward": 0.9763496518135071, - "reward_std": 0.0007276910473592579, - "rewards/perpo_ocr_edit_distance_reward": 0.9763496816158295, + "advantages": -8.983271982287988e-06, + "completion_length": 207.0, + "delta_ref_entropy_loss": 0.043701171875, + "delta_ref_ppl": -0.1630859375, + "entropy_loss": -0.08251953125, + "epoch": 0.4686, + "grad_norm": 1.3015555127936682, + "k1_kl": 0.1630859375, + "k3_kl": 0.1259765625, + "kimi_kl": 0.3828125, + "learning_rate": 2.657e-07, + "loss": 0.005, + "ppl": 0.04736328125, + "reward": 0.9884688258171082, + "reward_std": 0.0008474597125314176, + "rewards/perpo_ocr_edit_distance_reward": 0.9884689450263977, "step": 2343, "temperature": 0.9 }, { - "advantages": -7.857169748604065e-05, - "completion_length": 802.5, - "delta_ref_entropy_loss": 0.03106689453125, - "delta_ref_ppl": -0.022705078125, - "entropy_loss": -0.0119171142578125, - "epoch": 0.9376, - "grad_norm": 0.30493762221964826, - "k1_kl": 0.022705078125, - "k3_kl": 0.013519287109375, - "kimi_kl": 0.037841796875, - "learning_rate": 3.1199999999999995e-08, - "loss": 0.0006, - "ppl": 0.004913330078125, - "reward": 0.992386668920517, - "reward_std": 0.0036902016581734642, - "rewards/perpo_ocr_edit_distance_reward": 0.9923867583274841, + "advantages": 2.55448497910038e-08, + "completion_length": 118.0, + "delta_ref_entropy_loss": 0.01251220703125, + "delta_ref_ppl": -0.263671875, + "entropy_loss": -0.09326171875, + "epoch": 0.4688, + "grad_norm": 2.037471842593011, + "k1_kl": 0.263671875, + "k3_kl": 0.2158203125, + "kimi_kl": 0.91796875, + "learning_rate": 2.656e-07, + "loss": 0.0086, + "ppl": 0.033935546875, + "reward": 0.9869445562362671, + "reward_std": 0.0019927890971302986, + "rewards/perpo_ocr_edit_distance_reward": 0.9869444966316223, "step": 2344, "temperature": 0.9 }, { - "advantages": -0.0002980657986242363, - "completion_length": 535.5, - "delta_ref_entropy_loss": 0.047119140625, - "delta_ref_ppl": -0.04302978515625, - "entropy_loss": -0.0899658203125, - "epoch": 0.938, - "grad_norm": 1.9165702846280626, - "k1_kl": 0.04302978515625, - "k3_kl": 0.02386474609375, - "kimi_kl": 0.0540771484375, - "learning_rate": 3.1e-08, - "loss": 0.0013, - "ppl": 0.03777313232421875, - "reward": 0.7977188527584076, - "reward_std": 0.045134950429201126, - "rewards/perpo_ocr_edit_distance_reward": 0.79771888256073, + "advantages": -0.0001068796482286416, + "completion_length": 439.0, + "delta_ref_entropy_loss": 0.0576171875, + "delta_ref_ppl": -0.07177734375, + "entropy_loss": -0.0260009765625, + "epoch": 0.469, + "grad_norm": 0.46290399675888616, + "k1_kl": 0.07177734375, + "k3_kl": 0.050537109375, + "kimi_kl": 0.1611328125, + "learning_rate": 2.655e-07, + "loss": 0.0021, + "ppl": 0.0081787109375, + "reward": 0.995455801486969, + "reward_std": 0.0005373746971599758, + "rewards/perpo_ocr_edit_distance_reward": 0.9954559206962585, "step": 2345, "temperature": 0.9 }, { - "advantages": -3.3719201155690826e-06, - "completion_length": 721.0, - "delta_ref_entropy_loss": 0.06396484375, - "delta_ref_ppl": -0.05322265625, - "entropy_loss": -0.07098388671875, - "epoch": 0.9384, - "grad_norm": 1.0962497088381087, - "k1_kl": 0.05322265625, - "k3_kl": 0.03466796875, - "kimi_kl": 0.11376953125, - "learning_rate": 3.08e-08, - "loss": 0.0014, - "ppl": 0.0343475341796875, - "reward": 0.7129293233156204, - "reward_std": 0.01365019241347909, - "rewards/perpo_ocr_edit_distance_reward": 0.712929368019104, - "step": 2346, - "temperature": 0.9 + "advantages": -9.426049473404419e-06, + "completion_length": 666.0, + "delta_ref_entropy_loss": 0.0179443359375, + "delta_ref_ppl": -0.043701171875, + "entropy_loss": -0.0947265625, + "epoch": 0.4692, + "grad_norm": 1.0758348773055522, + "k1_kl": 0.043701171875, + "k3_kl": 0.0322265625, + "kimi_kl": 0.08642578125, + "learning_rate": 2.6540000000000003e-07, + "loss": 0.0013, + "ppl": 0.03369140625, + "reward": 0.9822078943252563, + "reward_std": 0.0026129535399377346, + "rewards/perpo_ocr_edit_distance_reward": 0.9822080135345459, + "step": 2346, + "temperature": 0.9 }, { - "advantages": -4.257474817137563e-09, - "completion_length": 409.5, - "delta_ref_entropy_loss": 0.0401611328125, - "delta_ref_ppl": -0.0233154296875, - "entropy_loss": -0.02349853515625, - "epoch": 0.9388, - "grad_norm": 0.1508417043555419, - "k1_kl": 0.02337646484375, - "k3_kl": 0.01019287109375, - "kimi_kl": 0.0172119140625, - "learning_rate": 3.0599999999999996e-08, - "loss": 0.0004, - "ppl": 0.011077880859375, - "reward": 0.9989634156227112, - "reward_std": 0.00012466580665204674, - "rewards/perpo_ocr_edit_distance_reward": 0.9989634454250336, + "advantages": -2.207926445407793e-05, + "completion_length": 192.0, + "delta_ref_entropy_loss": 0.0712890625, + "delta_ref_ppl": -0.212890625, + "entropy_loss": -0.1005859375, + "epoch": 0.4694, + "grad_norm": 4.255310539543032, + "k1_kl": 0.212890625, + "k3_kl": 0.1728515625, + "kimi_kl": 0.78125, + "learning_rate": 2.6529999999999997e-07, + "loss": 0.007, + "ppl": 0.045654296875, + "reward": 0.9851809144020081, + "reward_std": 0.0022150774020701647, + "rewards/perpo_ocr_edit_distance_reward": 0.9851809740066528, "step": 2347, "temperature": 0.9 }, { - "advantages": -0.0001387127849739045, - "completion_length": 367.0, - "delta_ref_entropy_loss": 0.0413818359375, - "delta_ref_ppl": -0.03369140625, - "entropy_loss": -0.019561767578125, - "epoch": 0.9392, - "grad_norm": 0.9765559078450596, - "k1_kl": 0.03369140625, - "k3_kl": 0.019500732421875, - "kimi_kl": 0.0689697265625, - "learning_rate": 3.04e-08, - "loss": 0.0009, - "ppl": 0.008819580078125, - "reward": 0.9735940992832184, - "reward_std": 0.00022154999169288203, - "rewards/perpo_ocr_edit_distance_reward": 0.9735941886901855, + "advantages": -7.670266495551914e-05, + "completion_length": 840.0, + "delta_ref_entropy_loss": 0.044189453125, + "delta_ref_ppl": -0.04443359375, + "entropy_loss": -0.06884765625, + "epoch": 0.4696, + "grad_norm": 0.3424722939765272, + "k1_kl": 0.04443359375, + "k3_kl": 0.029541015625, + "kimi_kl": 0.10498046875, + "learning_rate": 2.6519999999999997e-07, + "loss": 0.0013, + "ppl": 0.02294921875, + "reward": 0.9051591157913208, + "reward_std": 0.0006771826301701367, + "rewards/perpo_ocr_edit_distance_reward": 0.9051591753959656, "step": 2348, "temperature": 0.9 }, { - "advantages": 1.5461020146601e-05, - "completion_length": 883.0, - "delta_ref_entropy_loss": 0.0416259765625, - "delta_ref_ppl": -0.02716064453125, - "entropy_loss": -0.0382080078125, - "epoch": 0.9396, - "grad_norm": 0.5269422472803429, - "k1_kl": 0.0272216796875, - "k3_kl": 0.015350341796875, - "kimi_kl": 0.02850341796875, - "learning_rate": 3.02e-08, - "loss": 0.0006, - "ppl": 0.01824951171875, - "reward": 0.9933865368366241, - "reward_std": 0.0007685546588618308, - "rewards/perpo_ocr_edit_distance_reward": 0.9933865666389465, + "advantages": -6.369182301568799e-06, + "completion_length": 590.0, + "delta_ref_entropy_loss": 0.052978515625, + "delta_ref_ppl": -0.07275390625, + "entropy_loss": -0.046875, + "epoch": 0.4698, + "grad_norm": 0.7815461477233934, + "k1_kl": 0.072265625, + "k3_kl": 0.0400390625, + "kimi_kl": 0.126953125, + "learning_rate": 2.651e-07, + "loss": 0.0016, + "ppl": 0.01300048828125, + "reward": 0.9809057712554932, + "reward_std": 0.0012321618851274252, + "rewards/perpo_ocr_edit_distance_reward": 0.9809058308601379, "step": 2349, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 344.0, - "delta_ref_entropy_loss": 0.09332275390625, - "delta_ref_ppl": -0.11712646484375, - "entropy_loss": -0.06005859375, - "epoch": 0.94, - "grad_norm": 0.14099917426509403, - "k1_kl": 0.1175537109375, - "k3_kl": 0.07940673828125, - "kimi_kl": 0.20294189453125, - "learning_rate": 3e-08, - "loss": 0.0032, - "ppl": 0.0350341796875, - "reward": 0.986865907907486, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9868659377098083, + "advantages": -2.6771002012537792e-05, + "completion_length": 163.0, + "delta_ref_entropy_loss": 0.06591796875, + "delta_ref_ppl": -0.275390625, + "entropy_loss": -0.09423828125, + "epoch": 0.47, + "grad_norm": 1.7162561432384882, + "k1_kl": 0.27734375, + "k3_kl": 0.2138671875, + "kimi_kl": 0.88671875, + "learning_rate": 2.65e-07, + "loss": 0.0086, + "ppl": 0.043212890625, + "reward": 0.9941889643669128, + "reward_std": 0.0024451271165162325, + "rewards/perpo_ocr_edit_distance_reward": 0.9941890239715576, "step": 2350, "temperature": 0.9 }, { - "advantages": -4.7104700570343994e-05, - "completion_length": 295.5, - "delta_ref_entropy_loss": 0.0281982421875, - "delta_ref_ppl": -0.03887939453125, - "entropy_loss": -0.018035888671875, - "epoch": 0.9404, - "grad_norm": 0.6253722525484713, - "k1_kl": 0.03900146484375, - "k3_kl": 0.024749755859375, - "kimi_kl": 0.07464599609375, - "learning_rate": 2.98e-08, - "loss": 0.001, - "ppl": 0.0087432861328125, - "reward": 0.9990821182727814, - "reward_std": 0.0007136417552828789, - "rewards/perpo_ocr_edit_distance_reward": 0.9990821778774261, + "advantages": -2.3007394702290185e-05, + "completion_length": 791.0, + "delta_ref_entropy_loss": 0.031494140625, + "delta_ref_ppl": -0.05078125, + "entropy_loss": -0.06298828125, + "epoch": 0.4702, + "grad_norm": 0.2636307185305698, + "k1_kl": 0.05078125, + "k3_kl": 0.03271484375, + "kimi_kl": 0.08642578125, + "learning_rate": 2.649e-07, + "loss": 0.0013, + "ppl": 0.022705078125, + "reward": 0.9898315668106079, + "reward_std": 0.0017495627980679274, + "rewards/perpo_ocr_edit_distance_reward": 0.9898315668106079, "step": 2351, "temperature": 0.9 }, { - "advantages": -3.477505379123613e-05, - "completion_length": 619.0, - "delta_ref_entropy_loss": 0.01800537109375, - "delta_ref_ppl": -0.013824462890625, - "entropy_loss": -0.013214111328125, - "epoch": 0.9408, - "grad_norm": 0.45776400628084307, - "k1_kl": 0.013824462890625, - "k3_kl": 0.00792694091796875, - "kimi_kl": 0.013946533203125, - "learning_rate": 2.96e-08, - "loss": 0.0004, - "ppl": 0.006072998046875, - "reward": 0.9992025792598724, - "reward_std": 0.00032302744511980563, - "rewards/perpo_ocr_edit_distance_reward": 0.9992026388645172, + "advantages": -1.3623919130623108e-06, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.00165557861328125, + "delta_ref_ppl": -0.01239013671875, + "entropy_loss": -0.037109375, + "epoch": 0.4704, + "grad_norm": 2.0590039237015576, + "k1_kl": 0.012451171875, + "k3_kl": 0.0186767578125, + "kimi_kl": 0.03173828125, + "learning_rate": 2.648e-07, + "loss": 0.0007, + "ppl": 0.01708984375, + "reward": 0.9488481879234314, + "reward_std": 0.06231909617781639, + "rewards/perpo_ocr_edit_distance_reward": 0.9488482475280762, "step": 2352, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 124.0, - "delta_ref_entropy_loss": 0.05419921875, - "delta_ref_ppl": -0.0804443359375, - "entropy_loss": -0.04876708984375, - "epoch": 0.9412, - "grad_norm": 0.04911525611840413, - "k1_kl": 0.080322265625, - "k3_kl": 0.05133056640625, - "kimi_kl": 0.1474609375, - "learning_rate": 2.94e-08, - "loss": 0.0024, - "ppl": 0.0238189697265625, - "reward": 0.9982078671455383, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9982079267501831, + "advantages": -2.1287373783707153e-06, + "completion_length": 82.0, + "delta_ref_entropy_loss": -0.000698089599609375, + "delta_ref_ppl": -0.279296875, + "entropy_loss": -0.2578125, + "epoch": 0.4706, + "grad_norm": 4.015157404675585, + "k1_kl": 0.279296875, + "k3_kl": 0.2080078125, + "kimi_kl": 0.8359375, + "learning_rate": 2.647e-07, + "loss": 0.0083, + "ppl": 0.08203125, + "reward": 0.9419485330581665, + "reward_std": 0.007863570004701614, + "rewards/perpo_ocr_edit_distance_reward": 0.9419485330581665, "step": 2353, "temperature": 0.9 }, { - "advantages": 5.8310375607106835e-05, - "completion_length": 317.0, - "delta_ref_entropy_loss": 0.0389404296875, - "delta_ref_ppl": -0.0423583984375, - "entropy_loss": -0.0335693359375, - "epoch": 0.9416, - "grad_norm": 0.6979220381689739, - "k1_kl": 0.0423583984375, - "k3_kl": 0.02886962890625, - "kimi_kl": 0.084716796875, - "learning_rate": 2.92e-08, - "loss": 0.0011, - "ppl": 0.01629638671875, - "reward": 0.998911440372467, - "reward_std": 0.0006259901419980451, - "rewards/perpo_ocr_edit_distance_reward": 0.9989114701747894, + "advantages": -9.391989442519844e-05, + "completion_length": 377.0, + "delta_ref_entropy_loss": 0.047119140625, + "delta_ref_ppl": -0.0810546875, + "entropy_loss": -0.0419921875, + "epoch": 0.4708, + "grad_norm": 0.7923838915534066, + "k1_kl": 0.0810546875, + "k3_kl": 0.059326171875, + "kimi_kl": 0.2138671875, + "learning_rate": 2.646e-07, + "loss": 0.0025, + "ppl": 0.0162353515625, + "reward": 0.9958361387252808, + "reward_std": 0.0009880390716716647, + "rewards/perpo_ocr_edit_distance_reward": 0.9958362579345703, "step": 2354, "temperature": 0.9 }, { - "advantages": -3.4911292630113167e-06, - "completion_length": 734.5, - "delta_ref_entropy_loss": 0.03106689453125, - "delta_ref_ppl": -0.0584716796875, - "entropy_loss": -0.0703125, - "epoch": 0.942, - "grad_norm": 1.708972782906878, - "k1_kl": 0.0584716796875, - "k3_kl": 0.040771484375, - "kimi_kl": 0.124267578125, - "learning_rate": 2.9e-08, - "loss": 0.0016, - "ppl": 0.03131103515625, - "reward": 0.9235666394233704, - "reward_std": 0.17244956362992525, - "rewards/perpo_ocr_edit_distance_reward": 0.9235667288303375, + "advantages": -6.505421424662927e-06, + "completion_length": 546.0, + "delta_ref_entropy_loss": 0.04541015625, + "delta_ref_ppl": -0.0791015625, + "entropy_loss": -0.06640625, + "epoch": 0.471, + "grad_norm": 0.7694610322251569, + "k1_kl": 0.0791015625, + "k3_kl": 0.043701171875, + "kimi_kl": 0.10302734375, + "learning_rate": 2.645e-07, + "loss": 0.0018, + "ppl": 0.0244140625, + "reward": 0.9827014207839966, + "reward_std": 0.005134718492627144, + "rewards/perpo_ocr_edit_distance_reward": 0.9827014803886414, "step": 2355, "temperature": 0.9 }, { - "advantages": -6.386212021425308e-07, - "completion_length": 475.0, - "delta_ref_entropy_loss": 0.0648193359375, - "delta_ref_ppl": -0.12176513671875, - "entropy_loss": -0.0540771484375, - "epoch": 0.9424, - "grad_norm": 4.2832939401008225, - "k1_kl": 0.12176513671875, - "k3_kl": 0.088348388671875, - "kimi_kl": 0.28173828125, - "learning_rate": 2.8799999999999996e-08, - "loss": 0.0035, - "ppl": 0.02935791015625, - "reward": 0.5961880460381508, - "reward_std": 0.019815120846033096, - "rewards/perpo_ocr_edit_distance_reward": 0.5961881056427956, + "advantages": -1.125676317315083e-05, + "completion_length": 560.0, + "delta_ref_entropy_loss": 0.03369140625, + "delta_ref_ppl": -0.0625, + "entropy_loss": -0.057373046875, + "epoch": 0.4712, + "grad_norm": 0.4139973913331772, + "k1_kl": 0.0625, + "k3_kl": 0.0478515625, + "kimi_kl": 0.1328125, + "learning_rate": 2.644e-07, + "loss": 0.0019, + "ppl": 0.0225830078125, + "reward": 0.992680013179779, + "reward_std": 0.0014123816508799791, + "rewards/perpo_ocr_edit_distance_reward": 0.9926800727844238, "step": 2356, "temperature": 0.9 }, { - "advantages": -4.281316614651587e-05, - "completion_length": 529.5, - "delta_ref_entropy_loss": 0.038818359375, - "delta_ref_ppl": -0.0391845703125, - "entropy_loss": -0.017578125, - "epoch": 0.9428, - "grad_norm": 0.28168002041031553, - "k1_kl": 0.03912353515625, - "k3_kl": 0.025238037109375, - "kimi_kl": 0.08740234375, - "learning_rate": 2.8599999999999998e-08, - "loss": 0.0011, - "ppl": 0.0071868896484375, - "reward": 0.996919184923172, - "reward_std": 0.0009302275284426287, - "rewards/perpo_ocr_edit_distance_reward": 0.9969192147254944, + "advantages": -1.900536699395161e-05, + "completion_length": 353.0, + "delta_ref_entropy_loss": 0.0498046875, + "delta_ref_ppl": -0.0634765625, + "entropy_loss": -0.032470703125, + "epoch": 0.4714, + "grad_norm": 0.6858211203808854, + "k1_kl": 0.0634765625, + "k3_kl": 0.042724609375, + "kimi_kl": 0.1494140625, + "learning_rate": 2.6429999999999995e-07, + "loss": 0.0017, + "ppl": 0.00933837890625, + "reward": 0.9977176785469055, + "reward_std": 0.000795968750026077, + "rewards/perpo_ocr_edit_distance_reward": 0.9977176785469055, "step": 2357, "temperature": 0.9 }, { - "advantages": -2.6123865609406494e-05, - "completion_length": 324.5, - "delta_ref_entropy_loss": 0.0262451171875, - "delta_ref_ppl": -0.02947998046875, - "entropy_loss": -0.013458251953125, - "epoch": 0.9432, - "grad_norm": 0.3696554895336508, - "k1_kl": 0.02960205078125, - "k3_kl": 0.01953125, - "kimi_kl": 0.0618896484375, - "learning_rate": 2.84e-08, - "loss": 0.0008, - "ppl": 0.0056304931640625, - "reward": 0.9992832839488983, - "reward_std": 0.0002758447953965515, - "rewards/perpo_ocr_edit_distance_reward": 0.9992833435535431, + "advantages": -0.0001406840019626543, + "completion_length": 1242.0, + "delta_ref_entropy_loss": 0.036376953125, + "delta_ref_ppl": -0.038818359375, + "entropy_loss": -0.044921875, + "epoch": 0.4716, + "grad_norm": 0.8099672432226276, + "k1_kl": 0.0390625, + "k3_kl": 0.0230712890625, + "kimi_kl": 0.051513671875, + "learning_rate": 2.642e-07, + "loss": 0.0011, + "ppl": 0.018798828125, + "reward": 0.9951439499855042, + "reward_std": 0.0003841787693090737, + "rewards/perpo_ocr_edit_distance_reward": 0.9951440691947937, "step": 2358, "temperature": 0.9 }, { - "advantages": -2.0308154489612207e-05, - "completion_length": 558.5, - "delta_ref_entropy_loss": 0.055419921875, - "delta_ref_ppl": -0.0662841796875, - "entropy_loss": -0.07281494140625, - "epoch": 0.9436, - "grad_norm": 0.787833828865061, - "k1_kl": 0.0665283203125, - "k3_kl": 0.0445556640625, - "kimi_kl": 0.1455078125, - "learning_rate": 2.8199999999999998e-08, - "loss": 0.0018, - "ppl": 0.04296875, - "reward": 0.9810739159584045, - "reward_std": 0.0039100018329918385, - "rewards/perpo_ocr_edit_distance_reward": 0.9810740053653717, + "advantages": -0.0001327991485595703, + "completion_length": 673.0, + "delta_ref_entropy_loss": 0.045654296875, + "delta_ref_ppl": -0.059326171875, + "entropy_loss": -0.038330078125, + "epoch": 0.4718, + "grad_norm": 0.5972836678053163, + "k1_kl": 0.059326171875, + "k3_kl": 0.03759765625, + "kimi_kl": 0.10986328125, + "learning_rate": 2.641e-07, + "loss": 0.0016, + "ppl": 0.016357421875, + "reward": 0.9934911727905273, + "reward_std": 0.0005411716992966831, + "rewards/perpo_ocr_edit_distance_reward": 0.9934912323951721, "step": 2359, "temperature": 0.9 }, { - "advantages": -2.4250576643680688e-05, - "completion_length": 750.0, - "delta_ref_entropy_loss": 0.0711669921875, - "delta_ref_ppl": -0.04901123046875, - "entropy_loss": -0.046630859375, - "epoch": 0.944, - "grad_norm": 0.7031092217926204, - "k1_kl": 0.04901123046875, - "k3_kl": 0.024169921875, - "kimi_kl": 0.0504150390625, - "learning_rate": 2.8e-08, - "loss": 0.001, - "ppl": 0.02252197265625, - "reward": 0.9749899506568909, - "reward_std": 0.0007797807629685849, - "rewards/perpo_ocr_edit_distance_reward": 0.9749899804592133, + "advantages": -3.482614556560293e-05, + "completion_length": 145.0, + "delta_ref_entropy_loss": 0.02734375, + "delta_ref_ppl": -0.3046875, + "entropy_loss": -0.1650390625, + "epoch": 0.472, + "grad_norm": 2.497091649689393, + "k1_kl": 0.3046875, + "k3_kl": 0.23828125, + "kimi_kl": 0.9765625, + "learning_rate": 2.64e-07, + "loss": 0.0096, + "ppl": 0.0625, + "reward": 0.9912282228469849, + "reward_std": 0.0023452916648238897, + "rewards/perpo_ocr_edit_distance_reward": 0.9912282824516296, "step": 2360, "temperature": 0.9 }, { - "advantages": -4.35113939190046e-06, - "completion_length": 317.0, - "delta_ref_entropy_loss": 0.125, - "delta_ref_ppl": -0.19189453125, - "entropy_loss": -0.4599609375, - "epoch": 0.9444, - "grad_norm": 5.825290758597221, - "k1_kl": 0.19091796875, - "k3_kl": 0.1337890625, - "kimi_kl": 0.31787109375, - "learning_rate": 2.7799999999999997e-08, - "loss": 0.0054, - "ppl": 0.263671875, - "reward": 0.6189426779747009, - "reward_std": 0.037257069372572005, - "rewards/perpo_ocr_edit_distance_reward": 0.6189427226781845, + "advantages": -2.454859895806294e-05, + "completion_length": 384.0, + "delta_ref_entropy_loss": 0.11181640625, + "delta_ref_ppl": -0.146484375, + "entropy_loss": -0.1025390625, + "epoch": 0.4722, + "grad_norm": 1.2116212915860876, + "k1_kl": 0.146484375, + "k3_kl": 0.09765625, + "kimi_kl": 0.32421875, + "learning_rate": 2.6390000000000003e-07, + "loss": 0.0039, + "ppl": 0.04052734375, + "reward": 0.9188910126686096, + "reward_std": 0.00267690047621727, + "rewards/perpo_ocr_edit_distance_reward": 0.9188911318778992, "step": 2361, "temperature": 0.9 }, { - "advantages": -2.098935101457755e-05, - "completion_length": 263.5, - "delta_ref_entropy_loss": 0.0716552734375, - "delta_ref_ppl": -0.149169921875, - "entropy_loss": -0.07012939453125, - "epoch": 0.9448, - "grad_norm": 2.9380746547602272, - "k1_kl": 0.149658203125, - "k3_kl": 0.1212158203125, - "kimi_kl": 0.599609375, - "learning_rate": 2.76e-08, - "loss": 0.0049, - "ppl": 0.03460693359375, - "reward": 0.8116451799869537, - "reward_std": 0.003264794300775975, - "rewards/perpo_ocr_edit_distance_reward": 0.8116452395915985, + "advantages": 0.0, + "completion_length": 214.0, + "delta_ref_entropy_loss": 0.052734375, + "delta_ref_ppl": -0.140625, + "entropy_loss": -0.091796875, + "epoch": 0.4724, + "grad_norm": 1.0157528819406605, + "k1_kl": 0.140625, + "k3_kl": 0.10107421875, + "kimi_kl": 0.498046875, + "learning_rate": 2.6379999999999997e-07, + "loss": 0.004, + "ppl": 0.03662109375, + "reward": 0.951410174369812, + "reward_std": 0.0008110894705168903, + "rewards/perpo_ocr_edit_distance_reward": 0.951410174369812, "step": 2362, "temperature": 0.9 }, { - "advantages": -9.16208591661416e-05, - "completion_length": 620.5, - "delta_ref_entropy_loss": 0.02850341796875, - "delta_ref_ppl": -0.01556396484375, - "entropy_loss": -0.02459716796875, - "epoch": 0.9452, - "grad_norm": 0.49089004139323883, - "k1_kl": 0.015594482421875, - "k3_kl": 0.007415771484375, - "kimi_kl": 0.0108642578125, - "learning_rate": 2.74e-08, - "loss": 0.0004, - "ppl": 0.01171875, - "reward": 0.9994960725307465, - "reward_std": 6.618765473831445e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9994960725307465, + "advantages": -1.1154584171890747e-06, + "completion_length": 1570.0, + "delta_ref_entropy_loss": 0.00125885009765625, + "delta_ref_ppl": -0.0169677734375, + "entropy_loss": -0.07373046875, + "epoch": 0.4726, + "grad_norm": 0.8955723345296839, + "k1_kl": 0.01708984375, + "k3_kl": 0.01373291015625, + "kimi_kl": 0.0322265625, + "learning_rate": 2.6369999999999996e-07, + "loss": 0.0006, + "ppl": 0.03759765625, + "reward": 0.6398026347160339, + "reward_std": 0.04551501199603081, + "rewards/perpo_ocr_edit_distance_reward": 0.6398026943206787, "step": 2363, "temperature": 0.9 }, { - "advantages": -1.3504710295819677e-05, - "completion_length": 460.5, - "delta_ref_entropy_loss": 0.043212890625, - "delta_ref_ppl": -0.0491943359375, - "entropy_loss": -0.014007568359375, - "epoch": 0.9456, - "grad_norm": 0.26355338205722967, - "k1_kl": 0.049072265625, - "k3_kl": 0.0345458984375, - "kimi_kl": 0.1451416015625, - "learning_rate": 2.72e-08, + "advantages": -2.850805140042212e-05, + "completion_length": 813.0, + "delta_ref_entropy_loss": 0.01513671875, + "delta_ref_ppl": -0.048583984375, + "entropy_loss": -0.051513671875, + "epoch": 0.4728, + "grad_norm": 1.4126606618556967, + "k1_kl": 0.048583984375, + "k3_kl": 0.035400390625, + "kimi_kl": 0.1083984375, + "learning_rate": 2.636e-07, "loss": 0.0014, - "ppl": 0.005218505859375, - "reward": 0.9996663928031921, - "reward_std": 0.00026479928055778146, - "rewards/perpo_ocr_edit_distance_reward": 0.9996663928031921, + "ppl": 0.022705078125, + "reward": 0.9769948720932007, + "reward_std": 0.002887952607125044, + "rewards/perpo_ocr_edit_distance_reward": 0.9769949913024902, "step": 2364, "temperature": 0.9 }, { - "advantages": -8.30207568469632e-07, - "completion_length": 494.5, - "delta_ref_entropy_loss": 0.0400390625, - "delta_ref_ppl": -0.039306640625, - "entropy_loss": -0.03887939453125, - "epoch": 0.946, - "grad_norm": 1.5680645406462066, - "k1_kl": 0.0394287109375, - "k3_kl": 0.02642822265625, - "kimi_kl": 0.08563232421875, - "learning_rate": 2.6999999999999997e-08, - "loss": 0.0011, - "ppl": 0.019805908203125, - "reward": 0.9760735034942627, - "reward_std": 0.015408878214657307, - "rewards/perpo_ocr_edit_distance_reward": 0.9760735630989075, + "advantages": -0.0003040347946807742, + "completion_length": 958.0, + "delta_ref_entropy_loss": 0.04345703125, + "delta_ref_ppl": -0.0478515625, + "entropy_loss": -0.032958984375, + "epoch": 0.473, + "grad_norm": 1.4632794640123319, + "k1_kl": 0.0478515625, + "k3_kl": 0.025146484375, + "kimi_kl": 0.0595703125, + "learning_rate": 2.635e-07, + "loss": 0.0013, + "ppl": 0.01068115234375, + "reward": 0.8029229044914246, + "reward_std": 0.00012405896268319339, + "rewards/perpo_ocr_edit_distance_reward": 0.8029229640960693, "step": 2365, "temperature": 0.9 }, { - "advantages": -2.1023410766929373e-05, - "completion_length": 989.5, - "delta_ref_entropy_loss": 0.02728271484375, - "delta_ref_ppl": -0.02325439453125, - "entropy_loss": -0.02593994140625, - "epoch": 0.9464, - "grad_norm": 0.3703649408678135, - "k1_kl": 0.02325439453125, - "k3_kl": 0.013427734375, - "kimi_kl": 0.03228759765625, - "learning_rate": 2.68e-08, - "loss": 0.0006, - "ppl": 0.010711669921875, - "reward": 0.9721066653728485, - "reward_std": 0.055985069659072906, - "rewards/perpo_ocr_edit_distance_reward": 0.9721066951751709, + "advantages": -2.1815301806782372e-05, + "completion_length": 430.0, + "delta_ref_entropy_loss": 0.0361328125, + "delta_ref_ppl": -0.057861328125, + "entropy_loss": -0.031494140625, + "epoch": 0.4732, + "grad_norm": 1.4214282105760843, + "k1_kl": 0.0576171875, + "k3_kl": 0.036865234375, + "kimi_kl": 0.11279296875, + "learning_rate": 2.634e-07, + "loss": 0.0015, + "ppl": 0.01422119140625, + "reward": 0.9964091777801514, + "reward_std": 0.0006801955751143396, + "rewards/perpo_ocr_edit_distance_reward": 0.9964092373847961, "step": 2366, "temperature": 0.9 }, { - "advantages": -2.2138868871479644e-07, - "completion_length": 651.0, - "delta_ref_entropy_loss": 0.0635986328125, - "delta_ref_ppl": -0.0660400390625, - "entropy_loss": -0.11669921875, - "epoch": 0.9468, - "grad_norm": 3.207756798584916, - "k1_kl": 0.066162109375, - "k3_kl": 0.0521240234375, - "kimi_kl": 0.11669921875, - "learning_rate": 2.6599999999999997e-08, - "loss": 0.0021, - "ppl": 0.067138671875, - "reward": 0.9558807611465454, - "reward_std": 0.03000091202557087, - "rewards/perpo_ocr_edit_distance_reward": 0.9558807909488678, + "advantages": -2.6490008167456836e-05, + "completion_length": 877.0, + "delta_ref_entropy_loss": 0.028076171875, + "delta_ref_ppl": -0.0576171875, + "entropy_loss": -0.03173828125, + "epoch": 0.4734, + "grad_norm": 0.3121542240834153, + "k1_kl": 0.0576171875, + "k3_kl": 0.036865234375, + "kimi_kl": 0.1015625, + "learning_rate": 2.633e-07, + "loss": 0.0015, + "ppl": 0.010498046875, + "reward": 0.9952553510665894, + "reward_std": 0.0005433133337646723, + "rewards/perpo_ocr_edit_distance_reward": 0.9952553510665894, "step": 2367, "temperature": 0.9 }, { - "advantages": -3.3165729291795287e-06, - "completion_length": 386.5, - "delta_ref_entropy_loss": 0.044189453125, - "delta_ref_ppl": -0.05194091796875, - "entropy_loss": -0.023895263671875, - "epoch": 0.9472, - "grad_norm": 0.4316127585858051, - "k1_kl": 0.05206298828125, - "k3_kl": 0.0360107421875, - "kimi_kl": 0.1436767578125, - "learning_rate": 2.6399999999999998e-08, - "loss": 0.0014, - "ppl": 0.011077880859375, - "reward": 0.9949290156364441, - "reward_std": 0.0012344460701569915, - "rewards/perpo_ocr_edit_distance_reward": 0.9949290156364441, + "advantages": 0.0, + "completion_length": 573.0, + "delta_ref_entropy_loss": 0.053955078125, + "delta_ref_ppl": -0.0771484375, + "entropy_loss": -0.04248046875, + "epoch": 0.4736, + "grad_norm": 0.5066885545128837, + "k1_kl": 0.07763671875, + "k3_kl": 0.052001953125, + "kimi_kl": 0.1904296875, + "learning_rate": 2.632e-07, + "loss": 0.0021, + "ppl": 0.0177001953125, + "reward": 0.9957039952278137, + "reward_std": 0.0009139996254816651, + "rewards/perpo_ocr_edit_distance_reward": 0.9957040548324585, "step": 2368, "temperature": 0.9 }, { - "advantages": -4.521438313531689e-06, - "completion_length": 436.5, - "delta_ref_entropy_loss": 0.03656005859375, - "delta_ref_ppl": -0.02716064453125, - "entropy_loss": -0.021728515625, - "epoch": 0.9476, - "grad_norm": 0.4653772932665809, - "k1_kl": 0.02728271484375, - "k3_kl": 0.0155029296875, - "kimi_kl": 0.03466796875, - "learning_rate": 2.62e-08, - "loss": 0.0006, - "ppl": 0.00860595703125, - "reward": 0.9941245019435883, - "reward_std": 0.00042091726209037006, - "rewards/perpo_ocr_edit_distance_reward": 0.9941245019435883, + "advantages": -6.505421424662927e-06, + "completion_length": 217.0, + "delta_ref_entropy_loss": 0.1572265625, + "delta_ref_ppl": -0.20703125, + "entropy_loss": -0.2197265625, + "epoch": 0.4738, + "grad_norm": 2.3851098479917177, + "k1_kl": 0.20703125, + "k3_kl": 0.1533203125, + "kimi_kl": 0.578125, + "learning_rate": 2.6309999999999997e-07, + "loss": 0.0062, + "ppl": 0.11181640625, + "reward": 0.8270641565322876, + "reward_std": 0.009051409550011158, + "rewards/perpo_ocr_edit_distance_reward": 0.8270642161369324, "step": 2369, "temperature": 0.9 }, { - "advantages": -0.00012992961637792177, - "completion_length": 356.5, - "delta_ref_entropy_loss": 0.02734375, - "delta_ref_ppl": -0.028106689453125, - "entropy_loss": -0.02252197265625, - "epoch": 0.948, - "grad_norm": 0.7744972525165897, - "k1_kl": 0.02813720703125, - "k3_kl": 0.019439697265625, - "kimi_kl": 0.071746826171875, - "learning_rate": 2.5999999999999998e-08, - "loss": 0.0009, - "ppl": 0.01202392578125, - "reward": 0.9972241222858429, - "reward_std": 0.0003441403532633558, - "rewards/perpo_ocr_edit_distance_reward": 0.9972242414951324, + "advantages": -1.3449362995743286e-05, + "completion_length": 234.0, + "delta_ref_entropy_loss": 0.0703125, + "delta_ref_ppl": -0.1845703125, + "entropy_loss": -0.058837890625, + "epoch": 0.474, + "grad_norm": 1.497903666568278, + "k1_kl": 0.185546875, + "k3_kl": 0.138671875, + "kimi_kl": 0.53125, + "learning_rate": 2.63e-07, + "loss": 0.0056, + "ppl": 0.0264892578125, + "reward": 0.9835361838340759, + "reward_std": 0.0017999060219153762, + "rewards/perpo_ocr_edit_distance_reward": 0.9835362434387207, "step": 2370, "temperature": 0.9 }, { - "advantages": -4.437140211166479e-05, - "completion_length": 714.5, - "delta_ref_entropy_loss": 0.0703125, - "delta_ref_ppl": -0.0596923828125, - "entropy_loss": -0.0921630859375, - "epoch": 0.9484, - "grad_norm": 0.9904626280778546, - "k1_kl": 0.0596923828125, - "k3_kl": 0.03582763671875, - "kimi_kl": 0.109375, - "learning_rate": 2.58e-08, - "loss": 0.0015, - "ppl": 0.04864501953125, - "reward": 0.8772079050540924, - "reward_std": 0.040507582380087115, - "rewards/perpo_ocr_edit_distance_reward": 0.8772079348564148, + "advantages": -4.6251076128100976e-05, + "completion_length": 974.0, + "delta_ref_entropy_loss": 0.03173828125, + "delta_ref_ppl": -0.039306640625, + "entropy_loss": -0.043701171875, + "epoch": 0.4742, + "grad_norm": 0.4042577198233776, + "k1_kl": 0.039306640625, + "k3_kl": 0.022216796875, + "kimi_kl": 0.05908203125, + "learning_rate": 2.629e-07, + "loss": 0.0009, + "ppl": 0.01611328125, + "reward": 0.9943240284919739, + "reward_std": 0.00045233272248879075, + "rewards/perpo_ocr_edit_distance_reward": 0.9943240284919739, "step": 2371, "temperature": 0.9 }, { - "advantages": -6.6927505372405e-05, - "completion_length": 756.0, - "delta_ref_entropy_loss": 0.03802490234375, - "delta_ref_ppl": -0.03131103515625, - "entropy_loss": -0.0294189453125, - "epoch": 0.9488, - "grad_norm": 1.2989394178405564, - "k1_kl": 0.03131103515625, - "k3_kl": 0.017486572265625, - "kimi_kl": 0.0416259765625, - "learning_rate": 2.56e-08, - "loss": 0.0008, - "ppl": 0.0165863037109375, - "reward": 0.9810785949230194, - "reward_std": 0.007020274679234717, - "rewards/perpo_ocr_edit_distance_reward": 0.9810786545276642, + "advantages": -4.002026253147051e-05, + "completion_length": 158.0, + "delta_ref_entropy_loss": 0.1162109375, + "delta_ref_ppl": -0.328125, + "entropy_loss": -0.1435546875, + "epoch": 0.4744, + "grad_norm": 1.8564228716398201, + "k1_kl": 0.328125, + "k3_kl": 0.244140625, + "kimi_kl": 1.0, + "learning_rate": 2.6279999999999994e-07, + "loss": 0.0098, + "ppl": 0.060302734375, + "reward": 0.9289631843566895, + "reward_std": 0.00287676346488297, + "rewards/perpo_ocr_edit_distance_reward": 0.928963303565979, "step": 2372, "temperature": 0.9 }, { - "advantages": -4.9952950575971045e-05, - "completion_length": 706.5, - "delta_ref_entropy_loss": 0.02655029296875, - "delta_ref_ppl": -0.02288818359375, - "entropy_loss": -0.017791748046875, - "epoch": 0.9492, - "grad_norm": 0.525358575433947, - "k1_kl": 0.02288818359375, - "k3_kl": 0.01702880859375, - "kimi_kl": 0.060546875, - "learning_rate": 2.54e-08, - "loss": 0.0007, - "ppl": 0.010101318359375, - "reward": 0.9976007640361786, - "reward_std": 0.0019480240371194668, - "rewards/perpo_ocr_edit_distance_reward": 0.9976008236408234, + "advantages": -1.9039427570533007e-05, + "completion_length": 780.0, + "delta_ref_entropy_loss": 0.12890625, + "delta_ref_ppl": -0.11328125, + "entropy_loss": -0.10888671875, + "epoch": 0.4746, + "grad_norm": 0.8206475949367993, + "k1_kl": 0.11328125, + "k3_kl": 0.0625, + "kimi_kl": 0.2197265625, + "learning_rate": 2.627e-07, + "loss": 0.0025, + "ppl": 0.047607421875, + "reward": 0.8556867837905884, + "reward_std": 0.0034726315643638372, + "rewards/perpo_ocr_edit_distance_reward": 0.8556869029998779, "step": 2373, "temperature": 0.9 }, { - "advantages": -3.216948141471221e-05, - "completion_length": 1382.5, - "delta_ref_entropy_loss": 0.04364013671875, - "delta_ref_ppl": -0.03924560546875, - "entropy_loss": -0.0599365234375, - "epoch": 0.9496, - "grad_norm": 2.838562239705388, - "k1_kl": 0.03924560546875, - "k3_kl": 0.0238037109375, - "kimi_kl": 0.0662841796875, - "learning_rate": 2.52e-08, - "loss": 0.001, - "ppl": 0.0273895263671875, - "reward": 0.973150759935379, - "reward_std": 0.03922258959937608, - "rewards/perpo_ocr_edit_distance_reward": 0.9731508493423462, + "advantages": -2.145767393813003e-05, + "completion_length": 688.0, + "delta_ref_entropy_loss": 0.054931640625, + "delta_ref_ppl": -0.061279296875, + "entropy_loss": -0.049072265625, + "epoch": 0.4748, + "grad_norm": 0.8626831011797723, + "k1_kl": 0.061279296875, + "k3_kl": 0.034912109375, + "kimi_kl": 0.080078125, + "learning_rate": 2.626e-07, + "loss": 0.0014, + "ppl": 0.0208740234375, + "reward": 0.9964534044265747, + "reward_std": 0.0006929162773303688, + "rewards/perpo_ocr_edit_distance_reward": 0.9964534640312195, "step": 2374, "temperature": 0.9 }, { - "advantages": -0.00011802145932682251, - "completion_length": 491.5, - "delta_ref_entropy_loss": 0.0745849609375, - "delta_ref_ppl": -0.0611572265625, - "entropy_loss": -0.11651611328125, - "epoch": 0.95, - "grad_norm": 1.086624574531613, - "k1_kl": 0.06121826171875, - "k3_kl": 0.036865234375, - "kimi_kl": 0.1002197265625, - "learning_rate": 2.5e-08, - "loss": 0.0016, - "ppl": 0.066497802734375, - "reward": 0.8485356271266937, - "reward_std": 0.0064233365264954045, - "rewards/perpo_ocr_edit_distance_reward": 0.8485356867313385, + "advantages": -5.1089696171402466e-06, + "completion_length": 509.0, + "delta_ref_entropy_loss": 0.10400390625, + "delta_ref_ppl": -0.1396484375, + "entropy_loss": -0.205078125, + "epoch": 0.475, + "grad_norm": 2.1454681920004637, + "k1_kl": 0.1396484375, + "k3_kl": 0.087890625, + "kimi_kl": 0.2451171875, + "learning_rate": 2.625e-07, + "loss": 0.0035, + "ppl": 0.10107421875, + "reward": 0.8434136509895325, + "reward_std": 0.006528791971504688, + "rewards/perpo_ocr_edit_distance_reward": 0.843413770198822, "step": 2375, "temperature": 0.9 }, { - "advantages": -9.121640175635548e-06, - "completion_length": 515.0, - "delta_ref_entropy_loss": 0.0640869140625, - "delta_ref_ppl": -0.04412841796875, - "entropy_loss": -0.0635986328125, - "epoch": 0.9504, - "grad_norm": 1.0518651757903592, - "k1_kl": 0.04443359375, - "k3_kl": 0.024017333984375, - "kimi_kl": 0.042236328125, - "learning_rate": 2.4799999999999997e-08, - "loss": 0.001, - "ppl": 0.032958984375, - "reward": 0.9414370954036713, - "reward_std": 0.0186094893142581, - "rewards/perpo_ocr_edit_distance_reward": 0.9414371848106384, + "advantages": -8.354868623428047e-05, + "completion_length": 1022.0, + "delta_ref_entropy_loss": 0.045166015625, + "delta_ref_ppl": -0.04736328125, + "entropy_loss": -0.04443359375, + "epoch": 0.4752, + "grad_norm": 0.6358285122076504, + "k1_kl": 0.04736328125, + "k3_kl": 0.0260009765625, + "kimi_kl": 0.06396484375, + "learning_rate": 2.624e-07, + "loss": 0.0011, + "ppl": 0.013671875, + "reward": 0.996245801448822, + "reward_std": 0.0009190035052597523, + "rewards/perpo_ocr_edit_distance_reward": 0.9962458610534668, "step": 2376, "temperature": 0.9 }, { - "advantages": -6.404945244309346e-05, - "completion_length": 728.5, - "delta_ref_entropy_loss": 0.03131103515625, - "delta_ref_ppl": -0.0296630859375, - "entropy_loss": -0.0579833984375, - "epoch": 0.9508, - "grad_norm": 1.0787643699510292, - "k1_kl": 0.02972412109375, - "k3_kl": 0.01934814453125, - "kimi_kl": 0.0474853515625, - "learning_rate": 2.46e-08, - "loss": 0.0008, - "ppl": 0.03106689453125, - "reward": 0.9115479290485382, - "reward_std": 0.020280321994505357, - "rewards/perpo_ocr_edit_distance_reward": 0.9115480184555054, + "advantages": -2.9802324661432067e-06, + "completion_length": 956.0, + "delta_ref_entropy_loss": 0.050048828125, + "delta_ref_ppl": -0.05908203125, + "entropy_loss": -0.08349609375, + "epoch": 0.4754, + "grad_norm": 0.8200532460198142, + "k1_kl": 0.05908203125, + "k3_kl": 0.033203125, + "kimi_kl": 0.07958984375, + "learning_rate": 2.6229999999999996e-07, + "loss": 0.0013, + "ppl": 0.031982421875, + "reward": 0.9610334038734436, + "reward_std": 0.022511160001158714, + "rewards/perpo_ocr_edit_distance_reward": 0.9610335230827332, "step": 2377, "temperature": 0.9 }, { - "advantages": -1.0124274922418408e-05, - "completion_length": 807.0, - "delta_ref_entropy_loss": 0.03570556640625, - "delta_ref_ppl": -0.03369140625, - "entropy_loss": -0.017578125, - "epoch": 0.9512, - "grad_norm": 0.31119271959905287, - "k1_kl": 0.033721923828125, - "k3_kl": 0.022796630859375, - "kimi_kl": 0.11395263671875, - "learning_rate": 2.44e-08, - "loss": 0.0009, - "ppl": 0.00716400146484375, - "reward": 0.9995712339878082, - "reward_std": 0.00016043180949054658, - "rewards/perpo_ocr_edit_distance_reward": 0.9995712637901306, + "advantages": -2.620901432237588e-05, + "completion_length": 335.0, + "delta_ref_entropy_loss": 0.07177734375, + "delta_ref_ppl": -0.09326171875, + "entropy_loss": -0.0673828125, + "epoch": 0.4756, + "grad_norm": 0.7021732647003217, + "k1_kl": 0.09326171875, + "k3_kl": 0.06103515625, + "kimi_kl": 0.20703125, + "learning_rate": 2.6219999999999995e-07, + "loss": 0.0025, + "ppl": 0.021728515625, + "reward": 0.9922193288803101, + "reward_std": 0.000874068180564791, + "rewards/perpo_ocr_edit_distance_reward": 0.9922193288803101, "step": 2378, "temperature": 0.9 }, { - "advantages": -5.8493444896612345e-05, - "completion_length": 643.0, - "delta_ref_entropy_loss": 0.07159423828125, - "delta_ref_ppl": -0.0467071533203125, - "entropy_loss": -0.0728759765625, - "epoch": 0.9516, - "grad_norm": 1.0339979336917875, - "k1_kl": 0.0467071533203125, - "k3_kl": 0.024505615234375, - "kimi_kl": 0.07386016845703125, - "learning_rate": 2.4199999999999998e-08, - "loss": 0.001, - "ppl": 0.03802490234375, - "reward": 0.9664712250232697, - "reward_std": 0.003107859600277152, - "rewards/perpo_ocr_edit_distance_reward": 0.9664712250232697, + "advantages": -1.253400569112273e-05, + "completion_length": 440.0, + "delta_ref_entropy_loss": 0.06396484375, + "delta_ref_ppl": -0.11572265625, + "entropy_loss": -0.08984375, + "epoch": 0.4758, + "grad_norm": 0.9256635731180399, + "k1_kl": 0.11572265625, + "k3_kl": 0.0791015625, + "kimi_kl": 0.29296875, + "learning_rate": 2.621e-07, + "loss": 0.0032, + "ppl": 0.03857421875, + "reward": 0.9651934504508972, + "reward_std": 0.001942210365086794, + "rewards/perpo_ocr_edit_distance_reward": 0.9651934504508972, "step": 2379, "temperature": 0.9 }, { - "advantages": -0.0001241777717950754, - "completion_length": 610.5, - "delta_ref_entropy_loss": 0.02532958984375, - "delta_ref_ppl": -0.01904296875, - "entropy_loss": -0.01849365234375, - "epoch": 0.952, - "grad_norm": 0.2551002274623474, - "k1_kl": 0.01904296875, - "k3_kl": 0.011077880859375, - "kimi_kl": 0.03533935546875, - "learning_rate": 2.4e-08, - "loss": 0.0006, - "ppl": 0.0065765380859375, - "reward": 0.9982355535030365, - "reward_std": 0.0003123363567283377, - "rewards/perpo_ocr_edit_distance_reward": 0.9982356131076813, + "advantages": -0.00013739723362959921, + "completion_length": 558.0, + "delta_ref_entropy_loss": 0.0263671875, + "delta_ref_ppl": -0.07861328125, + "entropy_loss": -0.032470703125, + "epoch": 0.476, + "grad_norm": 1.135662138300715, + "k1_kl": 0.07861328125, + "k3_kl": 0.06103515625, + "kimi_kl": 0.2158203125, + "learning_rate": 2.62e-07, + "loss": 0.0026, + "ppl": 0.0130615234375, + "reward": 0.9983032941818237, + "reward_std": 0.0007676161476410925, + "rewards/perpo_ocr_edit_distance_reward": 0.9983034729957581, "step": 2380, "temperature": 0.9 }, { - "advantages": -4.4120209850007086e-05, - "completion_length": 474.5, - "delta_ref_entropy_loss": 0.0335693359375, - "delta_ref_ppl": -0.0294189453125, - "entropy_loss": -0.041259765625, - "epoch": 0.9524, - "grad_norm": 0.8770815428958451, - "k1_kl": 0.02947998046875, - "k3_kl": 0.02099609375, - "kimi_kl": 0.049560546875, - "learning_rate": 2.38e-08, - "loss": 0.0009, - "ppl": 0.022613525390625, - "reward": 0.988421231508255, - "reward_std": 0.0007536588527727872, - "rewards/perpo_ocr_edit_distance_reward": 0.9884212017059326, + "advantages": -1.8221991922473535e-05, + "completion_length": 301.0, + "delta_ref_entropy_loss": 0.12060546875, + "delta_ref_ppl": -0.1826171875, + "entropy_loss": -0.1904296875, + "epoch": 0.4762, + "grad_norm": 2.131513326218331, + "k1_kl": 0.1826171875, + "k3_kl": 0.1298828125, + "kimi_kl": 0.54296875, + "learning_rate": 2.6190000000000004e-07, + "loss": 0.0052, + "ppl": 0.10791015625, + "reward": 0.8803329467773438, + "reward_std": 0.0022355930414050817, + "rewards/perpo_ocr_edit_distance_reward": 0.8803330063819885, "step": 2381, "temperature": 0.9 }, { - "advantages": 8.549009180569556e-06, - "completion_length": 777.5, - "delta_ref_entropy_loss": 0.0518798828125, - "delta_ref_ppl": -0.04461669921875, - "entropy_loss": -0.0589599609375, - "epoch": 0.9528, - "grad_norm": 0.9746685950582366, - "k1_kl": 0.04461669921875, - "k3_kl": 0.027313232421875, - "kimi_kl": 0.06689453125, - "learning_rate": 2.36e-08, - "loss": 0.0011, - "ppl": 0.0281982421875, - "reward": 0.987541913986206, - "reward_std": 0.0018372941995039582, - "rewards/perpo_ocr_edit_distance_reward": 0.987541913986206, + "advantages": -1.9516264728736132e-05, + "completion_length": 543.0, + "delta_ref_entropy_loss": 0.016357421875, + "delta_ref_ppl": -0.044921875, + "entropy_loss": -0.032958984375, + "epoch": 0.4764, + "grad_norm": 0.7599165600723834, + "k1_kl": 0.044677734375, + "k3_kl": 0.034423828125, + "kimi_kl": 0.10107421875, + "learning_rate": 2.618e-07, + "loss": 0.0014, + "ppl": 0.01416015625, + "reward": 0.9869527816772461, + "reward_std": 0.004260622430592775, + "rewards/perpo_ocr_edit_distance_reward": 0.9869529008865356, "step": 2382, "temperature": 0.9 }, { - "advantages": -1.871585982371471e-05, - "completion_length": 570.5, - "delta_ref_entropy_loss": 0.057373046875, - "delta_ref_ppl": -0.0501708984375, - "entropy_loss": -0.10015869140625, - "epoch": 0.9532, - "grad_norm": 2.0972521991299256, - "k1_kl": 0.0504150390625, - "k3_kl": 0.03466796875, - "kimi_kl": 0.07275390625, - "learning_rate": 2.34e-08, - "loss": 0.0014, - "ppl": 0.05889892578125, - "reward": 0.9312169551849365, - "reward_std": 0.004534799896646291, - "rewards/perpo_ocr_edit_distance_reward": 0.9312169849872589, + "advantages": -2.1593912606476806e-05, + "completion_length": 58.0, + "delta_ref_entropy_loss": 0.0120849609375, + "delta_ref_ppl": -0.306640625, + "entropy_loss": -0.12353515625, + "epoch": 0.4766, + "grad_norm": 3.039486638580441, + "k1_kl": 0.306640625, + "k3_kl": 0.251953125, + "kimi_kl": 0.9765625, + "learning_rate": 2.6169999999999997e-07, + "loss": 0.0101, + "ppl": 0.054931640625, + "reward": 0.9753861427307129, + "reward_std": 0.004235017579048872, + "rewards/perpo_ocr_edit_distance_reward": 0.9753862023353577, "step": 2383, "temperature": 0.9 }, { - "advantages": -3.558397474989761e-05, - "completion_length": 368.5, - "delta_ref_entropy_loss": 0.0982666015625, - "delta_ref_ppl": -0.0784912109375, - "entropy_loss": -0.06298828125, - "epoch": 0.9536, - "grad_norm": 1.140343120641763, - "k1_kl": 0.0784912109375, - "k3_kl": 0.0455322265625, - "kimi_kl": 0.148681640625, - "learning_rate": 2.3199999999999996e-08, - "loss": 0.0019, - "ppl": 0.03179931640625, - "reward": 0.8736232221126556, - "reward_std": 0.0023585182207170874, - "rewards/perpo_ocr_edit_distance_reward": 0.8736232817173004, + "advantages": -6.758315430488437e-05, + "completion_length": 986.0, + "delta_ref_entropy_loss": 0.04052734375, + "delta_ref_ppl": -0.048583984375, + "entropy_loss": -0.032470703125, + "epoch": 0.4768, + "grad_norm": 0.32537773070918175, + "k1_kl": 0.048828125, + "k3_kl": 0.0277099609375, + "kimi_kl": 0.08984375, + "learning_rate": 2.616e-07, + "loss": 0.0012, + "ppl": 0.01123046875, + "reward": 0.9989943504333496, + "reward_std": 0.0004038885817863047, + "rewards/perpo_ocr_edit_distance_reward": 0.9989944100379944, "step": 2384, "temperature": 0.9 }, { - "advantages": -2.599614140308404e-05, - "completion_length": 668.5, - "delta_ref_entropy_loss": 0.0489501953125, - "delta_ref_ppl": -0.0616455078125, - "entropy_loss": -0.05712890625, - "epoch": 0.954, - "grad_norm": 0.877699284065682, - "k1_kl": 0.0616455078125, - "k3_kl": 0.04052734375, - "kimi_kl": 0.111572265625, - "learning_rate": 2.2999999999999998e-08, - "loss": 0.0017, - "ppl": 0.027313232421875, - "reward": 0.9656596481800079, - "reward_std": 0.005401714239269495, - "rewards/perpo_ocr_edit_distance_reward": 0.9656597673892975, + "advantages": -1.730237818264868e-05, + "completion_length": 1837.0, + "delta_ref_entropy_loss": 0.004364013671875, + "delta_ref_ppl": -0.01507568359375, + "entropy_loss": -0.0206298828125, + "epoch": 0.477, + "grad_norm": 0.19648214030606628, + "k1_kl": 0.01507568359375, + "k3_kl": 0.01239013671875, + "kimi_kl": 0.056640625, + "learning_rate": 2.615e-07, + "loss": 0.0005, + "ppl": 0.006439208984375, + "reward": 0.9790687561035156, + "reward_std": 0.00236553605645895, + "rewards/perpo_ocr_edit_distance_reward": 0.9790688157081604, "step": 2385, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 304.5, - "delta_ref_entropy_loss": 0.0224609375, - "delta_ref_ppl": -0.0208740234375, - "entropy_loss": -0.015228271484375, - "epoch": 0.9544, - "grad_norm": 0.01328781554098676, - "k1_kl": 0.0208740234375, - "k3_kl": 0.0128173828125, - "kimi_kl": 0.04205322265625, - "learning_rate": 2.28e-08, - "loss": 0.0008, - "ppl": 0.007080078125, - "reward": 0.9997689425945282, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9997689723968506, + "advantages": 6.811959792685229e-06, + "completion_length": 442.0, + "delta_ref_entropy_loss": 0.06494140625, + "delta_ref_ppl": -0.095703125, + "entropy_loss": -0.044921875, + "epoch": 0.4772, + "grad_norm": 0.9228888326400134, + "k1_kl": 0.095703125, + "k3_kl": 0.06494140625, + "kimi_kl": 0.232421875, + "learning_rate": 2.614e-07, + "loss": 0.0026, + "ppl": 0.0203857421875, + "reward": 0.9875233769416809, + "reward_std": 0.001152045326307416, + "rewards/perpo_ocr_edit_distance_reward": 0.9875233769416809, "step": 2386, "temperature": 0.9 }, { - "advantages": -0.0002980572836754902, - "completion_length": 208.0, - "delta_ref_entropy_loss": 0.10009765625, - "delta_ref_ppl": -0.129150390625, - "entropy_loss": -0.1107177734375, - "epoch": 0.9548, - "grad_norm": 3.635071486152983, - "k1_kl": 0.129638671875, - "k3_kl": 0.098236083984375, - "kimi_kl": 0.21527099609375, - "learning_rate": 2.2599999999999997e-08, - "loss": 0.0042, - "ppl": 0.05426025390625, - "reward": 0.8801335096359253, - "reward_std": 0.09195204824209213, - "rewards/perpo_ocr_edit_distance_reward": 0.8801335990428925, + "advantages": -2.3501261239289306e-06, + "completion_length": 1110.0, + "delta_ref_entropy_loss": 0.06689453125, + "delta_ref_ppl": -0.0810546875, + "entropy_loss": -0.208984375, + "epoch": 0.4774, + "grad_norm": 1.4480343141863221, + "k1_kl": 0.0810546875, + "k3_kl": 0.053955078125, + "kimi_kl": 0.1201171875, + "learning_rate": 2.613e-07, + "loss": 0.0022, + "ppl": 0.1083984375, + "reward": 0.8888025879859924, + "reward_std": 0.01079240720719099, + "rewards/perpo_ocr_edit_distance_reward": 0.8888025879859924, "step": 2387, "temperature": 0.9 }, { - "advantages": -4.341346925684775e-05, - "completion_length": 575.0, - "delta_ref_entropy_loss": 0.046142578125, - "delta_ref_ppl": -0.03546142578125, - "entropy_loss": -0.077911376953125, - "epoch": 0.9552, - "grad_norm": 1.0512844792862104, - "k1_kl": 0.03546142578125, - "k3_kl": 0.02020263671875, - "kimi_kl": 0.04571533203125, - "learning_rate": 2.24e-08, - "loss": 0.0009, - "ppl": 0.046234130859375, - "reward": 0.9888015985488892, - "reward_std": 0.005064107739599422, - "rewards/perpo_ocr_edit_distance_reward": 0.9888016283512115, + "advantages": -1.3623919414840202e-07, + "completion_length": 2048.0, + "delta_ref_entropy_loss": -0.08251953125, + "delta_ref_ppl": -0.2109375, + "entropy_loss": -0.625, + "epoch": 0.4776, + "grad_norm": 4.77558436644668, + "k1_kl": 0.2119140625, + "k3_kl": 0.1689453125, + "kimi_kl": 0.490234375, + "learning_rate": 2.612e-07, + "loss": 0.0068, + "ppl": 0.30078125, + "reward": 0.5231773257255554, + "reward_std": 0.29024896025657654, + "rewards/perpo_ocr_edit_distance_reward": 0.5231773853302002, "step": 2388, "temperature": 0.9 }, { - "advantages": -0.00032380649099650327, - "completion_length": 794.0, - "delta_ref_entropy_loss": 0.02593994140625, - "delta_ref_ppl": -0.0206298828125, - "entropy_loss": -0.02435302734375, - "epoch": 0.9556, - "grad_norm": 1.2824408232561786, - "k1_kl": 0.02069091796875, - "k3_kl": 0.0142822265625, - "kimi_kl": 0.0472412109375, - "learning_rate": 2.22e-08, - "loss": 0.0009, - "ppl": 0.010101318359375, - "reward": 0.9985352158546448, - "reward_std": 0.00019772961968556046, - "rewards/perpo_ocr_edit_distance_reward": 0.9985353052616119, + "advantages": -3.213542004232295e-05, + "completion_length": 525.0, + "delta_ref_entropy_loss": 0.0673828125, + "delta_ref_ppl": -0.099609375, + "entropy_loss": -0.06396484375, + "epoch": 0.4778, + "grad_norm": 0.6774628195782532, + "k1_kl": 0.099609375, + "k3_kl": 0.057373046875, + "kimi_kl": 0.1513671875, + "learning_rate": 2.611e-07, + "loss": 0.0023, + "ppl": 0.023193359375, + "reward": 0.9361793994903564, + "reward_std": 0.0009602979989722371, + "rewards/perpo_ocr_edit_distance_reward": 0.9361794590950012, "step": 2389, "temperature": 0.9 }, { - "advantages": -1.0503190452482158e-05, - "completion_length": 1276.5, - "delta_ref_entropy_loss": 0.07080078125, - "delta_ref_ppl": -0.0487060546875, - "entropy_loss": -0.2364501953125, - "epoch": 0.956, - "grad_norm": 6.543226438916386, - "k1_kl": 0.0487060546875, - "k3_kl": 0.0408935546875, - "kimi_kl": 0.0849609375, - "learning_rate": 2.2e-08, - "loss": 0.0016, - "ppl": 0.150390625, - "reward": 0.8016044795513153, - "reward_std": 0.0059421901241876185, - "rewards/perpo_ocr_edit_distance_reward": 0.8016045689582825, + "advantages": -5.541529390029609e-05, + "completion_length": 1337.0, + "delta_ref_entropy_loss": 0.0458984375, + "delta_ref_ppl": -0.05615234375, + "entropy_loss": -0.0830078125, + "epoch": 0.478, + "grad_norm": 0.7416106427269616, + "k1_kl": 0.055908203125, + "k3_kl": 0.035400390625, + "kimi_kl": 0.0888671875, + "learning_rate": 2.61e-07, + "loss": 0.0015, + "ppl": 0.04052734375, + "reward": 0.9700092077255249, + "reward_std": 0.0015889779897406697, + "rewards/perpo_ocr_edit_distance_reward": 0.9700093269348145, "step": 2390, "temperature": 0.9 }, { - "advantages": -6.1030901633785106e-05, - "completion_length": 685.5, - "delta_ref_entropy_loss": 0.03582763671875, - "delta_ref_ppl": -0.020416259765625, - "entropy_loss": -0.0301513671875, - "epoch": 0.9564, - "grad_norm": 0.42025353754487915, - "k1_kl": 0.02044677734375, - "k3_kl": 0.010162353515625, - "kimi_kl": 0.019775390625, - "learning_rate": 2.18e-08, - "loss": 0.0005, - "ppl": 0.014892578125, - "reward": 0.9939403831958771, - "reward_std": 0.0009758544620126486, - "rewards/perpo_ocr_edit_distance_reward": 0.9939404428005219, + "advantages": -2.474444409017451e-05, + "completion_length": 441.0, + "delta_ref_entropy_loss": 0.087890625, + "delta_ref_ppl": -0.10498046875, + "entropy_loss": -0.0703125, + "epoch": 0.4782, + "grad_norm": 0.9697560431268031, + "k1_kl": 0.10498046875, + "k3_kl": 0.0654296875, + "kimi_kl": 0.2021484375, + "learning_rate": 2.609e-07, + "loss": 0.0026, + "ppl": 0.031005859375, + "reward": 0.9909946322441101, + "reward_std": 0.0009320893441326916, + "rewards/perpo_ocr_edit_distance_reward": 0.9909946918487549, "step": 2391, "temperature": 0.9 }, { - "advantages": -3.1505313700108672e-06, - "completion_length": 397.5, - "delta_ref_entropy_loss": 0.11181640625, - "delta_ref_ppl": -0.0927734375, - "entropy_loss": -0.1435546875, - "epoch": 0.9568, - "grad_norm": 1.7687744320893037, - "k1_kl": 0.0927734375, - "k3_kl": 0.055419921875, - "kimi_kl": 0.15380859375, - "learning_rate": 2.16e-08, - "loss": 0.0022, - "ppl": 0.0810546875, - "reward": 0.9494537115097046, - "reward_std": 0.006745398277416825, - "rewards/perpo_ocr_edit_distance_reward": 0.9494538009166718, + "advantages": -3.205878601875156e-05, + "completion_length": 397.0, + "delta_ref_entropy_loss": 0.06396484375, + "delta_ref_ppl": -0.0986328125, + "entropy_loss": -0.06298828125, + "epoch": 0.4784, + "grad_norm": 0.4697920239935718, + "k1_kl": 0.0986328125, + "k3_kl": 0.061767578125, + "kimi_kl": 0.1728515625, + "learning_rate": 2.6079999999999995e-07, + "loss": 0.0025, + "ppl": 0.0247802734375, + "reward": 0.9890241622924805, + "reward_std": 0.0009625355596654117, + "rewards/perpo_ocr_edit_distance_reward": 0.9890242218971252, "step": 2392, "temperature": 0.9 }, { - "advantages": -3.681864109239541e-05, - "completion_length": 482.5, - "delta_ref_entropy_loss": 0.03485107421875, - "delta_ref_ppl": -0.052978515625, - "entropy_loss": -0.019134521484375, - "epoch": 0.9572, - "grad_norm": 0.4096715763849597, - "k1_kl": 0.052978515625, - "k3_kl": 0.0391845703125, - "kimi_kl": 0.1953125, - "learning_rate": 2.1399999999999996e-08, - "loss": 0.0016, - "ppl": 0.0087890625, - "reward": 0.9742686748504639, - "reward_std": 0.00029688989161513746, - "rewards/perpo_ocr_edit_distance_reward": 0.9742687344551086, + "advantages": -1.5360968973254785e-05, + "completion_length": 111.0, + "delta_ref_entropy_loss": 0.08935546875, + "delta_ref_ppl": -0.400390625, + "entropy_loss": -0.138671875, + "epoch": 0.4786, + "grad_norm": 1.7832399059847914, + "k1_kl": 0.400390625, + "k3_kl": 0.314453125, + "kimi_kl": 1.46875, + "learning_rate": 2.607e-07, + "loss": 0.0126, + "ppl": 0.051025390625, + "reward": 0.991241455078125, + "reward_std": 0.0032292953692376614, + "rewards/perpo_ocr_edit_distance_reward": 0.9912415146827698, "step": 2393, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 872.5, - "delta_ref_entropy_loss": 0.034454345703125, - "delta_ref_ppl": -0.0479583740234375, - "entropy_loss": -0.01910400390625, - "epoch": 0.9576, - "grad_norm": 0.10932876227062249, - "k1_kl": 0.047698974609375, - "k3_kl": 0.0324859619140625, - "kimi_kl": 0.1168365478515625, - "learning_rate": 2.1199999999999998e-08, - "loss": 0.0013, - "ppl": 0.00688934326171875, - "reward": 0.9943336844444275, - "reward_std": 0.00016061250062193722, - "rewards/perpo_ocr_edit_distance_reward": 0.9943336844444275, + "advantages": -6.410053902072832e-05, + "completion_length": 761.0, + "delta_ref_entropy_loss": 0.0341796875, + "delta_ref_ppl": -0.046630859375, + "entropy_loss": -0.033447265625, + "epoch": 0.4788, + "grad_norm": 0.42077100625513975, + "k1_kl": 0.046630859375, + "k3_kl": 0.0279541015625, + "kimi_kl": 0.0654296875, + "learning_rate": 2.606e-07, + "loss": 0.0012, + "ppl": 0.01220703125, + "reward": 0.9853134751319885, + "reward_std": 0.0006970863323658705, + "rewards/perpo_ocr_edit_distance_reward": 0.9853135347366333, "step": 2394, "temperature": 0.9 }, { - "advantages": -0.00021809340978506953, - "completion_length": 486.5, - "delta_ref_entropy_loss": 0.025390625, - "delta_ref_ppl": -0.025390625, - "entropy_loss": -0.015625, - "epoch": 0.958, - "grad_norm": 0.17107055600315102, - "k1_kl": 0.025390625, - "k3_kl": 0.017303466796875, - "kimi_kl": 0.0592041015625, - "learning_rate": 2.1e-08, - "loss": 0.0009, - "ppl": 0.00656890869140625, - "reward": 0.9983098208904266, - "reward_std": 8.664753113407642e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.998309850692749, + "advantages": -0.00023539578251075, + "completion_length": 692.0, + "delta_ref_entropy_loss": 0.0234375, + "delta_ref_ppl": -0.049072265625, + "entropy_loss": -0.0184326171875, + "epoch": 0.479, + "grad_norm": 0.14018117365923805, + "k1_kl": 0.049072265625, + "k3_kl": 0.031005859375, + "kimi_kl": 0.09423828125, + "learning_rate": 2.605e-07, + "loss": 0.0015, + "ppl": 0.006134033203125, + "reward": 0.9987854957580566, + "reward_std": 0.00018939060100819916, + "rewards/perpo_ocr_edit_distance_reward": 0.9987856149673462, "step": 2395, "temperature": 0.9 }, { - "advantages": -0.0002872262675737147, - "completion_length": 838.5, - "delta_ref_entropy_loss": 0.0250244140625, - "delta_ref_ppl": -0.017333984375, - "entropy_loss": -0.0157470703125, - "epoch": 0.9584, - "grad_norm": 0.18381769672745765, - "k1_kl": 0.017364501953125, - "k3_kl": 0.00897216796875, - "kimi_kl": 0.0201416015625, - "learning_rate": 2.0799999999999998e-08, - "loss": 0.0006, - "ppl": 0.0056304931640625, - "reward": 0.9968570470809937, - "reward_std": 0.00014709208335261792, - "rewards/perpo_ocr_edit_distance_reward": 0.9968571066856384, + "advantages": -5.204337139730342e-05, + "completion_length": 542.0, + "delta_ref_entropy_loss": 0.08349609375, + "delta_ref_ppl": -0.099609375, + "entropy_loss": -0.05517578125, + "epoch": 0.4792, + "grad_norm": 1.048387420922055, + "k1_kl": 0.099609375, + "k3_kl": 0.06396484375, + "kimi_kl": 0.1865234375, + "learning_rate": 2.6040000000000003e-07, + "loss": 0.0026, + "ppl": 0.0296630859375, + "reward": 0.983009397983551, + "reward_std": 0.0015356671065092087, + "rewards/perpo_ocr_edit_distance_reward": 0.9830095171928406, "step": 2396, "temperature": 0.9 }, { - "advantages": -7.901873686932959e-06, - "completion_length": 844.5, - "delta_ref_entropy_loss": 0.0211181640625, - "delta_ref_ppl": -0.033843994140625, - "entropy_loss": -0.024658203125, - "epoch": 0.9588, - "grad_norm": 0.31029162536219723, - "k1_kl": 0.033599853515625, - "k3_kl": 0.0228118896484375, - "kimi_kl": 0.0850677490234375, - "learning_rate": 2.06e-08, - "loss": 0.0009, - "ppl": 0.010650634765625, - "reward": 0.9513684809207916, - "reward_std": 0.0018344155978411436, - "rewards/perpo_ocr_edit_distance_reward": 0.951368510723114, + "advantages": -5.5500440794276074e-05, + "completion_length": 412.0, + "delta_ref_entropy_loss": 0.0322265625, + "delta_ref_ppl": -0.0712890625, + "entropy_loss": -0.04931640625, + "epoch": 0.4794, + "grad_norm": 0.7502194954058781, + "k1_kl": 0.0712890625, + "k3_kl": 0.05126953125, + "kimi_kl": 0.1845703125, + "learning_rate": 2.6029999999999997e-07, + "loss": 0.0021, + "ppl": 0.0213623046875, + "reward": 0.9904889464378357, + "reward_std": 0.001127237337641418, + "rewards/perpo_ocr_edit_distance_reward": 0.9904890656471252, "step": 2397, "temperature": 0.9 }, { - "advantages": -2.9150929549359716e-05, - "completion_length": 525.0, - "delta_ref_entropy_loss": 0.057861328125, - "delta_ref_ppl": -0.04864501953125, - "entropy_loss": -0.0308837890625, - "epoch": 0.9592, - "grad_norm": 0.6119208436301206, - "k1_kl": 0.04864501953125, - "k3_kl": 0.031951904296875, - "kimi_kl": 0.1175537109375, - "learning_rate": 2.04e-08, - "loss": 0.0013, - "ppl": 0.01361083984375, - "reward": 0.9608152210712433, - "reward_std": 0.001362842864182312, - "rewards/perpo_ocr_edit_distance_reward": 0.9608152508735657, + "advantages": -1.2712819625448901e-05, + "completion_length": 91.0, + "delta_ref_entropy_loss": 0.0322265625, + "delta_ref_ppl": -0.451171875, + "entropy_loss": -0.1533203125, + "epoch": 0.4796, + "grad_norm": 3.8383309963652126, + "k1_kl": 0.451171875, + "k3_kl": 0.37109375, + "kimi_kl": 1.859375, + "learning_rate": 2.6019999999999996e-07, + "loss": 0.0149, + "ppl": 0.068359375, + "reward": 0.9887640476226807, + "reward_std": 0.004587065428495407, + "rewards/perpo_ocr_edit_distance_reward": 0.9887641072273254, "step": 2398, "temperature": 0.9 }, { - "advantages": -8.84107193996897e-05, - "completion_length": 662.0, - "delta_ref_entropy_loss": 0.0594482421875, - "delta_ref_ppl": -0.030029296875, - "entropy_loss": -0.05718994140625, - "epoch": 0.9596, - "grad_norm": 1.3487617383203865, - "k1_kl": 0.0301513671875, - "k3_kl": 0.017791748046875, - "kimi_kl": 0.02557373046875, - "learning_rate": 2.02e-08, - "loss": 0.0008, - "ppl": 0.032318115234375, - "reward": 0.9874328374862671, - "reward_std": 0.0008863555267453194, - "rewards/perpo_ocr_edit_distance_reward": 0.9874329268932343, + "advantages": -8.753368092584424e-06, + "completion_length": 1255.0, + "delta_ref_entropy_loss": 0.0191650390625, + "delta_ref_ppl": -0.038818359375, + "entropy_loss": -0.054443359375, + "epoch": 0.4798, + "grad_norm": 3.0856694150074557, + "k1_kl": 0.038818359375, + "k3_kl": 0.0252685546875, + "kimi_kl": 0.056884765625, + "learning_rate": 2.601e-07, + "loss": 0.001, + "ppl": 0.0247802734375, + "reward": 0.9937516450881958, + "reward_std": 0.0028249749448150396, + "rewards/perpo_ocr_edit_distance_reward": 0.9937517046928406, "step": 2399, "temperature": 0.9 }, { - "advantages": -0.00011087316046953788, - "completion_length": 804.5, - "delta_ref_entropy_loss": 0.0289306640625, - "delta_ref_ppl": -0.016937255859375, - "entropy_loss": -0.03759765625, - "epoch": 0.96, - "grad_norm": 0.7226661241856357, - "k1_kl": 0.0169677734375, - "k3_kl": 0.015106201171875, - "kimi_kl": 0.021636962890625, - "learning_rate": 2e-08, - "loss": 0.0007, - "ppl": 0.020050048828125, - "reward": 0.7574648857116699, - "reward_std": 0.18437071137304883, - "rewards/perpo_ocr_edit_distance_reward": 0.7574649155139923, + "advantages": -1.958438360816217e-06, + "completion_length": 481.0, + "delta_ref_entropy_loss": 0.01153564453125, + "delta_ref_ppl": -0.07666015625, + "entropy_loss": -0.0625, + "epoch": 0.48, + "grad_norm": 0.9557776256881966, + "k1_kl": 0.07666015625, + "k3_kl": 0.06884765625, + "kimi_kl": 0.2373046875, + "learning_rate": 2.6e-07, + "loss": 0.0028, + "ppl": 0.02490234375, + "reward": 0.9745585918426514, + "reward_std": 0.004250324796885252, + "rewards/perpo_ocr_edit_distance_reward": 0.9745585918426514, "step": 2400, "temperature": 0.9 }, { - "advantages": -0.00034461064205970615, - "completion_length": 638.0, - "delta_ref_entropy_loss": 0.0418701171875, - "delta_ref_ppl": -0.0738525390625, - "entropy_loss": -0.0225830078125, - "epoch": 0.9604, - "grad_norm": 0.1582370949678998, - "k1_kl": 0.07415771484375, - "k3_kl": 0.05487060546875, - "kimi_kl": 0.28076171875, - "learning_rate": 1.9800000000000002e-08, - "loss": 0.0025, - "ppl": 0.008636474609375, - "reward": 0.9660614132881165, - "reward_std": 0.00017845154798123986, - "rewards/perpo_ocr_edit_distance_reward": 0.966061532497406, + "advantages": -0.0002498456451576203, + "completion_length": 1042.0, + "delta_ref_entropy_loss": 0.032470703125, + "delta_ref_ppl": -0.03125, + "entropy_loss": -0.034912109375, + "epoch": 0.4802, + "grad_norm": 0.23442338843565874, + "k1_kl": 0.03125, + "k3_kl": 0.015380859375, + "kimi_kl": 0.0458984375, + "learning_rate": 2.599e-07, + "loss": 0.0009, + "ppl": 0.0096435546875, + "reward": 0.9943548440933228, + "reward_std": 0.00017265795031562448, + "rewards/perpo_ocr_edit_distance_reward": 0.9943549633026123, "step": 2401, "temperature": 0.9 }, { - "advantages": -0.00013534086506927778, - "completion_length": 1366.0, - "delta_ref_entropy_loss": 0.017730712890625, - "delta_ref_ppl": -0.0087738037109375, - "entropy_loss": -0.0316162109375, - "epoch": 0.9608, - "grad_norm": 1.0922929526733172, - "k1_kl": 0.0087890625, - "k3_kl": 0.0283203125, - "kimi_kl": 0.01434326171875, - "learning_rate": 1.9599999999999997e-08, + "advantages": -1.851150045695249e-05, + "completion_length": 887.0, + "delta_ref_entropy_loss": 0.039794921875, + "delta_ref_ppl": -0.052490234375, + "entropy_loss": -0.03076171875, + "epoch": 0.4804, + "grad_norm": 0.29903055593531075, + "k1_kl": 0.05224609375, + "k3_kl": 0.03271484375, + "kimi_kl": 0.099609375, + "learning_rate": 2.598e-07, "loss": 0.0013, - "ppl": 0.018218994140625, - "reward": 0.9617837965488434, - "reward_std": 0.00047407626698259264, - "rewards/perpo_ocr_edit_distance_reward": 0.9617838561534882, + "ppl": 0.0103759765625, + "reward": 0.9876224398612976, + "reward_std": 0.0008197000133804977, + "rewards/perpo_ocr_edit_distance_reward": 0.9876224994659424, "step": 2402, "temperature": 0.9 }, { - "advantages": -2.7196749670110876e-05, - "completion_length": 374.0, - "delta_ref_entropy_loss": 0.0330810546875, - "delta_ref_ppl": -0.04779052734375, - "entropy_loss": -0.03424072265625, - "epoch": 0.9612, - "grad_norm": 0.48001050115488275, - "k1_kl": 0.0478515625, - "k3_kl": 0.03155517578125, - "kimi_kl": 0.083984375, - "learning_rate": 1.94e-08, - "loss": 0.0013, - "ppl": 0.017547607421875, - "reward": 0.9834740161895752, - "reward_std": 0.0009274821495637298, - "rewards/perpo_ocr_edit_distance_reward": 0.9834740459918976, + "advantages": -2.379076977376826e-05, + "completion_length": 281.0, + "delta_ref_entropy_loss": 0.0302734375, + "delta_ref_ppl": -0.107421875, + "entropy_loss": -0.055419921875, + "epoch": 0.4806, + "grad_norm": 0.8979399315625534, + "k1_kl": 0.107421875, + "k3_kl": 0.078125, + "kimi_kl": 0.373046875, + "learning_rate": 2.597e-07, + "loss": 0.0031, + "ppl": 0.022705078125, + "reward": 0.9949989318847656, + "reward_std": 0.0013329624198377132, + "rewards/perpo_ocr_edit_distance_reward": 0.9949989914894104, "step": 2403, "temperature": 0.9 }, { - "advantages": -3.253136583225569e-05, - "completion_length": 374.5, - "delta_ref_entropy_loss": 0.0556640625, - "delta_ref_ppl": -0.0645751953125, - "entropy_loss": -0.0684814453125, - "epoch": 0.9616, - "grad_norm": 1.1037231264550331, - "k1_kl": 0.064697265625, - "k3_kl": 0.0440673828125, - "kimi_kl": 0.11474609375, - "learning_rate": 1.9199999999999997e-08, + "advantages": -0.0002780301438178867, + "completion_length": 648.0, + "delta_ref_entropy_loss": 0.0576171875, + "delta_ref_ppl": -0.0654296875, + "entropy_loss": -0.05517578125, + "epoch": 0.4808, + "grad_norm": 0.3554241256476132, + "k1_kl": 0.0654296875, + "k3_kl": 0.0390625, + "kimi_kl": 0.1240234375, + "learning_rate": 2.5959999999999997e-07, "loss": 0.0018, - "ppl": 0.0394287109375, - "reward": 0.9762305915355682, - "reward_std": 0.0017563282162882388, - "rewards/perpo_ocr_edit_distance_reward": 0.9762306809425354, + "ppl": 0.01483154296875, + "reward": 0.9983529448509216, + "reward_std": 0.00023691353271715343, + "rewards/perpo_ocr_edit_distance_reward": 0.998353123664856, "step": 2404, "temperature": 0.9 }, { - "advantages": -5.670105019817129e-05, - "completion_length": 324.0, - "delta_ref_entropy_loss": 0.0408935546875, - "delta_ref_ppl": -0.0489501953125, - "entropy_loss": -0.01910400390625, - "epoch": 0.962, - "grad_norm": 0.18704505664474674, - "k1_kl": 0.0489501953125, - "k3_kl": 0.03271484375, - "kimi_kl": 0.1092529296875, - "learning_rate": 1.8999999999999998e-08, - "loss": 0.0014, - "ppl": 0.00775146484375, - "reward": 0.9977357089519501, - "reward_std": 0.0003630692954175174, - "rewards/perpo_ocr_edit_distance_reward": 0.9977357387542725, + "advantages": -4.7385696234414354e-05, + "completion_length": 1309.0, + "delta_ref_entropy_loss": 0.04833984375, + "delta_ref_ppl": -0.0673828125, + "entropy_loss": -0.08544921875, + "epoch": 0.481, + "grad_norm": 2.606416414816683, + "k1_kl": 0.0673828125, + "k3_kl": 0.045654296875, + "kimi_kl": 0.10693359375, + "learning_rate": 2.595e-07, + "loss": 0.0019, + "ppl": 0.048583984375, + "reward": 0.9059513211250305, + "reward_std": 0.0009781899861991405, + "rewards/perpo_ocr_edit_distance_reward": 0.9059513807296753, "step": 2405, "temperature": 0.9 }, { - "advantages": -3.582239367005968e-05, - "completion_length": 567.0, - "delta_ref_entropy_loss": 0.109130859375, - "delta_ref_ppl": -0.0599365234375, - "entropy_loss": -0.1312255859375, - "epoch": 0.9624, - "grad_norm": 1.0936696850430847, - "k1_kl": 0.0601806640625, - "k3_kl": 0.03369140625, - "kimi_kl": 0.0687255859375, - "learning_rate": 1.88e-08, - "loss": 0.0014, - "ppl": 0.07501220703125, - "reward": 0.8693528175354004, - "reward_std": 0.0023490047897212207, - "rewards/perpo_ocr_edit_distance_reward": 0.8693528771400452, + "advantages": -5.245209194981726e-06, + "completion_length": 763.0, + "delta_ref_entropy_loss": 0.0296630859375, + "delta_ref_ppl": -0.0419921875, + "entropy_loss": -0.05224609375, + "epoch": 0.4812, + "grad_norm": 0.787323933941204, + "k1_kl": 0.042236328125, + "k3_kl": 0.0264892578125, + "kimi_kl": 0.060546875, + "learning_rate": 2.594e-07, + "loss": 0.0011, + "ppl": 0.0235595703125, + "reward": 0.9832571744918823, + "reward_std": 0.004755630157887936, + "rewards/perpo_ocr_edit_distance_reward": 0.9832571744918823, "step": 2406, "temperature": 0.9 }, { - "advantages": -0.00036154474946670234, - "completion_length": 153.0, - "delta_ref_entropy_loss": 0.0771484375, - "delta_ref_ppl": -0.066650390625, - "entropy_loss": -0.05462646484375, - "epoch": 0.9628, - "grad_norm": 0.6632753161674132, - "k1_kl": 0.06689453125, - "k3_kl": 0.032470703125, - "kimi_kl": 0.0751953125, - "learning_rate": 1.8599999999999998e-08, - "loss": 0.0017, - "ppl": 0.02606201171875, - "reward": 0.9859894514083862, - "reward_std": 0.000151061947690323, - "rewards/perpo_ocr_edit_distance_reward": 0.9859894812107086, + "advantages": -4.649162292480469e-06, + "completion_length": 1466.0, + "delta_ref_entropy_loss": 0.0213623046875, + "delta_ref_ppl": -0.049560546875, + "entropy_loss": -0.04150390625, + "epoch": 0.4814, + "grad_norm": 1.2694575898962552, + "k1_kl": 0.04931640625, + "k3_kl": 0.032958984375, + "kimi_kl": 0.0859375, + "learning_rate": 2.5929999999999995e-07, + "loss": 0.0013, + "ppl": 0.0179443359375, + "reward": 0.9728668332099915, + "reward_std": 0.005366173107177019, + "rewards/perpo_ocr_edit_distance_reward": 0.9728668928146362, "step": 2407, "temperature": 0.9 }, { - "advantages": -3.0228071068449935e-07, - "completion_length": 294.5, - "delta_ref_entropy_loss": 0.0762939453125, - "delta_ref_ppl": -0.082763671875, - "entropy_loss": -0.210205078125, - "epoch": 0.9632, - "grad_norm": 8.647798718201784, - "k1_kl": 0.0828857421875, - "k3_kl": 0.0474853515625, - "kimi_kl": 0.099365234375, - "learning_rate": 1.84e-08, - "loss": 0.0019, - "ppl": 0.138427734375, - "reward": 0.8097843825817108, - "reward_std": 0.04187079053372145, - "rewards/perpo_ocr_edit_distance_reward": 0.8097844123840332, + "advantages": -1.0609627679514233e-05, + "completion_length": 598.0, + "delta_ref_entropy_loss": 0.05224609375, + "delta_ref_ppl": -0.060546875, + "entropy_loss": -0.038330078125, + "epoch": 0.4816, + "grad_norm": 0.39270751203788196, + "k1_kl": 0.060546875, + "k3_kl": 0.03857421875, + "kimi_kl": 0.1240234375, + "learning_rate": 2.592e-07, + "loss": 0.0016, + "ppl": 0.015380859375, + "reward": 0.9971603155136108, + "reward_std": 0.0007026444654911757, + "rewards/perpo_ocr_edit_distance_reward": 0.9971602559089661, "step": 2408, "temperature": 0.9 }, { - "advantages": -1.386659550917102e-05, - "completion_length": 525.0, - "delta_ref_entropy_loss": 0.070556640625, - "delta_ref_ppl": -0.070068359375, - "entropy_loss": -0.10693359375, - "epoch": 0.9636, - "grad_norm": 1.1684313525833465, - "k1_kl": 0.070068359375, - "k3_kl": 0.0465087890625, - "kimi_kl": 0.121337890625, - "learning_rate": 1.82e-08, - "loss": 0.0019, - "ppl": 0.0562744140625, - "reward": 0.9113759398460388, - "reward_std": 0.019052864867262542, - "rewards/perpo_ocr_edit_distance_reward": 0.9113759696483612, + "advantages": -3.218651045244769e-06, + "completion_length": 1202.0, + "delta_ref_entropy_loss": 0.0517578125, + "delta_ref_ppl": -0.041748046875, + "entropy_loss": -0.06201171875, + "epoch": 0.4818, + "grad_norm": 1.1920872403607812, + "k1_kl": 0.041748046875, + "k3_kl": 0.0189208984375, + "kimi_kl": 0.038330078125, + "learning_rate": 2.591e-07, + "loss": 0.0008, + "ppl": 0.02001953125, + "reward": 0.9895179867744446, + "reward_std": 0.007777308113873005, + "rewards/perpo_ocr_edit_distance_reward": 0.9895180463790894, "step": 2409, "temperature": 0.9 }, { - "advantages": -3.4740995715765166e-06, - "completion_length": 290.5, - "delta_ref_entropy_loss": 0.131591796875, - "delta_ref_ppl": -0.10107421875, - "entropy_loss": -0.171630859375, - "epoch": 0.964, - "grad_norm": 1.7888084409033167, - "k1_kl": 0.10107421875, - "k3_kl": 0.060546875, - "kimi_kl": 0.18359375, - "learning_rate": 1.8e-08, + "advantages": -7.540839578723535e-05, + "completion_length": 492.0, + "delta_ref_entropy_loss": 0.02587890625, + "delta_ref_ppl": -0.07958984375, + "entropy_loss": -0.0296630859375, + "epoch": 0.482, + "grad_norm": 0.38625509778547307, + "k1_kl": 0.0791015625, + "k3_kl": 0.058837890625, + "kimi_kl": 0.2314453125, + "learning_rate": 2.59e-07, "loss": 0.0024, - "ppl": 0.0902099609375, - "reward": 0.893569141626358, - "reward_std": 0.008442715276032686, - "rewards/perpo_ocr_edit_distance_reward": 0.8935692012310028, + "ppl": 0.00958251953125, + "reward": 0.9991240501403809, + "reward_std": 0.0006904975161887705, + "rewards/perpo_ocr_edit_distance_reward": 0.9991241693496704, "step": 2410, "temperature": 0.9 }, { - "advantages": -1.2602125480043469e-05, - "completion_length": 601.5, - "delta_ref_entropy_loss": 0.0565185546875, - "delta_ref_ppl": -0.03314208984375, - "entropy_loss": -0.0589599609375, - "epoch": 0.9644, - "grad_norm": 0.7935674360313003, - "k1_kl": 0.03314208984375, - "k3_kl": 0.017120361328125, - "kimi_kl": 0.029144287109375, - "learning_rate": 1.78e-08, - "loss": 0.0007, - "ppl": 0.03204345703125, - "reward": 0.9329221248626709, - "reward_std": 0.0025867267977446318, - "rewards/perpo_ocr_edit_distance_reward": 0.9329221546649933, + "advantages": -1.5991075997590087e-05, + "completion_length": 1049.0, + "delta_ref_entropy_loss": 0.09130859375, + "delta_ref_ppl": -0.09130859375, + "entropy_loss": -0.1240234375, + "epoch": 0.4822, + "grad_norm": 1.758650496670951, + "k1_kl": 0.0908203125, + "k3_kl": 0.05517578125, + "kimi_kl": 0.142578125, + "learning_rate": 2.589e-07, + "loss": 0.0022, + "ppl": 0.06591796875, + "reward": 0.9408838748931885, + "reward_std": 0.002030766336247325, + "rewards/perpo_ocr_edit_distance_reward": 0.9408839344978333, "step": 2411, "temperature": 0.9 }, { - "advantages": -2.6490009207069676e-05, - "completion_length": 369.0, - "delta_ref_entropy_loss": -0.052215576171875, - "delta_ref_ppl": -0.1826171875, - "entropy_loss": -0.36810302734375, - "epoch": 0.9648, - "grad_norm": 4.8813861844925, - "k1_kl": 0.1826171875, - "k3_kl": 0.140777587890625, - "kimi_kl": 0.4407958984375, - "learning_rate": 1.76e-08, - "loss": 0.0057, - "ppl": 0.23956298828125, - "reward": 0.7507514357566833, - "reward_std": 0.1585114502813667, - "rewards/perpo_ocr_edit_distance_reward": 0.7507514655590057, + "advantages": -7.414392166538164e-05, + "completion_length": 425.0, + "delta_ref_entropy_loss": 0.0478515625, + "delta_ref_ppl": -0.08251953125, + "entropy_loss": -0.025634765625, + "epoch": 0.4824, + "grad_norm": 0.3579817987486772, + "k1_kl": 0.08251953125, + "k3_kl": 0.055908203125, + "kimi_kl": 0.2099609375, + "learning_rate": 2.5879999999999996e-07, + "loss": 0.0023, + "ppl": 0.009033203125, + "reward": 0.9982338547706604, + "reward_std": 0.0003594621957745403, + "rewards/perpo_ocr_edit_distance_reward": 0.9982338547706604, "step": 2412, "temperature": 0.9 }, { - "advantages": -1.221043748955708e-05, - "completion_length": 237.0, - "delta_ref_entropy_loss": 0.042724609375, - "delta_ref_ppl": -0.0711669921875, - "entropy_loss": -0.027801513671875, - "epoch": 0.9652, - "grad_norm": 2.460197024832254, - "k1_kl": 0.0711669921875, - "k3_kl": 0.051513671875, - "kimi_kl": 0.166748046875, - "learning_rate": 1.7399999999999997e-08, - "loss": 0.0021, - "ppl": 0.017024993896484375, - "reward": 0.9993406236171722, - "reward_std": 0.0008223330369219184, - "rewards/perpo_ocr_edit_distance_reward": 0.999340683221817, + "advantages": -5.960464477539063e-08, + "completion_length": 1022.0, + "delta_ref_entropy_loss": 0.06982421875, + "delta_ref_ppl": -0.0712890625, + "entropy_loss": -0.04541015625, + "epoch": 0.4826, + "grad_norm": 1.08196821941076, + "k1_kl": 0.0712890625, + "k3_kl": 0.039794921875, + "kimi_kl": 0.1259765625, + "learning_rate": 2.5869999999999995e-07, + "loss": 0.0016, + "ppl": 0.01708984375, + "reward": 0.8849027156829834, + "reward_std": 0.09660538285970688, + "rewards/perpo_ocr_edit_distance_reward": 0.8849027156829834, "step": 2413, "temperature": 0.9 }, { - "advantages": -0.00015174917643889785, - "completion_length": 625.0, - "delta_ref_entropy_loss": 0.03167724609375, - "delta_ref_ppl": -0.017303466796875, - "entropy_loss": -0.01239013671875, - "epoch": 0.9656, - "grad_norm": 0.1882973898622438, - "k1_kl": 0.01727294921875, - "k3_kl": 0.00897216796875, - "kimi_kl": 0.02532958984375, - "learning_rate": 1.72e-08, - "loss": 0.0005, - "ppl": 0.00290679931640625, - "reward": 0.9999764859676361, - "reward_std": 6.222354568308219e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9999765157699585, + "advantages": -3.68995351891499e-05, + "completion_length": 342.0, + "delta_ref_entropy_loss": 0.10107421875, + "delta_ref_ppl": -0.11572265625, + "entropy_loss": -0.058349609375, + "epoch": 0.4828, + "grad_norm": 1.0639432117267125, + "k1_kl": 0.11572265625, + "k3_kl": 0.07275390625, + "kimi_kl": 0.232421875, + "learning_rate": 2.586e-07, + "loss": 0.0029, + "ppl": 0.02490234375, + "reward": 0.9936100244522095, + "reward_std": 0.0015152200357988477, + "rewards/perpo_ocr_edit_distance_reward": 0.993610143661499, "step": 2414, "temperature": 0.9 }, { - "advantages": -5.6615900803080876e-05, - "completion_length": 853.5, - "delta_ref_entropy_loss": 0.02215576171875, - "delta_ref_ppl": -0.01422119140625, - "entropy_loss": -0.015380859375, - "epoch": 0.966, - "grad_norm": 0.29654071006612603, - "k1_kl": 0.0142822265625, - "k3_kl": 0.00653076171875, - "kimi_kl": 0.012969970703125, - "learning_rate": 1.7e-08, - "loss": 0.0003, - "ppl": 0.006683349609375, - "reward": 0.9938782751560211, - "reward_std": 0.013955330054159276, - "rewards/perpo_ocr_edit_distance_reward": 0.9938783347606659, + "advantages": -8.83851753314957e-05, + "completion_length": 213.0, + "delta_ref_entropy_loss": 0.031494140625, + "delta_ref_ppl": -0.10302734375, + "entropy_loss": -0.039306640625, + "epoch": 0.483, + "grad_norm": 0.8123082770993756, + "k1_kl": 0.10302734375, + "k3_kl": 0.07763671875, + "kimi_kl": 0.29296875, + "learning_rate": 2.585e-07, + "loss": 0.0032, + "ppl": 0.01312255859375, + "reward": 0.9948229789733887, + "reward_std": 0.0004779087903443724, + "rewards/perpo_ocr_edit_distance_reward": 0.9948230385780334, "step": 2415, "temperature": 0.9 }, { - "advantages": -5.1311086281202734e-05, - "completion_length": 501.5, - "delta_ref_entropy_loss": 0.0634765625, - "delta_ref_ppl": -0.0555419921875, - "entropy_loss": -0.05267333984375, - "epoch": 0.9664, - "grad_norm": 0.8207591586875744, - "k1_kl": 0.0552978515625, - "k3_kl": 0.034759521484375, - "kimi_kl": 0.1309814453125, - "learning_rate": 1.6799999999999998e-08, - "loss": 0.0014, - "ppl": 0.026210784912109375, - "reward": 0.9308547377586365, - "reward_std": 0.0005307652172632515, - "rewards/perpo_ocr_edit_distance_reward": 0.9308548271656036, + "advantages": -0.00012394360965117812, + "completion_length": 940.0, + "delta_ref_entropy_loss": 0.06689453125, + "delta_ref_ppl": -0.068359375, + "entropy_loss": -0.041015625, + "epoch": 0.4832, + "grad_norm": 0.9574912187990777, + "k1_kl": 0.06787109375, + "k3_kl": 0.03564453125, + "kimi_kl": 0.08642578125, + "learning_rate": 2.584e-07, + "loss": 0.0016, + "ppl": 0.01507568359375, + "reward": 0.9602717161178589, + "reward_std": 0.0005182331660762429, + "rewards/perpo_ocr_edit_distance_reward": 0.9602718353271484, "step": 2416, "temperature": 0.9 }, { - "advantages": 1.0371208446713354e-05, - "completion_length": 1344.5, - "delta_ref_entropy_loss": 0.01739501953125, - "delta_ref_ppl": -0.00994873046875, - "entropy_loss": -0.067474365234375, - "epoch": 0.9668, - "grad_norm": 1.2610030888438206, - "k1_kl": 0.009857177734375, - "k3_kl": 0.0147247314453125, - "kimi_kl": 0.01531982421875, - "learning_rate": 1.66e-08, - "loss": 0.0006, - "ppl": 0.04229736328125, - "reward": 0.9259936809539795, - "reward_std": 0.05444993113633245, - "rewards/perpo_ocr_edit_distance_reward": 0.9259937703609467, + "advantages": -2.537454975026776e-06, + "completion_length": 560.0, + "delta_ref_entropy_loss": 0.10498046875, + "delta_ref_ppl": -0.1142578125, + "entropy_loss": -0.3671875, + "epoch": 0.4834, + "grad_norm": 2.5181296610465873, + "k1_kl": 0.1142578125, + "k3_kl": 0.0771484375, + "kimi_kl": 0.162109375, + "learning_rate": 2.583e-07, + "loss": 0.0031, + "ppl": 0.2041015625, + "reward": 0.3610108494758606, + "reward_std": 0.006591133773326874, + "rewards/perpo_ocr_edit_distance_reward": 0.3610108494758606, "step": 2417, "temperature": 0.9 }, { - "advantages": -4.448209646312762e-05, - "completion_length": 364.5, - "delta_ref_entropy_loss": 0.012451171875, - "delta_ref_ppl": -0.1722412109375, - "entropy_loss": -0.26068115234375, - "epoch": 0.9672, - "grad_norm": 3.1318024398892894, - "k1_kl": 0.1722412109375, - "k3_kl": 0.14495849609375, - "kimi_kl": 0.5994873046875, - "learning_rate": 1.64e-08, - "loss": 0.0059, - "ppl": 0.146881103515625, - "reward": 0.7065398991107941, - "reward_std": 0.04059380342368968, - "rewards/perpo_ocr_edit_distance_reward": 0.7065399736166, + "advantages": -3.227165962016443e-06, + "completion_length": 383.0, + "delta_ref_entropy_loss": 0.07568359375, + "delta_ref_ppl": -0.1318359375, + "entropy_loss": -0.0830078125, + "epoch": 0.4836, + "grad_norm": 2.2481478323519166, + "k1_kl": 0.130859375, + "k3_kl": 0.09521484375, + "kimi_kl": 0.3984375, + "learning_rate": 2.5819999999999997e-07, + "loss": 0.0038, + "ppl": 0.037353515625, + "reward": 0.9732158780097961, + "reward_std": 0.005195149220526218, + "rewards/perpo_ocr_edit_distance_reward": 0.9732159376144409, "step": 2418, "temperature": 0.9 }, { - "advantages": -0.00016894937289180234, - "completion_length": 530.5, - "delta_ref_entropy_loss": 0.0380859375, - "delta_ref_ppl": -0.0289306640625, - "entropy_loss": -0.019287109375, - "epoch": 0.9676, - "grad_norm": 0.4596554374073057, - "k1_kl": 0.0289306640625, - "k3_kl": 0.01739501953125, - "kimi_kl": 0.05645751953125, - "learning_rate": 1.62e-08, - "loss": 0.0009, - "ppl": 0.0061798095703125, - "reward": 0.997077226638794, - "reward_std": 0.00019824090122710913, - "rewards/perpo_ocr_edit_distance_reward": 0.9970772564411163, + "advantages": -0.000204716416192241, + "completion_length": 884.0, + "delta_ref_entropy_loss": 0.03515625, + "delta_ref_ppl": -0.0380859375, + "entropy_loss": -0.047119140625, + "epoch": 0.4838, + "grad_norm": 0.4803351967786461, + "k1_kl": 0.0380859375, + "k3_kl": 0.0223388671875, + "kimi_kl": 0.06689453125, + "learning_rate": 2.5809999999999996e-07, + "loss": 0.0011, + "ppl": 0.0203857421875, + "reward": 0.9836763739585876, + "reward_std": 0.00039908342296257615, + "rewards/perpo_ocr_edit_distance_reward": 0.9836764335632324, "step": 2419, "temperature": 0.9 }, { - "advantages": -6.433257658500224e-05, - "completion_length": 731.5, - "delta_ref_entropy_loss": 0.0526123046875, - "delta_ref_ppl": -0.03314208984375, - "entropy_loss": -0.05804443359375, - "epoch": 0.968, - "grad_norm": 1.194747974677112, - "k1_kl": 0.03326416015625, - "k3_kl": 0.017730712890625, - "kimi_kl": 0.0372314453125, - "learning_rate": 1.6e-08, - "loss": 0.0008, - "ppl": 0.0281524658203125, - "reward": 0.8824791312217712, - "reward_std": 0.03537902904645307, - "rewards/perpo_ocr_edit_distance_reward": 0.8824791610240936, + "advantages": -0.00014654654660262167, + "completion_length": 994.0, + "delta_ref_entropy_loss": 0.0400390625, + "delta_ref_ppl": -0.053466796875, + "entropy_loss": -0.03271484375, + "epoch": 0.484, + "grad_norm": 0.36125897048043204, + "k1_kl": 0.053466796875, + "k3_kl": 0.030029296875, + "kimi_kl": 0.10107421875, + "learning_rate": 2.58e-07, + "loss": 0.0013, + "ppl": 0.01544189453125, + "reward": 0.9975942373275757, + "reward_std": 0.0006553030107170343, + "rewards/perpo_ocr_edit_distance_reward": 0.9975943565368652, "step": 2420, "temperature": 0.9 }, { - "advantages": -7.050378442841065e-06, - "completion_length": 534.5, - "delta_ref_entropy_loss": 0.029449462890625, - "delta_ref_ppl": -0.0888671875, - "entropy_loss": -0.1895751953125, - "epoch": 0.9684, - "grad_norm": 35.443790668377815, - "k1_kl": 0.0888671875, - "k3_kl": 0.1865234375, - "kimi_kl": 0.248779296875, - "learning_rate": 1.58e-08, - "loss": 0.0075, - "ppl": 0.124176025390625, - "reward": 0.7633087635040283, - "reward_std": 0.07236124109476805, - "rewards/perpo_ocr_edit_distance_reward": 0.7633087635040283, + "advantages": 6.301062512648059e-06, + "completion_length": 242.0, + "delta_ref_entropy_loss": 0.060546875, + "delta_ref_ppl": -0.2294921875, + "entropy_loss": -0.08154296875, + "epoch": 0.4842, + "grad_norm": 0.998538891908574, + "k1_kl": 0.2294921875, + "k3_kl": 0.1806640625, + "kimi_kl": 0.9140625, + "learning_rate": 2.579e-07, + "loss": 0.0072, + "ppl": 0.036865234375, + "reward": 0.9720256924629211, + "reward_std": 0.002607767004519701, + "rewards/perpo_ocr_edit_distance_reward": 0.9720257520675659, "step": 2421, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 436.5, - "delta_ref_entropy_loss": 0.04345703125, - "delta_ref_ppl": -0.02691650390625, - "entropy_loss": -0.017303466796875, - "epoch": 0.9688, - "grad_norm": 0.01271404420789915, - "k1_kl": 0.02685546875, - "k3_kl": 0.014373779296875, - "kimi_kl": 0.03656005859375, - "learning_rate": 1.5599999999999997e-08, - "loss": 0.0009, - "ppl": 0.007293701171875, - "reward": 0.9994753003120422, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.999475359916687, + "advantages": -0.0001316922134719789, + "completion_length": 638.0, + "delta_ref_entropy_loss": 0.01904296875, + "delta_ref_ppl": -0.042236328125, + "entropy_loss": -0.031982421875, + "epoch": 0.4844, + "grad_norm": 0.7203580913099874, + "k1_kl": 0.042236328125, + "k3_kl": 0.0296630859375, + "kimi_kl": 0.087890625, + "learning_rate": 2.5779999999999994e-07, + "loss": 0.0013, + "ppl": 0.0135498046875, + "reward": 0.9969455599784851, + "reward_std": 0.0003526791697368026, + "rewards/perpo_ocr_edit_distance_reward": 0.9969456791877747, "step": 2422, "temperature": 0.9 }, { - "advantages": -8.097717000055127e-06, - "completion_length": 1031.0, - "delta_ref_entropy_loss": 0.009207725524902344, - "delta_ref_ppl": -0.03240966796875, - "entropy_loss": -0.123291015625, - "epoch": 0.9692, - "grad_norm": 1.3587179942375411, - "k1_kl": 0.032470703125, - "k3_kl": 0.02349853515625, - "kimi_kl": 0.060791015625, - "learning_rate": 1.54e-08, - "loss": 0.0009, - "ppl": 0.07464599609375, - "reward": 0.8751500248908997, - "reward_std": 0.08651034417562187, - "rewards/perpo_ocr_edit_distance_reward": 0.8751501142978668, + "advantages": -2.4386816221522167e-05, + "completion_length": 533.0, + "delta_ref_entropy_loss": 0.0380859375, + "delta_ref_ppl": -0.056640625, + "entropy_loss": -0.034912109375, + "epoch": 0.4846, + "grad_norm": 0.4853852726220672, + "k1_kl": 0.056640625, + "k3_kl": 0.034423828125, + "kimi_kl": 0.09375, + "learning_rate": 2.577e-07, + "loss": 0.0014, + "ppl": 0.01556396484375, + "reward": 0.9965845942497253, + "reward_std": 0.001296565867960453, + "rewards/perpo_ocr_edit_distance_reward": 0.9965845942497253, "step": 2423, "temperature": 0.9 }, { - "advantages": -4.016501770820469e-05, - "completion_length": 359.5, - "delta_ref_entropy_loss": 0.0543212890625, - "delta_ref_ppl": -0.0828857421875, - "entropy_loss": -0.075927734375, - "epoch": 0.9696, - "grad_norm": 1.6340564076857957, - "k1_kl": 0.08331298828125, - "k3_kl": 0.059173583984375, - "kimi_kl": 0.2261962890625, - "learning_rate": 1.52e-08, - "loss": 0.0024, - "ppl": 0.0317840576171875, - "reward": 0.9816226065158844, - "reward_std": 0.000781580078182742, - "rewards/perpo_ocr_edit_distance_reward": 0.9816226661205292, + "advantages": -1.5045916370581836e-05, + "completion_length": 172.0, + "delta_ref_entropy_loss": 0.111328125, + "delta_ref_ppl": -0.2314453125, + "entropy_loss": -0.0859375, + "epoch": 0.4848, + "grad_norm": 1.1368747318652095, + "k1_kl": 0.2314453125, + "k3_kl": 0.166015625, + "kimi_kl": 0.57421875, + "learning_rate": 2.576e-07, + "loss": 0.0067, + "ppl": 0.030029296875, + "reward": 0.9963768124580383, + "reward_std": 0.0015988588565960526, + "rewards/perpo_ocr_edit_distance_reward": 0.9963768720626831, "step": 2424, "temperature": 0.9 }, { - "advantages": -0.0002580647123977542, - "completion_length": 809.5, - "delta_ref_entropy_loss": 0.02960205078125, - "delta_ref_ppl": -0.0244140625, - "entropy_loss": -0.0201416015625, - "epoch": 0.97, - "grad_norm": 0.2590312404903033, - "k1_kl": 0.02447509765625, - "k3_kl": 0.014373779296875, - "kimi_kl": 0.0426025390625, - "learning_rate": 1.5e-08, - "loss": 0.0008, - "ppl": 0.008270263671875, - "reward": 0.9992561936378479, - "reward_std": 0.0002961040736408904, - "rewards/perpo_ocr_edit_distance_reward": 0.9992562532424927, + "advantages": 1.9703593352460302e-05, + "completion_length": 549.0, + "delta_ref_entropy_loss": 0.0194091796875, + "delta_ref_ppl": -0.027099609375, + "entropy_loss": -0.0279541015625, + "epoch": 0.485, + "grad_norm": 0.8241987442461921, + "k1_kl": 0.0269775390625, + "k3_kl": 0.0169677734375, + "kimi_kl": 0.052978515625, + "learning_rate": 2.5749999999999997e-07, + "loss": 0.0007, + "ppl": 0.008056640625, + "reward": 0.9866015911102295, + "reward_std": 0.0003320106479804963, + "rewards/perpo_ocr_edit_distance_reward": 0.9866015911102295, "step": 2425, "temperature": 0.9 }, { - "advantages": -1.3302480383003967e-05, - "completion_length": 751.0, - "delta_ref_entropy_loss": 0.080322265625, - "delta_ref_ppl": -0.054290771484375, - "entropy_loss": -0.08587646484375, - "epoch": 0.9704, - "grad_norm": 0.700352997451404, - "k1_kl": 0.054046630859375, - "k3_kl": 0.0292205810546875, - "kimi_kl": 0.075042724609375, - "learning_rate": 1.48e-08, - "loss": 0.0012, - "ppl": 0.0416107177734375, - "reward": 0.9320046305656433, - "reward_std": 0.10369387158425525, - "rewards/perpo_ocr_edit_distance_reward": 0.9320046901702881, + "advantages": -9.965045319404453e-05, + "completion_length": 898.0, + "delta_ref_entropy_loss": 0.03857421875, + "delta_ref_ppl": -0.0419921875, + "entropy_loss": -0.0279541015625, + "epoch": 0.4852, + "grad_norm": 0.2909718424652449, + "k1_kl": 0.042236328125, + "k3_kl": 0.023193359375, + "kimi_kl": 0.0595703125, + "learning_rate": 2.574e-07, + "loss": 0.001, + "ppl": 0.009765625, + "reward": 0.9968133568763733, + "reward_std": 0.00024178289459086955, + "rewards/perpo_ocr_edit_distance_reward": 0.9968134760856628, "step": 2426, "temperature": 0.9 }, { - "advantages": -0.0001642831838211123, - "completion_length": 380.5, - "delta_ref_entropy_loss": 0.03619384765625, - "delta_ref_ppl": -0.0262451171875, - "entropy_loss": -0.0380859375, - "epoch": 0.9708, - "grad_norm": 0.9656492001417296, - "k1_kl": 0.026123046875, - "k3_kl": 0.0150299072265625, - "kimi_kl": 0.0450286865234375, - "learning_rate": 1.46e-08, - "loss": 0.0008, - "ppl": 0.016845703125, - "reward": 0.8559696972370148, - "reward_std": 0.12063659597333753, - "rewards/perpo_ocr_edit_distance_reward": 0.8559698164463043, + "advantages": -3.5601005947683007e-05, + "completion_length": 475.0, + "delta_ref_entropy_loss": 0.0201416015625, + "delta_ref_ppl": -0.038330078125, + "entropy_loss": -0.037353515625, + "epoch": 0.4854, + "grad_norm": 0.6352188962756263, + "k1_kl": 0.038330078125, + "k3_kl": 0.0234375, + "kimi_kl": 0.05859375, + "learning_rate": 2.5729999999999995e-07, + "loss": 0.001, + "ppl": 0.0103759765625, + "reward": 0.9964861273765564, + "reward_std": 0.0013346766354516149, + "rewards/perpo_ocr_edit_distance_reward": 0.9964861869812012, "step": 2427, "temperature": 0.9 }, { - "advantages": -8.552841245546006e-05, - "completion_length": 929.0, - "delta_ref_entropy_loss": 0.01983642578125, - "delta_ref_ppl": -0.026214599609375, - "entropy_loss": -0.021484375, - "epoch": 0.9712, - "grad_norm": 0.38542893682537127, - "k1_kl": 0.026123046875, - "k3_kl": 0.017120361328125, - "kimi_kl": 0.05035400390625, - "learning_rate": 1.4399999999999998e-08, - "loss": 0.0008, - "ppl": 0.011474609375, - "reward": 0.9997888207435608, - "reward_std": 0.00026035123300971463, - "rewards/perpo_ocr_edit_distance_reward": 0.9997888803482056, + "advantages": -5.960464841336943e-05, + "completion_length": 560.0, + "delta_ref_entropy_loss": 0.040771484375, + "delta_ref_ppl": -0.07275390625, + "entropy_loss": -0.050048828125, + "epoch": 0.4856, + "grad_norm": 0.4505528154000861, + "k1_kl": 0.07275390625, + "k3_kl": 0.05078125, + "kimi_kl": 0.1953125, + "learning_rate": 2.5719999999999995e-07, + "loss": 0.0021, + "ppl": 0.0159912109375, + "reward": 0.9850339293479919, + "reward_std": 0.0006139926263131201, + "rewards/perpo_ocr_edit_distance_reward": 0.9850340485572815, "step": 2428, "temperature": 0.9 }, { - "advantages": -0.00012998922102269717, - "completion_length": 1193.5, - "delta_ref_entropy_loss": 0.0223388671875, - "delta_ref_ppl": -0.015655517578125, - "entropy_loss": -0.04071044921875, - "epoch": 0.9716, - "grad_norm": 0.6278390585755125, - "k1_kl": 0.01556396484375, - "k3_kl": 0.009765625, - "kimi_kl": 0.024932861328125, - "learning_rate": 1.42e-08, - "loss": 0.0005, - "ppl": 0.020904541015625, - "reward": 0.9970510900020599, - "reward_std": 0.00043372374784667045, - "rewards/perpo_ocr_edit_distance_reward": 0.9970511794090271, - "step": 2429, - "temperature": 0.9 - }, - { - "advantages": -2.3356506062555127e-05, - "completion_length": 557.0, - "delta_ref_entropy_loss": 0.04876708984375, - "delta_ref_ppl": -0.029815673828125, - "entropy_loss": -0.061676025390625, - "epoch": 0.972, - "grad_norm": 0.7587501464124035, - "k1_kl": 0.029937744140625, - "k3_kl": 0.013641357421875, - "kimi_kl": 0.02130126953125, - "learning_rate": 1.4e-08, - "loss": 0.0006, - "ppl": 0.0313568115234375, - "reward": 0.9869731068611145, - "reward_std": 0.0004968775901943445, - "rewards/perpo_ocr_edit_distance_reward": 0.9869731366634369, + "advantages": -1.3623919414840202e-07, + "completion_length": 294.0, + "delta_ref_entropy_loss": 0.0128173828125, + "delta_ref_ppl": -0.0791015625, + "entropy_loss": -0.287109375, + "epoch": 0.4858, + "grad_norm": 2.8801688661281286, + "k1_kl": 0.0791015625, + "k3_kl": 0.058349609375, + "kimi_kl": 0.130859375, + "learning_rate": 2.571e-07, + "loss": 0.0023, + "ppl": 0.1083984375, + "reward": 0.5380284190177917, + "reward_std": 0.12158114463090897, + "rewards/perpo_ocr_edit_distance_reward": 0.5380284190177917, + "step": 2429, + "temperature": 0.9 + }, + { + "advantages": 2.2522041035699658e-05, + "completion_length": 460.0, + "delta_ref_entropy_loss": 0.06689453125, + "delta_ref_ppl": -0.083984375, + "entropy_loss": -0.0771484375, + "epoch": 0.486, + "grad_norm": 1.4941972140481157, + "k1_kl": 0.083984375, + "k3_kl": 0.052001953125, + "kimi_kl": 0.1357421875, + "learning_rate": 2.57e-07, + "loss": 0.0021, + "ppl": 0.028076171875, + "reward": 0.2990114390850067, + "reward_std": 0.0008454371127299964, + "rewards/perpo_ocr_edit_distance_reward": 0.2990114390850067, "step": 2430, "temperature": 0.9 }, { - "advantages": -3.058995571336709e-05, - "completion_length": 378.5, - "delta_ref_entropy_loss": 0.07177734375, - "delta_ref_ppl": -0.0933837890625, - "entropy_loss": -0.1104736328125, - "epoch": 0.9724, - "grad_norm": 1.9581845566843985, - "k1_kl": 0.09326171875, - "k3_kl": 0.0733642578125, - "kimi_kl": 0.25439453125, - "learning_rate": 1.38e-08, - "loss": 0.003, - "ppl": 0.0589599609375, - "reward": 0.8415007591247559, - "reward_std": 0.024033930560108274, - "rewards/perpo_ocr_edit_distance_reward": 0.8415008187294006, + "advantages": 0.0, + "completion_length": 1845.0, + "delta_ref_entropy_loss": 0.01092529296875, + "delta_ref_ppl": -0.02783203125, + "entropy_loss": -0.032958984375, + "epoch": 0.4862, + "grad_norm": 0.7145388672152424, + "k1_kl": 0.02783203125, + "k3_kl": 0.023681640625, + "kimi_kl": 0.06884765625, + "learning_rate": 2.5690000000000003e-07, + "loss": 0.0009, + "ppl": 0.021484375, + "reward": 0.9942600131034851, + "reward_std": 0.001726237591356039, + "rewards/perpo_ocr_edit_distance_reward": 0.9942600131034851, "step": 2431, "temperature": 0.9 }, { - "advantages": -2.9674599772988586e-06, - "completion_length": 578.0, - "delta_ref_entropy_loss": 0.14794921875, - "delta_ref_ppl": -0.087646484375, - "entropy_loss": -0.12744140625, - "epoch": 0.9728, - "grad_norm": 1.102207589477724, - "k1_kl": 0.08740234375, - "k3_kl": 0.04345703125, - "kimi_kl": 0.088134765625, - "learning_rate": 1.36e-08, - "loss": 0.0017, - "ppl": 0.0645751953125, - "reward": 0.8744914531707764, - "reward_std": 0.004116407362744212, - "rewards/perpo_ocr_edit_distance_reward": 0.8744915127754211, + "advantages": -1.1103494216513354e-05, + "completion_length": 590.0, + "delta_ref_entropy_loss": 0.03857421875, + "delta_ref_ppl": -0.0634765625, + "entropy_loss": -0.035400390625, + "epoch": 0.4864, + "grad_norm": 0.45741208690811624, + "k1_kl": 0.0634765625, + "k3_kl": 0.044921875, + "kimi_kl": 0.1708984375, + "learning_rate": 2.5679999999999997e-07, + "loss": 0.0018, + "ppl": 0.014404296875, + "reward": 0.9954751133918762, + "reward_std": 0.0006664737593382597, + "rewards/perpo_ocr_edit_distance_reward": 0.9954751133918762, "step": 2432, "temperature": 0.9 }, { - "advantages": -7.133399049052969e-05, - "completion_length": 482.0, - "delta_ref_entropy_loss": 0.03594970703125, - "delta_ref_ppl": -0.0404052734375, - "entropy_loss": -0.0277099609375, - "epoch": 0.9732, - "grad_norm": 0.5826758627525396, - "k1_kl": 0.04046630859375, - "k3_kl": 0.0291748046875, - "kimi_kl": 0.1346435546875, - "learning_rate": 1.34e-08, - "loss": 0.0012, - "ppl": 0.014068603515625, - "reward": 0.998466968536377, - "reward_std": 0.000759404560085386, - "rewards/perpo_ocr_edit_distance_reward": 0.9984670579433441, + "advantages": -0.00010702865984058008, + "completion_length": 802.0, + "delta_ref_entropy_loss": 0.052490234375, + "delta_ref_ppl": -0.04541015625, + "entropy_loss": -0.0615234375, + "epoch": 0.4866, + "grad_norm": 0.7675542947507303, + "k1_kl": 0.045166015625, + "k3_kl": 0.0230712890625, + "kimi_kl": 0.045166015625, + "learning_rate": 2.5669999999999996e-07, + "loss": 0.001, + "ppl": 0.025634765625, + "reward": 0.9648423194885254, + "reward_std": 0.0006159498007036746, + "rewards/perpo_ocr_edit_distance_reward": 0.9648424386978149, "step": 2433, "temperature": 0.9 }, { - "advantages": -5.138771939527942e-06, - "completion_length": 625.5, - "delta_ref_entropy_loss": 0.0458984375, - "delta_ref_ppl": -0.024658203125, - "entropy_loss": -0.03851318359375, - "epoch": 0.9736, - "grad_norm": 1.2531794103919949, - "k1_kl": 0.024658203125, - "k3_kl": 0.014404296875, - "kimi_kl": 0.02874755859375, - "learning_rate": 1.3199999999999999e-08, - "loss": 0.0006, - "ppl": 0.02093505859375, - "reward": 0.9728123843669891, - "reward_std": 0.008096107165329158, - "rewards/perpo_ocr_edit_distance_reward": 0.9728124737739563, + "advantages": -1.2125287867092993e-05, + "completion_length": 112.0, + "delta_ref_entropy_loss": 0.03369140625, + "delta_ref_ppl": -0.2314453125, + "entropy_loss": -0.060791015625, + "epoch": 0.4868, + "grad_norm": 1.5198263515296957, + "k1_kl": 0.232421875, + "k3_kl": 0.197265625, + "kimi_kl": 1.1953125, + "learning_rate": 2.566e-07, + "loss": 0.0079, + "ppl": 0.021484375, + "reward": 0.9975454807281494, + "reward_std": 0.0020061288960278034, + "rewards/perpo_ocr_edit_distance_reward": 0.997545599937439, "step": 2434, "temperature": 0.9 }, { - "advantages": -1.4930964425730053e-05, - "completion_length": 483.0, - "delta_ref_entropy_loss": 0.02947998046875, - "delta_ref_ppl": -0.03021240234375, - "entropy_loss": -0.02630615234375, - "epoch": 0.974, - "grad_norm": 0.676878540360493, - "k1_kl": 0.03021240234375, - "k3_kl": 0.0211181640625, - "kimi_kl": 0.06109619140625, - "learning_rate": 1.2999999999999999e-08, - "loss": 0.0009, - "ppl": 0.013427734375, - "reward": 0.9959838092327118, - "reward_std": 0.003925947239622474, - "rewards/perpo_ocr_edit_distance_reward": 0.9959838092327118, + "advantages": -3.05431240121834e-05, + "completion_length": 602.0, + "delta_ref_entropy_loss": 0.04345703125, + "delta_ref_ppl": -0.0654296875, + "entropy_loss": -0.060546875, + "epoch": 0.487, + "grad_norm": 0.8410271226111248, + "k1_kl": 0.0654296875, + "k3_kl": 0.04150390625, + "kimi_kl": 0.1142578125, + "learning_rate": 2.565e-07, + "loss": 0.0017, + "ppl": 0.0260009765625, + "reward": 0.9392920136451721, + "reward_std": 0.00157304632011801, + "rewards/perpo_ocr_edit_distance_reward": 0.9392920732498169, "step": 2435, "temperature": 0.9 }, { - "advantages": -2.1287373641598606e-07, - "completion_length": 662.5, - "delta_ref_entropy_loss": 0.03985595703125, - "delta_ref_ppl": -0.04107666015625, - "entropy_loss": -0.11572265625, - "epoch": 0.9744, - "grad_norm": 0.7458902191742945, - "k1_kl": 0.041107177734375, - "k3_kl": 0.026824951171875, - "kimi_kl": 0.079345703125, - "learning_rate": 1.28e-08, - "loss": 0.0011, - "ppl": 0.06201171875, - "reward": 0.9156214892864227, - "reward_std": 0.15390311926603317, - "rewards/perpo_ocr_edit_distance_reward": 0.9156215190887451, + "advantages": 2.1287374085687816e-09, + "completion_length": 1140.0, + "delta_ref_entropy_loss": 0.04931640625, + "delta_ref_ppl": -0.0986328125, + "entropy_loss": -0.1376953125, + "epoch": 0.4872, + "grad_norm": 2.004184262688626, + "k1_kl": 0.0986328125, + "k3_kl": 0.068359375, + "kimi_kl": 0.2197265625, + "learning_rate": 2.564e-07, + "loss": 0.0027, + "ppl": 0.056396484375, + "reward": 0.3718861937522888, + "reward_std": 0.03878410905599594, + "rewards/perpo_ocr_edit_distance_reward": 0.3718861937522888, "step": 2436, "temperature": 0.9 }, { - "advantages": -6.732344627380371e-05, - "completion_length": 655.5, - "delta_ref_entropy_loss": 0.0225830078125, - "delta_ref_ppl": -0.012451171875, - "entropy_loss": -0.0162353515625, - "epoch": 0.9748, - "grad_norm": 0.3899271233856361, - "k1_kl": 0.012451171875, - "k3_kl": 0.00612640380859375, - "kimi_kl": 0.0130767822265625, - "learning_rate": 1.26e-08, - "loss": 0.0003, - "ppl": 0.00705718994140625, - "reward": 0.9994732439517975, - "reward_std": 7.64720025472343e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9994732737541199, + "advantages": -5.347388196241809e-06, + "completion_length": 863.0, + "delta_ref_entropy_loss": 0.045166015625, + "delta_ref_ppl": -0.06298828125, + "entropy_loss": -0.09033203125, + "epoch": 0.4874, + "grad_norm": 0.9535474996588162, + "k1_kl": 0.06298828125, + "k3_kl": 0.044921875, + "kimi_kl": 0.103515625, + "learning_rate": 2.563e-07, + "loss": 0.0018, + "ppl": 0.04541015625, + "reward": 0.8591819405555725, + "reward_std": 0.006277297157794237, + "rewards/perpo_ocr_edit_distance_reward": 0.8591820001602173, "step": 2437, "temperature": 0.9 }, { - "advantages": -6.30148861091584e-05, - "completion_length": 832.0, - "delta_ref_entropy_loss": 0.0357666015625, - "delta_ref_ppl": -0.0328369140625, - "entropy_loss": -0.03515625, - "epoch": 0.9752, - "grad_norm": 0.8463856187083475, - "k1_kl": 0.0328369140625, - "k3_kl": 0.020263671875, - "kimi_kl": 0.0589599609375, - "learning_rate": 1.2399999999999999e-08, - "loss": 0.0009, - "ppl": 0.018798828125, - "reward": 0.9777666926383972, - "reward_std": 0.0007775440899422392, - "rewards/perpo_ocr_edit_distance_reward": 0.9777668118476868, + "advantages": -5.10896995820076e-07, + "completion_length": 379.0, + "delta_ref_entropy_loss": 0.0966796875, + "delta_ref_ppl": -0.14453125, + "entropy_loss": -0.205078125, + "epoch": 0.4876, + "grad_norm": 2.367780710268364, + "k1_kl": 0.14453125, + "k3_kl": 0.0986328125, + "kimi_kl": 0.255859375, + "learning_rate": 2.562e-07, + "loss": 0.0039, + "ppl": 0.0908203125, + "reward": 0.7400416135787964, + "reward_std": 0.032516151666641235, + "rewards/perpo_ocr_edit_distance_reward": 0.7400416135787964, "step": 2438, "temperature": 0.9 }, { - "advantages": -0.0002980232238769531, - "completion_length": 561.5, - "delta_ref_entropy_loss": 0.0318603515625, - "delta_ref_ppl": -0.03369140625, - "entropy_loss": -0.018310546875, - "epoch": 0.9756, - "grad_norm": 0.010007696965424789, - "k1_kl": 0.03369140625, - "k3_kl": 0.020660400390625, - "kimi_kl": 0.06268310546875, - "learning_rate": 1.22e-08, - "loss": 0.0011, - "ppl": 0.00572967529296875, - "reward": 0.998841255903244, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9988412857055664, + "advantages": -2.1772726540802978e-05, + "completion_length": 577.0, + "delta_ref_entropy_loss": 0.049072265625, + "delta_ref_ppl": -0.061279296875, + "entropy_loss": -0.028076171875, + "epoch": 0.4878, + "grad_norm": 0.31408790739554854, + "k1_kl": 0.06103515625, + "k3_kl": 0.037109375, + "kimi_kl": 0.10400390625, + "learning_rate": 2.5609999999999997e-07, + "loss": 0.0015, + "ppl": 0.00823974609375, + "reward": 0.9926905035972595, + "reward_std": 0.0006819390109740198, + "rewards/perpo_ocr_edit_distance_reward": 0.9926905632019043, "step": 2439, "temperature": 0.9 }, { - "advantages": -2.9027463824604638e-05, - "completion_length": 673.5, - "delta_ref_entropy_loss": 0.05450439453125, - "delta_ref_ppl": -0.051910400390625, - "entropy_loss": -0.038330078125, - "epoch": 0.976, - "grad_norm": 204.11268690833884, - "k1_kl": 0.05218505859375, - "k3_kl": 0.954833984375, - "kimi_kl": 0.1043701171875, - "learning_rate": 1.2e-08, - "loss": 0.0382, - "ppl": 0.0263671875, - "reward": 0.996667891740799, - "reward_std": 0.0013674696092493832, - "rewards/perpo_ocr_edit_distance_reward": 0.9966679513454437, + "advantages": -1.4339175322675146e-05, + "completion_length": 455.0, + "delta_ref_entropy_loss": 0.025146484375, + "delta_ref_ppl": -0.0537109375, + "entropy_loss": -0.052001953125, + "epoch": 0.488, + "grad_norm": 0.5052227594194585, + "k1_kl": 0.0537109375, + "k3_kl": 0.037109375, + "kimi_kl": 0.1220703125, + "learning_rate": 2.56e-07, + "loss": 0.0015, + "ppl": 0.0186767578125, + "reward": 0.860420823097229, + "reward_std": 0.0022757048718631268, + "rewards/perpo_ocr_edit_distance_reward": 0.8604208827018738, "step": 2440, "temperature": 0.9 }, { - "advantages": -2.7673593194776913e-07, - "completion_length": 404.5, - "delta_ref_entropy_loss": 0.109375, - "delta_ref_ppl": -0.08154296875, - "entropy_loss": -0.13623046875, - "epoch": 0.9764, - "grad_norm": 1.8019940465150452, - "k1_kl": 0.0810546875, - "k3_kl": 0.048095703125, - "kimi_kl": 0.11962890625, - "learning_rate": 1.18e-08, - "loss": 0.0019, - "ppl": 0.0748291015625, - "reward": 0.8739559352397919, - "reward_std": 0.0077761386055499315, - "rewards/perpo_ocr_edit_distance_reward": 0.8739559650421143, + "advantages": -1.7029899268550253e-08, + "completion_length": 227.0, + "delta_ref_entropy_loss": 0.023193359375, + "delta_ref_ppl": -0.353515625, + "entropy_loss": -0.177734375, + "epoch": 0.4882, + "grad_norm": 4.983202481911971, + "k1_kl": 0.353515625, + "k3_kl": 0.296875, + "kimi_kl": 1.3828125, + "learning_rate": 2.559e-07, + "loss": 0.0119, + "ppl": 0.07080078125, + "reward": 0.6357142925262451, + "reward_std": 0.04792284220457077, + "rewards/perpo_ocr_edit_distance_reward": 0.6357142925262451, "step": 2441, "temperature": 0.9 }, { - "advantages": -5.5594107834622264e-05, - "completion_length": 278.0, - "delta_ref_entropy_loss": 0.0623779296875, - "delta_ref_ppl": -0.133056640625, - "entropy_loss": -0.042236328125, - "epoch": 0.9768, - "grad_norm": 0.16635487525784637, - "k1_kl": 0.133544921875, - "k3_kl": 0.102020263671875, - "kimi_kl": 0.446044921875, - "learning_rate": 1.1599999999999998e-08, - "loss": 0.0041, - "ppl": 0.02276611328125, - "reward": 0.9912554621696472, - "reward_std": 0.0002563352172728628, - "rewards/perpo_ocr_edit_distance_reward": 0.991255521774292, + "advantages": -9.766647417563945e-05, + "completion_length": 455.0, + "delta_ref_entropy_loss": 0.0517578125, + "delta_ref_ppl": -0.10498046875, + "entropy_loss": -0.03369140625, + "epoch": 0.4884, + "grad_norm": 0.34884685181333536, + "k1_kl": 0.10498046875, + "k3_kl": 0.07080078125, + "kimi_kl": 0.265625, + "learning_rate": 2.558e-07, + "loss": 0.0029, + "ppl": 0.01104736328125, + "reward": 0.9976815581321716, + "reward_std": 0.0005102683207951486, + "rewards/perpo_ocr_edit_distance_reward": 0.9976816177368164, "step": 2442, "temperature": 0.9 }, { - "advantages": -0.00013127923011779785, - "completion_length": 627.0, - "delta_ref_entropy_loss": 0.052001953125, - "delta_ref_ppl": -0.04656982421875, - "entropy_loss": -0.017242431640625, - "epoch": 0.9772, - "grad_norm": 0.10763159574732142, - "k1_kl": 0.046539306640625, - "k3_kl": 0.029937744140625, - "kimi_kl": 0.07952880859375, - "learning_rate": 1.14e-08, - "loss": 0.0013, - "ppl": 0.00409698486328125, - "reward": 0.999945342540741, - "reward_std": 0.00014458752411883324, - "rewards/perpo_ocr_edit_distance_reward": 0.9999454021453857, + "advantages": 3.8317273265420226e-07, + "completion_length": 547.0, + "delta_ref_entropy_loss": 0.03125, + "delta_ref_ppl": -0.1259765625, + "entropy_loss": -0.275390625, + "epoch": 0.4886, + "grad_norm": 3.3450091542452047, + "k1_kl": 0.1259765625, + "k3_kl": 0.08984375, + "kimi_kl": 0.2216796875, + "learning_rate": 2.557e-07, + "loss": 0.0036, + "ppl": 0.1357421875, + "reward": 0.6177897453308105, + "reward_std": 0.05553613230586052, + "rewards/perpo_ocr_edit_distance_reward": 0.6177897453308105, "step": 2443, "temperature": 0.9 }, { - "advantages": -1.6808510849841696e-05, - "completion_length": 729.5, - "delta_ref_entropy_loss": 0.05975341796875, - "delta_ref_ppl": -0.049072265625, - "entropy_loss": -0.0357666015625, - "epoch": 0.9776, - "grad_norm": 1.175414135128516, - "k1_kl": 0.049346923828125, - "k3_kl": 0.028839111328125, - "kimi_kl": 0.0635986328125, - "learning_rate": 1.12e-08, - "loss": 0.0012, - "ppl": 0.017913818359375, - "reward": 0.9864355027675629, - "reward_std": 0.0323120369866956, - "rewards/perpo_ocr_edit_distance_reward": 0.9864355027675629, + "advantages": -6.668057176284492e-05, + "completion_length": 1222.0, + "delta_ref_entropy_loss": 0.02001953125, + "delta_ref_ppl": -0.03173828125, + "entropy_loss": -0.03955078125, + "epoch": 0.4888, + "grad_norm": 0.20630390889320266, + "k1_kl": 0.03173828125, + "k3_kl": 0.017578125, + "kimi_kl": 0.041748046875, + "learning_rate": 2.556e-07, + "loss": 0.0008, + "ppl": 0.012939453125, + "reward": 0.99953293800354, + "reward_std": 0.0005385937984101474, + "rewards/perpo_ocr_edit_distance_reward": 0.9995329976081848, "step": 2444, "temperature": 0.9 }, { - "advantages": -0.00010848896999959834, - "completion_length": 727.5, - "delta_ref_entropy_loss": 0.03057861328125, - "delta_ref_ppl": -0.01544189453125, - "entropy_loss": -0.015869140625, - "epoch": 0.978, - "grad_norm": 1.647812788607085, - "k1_kl": 0.0155029296875, - "k3_kl": 0.00970458984375, - "kimi_kl": 0.02349853515625, - "learning_rate": 1.1e-08, - "loss": 0.0005, - "ppl": 0.0070648193359375, - "reward": 0.9988190829753876, - "reward_std": 0.00046101550105959177, - "rewards/perpo_ocr_edit_distance_reward": 0.99881911277771, + "advantages": -7.544245363533264e-06, + "completion_length": 42.0, + "delta_ref_entropy_loss": 0.08251953125, + "delta_ref_ppl": -0.8125, + "entropy_loss": -0.201171875, + "epoch": 0.489, + "grad_norm": 4.589200898673225, + "k1_kl": 0.8125, + "k3_kl": 0.66015625, + "kimi_kl": 2.375, + "learning_rate": 2.555e-07, + "loss": 0.0264, + "ppl": 0.091796875, + "reward": 0.8616071343421936, + "reward_std": 0.006681519560515881, + "rewards/perpo_ocr_edit_distance_reward": 0.8616071939468384, "step": 2445, "temperature": 0.9 }, { - "advantages": -0.00011026008178305347, - "completion_length": 750.5, - "delta_ref_entropy_loss": 0.03173828125, - "delta_ref_ppl": -0.02691650390625, - "entropy_loss": -0.0220947265625, - "epoch": 0.9784, - "grad_norm": 0.28398299658794324, - "k1_kl": 0.02703857421875, - "k3_kl": 0.016510009765625, - "kimi_kl": 0.0577392578125, - "learning_rate": 1.08e-08, - "loss": 0.0008, - "ppl": 0.0098876953125, - "reward": 0.9939639568328857, - "reward_std": 0.00039961985021363944, - "rewards/perpo_ocr_edit_distance_reward": 0.9939640462398529, + "advantages": -5.2332881750771776e-05, + "completion_length": 933.0, + "delta_ref_entropy_loss": 0.0269775390625, + "delta_ref_ppl": -0.034912109375, + "entropy_loss": -0.0380859375, + "epoch": 0.4892, + "grad_norm": 0.38866361687854023, + "k1_kl": 0.034912109375, + "k3_kl": 0.02001953125, + "kimi_kl": 0.052490234375, + "learning_rate": 2.554e-07, + "loss": 0.0009, + "ppl": 0.01458740234375, + "reward": 0.7043594121932983, + "reward_std": 0.000713387387804687, + "rewards/perpo_ocr_edit_distance_reward": 0.7043594717979431, "step": 2446, "temperature": 0.9 }, { - "advantages": -2.2679567791783484e-05, - "completion_length": 618.0, - "delta_ref_entropy_loss": 0.0423583984375, - "delta_ref_ppl": -0.0430908203125, - "entropy_loss": -0.03814697265625, - "epoch": 0.9788, - "grad_norm": 1.2672394614455407, - "k1_kl": 0.043212890625, - "k3_kl": 0.0272216796875, - "kimi_kl": 0.08935546875, - "learning_rate": 1.0599999999999999e-08, - "loss": 0.0011, - "ppl": 0.020751953125, - "reward": 0.8633209466934204, - "reward_std": 0.0014230521919671446, - "rewards/perpo_ocr_edit_distance_reward": 0.8633210062980652, + "advantages": 5.160059572517639e-06, + "completion_length": 308.0, + "delta_ref_entropy_loss": 0.095703125, + "delta_ref_ppl": -0.12890625, + "entropy_loss": -0.083984375, + "epoch": 0.4894, + "grad_norm": 0.8415285791194895, + "k1_kl": 0.12890625, + "k3_kl": 0.09130859375, + "kimi_kl": 0.287109375, + "learning_rate": 2.553e-07, + "loss": 0.0037, + "ppl": 0.035888671875, + "reward": 0.9148969054222107, + "reward_std": 0.0032000564970076084, + "rewards/perpo_ocr_edit_distance_reward": 0.9148968458175659, "step": 2447, "temperature": 0.9 }, { - "advantages": -5.267348024062812e-05, - "completion_length": 376.5, - "delta_ref_entropy_loss": 0.033203125, - "delta_ref_ppl": -0.03363037109375, - "entropy_loss": -0.03271484375, - "epoch": 0.9792, - "grad_norm": 0.9909535987581862, - "k1_kl": 0.03363037109375, - "k3_kl": 0.022216796875, - "kimi_kl": 0.061767578125, - "learning_rate": 1.0399999999999999e-08, - "loss": 0.0009, - "ppl": 0.014404296875, - "reward": 0.9980642199516296, - "reward_std": 0.0009915644477587193, - "rewards/perpo_ocr_edit_distance_reward": 0.9980643093585968, + "advantages": -2.188342114095576e-05, + "completion_length": 614.0, + "delta_ref_entropy_loss": 0.06298828125, + "delta_ref_ppl": -0.08349609375, + "entropy_loss": -0.07373046875, + "epoch": 0.4896, + "grad_norm": 0.6738931079835019, + "k1_kl": 0.08349609375, + "k3_kl": 0.05322265625, + "kimi_kl": 0.185546875, + "learning_rate": 2.5519999999999996e-07, + "loss": 0.0021, + "ppl": 0.02685546875, + "reward": 0.9744420051574707, + "reward_std": 0.004182995297014713, + "rewards/perpo_ocr_edit_distance_reward": 0.9744420051574707, "step": 2448, "temperature": 0.9 }, { - "advantages": -0.000301895397115004, - "completion_length": 231.5, - "delta_ref_entropy_loss": 0.095458984375, - "delta_ref_ppl": -0.089111328125, - "entropy_loss": -0.0849609375, - "epoch": 0.9796, - "grad_norm": 0.7431827958386943, - "k1_kl": 0.088623046875, - "k3_kl": 0.0487060546875, - "kimi_kl": 0.1015625, - "learning_rate": 1.02e-08, - "loss": 0.0023, - "ppl": 0.0394134521484375, - "reward": 0.8980918824672699, - "reward_std": 0.0049161105416715145, - "rewards/perpo_ocr_edit_distance_reward": 0.8980919122695923, + "advantages": -4.8943929868983105e-05, + "completion_length": 1302.0, + "delta_ref_entropy_loss": 0.0228271484375, + "delta_ref_ppl": -0.0228271484375, + "entropy_loss": -0.035400390625, + "epoch": 0.4898, + "grad_norm": 0.5557640915507411, + "k1_kl": 0.0228271484375, + "k3_kl": 0.01318359375, + "kimi_kl": 0.0223388671875, + "learning_rate": 2.551e-07, + "loss": 0.0006, + "ppl": 0.01373291015625, + "reward": 0.9946120977401733, + "reward_std": 0.0014648218639194965, + "rewards/perpo_ocr_edit_distance_reward": 0.9946122169494629, "step": 2449, "temperature": 0.9 }, { - "advantages": -3.62651699106209e-05, - "completion_length": 265.0, - "delta_ref_entropy_loss": 0.1353759765625, - "delta_ref_ppl": -0.161102294921875, - "entropy_loss": -0.08380126953125, - "epoch": 0.98, - "grad_norm": 2.4090182897054664, - "k1_kl": 0.161102294921875, - "k3_kl": 0.111724853515625, - "kimi_kl": 0.266693115234375, - "learning_rate": 1e-08, - "loss": 0.0045, - "ppl": 0.0413818359375, - "reward": 0.6669008582830429, - "reward_std": 0.0007140116504160687, - "rewards/perpo_ocr_edit_distance_reward": 0.6669008582830429, + "advantages": -8.879389497451484e-05, + "completion_length": 510.0, + "delta_ref_entropy_loss": 0.0322265625, + "delta_ref_ppl": -0.07373046875, + "entropy_loss": -0.03662109375, + "epoch": 0.49, + "grad_norm": 0.5874327668733476, + "k1_kl": 0.07373046875, + "k3_kl": 0.05126953125, + "kimi_kl": 0.150390625, + "learning_rate": 2.55e-07, + "loss": 0.0021, + "ppl": 0.01312255859375, + "reward": 0.9956331849098206, + "reward_std": 0.0006670375587418675, + "rewards/perpo_ocr_edit_distance_reward": 0.9956332445144653, "step": 2450, "temperature": 0.9 }, { - "advantages": -3.387459738490861e-05, - "completion_length": 1366.0, - "delta_ref_entropy_loss": -0.00750732421875, - "delta_ref_ppl": -0.0244140625, - "entropy_loss": -0.13818359375, - "epoch": 0.9804, - "grad_norm": 0.9036746415660791, - "k1_kl": 0.0242919921875, - "k3_kl": 0.02001953125, - "kimi_kl": 0.051422119140625, - "learning_rate": 9.799999999999998e-09, - "loss": 0.0008, - "ppl": 0.0596771240234375, - "reward": 0.7675285041332245, - "reward_std": 0.1225477768239216, - "rewards/perpo_ocr_edit_distance_reward": 0.7675285637378693, + "advantages": -6.453480455093086e-05, + "completion_length": 1080.0, + "delta_ref_entropy_loss": 0.046630859375, + "delta_ref_ppl": -0.0478515625, + "entropy_loss": -0.057373046875, + "epoch": 0.4902, + "grad_norm": 1.9229012797501772, + "k1_kl": 0.0478515625, + "k3_kl": 0.027587890625, + "kimi_kl": 0.05517578125, + "learning_rate": 2.549e-07, + "loss": 0.0012, + "ppl": 0.026611328125, + "reward": 0.9160630702972412, + "reward_std": 0.0008233939297497272, + "rewards/perpo_ocr_edit_distance_reward": 0.9160631895065308, "step": 2451, "temperature": 0.9 }, { - "advantages": -4.161255856161006e-05, - "completion_length": 674.5, - "delta_ref_entropy_loss": 0.04541015625, - "delta_ref_ppl": -0.02325439453125, - "entropy_loss": -0.0289306640625, - "epoch": 0.9808, - "grad_norm": 0.39773813802789004, - "k1_kl": 0.02325439453125, - "k3_kl": 0.01300048828125, - "kimi_kl": 0.0372314453125, - "learning_rate": 9.599999999999998e-09, - "loss": 0.0006, - "ppl": 0.01422119140625, - "reward": 0.9199628829956055, - "reward_std": 0.0011976376990787685, - "rewards/perpo_ocr_edit_distance_reward": 0.9199629426002502, + "advantages": -6.23294317847467e-06, + "completion_length": 513.0, + "delta_ref_entropy_loss": 0.0751953125, + "delta_ref_ppl": -0.08203125, + "entropy_loss": -0.12158203125, + "epoch": 0.4904, + "grad_norm": 1.8811193255101601, + "k1_kl": 0.08251953125, + "k3_kl": 0.054443359375, + "kimi_kl": 0.17578125, + "learning_rate": 2.5480000000000003e-07, + "loss": 0.0022, + "ppl": 0.057861328125, + "reward": 0.7725909948348999, + "reward_std": 0.006730730179697275, + "rewards/perpo_ocr_edit_distance_reward": 0.7725910544395447, "step": 2452, "temperature": 0.9 }, { - "advantages": -0.00029802109513954456, - "completion_length": 553.5, - "delta_ref_entropy_loss": 0.0303955078125, - "delta_ref_ppl": -0.02838134765625, - "entropy_loss": -0.02410888671875, - "epoch": 0.9812, - "grad_norm": 0.1948547964961031, - "k1_kl": 0.0283203125, - "k3_kl": 0.017120361328125, - "kimi_kl": 0.056640625, - "learning_rate": 9.4e-09, - "loss": 0.001, - "ppl": 0.010833740234375, - "reward": 0.9983482658863068, - "reward_std": 8.05229356046766e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9983483254909515, + "advantages": -6.68253269395791e-05, + "completion_length": 318.0, + "delta_ref_entropy_loss": 0.053955078125, + "delta_ref_ppl": -0.10107421875, + "entropy_loss": -0.05517578125, + "epoch": 0.4906, + "grad_norm": 1.381401548809843, + "k1_kl": 0.10107421875, + "k3_kl": 0.06689453125, + "kimi_kl": 0.2109375, + "learning_rate": 2.5469999999999997e-07, + "loss": 0.0027, + "ppl": 0.0201416015625, + "reward": 0.9887260794639587, + "reward_std": 0.0013018101453781128, + "rewards/perpo_ocr_edit_distance_reward": 0.9887261390686035, "step": 2453, "temperature": 0.9 }, { - "advantages": -3.6282199289416894e-05, - "completion_length": 1176.5, - "delta_ref_entropy_loss": 0.028961181640625, - "delta_ref_ppl": -0.01751708984375, - "entropy_loss": -0.032470703125, - "epoch": 0.9816, - "grad_norm": 0.4528319361229277, - "k1_kl": 0.01751708984375, - "k3_kl": 0.0085906982421875, - "kimi_kl": 0.0194091796875, - "learning_rate": 9.2e-09, - "loss": 0.0004, - "ppl": 0.0165557861328125, - "reward": 0.9949601590633392, - "reward_std": 0.00047794508282095194, - "rewards/perpo_ocr_edit_distance_reward": 0.9949601590633392, + "advantages": -1.8221992377220886e-06, + "completion_length": 776.0, + "delta_ref_entropy_loss": 0.06494140625, + "delta_ref_ppl": -0.06982421875, + "entropy_loss": -0.057373046875, + "epoch": 0.4908, + "grad_norm": 0.9534504607684711, + "k1_kl": 0.06982421875, + "k3_kl": 0.0390625, + "kimi_kl": 0.091796875, + "learning_rate": 2.5459999999999996e-07, + "loss": 0.0016, + "ppl": 0.0223388671875, + "reward": 0.781505823135376, + "reward_std": 0.018627947196364403, + "rewards/perpo_ocr_edit_distance_reward": 0.7815058827400208, "step": 2454, "temperature": 0.9 }, { - "advantages": -6.61015510559082e-05, - "completion_length": 578.5, - "delta_ref_entropy_loss": 0.024200439453125, - "delta_ref_ppl": -0.0208740234375, - "entropy_loss": -0.02313232421875, - "epoch": 0.982, - "grad_norm": 0.27281536677499174, - "k1_kl": 0.02081298828125, - "k3_kl": 0.012481689453125, - "kimi_kl": 0.0306396484375, - "learning_rate": 9e-09, - "loss": 0.0006, - "ppl": 0.01006317138671875, - "reward": 0.9994042813777924, - "reward_std": 7.880447810748592e-05, - "rewards/perpo_ocr_edit_distance_reward": 0.9994042813777924, + "advantages": -2.7469228371046484e-05, + "completion_length": 706.0, + "delta_ref_entropy_loss": 0.10595703125, + "delta_ref_ppl": -0.0927734375, + "entropy_loss": -0.1181640625, + "epoch": 0.491, + "grad_norm": 1.4188582430264924, + "k1_kl": 0.09326171875, + "k3_kl": 0.052978515625, + "kimi_kl": 0.125, + "learning_rate": 2.545e-07, + "loss": 0.0021, + "ppl": 0.057861328125, + "reward": 0.9676038026809692, + "reward_std": 0.001761668361723423, + "rewards/perpo_ocr_edit_distance_reward": 0.967603862285614, "step": 2455, "temperature": 0.9 }, { - "advantages": -8.164985138137126e-05, - "completion_length": 628.0, - "delta_ref_entropy_loss": 0.03515625, - "delta_ref_ppl": -0.04541015625, - "entropy_loss": -0.0264892578125, - "epoch": 0.9824, - "grad_norm": 0.5033552160014808, - "k1_kl": 0.04541015625, - "k3_kl": 0.02996826171875, - "kimi_kl": 0.083251953125, - "learning_rate": 8.8e-09, - "loss": 0.0013, - "ppl": 0.01092529296875, - "reward": 0.998244971036911, - "reward_std": 0.0005264127830741927, - "rewards/perpo_ocr_edit_distance_reward": 0.9982450008392334, + "advantages": -2.0776476503669983e-06, + "completion_length": 741.0, + "delta_ref_entropy_loss": 0.06884765625, + "delta_ref_ppl": -0.072265625, + "entropy_loss": -0.05078125, + "epoch": 0.4912, + "grad_norm": 0.6874641876725378, + "k1_kl": 0.072265625, + "k3_kl": 0.042236328125, + "kimi_kl": 0.1435546875, + "learning_rate": 2.544e-07, + "loss": 0.0017, + "ppl": 0.0194091796875, + "reward": 0.9779689311981201, + "reward_std": 0.016207778826355934, + "rewards/perpo_ocr_edit_distance_reward": 0.9779690504074097, "step": 2456, "temperature": 0.9 }, { - "advantages": -1.893724720503087e-05, - "completion_length": 295.0, - "delta_ref_entropy_loss": 0.1077880859375, - "delta_ref_ppl": -0.07550048828125, - "entropy_loss": -0.1368408203125, - "epoch": 0.9828, - "grad_norm": 1.5883495340235525, - "k1_kl": 0.0751953125, - "k3_kl": 0.04193115234375, - "kimi_kl": 0.09625244140625, - "learning_rate": 8.6e-09, - "loss": 0.0017, - "ppl": 0.0775146484375, - "reward": 0.7413816154003143, - "reward_std": 0.005345444311387837, - "rewards/perpo_ocr_edit_distance_reward": 0.7413816601037979, + "advantages": 1.0456357813382056e-05, + "completion_length": 549.0, + "delta_ref_entropy_loss": 0.0869140625, + "delta_ref_ppl": -0.07421875, + "entropy_loss": -0.091796875, + "epoch": 0.4914, + "grad_norm": 1.0524434680195898, + "k1_kl": 0.07421875, + "k3_kl": 0.040771484375, + "kimi_kl": 0.130859375, + "learning_rate": 2.543e-07, + "loss": 0.0016, + "ppl": 0.047119140625, + "reward": 0.6420348882675171, + "reward_std": 0.0015310468152165413, + "rewards/perpo_ocr_edit_distance_reward": 0.6420348882675171, "step": 2457, "temperature": 0.9 }, { - "advantages": -8.983271982287988e-06, - "completion_length": 968.0, - "delta_ref_entropy_loss": 0.09710693359375, - "delta_ref_ppl": -0.102294921875, - "entropy_loss": -0.0538330078125, - "epoch": 0.9832, - "grad_norm": 0.7279322113244013, - "k1_kl": 0.102294921875, - "k3_kl": 0.064666748046875, - "kimi_kl": 0.19482421875, - "learning_rate": 8.399999999999999e-09, - "loss": 0.0026, - "ppl": 0.0255126953125, - "reward": 0.9917241930961609, - "reward_std": 0.003029109910130501, - "rewards/perpo_ocr_edit_distance_reward": 0.9917242527008057, + "advantages": -6.577798558282666e-06, + "completion_length": 479.0, + "delta_ref_entropy_loss": 0.046630859375, + "delta_ref_ppl": -0.08544921875, + "entropy_loss": -0.035888671875, + "epoch": 0.4916, + "grad_norm": 0.89655849575429, + "k1_kl": 0.08544921875, + "k3_kl": 0.060546875, + "kimi_kl": 0.333984375, + "learning_rate": 2.542e-07, + "loss": 0.0024, + "ppl": 0.01385498046875, + "reward": 0.954694390296936, + "reward_std": 0.003785658162087202, + "rewards/perpo_ocr_edit_distance_reward": 0.9546944499015808, "step": 2458, "temperature": 0.9 }, { - "advantages": -3.540941725077573e-05, - "completion_length": 677.5, - "delta_ref_entropy_loss": 0.03472900390625, - "delta_ref_ppl": -0.0318603515625, - "entropy_loss": -0.02935791015625, - "epoch": 0.9836, - "grad_norm": 0.5866990399918934, - "k1_kl": 0.03173828125, - "k3_kl": 0.01849365234375, - "kimi_kl": 0.04241943359375, - "learning_rate": 8.2e-09, - "loss": 0.0008, - "ppl": 0.01470947265625, - "reward": 0.9945443868637085, - "reward_std": 0.0024766880669631064, - "rewards/perpo_ocr_edit_distance_reward": 0.9945444762706757, + "advantages": -4.971027738065459e-05, + "completion_length": 373.0, + "delta_ref_entropy_loss": 0.064453125, + "delta_ref_ppl": -0.07763671875, + "entropy_loss": -0.0361328125, + "epoch": 0.4918, + "grad_norm": 0.9831978962166124, + "k1_kl": 0.07763671875, + "k3_kl": 0.04541015625, + "kimi_kl": 0.12353515625, + "learning_rate": 2.541e-07, + "loss": 0.0019, + "ppl": 0.013427734375, + "reward": 0.9976557493209839, + "reward_std": 0.0002426528953947127, + "rewards/perpo_ocr_edit_distance_reward": 0.9976557493209839, "step": 2459, "temperature": 0.9 }, { - "advantages": -4.2336331347314626e-05, - "completion_length": 550.5, - "delta_ref_entropy_loss": 0.0672607421875, - "delta_ref_ppl": -0.04998779296875, - "entropy_loss": -0.10504150390625, - "epoch": 0.984, - "grad_norm": 1.9867316278599838, - "k1_kl": 0.04998779296875, - "k3_kl": 0.0306396484375, - "kimi_kl": 0.061279296875, - "learning_rate": 8e-09, - "loss": 0.0013, - "ppl": 0.0616455078125, - "reward": 0.9390347003936768, - "reward_std": 0.017041690269252285, - "rewards/perpo_ocr_edit_distance_reward": 0.9390347301959991, + "advantages": -2.1287373783707153e-06, + "completion_length": 1346.0, + "delta_ref_entropy_loss": 0.0106201171875, + "delta_ref_ppl": -0.021728515625, + "entropy_loss": -0.0537109375, + "epoch": 0.492, + "grad_norm": 0.4753499941067004, + "k1_kl": 0.021728515625, + "k3_kl": 0.01409912109375, + "kimi_kl": 0.0279541015625, + "learning_rate": 2.5399999999999997e-07, + "loss": 0.0006, + "ppl": 0.0196533203125, + "reward": 0.9705917835235596, + "reward_std": 0.015646355226635933, + "rewards/perpo_ocr_edit_distance_reward": 0.9705918431282043, "step": 2460, "temperature": 0.9 }, { - "advantages": -4.0486456782673486e-05, - "completion_length": 682.0, - "delta_ref_entropy_loss": 0.04595947265625, - "delta_ref_ppl": -0.037750244140625, - "entropy_loss": -0.0794677734375, - "epoch": 0.9844, - "grad_norm": 1.233919393455732, - "k1_kl": 0.0377197265625, - "k3_kl": 0.02276611328125, - "kimi_kl": 0.0552520751953125, - "learning_rate": 7.799999999999999e-09, - "loss": 0.001, - "ppl": 0.0452880859375, - "reward": 0.9577571749687195, - "reward_std": 0.001512858783826232, - "rewards/perpo_ocr_edit_distance_reward": 0.9577572345733643, + "advantages": -3.5388129617786035e-05, + "completion_length": 241.0, + "delta_ref_entropy_loss": 0.09375, + "delta_ref_ppl": -0.146484375, + "entropy_loss": -0.057861328125, + "epoch": 0.4922, + "grad_norm": 1.3299001563692283, + "k1_kl": 0.146484375, + "k3_kl": 0.099609375, + "kimi_kl": 0.341796875, + "learning_rate": 2.539e-07, + "loss": 0.004, + "ppl": 0.0198974609375, + "reward": 0.9978905916213989, + "reward_std": 0.0008617423591203988, + "rewards/perpo_ocr_edit_distance_reward": 0.9978906512260437, "step": 2461, "temperature": 0.9 }, { - "advantages": -7.816723837095196e-06, - "completion_length": 709.5, - "delta_ref_entropy_loss": 0.07659912109375, - "delta_ref_ppl": -0.1481170654296875, - "entropy_loss": -0.26947021484375, - "epoch": 0.9848, - "grad_norm": 2.884456614903433, - "k1_kl": 0.1490936279296875, - "k3_kl": 0.112274169921875, - "kimi_kl": 0.4983062744140625, - "learning_rate": 7.6e-09, - "loss": 0.0045, - "ppl": 0.150726318359375, - "reward": 0.8611951768398285, - "reward_std": 0.030694019253132865, - "rewards/perpo_ocr_edit_distance_reward": 0.861195296049118, + "advantages": -4.564013124763733e-06, + "completion_length": 904.0, + "delta_ref_entropy_loss": 0.04833984375, + "delta_ref_ppl": -0.06396484375, + "entropy_loss": -0.053466796875, + "epoch": 0.4924, + "grad_norm": 0.919335972343151, + "k1_kl": 0.06396484375, + "k3_kl": 0.03759765625, + "kimi_kl": 0.0927734375, + "learning_rate": 2.538e-07, + "loss": 0.0015, + "ppl": 0.0220947265625, + "reward": 0.9280581474304199, + "reward_std": 0.018548201769590378, + "rewards/perpo_ocr_edit_distance_reward": 0.9280582070350647, "step": 2462, "temperature": 0.9 }, { - "advantages": 2.2292138964985497e-05, - "completion_length": 680.0, - "delta_ref_entropy_loss": 0.02740478515625, - "delta_ref_ppl": -0.01434326171875, - "entropy_loss": -0.015167236328125, - "epoch": 0.9852, - "grad_norm": 0.28536000014446117, - "k1_kl": 0.01434326171875, - "k3_kl": 0.0063629150390625, - "kimi_kl": 0.01190185546875, - "learning_rate": 7.4e-09, - "loss": 0.0002, - "ppl": 0.0062408447265625, - "reward": 0.9980997741222382, - "reward_std": 0.0002365330292377621, - "rewards/perpo_ocr_edit_distance_reward": 0.9980997443199158, + "advantages": -3.5337041481398046e-05, + "completion_length": 766.0, + "delta_ref_entropy_loss": 0.08837890625, + "delta_ref_ppl": -0.1220703125, + "entropy_loss": -0.2275390625, + "epoch": 0.4926, + "grad_norm": 1.9976299818786947, + "k1_kl": 0.1220703125, + "k3_kl": 0.0771484375, + "kimi_kl": 0.171875, + "learning_rate": 2.5369999999999995e-07, + "loss": 0.0031, + "ppl": 0.12060546875, + "reward": 0.8968474268913269, + "reward_std": 0.0015864699380472302, + "rewards/perpo_ocr_edit_distance_reward": 0.8968474864959717, "step": 2463, "temperature": 0.9 }, { - "advantages": -1.8932991224573925e-05, - "completion_length": 386.5, - "delta_ref_entropy_loss": 0.102783203125, - "delta_ref_ppl": -0.11328125, - "entropy_loss": -0.069091796875, - "epoch": 0.9856, - "grad_norm": 0.3709903280108638, - "k1_kl": 0.11328125, - "k3_kl": 0.07275390625, - "kimi_kl": 0.1614990234375, - "learning_rate": 7.199999999999999e-09, - "loss": 0.0029, - "ppl": 0.03717041015625, - "reward": 0.9979803264141083, - "reward_std": 0.00039975307299755514, - "rewards/perpo_ocr_edit_distance_reward": 0.9979803562164307, + "advantages": -7.833753556951706e-07, + "completion_length": 1170.0, + "delta_ref_entropy_loss": 0.10888671875, + "delta_ref_ppl": -0.1435546875, + "entropy_loss": -1.0390625, + "epoch": 0.4928, + "grad_norm": 340.5402505526596, + "k1_kl": 0.14453125, + "k3_kl": 2.484375, + "kimi_kl": 0.359375, + "learning_rate": 2.536e-07, + "loss": 0.0996, + "ppl": 0.87890625, + "reward": 0.7677524089813232, + "reward_std": 0.04312209039926529, + "rewards/perpo_ocr_edit_distance_reward": 0.767752468585968, "step": 2464, "temperature": 0.9 }, { - "advantages": -9.128877809416736e-05, - "completion_length": 832.0, - "delta_ref_entropy_loss": 0.03643798828125, - "delta_ref_ppl": -0.01953125, - "entropy_loss": -0.02618408203125, - "epoch": 0.986, - "grad_norm": 0.40234919246023354, - "k1_kl": 0.01947021484375, - "k3_kl": 0.0096282958984375, - "kimi_kl": 0.0181732177734375, - "learning_rate": 7e-09, - "loss": 0.0005, - "ppl": 0.013458251953125, - "reward": 0.9974769949913025, - "reward_std": 0.001527123269625008, - "rewards/perpo_ocr_edit_distance_reward": 0.9974770843982697, + "advantages": -5.4700038162991405e-05, + "completion_length": 612.0, + "delta_ref_entropy_loss": 0.048095703125, + "delta_ref_ppl": -0.05419921875, + "entropy_loss": -0.032958984375, + "epoch": 0.493, + "grad_norm": 0.5747358847423766, + "k1_kl": 0.05419921875, + "k3_kl": 0.031005859375, + "kimi_kl": 0.087890625, + "learning_rate": 2.535e-07, + "loss": 0.0013, + "ppl": 0.0146484375, + "reward": 0.9929394125938416, + "reward_std": 0.0006784586585126817, + "rewards/perpo_ocr_edit_distance_reward": 0.9929394721984863, "step": 2465, "temperature": 0.9 }, { - "advantages": -2.8576170734595507e-05, - "completion_length": 365.0, - "delta_ref_entropy_loss": 0.0296630859375, - "delta_ref_ppl": -0.065032958984375, - "entropy_loss": -0.0263671875, - "epoch": 0.9864, - "grad_norm": 0.35710802224679006, - "k1_kl": 0.065032958984375, - "k3_kl": 0.046905517578125, - "kimi_kl": 0.125518798828125, - "learning_rate": 6.8e-09, + "advantages": -5.330358362698462e-06, + "completion_length": 431.0, + "delta_ref_entropy_loss": 0.054443359375, + "delta_ref_ppl": -0.072265625, + "entropy_loss": -0.05126953125, + "epoch": 0.4932, + "grad_norm": 0.8771223850443545, + "k1_kl": 0.07275390625, + "k3_kl": 0.047119140625, + "kimi_kl": 0.185546875, + "learning_rate": 2.534e-07, "loss": 0.0019, - "ppl": 0.008026123046875, - "reward": 0.8002851903438568, - "reward_std": 0.00047169419121928513, - "rewards/perpo_ocr_edit_distance_reward": 0.8002852499485016, + "ppl": 0.0206298828125, + "reward": 0.9408304691314697, + "reward_std": 0.0015011520590633154, + "rewards/perpo_ocr_edit_distance_reward": 0.9408304691314697, "step": 2466, "temperature": 0.9 }, { - "advantages": -3.451960537859122e-05, - "completion_length": 1051.0, - "delta_ref_entropy_loss": 0.04022216796875, - "delta_ref_ppl": -0.05078125, - "entropy_loss": -0.022613525390625, - "epoch": 0.9868, - "grad_norm": 0.539251985588694, - "k1_kl": 0.05084228515625, - "k3_kl": 0.033447265625, - "kimi_kl": 0.111083984375, - "learning_rate": 6.5999999999999995e-09, - "loss": 0.0014, - "ppl": 0.01104736328125, - "reward": 0.9977380931377411, - "reward_std": 0.0007316582632483914, - "rewards/perpo_ocr_edit_distance_reward": 0.9977381229400635, + "advantages": -6.0336933529470116e-05, + "completion_length": 515.0, + "delta_ref_entropy_loss": 0.09375, + "delta_ref_ppl": -0.1064453125, + "entropy_loss": -0.07666015625, + "epoch": 0.4934, + "grad_norm": 1.2783027514313205, + "k1_kl": 0.1064453125, + "k3_kl": 0.068359375, + "kimi_kl": 0.197265625, + "learning_rate": 2.533e-07, + "loss": 0.0028, + "ppl": 0.03955078125, + "reward": 0.9804241061210632, + "reward_std": 0.0008879717206582427, + "rewards/perpo_ocr_edit_distance_reward": 0.9804242253303528, "step": 2467, "temperature": 0.9 }, { - "advantages": -1.9852604185022216e-05, - "completion_length": 726.5, - "delta_ref_entropy_loss": 0.03326416015625, - "delta_ref_ppl": -0.0587158203125, - "entropy_loss": -0.02410888671875, - "epoch": 0.9872, - "grad_norm": 0.3987912043193276, - "k1_kl": 0.0587158203125, - "k3_kl": 0.0419158935546875, - "kimi_kl": 0.14710235595703125, - "learning_rate": 6.4e-09, - "loss": 0.0017, - "ppl": 0.0123138427734375, - "reward": 0.677176758646965, - "reward_std": 0.0013093920424580574, - "rewards/perpo_ocr_edit_distance_reward": 0.6771767735481262, + "advantages": -3.864084283122793e-05, + "completion_length": 624.0, + "delta_ref_entropy_loss": 0.050048828125, + "delta_ref_ppl": -0.04736328125, + "entropy_loss": -0.0301513671875, + "epoch": 0.4936, + "grad_norm": 0.28285970791711185, + "k1_kl": 0.04736328125, + "k3_kl": 0.0252685546875, + "kimi_kl": 0.07568359375, + "learning_rate": 2.5319999999999996e-07, + "loss": 0.001, + "ppl": 0.00921630859375, + "reward": 0.9924485683441162, + "reward_std": 0.0003408778866287321, + "rewards/perpo_ocr_edit_distance_reward": 0.992448627948761, "step": 2468, "temperature": 0.9 }, { - "advantages": 0.0, - "completion_length": 410.0, - "delta_ref_entropy_loss": 0.02203369140625, - "delta_ref_ppl": -0.017364501953125, - "entropy_loss": -0.02001953125, - "epoch": 0.9876, - "grad_norm": 0.029377640859641868, - "k1_kl": 0.017303466796875, - "k3_kl": 0.0083770751953125, - "kimi_kl": 0.015594482421875, + "advantages": 1.3623919130623108e-06, + "completion_length": 1497.0, + "delta_ref_entropy_loss": 0.0111083984375, + "delta_ref_ppl": -0.0277099609375, + "entropy_loss": -0.044189453125, + "epoch": 0.4938, + "grad_norm": 1.4765287380893328, + "k1_kl": 0.0277099609375, + "k3_kl": 0.0216064453125, + "kimi_kl": 0.038818359375, + "learning_rate": 2.5309999999999996e-07, + "loss": 0.0009, + "ppl": 0.022705078125, + "reward": 0.9895055294036865, + "reward_std": 0.006241184659302235, + "rewards/perpo_ocr_edit_distance_reward": 0.9895055294036865, + "step": 2469, + "temperature": 0.9 + }, + { + "advantages": -4.919086495647207e-05, + "completion_length": 509.0, + "delta_ref_entropy_loss": 0.03662109375, + "delta_ref_ppl": -0.044921875, + "entropy_loss": -0.035888671875, + "epoch": 0.494, + "grad_norm": 0.32874847640706034, + "k1_kl": 0.044677734375, + "k3_kl": 0.028076171875, + "kimi_kl": 0.0830078125, + "learning_rate": 2.53e-07, + "loss": 0.0012, + "ppl": 0.0118408203125, + "reward": 0.9896641969680786, + "reward_std": 0.00024630807456560433, + "rewards/perpo_ocr_edit_distance_reward": 0.9896642565727234, + "step": 2470, + "temperature": 0.9 + }, + { + "advantages": -1.1818749953818042e-05, + "completion_length": 157.0, + "delta_ref_entropy_loss": 0.052978515625, + "delta_ref_ppl": -0.1533203125, + "entropy_loss": -0.06787109375, + "epoch": 0.4942, + "grad_norm": 1.562853500540061, + "k1_kl": 0.1533203125, + "k3_kl": 0.11376953125, + "kimi_kl": 0.365234375, + "learning_rate": 2.529e-07, + "loss": 0.0046, + "ppl": 0.0234375, + "reward": 0.9972683191299438, + "reward_std": 0.003498418489471078, + "rewards/perpo_ocr_edit_distance_reward": 0.9972683787345886, + "step": 2471, + "temperature": 0.9 + }, + { + "advantages": -8.382116357097402e-05, + "completion_length": 868.0, + "delta_ref_entropy_loss": 0.0269775390625, + "delta_ref_ppl": -0.03515625, + "entropy_loss": -0.041748046875, + "epoch": 0.4944, + "grad_norm": 0.4919874154287689, + "k1_kl": 0.03515625, + "k3_kl": 0.01953125, + "kimi_kl": 0.05712890625, + "learning_rate": 2.528e-07, + "loss": 0.0009, + "ppl": 0.01904296875, + "reward": 0.9951582551002502, + "reward_std": 0.0008142035221680999, + "rewards/perpo_ocr_edit_distance_reward": 0.9951583743095398, + "step": 2472, + "temperature": 0.9 + }, + { + "advantages": -8.97475729288999e-06, + "completion_length": 45.0, + "delta_ref_entropy_loss": 0.1240234375, + "delta_ref_ppl": -0.703125, + "entropy_loss": -0.1494140625, + "epoch": 0.4946, + "grad_norm": 4.615236147859294, + "k1_kl": 0.703125, + "k3_kl": 0.57421875, + "kimi_kl": 2.40625, + "learning_rate": 2.527e-07, + "loss": 0.023, + "ppl": 0.06494140625, + "reward": 0.9875457882881165, + "reward_std": 0.006525220349431038, + "rewards/perpo_ocr_edit_distance_reward": 0.9875458478927612, + "step": 2473, + "temperature": 0.9 + }, + { + "advantages": -3.893034954671748e-05, + "completion_length": 1058.0, + "delta_ref_entropy_loss": 0.033447265625, + "delta_ref_ppl": -0.045166015625, + "entropy_loss": -0.035888671875, + "epoch": 0.4948, + "grad_norm": 0.6083100366483202, + "k1_kl": 0.045166015625, + "k3_kl": 0.02392578125, + "kimi_kl": 0.061767578125, + "learning_rate": 2.5259999999999997e-07, + "loss": 0.001, + "ppl": 0.0118408203125, + "reward": 0.9935516119003296, + "reward_std": 0.0005563001031987369, + "rewards/perpo_ocr_edit_distance_reward": 0.9935516119003296, + "step": 2474, + "temperature": 0.9 + }, + { + "advantages": -7.143191032810137e-05, + "completion_length": 907.0, + "delta_ref_entropy_loss": 0.022705078125, + "delta_ref_ppl": -0.04296875, + "entropy_loss": -0.038818359375, + "epoch": 0.495, + "grad_norm": 0.42457735023413995, + "k1_kl": 0.04296875, + "k3_kl": 0.02978515625, + "kimi_kl": 0.078125, + "learning_rate": 2.5249999999999996e-07, + "loss": 0.0013, + "ppl": 0.0159912109375, + "reward": 0.9911164045333862, + "reward_std": 0.0004960834048688412, + "rewards/perpo_ocr_edit_distance_reward": 0.9911164045333862, + "step": 2475, + "temperature": 0.9 + }, + { + "advantages": -5.909374976909021e-06, + "completion_length": 655.0, + "delta_ref_entropy_loss": 0.08154296875, + "delta_ref_ppl": -0.0810546875, + "entropy_loss": -0.1767578125, + "epoch": 0.4952, + "grad_norm": 2.1200863253082254, + "k1_kl": 0.08056640625, + "k3_kl": 0.047119140625, + "kimi_kl": 0.10888671875, + "learning_rate": 2.524e-07, + "loss": 0.0019, + "ppl": 0.0830078125, + "reward": 0.9445086121559143, + "reward_std": 0.014331513084471226, + "rewards/perpo_ocr_edit_distance_reward": 0.9445087313652039, + "step": 2476, + "temperature": 0.9 + }, + { + "advantages": -2.8520824344013818e-05, + "completion_length": 541.0, + "delta_ref_entropy_loss": 0.0615234375, + "delta_ref_ppl": -0.0830078125, + "entropy_loss": -0.0556640625, + "epoch": 0.4954, + "grad_norm": 0.8123523195781014, + "k1_kl": 0.0830078125, + "k3_kl": 0.044921875, + "kimi_kl": 0.10546875, + "learning_rate": 2.523e-07, + "loss": 0.0018, + "ppl": 0.023681640625, + "reward": 0.9718022346496582, + "reward_std": 0.0031830205116420984, + "rewards/perpo_ocr_edit_distance_reward": 0.971802294254303, + "step": 2477, + "temperature": 0.9 + }, + { + "advantages": -0.00010466576350154355, + "completion_length": 904.0, + "delta_ref_entropy_loss": 0.0654296875, + "delta_ref_ppl": -0.06640625, + "entropy_loss": -0.0439453125, + "epoch": 0.4956, + "grad_norm": 0.40440144331211375, + "k1_kl": 0.06689453125, + "k3_kl": 0.039794921875, + "kimi_kl": 0.08935546875, + "learning_rate": 2.5219999999999994e-07, + "loss": 0.0017, + "ppl": 0.01904296875, + "reward": 0.9945784211158752, + "reward_std": 0.0004695276729762554, + "rewards/perpo_ocr_edit_distance_reward": 0.9945785403251648, + "step": 2478, + "temperature": 0.9 + }, + { + "advantages": -0.0005960464477539062, + "completion_length": 475.0, + "delta_ref_entropy_loss": 0.0517578125, + "delta_ref_ppl": -0.0654296875, + "entropy_loss": -0.0206298828125, + "epoch": 0.4958, + "grad_norm": 0.02757227491914591, + "k1_kl": 0.06494140625, + "k3_kl": 0.041748046875, + "kimi_kl": 0.11962890625, + "learning_rate": 2.521e-07, + "loss": 0.0023, + "ppl": 0.004791259765625, + "reward": 0.983308732509613, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9833088517189026, + "step": 2479, + "temperature": 0.9 + }, + { + "advantages": -6.23123996774666e-05, + "completion_length": 181.0, + "delta_ref_entropy_loss": 0.058837890625, + "delta_ref_ppl": -0.236328125, + "entropy_loss": -0.091796875, + "epoch": 0.496, + "grad_norm": 3.842017946362654, + "k1_kl": 0.236328125, + "k3_kl": 0.18359375, + "kimi_kl": 0.8046875, + "learning_rate": 2.52e-07, + "loss": 0.0074, + "ppl": 0.037109375, + "reward": 0.9926595091819763, + "reward_std": 0.0016764472238719463, + "rewards/perpo_ocr_edit_distance_reward": 0.9926596283912659, + "step": 2480, + "temperature": 0.9 + }, + { + "advantages": -2.2309168343781494e-05, + "completion_length": 601.0, + "delta_ref_entropy_loss": 0.08154296875, + "delta_ref_ppl": -0.10791015625, + "entropy_loss": -0.11279296875, + "epoch": 0.4962, + "grad_norm": 0.976338607202946, + "k1_kl": 0.10791015625, + "k3_kl": 0.06396484375, + "kimi_kl": 0.1533203125, + "learning_rate": 2.519e-07, + "loss": 0.0026, + "ppl": 0.04345703125, + "reward": 0.9849744439125061, + "reward_std": 0.002188974292948842, + "rewards/perpo_ocr_edit_distance_reward": 0.9849745035171509, + "step": 2481, + "temperature": 0.9 + }, + { + "advantages": -7.56638401071541e-05, + "completion_length": 821.0, + "delta_ref_entropy_loss": 0.0262451171875, + "delta_ref_ppl": -0.028564453125, + "entropy_loss": -0.022216796875, + "epoch": 0.4964, + "grad_norm": 0.21927835909515322, + "k1_kl": 0.0286865234375, + "k3_kl": 0.0157470703125, + "kimi_kl": 0.033447265625, + "learning_rate": 2.518e-07, + "loss": 0.0007, + "ppl": 0.00677490234375, + "reward": 0.9955475330352783, + "reward_std": 0.0004626900190487504, + "rewards/perpo_ocr_edit_distance_reward": 0.9955475926399231, + "step": 2482, + "temperature": 0.9 + }, + { + "advantages": -1.488413181505166e-05, + "completion_length": 428.0, + "delta_ref_entropy_loss": 0.0703125, + "delta_ref_ppl": -0.11328125, + "entropy_loss": -0.05224609375, + "epoch": 0.4966, + "grad_norm": 0.7194638887055612, + "k1_kl": 0.11328125, + "k3_kl": 0.076171875, + "kimi_kl": 0.392578125, + "learning_rate": 2.5169999999999996e-07, + "loss": 0.0031, + "ppl": 0.02099609375, + "reward": 0.795357346534729, + "reward_std": 0.0016140356892719865, + "rewards/perpo_ocr_edit_distance_reward": 0.795357346534729, + "step": 2483, + "temperature": 0.9 + }, + { + "advantages": -0.0001622268173377961, + "completion_length": 838.0, + "delta_ref_entropy_loss": 0.055419921875, + "delta_ref_ppl": -0.06884765625, + "entropy_loss": -0.033203125, + "epoch": 0.4968, + "grad_norm": 0.38576341742692605, + "k1_kl": 0.06884765625, + "k3_kl": 0.04248046875, + "kimi_kl": 0.1552734375, + "learning_rate": 2.516e-07, + "loss": 0.0019, + "ppl": 0.01263427734375, + "reward": 0.9937461614608765, + "reward_std": 0.0005823667161166668, + "rewards/perpo_ocr_edit_distance_reward": 0.993746280670166, + "step": 2484, + "temperature": 0.9 + }, + { + "advantages": -3.990956975030713e-05, + "completion_length": 780.0, + "delta_ref_entropy_loss": 0.055908203125, + "delta_ref_ppl": -0.06494140625, + "entropy_loss": -0.1142578125, + "epoch": 0.497, + "grad_norm": 1.2104344078148617, + "k1_kl": 0.06494140625, + "k3_kl": 0.0380859375, + "kimi_kl": 0.08154296875, + "learning_rate": 2.515e-07, + "loss": 0.0016, + "ppl": 0.05859375, + "reward": 0.9831761717796326, + "reward_std": 0.0016077548498287797, + "rewards/perpo_ocr_edit_distance_reward": 0.9831762909889221, + "step": 2485, + "temperature": 0.9 + }, + { + "advantages": 5.236694050836377e-06, + "completion_length": 249.0, + "delta_ref_entropy_loss": 0.04736328125, + "delta_ref_ppl": -0.12890625, + "entropy_loss": -0.0498046875, + "epoch": 0.4972, + "grad_norm": 1.005336579161489, + "k1_kl": 0.12890625, + "k3_kl": 0.1025390625, + "kimi_kl": 0.41796875, + "learning_rate": 2.514e-07, + "loss": 0.0041, + "ppl": 0.01611328125, + "reward": 0.9947439432144165, + "reward_std": 0.0015267283888533711, + "rewards/perpo_ocr_edit_distance_reward": 0.9947439432144165, + "step": 2486, + "temperature": 0.9 + }, + { + "advantages": -2.9904502298450097e-05, + "completion_length": 544.0, + "delta_ref_entropy_loss": 0.057861328125, + "delta_ref_ppl": -0.07470703125, + "entropy_loss": -0.076171875, + "epoch": 0.4974, + "grad_norm": 0.9506349100861914, + "k1_kl": 0.07421875, + "k3_kl": 0.047119140625, + "kimi_kl": 0.1748046875, + "learning_rate": 2.5130000000000003e-07, + "loss": 0.0019, + "ppl": 0.034423828125, + "reward": 0.9930678009986877, + "reward_std": 0.0016076620668172836, + "rewards/perpo_ocr_edit_distance_reward": 0.9930679202079773, + "step": 2487, + "temperature": 0.9 + }, + { + "advantages": -6.287438736762851e-05, + "completion_length": 234.0, + "delta_ref_entropy_loss": 0.08544921875, + "delta_ref_ppl": -0.1611328125, + "entropy_loss": -0.06298828125, + "epoch": 0.4976, + "grad_norm": 0.9494848391747065, + "k1_kl": 0.16015625, + "k3_kl": 0.1220703125, + "kimi_kl": 0.5703125, + "learning_rate": 2.5119999999999997e-07, + "loss": 0.005, + "ppl": 0.024169921875, + "reward": 0.9914836287498474, + "reward_std": 0.0011184696340933442, + "rewards/perpo_ocr_edit_distance_reward": 0.9914836883544922, + "step": 2488, + "temperature": 0.9 + }, + { + "advantages": -0.0001920206268550828, + "completion_length": 256.0, + "delta_ref_entropy_loss": 0.04345703125, + "delta_ref_ppl": -0.08544921875, + "entropy_loss": -0.037841796875, + "epoch": 0.4978, + "grad_norm": 0.41624671951726444, + "k1_kl": 0.08544921875, + "k3_kl": 0.058349609375, + "kimi_kl": 0.1982421875, + "learning_rate": 2.5109999999999997e-07, + "loss": 0.0025, + "ppl": 0.01214599609375, + "reward": 0.9923778772354126, + "reward_std": 0.0004321117012295872, + "rewards/perpo_ocr_edit_distance_reward": 0.9923779964447021, + "step": 2489, + "temperature": 0.9 + }, + { + "advantages": -1.1043889571737964e-05, + "completion_length": 503.0, + "delta_ref_entropy_loss": 0.06201171875, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.041748046875, + "epoch": 0.498, + "grad_norm": 1.0358749871644366, + "k1_kl": 0.0703125, + "k3_kl": 0.04833984375, + "kimi_kl": 0.1298828125, + "learning_rate": 2.51e-07, + "loss": 0.0019, + "ppl": 0.015869140625, + "reward": 0.9734965562820435, + "reward_std": 0.003760389983654022, + "rewards/perpo_ocr_edit_distance_reward": 0.9734966158866882, + "step": 2490, + "temperature": 0.9 + }, + { + "advantages": -1.021793991640152e-07, + "completion_length": 20.0, + "delta_ref_entropy_loss": -0.2412109375, + "delta_ref_ppl": -1.5078125, + "entropy_loss": -0.8046875, + "epoch": 0.4982, + "grad_norm": 15.80658118831105, + "k1_kl": 1.5078125, + "k3_kl": 1.3671875, + "kimi_kl": 6.65625, + "learning_rate": 2.509e-07, + "loss": 0.0548, + "ppl": 0.294921875, + "reward": 0.1950201392173767, + "reward_std": 0.1384500116109848, + "rewards/perpo_ocr_edit_distance_reward": 0.1950201690196991, + "step": 2491, + "temperature": 0.9 + }, + { + "advantages": -1.565047750773374e-05, + "completion_length": 781.0, + "delta_ref_entropy_loss": 0.048095703125, + "delta_ref_ppl": -0.05908203125, + "entropy_loss": -0.06787109375, + "epoch": 0.4984, + "grad_norm": 0.9726659637856035, + "k1_kl": 0.05908203125, + "k3_kl": 0.032958984375, + "kimi_kl": 0.07373046875, + "learning_rate": 2.508e-07, + "loss": 0.0013, + "ppl": 0.0294189453125, + "reward": 0.9797570109367371, + "reward_std": 0.000989266554825008, + "rewards/perpo_ocr_edit_distance_reward": 0.9797570705413818, + "step": 2492, + "temperature": 0.9 + }, + { + "advantages": -8.746555977268144e-05, + "completion_length": 626.0, + "delta_ref_entropy_loss": 0.03759765625, + "delta_ref_ppl": -0.0498046875, + "entropy_loss": -0.034912109375, + "epoch": 0.4986, + "grad_norm": 0.42829421171776766, + "k1_kl": 0.0498046875, + "k3_kl": 0.03125, + "kimi_kl": 0.08251953125, + "learning_rate": 2.507e-07, + "loss": 0.0013, + "ppl": 0.0128173828125, + "reward": 0.9937301874160767, + "reward_std": 0.0005816365592181683, + "rewards/perpo_ocr_edit_distance_reward": 0.9937303066253662, + "step": 2493, + "temperature": 0.9 + }, + { + "advantages": -1.4015607121109497e-05, + "completion_length": 124.0, + "delta_ref_entropy_loss": 0.051513671875, + "delta_ref_ppl": -0.158203125, + "entropy_loss": -0.08154296875, + "epoch": 0.4988, + "grad_norm": 2.4302452577067992, + "k1_kl": 0.158203125, + "k3_kl": 0.11376953125, + "kimi_kl": 0.361328125, + "learning_rate": 2.506e-07, + "loss": 0.0046, + "ppl": 0.039306640625, + "reward": 0.9869914650917053, + "reward_std": 0.002935120603069663, + "rewards/perpo_ocr_edit_distance_reward": 0.9869915246963501, + "step": 2494, + "temperature": 0.9 + }, + { + "advantages": -0.00014535018999595195, + "completion_length": 514.0, + "delta_ref_entropy_loss": 0.03857421875, + "delta_ref_ppl": -0.064453125, + "entropy_loss": -0.03076171875, + "epoch": 0.499, + "grad_norm": 0.38538296549848444, + "k1_kl": 0.064453125, + "k3_kl": 0.046630859375, + "kimi_kl": 0.17578125, + "learning_rate": 2.5049999999999997e-07, + "loss": 0.002, + "ppl": 0.01361083984375, + "reward": 0.9983533024787903, + "reward_std": 0.0005443591508083045, + "rewards/perpo_ocr_edit_distance_reward": 0.9983534812927246, + "step": 2495, + "temperature": 0.9 + }, + { + "advantages": -3.0824117857264355e-05, + "completion_length": 451.0, + "delta_ref_entropy_loss": 0.0771484375, + "delta_ref_ppl": -0.08642578125, + "entropy_loss": -0.06396484375, + "epoch": 0.4992, + "grad_norm": 1.8463530326484012, + "k1_kl": 0.08642578125, + "k3_kl": 0.0595703125, + "kimi_kl": 0.1650390625, + "learning_rate": 2.504e-07, + "loss": 0.0024, + "ppl": 0.033935546875, + "reward": 0.9850547909736633, + "reward_std": 0.0018346053548157215, + "rewards/perpo_ocr_edit_distance_reward": 0.9850547909736633, + "step": 2496, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 416.0, + "delta_ref_entropy_loss": 0.04638671875, + "delta_ref_ppl": -0.0625, + "entropy_loss": -0.03173828125, + "epoch": 0.4994, + "grad_norm": 0.0183552478127436, + "k1_kl": 0.06298828125, + "k3_kl": 0.0361328125, + "kimi_kl": 0.1083984375, + "learning_rate": 2.503e-07, + "loss": 0.0015, + "ppl": 0.01092529296875, + "reward": 0.9952051639556885, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9952051043510437, + "step": 2497, + "temperature": 0.9 + }, + { + "advantages": -5.653926564264111e-06, + "completion_length": 91.0, + "delta_ref_entropy_loss": 0.061767578125, + "delta_ref_ppl": -0.30859375, + "entropy_loss": -0.05322265625, + "epoch": 0.4996, + "grad_norm": 5.214517485925464, + "k1_kl": 0.30859375, + "k3_kl": 0.2421875, + "kimi_kl": 0.9296875, + "learning_rate": 2.5019999999999995e-07, + "loss": 0.0097, + "ppl": 0.0224609375, + "reward": 0.977344274520874, + "reward_std": 0.005925910081714392, + "rewards/perpo_ocr_edit_distance_reward": 0.977344274520874, + "step": 2498, + "temperature": 0.9 + }, + { + "advantages": -1.7029899268550253e-08, + "completion_length": 748.0, + "delta_ref_entropy_loss": 0.050537109375, + "delta_ref_ppl": -0.06005859375, + "entropy_loss": -0.06787109375, + "epoch": 0.4998, + "grad_norm": 0.7573871318239155, + "k1_kl": 0.06005859375, + "k3_kl": 0.03271484375, + "kimi_kl": 0.07080078125, + "learning_rate": 2.501e-07, + "loss": 0.0013, + "ppl": 0.031494140625, + "reward": 0.9393699765205383, + "reward_std": 0.0011823814129456878, + "rewards/perpo_ocr_edit_distance_reward": 0.9393699765205383, + "step": 2499, + "temperature": 0.9 + }, + { + "advantages": -5.892345143365674e-06, + "completion_length": 456.0, + "delta_ref_entropy_loss": 0.051025390625, + "delta_ref_ppl": -0.1494140625, + "entropy_loss": -0.462890625, + "epoch": 0.5, + "grad_norm": 2.816018221783589, + "k1_kl": 0.1484375, + "k3_kl": 0.11474609375, + "kimi_kl": 0.302734375, + "learning_rate": 2.5e-07, + "loss": 0.0046, + "ppl": 0.2392578125, + "reward": 0.64509117603302, + "reward_std": 0.010042575187981129, + "rewards/perpo_ocr_edit_distance_reward": 0.6450912356376648, + "step": 2500, + "temperature": 0.9 + }, + { + "advantages": -0.00017068642773665488, + "completion_length": 657.0, + "delta_ref_entropy_loss": 0.060546875, + "delta_ref_ppl": -0.09716796875, + "entropy_loss": -0.068359375, + "epoch": 0.5002, + "grad_norm": 1.7299502476354258, + "k1_kl": 0.09716796875, + "k3_kl": 0.06640625, + "kimi_kl": 0.2099609375, + "learning_rate": 2.499e-07, + "loss": 0.0028, + "ppl": 0.03271484375, + "reward": 0.903096079826355, + "reward_std": 0.0005983354058116674, + "rewards/perpo_ocr_edit_distance_reward": 0.9030962586402893, + "step": 2501, + "temperature": 0.9 + }, + { + "advantages": 4.564013124763733e-06, + "completion_length": 358.0, + "delta_ref_entropy_loss": 0.0189208984375, + "delta_ref_ppl": -0.05029296875, + "entropy_loss": -0.032470703125, + "epoch": 0.5004, + "grad_norm": 0.6479942983661404, + "k1_kl": 0.05029296875, + "k3_kl": 0.037841796875, + "kimi_kl": 0.126953125, + "learning_rate": 2.498e-07, + "loss": 0.0015, + "ppl": 0.0076904296875, + "reward": 0.9266269207000732, + "reward_std": 0.0017711303662508726, + "rewards/perpo_ocr_edit_distance_reward": 0.9266269207000732, + "step": 2502, + "temperature": 0.9 + }, + { + "advantages": -2.1151134205865674e-05, + "completion_length": 585.0, + "delta_ref_entropy_loss": 0.08984375, + "delta_ref_ppl": -0.115234375, + "entropy_loss": -0.13671875, + "epoch": 0.5006, + "grad_norm": 1.228033164252031, + "k1_kl": 0.115234375, + "k3_kl": 0.07568359375, + "kimi_kl": 0.16796875, + "learning_rate": 2.497e-07, + "loss": 0.003, + "ppl": 0.0703125, + "reward": 0.9574329257011414, + "reward_std": 0.0031213134061545134, + "rewards/perpo_ocr_edit_distance_reward": 0.9574329853057861, + "step": 2503, + "temperature": 0.9 + }, + { + "advantages": -4.810095197171904e-05, + "completion_length": 518.0, + "delta_ref_entropy_loss": 0.07373046875, + "delta_ref_ppl": -0.09228515625, + "entropy_loss": -0.0751953125, + "epoch": 0.5008, + "grad_norm": 0.8066905030550344, + "k1_kl": 0.09228515625, + "k3_kl": 0.05859375, + "kimi_kl": 0.203125, + "learning_rate": 2.4959999999999996e-07, + "loss": 0.0024, + "ppl": 0.03173828125, + "reward": 0.9748733639717102, + "reward_std": 0.001139609725214541, + "rewards/perpo_ocr_edit_distance_reward": 0.974873423576355, + "step": 2504, + "temperature": 0.9 + }, + { + "advantages": -2.3569380573462695e-05, + "completion_length": 578.0, + "delta_ref_entropy_loss": 0.017822265625, + "delta_ref_ppl": -0.0478515625, + "entropy_loss": -0.034912109375, + "epoch": 0.501, + "grad_norm": 0.44396432345044073, + "k1_kl": 0.0478515625, + "k3_kl": 0.03564453125, + "kimi_kl": 0.1123046875, + "learning_rate": 2.495e-07, + "loss": 0.0014, + "ppl": 0.01129150390625, + "reward": 0.9911385178565979, + "reward_std": 0.0013461792841553688, + "rewards/perpo_ocr_edit_distance_reward": 0.9911385774612427, + "step": 2505, + "temperature": 0.9 + }, + { + "advantages": -6.845167808933184e-05, + "completion_length": 553.0, + "delta_ref_entropy_loss": 0.10791015625, + "delta_ref_ppl": -0.1044921875, + "entropy_loss": -0.13671875, + "epoch": 0.5012, + "grad_norm": 1.777218491736213, + "k1_kl": 0.1044921875, + "k3_kl": 0.057373046875, + "kimi_kl": 0.125, + "learning_rate": 2.494e-07, + "loss": 0.0024, + "ppl": 0.064453125, + "reward": 0.9609473943710327, + "reward_std": 0.0013926412211731076, + "rewards/perpo_ocr_edit_distance_reward": 0.9609475135803223, + "step": 2506, + "temperature": 0.9 + }, + { + "advantages": -1.3479165318130981e-05, + "completion_length": 700.0, + "delta_ref_entropy_loss": 0.038818359375, + "delta_ref_ppl": -0.058837890625, + "entropy_loss": -0.08447265625, + "epoch": 0.5014, + "grad_norm": 9.321509437105629, + "k1_kl": 0.05908203125, + "k3_kl": 0.068359375, + "kimi_kl": 0.10986328125, + "learning_rate": 2.493e-07, + "loss": 0.0028, + "ppl": 0.037353515625, + "reward": 0.6363636255264282, + "reward_std": 0.004316279198974371, + "rewards/perpo_ocr_edit_distance_reward": 0.636363685131073, + "step": 2507, + "temperature": 0.9 + }, + { + "advantages": -2.384185791015625e-07, + "completion_length": 604.0, + "delta_ref_entropy_loss": -0.0118408203125, + "delta_ref_ppl": -0.1015625, + "entropy_loss": -0.220703125, + "epoch": 0.5016, + "grad_norm": 3.865154565822785, + "k1_kl": 0.1015625, + "k3_kl": 0.0751953125, + "kimi_kl": 0.2177734375, + "learning_rate": 2.492e-07, + "loss": 0.003, + "ppl": 0.0810546875, + "reward": 0.7793070673942566, + "reward_std": 0.28981953859329224, + "rewards/perpo_ocr_edit_distance_reward": 0.7793071269989014, + "step": 2508, + "temperature": 0.9 + }, + { + "advantages": -9.247235539078247e-06, + "completion_length": 1058.0, + "delta_ref_entropy_loss": 0.02294921875, + "delta_ref_ppl": -0.045654296875, + "entropy_loss": -0.03564453125, + "epoch": 0.5018, + "grad_norm": 0.4586896383812357, + "k1_kl": 0.045654296875, + "k3_kl": 0.0299072265625, + "kimi_kl": 0.08349609375, + "learning_rate": 2.491e-07, + "loss": 0.0012, + "ppl": 0.0125732421875, + "reward": 0.9937047958374023, + "reward_std": 0.0008216738351620734, + "rewards/perpo_ocr_edit_distance_reward": 0.9937048554420471, + "step": 2509, + "temperature": 0.9 + }, + { + "advantages": -3.549030952854082e-05, + "completion_length": 242.0, + "delta_ref_entropy_loss": 0.08544921875, + "delta_ref_ppl": -0.15625, + "entropy_loss": -0.1337890625, + "epoch": 0.502, + "grad_norm": 1.0192456013306348, + "k1_kl": 0.15625, + "k3_kl": 0.10888671875, + "kimi_kl": 0.37890625, + "learning_rate": 2.4899999999999997e-07, + "loss": 0.0044, + "ppl": 0.045166015625, + "reward": 0.8965185880661011, + "reward_std": 0.002540938090533018, + "rewards/perpo_ocr_edit_distance_reward": 0.8965187072753906, + "step": 2510, + "temperature": 0.9 + }, + { + "advantages": -2.1815301806782372e-05, + "completion_length": 460.0, + "delta_ref_entropy_loss": 0.1083984375, + "delta_ref_ppl": -0.0966796875, + "entropy_loss": -0.095703125, + "epoch": 0.5022, + "grad_norm": 0.8239404143967141, + "k1_kl": 0.0966796875, + "k3_kl": 0.0556640625, + "kimi_kl": 0.1572265625, + "learning_rate": 2.489e-07, + "loss": 0.0022, + "ppl": 0.03662109375, + "reward": 0.9923710227012634, + "reward_std": 0.0010715239914134145, + "rewards/perpo_ocr_edit_distance_reward": 0.9923710823059082, + "step": 2511, + "temperature": 0.9 + }, + { + "advantages": -6.772790948161855e-05, + "completion_length": 371.0, + "delta_ref_entropy_loss": 0.036376953125, + "delta_ref_ppl": -0.10498046875, + "entropy_loss": -0.054931640625, + "epoch": 0.5024, + "grad_norm": 0.4497025780695998, + "k1_kl": 0.10498046875, + "k3_kl": 0.08251953125, + "kimi_kl": 0.31640625, + "learning_rate": 2.488e-07, + "loss": 0.0034, + "ppl": 0.0242919921875, + "reward": 0.9824324250221252, + "reward_std": 0.0007802002946846187, + "rewards/perpo_ocr_edit_distance_reward": 0.98243248462677, + "step": 2512, + "temperature": 0.9 + }, + { + "advantages": -2.646446409926284e-05, + "completion_length": 521.0, + "delta_ref_entropy_loss": 0.054931640625, + "delta_ref_ppl": -0.08837890625, + "entropy_loss": -0.1044921875, + "epoch": 0.5026, + "grad_norm": 0.9051478009622445, + "k1_kl": 0.087890625, + "k3_kl": 0.0634765625, + "kimi_kl": 0.216796875, + "learning_rate": 2.487e-07, + "loss": 0.0025, + "ppl": 0.04345703125, + "reward": 0.991378128528595, + "reward_std": 0.0024737350177019835, + "rewards/perpo_ocr_edit_distance_reward": 0.9913783073425293, + "step": 2513, + "temperature": 0.9 + }, + { + "advantages": -3.9662634662818164e-05, + "completion_length": 1668.0, + "delta_ref_entropy_loss": 0.01202392578125, + "delta_ref_ppl": -0.0155029296875, + "entropy_loss": -0.0223388671875, + "epoch": 0.5028, + "grad_norm": 0.33414497706736507, + "k1_kl": 0.0155029296875, + "k3_kl": 0.0079345703125, + "kimi_kl": 0.017822265625, + "learning_rate": 2.486e-07, + "loss": 0.0004, + "ppl": 0.007476806640625, + "reward": 0.9952613711357117, + "reward_std": 0.0009730243473313749, + "rewards/perpo_ocr_edit_distance_reward": 0.9952613711357117, + "step": 2514, + "temperature": 0.9 + }, + { + "advantages": -1.139300275099231e-05, + "completion_length": 577.0, + "delta_ref_entropy_loss": 0.06494140625, + "delta_ref_ppl": -0.054443359375, + "entropy_loss": -0.05419921875, + "epoch": 0.503, + "grad_norm": 0.823833147129994, + "k1_kl": 0.054443359375, + "k3_kl": 0.02734375, + "kimi_kl": 0.0595703125, + "learning_rate": 2.485e-07, + "loss": 0.0011, + "ppl": 0.0228271484375, + "reward": 0.9810044765472412, + "reward_std": 0.002139804884791374, + "rewards/perpo_ocr_edit_distance_reward": 0.981004536151886, + "step": 2515, + "temperature": 0.9 + }, + { + "advantages": -2.760120878519956e-05, + "completion_length": 399.0, + "delta_ref_entropy_loss": 0.054931640625, + "delta_ref_ppl": -0.09423828125, + "entropy_loss": -0.06884765625, + "epoch": 0.5032, + "grad_norm": 0.6895055957385228, + "k1_kl": 0.09423828125, + "k3_kl": 0.064453125, + "kimi_kl": 0.212890625, + "learning_rate": 2.484e-07, + "loss": 0.0026, + "ppl": 0.031982421875, + "reward": 0.9903199672698975, + "reward_std": 0.0023684969637542963, + "rewards/perpo_ocr_edit_distance_reward": 0.990320086479187, + "step": 2516, + "temperature": 0.9 + }, + { + "advantages": -6.304468843154609e-05, + "completion_length": 472.0, + "delta_ref_entropy_loss": 0.05224609375, + "delta_ref_ppl": -0.0810546875, + "entropy_loss": -0.06787109375, + "epoch": 0.5034, + "grad_norm": 0.5416915744986199, + "k1_kl": 0.0810546875, + "k3_kl": 0.052978515625, + "kimi_kl": 0.1630859375, + "learning_rate": 2.4829999999999997e-07, + "loss": 0.0022, + "ppl": 0.0262451171875, + "reward": 0.9940630793571472, + "reward_std": 0.0009805900044739246, + "rewards/perpo_ocr_edit_distance_reward": 0.994063138961792, + "step": 2517, + "temperature": 0.9 + }, + { + "advantages": -3.545625077094883e-05, + "completion_length": 687.0, + "delta_ref_entropy_loss": 0.054931640625, + "delta_ref_ppl": -0.08349609375, + "entropy_loss": -0.0341796875, + "epoch": 0.5036, + "grad_norm": 0.3901658871876563, + "k1_kl": 0.08349609375, + "k3_kl": 0.055419921875, + "kimi_kl": 0.2138671875, + "learning_rate": 2.482e-07, + "loss": 0.0023, + "ppl": 0.01263427734375, + "reward": 0.9962351322174072, + "reward_std": 0.0006203069351613522, + "rewards/perpo_ocr_edit_distance_reward": 0.996235191822052, + "step": 2518, + "temperature": 0.9 + }, + { + "advantages": -1.0507447768759448e-05, + "completion_length": 436.0, + "delta_ref_entropy_loss": 0.004547119140625, + "delta_ref_ppl": -0.052734375, + "entropy_loss": -0.051513671875, + "epoch": 0.5038, + "grad_norm": 0.8279148393271177, + "k1_kl": 0.052734375, + "k3_kl": 0.040283203125, + "kimi_kl": 0.1591796875, + "learning_rate": 2.4809999999999995e-07, + "loss": 0.0016, + "ppl": 0.01708984375, + "reward": 0.8747678399085999, + "reward_std": 0.0023338927421718836, + "rewards/perpo_ocr_edit_distance_reward": 0.8747678399085999, + "step": 2519, + "temperature": 0.9 + }, + { + "advantages": -0.00011435577471274883, + "completion_length": 459.0, + "delta_ref_entropy_loss": 0.042236328125, + "delta_ref_ppl": -0.07568359375, + "entropy_loss": -0.046630859375, + "epoch": 0.504, + "grad_norm": 0.7738415022877468, + "k1_kl": 0.076171875, + "k3_kl": 0.05126953125, + "kimi_kl": 0.169921875, + "learning_rate": 2.48e-07, + "loss": 0.0022, + "ppl": 0.02099609375, + "reward": 0.9957757592201233, + "reward_std": 0.0005700716283172369, + "rewards/perpo_ocr_edit_distance_reward": 0.9957758188247681, + "step": 2520, + "temperature": 0.9 + }, + { + "advantages": -1.0085957910632715e-05, + "completion_length": 784.0, + "delta_ref_entropy_loss": 0.056640625, + "delta_ref_ppl": -0.06787109375, + "entropy_loss": -0.04443359375, + "epoch": 0.5042, + "grad_norm": 0.527933612328943, + "k1_kl": 0.06787109375, + "k3_kl": 0.036376953125, + "kimi_kl": 0.1015625, + "learning_rate": 2.479e-07, + "loss": 0.0015, + "ppl": 0.01953125, + "reward": 0.9705473184585571, + "reward_std": 0.0007430998957715929, + "rewards/perpo_ocr_edit_distance_reward": 0.9705473184585571, + "step": 2521, + "temperature": 0.9 + }, + { + "advantages": -1.4867102436255664e-05, + "completion_length": 357.0, + "delta_ref_entropy_loss": 0.0673828125, + "delta_ref_ppl": -0.1064453125, + "entropy_loss": -0.07568359375, + "epoch": 0.5044, + "grad_norm": 0.9066913401596777, + "k1_kl": 0.1064453125, + "k3_kl": 0.07421875, + "kimi_kl": 0.255859375, + "learning_rate": 2.478e-07, + "loss": 0.003, + "ppl": 0.0257568359375, + "reward": 0.9748441576957703, + "reward_std": 0.00391375320032239, + "rewards/perpo_ocr_edit_distance_reward": 0.9748442769050598, + "step": 2522, + "temperature": 0.9 + }, + { + "advantages": -4.5090917410561815e-05, + "completion_length": 1036.0, + "delta_ref_entropy_loss": 0.031982421875, + "delta_ref_ppl": -0.0380859375, + "entropy_loss": -0.046630859375, + "epoch": 0.5046, + "grad_norm": 0.33545568775784973, + "k1_kl": 0.0380859375, + "k3_kl": 0.0223388671875, + "kimi_kl": 0.0537109375, + "learning_rate": 2.477e-07, + "loss": 0.0009, + "ppl": 0.020751953125, + "reward": 0.9920611381530762, + "reward_std": 0.00046656609629280865, + "rewards/perpo_ocr_edit_distance_reward": 0.992061197757721, + "step": 2523, + "temperature": 0.9 + }, + { + "advantages": -1.6689301673977752e-06, + "completion_length": 424.0, + "delta_ref_entropy_loss": 0.06787109375, + "delta_ref_ppl": -0.076171875, + "entropy_loss": -0.099609375, + "epoch": 0.5048, + "grad_norm": 1.1456091702321431, + "k1_kl": 0.07666015625, + "k3_kl": 0.047119140625, + "kimi_kl": 0.1328125, + "learning_rate": 2.4759999999999997e-07, + "loss": 0.0019, + "ppl": 0.0400390625, + "reward": 0.9678341150283813, + "reward_std": 0.004969414323568344, + "rewards/perpo_ocr_edit_distance_reward": 0.9678341746330261, + "step": 2524, + "temperature": 0.9 + }, + { + "advantages": -4.83649137095199e-06, + "completion_length": 196.0, + "delta_ref_entropy_loss": 0.053955078125, + "delta_ref_ppl": -0.1708984375, + "entropy_loss": -0.053466796875, + "epoch": 0.505, + "grad_norm": 1.1878966164290876, + "k1_kl": 0.1708984375, + "k3_kl": 0.1279296875, + "kimi_kl": 0.453125, + "learning_rate": 2.475e-07, + "loss": 0.0051, + "ppl": 0.024169921875, + "reward": 0.9861992001533508, + "reward_std": 0.0016611118335276842, + "rewards/perpo_ocr_edit_distance_reward": 0.9861992001533508, + "step": 2525, + "temperature": 0.9 + }, + { + "advantages": -4.9812453653430566e-05, + "completion_length": 616.0, + "delta_ref_entropy_loss": 0.048095703125, + "delta_ref_ppl": -0.0693359375, + "entropy_loss": -0.054931640625, + "epoch": 0.5052, + "grad_norm": 0.4829186831672649, + "k1_kl": 0.0693359375, + "k3_kl": 0.0400390625, + "kimi_kl": 0.1005859375, + "learning_rate": 2.474e-07, + "loss": 0.0016, + "ppl": 0.0245361328125, + "reward": 0.9741430878639221, + "reward_std": 0.001438292907550931, + "rewards/perpo_ocr_edit_distance_reward": 0.9741431474685669, + "step": 2526, + "temperature": 0.9 + }, + { + "advantages": -8.804458047961816e-06, + "completion_length": 448.0, + "delta_ref_entropy_loss": 0.0576171875, + "delta_ref_ppl": -0.06982421875, + "entropy_loss": -0.0380859375, + "epoch": 0.5054, + "grad_norm": 0.6847193690848453, + "k1_kl": 0.0703125, + "k3_kl": 0.04833984375, + "kimi_kl": 0.125, + "learning_rate": 2.473e-07, + "loss": 0.0019, + "ppl": 0.0166015625, + "reward": 0.9954977035522461, + "reward_std": 0.000866461603436619, + "rewards/perpo_ocr_edit_distance_reward": 0.9954977631568909, + "step": 2527, + "temperature": 0.9 + }, + { + "advantages": -1.043932843458606e-05, + "completion_length": 556.0, + "delta_ref_entropy_loss": 0.05078125, + "delta_ref_ppl": -0.045654296875, + "entropy_loss": -0.03173828125, + "epoch": 0.5056, + "grad_norm": 0.6238789131152603, + "k1_kl": 0.045654296875, + "k3_kl": 0.0260009765625, + "kimi_kl": 0.06298828125, + "learning_rate": 2.472e-07, + "loss": 0.001, + "ppl": 0.01336669921875, + "reward": 0.9957893490791321, + "reward_std": 0.0007160123204812407, + "rewards/perpo_ocr_edit_distance_reward": 0.9957893490791321, + "step": 2528, + "temperature": 0.9 + }, + { + "advantages": -0.0001240713318111375, + "completion_length": 494.0, + "delta_ref_entropy_loss": 0.030517578125, + "delta_ref_ppl": -0.060302734375, + "entropy_loss": -0.029052734375, + "epoch": 0.5058, + "grad_norm": 0.3211213980157232, + "k1_kl": 0.060302734375, + "k3_kl": 0.0361328125, + "kimi_kl": 0.0986328125, + "learning_rate": 2.471e-07, + "loss": 0.0016, + "ppl": 0.00933837890625, + "reward": 0.9920853972434998, + "reward_std": 0.00017451091844122857, + "rewards/perpo_ocr_edit_distance_reward": 0.9920855164527893, + "step": 2529, + "temperature": 0.9 + }, + { + "advantages": -4.572527905111201e-05, + "completion_length": 492.0, + "delta_ref_entropy_loss": 0.058349609375, + "delta_ref_ppl": -0.08251953125, + "entropy_loss": -0.053466796875, + "epoch": 0.506, + "grad_norm": 0.5823638918467702, + "k1_kl": 0.08251953125, + "k3_kl": 0.04638671875, + "kimi_kl": 0.11669921875, + "learning_rate": 2.47e-07, + "loss": 0.0019, + "ppl": 0.0191650390625, + "reward": 0.9683406352996826, + "reward_std": 0.0010177289368584752, + "rewards/perpo_ocr_edit_distance_reward": 0.9683407545089722, + "step": 2530, + "temperature": 0.9 + }, + { + "advantages": -5.2588329708669335e-05, + "completion_length": 576.0, + "delta_ref_entropy_loss": 0.050537109375, + "delta_ref_ppl": -0.07763671875, + "entropy_loss": -0.025634765625, + "epoch": 0.5062, + "grad_norm": 0.3554267648546877, + "k1_kl": 0.07763671875, + "k3_kl": 0.044189453125, + "kimi_kl": 0.14453125, + "learning_rate": 2.469e-07, + "loss": 0.0018, + "ppl": 0.0091552734375, + "reward": 0.9969308376312256, + "reward_std": 0.0008711755508556962, + "rewards/perpo_ocr_edit_distance_reward": 0.9969309568405151, + "step": 2531, + "temperature": 0.9 + }, + { + "advantages": -6.474767724284902e-05, + "completion_length": 868.0, + "delta_ref_entropy_loss": 0.04541015625, + "delta_ref_ppl": -0.0771484375, + "entropy_loss": -0.052734375, + "epoch": 0.5064, + "grad_norm": 0.5950138478392212, + "k1_kl": 0.0771484375, + "k3_kl": 0.052001953125, + "kimi_kl": 0.2216796875, + "learning_rate": 2.4679999999999996e-07, + "loss": 0.0021, + "ppl": 0.0234375, + "reward": 0.9639758467674255, + "reward_std": 0.0005574562819674611, + "rewards/perpo_ocr_edit_distance_reward": 0.9639759659767151, + "step": 2532, + "temperature": 0.9 + }, + { + "advantages": -1.4560563386112335e-06, + "completion_length": 425.0, + "delta_ref_entropy_loss": 0.0419921875, + "delta_ref_ppl": -0.06103515625, + "entropy_loss": -0.08642578125, + "epoch": 0.5066, + "grad_norm": 1.0245666512410532, + "k1_kl": 0.060791015625, + "k3_kl": 0.038818359375, + "kimi_kl": 0.09765625, + "learning_rate": 2.467e-07, + "loss": 0.0016, + "ppl": 0.03662109375, + "reward": 0.7240025401115417, + "reward_std": 0.03513907641172409, + "rewards/perpo_ocr_edit_distance_reward": 0.7240025401115417, + "step": 2533, + "temperature": 0.9 + }, + { + "advantages": -0.00016555687761865556, + "completion_length": 632.0, + "delta_ref_entropy_loss": 0.02783203125, + "delta_ref_ppl": -0.040771484375, + "entropy_loss": -0.03271484375, + "epoch": 0.5068, + "grad_norm": 0.4003926562813881, + "k1_kl": 0.040771484375, + "k3_kl": 0.025390625, + "kimi_kl": 0.060302734375, + "learning_rate": 2.466e-07, + "loss": 0.0012, + "ppl": 0.01019287109375, + "reward": 0.9962243437767029, + "reward_std": 0.00031141823274083436, + "rewards/perpo_ocr_edit_distance_reward": 0.9962244629859924, + "step": 2534, + "temperature": 0.9 + }, + { + "advantages": -3.7414687540149316e-05, + "completion_length": 331.0, + "delta_ref_entropy_loss": 0.050537109375, + "delta_ref_ppl": -0.11669921875, + "entropy_loss": -0.05078125, + "epoch": 0.507, + "grad_norm": 0.6944379093662881, + "k1_kl": 0.11669921875, + "k3_kl": 0.08154296875, + "kimi_kl": 0.3125, + "learning_rate": 2.465e-07, + "loss": 0.0033, + "ppl": 0.0228271484375, + "reward": 0.9932444095611572, + "reward_std": 0.0017210444202646613, + "rewards/perpo_ocr_edit_distance_reward": 0.993244469165802, + "step": 2535, + "temperature": 0.9 + }, + { + "advantages": -4.8305308155249804e-05, + "completion_length": 566.0, + "delta_ref_entropy_loss": 0.06689453125, + "delta_ref_ppl": -0.0693359375, + "entropy_loss": -0.05859375, + "epoch": 0.5072, + "grad_norm": 1.4059514454835633, + "k1_kl": 0.0693359375, + "k3_kl": 0.0390625, + "kimi_kl": 0.10546875, + "learning_rate": 2.464e-07, + "loss": 0.0016, + "ppl": 0.0262451171875, + "reward": 0.9815715551376343, + "reward_std": 0.0014861957170069218, + "rewards/perpo_ocr_edit_distance_reward": 0.9815716743469238, + "step": 2536, + "temperature": 0.9 + }, + { + "advantages": -0.00020950182806700468, + "completion_length": 1211.0, + "delta_ref_entropy_loss": 0.05224609375, + "delta_ref_ppl": -0.06201171875, + "entropy_loss": -0.049560546875, + "epoch": 0.5074, + "grad_norm": 8.250218160453736, + "k1_kl": 0.06201171875, + "k3_kl": 0.08740234375, + "kimi_kl": 0.11962890625, + "learning_rate": 2.463e-07, + "loss": 0.0037, + "ppl": 0.021728515625, + "reward": 0.9965075850486755, + "reward_std": 0.0004689996421802789, + "rewards/perpo_ocr_edit_distance_reward": 0.9965077042579651, + "step": 2537, + "temperature": 0.9 + }, + { + "advantages": -4.543577233562246e-05, + "completion_length": 282.0, + "delta_ref_entropy_loss": 0.031982421875, + "delta_ref_ppl": -0.1123046875, + "entropy_loss": -0.0703125, + "epoch": 0.5076, + "grad_norm": 1.3782423797416172, + "k1_kl": 0.1123046875, + "k3_kl": 0.0908203125, + "kimi_kl": 0.3359375, + "learning_rate": 2.4619999999999997e-07, + "loss": 0.0037, + "ppl": 0.0341796875, + "reward": 0.9919028282165527, + "reward_std": 0.0023374382872134447, + "rewards/perpo_ocr_edit_distance_reward": 0.9919029474258423, + "step": 2538, + "temperature": 0.9 + }, + { + "advantages": -1.857961979112588e-05, + "completion_length": 258.0, + "delta_ref_entropy_loss": 0.08984375, + "delta_ref_ppl": -0.1474609375, + "entropy_loss": -0.28515625, + "epoch": 0.5078, + "grad_norm": 1.9925111348066769, + "k1_kl": 0.1474609375, + "k3_kl": 0.1015625, + "kimi_kl": 0.291015625, + "learning_rate": 2.461e-07, + "loss": 0.0041, + "ppl": 0.1533203125, + "reward": 0.8445339202880859, + "reward_std": 0.002649118425324559, + "rewards/perpo_ocr_edit_distance_reward": 0.8445340394973755, + "step": 2539, + "temperature": 0.9 + }, + { + "advantages": -3.294433918199502e-05, + "completion_length": 268.0, + "delta_ref_entropy_loss": 0.03662109375, + "delta_ref_ppl": -0.119140625, + "entropy_loss": -0.049072265625, + "epoch": 0.508, + "grad_norm": 1.4120958610897199, + "k1_kl": 0.119140625, + "k3_kl": 0.0888671875, + "kimi_kl": 0.28125, + "learning_rate": 2.46e-07, + "loss": 0.0036, + "ppl": 0.0235595703125, + "reward": 0.9962027072906494, + "reward_std": 0.00170894933398813, + "rewards/perpo_ocr_edit_distance_reward": 0.9962027668952942, + "step": 2540, + "temperature": 0.9 + }, + { + "advantages": -3.405979782655777e-07, + "completion_length": 177.0, + "delta_ref_entropy_loss": -0.036376953125, + "delta_ref_ppl": -0.287109375, + "entropy_loss": -0.29296875, + "epoch": 0.5082, + "grad_norm": 3.0000763316869272, + "k1_kl": 0.287109375, + "k3_kl": 0.2421875, + "kimi_kl": 1.0859375, + "learning_rate": 2.459e-07, + "loss": 0.0097, + "ppl": 0.1376953125, + "reward": 0.8585125207901001, + "reward_std": 0.05228548124432564, + "rewards/perpo_ocr_edit_distance_reward": 0.8585126399993896, + "step": 2541, + "temperature": 0.9 + }, + { + "advantages": -2.238580236735288e-05, + "completion_length": 1043.0, + "delta_ref_entropy_loss": 0.04296875, + "delta_ref_ppl": -0.043701171875, + "entropy_loss": -0.032958984375, + "epoch": 0.5084, + "grad_norm": 1.3878564082100258, + "k1_kl": 0.043701171875, + "k3_kl": 0.0234375, + "kimi_kl": 0.076171875, + "learning_rate": 2.458e-07, + "loss": 0.001, + "ppl": 0.0115966796875, + "reward": 0.993017852306366, + "reward_std": 0.001041377312503755, + "rewards/perpo_ocr_edit_distance_reward": 0.9930179119110107, + "step": 2542, + "temperature": 0.9 + }, + { + "advantages": -4.51292316938634e-06, + "completion_length": 207.0, + "delta_ref_entropy_loss": 0.039306640625, + "delta_ref_ppl": -0.12255859375, + "entropy_loss": -0.1201171875, + "epoch": 0.5086, + "grad_norm": 1.1447396850594038, + "k1_kl": 0.123046875, + "k3_kl": 0.09130859375, + "kimi_kl": 0.271484375, + "learning_rate": 2.457e-07, + "loss": 0.0037, + "ppl": 0.053466796875, + "reward": 0.9500319361686707, + "reward_std": 0.0074041071347892284, + "rewards/perpo_ocr_edit_distance_reward": 0.9500320553779602, + "step": 2543, + "temperature": 0.9 + }, + { + "advantages": -8.383819658774883e-05, + "completion_length": 381.0, + "delta_ref_entropy_loss": 0.017578125, + "delta_ref_ppl": -0.055908203125, + "entropy_loss": -0.021484375, + "epoch": 0.5088, + "grad_norm": 0.31723269667944526, + "k1_kl": 0.055908203125, + "k3_kl": 0.041015625, + "kimi_kl": 0.1484375, + "learning_rate": 2.456e-07, + "loss": 0.0017, + "ppl": 0.006591796875, + "reward": 0.9870033860206604, + "reward_std": 0.0005092623759992421, + "rewards/perpo_ocr_edit_distance_reward": 0.9870034456253052, + "step": 2544, + "temperature": 0.9 + }, + { + "advantages": -0.0001520088844699785, + "completion_length": 809.0, + "delta_ref_entropy_loss": 0.04052734375, + "delta_ref_ppl": -0.056396484375, + "entropy_loss": -0.048583984375, + "epoch": 0.509, + "grad_norm": 0.5319814154307861, + "k1_kl": 0.056396484375, + "k3_kl": 0.03564453125, + "kimi_kl": 0.1337890625, + "learning_rate": 2.4549999999999997e-07, + "loss": 0.0016, + "ppl": 0.0191650390625, + "reward": 0.9777202010154724, + "reward_std": 0.0004602271073963493, + "rewards/perpo_ocr_edit_distance_reward": 0.977720320224762, + "step": 2545, + "temperature": 0.9 + }, + { + "advantages": -0.0002577645645942539, + "completion_length": 1009.0, + "delta_ref_entropy_loss": 0.0244140625, + "delta_ref_ppl": -0.0291748046875, + "entropy_loss": -0.0286865234375, + "epoch": 0.5092, + "grad_norm": 0.18340196540553746, + "k1_kl": 0.0291748046875, + "k3_kl": 0.01611328125, + "kimi_kl": 0.044189453125, + "learning_rate": 2.454e-07, + "loss": 0.0009, + "ppl": 0.00762939453125, + "reward": 0.9886527061462402, + "reward_std": 0.00016428015078417957, + "rewards/perpo_ocr_edit_distance_reward": 0.9886528253555298, + "step": 2546, + "temperature": 0.9 + }, + { + "advantages": -1.043932843458606e-05, + "completion_length": 646.0, + "delta_ref_entropy_loss": 0.060791015625, + "delta_ref_ppl": -0.0615234375, + "entropy_loss": -0.078125, + "epoch": 0.5094, + "grad_norm": 0.8480634291550038, + "k1_kl": 0.0615234375, + "k3_kl": 0.0390625, + "kimi_kl": 0.12060546875, + "learning_rate": 2.4529999999999995e-07, + "loss": 0.0016, + "ppl": 0.0277099609375, + "reward": 0.460853636264801, + "reward_std": 0.0023469277657568455, + "rewards/perpo_ocr_edit_distance_reward": 0.4608536660671234, + "step": 2547, + "temperature": 0.9 + }, + { + "advantages": -2.806527481880039e-05, + "completion_length": 839.0, + "delta_ref_entropy_loss": 0.03515625, + "delta_ref_ppl": -0.043212890625, + "entropy_loss": -0.0712890625, + "epoch": 0.5096, + "grad_norm": 0.8835534204079085, + "k1_kl": 0.043212890625, + "k3_kl": 0.028564453125, + "kimi_kl": 0.0595703125, + "learning_rate": 2.452e-07, + "loss": 0.0012, + "ppl": 0.034423828125, + "reward": 0.9895601868629456, + "reward_std": 0.0035424595698714256, + "rewards/perpo_ocr_edit_distance_reward": 0.9895603060722351, + "step": 2548, + "temperature": 0.9 + }, + { + "advantages": 5.790165573671402e-07, + "completion_length": 1049.0, + "delta_ref_entropy_loss": 0.062255859375, + "delta_ref_ppl": -0.08447265625, + "entropy_loss": -0.1904296875, + "epoch": 0.5098, + "grad_norm": 1.8563194985330718, + "k1_kl": 0.08447265625, + "k3_kl": 0.0615234375, + "kimi_kl": 0.146484375, + "learning_rate": 2.451e-07, + "loss": 0.0025, + "ppl": 0.09375, + "reward": 0.9602043032646179, + "reward_std": 0.013920527882874012, + "rewards/perpo_ocr_edit_distance_reward": 0.9602043032646179, + "step": 2549, + "temperature": 0.9 + }, + { + "advantages": -7.538710633525625e-05, + "completion_length": 465.0, + "delta_ref_entropy_loss": 0.045654296875, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.04150390625, + "epoch": 0.51, + "grad_norm": 0.6961152154702616, + "k1_kl": 0.06689453125, + "k3_kl": 0.03955078125, + "kimi_kl": 0.10888671875, + "learning_rate": 2.45e-07, + "loss": 0.0017, + "ppl": 0.015869140625, + "reward": 0.9845919013023376, + "reward_std": 0.0011423641117289662, + "rewards/perpo_ocr_edit_distance_reward": 0.9845919609069824, + "step": 2550, + "temperature": 0.9 + }, + { + "advantages": -4.32389133493416e-05, + "completion_length": 576.0, + "delta_ref_entropy_loss": 0.0196533203125, + "delta_ref_ppl": -0.039794921875, + "entropy_loss": -0.02880859375, + "epoch": 0.5102, + "grad_norm": 0.3158610117370478, + "k1_kl": 0.0400390625, + "k3_kl": 0.0257568359375, + "kimi_kl": 0.07275390625, + "learning_rate": 2.449e-07, + "loss": 0.0011, + "ppl": 0.00933837890625, + "reward": 0.7778953313827515, + "reward_std": 0.0006878097774460912, + "rewards/perpo_ocr_edit_distance_reward": 0.7778953909873962, + "step": 2551, + "temperature": 0.9 + }, + { + "advantages": -5.630084706353955e-05, + "completion_length": 419.0, + "delta_ref_entropy_loss": 0.0712890625, + "delta_ref_ppl": -0.09716796875, + "entropy_loss": -0.0673828125, + "epoch": 0.5104, + "grad_norm": 0.7780227576818545, + "k1_kl": 0.0966796875, + "k3_kl": 0.0654296875, + "kimi_kl": 0.28515625, + "learning_rate": 2.4479999999999997e-07, + "loss": 0.0027, + "ppl": 0.031494140625, + "reward": 0.9380702972412109, + "reward_std": 0.0006563261849805713, + "rewards/perpo_ocr_edit_distance_reward": 0.9380703568458557, + "step": 2552, + "temperature": 0.9 + }, + { + "advantages": -2.016340113186743e-05, + "completion_length": 769.0, + "delta_ref_entropy_loss": 0.02734375, + "delta_ref_ppl": -0.051025390625, + "entropy_loss": -0.02783203125, + "epoch": 0.5106, + "grad_norm": 0.5007759412657916, + "k1_kl": 0.051025390625, + "k3_kl": 0.0341796875, + "kimi_kl": 0.115234375, + "learning_rate": 2.447e-07, + "loss": 0.0014, + "ppl": 0.008544921875, + "reward": 0.9939115047454834, + "reward_std": 0.0007442060741595924, + "rewards/perpo_ocr_edit_distance_reward": 0.9939115643501282, + "step": 2553, + "temperature": 0.9 + }, + { + "advantages": -7.780960731906816e-05, + "completion_length": 493.0, + "delta_ref_entropy_loss": 0.037353515625, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.05322265625, + "epoch": 0.5108, + "grad_norm": 0.7099665587755591, + "k1_kl": 0.0703125, + "k3_kl": 0.046630859375, + "kimi_kl": 0.171875, + "learning_rate": 2.446e-07, + "loss": 0.0019, + "ppl": 0.02294921875, + "reward": 0.9952807426452637, + "reward_std": 0.0005567167536355555, + "rewards/perpo_ocr_edit_distance_reward": 0.9952808022499084, + "step": 2554, + "temperature": 0.9 + }, + { + "advantages": -9.961639443645254e-05, + "completion_length": 906.0, + "delta_ref_entropy_loss": 0.03466796875, + "delta_ref_ppl": -0.052001953125, + "entropy_loss": -0.037353515625, + "epoch": 0.511, + "grad_norm": 0.20030617127687997, + "k1_kl": 0.052001953125, + "k3_kl": 0.032958984375, + "kimi_kl": 0.0908203125, + "learning_rate": 2.445e-07, + "loss": 0.0014, + "ppl": 0.01190185546875, + "reward": 0.9664048552513123, + "reward_std": 0.00041277476702816784, + "rewards/perpo_ocr_edit_distance_reward": 0.966404914855957, + "step": 2555, + "temperature": 0.9 + }, + { + "advantages": -1.1461122085165698e-05, + "completion_length": 654.0, + "delta_ref_entropy_loss": 0.09130859375, + "delta_ref_ppl": -0.09326171875, + "entropy_loss": -0.2158203125, + "epoch": 0.5112, + "grad_norm": 1.6478503696233155, + "k1_kl": 0.09375, + "k3_kl": 0.05322265625, + "kimi_kl": 0.12158203125, + "learning_rate": 2.444e-07, + "loss": 0.0021, + "ppl": 0.10791015625, + "reward": 0.906921923160553, + "reward_std": 0.003617397276684642, + "rewards/perpo_ocr_edit_distance_reward": 0.9069219827651978, + "step": 2556, + "temperature": 0.9 + }, + { + "advantages": -0.00015766281285323203, + "completion_length": 512.0, + "delta_ref_entropy_loss": 0.03515625, + "delta_ref_ppl": -0.05322265625, + "entropy_loss": -0.0247802734375, + "epoch": 0.5114, + "grad_norm": 0.376389823401479, + "k1_kl": 0.05322265625, + "k3_kl": 0.032958984375, + "kimi_kl": 0.125, + "learning_rate": 2.443e-07, + "loss": 0.0015, + "ppl": 0.00860595703125, + "reward": 0.995876669883728, + "reward_std": 0.0004940983490087092, + "rewards/perpo_ocr_edit_distance_reward": 0.9958767890930176, + "step": 2557, + "temperature": 0.9 + }, + { + "advantages": -9.5367431640625e-07, + "completion_length": 308.0, + "delta_ref_entropy_loss": 0.06787109375, + "delta_ref_ppl": -0.146484375, + "entropy_loss": -0.125, + "epoch": 0.5116, + "grad_norm": 1.778152128762219, + "k1_kl": 0.146484375, + "k3_kl": 0.10595703125, + "kimi_kl": 0.361328125, + "learning_rate": 2.442e-07, + "loss": 0.0042, + "ppl": 0.0458984375, + "reward": 0.9008927941322327, + "reward_std": 0.05340002849698067, + "rewards/perpo_ocr_edit_distance_reward": 0.9008929133415222, + "step": 2558, + "temperature": 0.9 + }, + { + "advantages": -7.935932808322832e-05, + "completion_length": 334.0, + "delta_ref_entropy_loss": 0.039794921875, + "delta_ref_ppl": -0.103515625, + "entropy_loss": -0.034912109375, + "epoch": 0.5118, + "grad_norm": 0.8808750511412867, + "k1_kl": 0.103515625, + "k3_kl": 0.07373046875, + "kimi_kl": 0.373046875, + "learning_rate": 2.441e-07, + "loss": 0.003, + "ppl": 0.0106201171875, + "reward": 0.9984802603721619, + "reward_std": 0.0007581802783533931, + "rewards/perpo_ocr_edit_distance_reward": 0.9984803199768066, + "step": 2559, + "temperature": 0.9 + }, + { + "advantages": -0.0001271069049835205, + "completion_length": 821.0, + "delta_ref_entropy_loss": 0.06982421875, + "delta_ref_ppl": -0.0791015625, + "entropy_loss": -0.07177734375, + "epoch": 0.512, + "grad_norm": 1.250365114698152, + "k1_kl": 0.0791015625, + "k3_kl": 0.048828125, + "kimi_kl": 0.1767578125, + "learning_rate": 2.4399999999999996e-07, + "loss": 0.0021, + "ppl": 0.030517578125, + "reward": 0.9873999357223511, + "reward_std": 0.0006368652684614062, + "rewards/perpo_ocr_edit_distance_reward": 0.9874001145362854, + "step": 2560, + "temperature": 0.9 + }, + { + "advantages": -7.66345493730114e-08, + "completion_length": 840.0, + "delta_ref_entropy_loss": 0.056640625, + "delta_ref_ppl": -0.09033203125, + "entropy_loss": -0.1357421875, + "epoch": 0.5122, + "grad_norm": 1.3044438946667785, + "k1_kl": 0.09033203125, + "k3_kl": 0.059326171875, + "kimi_kl": 0.158203125, + "learning_rate": 2.439e-07, + "loss": 0.0024, + "ppl": 0.06640625, + "reward": 0.7935396432876587, + "reward_std": 0.08549045026302338, + "rewards/perpo_ocr_edit_distance_reward": 0.7935396432876587, + "step": 2561, + "temperature": 0.9 + }, + { + "advantages": -2.0555087758111767e-05, + "completion_length": 629.0, + "delta_ref_entropy_loss": 0.0849609375, + "delta_ref_ppl": -0.08544921875, + "entropy_loss": -0.1220703125, + "epoch": 0.5124, + "grad_norm": 1.3058028892055291, + "k1_kl": 0.0859375, + "k3_kl": 0.048095703125, + "kimi_kl": 0.1015625, + "learning_rate": 2.438e-07, + "loss": 0.0019, + "ppl": 0.052978515625, + "reward": 0.9496240019798279, + "reward_std": 0.003211145056411624, + "rewards/perpo_ocr_edit_distance_reward": 0.9496240615844727, + "step": 2562, + "temperature": 0.9 + }, + { + "advantages": -3.840668068733066e-05, + "completion_length": 658.0, + "delta_ref_entropy_loss": 0.052978515625, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.1015625, + "epoch": 0.5126, + "grad_norm": 1.1133023634668118, + "k1_kl": 0.06689453125, + "k3_kl": 0.039794921875, + "kimi_kl": 0.11572265625, + "learning_rate": 2.437e-07, + "loss": 0.0016, + "ppl": 0.056640625, + "reward": 0.9574572443962097, + "reward_std": 0.0007870227564126253, + "rewards/perpo_ocr_edit_distance_reward": 0.9574573040008545, + "step": 2563, + "temperature": 0.9 + }, + { + "advantages": -5.558559132623486e-05, + "completion_length": 774.0, + "delta_ref_entropy_loss": 0.0301513671875, + "delta_ref_ppl": -0.056884765625, + "entropy_loss": -0.0380859375, + "epoch": 0.5128, + "grad_norm": 0.6532146532112071, + "k1_kl": 0.056884765625, + "k3_kl": 0.036865234375, + "kimi_kl": 0.1181640625, + "learning_rate": 2.436e-07, + "loss": 0.0015, + "ppl": 0.0174560546875, + "reward": 0.9873015284538269, + "reward_std": 0.0008186835912056267, + "rewards/perpo_ocr_edit_distance_reward": 0.9873016476631165, + "step": 2564, + "temperature": 0.9 + }, + { + "advantages": -1.7608916095923632e-05, + "completion_length": 819.0, + "delta_ref_entropy_loss": 0.02587890625, + "delta_ref_ppl": -0.054443359375, + "entropy_loss": -0.040283203125, + "epoch": 0.513, + "grad_norm": 0.5773959692598437, + "k1_kl": 0.0546875, + "k3_kl": 0.037353515625, + "kimi_kl": 0.09765625, + "learning_rate": 2.435e-07, + "loss": 0.0015, + "ppl": 0.018798828125, + "reward": 0.984907865524292, + "reward_std": 0.0008665996720083058, + "rewards/perpo_ocr_edit_distance_reward": 0.984907865524292, + "step": 2565, + "temperature": 0.9 + }, + { + "advantages": -1.7029899268550253e-08, + "completion_length": 524.0, + "delta_ref_entropy_loss": 0.028564453125, + "delta_ref_ppl": -0.0634765625, + "entropy_loss": -0.027099609375, + "epoch": 0.5132, + "grad_norm": 0.7403541806701627, + "k1_kl": 0.0634765625, + "k3_kl": 0.049560546875, + "kimi_kl": 0.2158203125, + "learning_rate": 2.4339999999999997e-07, + "loss": 0.002, + "ppl": 0.01153564453125, + "reward": 0.9965558648109436, + "reward_std": 0.000675636634696275, + "rewards/perpo_ocr_edit_distance_reward": 0.9965559244155884, + "step": 2566, + "temperature": 0.9 + }, + { + "advantages": -5.032335138821509e-06, + "completion_length": 1339.0, + "delta_ref_entropy_loss": 0.05126953125, + "delta_ref_ppl": -0.060791015625, + "entropy_loss": -0.06689453125, + "epoch": 0.5134, + "grad_norm": 4.7187196962482405, + "k1_kl": 0.060791015625, + "k3_kl": 0.04443359375, + "kimi_kl": 0.08154296875, + "learning_rate": 2.4329999999999996e-07, + "loss": 0.0018, + "ppl": 0.039306640625, + "reward": 0.9645730257034302, + "reward_std": 0.011754239909350872, + "rewards/perpo_ocr_edit_distance_reward": 0.964573085308075, + "step": 2567, + "temperature": 0.9 + }, + { + "advantages": -4.7411238483618945e-05, + "completion_length": 422.0, + "delta_ref_entropy_loss": 0.07275390625, + "delta_ref_ppl": -0.08251953125, + "entropy_loss": -0.037109375, + "epoch": 0.5136, + "grad_norm": 0.42687424634296756, + "k1_kl": 0.08251953125, + "k3_kl": 0.0537109375, + "kimi_kl": 0.1796875, + "learning_rate": 2.432e-07, + "loss": 0.0022, + "ppl": 0.01397705078125, + "reward": 0.9968834519386292, + "reward_std": 0.0004386382643133402, + "rewards/perpo_ocr_edit_distance_reward": 0.9968834519386292, + "step": 2568, + "temperature": 0.9 + }, + { + "advantages": -2.3262842660187744e-05, + "completion_length": 1025.0, + "delta_ref_entropy_loss": 0.0179443359375, + "delta_ref_ppl": -0.046875, + "entropy_loss": -0.02587890625, + "epoch": 0.5138, + "grad_norm": 0.2738287368006397, + "k1_kl": 0.046875, + "k3_kl": 0.031494140625, + "kimi_kl": 0.103515625, + "learning_rate": 2.431e-07, + "loss": 0.0013, + "ppl": 0.00885009765625, + "reward": 0.9888408184051514, + "reward_std": 0.0006331752520054579, + "rewards/perpo_ocr_edit_distance_reward": 0.9888408780097961, + "step": 2569, + "temperature": 0.9 + }, + { + "advantages": -2.201965980930254e-05, + "completion_length": 237.0, + "delta_ref_entropy_loss": 0.0213623046875, + "delta_ref_ppl": -0.10205078125, + "entropy_loss": -0.0296630859375, + "epoch": 0.514, + "grad_norm": 0.7942388223217474, + "k1_kl": 0.10205078125, + "k3_kl": 0.08349609375, + "kimi_kl": 0.337890625, + "learning_rate": 2.43e-07, + "loss": 0.0034, + "ppl": 0.0107421875, + "reward": 0.9984472393989563, + "reward_std": 0.001060754875652492, + "rewards/perpo_ocr_edit_distance_reward": 0.9984472393989563, + "step": 2570, + "temperature": 0.9 + }, + { + "advantages": -1.151221204054309e-05, + "completion_length": 600.0, + "delta_ref_entropy_loss": 0.111328125, + "delta_ref_ppl": -0.09619140625, + "entropy_loss": -0.1923828125, + "epoch": 0.5142, + "grad_norm": 1.469733799496384, + "k1_kl": 0.09619140625, + "k3_kl": 0.05078125, + "kimi_kl": 0.10693359375, + "learning_rate": 2.429e-07, + "loss": 0.002, + "ppl": 0.0966796875, + "reward": 0.7872833013534546, + "reward_std": 0.0035946702118963003, + "rewards/perpo_ocr_edit_distance_reward": 0.7872833609580994, + "step": 2571, + "temperature": 0.9 + }, + { + "advantages": -5.183901157579385e-05, + "completion_length": 601.0, + "delta_ref_entropy_loss": 0.04541015625, + "delta_ref_ppl": -0.0537109375, + "entropy_loss": -0.11328125, + "epoch": 0.5144, + "grad_norm": 1.042124605784588, + "k1_kl": 0.0537109375, + "k3_kl": 0.03369140625, + "kimi_kl": 0.07373046875, + "learning_rate": 2.428e-07, + "loss": 0.0014, + "ppl": 0.0419921875, + "reward": 0.9164013862609863, + "reward_std": 0.0010501268552616239, + "rewards/perpo_ocr_edit_distance_reward": 0.9164014458656311, + "step": 2572, + "temperature": 0.9 + }, + { + "advantages": -4.129750777792651e-06, + "completion_length": 930.0, + "delta_ref_entropy_loss": 0.05126953125, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.0634765625, + "epoch": 0.5146, + "grad_norm": 0.859700132436087, + "k1_kl": 0.06689453125, + "k3_kl": 0.038330078125, + "kimi_kl": 0.10888671875, + "learning_rate": 2.4269999999999997e-07, + "loss": 0.0015, + "ppl": 0.023681640625, + "reward": 0.9755874872207642, + "reward_std": 0.008151778019964695, + "rewards/perpo_ocr_edit_distance_reward": 0.9755875468254089, + "step": 2573, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 1038.0, + "delta_ref_entropy_loss": 0.08056640625, + "delta_ref_ppl": -0.0712890625, + "entropy_loss": -0.0654296875, + "epoch": 0.5148, + "grad_norm": 1.0324907980945857, + "k1_kl": 0.0712890625, + "k3_kl": 0.03955078125, + "kimi_kl": 0.087890625, + "learning_rate": 2.426e-07, + "loss": 0.0016, + "ppl": 0.0322265625, + "reward": 0.994764506816864, + "reward_std": 0.0008190279477275908, + "rewards/perpo_ocr_edit_distance_reward": 0.9947645664215088, + "step": 2574, + "temperature": 0.9 + }, + { + "advantages": -2.6549612812232226e-05, + "completion_length": 762.0, + "delta_ref_entropy_loss": 0.08203125, + "delta_ref_ppl": -0.08935546875, + "entropy_loss": -0.09765625, + "epoch": 0.515, + "grad_norm": 1.1788645742866155, + "k1_kl": 0.0888671875, + "k3_kl": 0.052001953125, + "kimi_kl": 0.1142578125, + "learning_rate": 2.425e-07, + "loss": 0.0021, + "ppl": 0.04541015625, + "reward": 0.9304566383361816, + "reward_std": 0.0015033046947792172, + "rewards/perpo_ocr_edit_distance_reward": 0.9304566979408264, + "step": 2575, + "temperature": 0.9 + }, + { + "advantages": -1.7029899026965722e-05, + "completion_length": 277.0, + "delta_ref_entropy_loss": 0.053466796875, + "delta_ref_ppl": -0.10791015625, + "entropy_loss": -0.07421875, + "epoch": 0.5152, + "grad_norm": 1.1583013951616115, + "k1_kl": 0.10791015625, + "k3_kl": 0.080078125, + "kimi_kl": 0.29296875, + "learning_rate": 2.424e-07, + "loss": 0.0032, + "ppl": 0.02490234375, + "reward": 0.9876945614814758, + "reward_std": 0.0039000017568469048, + "rewards/perpo_ocr_edit_distance_reward": 0.9876946210861206, + "step": 2576, + "temperature": 0.9 + }, + { + "advantages": -6.578650209121406e-05, + "completion_length": 533.0, + "delta_ref_entropy_loss": 0.05126953125, + "delta_ref_ppl": -0.051025390625, + "entropy_loss": -0.0341796875, + "epoch": 0.5154, + "grad_norm": 0.45697360196096537, + "k1_kl": 0.051025390625, + "k3_kl": 0.02587890625, + "kimi_kl": 0.053955078125, + "learning_rate": 2.423e-07, + "loss": 0.0011, + "ppl": 0.01214599609375, + "reward": 0.9962524175643921, + "reward_std": 0.0005471344338729978, + "rewards/perpo_ocr_edit_distance_reward": 0.9962524175643921, + "step": 2577, + "temperature": 0.9 + }, + { + "advantages": -1.2772424042850616e-06, + "completion_length": 585.0, + "delta_ref_entropy_loss": 0.0771484375, + "delta_ref_ppl": -0.11669921875, + "entropy_loss": -0.12255859375, + "epoch": 0.5156, + "grad_norm": 4.088147773355292, + "k1_kl": 0.1171875, + "k3_kl": 0.0732421875, + "kimi_kl": 0.208984375, + "learning_rate": 2.422e-07, + "loss": 0.0029, + "ppl": 0.053466796875, + "reward": 0.8677262663841248, + "reward_std": 0.05256127193570137, + "rewards/perpo_ocr_edit_distance_reward": 0.8677263855934143, + "step": 2578, + "temperature": 0.9 + }, + { + "advantages": 2.0614692402887158e-05, + "completion_length": 123.0, + "delta_ref_entropy_loss": 0.047607421875, + "delta_ref_ppl": -0.271484375, + "entropy_loss": -0.09326171875, + "epoch": 0.5158, + "grad_norm": 0.7730183552754023, + "k1_kl": 0.26953125, + "k3_kl": 0.2177734375, + "kimi_kl": 1.015625, + "learning_rate": 2.421e-07, + "loss": 0.0086, + "ppl": 0.0220947265625, + "reward": 0.9829996824264526, + "reward_std": 0.0007254551164805889, + "rewards/perpo_ocr_edit_distance_reward": 0.9829996824264526, + "step": 2579, + "temperature": 0.9 + }, + { + "advantages": -6.17163532297127e-05, + "completion_length": 641.0, + "delta_ref_entropy_loss": 0.0400390625, + "delta_ref_ppl": -0.037841796875, + "entropy_loss": -0.0289306640625, + "epoch": 0.516, + "grad_norm": 0.45337357154206176, + "k1_kl": 0.037841796875, + "k3_kl": 0.0238037109375, + "kimi_kl": 0.0634765625, + "learning_rate": 2.4199999999999997e-07, + "loss": 0.001, + "ppl": 0.0096435546875, + "reward": 0.9598909020423889, + "reward_std": 0.00045191161916591227, + "rewards/perpo_ocr_edit_distance_reward": 0.9598909616470337, + "step": 2580, + "temperature": 0.9 + }, + { + "advantages": -3.794261647271924e-05, + "completion_length": 677.0, + "delta_ref_entropy_loss": 0.08056640625, + "delta_ref_ppl": -0.06640625, + "entropy_loss": -0.0908203125, + "epoch": 0.5162, + "grad_norm": 0.6467352215480748, + "k1_kl": 0.06640625, + "k3_kl": 0.035400390625, + "kimi_kl": 0.08984375, + "learning_rate": 2.419e-07, + "loss": 0.0015, + "ppl": 0.040283203125, + "reward": 0.9610146284103394, + "reward_std": 0.001695523620583117, + "rewards/perpo_ocr_edit_distance_reward": 0.9610147476196289, + "step": 2581, + "temperature": 0.9 + }, + { + "advantages": -0.00022338118287734687, + "completion_length": 1025.0, + "delta_ref_entropy_loss": 0.044189453125, + "delta_ref_ppl": -0.04296875, + "entropy_loss": -0.024169921875, + "epoch": 0.5164, + "grad_norm": 0.2966510223938263, + "k1_kl": 0.04296875, + "k3_kl": 0.0264892578125, + "kimi_kl": 0.07861328125, + "learning_rate": 2.4179999999999995e-07, + "loss": 0.0013, + "ppl": 0.00823974609375, + "reward": 0.9898189306259155, + "reward_std": 0.00031931052217260003, + "rewards/perpo_ocr_edit_distance_reward": 0.9898190498352051, + "step": 2582, + "temperature": 0.9 + }, + { + "advantages": -6.600788765354082e-05, + "completion_length": 288.0, + "delta_ref_entropy_loss": 0.035400390625, + "delta_ref_ppl": -0.12451171875, + "entropy_loss": -0.056396484375, + "epoch": 0.5166, + "grad_norm": 0.7956228325263193, + "k1_kl": 0.12451171875, + "k3_kl": 0.09033203125, + "kimi_kl": 0.447265625, + "learning_rate": 2.417e-07, + "loss": 0.0037, + "ppl": 0.01806640625, + "reward": 0.9950079321861267, + "reward_std": 0.0009320210665464401, + "rewards/perpo_ocr_edit_distance_reward": 0.9950080513954163, + "step": 2583, + "temperature": 0.9 + }, + { + "advantages": -2.486365247023059e-06, + "completion_length": 1485.0, + "delta_ref_entropy_loss": 0.0150146484375, + "delta_ref_ppl": -0.0322265625, + "entropy_loss": -0.05224609375, + "epoch": 0.5168, + "grad_norm": 1.3570965074761305, + "k1_kl": 0.0322265625, + "k3_kl": 0.029052734375, + "kimi_kl": 0.07568359375, + "learning_rate": 2.416e-07, + "loss": 0.0012, + "ppl": 0.0322265625, + "reward": 0.9378570914268494, + "reward_std": 0.03740888088941574, + "rewards/perpo_ocr_edit_distance_reward": 0.9378572106361389, + "step": 2584, + "temperature": 0.9 + }, + { + "advantages": -1.6280584532069042e-05, + "completion_length": 334.0, + "delta_ref_entropy_loss": 0.029296875, + "delta_ref_ppl": -0.099609375, + "entropy_loss": -0.033447265625, + "epoch": 0.517, + "grad_norm": 0.4504486851598927, + "k1_kl": 0.099609375, + "k3_kl": 0.07666015625, + "kimi_kl": 0.32421875, + "learning_rate": 2.415e-07, + "loss": 0.0031, + "ppl": 0.01397705078125, + "reward": 0.9957504868507385, + "reward_std": 0.0009468357311561704, + "rewards/perpo_ocr_edit_distance_reward": 0.9957504868507385, + "step": 2585, + "temperature": 0.9 + }, + { + "advantages": 3.864935570163652e-05, + "completion_length": 970.0, + "delta_ref_entropy_loss": 0.046630859375, + "delta_ref_ppl": -0.0634765625, + "entropy_loss": -0.0546875, + "epoch": 0.5172, + "grad_norm": 1.3974661644666053, + "k1_kl": 0.0634765625, + "k3_kl": 0.04443359375, + "kimi_kl": 0.140625, + "learning_rate": 2.414e-07, + "loss": 0.0017, + "ppl": 0.0240478515625, + "reward": 0.6315683722496033, + "reward_std": 0.000781256181653589, + "rewards/perpo_ocr_edit_distance_reward": 0.6315683722496033, + "step": 2586, + "temperature": 0.9 + }, + { + "advantages": -4.3307034502504393e-05, + "completion_length": 778.0, + "delta_ref_entropy_loss": 0.0262451171875, + "delta_ref_ppl": -0.03076171875, + "entropy_loss": -0.0299072265625, + "epoch": 0.5174, + "grad_norm": 0.45287038836710175, + "k1_kl": 0.03076171875, + "k3_kl": 0.020263671875, + "kimi_kl": 0.060302734375, + "learning_rate": 2.4129999999999997e-07, + "loss": 0.0009, + "ppl": 0.01226806640625, + "reward": 0.9949083924293518, + "reward_std": 0.0022597513161599636, + "rewards/perpo_ocr_edit_distance_reward": 0.9949085116386414, + "step": 2587, + "temperature": 0.9 + }, + { + "advantages": -5.096197583043249e-06, + "completion_length": 86.0, + "delta_ref_entropy_loss": 0.1142578125, + "delta_ref_ppl": -0.474609375, + "entropy_loss": -0.25, + "epoch": 0.5176, + "grad_norm": 2.8327381619945595, + "k1_kl": 0.47265625, + "k3_kl": 0.349609375, + "kimi_kl": 1.4921875, + "learning_rate": 2.4119999999999996e-07, + "loss": 0.014, + "ppl": 0.10595703125, + "reward": 0.5652709603309631, + "reward_std": 0.004897520877420902, + "rewards/perpo_ocr_edit_distance_reward": 0.5652709603309631, + "step": 2588, + "temperature": 0.9 + }, + { + "advantages": -4.999978409614414e-05, + "completion_length": 745.0, + "delta_ref_entropy_loss": 0.020263671875, + "delta_ref_ppl": -0.027587890625, + "entropy_loss": -0.0240478515625, + "epoch": 0.5178, + "grad_norm": 0.22095340555056844, + "k1_kl": 0.027587890625, + "k3_kl": 0.01544189453125, + "kimi_kl": 0.040283203125, + "learning_rate": 2.411e-07, + "loss": 0.0007, + "ppl": 0.006988525390625, + "reward": 0.9951501488685608, + "reward_std": 0.00041098895599134266, + "rewards/perpo_ocr_edit_distance_reward": 0.9951502084732056, + "step": 2589, + "temperature": 0.9 + }, + { + "advantages": -6.9226539380906615e-06, + "completion_length": 609.0, + "delta_ref_entropy_loss": 0.08447265625, + "delta_ref_ppl": -0.09716796875, + "entropy_loss": -0.11181640625, + "epoch": 0.518, + "grad_norm": 0.9311535258589722, + "k1_kl": 0.09716796875, + "k3_kl": 0.059326171875, + "kimi_kl": 0.15625, + "learning_rate": 2.41e-07, + "loss": 0.0024, + "ppl": 0.056396484375, + "reward": 0.9543789029121399, + "reward_std": 0.0023616414982825518, + "rewards/perpo_ocr_edit_distance_reward": 0.9543789625167847, + "step": 2590, + "temperature": 0.9 + }, + { + "advantages": -6.123951607150957e-05, + "completion_length": 872.0, + "delta_ref_entropy_loss": 0.0252685546875, + "delta_ref_ppl": -0.0284423828125, + "entropy_loss": -0.030517578125, + "epoch": 0.5182, + "grad_norm": 0.21921429277779772, + "k1_kl": 0.0284423828125, + "k3_kl": 0.01806640625, + "kimi_kl": 0.052978515625, + "learning_rate": 2.409e-07, + "loss": 0.0008, + "ppl": 0.0108642578125, + "reward": 0.9972353577613831, + "reward_std": 0.00045606368803419173, + "rewards/perpo_ocr_edit_distance_reward": 0.9972354173660278, + "step": 2591, + "temperature": 0.9 + }, + { + "advantages": 2.997262299686554e-06, + "completion_length": 479.0, + "delta_ref_entropy_loss": 0.08740234375, + "delta_ref_ppl": -0.1142578125, + "entropy_loss": -0.353515625, + "epoch": 0.5184, + "grad_norm": 2.251508439694082, + "k1_kl": 0.1142578125, + "k3_kl": 0.0703125, + "kimi_kl": 0.1630859375, + "learning_rate": 2.408e-07, + "loss": 0.0028, + "ppl": 0.1806640625, + "reward": 0.8234715461730957, + "reward_std": 0.0055766659788787365, + "rewards/perpo_ocr_edit_distance_reward": 0.8234714865684509, + "step": 2592, + "temperature": 0.9 + }, + { + "advantages": -0.0002055764343822375, + "completion_length": 910.0, + "delta_ref_entropy_loss": 0.026611328125, + "delta_ref_ppl": -0.04150390625, + "entropy_loss": -0.0289306640625, + "epoch": 0.5186, + "grad_norm": 0.19574906089729602, + "k1_kl": 0.04150390625, + "k3_kl": 0.0247802734375, + "kimi_kl": 0.07666015625, + "learning_rate": 2.407e-07, + "loss": 0.0012, + "ppl": 0.00830078125, + "reward": 0.9971651434898376, + "reward_std": 0.0002727913670241833, + "rewards/perpo_ocr_edit_distance_reward": 0.9971652030944824, + "step": 2593, + "temperature": 0.9 + }, + { + "advantages": -0.0001011065105558373, + "completion_length": 543.0, + "delta_ref_entropy_loss": 0.0294189453125, + "delta_ref_ppl": -0.04638671875, + "entropy_loss": -0.0274658203125, + "epoch": 0.5188, + "grad_norm": 0.3592203147816806, + "k1_kl": 0.04638671875, + "k3_kl": 0.031494140625, + "kimi_kl": 0.08837890625, + "learning_rate": 2.406e-07, + "loss": 0.0014, + "ppl": 0.00933837890625, + "reward": 0.9695023894309998, + "reward_std": 0.0008263125782832503, + "rewards/perpo_ocr_edit_distance_reward": 0.9695025086402893, + "step": 2594, + "temperature": 0.9 + }, + { + "advantages": -5.040850282966858e-06, + "completion_length": 64.0, + "delta_ref_entropy_loss": 0.1474609375, + "delta_ref_ppl": -0.345703125, + "entropy_loss": -0.345703125, + "epoch": 0.519, + "grad_norm": 6.679361756689271, + "k1_kl": 0.345703125, + "k3_kl": 0.251953125, + "kimi_kl": 0.7578125, + "learning_rate": 2.4049999999999996e-07, + "loss": 0.0101, + "ppl": 0.15234375, + "reward": 0.9137749075889587, + "reward_std": 0.010071126744151115, + "rewards/perpo_ocr_edit_distance_reward": 0.9137749671936035, + "step": 2595, + "temperature": 0.9 + }, + { + "advantages": -1.565047750773374e-05, + "completion_length": 73.0, + "delta_ref_entropy_loss": 0.177734375, + "delta_ref_ppl": -0.58203125, + "entropy_loss": -0.2294921875, + "epoch": 0.5192, + "grad_norm": 3.222726451360649, + "k1_kl": 0.58203125, + "k3_kl": 0.45703125, + "kimi_kl": 2.03125, + "learning_rate": 2.404e-07, + "loss": 0.0183, + "ppl": 0.08251953125, + "reward": 0.9633359313011169, + "reward_std": 0.004795979708433151, + "rewards/perpo_ocr_edit_distance_reward": 0.9633359909057617, + "step": 2596, + "temperature": 0.9 + }, + { + "advantages": -2.086588392558042e-05, + "completion_length": 493.0, + "delta_ref_entropy_loss": 0.05078125, + "delta_ref_ppl": -0.08740234375, + "entropy_loss": -0.041259765625, + "epoch": 0.5194, + "grad_norm": 0.6757328037268358, + "k1_kl": 0.08740234375, + "k3_kl": 0.054931640625, + "kimi_kl": 0.177734375, + "learning_rate": 2.403e-07, + "loss": 0.0022, + "ppl": 0.01531982421875, + "reward": 0.9829300045967102, + "reward_std": 0.0011235737474635243, + "rewards/perpo_ocr_edit_distance_reward": 0.9829300045967102, + "step": 2597, + "temperature": 0.9 + }, + { + "advantages": -1.8187933164881542e-05, + "completion_length": 306.0, + "delta_ref_entropy_loss": 0.10986328125, + "delta_ref_ppl": -0.1171875, + "entropy_loss": -0.150390625, + "epoch": 0.5196, + "grad_norm": 1.610061223045391, + "k1_kl": 0.1171875, + "k3_kl": 0.06982421875, + "kimi_kl": 0.16796875, + "learning_rate": 2.402e-07, + "loss": 0.0028, + "ppl": 0.076171875, + "reward": 0.8734436631202698, + "reward_std": 0.0036469860933721066, + "rewards/perpo_ocr_edit_distance_reward": 0.8734437823295593, + "step": 2598, + "temperature": 0.9 + }, + { + "advantages": -1.4390265278052539e-05, + "completion_length": 510.0, + "delta_ref_entropy_loss": 0.040771484375, + "delta_ref_ppl": -0.07666015625, + "entropy_loss": -0.0712890625, + "epoch": 0.5198, + "grad_norm": 0.8385491695236236, + "k1_kl": 0.07666015625, + "k3_kl": 0.0546875, + "kimi_kl": 0.19140625, + "learning_rate": 2.401e-07, + "loss": 0.0022, + "ppl": 0.03271484375, + "reward": 0.995801568031311, + "reward_std": 0.0046345931477844715, + "rewards/perpo_ocr_edit_distance_reward": 0.9958016276359558, + "step": 2599, + "temperature": 0.9 + }, + { + "advantages": -1.4543534234690014e-05, + "completion_length": 337.0, + "delta_ref_entropy_loss": 0.0260009765625, + "delta_ref_ppl": -0.10546875, + "entropy_loss": -0.06298828125, + "epoch": 0.52, + "grad_norm": 0.8642031332690809, + "k1_kl": 0.10595703125, + "k3_kl": 0.08154296875, + "kimi_kl": 0.35546875, + "learning_rate": 2.4e-07, + "loss": 0.0033, + "ppl": 0.02197265625, + "reward": 0.9713695049285889, + "reward_std": 0.0022442585323005915, + "rewards/perpo_ocr_edit_distance_reward": 0.9713695645332336, + "step": 2600, + "temperature": 0.9 + }, + { + "advantages": 5.53471727471333e-06, + "completion_length": 72.0, + "delta_ref_entropy_loss": 0.049072265625, + "delta_ref_ppl": -0.4296875, + "entropy_loss": -0.1416015625, + "epoch": 0.5202, + "grad_norm": 8.129753431705103, + "k1_kl": 0.4296875, + "k3_kl": 0.373046875, + "kimi_kl": 2.203125, + "learning_rate": 2.3989999999999997e-07, + "loss": 0.0149, + "ppl": 0.0595703125, + "reward": 0.9925983548164368, + "reward_std": 0.002979957265779376, + "rewards/perpo_ocr_edit_distance_reward": 0.9925984144210815, + "step": 2601, + "temperature": 0.9 + }, + { + "advantages": -7.799694139976054e-05, + "completion_length": 722.0, + "delta_ref_entropy_loss": 0.043701171875, + "delta_ref_ppl": -0.05078125, + "entropy_loss": -0.03173828125, + "epoch": 0.5204, + "grad_norm": 0.41016323895938706, + "k1_kl": 0.05078125, + "k3_kl": 0.0308837890625, + "kimi_kl": 0.099609375, + "learning_rate": 2.398e-07, + "loss": 0.0013, + "ppl": 0.01123046875, + "reward": 0.994976282119751, + "reward_std": 0.000882586173247546, + "rewards/perpo_ocr_edit_distance_reward": 0.9949763417243958, + "step": 2602, + "temperature": 0.9 + }, + { + "advantages": -9.383474207425024e-06, + "completion_length": 454.0, + "delta_ref_entropy_loss": 0.0291748046875, + "delta_ref_ppl": -0.048583984375, + "entropy_loss": -0.041748046875, + "epoch": 0.5206, + "grad_norm": 0.629464225974072, + "k1_kl": 0.048828125, + "k3_kl": 0.037353515625, + "kimi_kl": 0.1337890625, + "learning_rate": 2.397e-07, + "loss": 0.0015, + "ppl": 0.01409912109375, + "reward": 0.9363799095153809, + "reward_std": 0.00535297766327858, + "rewards/perpo_ocr_edit_distance_reward": 0.9363800287246704, + "step": 2603, + "temperature": 0.9 + }, + { + "advantages": -5.926404810452368e-06, + "completion_length": 421.0, + "delta_ref_entropy_loss": 0.060546875, + "delta_ref_ppl": -0.07861328125, + "entropy_loss": -0.0390625, + "epoch": 0.5208, + "grad_norm": 1.0562569455458144, + "k1_kl": 0.07861328125, + "k3_kl": 0.048828125, + "kimi_kl": 0.11865234375, + "learning_rate": 2.396e-07, + "loss": 0.002, + "ppl": 0.01251220703125, + "reward": 0.9805889129638672, + "reward_std": 0.007083313539624214, + "rewards/perpo_ocr_edit_distance_reward": 0.980588972568512, + "step": 2604, + "temperature": 0.9 + }, + { + "advantages": -6.556510925292969e-05, + "completion_length": 466.0, + "delta_ref_entropy_loss": 0.0576171875, + "delta_ref_ppl": -0.09033203125, + "entropy_loss": -0.061279296875, + "epoch": 0.521, + "grad_norm": 0.7632325399416643, + "k1_kl": 0.09033203125, + "k3_kl": 0.05908203125, + "kimi_kl": 0.1884765625, + "learning_rate": 2.395e-07, + "loss": 0.0024, + "ppl": 0.02685546875, + "reward": 0.9746556282043457, + "reward_std": 0.0014577106339856982, + "rewards/perpo_ocr_edit_distance_reward": 0.9746557474136353, + "step": 2605, + "temperature": 0.9 + }, + { + "advantages": -9.687458077678457e-05, + "completion_length": 1026.0, + "delta_ref_entropy_loss": 0.0257568359375, + "delta_ref_ppl": -0.05419921875, + "entropy_loss": -0.038818359375, + "epoch": 0.5212, + "grad_norm": 0.3626165840073066, + "k1_kl": 0.05419921875, + "k3_kl": 0.034423828125, + "kimi_kl": 0.09716796875, + "learning_rate": 2.394e-07, + "loss": 0.0015, + "ppl": 0.01409912109375, + "reward": 0.9899623990058899, + "reward_std": 0.0009547119261696935, + "rewards/perpo_ocr_edit_distance_reward": 0.9899625778198242, + "step": 2606, + "temperature": 0.9 + }, + { + "advantages": -6.528837730002124e-06, + "completion_length": 1432.0, + "delta_ref_entropy_loss": 0.046142578125, + "delta_ref_ppl": -0.058837890625, + "entropy_loss": -0.1552734375, + "epoch": 0.5214, + "grad_norm": 3.133520830269694, + "k1_kl": 0.05859375, + "k3_kl": 0.042724609375, + "kimi_kl": 0.0908203125, + "learning_rate": 2.393e-07, + "loss": 0.0017, + "ppl": 0.0771484375, + "reward": 0.9709568023681641, + "reward_std": 0.001203531282953918, + "rewards/perpo_ocr_edit_distance_reward": 0.9709568619728088, + "step": 2607, + "temperature": 0.9 + }, + { + "advantages": -2.367155957472278e-06, + "completion_length": 1055.0, + "delta_ref_entropy_loss": 0.0257568359375, + "delta_ref_ppl": -0.052734375, + "entropy_loss": -0.08837890625, + "epoch": 0.5216, + "grad_norm": 2.5010707930265506, + "k1_kl": 0.052734375, + "k3_kl": 0.037109375, + "kimi_kl": 0.09716796875, + "learning_rate": 2.3919999999999997e-07, + "loss": 0.0015, + "ppl": 0.033935546875, + "reward": 0.7598468661308289, + "reward_std": 0.028429541736841202, + "rewards/perpo_ocr_edit_distance_reward": 0.7598469853401184, + "step": 2608, + "temperature": 0.9 + }, + { + "advantages": -5.313328529155115e-06, + "completion_length": 945.0, + "delta_ref_entropy_loss": 0.02001953125, + "delta_ref_ppl": -0.035400390625, + "entropy_loss": -0.0303955078125, + "epoch": 0.5218, + "grad_norm": 0.4903275685556915, + "k1_kl": 0.035400390625, + "k3_kl": 0.021240234375, + "kimi_kl": 0.05419921875, + "learning_rate": 2.391e-07, + "loss": 0.0009, + "ppl": 0.00982666015625, + "reward": 0.9941374659538269, + "reward_std": 0.003108666744083166, + "rewards/perpo_ocr_edit_distance_reward": 0.9941375255584717, + "step": 2609, + "temperature": 0.9 + }, + { + "advantages": -4.32559427281376e-05, + "completion_length": 524.0, + "delta_ref_entropy_loss": 0.0673828125, + "delta_ref_ppl": -0.0615234375, + "entropy_loss": -0.0712890625, + "epoch": 0.522, + "grad_norm": 0.9062904574005659, + "k1_kl": 0.0615234375, + "k3_kl": 0.036865234375, + "kimi_kl": 0.0810546875, + "learning_rate": 2.3899999999999996e-07, + "loss": 0.0015, + "ppl": 0.0341796875, + "reward": 0.9857909083366394, + "reward_std": 0.001474271877668798, + "rewards/perpo_ocr_edit_distance_reward": 0.9857909679412842, + "step": 2610, + "temperature": 0.9 + }, + { + "advantages": -7.033348538243445e-06, + "completion_length": 264.0, + "delta_ref_entropy_loss": 0.06298828125, + "delta_ref_ppl": -0.1474609375, + "entropy_loss": -0.1552734375, + "epoch": 0.5222, + "grad_norm": 2.271500754879269, + "k1_kl": 0.1474609375, + "k3_kl": 0.09814453125, + "kimi_kl": 0.4296875, + "learning_rate": 2.389e-07, + "loss": 0.0039, + "ppl": 0.049072265625, + "reward": 0.9025744199752808, + "reward_std": 0.011967742815613747, + "rewards/perpo_ocr_edit_distance_reward": 0.9025745391845703, + "step": 2611, + "temperature": 0.9 + }, + { + "advantages": -1.3845307876181323e-05, + "completion_length": 720.0, + "delta_ref_entropy_loss": 0.07373046875, + "delta_ref_ppl": -0.0888671875, + "entropy_loss": -0.1748046875, + "epoch": 0.5224, + "grad_norm": 1.193675041290368, + "k1_kl": 0.0888671875, + "k3_kl": 0.058349609375, + "kimi_kl": 0.158203125, + "learning_rate": 2.388e-07, + "loss": 0.0023, + "ppl": 0.0966796875, + "reward": 0.8858363628387451, + "reward_std": 0.0029761693440377712, + "rewards/perpo_ocr_edit_distance_reward": 0.8858364224433899, + "step": 2612, + "temperature": 0.9 + }, + { + "advantages": -1.7157623005914502e-05, + "completion_length": 196.0, + "delta_ref_entropy_loss": 0.0576171875, + "delta_ref_ppl": -0.212890625, + "entropy_loss": -0.05712890625, + "epoch": 0.5226, + "grad_norm": 1.511742018673547, + "k1_kl": 0.212890625, + "k3_kl": 0.1689453125, + "kimi_kl": 0.80078125, + "learning_rate": 2.387e-07, + "loss": 0.0068, + "ppl": 0.02294921875, + "reward": 0.9917545318603516, + "reward_std": 0.002379611600190401, + "rewards/perpo_ocr_edit_distance_reward": 0.9917545914649963, + "step": 2613, + "temperature": 0.9 + }, + { + "advantages": 2.043587983280304e-07, + "completion_length": 598.0, + "delta_ref_entropy_loss": 0.021728515625, + "delta_ref_ppl": -0.1162109375, + "entropy_loss": -0.2080078125, + "epoch": 0.5228, + "grad_norm": 6.914899864722051, + "k1_kl": 0.1162109375, + "k3_kl": 0.0849609375, + "kimi_kl": 0.2109375, + "learning_rate": 2.386e-07, + "loss": 0.0034, + "ppl": 0.10498046875, + "reward": 0.733418345451355, + "reward_std": 0.0783514678478241, + "rewards/perpo_ocr_edit_distance_reward": 0.733418345451355, + "step": 2614, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 912.0, + "delta_ref_entropy_loss": 0.045654296875, + "delta_ref_ppl": -0.0537109375, + "entropy_loss": -0.0281982421875, + "epoch": 0.523, + "grad_norm": 0.252707475195991, + "k1_kl": 0.0537109375, + "k3_kl": 0.0301513671875, + "kimi_kl": 0.10986328125, + "learning_rate": 2.3849999999999997e-07, + "loss": 0.0012, + "ppl": 0.0089111328125, + "reward": 0.9980390071868896, + "reward_std": 0.0003295714850537479, + "rewards/perpo_ocr_edit_distance_reward": 0.9980390071868896, + "step": 2615, + "temperature": 0.9 + }, + { + "advantages": 1.6740390492486767e-05, + "completion_length": 319.0, + "delta_ref_entropy_loss": 0.050537109375, + "delta_ref_ppl": -0.1025390625, + "entropy_loss": -0.07763671875, + "epoch": 0.5232, + "grad_norm": 1.063374069152619, + "k1_kl": 0.1025390625, + "k3_kl": 0.07470703125, + "kimi_kl": 0.34765625, + "learning_rate": 2.384e-07, + "loss": 0.003, + "ppl": 0.041748046875, + "reward": 0.9933003783226013, + "reward_std": 0.001425713300704956, + "rewards/perpo_ocr_edit_distance_reward": 0.9933003187179565, + "step": 2616, + "temperature": 0.9 + }, + { + "advantages": 3.491129291433026e-06, + "completion_length": 516.0, + "delta_ref_entropy_loss": 0.058837890625, + "delta_ref_ppl": -0.068359375, + "entropy_loss": -0.083984375, + "epoch": 0.5234, + "grad_norm": 1.0096899491471776, + "k1_kl": 0.068359375, + "k3_kl": 0.044677734375, + "kimi_kl": 0.09375, + "learning_rate": 2.383e-07, + "loss": 0.0018, + "ppl": 0.0341796875, + "reward": 0.9290253520011902, + "reward_std": 0.0023288943339139223, + "rewards/perpo_ocr_edit_distance_reward": 0.929025411605835, + "step": 2617, + "temperature": 0.9 + }, + { + "advantages": -6.471361757576233e-06, + "completion_length": 809.0, + "delta_ref_entropy_loss": 0.057373046875, + "delta_ref_ppl": -0.064453125, + "entropy_loss": -0.1357421875, + "epoch": 0.5236, + "grad_norm": 1.1478063272203256, + "k1_kl": 0.064453125, + "k3_kl": 0.04345703125, + "kimi_kl": 0.1044921875, + "learning_rate": 2.3819999999999998e-07, + "loss": 0.0017, + "ppl": 0.06689453125, + "reward": 0.8827788233757019, + "reward_std": 0.003842707723379135, + "rewards/perpo_ocr_edit_distance_reward": 0.8827788829803467, + "step": 2618, + "temperature": 0.9 + }, + { + "advantages": -0.00015023777086753398, + "completion_length": 665.0, + "delta_ref_entropy_loss": 0.042724609375, + "delta_ref_ppl": -0.057373046875, + "entropy_loss": -0.05126953125, + "epoch": 0.5238, + "grad_norm": 0.7031236167031796, + "k1_kl": 0.0576171875, + "k3_kl": 0.03369140625, + "kimi_kl": 0.095703125, + "learning_rate": 2.381e-07, + "loss": 0.0015, + "ppl": 0.0179443359375, + "reward": 0.990394115447998, + "reward_std": 0.0004100114165339619, + "rewards/perpo_ocr_edit_distance_reward": 0.9903941750526428, + "step": 2619, + "temperature": 0.9 + }, + { + "advantages": -0.00011333398288115859, + "completion_length": 523.0, + "delta_ref_entropy_loss": 0.07958984375, + "delta_ref_ppl": -0.1171875, + "entropy_loss": -0.0732421875, + "epoch": 0.524, + "grad_norm": 1.0877154030802978, + "k1_kl": 0.11767578125, + "k3_kl": 0.07666015625, + "kimi_kl": 0.2734375, + "learning_rate": 2.38e-07, + "loss": 0.0032, + "ppl": 0.039794921875, + "reward": 0.9965108633041382, + "reward_std": 0.0008766628452576697, + "rewards/perpo_ocr_edit_distance_reward": 0.9965109825134277, + "step": 2620, + "temperature": 0.9 + }, + { + "advantages": -3.453663521213457e-05, + "completion_length": 380.0, + "delta_ref_entropy_loss": 0.0205078125, + "delta_ref_ppl": -0.07763671875, + "entropy_loss": -0.0390625, + "epoch": 0.5242, + "grad_norm": 0.4943782705182864, + "k1_kl": 0.0771484375, + "k3_kl": 0.05712890625, + "kimi_kl": 0.28125, + "learning_rate": 2.3789999999999998e-07, + "loss": 0.0023, + "ppl": 0.01708984375, + "reward": 0.9972873330116272, + "reward_std": 0.0006393544608727098, + "rewards/perpo_ocr_edit_distance_reward": 0.997287392616272, + "step": 2621, + "temperature": 0.9 + }, + { + "advantages": -3.286770515842363e-05, + "completion_length": 727.0, + "delta_ref_entropy_loss": 0.03515625, + "delta_ref_ppl": -0.06298828125, + "entropy_loss": -0.05224609375, + "epoch": 0.5244, + "grad_norm": 0.39536214588154717, + "k1_kl": 0.06298828125, + "k3_kl": 0.045166015625, + "kimi_kl": 0.1455078125, + "learning_rate": 2.378e-07, + "loss": 0.0018, + "ppl": 0.0189208984375, + "reward": 0.9861689209938049, + "reward_std": 0.0017131606582552195, + "rewards/perpo_ocr_edit_distance_reward": 0.9861689805984497, + "step": 2622, + "temperature": 0.9 + }, + { + "advantages": -1.0975770237564575e-05, + "completion_length": 458.0, + "delta_ref_entropy_loss": 0.08642578125, + "delta_ref_ppl": -0.142578125, + "entropy_loss": -0.33203125, + "epoch": 0.5246, + "grad_norm": 2.447320942044719, + "k1_kl": 0.1416015625, + "k3_kl": 0.09814453125, + "kimi_kl": 0.314453125, + "learning_rate": 2.377e-07, + "loss": 0.0039, + "ppl": 0.16796875, + "reward": 0.8878383040428162, + "reward_std": 0.009214797988533974, + "rewards/perpo_ocr_edit_distance_reward": 0.8878384232521057, + "step": 2623, + "temperature": 0.9 + }, + { + "advantages": -1.4322145034384448e-05, + "completion_length": 243.0, + "delta_ref_entropy_loss": 0.080078125, + "delta_ref_ppl": -0.12060546875, + "entropy_loss": -0.0517578125, + "epoch": 0.5248, + "grad_norm": 1.1218484255690182, + "k1_kl": 0.12109375, + "k3_kl": 0.08251953125, + "kimi_kl": 0.296875, + "learning_rate": 2.3759999999999998e-07, + "loss": 0.0033, + "ppl": 0.0240478515625, + "reward": 0.9970752000808716, + "reward_std": 0.0022811670787632465, + "rewards/perpo_ocr_edit_distance_reward": 0.9970752596855164, + "step": 2624, + "temperature": 0.9 + }, + { + "advantages": -5.0199883844470605e-05, + "completion_length": 692.0, + "delta_ref_entropy_loss": 0.05126953125, + "delta_ref_ppl": -0.06298828125, + "entropy_loss": -0.057373046875, + "epoch": 0.525, + "grad_norm": 0.38202114445824426, + "k1_kl": 0.0634765625, + "k3_kl": 0.03857421875, + "kimi_kl": 0.10888671875, + "learning_rate": 2.3749999999999998e-07, + "loss": 0.0016, + "ppl": 0.0277099609375, + "reward": 0.9809781312942505, + "reward_std": 0.0005786523688584566, + "rewards/perpo_ocr_edit_distance_reward": 0.9809781908988953, + "step": 2625, + "temperature": 0.9 + }, + { + "advantages": -3.644398475444177e-06, + "completion_length": 869.0, + "delta_ref_entropy_loss": 0.03759765625, + "delta_ref_ppl": -0.0361328125, + "entropy_loss": -0.05712890625, + "epoch": 0.5252, + "grad_norm": 4.599398371348274, + "k1_kl": 0.0361328125, + "k3_kl": 0.0206298828125, + "kimi_kl": 0.039306640625, + "learning_rate": 2.374e-07, + "loss": 0.0008, + "ppl": 0.0341796875, + "reward": 0.9829450249671936, + "reward_std": 0.009263315238058567, + "rewards/perpo_ocr_edit_distance_reward": 0.9829450845718384, + "step": 2626, + "temperature": 0.9 + }, + { + "advantages": -1.3726098586630542e-05, + "completion_length": 458.0, + "delta_ref_entropy_loss": 0.08349609375, + "delta_ref_ppl": -0.10595703125, + "entropy_loss": -0.12353515625, + "epoch": 0.5254, + "grad_norm": 2.046013031558039, + "k1_kl": 0.1064453125, + "k3_kl": 0.06982421875, + "kimi_kl": 0.1826171875, + "learning_rate": 2.373e-07, + "loss": 0.0028, + "ppl": 0.059326171875, + "reward": 0.8437302708625793, + "reward_std": 0.003613919485360384, + "rewards/perpo_ocr_edit_distance_reward": 0.8437303900718689, + "step": 2627, + "temperature": 0.9 + }, + { + "advantages": -3.7636077649949584e-06, + "completion_length": 39.0, + "delta_ref_entropy_loss": 0.0078125, + "delta_ref_ppl": -1.015625, + "entropy_loss": -0.337890625, + "epoch": 0.5256, + "grad_norm": 7.41326858158902, + "k1_kl": 1.015625, + "k3_kl": 0.8671875, + "kimi_kl": 3.96875, + "learning_rate": 2.3719999999999998e-07, + "loss": 0.0346, + "ppl": 0.1474609375, + "reward": 0.10301700979471207, + "reward_std": 0.0016000099712982774, + "rewards/perpo_ocr_edit_distance_reward": 0.10301702469587326, + "step": 2628, + "temperature": 0.9 + }, + { + "advantages": -9.437969856662676e-05, + "completion_length": 442.0, + "delta_ref_entropy_loss": 0.048583984375, + "delta_ref_ppl": -0.07666015625, + "entropy_loss": -0.04736328125, + "epoch": 0.5258, + "grad_norm": 1.0009362655842347, + "k1_kl": 0.07666015625, + "k3_kl": 0.05126953125, + "kimi_kl": 0.1572265625, + "learning_rate": 2.371e-07, + "loss": 0.0021, + "ppl": 0.0157470703125, + "reward": 0.9952772855758667, + "reward_std": 0.0008022222318686545, + "rewards/perpo_ocr_edit_distance_reward": 0.9952774047851562, + "step": 2629, + "temperature": 0.9 + }, + { + "advantages": 2.1287374693201855e-05, + "completion_length": 294.0, + "delta_ref_entropy_loss": 0.07666015625, + "delta_ref_ppl": -0.11279296875, + "entropy_loss": -0.05029296875, + "epoch": 0.526, + "grad_norm": 0.8423685684459572, + "k1_kl": 0.1123046875, + "k3_kl": 0.0791015625, + "kimi_kl": 0.29296875, + "learning_rate": 2.3699999999999996e-07, + "loss": 0.0032, + "ppl": 0.0194091796875, + "reward": 0.9957154393196106, + "reward_std": 0.0006996487500146031, + "rewards/perpo_ocr_edit_distance_reward": 0.9957154989242554, + "step": 2630, + "temperature": 0.9 + }, + { + "advantages": -8.412770512222778e-06, + "completion_length": 1907.0, + "delta_ref_entropy_loss": -0.003143310546875, + "delta_ref_ppl": -0.019775390625, + "entropy_loss": -0.05615234375, + "epoch": 0.5262, + "grad_norm": 0.6492800320931188, + "k1_kl": 0.0198974609375, + "k3_kl": 0.0184326171875, + "kimi_kl": 0.0478515625, + "learning_rate": 2.3689999999999998e-07, + "loss": 0.0007, + "ppl": 0.02734375, + "reward": 0.8851690888404846, + "reward_std": 0.003946283366531134, + "rewards/perpo_ocr_edit_distance_reward": 0.8851691484451294, + "step": 2631, + "temperature": 0.9 + }, + { + "advantages": -3.916876778475853e-07, + "completion_length": 1567.0, + "delta_ref_entropy_loss": 0.04248046875, + "delta_ref_ppl": -0.047607421875, + "entropy_loss": -0.150390625, + "epoch": 0.5264, + "grad_norm": 2.684260075277711, + "k1_kl": 0.0478515625, + "k3_kl": 0.042236328125, + "kimi_kl": 0.0654296875, + "learning_rate": 2.368e-07, + "loss": 0.0017, + "ppl": 0.07421875, + "reward": 0.7418301701545715, + "reward_std": 0.14433367550373077, + "rewards/perpo_ocr_edit_distance_reward": 0.7418302297592163, + "step": 2632, + "temperature": 0.9 + }, + { + "advantages": -1.7660006051301025e-05, + "completion_length": 653.0, + "delta_ref_entropy_loss": 0.0771484375, + "delta_ref_ppl": -0.049560546875, + "entropy_loss": -0.064453125, + "epoch": 0.5266, + "grad_norm": 0.5330575998423704, + "k1_kl": 0.0498046875, + "k3_kl": 0.0233154296875, + "kimi_kl": 0.060791015625, + "learning_rate": 2.367e-07, + "loss": 0.001, + "ppl": 0.028564453125, + "reward": 0.9850510358810425, + "reward_std": 0.0003817967954091728, + "rewards/perpo_ocr_edit_distance_reward": 0.9850510358810425, + "step": 2633, + "temperature": 0.9 + }, + { + "advantages": -9.780270920600742e-05, + "completion_length": 625.0, + "delta_ref_entropy_loss": 0.033935546875, + "delta_ref_ppl": -0.06884765625, + "entropy_loss": -0.032470703125, + "epoch": 0.5268, + "grad_norm": 0.4555088131361288, + "k1_kl": 0.06884765625, + "k3_kl": 0.0478515625, + "kimi_kl": 0.1279296875, + "learning_rate": 2.366e-07, + "loss": 0.002, + "ppl": 0.0120849609375, + "reward": 0.990943193435669, + "reward_std": 0.0006835677195340395, + "rewards/perpo_ocr_edit_distance_reward": 0.9909432530403137, + "step": 2634, + "temperature": 0.9 + }, + { + "advantages": -2.053805837931577e-05, + "completion_length": 259.0, + "delta_ref_entropy_loss": 0.043212890625, + "delta_ref_ppl": -0.150390625, + "entropy_loss": -0.09619140625, + "epoch": 0.527, + "grad_norm": 1.389414701043328, + "k1_kl": 0.150390625, + "k3_kl": 0.10400390625, + "kimi_kl": 0.3828125, + "learning_rate": 2.3649999999999998e-07, + "loss": 0.0042, + "ppl": 0.039794921875, + "reward": 0.8175451755523682, + "reward_std": 0.001973862061277032, + "rewards/perpo_ocr_edit_distance_reward": 0.8175452351570129, + "step": 2635, + "temperature": 0.9 + }, + { + "advantages": -0.00010005917283706367, + "completion_length": 455.0, + "delta_ref_entropy_loss": 0.05712890625, + "delta_ref_ppl": -0.08544921875, + "entropy_loss": -0.03515625, + "epoch": 0.5272, + "grad_norm": 0.3637985266267187, + "k1_kl": 0.0859375, + "k3_kl": 0.05810546875, + "kimi_kl": 0.203125, + "learning_rate": 2.364e-07, + "loss": 0.0024, + "ppl": 0.011962890625, + "reward": 0.9970845580101013, + "reward_std": 0.0006658460479229689, + "rewards/perpo_ocr_edit_distance_reward": 0.9970846176147461, + "step": 2636, + "temperature": 0.9 + }, + { + "advantages": -1.3726098586630542e-05, + "completion_length": 80.0, + "delta_ref_entropy_loss": 0.048583984375, + "delta_ref_ppl": -0.318359375, + "entropy_loss": -0.10986328125, + "epoch": 0.5274, + "grad_norm": 2.282213068878544, + "k1_kl": 0.318359375, + "k3_kl": 0.2431640625, + "kimi_kl": 0.78125, + "learning_rate": 2.363e-07, + "loss": 0.0098, + "ppl": 0.04150390625, + "reward": 0.9753929972648621, + "reward_std": 0.0029989732429385185, + "rewards/perpo_ocr_edit_distance_reward": 0.9753930568695068, + "step": 2637, + "temperature": 0.9 + }, + { + "advantages": -8.889607670425903e-06, + "completion_length": 885.0, + "delta_ref_entropy_loss": 0.055419921875, + "delta_ref_ppl": -0.07763671875, + "entropy_loss": -0.08154296875, + "epoch": 0.5276, + "grad_norm": 1.0325219539066377, + "k1_kl": 0.0771484375, + "k3_kl": 0.0458984375, + "kimi_kl": 0.1328125, + "learning_rate": 2.3619999999999998e-07, + "loss": 0.0018, + "ppl": 0.03955078125, + "reward": 0.9883008599281311, + "reward_std": 0.009483934380114079, + "rewards/perpo_ocr_edit_distance_reward": 0.9883009195327759, + "step": 2638, + "temperature": 0.9 + }, + { + "advantages": -8.514949634275126e-09, + "completion_length": 448.0, + "delta_ref_entropy_loss": 0.10009765625, + "delta_ref_ppl": -0.0966796875, + "entropy_loss": -0.21875, + "epoch": 0.5278, + "grad_norm": 1.4717460801503894, + "k1_kl": 0.0966796875, + "k3_kl": 0.0537109375, + "kimi_kl": 0.1279296875, + "learning_rate": 2.361e-07, + "loss": 0.0022, + "ppl": 0.1162109375, + "reward": 0.8709676861763, + "reward_std": 0.002644718624651432, + "rewards/perpo_ocr_edit_distance_reward": 0.8709677457809448, + "step": 2639, + "temperature": 0.9 + }, + { + "advantages": -2.806527481880039e-05, + "completion_length": 267.0, + "delta_ref_entropy_loss": 0.06494140625, + "delta_ref_ppl": -0.173828125, + "entropy_loss": -0.10302734375, + "epoch": 0.528, + "grad_norm": 1.5444679587628896, + "k1_kl": 0.1728515625, + "k3_kl": 0.125, + "kimi_kl": 0.431640625, + "learning_rate": 2.3599999999999997e-07, + "loss": 0.005, + "ppl": 0.048095703125, + "reward": 0.825313925743103, + "reward_std": 0.001719786785542965, + "rewards/perpo_ocr_edit_distance_reward": 0.8253139853477478, + "step": 2640, + "temperature": 0.9 + }, + { + "advantages": -2.0589148334693164e-05, + "completion_length": 706.0, + "delta_ref_entropy_loss": 0.057861328125, + "delta_ref_ppl": -0.0791015625, + "entropy_loss": -0.1064453125, + "epoch": 0.5282, + "grad_norm": 1.0989332228913185, + "k1_kl": 0.0791015625, + "k3_kl": 0.0458984375, + "kimi_kl": 0.1455078125, + "learning_rate": 2.359e-07, + "loss": 0.0019, + "ppl": 0.04638671875, + "reward": 0.8861674070358276, + "reward_std": 0.0032107681035995483, + "rewards/perpo_ocr_edit_distance_reward": 0.8861675262451172, + "step": 2641, + "temperature": 0.9 + }, + { + "advantages": -9.551644325256348e-06, + "completion_length": 359.0, + "delta_ref_entropy_loss": 0.08056640625, + "delta_ref_ppl": -0.1396484375, + "entropy_loss": -0.306640625, + "epoch": 0.5284, + "grad_norm": 2.730982489142522, + "k1_kl": 0.1396484375, + "k3_kl": 0.1025390625, + "kimi_kl": 0.34375, + "learning_rate": 2.358e-07, + "loss": 0.0041, + "ppl": 0.1689453125, + "reward": 0.7928619384765625, + "reward_std": 0.006141263525933027, + "rewards/perpo_ocr_edit_distance_reward": 0.7928619980812073, + "step": 2642, + "temperature": 0.9 + }, + { + "advantages": -1.9588642317103222e-05, + "completion_length": 1179.0, + "delta_ref_entropy_loss": 0.01953125, + "delta_ref_ppl": -0.04443359375, + "entropy_loss": -0.039794921875, + "epoch": 0.5286, + "grad_norm": 0.7104907448528472, + "k1_kl": 0.044677734375, + "k3_kl": 0.0311279296875, + "kimi_kl": 0.08251953125, + "learning_rate": 2.3569999999999997e-07, + "loss": 0.0013, + "ppl": 0.0169677734375, + "reward": 0.9931635856628418, + "reward_std": 0.0025093620643019676, + "rewards/perpo_ocr_edit_distance_reward": 0.9931635856628418, + "step": 2643, + "temperature": 0.9 + }, + { + "advantages": -3.1948089599609375e-05, + "completion_length": 528.0, + "delta_ref_entropy_loss": 0.031494140625, + "delta_ref_ppl": -0.0810546875, + "entropy_loss": -0.091796875, + "epoch": 0.5288, + "grad_norm": 1.5433246312337723, + "k1_kl": 0.0810546875, + "k3_kl": 0.056396484375, + "kimi_kl": 0.1611328125, + "learning_rate": 2.356e-07, + "loss": 0.0023, + "ppl": 0.033203125, + "reward": 0.9401716589927673, + "reward_std": 0.0017648703651502728, + "rewards/perpo_ocr_edit_distance_reward": 0.9401717782020569, + "step": 2644, + "temperature": 0.9 + }, + { + "advantages": -3.519228630466387e-05, + "completion_length": 349.0, + "delta_ref_entropy_loss": 0.08447265625, + "delta_ref_ppl": -0.1298828125, + "entropy_loss": -0.06201171875, + "epoch": 0.529, + "grad_norm": 1.2071000628772008, + "k1_kl": 0.1298828125, + "k3_kl": 0.08642578125, + "kimi_kl": 0.267578125, + "learning_rate": 2.3549999999999998e-07, + "loss": 0.0035, + "ppl": 0.0277099609375, + "reward": 0.8965106010437012, + "reward_std": 0.0015926880296319723, + "rewards/perpo_ocr_edit_distance_reward": 0.896510660648346, + "step": 2645, + "temperature": 0.9 + }, + { + "advantages": 1.6399793821619824e-05, + "completion_length": 408.0, + "delta_ref_entropy_loss": 0.041748046875, + "delta_ref_ppl": -0.09423828125, + "entropy_loss": -0.08056640625, + "epoch": 0.5292, + "grad_norm": 0.5311179423514458, + "k1_kl": 0.09423828125, + "k3_kl": 0.06201171875, + "kimi_kl": 0.2001953125, + "learning_rate": 2.3539999999999998e-07, + "loss": 0.0025, + "ppl": 0.0289306640625, + "reward": 0.9521797895431519, + "reward_std": 0.001457368372939527, + "rewards/perpo_ocr_edit_distance_reward": 0.9521797895431519, + "step": 2646, + "temperature": 0.9 + }, + { + "advantages": -9.128026249527466e-06, + "completion_length": 793.0, + "delta_ref_entropy_loss": 0.0166015625, + "delta_ref_ppl": -0.0576171875, + "entropy_loss": -0.06298828125, + "epoch": 0.5294, + "grad_norm": 0.7793724174896807, + "k1_kl": 0.057861328125, + "k3_kl": 0.04248046875, + "kimi_kl": 0.11767578125, + "learning_rate": 2.353e-07, + "loss": 0.0017, + "ppl": 0.02734375, + "reward": 0.9898820519447327, + "reward_std": 0.0045591434463858604, + "rewards/perpo_ocr_edit_distance_reward": 0.9898821115493774, + "step": 2647, + "temperature": 0.9 + }, + { + "advantages": -7.430145342368633e-05, + "completion_length": 257.0, + "delta_ref_entropy_loss": 0.06396484375, + "delta_ref_ppl": -0.1591796875, + "entropy_loss": -0.09423828125, + "epoch": 0.5296, + "grad_norm": 1.7116035140561345, + "k1_kl": 0.1591796875, + "k3_kl": 0.12109375, + "kimi_kl": 0.4453125, + "learning_rate": 2.352e-07, + "loss": 0.0049, + "ppl": 0.044189453125, + "reward": 0.9718335270881653, + "reward_std": 0.0017335088923573494, + "rewards/perpo_ocr_edit_distance_reward": 0.9718337059020996, + "step": 2648, + "temperature": 0.9 + }, + { + "advantages": -2.469335413479712e-06, + "completion_length": 1440.0, + "delta_ref_entropy_loss": 0.054443359375, + "delta_ref_ppl": -0.0859375, + "entropy_loss": -0.294921875, + "epoch": 0.5298, + "grad_norm": 14.919819248482064, + "k1_kl": 0.0859375, + "k3_kl": 0.056396484375, + "kimi_kl": 0.1376953125, + "learning_rate": 2.3509999999999998e-07, + "loss": 0.0023, + "ppl": 0.166015625, + "reward": 0.8700363039970398, + "reward_std": 0.00335606443695724, + "rewards/perpo_ocr_edit_distance_reward": 0.8700363039970398, + "step": 2649, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 1004.0, + "delta_ref_entropy_loss": 0.03125, + "delta_ref_ppl": -0.041015625, + "entropy_loss": -0.03955078125, + "epoch": 0.53, + "grad_norm": 0.6539643508095044, + "k1_kl": 0.041015625, + "k3_kl": 0.0296630859375, + "kimi_kl": 0.07080078125, + "learning_rate": 2.3499999999999997e-07, + "loss": 0.0012, + "ppl": 0.015625, + "reward": 0.9695208072662354, + "reward_std": 0.002769741928204894, + "rewards/perpo_ocr_edit_distance_reward": 0.9695208072662354, + "step": 2650, + "temperature": 0.9 + }, + { + "advantages": -2.9035977604507934e-06, + "completion_length": 616.0, + "delta_ref_entropy_loss": 0.04296875, + "delta_ref_ppl": -0.07275390625, + "entropy_loss": -0.275390625, + "epoch": 0.5302, + "grad_norm": 1.980416135008477, + "k1_kl": 0.07275390625, + "k3_kl": 0.046630859375, + "kimi_kl": 0.091796875, + "learning_rate": 2.349e-07, + "loss": 0.0019, + "ppl": 0.1416015625, + "reward": 0.49769872426986694, + "reward_std": 0.005753695033490658, + "rewards/perpo_ocr_edit_distance_reward": 0.49769872426986694, + "step": 2651, + "temperature": 0.9 + }, + { + "advantages": -1.2704304936050903e-05, + "completion_length": 921.0, + "delta_ref_entropy_loss": 0.048583984375, + "delta_ref_ppl": -0.04638671875, + "entropy_loss": -0.036376953125, + "epoch": 0.5304, + "grad_norm": 8.0308288497863, + "k1_kl": 0.04638671875, + "k3_kl": 0.051025390625, + "kimi_kl": 0.0849609375, + "learning_rate": 2.3479999999999998e-07, + "loss": 0.0021, + "ppl": 0.01434326171875, + "reward": 0.996161162853241, + "reward_std": 0.0005706219235435128, + "rewards/perpo_ocr_edit_distance_reward": 0.996161162853241, + "step": 2652, + "temperature": 0.9 + }, + { + "advantages": -0.00011229515803279355, + "completion_length": 923.0, + "delta_ref_entropy_loss": 0.047119140625, + "delta_ref_ppl": -0.054931640625, + "entropy_loss": -0.0634765625, + "epoch": 0.5306, + "grad_norm": 1.916737278139449, + "k1_kl": 0.054931640625, + "k3_kl": 0.0294189453125, + "kimi_kl": 0.060791015625, + "learning_rate": 2.3469999999999998e-07, + "loss": 0.0013, + "ppl": 0.0301513671875, + "reward": 0.9901909232139587, + "reward_std": 0.0008099160040728748, + "rewards/perpo_ocr_edit_distance_reward": 0.9901909828186035, + "step": 2653, + "temperature": 0.9 + }, + { + "advantages": -2.5945051675080322e-05, + "completion_length": 395.0, + "delta_ref_entropy_loss": 0.12451171875, + "delta_ref_ppl": -0.1162109375, + "entropy_loss": -0.212890625, + "epoch": 0.5308, + "grad_norm": 1.877043793027987, + "k1_kl": 0.1162109375, + "k3_kl": 0.0712890625, + "kimi_kl": 0.205078125, + "learning_rate": 2.346e-07, + "loss": 0.0029, + "ppl": 0.1083984375, + "reward": 0.9355133175849915, + "reward_std": 0.0015411111526191235, + "rewards/perpo_ocr_edit_distance_reward": 0.9355133771896362, + "step": 2654, + "temperature": 0.9 + }, + { + "advantages": -0.0001024348457576707, + "completion_length": 846.0, + "delta_ref_entropy_loss": 0.0308837890625, + "delta_ref_ppl": -0.044921875, + "entropy_loss": -0.0269775390625, + "epoch": 0.531, + "grad_norm": 0.6244735604160775, + "k1_kl": 0.044921875, + "k3_kl": 0.0245361328125, + "kimi_kl": 0.07373046875, + "learning_rate": 2.3449999999999996e-07, + "loss": 0.0011, + "ppl": 0.01153564453125, + "reward": 0.9971852898597717, + "reward_std": 0.0008977041579782963, + "rewards/perpo_ocr_edit_distance_reward": 0.9971853494644165, + "step": 2655, + "temperature": 0.9 + }, + { + "advantages": 2.348423186049331e-05, + "completion_length": 1177.0, + "delta_ref_entropy_loss": 0.047607421875, + "delta_ref_ppl": -0.0634765625, + "entropy_loss": -0.06103515625, + "epoch": 0.5312, + "grad_norm": 0.3744652854798656, + "k1_kl": 0.0634765625, + "k3_kl": 0.04052734375, + "kimi_kl": 0.09423828125, + "learning_rate": 2.3439999999999998e-07, + "loss": 0.0016, + "ppl": 0.0291748046875, + "reward": 0.9935315251350403, + "reward_std": 0.0006250541773624718, + "rewards/perpo_ocr_edit_distance_reward": 0.9935315251350403, + "step": 2656, + "temperature": 0.9 + }, + { + "advantages": -3.630774517660029e-05, + "completion_length": 451.0, + "delta_ref_entropy_loss": 0.055419921875, + "delta_ref_ppl": -0.083984375, + "entropy_loss": -0.04541015625, + "epoch": 0.5314, + "grad_norm": 0.5838553025649179, + "k1_kl": 0.083984375, + "k3_kl": 0.05908203125, + "kimi_kl": 0.2294921875, + "learning_rate": 2.343e-07, + "loss": 0.0024, + "ppl": 0.0167236328125, + "reward": 0.9750204086303711, + "reward_std": 0.001307841157540679, + "rewards/perpo_ocr_edit_distance_reward": 0.9750204682350159, + "step": 2657, + "temperature": 0.9 + }, + { + "advantages": -4.196167355985381e-05, + "completion_length": 940.0, + "delta_ref_entropy_loss": 0.04248046875, + "delta_ref_ppl": -0.0693359375, + "entropy_loss": -0.06591796875, + "epoch": 0.5316, + "grad_norm": 1.1055156642155421, + "k1_kl": 0.06982421875, + "k3_kl": 0.0458984375, + "kimi_kl": 0.1396484375, + "learning_rate": 2.342e-07, + "loss": 0.0019, + "ppl": 0.0294189453125, + "reward": 0.994457483291626, + "reward_std": 0.001319860341027379, + "rewards/perpo_ocr_edit_distance_reward": 0.9944575428962708, + "step": 2658, + "temperature": 0.9 + }, + { + "advantages": -3.1403134926222265e-05, + "completion_length": 1080.0, + "delta_ref_entropy_loss": 0.0869140625, + "delta_ref_ppl": -0.083984375, + "entropy_loss": -0.2470703125, + "epoch": 0.5318, + "grad_norm": 1.6347091175831014, + "k1_kl": 0.083984375, + "k3_kl": 0.050537109375, + "kimi_kl": 0.09375, + "learning_rate": 2.3409999999999999e-07, + "loss": 0.0021, + "ppl": 0.130859375, + "reward": 0.7453126311302185, + "reward_std": 0.0031534736044704914, + "rewards/perpo_ocr_edit_distance_reward": 0.7453127503395081, + "step": 2659, + "temperature": 0.9 + }, + { + "advantages": 1.0303089084118255e-06, + "completion_length": 771.0, + "delta_ref_entropy_loss": 0.041015625, + "delta_ref_ppl": -0.0478515625, + "entropy_loss": -0.058837890625, + "epoch": 0.532, + "grad_norm": 1.036247610027109, + "k1_kl": 0.0478515625, + "k3_kl": 0.029296875, + "kimi_kl": 0.0703125, + "learning_rate": 2.34e-07, + "loss": 0.0012, + "ppl": 0.02392578125, + "reward": 0.9824880361557007, + "reward_std": 0.008113956078886986, + "rewards/perpo_ocr_edit_distance_reward": 0.9824880361557007, + "step": 2660, + "temperature": 0.9 + }, + { + "advantages": -1.5667507113903412e-06, + "completion_length": 497.0, + "delta_ref_entropy_loss": 0.08544921875, + "delta_ref_ppl": -0.08935546875, + "entropy_loss": -0.2197265625, + "epoch": 0.5322, + "grad_norm": 1.197893267712468, + "k1_kl": 0.08935546875, + "k3_kl": 0.054443359375, + "kimi_kl": 0.150390625, + "learning_rate": 2.339e-07, + "loss": 0.0022, + "ppl": 0.09716796875, + "reward": 0.39599815011024475, + "reward_std": 0.005334005691111088, + "rewards/perpo_ocr_edit_distance_reward": 0.39599815011024475, + "step": 2661, + "temperature": 0.9 + }, + { + "advantages": -0.00024374894564971328, + "completion_length": 473.0, + "delta_ref_entropy_loss": 0.0277099609375, + "delta_ref_ppl": -0.06103515625, + "entropy_loss": -0.019287109375, + "epoch": 0.5324, + "grad_norm": 0.17202675881588428, + "k1_kl": 0.06103515625, + "k3_kl": 0.04052734375, + "kimi_kl": 0.1220703125, + "learning_rate": 2.338e-07, + "loss": 0.0019, + "ppl": 0.0068359375, + "reward": 0.9985077381134033, + "reward_std": 0.00017946904699783772, + "rewards/perpo_ocr_edit_distance_reward": 0.9985077381134033, + "step": 2662, + "temperature": 0.9 + }, + { + "advantages": 1.7029899268550253e-08, + "completion_length": 297.0, + "delta_ref_entropy_loss": 0.045654296875, + "delta_ref_ppl": -0.109375, + "entropy_loss": -0.0615234375, + "epoch": 0.5326, + "grad_norm": 1.2038703633140437, + "k1_kl": 0.109375, + "k3_kl": 0.0791015625, + "kimi_kl": 0.298828125, + "learning_rate": 2.3369999999999998e-07, + "loss": 0.0032, + "ppl": 0.021728515625, + "reward": 0.9949806332588196, + "reward_std": 0.0011463143164291978, + "rewards/perpo_ocr_edit_distance_reward": 0.9949806928634644, + "step": 2663, + "temperature": 0.9 + }, + { + "advantages": -1.1239733794354834e-05, + "completion_length": 1027.0, + "delta_ref_entropy_loss": 0.006805419921875, + "delta_ref_ppl": -0.039306640625, + "entropy_loss": -0.054931640625, + "epoch": 0.5328, + "grad_norm": 0.4708715069344483, + "k1_kl": 0.039306640625, + "k3_kl": 0.0303955078125, + "kimi_kl": 0.0947265625, + "learning_rate": 2.336e-07, + "loss": 0.0012, + "ppl": 0.02001953125, + "reward": 0.895685613155365, + "reward_std": 0.004447383340448141, + "rewards/perpo_ocr_edit_distance_reward": 0.8956856727600098, + "step": 2664, + "temperature": 0.9 + }, + { + "advantages": -1.1750630619644653e-05, + "completion_length": 486.0, + "delta_ref_entropy_loss": 0.0201416015625, + "delta_ref_ppl": -0.0537109375, + "entropy_loss": -0.029541015625, + "epoch": 0.533, + "grad_norm": 0.8848998700813888, + "k1_kl": 0.053955078125, + "k3_kl": 0.040771484375, + "kimi_kl": 0.154296875, + "learning_rate": 2.335e-07, + "loss": 0.0016, + "ppl": 0.011474609375, + "reward": 0.9974750876426697, + "reward_std": 0.0042365072295069695, + "rewards/perpo_ocr_edit_distance_reward": 0.9974751472473145, + "step": 2665, + "temperature": 0.9 + }, + { + "advantages": -1.4816012480878271e-05, + "completion_length": 379.0, + "delta_ref_entropy_loss": 0.0966796875, + "delta_ref_ppl": -0.1220703125, + "entropy_loss": -0.1025390625, + "epoch": 0.5332, + "grad_norm": 1.517874783266774, + "k1_kl": 0.1220703125, + "k3_kl": 0.07861328125, + "kimi_kl": 0.2177734375, + "learning_rate": 2.3339999999999999e-07, + "loss": 0.0032, + "ppl": 0.04150390625, + "reward": 0.945534348487854, + "reward_std": 0.001622165204025805, + "rewards/perpo_ocr_edit_distance_reward": 0.9455344676971436, + "step": 2666, + "temperature": 0.9 + }, + { + "advantages": -2.6677336791181006e-05, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.00909423828125, + "delta_ref_ppl": -0.0240478515625, + "entropy_loss": -0.059326171875, + "epoch": 0.5334, + "grad_norm": 5.696470794221101, + "k1_kl": 0.024169921875, + "k3_kl": 0.01904296875, + "kimi_kl": 0.04052734375, + "learning_rate": 2.333e-07, + "loss": 0.0008, + "ppl": 0.0283203125, + "reward": 0.883155345916748, + "reward_std": 0.004369908478111029, + "rewards/perpo_ocr_edit_distance_reward": 0.8831554651260376, + "step": 2667, + "temperature": 0.9 + }, + { + "advantages": -7.488472328986973e-05, + "completion_length": 979.0, + "delta_ref_entropy_loss": 0.07177734375, + "delta_ref_ppl": -0.07421875, + "entropy_loss": -0.057861328125, + "epoch": 0.5336, + "grad_norm": 0.5675983679198581, + "k1_kl": 0.07421875, + "k3_kl": 0.041748046875, + "kimi_kl": 0.10693359375, + "learning_rate": 2.3319999999999997e-07, + "loss": 0.0017, + "ppl": 0.021484375, + "reward": 0.9797303080558777, + "reward_std": 0.0005821792874485254, + "rewards/perpo_ocr_edit_distance_reward": 0.9797303676605225, + "step": 2668, + "temperature": 0.9 + }, + { + "advantages": -2.0197459889459424e-05, + "completion_length": 1068.0, + "delta_ref_entropy_loss": 0.04345703125, + "delta_ref_ppl": -0.052490234375, + "entropy_loss": -0.1015625, + "epoch": 0.5338, + "grad_norm": 6.9155383117991605, + "k1_kl": 0.052490234375, + "k3_kl": 0.0390625, + "kimi_kl": 0.060791015625, + "learning_rate": 2.331e-07, + "loss": 0.0016, + "ppl": 0.05224609375, + "reward": 0.9876699447631836, + "reward_std": 0.0028521642088890076, + "rewards/perpo_ocr_edit_distance_reward": 0.9876700043678284, + "step": 2669, + "temperature": 0.9 + }, + { + "advantages": -4.8135010729311034e-05, + "completion_length": 474.0, + "delta_ref_entropy_loss": 0.046142578125, + "delta_ref_ppl": -0.048583984375, + "entropy_loss": -0.033203125, + "epoch": 0.534, + "grad_norm": 0.46214436918370805, + "k1_kl": 0.048583984375, + "k3_kl": 0.02880859375, + "kimi_kl": 0.08154296875, + "learning_rate": 2.33e-07, + "loss": 0.0012, + "ppl": 0.01446533203125, + "reward": 0.9966559410095215, + "reward_std": 0.0007847289089113474, + "rewards/perpo_ocr_edit_distance_reward": 0.9966560006141663, + "step": 2670, + "temperature": 0.9 + }, + { + "advantages": -3.405979782655777e-07, + "completion_length": 1132.0, + "delta_ref_entropy_loss": 0.032470703125, + "delta_ref_ppl": -0.07666015625, + "entropy_loss": -0.27734375, + "epoch": 0.5342, + "grad_norm": 3.5765105051069885, + "k1_kl": 0.07666015625, + "k3_kl": 0.0537109375, + "kimi_kl": 0.11328125, + "learning_rate": 2.3289999999999997e-07, + "loss": 0.0021, + "ppl": 0.14453125, + "reward": 0.8271142244338989, + "reward_std": 0.0267932265996933, + "rewards/perpo_ocr_edit_distance_reward": 0.8271142840385437, + "step": 2671, + "temperature": 0.9 + }, + { + "advantages": -5.91618700127583e-05, + "completion_length": 734.0, + "delta_ref_entropy_loss": 0.046630859375, + "delta_ref_ppl": -0.060791015625, + "entropy_loss": -0.04638671875, + "epoch": 0.5344, + "grad_norm": 0.5477687378130743, + "k1_kl": 0.060791015625, + "k3_kl": 0.03564453125, + "kimi_kl": 0.09716796875, + "learning_rate": 2.328e-07, + "loss": 0.0015, + "ppl": 0.0205078125, + "reward": 0.981591522693634, + "reward_std": 0.0004757413116749376, + "rewards/perpo_ocr_edit_distance_reward": 0.981591522693634, + "step": 2672, + "temperature": 0.9 + }, + { + "advantages": -8.685248758411035e-06, + "completion_length": 428.0, + "delta_ref_entropy_loss": 0.060302734375, + "delta_ref_ppl": -0.11474609375, + "entropy_loss": -0.1279296875, + "epoch": 0.5346, + "grad_norm": 0.9932015774921706, + "k1_kl": 0.1142578125, + "k3_kl": 0.08056640625, + "kimi_kl": 0.3125, + "learning_rate": 2.3269999999999999e-07, + "loss": 0.0032, + "ppl": 0.046142578125, + "reward": 0.7345856428146362, + "reward_std": 0.003821754362434149, + "rewards/perpo_ocr_edit_distance_reward": 0.734585702419281, + "step": 2673, + "temperature": 0.9 + }, + { + "advantages": -1.4168876077746972e-05, + "completion_length": 913.0, + "delta_ref_entropy_loss": 0.09716796875, + "delta_ref_ppl": -0.0693359375, + "entropy_loss": -0.208984375, + "epoch": 0.5348, + "grad_norm": 1.5403762692258935, + "k1_kl": 0.0693359375, + "k3_kl": 0.03564453125, + "kimi_kl": 0.06591796875, + "learning_rate": 2.3259999999999998e-07, + "loss": 0.0014, + "ppl": 0.11083984375, + "reward": 0.7782309055328369, + "reward_std": 0.0011013232870027423, + "rewards/perpo_ocr_edit_distance_reward": 0.7782309651374817, + "step": 2674, + "temperature": 0.9 + }, + { + "advantages": -2.1457672119140625e-06, + "completion_length": 135.0, + "delta_ref_entropy_loss": 0.01202392578125, + "delta_ref_ppl": -0.24609375, + "entropy_loss": -0.150390625, + "epoch": 0.535, + "grad_norm": 2.6805607421391384, + "k1_kl": 0.2470703125, + "k3_kl": 0.203125, + "kimi_kl": 0.99609375, + "learning_rate": 2.325e-07, + "loss": 0.0081, + "ppl": 0.060302734375, + "reward": 0.9819344282150269, + "reward_std": 0.0038444509264081717, + "rewards/perpo_ocr_edit_distance_reward": 0.9819344282150269, + "step": 2675, + "temperature": 0.9 + }, + { + "advantages": -1.743861685099546e-05, + "completion_length": 1892.0, + "delta_ref_entropy_loss": 0.021728515625, + "delta_ref_ppl": -0.044677734375, + "entropy_loss": -0.103515625, + "epoch": 0.5352, + "grad_norm": 1.2247693251845606, + "k1_kl": 0.044677734375, + "k3_kl": 0.035400390625, + "kimi_kl": 0.095703125, + "learning_rate": 2.324e-07, + "loss": 0.0014, + "ppl": 0.05078125, + "reward": 0.9583621621131897, + "reward_std": 0.0038057775236666203, + "rewards/perpo_ocr_edit_distance_reward": 0.9583622217178345, + "step": 2676, + "temperature": 0.9 + }, + { + "advantages": -1.9584383892379265e-07, + "completion_length": 179.0, + "delta_ref_entropy_loss": -0.0023651123046875, + "delta_ref_ppl": -0.341796875, + "entropy_loss": -0.390625, + "epoch": 0.5354, + "grad_norm": 3.5914348799307017, + "k1_kl": 0.341796875, + "k3_kl": 0.287109375, + "kimi_kl": 1.2890625, + "learning_rate": 2.323e-07, + "loss": 0.0115, + "ppl": 0.173828125, + "reward": 0.8042789697647095, + "reward_std": 0.2663964033126831, + "rewards/perpo_ocr_edit_distance_reward": 0.804279088973999, + "step": 2677, + "temperature": 0.9 + }, + { + "advantages": -2.7588437660597265e-05, + "completion_length": 775.0, + "delta_ref_entropy_loss": 0.0546875, + "delta_ref_ppl": -0.07666015625, + "entropy_loss": -0.10595703125, + "epoch": 0.5356, + "grad_norm": 1.106876708748646, + "k1_kl": 0.076171875, + "k3_kl": 0.046142578125, + "kimi_kl": 0.12060546875, + "learning_rate": 2.3219999999999997e-07, + "loss": 0.0019, + "ppl": 0.051513671875, + "reward": 0.7346546053886414, + "reward_std": 0.0011343040969222784, + "rewards/perpo_ocr_edit_distance_reward": 0.7346546053886414, + "step": 2678, + "temperature": 0.9 + }, + { + "advantages": -6.672314339084551e-05, + "completion_length": 761.0, + "delta_ref_entropy_loss": 0.034912109375, + "delta_ref_ppl": -0.053955078125, + "entropy_loss": -0.0322265625, + "epoch": 0.5358, + "grad_norm": 0.2330988792686791, + "k1_kl": 0.05419921875, + "k3_kl": 0.033447265625, + "kimi_kl": 0.11279296875, + "learning_rate": 2.321e-07, + "loss": 0.0014, + "ppl": 0.0089111328125, + "reward": 0.985859751701355, + "reward_std": 0.0002827892603818327, + "rewards/perpo_ocr_edit_distance_reward": 0.985859751701355, + "step": 2679, + "temperature": 0.9 + }, + { + "advantages": 7.95296273281565e-06, + "completion_length": 673.0, + "delta_ref_entropy_loss": 0.056640625, + "delta_ref_ppl": -0.05908203125, + "entropy_loss": -0.051025390625, + "epoch": 0.536, + "grad_norm": 0.5086264686323502, + "k1_kl": 0.05908203125, + "k3_kl": 0.037841796875, + "kimi_kl": 0.10986328125, + "learning_rate": 2.32e-07, + "loss": 0.0015, + "ppl": 0.02197265625, + "reward": 0.996008574962616, + "reward_std": 0.000969197484664619, + "rewards/perpo_ocr_edit_distance_reward": 0.9960086345672607, + "step": 2680, + "temperature": 0.9 + }, + { + "advantages": -4.584448834066279e-05, + "completion_length": 549.0, + "delta_ref_entropy_loss": 0.0277099609375, + "delta_ref_ppl": -0.046875, + "entropy_loss": -0.04052734375, + "epoch": 0.5362, + "grad_norm": 0.6468100350285849, + "k1_kl": 0.046630859375, + "k3_kl": 0.0303955078125, + "kimi_kl": 0.09228515625, + "learning_rate": 2.3189999999999998e-07, + "loss": 0.0013, + "ppl": 0.0137939453125, + "reward": 0.8320262432098389, + "reward_std": 0.0012005583848804235, + "rewards/perpo_ocr_edit_distance_reward": 0.8320263624191284, + "step": 2681, + "temperature": 0.9 + }, + { + "advantages": -7.330945663852617e-05, + "completion_length": 502.0, + "delta_ref_entropy_loss": 0.04833984375, + "delta_ref_ppl": -0.0595703125, + "entropy_loss": -0.032470703125, + "epoch": 0.5364, + "grad_norm": 0.291833138825031, + "k1_kl": 0.059814453125, + "k3_kl": 0.032958984375, + "kimi_kl": 0.0849609375, + "learning_rate": 2.318e-07, + "loss": 0.0014, + "ppl": 0.007293701171875, + "reward": 0.9858706593513489, + "reward_std": 0.0003647228586487472, + "rewards/perpo_ocr_edit_distance_reward": 0.9858706593513489, + "step": 2682, + "temperature": 0.9 + }, + { + "advantages": -6.675720669591101e-06, + "completion_length": 96.0, + "delta_ref_entropy_loss": 0.049560546875, + "delta_ref_ppl": -0.44921875, + "entropy_loss": -0.0966796875, + "epoch": 0.5366, + "grad_norm": 2.14723830355604, + "k1_kl": 0.451171875, + "k3_kl": 0.396484375, + "kimi_kl": 2.046875, + "learning_rate": 2.317e-07, + "loss": 0.0159, + "ppl": 0.03857421875, + "reward": 0.954068660736084, + "reward_std": 0.006274635437875986, + "rewards/perpo_ocr_edit_distance_reward": 0.9540687799453735, + "step": 2683, + "temperature": 0.9 + }, + { + "advantages": -6.883485184516758e-05, + "completion_length": 1155.0, + "delta_ref_entropy_loss": 0.054931640625, + "delta_ref_ppl": -0.06640625, + "entropy_loss": -0.06494140625, + "epoch": 0.5368, + "grad_norm": 0.44338545288614545, + "k1_kl": 0.06591796875, + "k3_kl": 0.039306640625, + "kimi_kl": 0.10546875, + "learning_rate": 2.3159999999999998e-07, + "loss": 0.0016, + "ppl": 0.024658203125, + "reward": 0.9390283226966858, + "reward_std": 0.0002710728149395436, + "rewards/perpo_ocr_edit_distance_reward": 0.9390284419059753, + "step": 2684, + "temperature": 0.9 + }, + { + "advantages": -8.378710845136084e-06, + "completion_length": 688.0, + "delta_ref_entropy_loss": 0.06494140625, + "delta_ref_ppl": -0.0712890625, + "entropy_loss": -0.1474609375, + "epoch": 0.537, + "grad_norm": 1.446008104177925, + "k1_kl": 0.0712890625, + "k3_kl": 0.04541015625, + "kimi_kl": 0.1298828125, + "learning_rate": 2.315e-07, + "loss": 0.0018, + "ppl": 0.0693359375, + "reward": 0.9726113677024841, + "reward_std": 0.005990330595523119, + "rewards/perpo_ocr_edit_distance_reward": 0.9726114869117737, + "step": 2685, + "temperature": 0.9 + }, + { + "advantages": -1.820496254367754e-05, + "completion_length": 394.0, + "delta_ref_entropy_loss": 0.045166015625, + "delta_ref_ppl": -0.08642578125, + "entropy_loss": -0.04150390625, + "epoch": 0.5372, + "grad_norm": 1.023924183923483, + "k1_kl": 0.08642578125, + "k3_kl": 0.0634765625, + "kimi_kl": 0.2392578125, + "learning_rate": 2.314e-07, + "loss": 0.0025, + "ppl": 0.017333984375, + "reward": 0.9942317605018616, + "reward_std": 0.0027079004794359207, + "rewards/perpo_ocr_edit_distance_reward": 0.9942318201065063, + "step": 2686, + "temperature": 0.9 + }, + { + "advantages": 1.7029899268550253e-08, + "completion_length": 772.0, + "delta_ref_entropy_loss": 0.049560546875, + "delta_ref_ppl": -0.0888671875, + "entropy_loss": -0.059814453125, + "epoch": 0.5374, + "grad_norm": 2.9149574315561755, + "k1_kl": 0.0888671875, + "k3_kl": 0.0625, + "kimi_kl": 0.2236328125, + "learning_rate": 2.3129999999999999e-07, + "loss": 0.0025, + "ppl": 0.0218505859375, + "reward": 0.9314824938774109, + "reward_std": 0.02102854661643505, + "rewards/perpo_ocr_edit_distance_reward": 0.9314824938774109, + "step": 2687, + "temperature": 0.9 + }, + { + "advantages": -1.4833041859674267e-05, + "completion_length": 540.0, + "delta_ref_entropy_loss": 0.07470703125, + "delta_ref_ppl": -0.1064453125, + "entropy_loss": -0.2021484375, + "epoch": 0.5376, + "grad_norm": 1.3643080336010234, + "k1_kl": 0.10595703125, + "k3_kl": 0.068359375, + "kimi_kl": 0.185546875, + "learning_rate": 2.3119999999999998e-07, + "loss": 0.0027, + "ppl": 0.095703125, + "reward": 0.9711647033691406, + "reward_std": 0.002196670975536108, + "rewards/perpo_ocr_edit_distance_reward": 0.9711647629737854, + "step": 2688, + "temperature": 0.9 + }, + { + "advantages": -9.145055810222402e-05, + "completion_length": 607.0, + "delta_ref_entropy_loss": 0.046875, + "delta_ref_ppl": -0.051513671875, + "entropy_loss": -0.030517578125, + "epoch": 0.5378, + "grad_norm": 0.44425471109217185, + "k1_kl": 0.051513671875, + "k3_kl": 0.0341796875, + "kimi_kl": 0.0947265625, + "learning_rate": 2.311e-07, + "loss": 0.0015, + "ppl": 0.01318359375, + "reward": 0.9982471466064453, + "reward_std": 0.00045846521970815957, + "rewards/perpo_ocr_edit_distance_reward": 0.9982472062110901, + "step": 2689, + "temperature": 0.9 + }, + { + "advantages": -3.814697265625e-06, + "completion_length": 1048.0, + "delta_ref_entropy_loss": 0.053955078125, + "delta_ref_ppl": -0.05712890625, + "entropy_loss": -0.0654296875, + "epoch": 0.538, + "grad_norm": 0.9810234439427431, + "k1_kl": 0.05712890625, + "k3_kl": 0.033935546875, + "kimi_kl": 0.0869140625, + "learning_rate": 2.31e-07, + "loss": 0.0014, + "ppl": 0.025634765625, + "reward": 0.9775810241699219, + "reward_std": 0.015457826666533947, + "rewards/perpo_ocr_edit_distance_reward": 0.9775810837745667, + "step": 2690, + "temperature": 0.9 + }, + { + "advantages": -4.3017524149036035e-05, + "completion_length": 789.0, + "delta_ref_entropy_loss": 0.03076171875, + "delta_ref_ppl": -0.0380859375, + "entropy_loss": -0.0419921875, + "epoch": 0.5382, + "grad_norm": 0.6321362898793224, + "k1_kl": 0.0380859375, + "k3_kl": 0.0216064453125, + "kimi_kl": 0.06298828125, + "learning_rate": 2.3089999999999998e-07, + "loss": 0.0009, + "ppl": 0.018310546875, + "reward": 0.9929118156433105, + "reward_std": 0.0020770628470927477, + "rewards/perpo_ocr_edit_distance_reward": 0.9929119348526001, + "step": 2691, + "temperature": 0.9 + }, + { + "advantages": -3.777231540880166e-05, + "completion_length": 321.0, + "delta_ref_entropy_loss": 0.036865234375, + "delta_ref_ppl": -0.10107421875, + "entropy_loss": -0.0458984375, + "epoch": 0.5384, + "grad_norm": 0.8895097468293279, + "k1_kl": 0.10107421875, + "k3_kl": 0.0791015625, + "kimi_kl": 0.318359375, + "learning_rate": 2.308e-07, + "loss": 0.0032, + "ppl": 0.0225830078125, + "reward": 0.9964930415153503, + "reward_std": 0.0019296611426398158, + "rewards/perpo_ocr_edit_distance_reward": 0.9964931011199951, + "step": 2692, + "temperature": 0.9 + }, + { + "advantages": -2.053805837931577e-05, + "completion_length": 547.0, + "delta_ref_entropy_loss": 0.0703125, + "delta_ref_ppl": -0.07568359375, + "entropy_loss": -0.08056640625, + "epoch": 0.5386, + "grad_norm": 0.7241541612914185, + "k1_kl": 0.07568359375, + "k3_kl": 0.044921875, + "kimi_kl": 0.111328125, + "learning_rate": 2.3069999999999997e-07, + "loss": 0.0018, + "ppl": 0.038818359375, + "reward": 0.9595945477485657, + "reward_std": 0.0015586495865136385, + "rewards/perpo_ocr_edit_distance_reward": 0.9595946073532104, + "step": 2693, + "temperature": 0.9 + }, + { + "advantages": -1.2091228427379974e-06, + "completion_length": 1298.0, + "delta_ref_entropy_loss": 0.01373291015625, + "delta_ref_ppl": -0.04638671875, + "entropy_loss": -0.11572265625, + "epoch": 0.5388, + "grad_norm": 1.700189981714115, + "k1_kl": 0.046142578125, + "k3_kl": 0.0322265625, + "kimi_kl": 0.076171875, + "learning_rate": 2.306e-07, + "loss": 0.0013, + "ppl": 0.06201171875, + "reward": 0.8136711716651917, + "reward_std": 0.042583949863910675, + "rewards/perpo_ocr_edit_distance_reward": 0.8136712312698364, + "step": 2694, + "temperature": 0.9 + }, + { + "advantages": 8.514949634275126e-09, + "completion_length": 208.0, + "delta_ref_entropy_loss": 0.0264892578125, + "delta_ref_ppl": -0.1552734375, + "entropy_loss": -0.068359375, + "epoch": 0.539, + "grad_norm": 1.791583780832238, + "k1_kl": 0.1552734375, + "k3_kl": 0.12158203125, + "kimi_kl": 0.45703125, + "learning_rate": 2.305e-07, + "loss": 0.0049, + "ppl": 0.0206298828125, + "reward": 0.9771378040313721, + "reward_std": 0.00397240137681365, + "rewards/perpo_ocr_edit_distance_reward": 0.9771378636360168, + "step": 2695, + "temperature": 0.9 + }, + { + "advantages": -0.00021016599202994257, + "completion_length": 1080.0, + "delta_ref_entropy_loss": 0.007720947265625, + "delta_ref_ppl": -0.0189208984375, + "entropy_loss": -0.021728515625, + "epoch": 0.5392, + "grad_norm": 0.2763269072873763, + "k1_kl": 0.01904296875, + "k3_kl": 0.0125732421875, + "kimi_kl": 0.035400390625, + "learning_rate": 2.3039999999999997e-07, + "loss": 0.0007, + "ppl": 0.00616455078125, + "reward": 0.9977368712425232, + "reward_std": 0.00026462756795808673, + "rewards/perpo_ocr_edit_distance_reward": 0.9977369904518127, + "step": 2696, + "temperature": 0.9 + }, + { + "advantages": -8.116450044326484e-05, + "completion_length": 364.0, + "delta_ref_entropy_loss": 0.026611328125, + "delta_ref_ppl": -0.0947265625, + "entropy_loss": -0.03564453125, + "epoch": 0.5394, + "grad_norm": 0.595543232819597, + "k1_kl": 0.0947265625, + "k3_kl": 0.08056640625, + "kimi_kl": 0.322265625, + "learning_rate": 2.303e-07, + "loss": 0.0033, + "ppl": 0.01336669921875, + "reward": 0.997377336025238, + "reward_std": 0.001263582962565124, + "rewards/perpo_ocr_edit_distance_reward": 0.9973774552345276, + "step": 2697, + "temperature": 0.9 + }, + { + "advantages": -6.449222564697266e-05, + "completion_length": 502.0, + "delta_ref_entropy_loss": 0.0732421875, + "delta_ref_ppl": -0.0751953125, + "entropy_loss": -0.059814453125, + "epoch": 0.5396, + "grad_norm": 0.8806653032973663, + "k1_kl": 0.0751953125, + "k3_kl": 0.04248046875, + "kimi_kl": 0.115234375, + "learning_rate": 2.3019999999999998e-07, + "loss": 0.0018, + "ppl": 0.023193359375, + "reward": 0.9899117350578308, + "reward_std": 0.0005599399446509778, + "rewards/perpo_ocr_edit_distance_reward": 0.9899118542671204, + "step": 2698, + "temperature": 0.9 + }, + { + "advantages": -8.855547548591858e-07, + "completion_length": 705.0, + "delta_ref_entropy_loss": 0.028076171875, + "delta_ref_ppl": -0.07958984375, + "entropy_loss": -0.11474609375, + "epoch": 0.5398, + "grad_norm": 1.6441671933336712, + "k1_kl": 0.07958984375, + "k3_kl": 0.055908203125, + "kimi_kl": 0.177734375, + "learning_rate": 2.3009999999999998e-07, + "loss": 0.0022, + "ppl": 0.054443359375, + "reward": 0.964617133140564, + "reward_std": 0.019005054607987404, + "rewards/perpo_ocr_edit_distance_reward": 0.964617133140564, + "step": 2699, + "temperature": 0.9 + }, + { + "advantages": -3.831727372016758e-05, + "completion_length": 1239.0, + "delta_ref_entropy_loss": 0.0286865234375, + "delta_ref_ppl": -0.039794921875, + "entropy_loss": -0.06884765625, + "epoch": 0.54, + "grad_norm": 1.7707535337698033, + "k1_kl": 0.039794921875, + "k3_kl": 0.02734375, + "kimi_kl": 0.07958984375, + "learning_rate": 2.3e-07, + "loss": 0.0011, + "ppl": 0.0289306640625, + "reward": 0.9915070533752441, + "reward_std": 0.0007894195732660592, + "rewards/perpo_ocr_edit_distance_reward": 0.9915071129798889, + "step": 2700, + "temperature": 0.9 + }, + { + "advantages": 3.889629078912549e-05, + "completion_length": 751.0, + "delta_ref_entropy_loss": 0.059814453125, + "delta_ref_ppl": -0.06298828125, + "entropy_loss": -0.0269775390625, + "epoch": 0.5402, + "grad_norm": 0.31931478366307253, + "k1_kl": 0.06298828125, + "k3_kl": 0.035888671875, + "kimi_kl": 0.10400390625, + "learning_rate": 2.299e-07, + "loss": 0.0014, + "ppl": 0.0089111328125, + "reward": 0.99765944480896, + "reward_std": 0.0003378286201041192, + "rewards/perpo_ocr_edit_distance_reward": 0.99765944480896, + "step": 2701, + "temperature": 0.9 + }, + { + "advantages": -3.652913437690586e-05, + "completion_length": 252.0, + "delta_ref_entropy_loss": 0.045654296875, + "delta_ref_ppl": -0.2021484375, + "entropy_loss": -0.0830078125, + "epoch": 0.5404, + "grad_norm": 0.9316826522013177, + "k1_kl": 0.2021484375, + "k3_kl": 0.1552734375, + "kimi_kl": 0.65625, + "learning_rate": 2.298e-07, + "loss": 0.0062, + "ppl": 0.02587890625, + "reward": 0.7284436225891113, + "reward_std": 0.0015315909404307604, + "rewards/perpo_ocr_edit_distance_reward": 0.7284436821937561, + "step": 2702, + "temperature": 0.9 + }, + { + "advantages": -8.39574022393208e-06, + "completion_length": 273.0, + "delta_ref_entropy_loss": 0.11083984375, + "delta_ref_ppl": -0.2119140625, + "entropy_loss": -0.177734375, + "epoch": 0.5406, + "grad_norm": 2.7174609316659635, + "k1_kl": 0.212890625, + "k3_kl": 0.16015625, + "kimi_kl": 0.6640625, + "learning_rate": 2.2969999999999997e-07, + "loss": 0.0064, + "ppl": 0.095703125, + "reward": 0.9484373927116394, + "reward_std": 0.008018131367862225, + "rewards/perpo_ocr_edit_distance_reward": 0.9484375715255737, + "step": 2703, + "temperature": 0.9 + }, + { + "advantages": -0.00010899986955337226, + "completion_length": 1017.0, + "delta_ref_entropy_loss": 0.05859375, + "delta_ref_ppl": -0.064453125, + "entropy_loss": -0.07568359375, + "epoch": 0.5408, + "grad_norm": 1.036911265060968, + "k1_kl": 0.064453125, + "k3_kl": 0.038330078125, + "kimi_kl": 0.1171875, + "learning_rate": 2.296e-07, + "loss": 0.0016, + "ppl": 0.033935546875, + "reward": 0.9915698766708374, + "reward_std": 0.0007594111957587302, + "rewards/perpo_ocr_edit_distance_reward": 0.991569995880127, + "step": 2704, + "temperature": 0.9 + }, + { + "advantages": -2.0555087758111767e-05, + "completion_length": 237.0, + "delta_ref_entropy_loss": 0.08251953125, + "delta_ref_ppl": -0.171875, + "entropy_loss": -0.08935546875, + "epoch": 0.541, + "grad_norm": 1.2494040248513731, + "k1_kl": 0.171875, + "k3_kl": 0.12353515625, + "kimi_kl": 0.478515625, + "learning_rate": 2.295e-07, + "loss": 0.005, + "ppl": 0.0302734375, + "reward": 0.9626373648643494, + "reward_std": 0.0027998061850667, + "rewards/perpo_ocr_edit_distance_reward": 0.9626374244689941, + "step": 2705, + "temperature": 0.9 + }, + { + "advantages": 3.1845911507843994e-06, + "completion_length": 1011.0, + "delta_ref_entropy_loss": 0.0830078125, + "delta_ref_ppl": -0.091796875, + "entropy_loss": -0.2392578125, + "epoch": 0.5412, + "grad_norm": 1.6989472740532128, + "k1_kl": 0.09228515625, + "k3_kl": 0.057373046875, + "kimi_kl": 0.12353515625, + "learning_rate": 2.2939999999999998e-07, + "loss": 0.0023, + "ppl": 0.1201171875, + "reward": 0.9124735593795776, + "reward_std": 0.007885059341788292, + "rewards/perpo_ocr_edit_distance_reward": 0.9124735593795776, + "step": 2706, + "temperature": 0.9 + }, + { + "advantages": -1.1937959243368823e-05, + "completion_length": 1191.0, + "delta_ref_entropy_loss": 0.0198974609375, + "delta_ref_ppl": -0.03515625, + "entropy_loss": -0.05126953125, + "epoch": 0.5414, + "grad_norm": 0.4465424925158644, + "k1_kl": 0.03515625, + "k3_kl": 0.020751953125, + "kimi_kl": 0.043701171875, + "learning_rate": 2.293e-07, + "loss": 0.0008, + "ppl": 0.02099609375, + "reward": 0.9941729307174683, + "reward_std": 0.00204100226983428, + "rewards/perpo_ocr_edit_distance_reward": 0.9941729307174683, + "step": 2707, + "temperature": 0.9 + }, + { + "advantages": -7.488046685466543e-05, + "completion_length": 457.0, + "delta_ref_entropy_loss": 0.037109375, + "delta_ref_ppl": -0.0986328125, + "entropy_loss": -0.052490234375, + "epoch": 0.5416, + "grad_norm": 11.34528522097638, + "k1_kl": 0.0986328125, + "k3_kl": 0.0732421875, + "kimi_kl": 0.28515625, + "learning_rate": 2.292e-07, + "loss": 0.003, + "ppl": 0.025634765625, + "reward": 0.8753767609596252, + "reward_std": 0.0008098722901195288, + "rewards/perpo_ocr_edit_distance_reward": 0.8753768801689148, + "step": 2708, + "temperature": 0.9 + }, + { + "advantages": -2.557465086283628e-05, + "completion_length": 990.0, + "delta_ref_entropy_loss": 0.034912109375, + "delta_ref_ppl": -0.050537109375, + "entropy_loss": -0.0546875, + "epoch": 0.5418, + "grad_norm": 0.5099118334520709, + "k1_kl": 0.050537109375, + "k3_kl": 0.03125, + "kimi_kl": 0.07763671875, + "learning_rate": 2.2909999999999998e-07, + "loss": 0.0013, + "ppl": 0.0224609375, + "reward": 0.9914989471435547, + "reward_std": 0.0005660626338794827, + "rewards/perpo_ocr_edit_distance_reward": 0.9914990663528442, + "step": 2709, + "temperature": 0.9 + }, + { + "advantages": -2.0257064534234814e-05, + "completion_length": 780.0, + "delta_ref_entropy_loss": 0.068359375, + "delta_ref_ppl": -0.0986328125, + "entropy_loss": -0.1630859375, + "epoch": 0.542, + "grad_norm": 1.0094376846658457, + "k1_kl": 0.0986328125, + "k3_kl": 0.0615234375, + "kimi_kl": 0.154296875, + "learning_rate": 2.29e-07, + "loss": 0.0025, + "ppl": 0.076171875, + "reward": 0.9090253710746765, + "reward_std": 0.0011613357346504927, + "rewards/perpo_ocr_edit_distance_reward": 0.9090253710746765, + "step": 2710, + "temperature": 0.9 + }, + { + "advantages": -4.930155682814075e-06, + "completion_length": 481.0, + "delta_ref_entropy_loss": 0.0390625, + "delta_ref_ppl": -0.0966796875, + "entropy_loss": -0.1005859375, + "epoch": 0.5422, + "grad_norm": 1.2370439020906532, + "k1_kl": 0.0966796875, + "k3_kl": 0.064453125, + "kimi_kl": 0.1865234375, + "learning_rate": 2.289e-07, + "loss": 0.0026, + "ppl": 0.03759765625, + "reward": 0.949091374874115, + "reward_std": 0.01025715284049511, + "rewards/perpo_ocr_edit_distance_reward": 0.9490914344787598, + "step": 2711, + "temperature": 0.9 + }, + { + "advantages": -3.1897001463221386e-05, + "completion_length": 358.0, + "delta_ref_entropy_loss": 0.078125, + "delta_ref_ppl": -0.142578125, + "entropy_loss": -0.06103515625, + "epoch": 0.5424, + "grad_norm": 0.8720855351384302, + "k1_kl": 0.142578125, + "k3_kl": 0.099609375, + "kimi_kl": 0.4296875, + "learning_rate": 2.2879999999999998e-07, + "loss": 0.004, + "ppl": 0.0230712890625, + "reward": 0.9327995181083679, + "reward_std": 0.0007006775122135878, + "rewards/perpo_ocr_edit_distance_reward": 0.9327995777130127, + "step": 2712, + "temperature": 0.9 + }, + { + "advantages": -0.00013183696137275547, + "completion_length": 750.0, + "delta_ref_entropy_loss": 0.041259765625, + "delta_ref_ppl": -0.0380859375, + "entropy_loss": -0.04833984375, + "epoch": 0.5426, + "grad_norm": 0.4604109962423117, + "k1_kl": 0.0380859375, + "k3_kl": 0.0198974609375, + "kimi_kl": 0.048095703125, + "learning_rate": 2.2869999999999998e-07, + "loss": 0.0009, + "ppl": 0.0208740234375, + "reward": 0.997058629989624, + "reward_std": 0.00035206309985369444, + "rewards/perpo_ocr_edit_distance_reward": 0.9970586895942688, + "step": 2713, + "temperature": 0.9 + }, + { + "advantages": -5.350794526748359e-05, + "completion_length": 652.0, + "delta_ref_entropy_loss": 0.0654296875, + "delta_ref_ppl": -0.076171875, + "entropy_loss": -0.0703125, + "epoch": 0.5428, + "grad_norm": 0.6759893833584111, + "k1_kl": 0.076171875, + "k3_kl": 0.0439453125, + "kimi_kl": 0.09619140625, + "learning_rate": 2.286e-07, + "loss": 0.0018, + "ppl": 0.031982421875, + "reward": 0.9838528037071228, + "reward_std": 0.0006957692676223814, + "rewards/perpo_ocr_edit_distance_reward": 0.9838529229164124, + "step": 2714, + "temperature": 0.9 + }, + { + "advantages": -3.136055966024287e-05, + "completion_length": 131.0, + "delta_ref_entropy_loss": 0.083984375, + "delta_ref_ppl": -0.267578125, + "entropy_loss": -0.103515625, + "epoch": 0.543, + "grad_norm": 1.7842261760475362, + "k1_kl": 0.267578125, + "k3_kl": 0.19921875, + "kimi_kl": 0.7109375, + "learning_rate": 2.285e-07, + "loss": 0.008, + "ppl": 0.043701171875, + "reward": 0.9837661981582642, + "reward_std": 0.0026154243387281895, + "rewards/perpo_ocr_edit_distance_reward": 0.9837663173675537, + "step": 2715, + "temperature": 0.9 + }, + { + "advantages": -4.219157563056797e-05, + "completion_length": 547.0, + "delta_ref_entropy_loss": 0.007476806640625, + "delta_ref_ppl": -0.03515625, + "entropy_loss": -0.036865234375, + "epoch": 0.5432, + "grad_norm": 0.5488285265941929, + "k1_kl": 0.03515625, + "k3_kl": 0.0257568359375, + "kimi_kl": 0.0810546875, + "learning_rate": 2.2839999999999998e-07, + "loss": 0.0011, + "ppl": 0.01007080078125, + "reward": 0.9882014989852905, + "reward_std": 0.001917659305036068, + "rewards/perpo_ocr_edit_distance_reward": 0.9882016181945801, + "step": 2716, + "temperature": 0.9 + }, + { + "advantages": -1.6382762623834424e-05, + "completion_length": 63.0, + "delta_ref_entropy_loss": 0.158203125, + "delta_ref_ppl": -0.53125, + "entropy_loss": -0.208984375, + "epoch": 0.5434, + "grad_norm": 5.2978824905332464, + "k1_kl": 0.53125, + "k3_kl": 0.40625, + "kimi_kl": 1.3359375, + "learning_rate": 2.283e-07, + "loss": 0.0163, + "ppl": 0.09619140625, + "reward": 0.9706717133522034, + "reward_std": 0.003539878176525235, + "rewards/perpo_ocr_edit_distance_reward": 0.9706717729568481, + "step": 2717, + "temperature": 0.9 + }, + { + "advantages": -6.658690836047754e-06, + "completion_length": 639.0, + "delta_ref_entropy_loss": 0.1376953125, + "delta_ref_ppl": -0.1533203125, + "entropy_loss": -0.33984375, + "epoch": 0.5436, + "grad_norm": 2.0622292227632992, + "k1_kl": 0.1533203125, + "k3_kl": 0.09619140625, + "kimi_kl": 0.2373046875, + "learning_rate": 2.2819999999999997e-07, + "loss": 0.0039, + "ppl": 0.1787109375, + "reward": 0.7645864486694336, + "reward_std": 0.00503171281889081, + "rewards/perpo_ocr_edit_distance_reward": 0.7645865678787231, + "step": 2718, + "temperature": 0.9 + }, + { + "advantages": -4.615102807292715e-05, + "completion_length": 386.0, + "delta_ref_entropy_loss": 0.04443359375, + "delta_ref_ppl": -0.08740234375, + "entropy_loss": -0.051513671875, + "epoch": 0.5438, + "grad_norm": 0.7203677105181125, + "k1_kl": 0.08740234375, + "k3_kl": 0.0576171875, + "kimi_kl": 0.162109375, + "learning_rate": 2.2809999999999998e-07, + "loss": 0.0024, + "ppl": 0.021484375, + "reward": 0.9837546348571777, + "reward_std": 0.0011913224589079618, + "rewards/perpo_ocr_edit_distance_reward": 0.9837547540664673, + "step": 2719, + "temperature": 0.9 + }, + { + "advantages": -6.428786946344189e-06, + "completion_length": 110.0, + "delta_ref_entropy_loss": 0.025634765625, + "delta_ref_ppl": -0.26171875, + "entropy_loss": -0.10205078125, + "epoch": 0.544, + "grad_norm": 1.4627164701899498, + "k1_kl": 0.26171875, + "k3_kl": 0.216796875, + "kimi_kl": 1.4609375, + "learning_rate": 2.28e-07, + "loss": 0.0087, + "ppl": 0.035888671875, + "reward": 0.9769585728645325, + "reward_std": 0.0038698259741067886, + "rewards/perpo_ocr_edit_distance_reward": 0.9769585728645325, + "step": 2720, + "temperature": 0.9 + }, + { + "advantages": -3.2356808787881164e-06, + "completion_length": 319.0, + "delta_ref_entropy_loss": 0.051513671875, + "delta_ref_ppl": -0.1630859375, + "entropy_loss": -0.1474609375, + "epoch": 0.5442, + "grad_norm": 2.3381457533361645, + "k1_kl": 0.1630859375, + "k3_kl": 0.11376953125, + "kimi_kl": 0.388671875, + "learning_rate": 2.2789999999999997e-07, + "loss": 0.0046, + "ppl": 0.05517578125, + "reward": 0.9587254524230957, + "reward_std": 0.015685558319091797, + "rewards/perpo_ocr_edit_distance_reward": 0.9587255120277405, + "step": 2721, + "temperature": 0.9 + }, + { + "advantages": -0.000159008166519925, + "completion_length": 487.0, + "delta_ref_entropy_loss": 0.0257568359375, + "delta_ref_ppl": -0.07421875, + "entropy_loss": -0.034912109375, + "epoch": 0.5444, + "grad_norm": 0.28681929408439844, + "k1_kl": 0.07470703125, + "k3_kl": 0.055908203125, + "kimi_kl": 0.2060546875, + "learning_rate": 2.278e-07, + "loss": 0.0024, + "ppl": 0.01007080078125, + "reward": 0.993286669254303, + "reward_std": 0.0003819921694230288, + "rewards/perpo_ocr_edit_distance_reward": 0.9932867288589478, + "step": 2722, + "temperature": 0.9 + }, + { + "advantages": -1.9482204152154736e-05, + "completion_length": 679.0, + "delta_ref_entropy_loss": 0.03564453125, + "delta_ref_ppl": -0.048828125, + "entropy_loss": -0.057373046875, + "epoch": 0.5446, + "grad_norm": 0.7632556000463055, + "k1_kl": 0.048828125, + "k3_kl": 0.0322265625, + "kimi_kl": 0.11767578125, + "learning_rate": 2.277e-07, + "loss": 0.0013, + "ppl": 0.0262451171875, + "reward": 0.9922033548355103, + "reward_std": 0.0033954367972910404, + "rewards/perpo_ocr_edit_distance_reward": 0.992203414440155, + "step": 2723, + "temperature": 0.9 + }, + { + "advantages": -2.4454935555695556e-05, + "completion_length": 603.0, + "delta_ref_entropy_loss": 0.05078125, + "delta_ref_ppl": -0.062255859375, + "entropy_loss": -0.10205078125, + "epoch": 0.5448, + "grad_norm": 3.074165732478414, + "k1_kl": 0.0625, + "k3_kl": 0.03857421875, + "kimi_kl": 0.09423828125, + "learning_rate": 2.2759999999999997e-07, + "loss": 0.0016, + "ppl": 0.046875, + "reward": 0.9346477389335632, + "reward_std": 0.001640479196794331, + "rewards/perpo_ocr_edit_distance_reward": 0.9346478581428528, + "step": 2724, + "temperature": 0.9 + }, + { + "advantages": -8.514949456639442e-08, + "completion_length": 363.0, + "delta_ref_entropy_loss": -0.049072265625, + "delta_ref_ppl": -0.10400390625, + "entropy_loss": -0.39453125, + "epoch": 0.545, + "grad_norm": 2.185724994570508, + "k1_kl": 0.1044921875, + "k3_kl": 0.09375, + "kimi_kl": 0.2431640625, + "learning_rate": 2.275e-07, + "loss": 0.0038, + "ppl": 0.1787109375, + "reward": 0.7073244452476501, + "reward_std": 0.20239394903182983, + "rewards/perpo_ocr_edit_distance_reward": 0.7073244452476501, + "step": 2725, + "temperature": 0.9 + }, + { + "advantages": -2.7503287128638476e-05, + "completion_length": 435.0, + "delta_ref_entropy_loss": 0.05029296875, + "delta_ref_ppl": -0.09912109375, + "entropy_loss": -0.056640625, + "epoch": 0.5452, + "grad_norm": 0.9220843947337876, + "k1_kl": 0.09912109375, + "k3_kl": 0.0732421875, + "kimi_kl": 0.26171875, + "learning_rate": 2.2739999999999998e-07, + "loss": 0.003, + "ppl": 0.0230712890625, + "reward": 0.9943772554397583, + "reward_std": 0.0011399344075471163, + "rewards/perpo_ocr_edit_distance_reward": 0.9943773150444031, + "step": 2726, + "temperature": 0.9 + }, + { + "advantages": -1.5633448128937744e-05, + "completion_length": 215.0, + "delta_ref_entropy_loss": 0.06298828125, + "delta_ref_ppl": -0.16015625, + "entropy_loss": -0.087890625, + "epoch": 0.5454, + "grad_norm": 1.6772449516380252, + "k1_kl": 0.1591796875, + "k3_kl": 0.1171875, + "kimi_kl": 0.498046875, + "learning_rate": 2.273e-07, + "loss": 0.0047, + "ppl": 0.035400390625, + "reward": 0.9384379982948303, + "reward_std": 0.0020798665937036276, + "rewards/perpo_ocr_edit_distance_reward": 0.9384380578994751, + "step": 2727, + "temperature": 0.9 + }, + { + "advantages": 2.912112677222467e-06, + "completion_length": 948.0, + "delta_ref_entropy_loss": 0.0390625, + "delta_ref_ppl": -0.04736328125, + "entropy_loss": -0.12890625, + "epoch": 0.5456, + "grad_norm": 1.64270911655136, + "k1_kl": 0.04736328125, + "k3_kl": 0.02978515625, + "kimi_kl": 0.06201171875, + "learning_rate": 2.272e-07, + "loss": 0.0012, + "ppl": 0.056396484375, + "reward": 0.7833423018455505, + "reward_std": 0.0057453857734799385, + "rewards/perpo_ocr_edit_distance_reward": 0.7833423018455505, + "step": 2728, + "temperature": 0.9 + }, + { + "advantages": -3.821509380941279e-05, + "completion_length": 123.0, + "delta_ref_entropy_loss": 0.12255859375, + "delta_ref_ppl": -0.265625, + "entropy_loss": -0.123046875, + "epoch": 0.5458, + "grad_norm": 1.8131750019678325, + "k1_kl": 0.265625, + "k3_kl": 0.2021484375, + "kimi_kl": 0.8359375, + "learning_rate": 2.271e-07, + "loss": 0.0081, + "ppl": 0.048828125, + "reward": 0.9931136965751648, + "reward_std": 0.0012365736765787005, + "rewards/perpo_ocr_edit_distance_reward": 0.9931137561798096, + "step": 2729, + "temperature": 0.9 + }, + { + "advantages": -4.081215229234658e-05, + "completion_length": 492.0, + "delta_ref_entropy_loss": 0.04638671875, + "delta_ref_ppl": -0.0732421875, + "entropy_loss": -0.036865234375, + "epoch": 0.546, + "grad_norm": 0.5584172464521412, + "k1_kl": 0.0732421875, + "k3_kl": 0.053466796875, + "kimi_kl": 0.1689453125, + "learning_rate": 2.27e-07, + "loss": 0.0022, + "ppl": 0.01708984375, + "reward": 0.9961601495742798, + "reward_std": 0.0009431492653675377, + "rewards/perpo_ocr_edit_distance_reward": 0.9961601495742798, + "step": 2730, + "temperature": 0.9 + }, + { + "advantages": -1.1427062418079004e-05, + "completion_length": 1320.0, + "delta_ref_entropy_loss": 0.08837890625, + "delta_ref_ppl": -0.11083984375, + "entropy_loss": -0.21484375, + "epoch": 0.5462, + "grad_norm": 1.6666239347907152, + "k1_kl": 0.11083984375, + "k3_kl": 0.0693359375, + "kimi_kl": 0.1328125, + "learning_rate": 2.2689999999999997e-07, + "loss": 0.0028, + "ppl": 0.109375, + "reward": 0.9236820340156555, + "reward_std": 0.0013897004537284374, + "rewards/perpo_ocr_edit_distance_reward": 0.9236820936203003, + "step": 2731, + "temperature": 0.9 + }, + { + "advantages": -4.765817357110791e-05, + "completion_length": 568.0, + "delta_ref_entropy_loss": 0.061279296875, + "delta_ref_ppl": -0.08154296875, + "entropy_loss": -0.061767578125, + "epoch": 0.5464, + "grad_norm": 0.38697009628818996, + "k1_kl": 0.08203125, + "k3_kl": 0.050537109375, + "kimi_kl": 0.1533203125, + "learning_rate": 2.268e-07, + "loss": 0.0021, + "ppl": 0.02392578125, + "reward": 0.9966189861297607, + "reward_std": 0.0004360930761322379, + "rewards/perpo_ocr_edit_distance_reward": 0.9966189861297607, + "step": 2732, + "temperature": 0.9 + }, + { + "advantages": -0.00012063980830134824, + "completion_length": 587.0, + "delta_ref_entropy_loss": 0.042236328125, + "delta_ref_ppl": -0.055419921875, + "entropy_loss": -0.0361328125, + "epoch": 0.5466, + "grad_norm": 0.6153506298647398, + "k1_kl": 0.055419921875, + "k3_kl": 0.03173828125, + "kimi_kl": 0.08935546875, + "learning_rate": 2.267e-07, + "loss": 0.0014, + "ppl": 0.01348876953125, + "reward": 0.998532772064209, + "reward_std": 0.0006765133002772927, + "rewards/perpo_ocr_edit_distance_reward": 0.9985328912734985, + "step": 2733, + "temperature": 0.9 + }, + { + "advantages": 1.7029899268550253e-08, + "completion_length": 393.0, + "delta_ref_entropy_loss": 0.064453125, + "delta_ref_ppl": -0.083984375, + "entropy_loss": -0.05078125, + "epoch": 0.5468, + "grad_norm": 0.5732373329104867, + "k1_kl": 0.083984375, + "k3_kl": 0.05126953125, + "kimi_kl": 0.1201171875, + "learning_rate": 2.2659999999999998e-07, + "loss": 0.002, + "ppl": 0.01531982421875, + "reward": 0.9813316464424133, + "reward_std": 0.0010148269357159734, + "rewards/perpo_ocr_edit_distance_reward": 0.9813315868377686, + "step": 2734, + "temperature": 0.9 + }, + { + "advantages": -6.757464143447578e-05, + "completion_length": 597.0, + "delta_ref_entropy_loss": 0.057861328125, + "delta_ref_ppl": -0.05029296875, + "entropy_loss": -0.0556640625, + "epoch": 0.547, + "grad_norm": 0.7327248903286246, + "k1_kl": 0.05029296875, + "k3_kl": 0.0250244140625, + "kimi_kl": 0.052734375, + "learning_rate": 2.265e-07, + "loss": 0.0011, + "ppl": 0.020263671875, + "reward": 0.9958474636077881, + "reward_std": 0.0006558905588462949, + "rewards/perpo_ocr_edit_distance_reward": 0.9958475828170776, + "step": 2735, + "temperature": 0.9 + }, + { + "advantages": -5.21710971952416e-05, + "completion_length": 827.0, + "delta_ref_entropy_loss": 0.043212890625, + "delta_ref_ppl": -0.043212890625, + "entropy_loss": -0.05078125, + "epoch": 0.5472, + "grad_norm": 0.5226415174473997, + "k1_kl": 0.043212890625, + "k3_kl": 0.0240478515625, + "kimi_kl": 0.0615234375, + "learning_rate": 2.264e-07, + "loss": 0.001, + "ppl": 0.02001953125, + "reward": 0.9978722929954529, + "reward_std": 0.001532515394501388, + "rewards/perpo_ocr_edit_distance_reward": 0.9978723526000977, + "step": 2736, + "temperature": 0.9 + }, + { + "advantages": -1.6514744856976904e-05, + "completion_length": 200.0, + "delta_ref_entropy_loss": 0.044677734375, + "delta_ref_ppl": -0.18359375, + "entropy_loss": -0.10888671875, + "epoch": 0.5474, + "grad_norm": 1.625769587732468, + "k1_kl": 0.18359375, + "k3_kl": 0.1376953125, + "kimi_kl": 0.53515625, + "learning_rate": 2.2629999999999998e-07, + "loss": 0.0055, + "ppl": 0.0537109375, + "reward": 0.9792358875274658, + "reward_std": 0.0029922390822321177, + "rewards/perpo_ocr_edit_distance_reward": 0.9792359471321106, + "step": 2737, + "temperature": 0.9 + }, + { + "advantages": -2.448899431328755e-05, + "completion_length": 427.0, + "delta_ref_entropy_loss": 0.058837890625, + "delta_ref_ppl": -0.11376953125, + "entropy_loss": -0.06689453125, + "epoch": 0.5476, + "grad_norm": 0.5828742084294062, + "k1_kl": 0.11376953125, + "k3_kl": 0.078125, + "kimi_kl": 0.279296875, + "learning_rate": 2.262e-07, + "loss": 0.0032, + "ppl": 0.0242919921875, + "reward": 0.9791200160980225, + "reward_std": 0.0016367633361369371, + "rewards/perpo_ocr_edit_distance_reward": 0.9791200160980225, + "step": 2738, + "temperature": 0.9 + }, + { + "advantages": -6.811959565311554e-07, + "completion_length": 1016.0, + "delta_ref_entropy_loss": -0.01251220703125, + "delta_ref_ppl": -0.0537109375, + "entropy_loss": -0.1953125, + "epoch": 0.5478, + "grad_norm": 2.0196480147827587, + "k1_kl": 0.053466796875, + "k3_kl": 0.043212890625, + "kimi_kl": 0.08642578125, + "learning_rate": 2.261e-07, + "loss": 0.0017, + "ppl": 0.1025390625, + "reward": 0.9314526915550232, + "reward_std": 0.061948273330926895, + "rewards/perpo_ocr_edit_distance_reward": 0.9314528107643127, + "step": 2739, + "temperature": 0.9 + }, + { + "advantages": -3.076025677728467e-05, + "completion_length": 382.0, + "delta_ref_entropy_loss": 0.045166015625, + "delta_ref_ppl": -0.09619140625, + "entropy_loss": -0.072265625, + "epoch": 0.548, + "grad_norm": 1.2043232440879879, + "k1_kl": 0.095703125, + "k3_kl": 0.0673828125, + "kimi_kl": 0.1845703125, + "learning_rate": 2.2599999999999999e-07, + "loss": 0.0027, + "ppl": 0.03466796875, + "reward": 0.9604616761207581, + "reward_std": 0.001007415703497827, + "rewards/perpo_ocr_edit_distance_reward": 0.9604617953300476, + "step": 2740, + "temperature": 0.9 + }, + { + "advantages": -1.7506736185168847e-05, + "completion_length": 782.0, + "delta_ref_entropy_loss": 0.06396484375, + "delta_ref_ppl": -0.0576171875, + "entropy_loss": -0.1142578125, + "epoch": 0.5482, + "grad_norm": 1.676303118044544, + "k1_kl": 0.057373046875, + "k3_kl": 0.03564453125, + "kimi_kl": 0.06591796875, + "learning_rate": 2.2589999999999998e-07, + "loss": 0.0014, + "ppl": 0.055908203125, + "reward": 0.9578714966773987, + "reward_std": 0.0018455665558576584, + "rewards/perpo_ocr_edit_distance_reward": 0.9578715562820435, + "step": 2741, + "temperature": 0.9 + }, + { + "advantages": -0.00012963158951606601, + "completion_length": 376.0, + "delta_ref_entropy_loss": 0.054931640625, + "delta_ref_ppl": -0.046875, + "entropy_loss": -0.04931640625, + "epoch": 0.5484, + "grad_norm": 0.5777056783332833, + "k1_kl": 0.04638671875, + "k3_kl": 0.0260009765625, + "kimi_kl": 0.0849609375, + "learning_rate": 2.258e-07, + "loss": 0.0012, + "ppl": 0.01416015625, + "reward": 0.9689827561378479, + "reward_std": 0.00029413128504529595, + "rewards/perpo_ocr_edit_distance_reward": 0.9689828157424927, + "step": 2742, + "temperature": 0.9 + }, + { + "advantages": 1.7029899268550253e-08, + "completion_length": 1004.0, + "delta_ref_entropy_loss": 0.0294189453125, + "delta_ref_ppl": -0.049560546875, + "entropy_loss": -0.061767578125, + "epoch": 0.5486, + "grad_norm": 2.9425271831366646, + "k1_kl": 0.049560546875, + "k3_kl": 0.0341796875, + "kimi_kl": 0.07861328125, + "learning_rate": 2.257e-07, + "loss": 0.0014, + "ppl": 0.02587890625, + "reward": 0.9723829030990601, + "reward_std": 0.008678308688104153, + "rewards/perpo_ocr_edit_distance_reward": 0.9723829030990601, + "step": 2743, + "temperature": 0.9 + }, + { + "advantages": -6.668908463325351e-05, + "completion_length": 791.0, + "delta_ref_entropy_loss": 0.054443359375, + "delta_ref_ppl": -0.06494140625, + "entropy_loss": -0.03564453125, + "epoch": 0.5488, + "grad_norm": 0.41931515070721886, + "k1_kl": 0.06494140625, + "k3_kl": 0.033203125, + "kimi_kl": 0.0810546875, + "learning_rate": 2.2559999999999998e-07, + "loss": 0.0014, + "ppl": 0.0142822265625, + "reward": 0.9936199188232422, + "reward_std": 0.0006662520463578403, + "rewards/perpo_ocr_edit_distance_reward": 0.9936200380325317, + "step": 2744, + "temperature": 0.9 + }, + { + "advantages": -1.5003341104602441e-05, + "completion_length": 783.0, + "delta_ref_entropy_loss": 0.038330078125, + "delta_ref_ppl": -0.046142578125, + "entropy_loss": -0.11181640625, + "epoch": 0.549, + "grad_norm": 3.6310557973125963, + "k1_kl": 0.046142578125, + "k3_kl": 0.05078125, + "kimi_kl": 0.06298828125, + "learning_rate": 2.255e-07, + "loss": 0.002, + "ppl": 0.0546875, + "reward": 0.983905553817749, + "reward_std": 0.003871154971420765, + "rewards/perpo_ocr_edit_distance_reward": 0.983905553817749, + "step": 2745, + "temperature": 0.9 + }, + { + "advantages": -0.00015997886657714844, + "completion_length": 817.0, + "delta_ref_entropy_loss": 0.0458984375, + "delta_ref_ppl": -0.054443359375, + "entropy_loss": -0.034423828125, + "epoch": 0.5492, + "grad_norm": 0.230002815419524, + "k1_kl": 0.054443359375, + "k3_kl": 0.031982421875, + "kimi_kl": 0.07763671875, + "learning_rate": 2.2539999999999997e-07, + "loss": 0.0014, + "ppl": 0.011962890625, + "reward": 0.9948501586914062, + "reward_std": 0.0002725039958022535, + "rewards/perpo_ocr_edit_distance_reward": 0.994850218296051, + "step": 2746, + "temperature": 0.9 + }, + { + "advantages": -2.339908132853452e-05, + "completion_length": 312.0, + "delta_ref_entropy_loss": 0.039794921875, + "delta_ref_ppl": -0.12353515625, + "entropy_loss": -0.044921875, + "epoch": 0.5494, + "grad_norm": 0.5090586479043622, + "k1_kl": 0.123046875, + "k3_kl": 0.09375, + "kimi_kl": 0.416015625, + "learning_rate": 2.2529999999999999e-07, + "loss": 0.0038, + "ppl": 0.014404296875, + "reward": 0.9984727501869202, + "reward_std": 0.0009917430579662323, + "rewards/perpo_ocr_edit_distance_reward": 0.9984728097915649, + "step": 2747, + "temperature": 0.9 + }, + { + "advantages": -1.4935221770429052e-05, + "completion_length": 419.0, + "delta_ref_entropy_loss": 0.054443359375, + "delta_ref_ppl": -0.12890625, + "entropy_loss": -0.049072265625, + "epoch": 0.5496, + "grad_norm": 0.5629390071113702, + "k1_kl": 0.12890625, + "k3_kl": 0.091796875, + "kimi_kl": 0.40234375, + "learning_rate": 2.252e-07, + "loss": 0.0037, + "ppl": 0.02197265625, + "reward": 0.9978346228599548, + "reward_std": 0.0004695835232269019, + "rewards/perpo_ocr_edit_distance_reward": 0.9978346824645996, + "step": 2748, + "temperature": 0.9 + }, + { + "advantages": -4.713876114692539e-05, + "completion_length": 910.0, + "delta_ref_entropy_loss": 0.0693359375, + "delta_ref_ppl": -0.08056640625, + "entropy_loss": -0.1298828125, + "epoch": 0.5498, + "grad_norm": 1.0632604068408826, + "k1_kl": 0.08056640625, + "k3_kl": 0.050537109375, + "kimi_kl": 0.1220703125, + "learning_rate": 2.251e-07, + "loss": 0.0021, + "ppl": 0.07177734375, + "reward": 0.971301257610321, + "reward_std": 0.0009834819938987494, + "rewards/perpo_ocr_edit_distance_reward": 0.971301257610321, + "step": 2749, + "temperature": 0.9 + }, + { + "advantages": 1.1750630619644653e-05, + "completion_length": 971.0, + "delta_ref_entropy_loss": 0.049560546875, + "delta_ref_ppl": -0.058349609375, + "entropy_loss": -0.030517578125, + "epoch": 0.55, + "grad_norm": 1.4199386874027016, + "k1_kl": 0.058349609375, + "k3_kl": 0.036376953125, + "kimi_kl": 0.1220703125, + "learning_rate": 2.25e-07, + "loss": 0.0014, + "ppl": 0.01116943359375, + "reward": 0.9951586127281189, + "reward_std": 0.0006250223377719522, + "rewards/perpo_ocr_edit_distance_reward": 0.9951585531234741, + "step": 2750, + "temperature": 0.9 + }, + { + "advantages": -0.0001055513130268082, + "completion_length": 879.0, + "delta_ref_entropy_loss": 0.039794921875, + "delta_ref_ppl": -0.042724609375, + "entropy_loss": -0.046875, + "epoch": 0.5502, + "grad_norm": 0.45719985859951723, + "k1_kl": 0.042724609375, + "k3_kl": 0.0230712890625, + "kimi_kl": 0.06201171875, + "learning_rate": 2.2489999999999998e-07, + "loss": 0.001, + "ppl": 0.0167236328125, + "reward": 0.9938431978225708, + "reward_std": 0.00046468799700960517, + "rewards/perpo_ocr_edit_distance_reward": 0.9938432574272156, + "step": 2751, + "temperature": 0.9 + }, + { + "advantages": -3.276552524766885e-05, + "completion_length": 690.0, + "delta_ref_entropy_loss": 0.0400390625, + "delta_ref_ppl": -0.080078125, + "entropy_loss": -0.07568359375, + "epoch": 0.5504, + "grad_norm": 0.7573122823204991, + "k1_kl": 0.080078125, + "k3_kl": 0.054931640625, + "kimi_kl": 0.16796875, + "learning_rate": 2.248e-07, + "loss": 0.0022, + "ppl": 0.033935546875, + "reward": 0.9008421301841736, + "reward_std": 0.0011999000562354922, + "rewards/perpo_ocr_edit_distance_reward": 0.9008421897888184, + "step": 2752, + "temperature": 0.9 + }, + { + "advantages": -4.879066182184033e-05, + "completion_length": 509.0, + "delta_ref_entropy_loss": 0.0537109375, + "delta_ref_ppl": -0.07568359375, + "entropy_loss": -0.059814453125, + "epoch": 0.5506, + "grad_norm": 0.5549668545118215, + "k1_kl": 0.07568359375, + "k3_kl": 0.045654296875, + "kimi_kl": 0.125, + "learning_rate": 2.247e-07, + "loss": 0.0019, + "ppl": 0.021240234375, + "reward": 0.8587985038757324, + "reward_std": 0.001296346657909453, + "rewards/perpo_ocr_edit_distance_reward": 0.8587985634803772, + "step": 2753, + "temperature": 0.9 + }, + { + "advantages": -7.182786066550761e-05, + "completion_length": 621.0, + "delta_ref_entropy_loss": 0.03125, + "delta_ref_ppl": -0.049560546875, + "entropy_loss": -0.03369140625, + "epoch": 0.5508, + "grad_norm": 0.9564188842861241, + "k1_kl": 0.0498046875, + "k3_kl": 0.030517578125, + "kimi_kl": 0.08447265625, + "learning_rate": 2.2459999999999999e-07, + "loss": 0.0013, + "ppl": 0.01422119140625, + "reward": 0.9978317022323608, + "reward_std": 0.0008482945268042386, + "rewards/perpo_ocr_edit_distance_reward": 0.9978318214416504, + "step": 2754, + "temperature": 0.9 + }, + { + "advantages": -6.376845703925937e-05, + "completion_length": 224.0, + "delta_ref_entropy_loss": 0.04052734375, + "delta_ref_ppl": -0.1572265625, + "entropy_loss": -0.0595703125, + "epoch": 0.551, + "grad_norm": 0.7537589283911685, + "k1_kl": 0.1572265625, + "k3_kl": 0.12255859375, + "kimi_kl": 0.5625, + "learning_rate": 2.245e-07, + "loss": 0.005, + "ppl": 0.0224609375, + "reward": 0.8348894715309143, + "reward_std": 0.0012351891491562128, + "rewards/perpo_ocr_edit_distance_reward": 0.8348895907402039, + "step": 2755, + "temperature": 0.9 + }, + { + "advantages": -1.743861685099546e-05, + "completion_length": 51.0, + "delta_ref_entropy_loss": 0.1845703125, + "delta_ref_ppl": -0.828125, + "entropy_loss": -0.234375, + "epoch": 0.5512, + "grad_norm": 3.6397984435619435, + "k1_kl": 0.83203125, + "k3_kl": 0.6640625, + "kimi_kl": 2.84375, + "learning_rate": 2.2439999999999997e-07, + "loss": 0.0267, + "ppl": 0.09375, + "reward": 0.670843780040741, + "reward_std": 0.003315465059131384, + "rewards/perpo_ocr_edit_distance_reward": 0.6708438396453857, + "step": 2756, + "temperature": 0.9 + }, + { + "advantages": -9.332385161542334e-06, + "completion_length": 108.0, + "delta_ref_entropy_loss": -0.046630859375, + "delta_ref_ppl": -0.419921875, + "entropy_loss": -0.1689453125, + "epoch": 0.5514, + "grad_norm": 2.5108325947019505, + "k1_kl": 0.419921875, + "k3_kl": 0.359375, + "kimi_kl": 2.140625, + "learning_rate": 2.243e-07, + "loss": 0.0144, + "ppl": 0.0517578125, + "reward": 0.6123169660568237, + "reward_std": 0.0017241883324459195, + "rewards/perpo_ocr_edit_distance_reward": 0.6123170256614685, + "step": 2757, + "temperature": 0.9 + }, + { + "advantages": -1.549720877846994e-06, + "completion_length": 930.0, + "delta_ref_entropy_loss": 0.034912109375, + "delta_ref_ppl": -0.043701171875, + "entropy_loss": -0.056396484375, + "epoch": 0.5516, + "grad_norm": 1.1145552301652002, + "k1_kl": 0.043701171875, + "k3_kl": 0.0213623046875, + "kimi_kl": 0.059814453125, + "learning_rate": 2.242e-07, + "loss": 0.0009, + "ppl": 0.0159912109375, + "reward": 0.9864864349365234, + "reward_std": 0.005424762610346079, + "rewards/perpo_ocr_edit_distance_reward": 0.9864864945411682, + "step": 2758, + "temperature": 0.9 + }, + { + "advantages": -9.366444686520481e-08, + "completion_length": 1066.0, + "delta_ref_entropy_loss": 0.055419921875, + "delta_ref_ppl": -0.11181640625, + "entropy_loss": -0.6328125, + "epoch": 0.5518, + "grad_norm": 2.9056366217773997, + "k1_kl": 0.11328125, + "k3_kl": 0.09375, + "kimi_kl": 0.171875, + "learning_rate": 2.2409999999999998e-07, + "loss": 0.0037, + "ppl": 0.333984375, + "reward": 0.11450406908988953, + "reward_std": 0.010480486787855625, + "rewards/perpo_ocr_edit_distance_reward": 0.11450408399105072, + "step": 2759, + "temperature": 0.9 + }, + { + "advantages": -5.522796345758252e-05, + "completion_length": 880.0, + "delta_ref_entropy_loss": 0.03076171875, + "delta_ref_ppl": -0.064453125, + "entropy_loss": -0.06591796875, + "epoch": 0.552, + "grad_norm": 0.5781006413250788, + "k1_kl": 0.064453125, + "k3_kl": 0.04443359375, + "kimi_kl": 0.162109375, + "learning_rate": 2.24e-07, + "loss": 0.0018, + "ppl": 0.0284423828125, + "reward": 0.9928634762763977, + "reward_std": 0.000670631299726665, + "rewards/perpo_ocr_edit_distance_reward": 0.9928634762763977, + "step": 2760, + "temperature": 0.9 + }, + { + "advantages": 1.4356204701471142e-05, + "completion_length": 810.0, + "delta_ref_entropy_loss": 0.0625, + "delta_ref_ppl": -0.07470703125, + "entropy_loss": -0.06787109375, + "epoch": 0.5522, + "grad_norm": 0.636678278118772, + "k1_kl": 0.07470703125, + "k3_kl": 0.047119140625, + "kimi_kl": 0.140625, + "learning_rate": 2.2389999999999999e-07, + "loss": 0.0019, + "ppl": 0.031982421875, + "reward": 0.9438689947128296, + "reward_std": 0.0010850763646885753, + "rewards/perpo_ocr_edit_distance_reward": 0.9438689351081848, + "step": 2761, + "temperature": 0.9 + }, + { + "advantages": -1.8647739125299267e-05, + "completion_length": 160.0, + "delta_ref_entropy_loss": 0.055908203125, + "delta_ref_ppl": -0.1474609375, + "entropy_loss": -0.134765625, + "epoch": 0.5524, + "grad_norm": 1.4037354417289818, + "k1_kl": 0.1484375, + "k3_kl": 0.1083984375, + "kimi_kl": 0.41015625, + "learning_rate": 2.2379999999999998e-07, + "loss": 0.0044, + "ppl": 0.05615234375, + "reward": 0.8034905791282654, + "reward_std": 0.004010892007499933, + "rewards/perpo_ocr_edit_distance_reward": 0.8034906983375549, + "step": 2762, + "temperature": 0.9 + }, + { + "advantages": -9.626150131225586e-06, + "completion_length": 598.0, + "delta_ref_entropy_loss": 0.04638671875, + "delta_ref_ppl": -0.08642578125, + "entropy_loss": -0.07568359375, + "epoch": 0.5526, + "grad_norm": 1.3010814260390249, + "k1_kl": 0.08642578125, + "k3_kl": 0.0625, + "kimi_kl": 0.15625, + "learning_rate": 2.237e-07, + "loss": 0.0025, + "ppl": 0.057373046875, + "reward": 0.9731400012969971, + "reward_std": 0.003433287376537919, + "rewards/perpo_ocr_edit_distance_reward": 0.9731400012969971, + "step": 2763, + "temperature": 0.9 + }, + { + "advantages": -8.9066370492219e-06, + "completion_length": 447.0, + "delta_ref_entropy_loss": 0.0537109375, + "delta_ref_ppl": -0.0927734375, + "entropy_loss": -0.06396484375, + "epoch": 0.5528, + "grad_norm": 0.6596837720786997, + "k1_kl": 0.09326171875, + "k3_kl": 0.06396484375, + "kimi_kl": 0.1923828125, + "learning_rate": 2.236e-07, + "loss": 0.0026, + "ppl": 0.0341796875, + "reward": 0.9903638362884521, + "reward_std": 0.001811482710763812, + "rewards/perpo_ocr_edit_distance_reward": 0.9903638362884521, + "step": 2764, + "temperature": 0.9 + }, + { + "advantages": -7.660048868274316e-05, + "completion_length": 634.0, + "delta_ref_entropy_loss": 0.0252685546875, + "delta_ref_ppl": -0.04833984375, + "entropy_loss": -0.033203125, + "epoch": 0.553, + "grad_norm": 0.5007174047058621, + "k1_kl": 0.04833984375, + "k3_kl": 0.033203125, + "kimi_kl": 0.10595703125, + "learning_rate": 2.2349999999999998e-07, + "loss": 0.0014, + "ppl": 0.01318359375, + "reward": 0.9934477806091309, + "reward_std": 0.0006781623815186322, + "rewards/perpo_ocr_edit_distance_reward": 0.9934478998184204, + "step": 2765, + "temperature": 0.9 + }, + { + "advantages": -1.641682320041582e-05, + "completion_length": 317.0, + "delta_ref_entropy_loss": 0.040283203125, + "delta_ref_ppl": -0.10205078125, + "entropy_loss": -0.061767578125, + "epoch": 0.5532, + "grad_norm": 1.0699872378981095, + "k1_kl": 0.10205078125, + "k3_kl": 0.07568359375, + "kimi_kl": 0.2890625, + "learning_rate": 2.2339999999999998e-07, + "loss": 0.003, + "ppl": 0.0234375, + "reward": 0.9901771545410156, + "reward_std": 0.0035238543059676886, + "rewards/perpo_ocr_edit_distance_reward": 0.9901772141456604, + "step": 2766, + "temperature": 0.9 + }, + { + "advantages": -0.00021093233954161406, + "completion_length": 943.0, + "delta_ref_entropy_loss": 0.025390625, + "delta_ref_ppl": -0.0263671875, + "entropy_loss": -0.0277099609375, + "epoch": 0.5534, + "grad_norm": 0.2131149058618356, + "k1_kl": 0.0263671875, + "k3_kl": 0.0133056640625, + "kimi_kl": 0.038818359375, + "learning_rate": 2.233e-07, + "loss": 0.0007, + "ppl": 0.0089111328125, + "reward": 0.988230288028717, + "reward_std": 0.0003036419511772692, + "rewards/perpo_ocr_edit_distance_reward": 0.9882304668426514, + "step": 2767, + "temperature": 0.9 + }, + { + "advantages": -3.445148468017578e-05, + "completion_length": 842.0, + "delta_ref_entropy_loss": 0.059326171875, + "delta_ref_ppl": -0.07275390625, + "entropy_loss": -0.068359375, + "epoch": 0.5536, + "grad_norm": 0.4495256200602088, + "k1_kl": 0.072265625, + "k3_kl": 0.040283203125, + "kimi_kl": 0.0966796875, + "learning_rate": 2.232e-07, + "loss": 0.0016, + "ppl": 0.022705078125, + "reward": 0.9856498837471008, + "reward_std": 0.0008886854629963636, + "rewards/perpo_ocr_edit_distance_reward": 0.9856499433517456, + "step": 2768, + "temperature": 0.9 + }, + { + "advantages": -3.849608765449375e-05, + "completion_length": 474.0, + "delta_ref_entropy_loss": 0.060791015625, + "delta_ref_ppl": -0.0810546875, + "entropy_loss": -0.048095703125, + "epoch": 0.5538, + "grad_norm": 0.6316793370162639, + "k1_kl": 0.0810546875, + "k3_kl": 0.051025390625, + "kimi_kl": 0.173828125, + "learning_rate": 2.2309999999999998e-07, + "loss": 0.0021, + "ppl": 0.0164794921875, + "reward": 0.9933519959449768, + "reward_std": 0.001006001839414239, + "rewards/perpo_ocr_edit_distance_reward": 0.9933519959449768, + "step": 2769, + "temperature": 0.9 + }, + { + "advantages": -3.142868081340566e-05, + "completion_length": 623.0, + "delta_ref_entropy_loss": 0.03857421875, + "delta_ref_ppl": -0.062255859375, + "entropy_loss": -0.057861328125, + "epoch": 0.554, + "grad_norm": 0.8390752508453784, + "k1_kl": 0.062255859375, + "k3_kl": 0.040771484375, + "kimi_kl": 0.11865234375, + "learning_rate": 2.23e-07, + "loss": 0.0017, + "ppl": 0.0234375, + "reward": 0.9844734072685242, + "reward_std": 0.0020687272772192955, + "rewards/perpo_ocr_edit_distance_reward": 0.9844734072685242, + "step": 2770, + "temperature": 0.9 + }, + { + "advantages": -1.8221992377220886e-06, + "completion_length": 1323.0, + "delta_ref_entropy_loss": 0.04150390625, + "delta_ref_ppl": -0.04736328125, + "entropy_loss": -0.058837890625, + "epoch": 0.5542, + "grad_norm": 0.9054272608623755, + "k1_kl": 0.04736328125, + "k3_kl": 0.0245361328125, + "kimi_kl": 0.056640625, + "learning_rate": 2.2289999999999996e-07, + "loss": 0.001, + "ppl": 0.0194091796875, + "reward": 0.9538285732269287, + "reward_std": 0.018632177263498306, + "rewards/perpo_ocr_edit_distance_reward": 0.9538285732269287, + "step": 2771, + "temperature": 0.9 + }, + { + "advantages": -3.950936661567539e-05, + "completion_length": 313.0, + "delta_ref_entropy_loss": 0.0361328125, + "delta_ref_ppl": -0.11328125, + "entropy_loss": -0.039794921875, + "epoch": 0.5544, + "grad_norm": 0.5285705049014758, + "k1_kl": 0.11376953125, + "k3_kl": 0.08203125, + "kimi_kl": 0.279296875, + "learning_rate": 2.2279999999999998e-07, + "loss": 0.0033, + "ppl": 0.0152587890625, + "reward": 0.9949527382850647, + "reward_std": 0.0009772751946002245, + "rewards/perpo_ocr_edit_distance_reward": 0.9949528574943542, + "step": 2772, + "temperature": 0.9 + }, + { + "advantages": 3.048352027690271e-06, + "completion_length": 569.0, + "delta_ref_entropy_loss": 0.07763671875, + "delta_ref_ppl": -0.08740234375, + "entropy_loss": -0.140625, + "epoch": 0.5546, + "grad_norm": 1.329723792166207, + "k1_kl": 0.08740234375, + "k3_kl": 0.0537109375, + "kimi_kl": 0.134765625, + "learning_rate": 2.227e-07, + "loss": 0.0021, + "ppl": 0.0537109375, + "reward": 0.8232043385505676, + "reward_std": 0.002704764250665903, + "rewards/perpo_ocr_edit_distance_reward": 0.8232043981552124, + "step": 2773, + "temperature": 0.9 + }, + { + "advantages": -4.594666825141758e-05, + "completion_length": 427.0, + "delta_ref_entropy_loss": 0.0517578125, + "delta_ref_ppl": -0.05126953125, + "entropy_loss": -0.05810546875, + "epoch": 0.5548, + "grad_norm": 0.8059403267583316, + "k1_kl": 0.05126953125, + "k3_kl": 0.0281982421875, + "kimi_kl": 0.06982421875, + "learning_rate": 2.226e-07, + "loss": 0.0012, + "ppl": 0.0283203125, + "reward": 0.9903928637504578, + "reward_std": 0.0008266167133115232, + "rewards/perpo_ocr_edit_distance_reward": 0.9903929233551025, + "step": 2774, + "temperature": 0.9 + }, + { + "advantages": -8.531979256076738e-05, + "completion_length": 603.0, + "delta_ref_entropy_loss": 0.036865234375, + "delta_ref_ppl": -0.06787109375, + "entropy_loss": -0.03857421875, + "epoch": 0.555, + "grad_norm": 0.4762104270354902, + "k1_kl": 0.06787109375, + "k3_kl": 0.049072265625, + "kimi_kl": 0.162109375, + "learning_rate": 2.225e-07, + "loss": 0.002, + "ppl": 0.015625, + "reward": 0.9700717926025391, + "reward_std": 0.0007980987429618835, + "rewards/perpo_ocr_edit_distance_reward": 0.9700719118118286, + "step": 2775, + "temperature": 0.9 + }, + { + "advantages": -6.709780336677795e-06, + "completion_length": 1034.0, + "delta_ref_entropy_loss": 0.03125, + "delta_ref_ppl": -0.04443359375, + "entropy_loss": -0.0380859375, + "epoch": 0.5552, + "grad_norm": 0.6057168103519528, + "k1_kl": 0.044677734375, + "k3_kl": 0.0289306640625, + "kimi_kl": 0.0830078125, + "learning_rate": 2.2239999999999998e-07, + "loss": 0.0012, + "ppl": 0.013916015625, + "reward": 0.9934308528900146, + "reward_std": 0.0011706353398039937, + "rewards/perpo_ocr_edit_distance_reward": 0.9934308528900146, + "step": 2776, + "temperature": 0.9 + }, + { + "advantages": -9.59634780883789e-06, + "completion_length": 749.0, + "delta_ref_entropy_loss": 0.103515625, + "delta_ref_ppl": -0.09521484375, + "entropy_loss": -0.205078125, + "epoch": 0.5554, + "grad_norm": 1.634620035681359, + "k1_kl": 0.09521484375, + "k3_kl": 0.056884765625, + "kimi_kl": 0.1435546875, + "learning_rate": 2.223e-07, + "loss": 0.0023, + "ppl": 0.09716796875, + "reward": 0.9085283875465393, + "reward_std": 0.0061188144609332085, + "rewards/perpo_ocr_edit_distance_reward": 0.9085284471511841, + "step": 2777, + "temperature": 0.9 + }, + { + "advantages": -4.4550215534400195e-05, + "completion_length": 633.0, + "delta_ref_entropy_loss": 0.0283203125, + "delta_ref_ppl": -0.06298828125, + "entropy_loss": -0.041748046875, + "epoch": 0.5556, + "grad_norm": 0.5097353259546171, + "k1_kl": 0.06298828125, + "k3_kl": 0.04052734375, + "kimi_kl": 0.11767578125, + "learning_rate": 2.222e-07, + "loss": 0.0017, + "ppl": 0.01519775390625, + "reward": 0.998425304889679, + "reward_std": 0.0008556324173696339, + "rewards/perpo_ocr_edit_distance_reward": 0.9984253644943237, + "step": 2778, + "temperature": 0.9 + }, + { + "advantages": -0.0001720083673717454, + "completion_length": 1057.0, + "delta_ref_entropy_loss": 0.0311279296875, + "delta_ref_ppl": -0.030517578125, + "entropy_loss": -0.0220947265625, + "epoch": 0.5558, + "grad_norm": 0.2311230790273426, + "k1_kl": 0.0303955078125, + "k3_kl": 0.0159912109375, + "kimi_kl": 0.04150390625, + "learning_rate": 2.2209999999999998e-07, + "loss": 0.0008, + "ppl": 0.0057373046875, + "reward": 0.9990797638893127, + "reward_std": 0.0001475296594435349, + "rewards/perpo_ocr_edit_distance_reward": 0.9990798234939575, + "step": 2779, + "temperature": 0.9 + }, + { + "advantages": -3.358296089572832e-05, + "completion_length": 328.0, + "delta_ref_entropy_loss": 0.056396484375, + "delta_ref_ppl": -0.09619140625, + "entropy_loss": -0.060302734375, + "epoch": 0.556, + "grad_norm": 0.5017219382380744, + "k1_kl": 0.09619140625, + "k3_kl": 0.068359375, + "kimi_kl": 0.236328125, + "learning_rate": 2.22e-07, + "loss": 0.0028, + "ppl": 0.0225830078125, + "reward": 0.9934602379798889, + "reward_std": 0.0014214442344382405, + "rewards/perpo_ocr_edit_distance_reward": 0.9934602975845337, + "step": 2780, + "temperature": 0.9 + }, + { + "advantages": 3.6784581425308716e-06, + "completion_length": 1750.0, + "delta_ref_entropy_loss": 0.01055908203125, + "delta_ref_ppl": -0.0223388671875, + "entropy_loss": -0.041015625, + "epoch": 0.5562, + "grad_norm": 2.900637639963258, + "k1_kl": 0.022216796875, + "k3_kl": 0.024658203125, + "kimi_kl": 0.04638671875, + "learning_rate": 2.2189999999999997e-07, + "loss": 0.001, + "ppl": 0.0203857421875, + "reward": 0.9844467639923096, + "reward_std": 0.002208116464316845, + "rewards/perpo_ocr_edit_distance_reward": 0.9844468235969543, + "step": 2781, + "temperature": 0.9 + }, + { + "advantages": 9.758132364368066e-06, + "completion_length": 255.0, + "delta_ref_entropy_loss": 0.07958984375, + "delta_ref_ppl": -0.1298828125, + "entropy_loss": -0.06494140625, + "epoch": 0.5564, + "grad_norm": 1.5940181960532145, + "k1_kl": 0.130859375, + "k3_kl": 0.09130859375, + "kimi_kl": 0.314453125, + "learning_rate": 2.218e-07, + "loss": 0.0036, + "ppl": 0.0225830078125, + "reward": 0.9939867854118347, + "reward_std": 0.002520482987165451, + "rewards/perpo_ocr_edit_distance_reward": 0.9939868450164795, + "step": 2782, + "temperature": 0.9 + }, + { + "advantages": -2.868686533474829e-05, + "completion_length": 178.0, + "delta_ref_entropy_loss": 0.0120849609375, + "delta_ref_ppl": -0.22265625, + "entropy_loss": -0.08251953125, + "epoch": 0.5566, + "grad_norm": 1.163782581548069, + "k1_kl": 0.2216796875, + "k3_kl": 0.1767578125, + "kimi_kl": 0.7734375, + "learning_rate": 2.217e-07, + "loss": 0.0071, + "ppl": 0.03125, + "reward": 0.5543088912963867, + "reward_std": 0.0007901937933638692, + "rewards/perpo_ocr_edit_distance_reward": 0.5543088912963867, + "step": 2783, + "temperature": 0.9 + }, + { + "advantages": -1.3534512618207373e-05, + "completion_length": 315.0, + "delta_ref_entropy_loss": 0.06494140625, + "delta_ref_ppl": -0.130859375, + "entropy_loss": -0.1357421875, + "epoch": 0.5568, + "grad_norm": 1.8576195870024772, + "k1_kl": 0.1298828125, + "k3_kl": 0.09375, + "kimi_kl": 0.33984375, + "learning_rate": 2.2159999999999997e-07, + "loss": 0.0038, + "ppl": 0.048828125, + "reward": 0.9634981751441956, + "reward_std": 0.0049348315224051476, + "rewards/perpo_ocr_edit_distance_reward": 0.9634982943534851, + "step": 2784, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 758.0, + "delta_ref_entropy_loss": 0.0791015625, + "delta_ref_ppl": -0.10400390625, + "entropy_loss": -0.2470703125, + "epoch": 0.557, + "grad_norm": 17.905150692950713, + "k1_kl": 0.103515625, + "k3_kl": 0.158203125, + "kimi_kl": 0.244140625, + "learning_rate": 2.215e-07, + "loss": 0.0063, + "ppl": 0.1259765625, + "reward": 0.7060174345970154, + "reward_std": 0.009537069126963615, + "rewards/perpo_ocr_edit_distance_reward": 0.7060174345970154, + "step": 2785, + "temperature": 0.9 + }, + { + "advantages": -3.705705967149697e-05, + "completion_length": 921.0, + "delta_ref_entropy_loss": 0.035400390625, + "delta_ref_ppl": -0.04736328125, + "entropy_loss": -0.058837890625, + "epoch": 0.5572, + "grad_norm": 0.8094322268142173, + "k1_kl": 0.04736328125, + "k3_kl": 0.0274658203125, + "kimi_kl": 0.0771484375, + "learning_rate": 2.214e-07, + "loss": 0.0011, + "ppl": 0.0308837890625, + "reward": 0.992995023727417, + "reward_std": 0.0008190517546609044, + "rewards/perpo_ocr_edit_distance_reward": 0.9929950833320618, + "step": 2786, + "temperature": 0.9 + }, + { + "advantages": -1.025199981086189e-05, + "completion_length": 315.0, + "delta_ref_entropy_loss": 0.03369140625, + "delta_ref_ppl": -0.09033203125, + "entropy_loss": -0.0751953125, + "epoch": 0.5574, + "grad_norm": 1.058835039763632, + "k1_kl": 0.09033203125, + "k3_kl": 0.06640625, + "kimi_kl": 0.298828125, + "learning_rate": 2.2129999999999998e-07, + "loss": 0.0027, + "ppl": 0.034423828125, + "reward": 0.9935714602470398, + "reward_std": 0.0023918317165225744, + "rewards/perpo_ocr_edit_distance_reward": 0.9935714602470398, + "step": 2787, + "temperature": 0.9 + }, + { + "advantages": -1.8690314391278662e-05, + "completion_length": 314.0, + "delta_ref_entropy_loss": 0.041748046875, + "delta_ref_ppl": -0.1455078125, + "entropy_loss": -0.0810546875, + "epoch": 0.5576, + "grad_norm": 1.4594525329769552, + "k1_kl": 0.1455078125, + "k3_kl": 0.10693359375, + "kimi_kl": 0.41796875, + "learning_rate": 2.212e-07, + "loss": 0.0043, + "ppl": 0.0299072265625, + "reward": 0.9622757434844971, + "reward_std": 0.001721139531582594, + "rewards/perpo_ocr_edit_distance_reward": 0.9622757434844971, + "step": 2788, + "temperature": 0.9 + }, + { + "advantages": -2.9870443540858105e-05, + "completion_length": 939.0, + "delta_ref_entropy_loss": 0.043212890625, + "delta_ref_ppl": -0.045166015625, + "entropy_loss": -0.04541015625, + "epoch": 0.5578, + "grad_norm": 9.940380662648545, + "k1_kl": 0.045166015625, + "k3_kl": 0.0267333984375, + "kimi_kl": 0.07080078125, + "learning_rate": 2.211e-07, + "loss": 0.0011, + "ppl": 0.029541015625, + "reward": 0.9957225918769836, + "reward_std": 0.0010403362102806568, + "rewards/perpo_ocr_edit_distance_reward": 0.9957226514816284, + "step": 2789, + "temperature": 0.9 + }, + { + "advantages": -6.8289896262285765e-06, + "completion_length": 825.0, + "delta_ref_entropy_loss": 0.03564453125, + "delta_ref_ppl": -0.0517578125, + "entropy_loss": -0.060302734375, + "epoch": 0.558, + "grad_norm": 2.0073116835310074, + "k1_kl": 0.052001953125, + "k3_kl": 0.057861328125, + "kimi_kl": 0.1259765625, + "learning_rate": 2.2099999999999998e-07, + "loss": 0.0023, + "ppl": 0.0245361328125, + "reward": 0.9806743264198303, + "reward_std": 0.003646229626610875, + "rewards/perpo_ocr_edit_distance_reward": 0.9806743860244751, + "step": 2790, + "temperature": 0.9 + }, + { + "advantages": -1.934596548380796e-05, + "completion_length": 144.0, + "delta_ref_entropy_loss": 0.035888671875, + "delta_ref_ppl": -0.1728515625, + "entropy_loss": -0.0830078125, + "epoch": 0.5582, + "grad_norm": 0.8857618944632722, + "k1_kl": 0.171875, + "k3_kl": 0.140625, + "kimi_kl": 0.56640625, + "learning_rate": 2.209e-07, + "loss": 0.0056, + "ppl": 0.034423828125, + "reward": 0.9939769506454468, + "reward_std": 0.0021016753744333982, + "rewards/perpo_ocr_edit_distance_reward": 0.9939770698547363, + "step": 2791, + "temperature": 0.9 + }, + { + "advantages": -2.2241049009608105e-05, + "completion_length": 447.0, + "delta_ref_entropy_loss": 0.0213623046875, + "delta_ref_ppl": -0.046142578125, + "entropy_loss": -0.03173828125, + "epoch": 0.5584, + "grad_norm": 0.8906744212521204, + "k1_kl": 0.046142578125, + "k3_kl": 0.034912109375, + "kimi_kl": 0.103515625, + "learning_rate": 2.208e-07, + "loss": 0.0014, + "ppl": 0.0111083984375, + "reward": 0.9954978823661804, + "reward_std": 0.0010467435931786895, + "rewards/perpo_ocr_edit_distance_reward": 0.9954979419708252, + "step": 2792, + "temperature": 0.9 + }, + { + "advantages": -6.397281686076894e-05, + "completion_length": 584.0, + "delta_ref_entropy_loss": 0.060302734375, + "delta_ref_ppl": -0.06982421875, + "entropy_loss": -0.07568359375, + "epoch": 0.5586, + "grad_norm": 0.6292811475382564, + "k1_kl": 0.0703125, + "k3_kl": 0.04443359375, + "kimi_kl": 0.11669921875, + "learning_rate": 2.2069999999999998e-07, + "loss": 0.0018, + "ppl": 0.0341796875, + "reward": 0.9812889695167542, + "reward_std": 0.0008316023158840835, + "rewards/perpo_ocr_edit_distance_reward": 0.9812890291213989, + "step": 2793, + "temperature": 0.9 + }, + { + "advantages": 3.956045475206338e-05, + "completion_length": 658.0, + "delta_ref_entropy_loss": 0.032958984375, + "delta_ref_ppl": -0.041748046875, + "entropy_loss": -0.03271484375, + "epoch": 0.5588, + "grad_norm": 0.3648855366695302, + "k1_kl": 0.041748046875, + "k3_kl": 0.0240478515625, + "kimi_kl": 0.060546875, + "learning_rate": 2.2059999999999998e-07, + "loss": 0.0009, + "ppl": 0.01165771484375, + "reward": 0.9966222047805786, + "reward_std": 0.00033073415397666395, + "rewards/perpo_ocr_edit_distance_reward": 0.9966222643852234, + "step": 2794, + "temperature": 0.9 + }, + { + "advantages": -5.5875098041724414e-05, + "completion_length": 702.0, + "delta_ref_entropy_loss": 0.038330078125, + "delta_ref_ppl": -0.0537109375, + "entropy_loss": -0.0478515625, + "epoch": 0.559, + "grad_norm": 0.5017272728533918, + "k1_kl": 0.053955078125, + "k3_kl": 0.0322265625, + "kimi_kl": 0.0859375, + "learning_rate": 2.205e-07, + "loss": 0.0013, + "ppl": 0.019287109375, + "reward": 0.9951311945915222, + "reward_std": 0.0009664517128840089, + "rewards/perpo_ocr_edit_distance_reward": 0.995131254196167, + "step": 2795, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 637.0, + "delta_ref_entropy_loss": 0.01324462890625, + "delta_ref_ppl": -0.060302734375, + "entropy_loss": -0.1357421875, + "epoch": 0.5592, + "grad_norm": 1.5776612257560003, + "k1_kl": 0.06005859375, + "k3_kl": 0.0419921875, + "kimi_kl": 0.1357421875, + "learning_rate": 2.2040000000000001e-07, + "loss": 0.0017, + "ppl": 0.06640625, + "reward": 0.9234378337860107, + "reward_std": 0.19661273062229156, + "rewards/perpo_ocr_edit_distance_reward": 0.9234378933906555, + "step": 2796, + "temperature": 0.9 + }, + { + "advantages": -1.0456357813382056e-05, + "completion_length": 2048.0, + "delta_ref_entropy_loss": -0.00921630859375, + "delta_ref_ppl": -0.034423828125, + "entropy_loss": -0.263671875, + "epoch": 0.5594, + "grad_norm": 10.092520354751546, + "k1_kl": 0.034423828125, + "k3_kl": 0.08935546875, + "kimi_kl": 0.08349609375, + "learning_rate": 2.2029999999999998e-07, + "loss": 0.0036, + "ppl": 0.16015625, + "reward": 0.5789369344711304, + "reward_std": 0.004779270384460688, + "rewards/perpo_ocr_edit_distance_reward": 0.5789369940757751, + "step": 2797, + "temperature": 0.9 + }, + { + "advantages": -0.0001189027534564957, + "completion_length": 1022.0, + "delta_ref_entropy_loss": 0.0218505859375, + "delta_ref_ppl": -0.0224609375, + "entropy_loss": -0.0380859375, + "epoch": 0.5596, + "grad_norm": 0.2702392831208538, + "k1_kl": 0.0224609375, + "k3_kl": 0.01171875, + "kimi_kl": 0.017822265625, + "learning_rate": 2.202e-07, + "loss": 0.0006, + "ppl": 0.0147705078125, + "reward": 0.9974207878112793, + "reward_std": 0.00025804684264585376, + "rewards/perpo_ocr_edit_distance_reward": 0.9974208474159241, + "step": 2798, + "temperature": 0.9 + }, + { + "advantages": -1.78132740984438e-05, + "completion_length": 982.0, + "delta_ref_entropy_loss": 0.040771484375, + "delta_ref_ppl": -0.04296875, + "entropy_loss": -0.06982421875, + "epoch": 0.5598, + "grad_norm": 8.548765065810327, + "k1_kl": 0.04296875, + "k3_kl": 0.047119140625, + "kimi_kl": 0.049072265625, + "learning_rate": 2.201e-07, + "loss": 0.0019, + "ppl": 0.036376953125, + "reward": 0.9797239899635315, + "reward_std": 0.0008567723562009633, + "rewards/perpo_ocr_edit_distance_reward": 0.9797239899635315, + "step": 2799, + "temperature": 0.9 + }, + { + "advantages": -5.815710665046936e-06, + "completion_length": 667.0, + "delta_ref_entropy_loss": 0.03466796875, + "delta_ref_ppl": -0.09716796875, + "entropy_loss": -0.4765625, + "epoch": 0.56, + "grad_norm": 4.983373042481743, + "k1_kl": 0.0966796875, + "k3_kl": 0.07080078125, + "kimi_kl": 0.142578125, + "learning_rate": 2.1999999999999998e-07, + "loss": 0.0028, + "ppl": 0.2431640625, + "reward": 0.6296383738517761, + "reward_std": 0.004284490365535021, + "rewards/perpo_ocr_edit_distance_reward": 0.6296384334564209, + "step": 2800, + "temperature": 0.9 + }, + { + "advantages": -0.00014202084275893867, + "completion_length": 436.0, + "delta_ref_entropy_loss": 0.03466796875, + "delta_ref_ppl": -0.05078125, + "entropy_loss": -0.0634765625, + "epoch": 0.5602, + "grad_norm": 0.3318039777953965, + "k1_kl": 0.05078125, + "k3_kl": 0.03271484375, + "kimi_kl": 0.09033203125, + "learning_rate": 2.199e-07, + "loss": 0.0015, + "ppl": 0.023193359375, + "reward": 0.988296627998352, + "reward_std": 0.0001997690269490704, + "rewards/perpo_ocr_edit_distance_reward": 0.988296627998352, + "step": 2801, + "temperature": 0.9 + }, + { + "advantages": -5.790165573671402e-07, + "completion_length": 678.0, + "delta_ref_entropy_loss": -0.125, + "delta_ref_ppl": -0.043212890625, + "entropy_loss": -0.578125, + "epoch": 0.5604, + "grad_norm": 5.04614606443708, + "k1_kl": 0.043701171875, + "k3_kl": 0.056884765625, + "kimi_kl": 0.1044921875, + "learning_rate": 2.198e-07, + "loss": 0.0023, + "ppl": 0.302734375, + "reward": 0.8481200933456421, + "reward_std": 0.12918925285339355, + "rewards/perpo_ocr_edit_distance_reward": 0.8481201529502869, + "step": 2802, + "temperature": 0.9 + }, + { + "advantages": -4.64235054096207e-05, + "completion_length": 495.0, + "delta_ref_entropy_loss": 0.0693359375, + "delta_ref_ppl": -0.09619140625, + "entropy_loss": -0.07470703125, + "epoch": 0.5606, + "grad_norm": 1.0493931434637205, + "k1_kl": 0.09619140625, + "k3_kl": 0.06201171875, + "kimi_kl": 0.1982421875, + "learning_rate": 2.197e-07, + "loss": 0.0025, + "ppl": 0.030517578125, + "reward": 0.963742196559906, + "reward_std": 0.0011832480086013675, + "rewards/perpo_ocr_edit_distance_reward": 0.9637422561645508, + "step": 2803, + "temperature": 0.9 + }, + { + "advantages": -3.150531426854286e-07, + "completion_length": 70.0, + "delta_ref_entropy_loss": -0.1904296875, + "delta_ref_ppl": -0.6171875, + "entropy_loss": -1.234375, + "epoch": 0.5608, + "grad_norm": 14.082277995009061, + "k1_kl": 0.6171875, + "k3_kl": 0.53125, + "kimi_kl": 2.375, + "learning_rate": 2.1959999999999998e-07, + "loss": 0.0213, + "ppl": 0.55078125, + "reward": 0.5584415197372437, + "reward_std": 0.09675107151269913, + "rewards/perpo_ocr_edit_distance_reward": 0.5584415793418884, + "step": 2804, + "temperature": 0.9 + }, + { + "advantages": -1.8596649169921875e-05, + "completion_length": 342.0, + "delta_ref_entropy_loss": 0.051025390625, + "delta_ref_ppl": -0.1083984375, + "entropy_loss": -0.07080078125, + "epoch": 0.561, + "grad_norm": 1.112248048694371, + "k1_kl": 0.10791015625, + "k3_kl": 0.09765625, + "kimi_kl": 0.35546875, + "learning_rate": 2.195e-07, + "loss": 0.0039, + "ppl": 0.0281982421875, + "reward": 0.9811952114105225, + "reward_std": 0.0008150461362674832, + "rewards/perpo_ocr_edit_distance_reward": 0.9811952114105225, + "step": 2805, + "temperature": 0.9 + }, + { + "advantages": -0.00011284011270618066, + "completion_length": 695.0, + "delta_ref_entropy_loss": 0.0263671875, + "delta_ref_ppl": -0.03515625, + "entropy_loss": -0.03466796875, + "epoch": 0.5612, + "grad_norm": 0.5622734803287185, + "k1_kl": 0.03515625, + "k3_kl": 0.02197265625, + "kimi_kl": 0.0537109375, + "learning_rate": 2.194e-07, + "loss": 0.001, + "ppl": 0.01544189453125, + "reward": 0.9965795278549194, + "reward_std": 0.0007301876903511584, + "rewards/perpo_ocr_edit_distance_reward": 0.996579647064209, + "step": 2806, + "temperature": 0.9 + }, + { + "advantages": -6.049020157661289e-05, + "completion_length": 459.0, + "delta_ref_entropy_loss": 0.064453125, + "delta_ref_ppl": -0.06787109375, + "entropy_loss": -0.06396484375, + "epoch": 0.5614, + "grad_norm": 0.6321695886050278, + "k1_kl": 0.06787109375, + "k3_kl": 0.0419921875, + "kimi_kl": 0.1318359375, + "learning_rate": 2.1929999999999999e-07, + "loss": 0.0017, + "ppl": 0.0228271484375, + "reward": 0.9965382814407349, + "reward_std": 0.0014487295411527157, + "rewards/perpo_ocr_edit_distance_reward": 0.9965383410453796, + "step": 2807, + "temperature": 0.9 + }, + { + "advantages": -3.2356808787881164e-07, + "completion_length": 589.0, + "delta_ref_entropy_loss": -0.022216796875, + "delta_ref_ppl": -0.1083984375, + "entropy_loss": -0.31640625, + "epoch": 0.5616, + "grad_norm": 1.624471046559514, + "k1_kl": 0.10791015625, + "k3_kl": 0.09423828125, + "kimi_kl": 0.2470703125, + "learning_rate": 2.192e-07, + "loss": 0.0038, + "ppl": 0.14453125, + "reward": 0.8114186525344849, + "reward_std": 0.22696222364902496, + "rewards/perpo_ocr_edit_distance_reward": 0.8114187121391296, + "step": 2808, + "temperature": 0.9 + }, + { + "advantages": -3.993511199951172e-05, + "completion_length": 688.0, + "delta_ref_entropy_loss": 0.03173828125, + "delta_ref_ppl": -0.06494140625, + "entropy_loss": -0.05126953125, + "epoch": 0.5618, + "grad_norm": 0.7372495264951408, + "k1_kl": 0.0654296875, + "k3_kl": 0.04345703125, + "kimi_kl": 0.1337890625, + "learning_rate": 2.1909999999999997e-07, + "loss": 0.0018, + "ppl": 0.023681640625, + "reward": 0.9762024879455566, + "reward_std": 0.0007529159775003791, + "rewards/perpo_ocr_edit_distance_reward": 0.9762024879455566, + "step": 2809, + "temperature": 0.9 + }, + { + "advantages": -1.7029899268550253e-08, + "completion_length": 222.0, + "delta_ref_entropy_loss": -0.12890625, + "delta_ref_ppl": -0.10791015625, + "entropy_loss": -0.369140625, + "epoch": 0.562, + "grad_norm": 7.847722264921149, + "k1_kl": 0.1083984375, + "k3_kl": 0.10009765625, + "kimi_kl": 0.365234375, + "learning_rate": 2.19e-07, + "loss": 0.004, + "ppl": 0.11181640625, + "reward": 0.8040774464607239, + "reward_std": 0.2848260700702667, + "rewards/perpo_ocr_edit_distance_reward": 0.8040774464607239, + "step": 2810, + "temperature": 0.9 + }, + { + "advantages": -4.257474756741431e-06, + "completion_length": 636.0, + "delta_ref_entropy_loss": 0.0546875, + "delta_ref_ppl": -0.0986328125, + "entropy_loss": -0.5234375, + "epoch": 0.5622, + "grad_norm": 2.1025056086150298, + "k1_kl": 0.099609375, + "k3_kl": 0.07763671875, + "kimi_kl": 0.173828125, + "learning_rate": 2.189e-07, + "loss": 0.0031, + "ppl": 0.296875, + "reward": 0.687557578086853, + "reward_std": 0.00788147933781147, + "rewards/perpo_ocr_edit_distance_reward": 0.6875576376914978, + "step": 2811, + "temperature": 0.9 + }, + { + "advantages": -1.660415136939264e-06, + "completion_length": 1766.0, + "delta_ref_entropy_loss": -0.0026092529296875, + "delta_ref_ppl": -0.0267333984375, + "entropy_loss": -0.05224609375, + "epoch": 0.5624, + "grad_norm": 0.6654027549084619, + "k1_kl": 0.0267333984375, + "k3_kl": 0.02392578125, + "kimi_kl": 0.0810546875, + "learning_rate": 2.1879999999999997e-07, + "loss": 0.001, + "ppl": 0.0225830078125, + "reward": 0.8547801971435547, + "reward_std": 0.015456327237188816, + "rewards/perpo_ocr_edit_distance_reward": 0.8547802567481995, + "step": 2812, + "temperature": 0.9 + }, + { + "advantages": -1.7434358596801758e-05, + "completion_length": 526.0, + "delta_ref_entropy_loss": 0.0294189453125, + "delta_ref_ppl": -0.054443359375, + "entropy_loss": -0.059814453125, + "epoch": 0.5626, + "grad_norm": 0.9984939431487704, + "k1_kl": 0.054443359375, + "k3_kl": 0.032958984375, + "kimi_kl": 0.0732421875, + "learning_rate": 2.187e-07, + "loss": 0.0013, + "ppl": 0.026123046875, + "reward": 0.9910098910331726, + "reward_std": 0.001853736350312829, + "rewards/perpo_ocr_edit_distance_reward": 0.9910099506378174, + "step": 2813, + "temperature": 0.9 + }, + { + "advantages": -4.853521318182175e-07, + "completion_length": 533.0, + "delta_ref_entropy_loss": -0.166015625, + "delta_ref_ppl": -0.09033203125, + "entropy_loss": -0.62890625, + "epoch": 0.5628, + "grad_norm": 5.198545953815228, + "k1_kl": 0.08984375, + "k3_kl": 0.09375, + "kimi_kl": 0.203125, + "learning_rate": 2.1859999999999999e-07, + "loss": 0.0038, + "ppl": 0.296875, + "reward": 0.5915753245353699, + "reward_std": 0.17699997127056122, + "rewards/perpo_ocr_edit_distance_reward": 0.5915753841400146, + "step": 2814, + "temperature": 0.9 + }, + { + "advantages": -3.916876778475853e-07, + "completion_length": 1147.0, + "delta_ref_entropy_loss": -0.1806640625, + "delta_ref_ppl": -0.08447265625, + "entropy_loss": -0.83984375, + "epoch": 0.563, + "grad_norm": 1997679.428467229, + "k1_kl": 0.0849609375, + "k3_kl": 39680.0, + "kimi_kl": 0.23828125, + "learning_rate": 2.1849999999999998e-07, + "loss": 1590.2682, + "ppl": 0.39453125, + "reward": 0.7025689482688904, + "reward_std": 0.19935007393360138, + "rewards/perpo_ocr_edit_distance_reward": 0.7025690674781799, + "step": 2815, + "temperature": 0.9 + }, + { + "advantages": 1.9669532775878906e-06, + "completion_length": 64.0, + "delta_ref_entropy_loss": 0.051513671875, + "delta_ref_ppl": -0.291015625, + "entropy_loss": -0.2119140625, + "epoch": 0.5632, + "grad_norm": 3.7150476838502047, + "k1_kl": 0.291015625, + "k3_kl": 0.24609375, + "kimi_kl": 0.78515625, + "learning_rate": 2.184e-07, + "loss": 0.0098, + "ppl": 0.06689453125, + "reward": 0.5965400338172913, + "reward_std": 0.012956112623214722, + "rewards/perpo_ocr_edit_distance_reward": 0.5965399742126465, + "step": 2816, + "temperature": 0.9 + }, + { + "advantages": -2.5102071958826855e-05, + "completion_length": 949.0, + "delta_ref_entropy_loss": 0.0250244140625, + "delta_ref_ppl": -0.047119140625, + "entropy_loss": -0.03271484375, + "epoch": 0.5634, + "grad_norm": 0.7298016153193114, + "k1_kl": 0.047119140625, + "k3_kl": 0.0308837890625, + "kimi_kl": 0.1083984375, + "learning_rate": 2.183e-07, + "loss": 0.0013, + "ppl": 0.010986328125, + "reward": 0.9874436259269714, + "reward_std": 0.0015957029536366463, + "rewards/perpo_ocr_edit_distance_reward": 0.9874436855316162, + "step": 2817, + "temperature": 0.9 + }, + { + "advantages": -7.31434192857705e-05, + "completion_length": 1361.0, + "delta_ref_entropy_loss": 0.04541015625, + "delta_ref_ppl": -0.0595703125, + "entropy_loss": -0.0966796875, + "epoch": 0.5636, + "grad_norm": 1.43533117261397, + "k1_kl": 0.059814453125, + "k3_kl": 0.034912109375, + "kimi_kl": 0.0732421875, + "learning_rate": 2.182e-07, + "loss": 0.0015, + "ppl": 0.046142578125, + "reward": 0.9898043274879456, + "reward_std": 0.0003658560453914106, + "rewards/perpo_ocr_edit_distance_reward": 0.9898043870925903, + "step": 2818, + "temperature": 0.9 + }, + { + "advantages": 1.7029899268550253e-08, + "completion_length": 559.0, + "delta_ref_entropy_loss": 0.04248046875, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.032958984375, + "epoch": 0.5638, + "grad_norm": 0.23278660624734937, + "k1_kl": 0.0673828125, + "k3_kl": 0.044677734375, + "kimi_kl": 0.15234375, + "learning_rate": 2.1809999999999997e-07, + "loss": 0.0018, + "ppl": 0.01031494140625, + "reward": 0.9733136892318726, + "reward_std": 0.0005169289070181549, + "rewards/perpo_ocr_edit_distance_reward": 0.9733136892318726, + "step": 2819, + "temperature": 0.9 + }, + { + "advantages": -6.324904825305566e-05, + "completion_length": 821.0, + "delta_ref_entropy_loss": 0.031982421875, + "delta_ref_ppl": -0.048583984375, + "entropy_loss": -0.0595703125, + "epoch": 0.564, + "grad_norm": 0.941575304826265, + "k1_kl": 0.048583984375, + "k3_kl": 0.02880859375, + "kimi_kl": 0.06884765625, + "learning_rate": 2.18e-07, + "loss": 0.0012, + "ppl": 0.02587890625, + "reward": 0.9972862601280212, + "reward_std": 0.0009772019693627954, + "rewards/perpo_ocr_edit_distance_reward": 0.9972863793373108, + "step": 2820, + "temperature": 0.9 + }, + { + "advantages": -7.729445496806875e-05, + "completion_length": 582.0, + "delta_ref_entropy_loss": 0.0634765625, + "delta_ref_ppl": -0.0654296875, + "entropy_loss": -0.07373046875, + "epoch": 0.5642, + "grad_norm": 0.620837103291378, + "k1_kl": 0.06591796875, + "k3_kl": 0.035400390625, + "kimi_kl": 0.09423828125, + "learning_rate": 2.179e-07, + "loss": 0.0015, + "ppl": 0.0247802734375, + "reward": 0.9184183478355408, + "reward_std": 0.0008912306511774659, + "rewards/perpo_ocr_edit_distance_reward": 0.9184184670448303, + "step": 2821, + "temperature": 0.9 + }, + { + "advantages": -8.83851771504851e-06, + "completion_length": 520.0, + "delta_ref_entropy_loss": 0.0311279296875, + "delta_ref_ppl": -0.059814453125, + "entropy_loss": -0.032958984375, + "epoch": 0.5644, + "grad_norm": 0.483830657196831, + "k1_kl": 0.06005859375, + "k3_kl": 0.040283203125, + "kimi_kl": 0.1669921875, + "learning_rate": 2.1779999999999998e-07, + "loss": 0.0016, + "ppl": 0.0086669921875, + "reward": 0.9981939792633057, + "reward_std": 0.0008631816599518061, + "rewards/perpo_ocr_edit_distance_reward": 0.9981940388679504, + "step": 2822, + "temperature": 0.9 + }, + { + "advantages": -0.0001589570747455582, + "completion_length": 720.0, + "delta_ref_entropy_loss": 0.036865234375, + "delta_ref_ppl": -0.0693359375, + "entropy_loss": -0.054931640625, + "epoch": 0.5646, + "grad_norm": 0.6005828840294721, + "k1_kl": 0.0693359375, + "k3_kl": 0.046875, + "kimi_kl": 0.185546875, + "learning_rate": 2.177e-07, + "loss": 0.002, + "ppl": 0.0260009765625, + "reward": 0.920124888420105, + "reward_std": 0.0005963547737337649, + "rewards/perpo_ocr_edit_distance_reward": 0.9201250076293945, + "step": 2823, + "temperature": 0.9 + }, + { + "advantages": -7.217271195258945e-05, + "completion_length": 488.0, + "delta_ref_entropy_loss": 0.048828125, + "delta_ref_ppl": -0.06005859375, + "entropy_loss": -0.044921875, + "epoch": 0.5648, + "grad_norm": 0.34676657771603003, + "k1_kl": 0.059814453125, + "k3_kl": 0.0301513671875, + "kimi_kl": 0.06005859375, + "learning_rate": 2.176e-07, + "loss": 0.0013, + "ppl": 0.0142822265625, + "reward": 0.9967687726020813, + "reward_std": 0.0006077663274481893, + "rewards/perpo_ocr_edit_distance_reward": 0.9967688322067261, + "step": 2824, + "temperature": 0.9 + }, + { + "advantages": -2.912112677222467e-06, + "completion_length": 433.0, + "delta_ref_entropy_loss": 0.12451171875, + "delta_ref_ppl": -0.1875, + "entropy_loss": -0.318359375, + "epoch": 0.565, + "grad_norm": 2.0408522606447197, + "k1_kl": 0.1884765625, + "k3_kl": 0.1279296875, + "kimi_kl": 0.400390625, + "learning_rate": 2.1749999999999998e-07, + "loss": 0.0051, + "ppl": 0.15234375, + "reward": 0.8205776214599609, + "reward_std": 0.008618347346782684, + "rewards/perpo_ocr_edit_distance_reward": 0.8205776810646057, + "step": 2825, + "temperature": 0.9 + }, + { + "advantages": -1.8732889373040962e-07, + "completion_length": 763.0, + "delta_ref_entropy_loss": 0.0196533203125, + "delta_ref_ppl": -0.08837890625, + "entropy_loss": -0.1650390625, + "epoch": 0.5652, + "grad_norm": 1.361355448294186, + "k1_kl": 0.08837890625, + "k3_kl": 0.062255859375, + "kimi_kl": 0.154296875, + "learning_rate": 2.174e-07, + "loss": 0.0025, + "ppl": 0.06982421875, + "reward": 0.7805926203727722, + "reward_std": 0.33018919825553894, + "rewards/perpo_ocr_edit_distance_reward": 0.7805926203727722, + "step": 2826, + "temperature": 0.9 + }, + { + "advantages": 4.146780611335998e-06, + "completion_length": 303.0, + "delta_ref_entropy_loss": 0.03173828125, + "delta_ref_ppl": -0.10205078125, + "entropy_loss": -0.0625, + "epoch": 0.5654, + "grad_norm": 0.8784308166308865, + "k1_kl": 0.10205078125, + "k3_kl": 0.07177734375, + "kimi_kl": 0.2041015625, + "learning_rate": 2.173e-07, + "loss": 0.0029, + "ppl": 0.024658203125, + "reward": 0.9927638173103333, + "reward_std": 0.0019687958993017673, + "rewards/perpo_ocr_edit_distance_reward": 0.9927638173103333, + "step": 2827, + "temperature": 0.9 + }, + { + "advantages": -1.24105395116203e-06, + "completion_length": 2035.0, + "delta_ref_entropy_loss": -0.0206298828125, + "delta_ref_ppl": -0.00823974609375, + "entropy_loss": -0.298828125, + "epoch": 0.5656, + "grad_norm": 183.01527330676905, + "k1_kl": 0.00830078125, + "k3_kl": 2.328125, + "kimi_kl": 0.07861328125, + "learning_rate": 2.1719999999999999e-07, + "loss": 0.0932, + "ppl": 0.171875, + "reward": 0.9257774949073792, + "reward_std": 0.020670806989073753, + "rewards/perpo_ocr_edit_distance_reward": 0.9257776141166687, + "step": 2828, + "temperature": 0.9 + }, + { + "advantages": -7.993834879016504e-05, + "completion_length": 917.0, + "delta_ref_entropy_loss": 0.0615234375, + "delta_ref_ppl": -0.07421875, + "entropy_loss": -0.08935546875, + "epoch": 0.5658, + "grad_norm": 1.1669685734585094, + "k1_kl": 0.07421875, + "k3_kl": 0.046630859375, + "kimi_kl": 0.11962890625, + "learning_rate": 2.1709999999999998e-07, + "loss": 0.0019, + "ppl": 0.041015625, + "reward": 0.9675992727279663, + "reward_std": 0.0011784447124227881, + "rewards/perpo_ocr_edit_distance_reward": 0.9675993919372559, + "step": 2829, + "temperature": 0.9 + }, + { + "advantages": -1.1495181752252392e-05, + "completion_length": 2048.0, + "delta_ref_entropy_loss": -0.0023345947265625, + "delta_ref_ppl": -0.0257568359375, + "entropy_loss": -0.072265625, + "epoch": 0.566, + "grad_norm": 132.12865558629747, + "k1_kl": 0.02587890625, + "k3_kl": 0.7734375, + "kimi_kl": 0.05908203125, + "learning_rate": 2.17e-07, + "loss": 0.0309, + "ppl": 0.03759765625, + "reward": 0.8204036951065063, + "reward_std": 0.00657895440235734, + "rewards/perpo_ocr_edit_distance_reward": 0.8204038143157959, + "step": 2830, + "temperature": 0.9 + }, + { + "advantages": -2.4829592803143896e-05, + "completion_length": 192.0, + "delta_ref_entropy_loss": 0.03369140625, + "delta_ref_ppl": -0.09619140625, + "entropy_loss": -0.04296875, + "epoch": 0.5662, + "grad_norm": 1.2563522770426923, + "k1_kl": 0.09619140625, + "k3_kl": 0.06982421875, + "kimi_kl": 0.275390625, + "learning_rate": 2.169e-07, + "loss": 0.0028, + "ppl": 0.014892578125, + "reward": 0.9865689873695374, + "reward_std": 0.0022994629107415676, + "rewards/perpo_ocr_edit_distance_reward": 0.9865690469741821, + "step": 2831, + "temperature": 0.9 + }, + { + "advantages": 3.4059798537100505e-08, + "completion_length": 633.0, + "delta_ref_entropy_loss": -0.392578125, + "delta_ref_ppl": -0.07275390625, + "entropy_loss": -0.875, + "epoch": 0.5664, + "grad_norm": 4.3711922469616775, + "k1_kl": 0.07275390625, + "k3_kl": 0.1328125, + "kimi_kl": 0.369140625, + "learning_rate": 2.1679999999999998e-07, + "loss": 0.0053, + "ppl": 0.443359375, + "reward": 0.3648548126220703, + "reward_std": 0.08192714303731918, + "rewards/perpo_ocr_edit_distance_reward": 0.3648548424243927, + "step": 2832, + "temperature": 0.9 + }, + { + "advantages": -2.888270864787046e-05, + "completion_length": 688.0, + "delta_ref_entropy_loss": 0.057373046875, + "delta_ref_ppl": -0.06787109375, + "entropy_loss": -0.0625, + "epoch": 0.5666, + "grad_norm": 0.871289523859012, + "k1_kl": 0.068359375, + "k3_kl": 0.041748046875, + "kimi_kl": 0.11962890625, + "learning_rate": 2.167e-07, + "loss": 0.0017, + "ppl": 0.0247802734375, + "reward": 0.9902231693267822, + "reward_std": 0.0025536739267408848, + "rewards/perpo_ocr_edit_distance_reward": 0.990223228931427, + "step": 2833, + "temperature": 0.9 + }, + { + "advantages": -0.00010674340592231601, + "completion_length": 316.0, + "delta_ref_entropy_loss": 0.05712890625, + "delta_ref_ppl": -0.1005859375, + "entropy_loss": -0.05126953125, + "epoch": 0.5668, + "grad_norm": 0.6338241462424422, + "k1_kl": 0.10009765625, + "k3_kl": 0.0693359375, + "kimi_kl": 0.2099609375, + "learning_rate": 2.1659999999999997e-07, + "loss": 0.0029, + "ppl": 0.0189208984375, + "reward": 0.9955925345420837, + "reward_std": 0.000937118660658598, + "rewards/perpo_ocr_edit_distance_reward": 0.9955927133560181, + "step": 2834, + "temperature": 0.9 + }, + { + "advantages": -4.087175966560608e-07, + "completion_length": 485.0, + "delta_ref_entropy_loss": -0.022705078125, + "delta_ref_ppl": -0.0869140625, + "entropy_loss": -0.384765625, + "epoch": 0.567, + "grad_norm": 5.82645862801037, + "k1_kl": 0.0869140625, + "k3_kl": 0.072265625, + "kimi_kl": 0.1767578125, + "learning_rate": 2.1649999999999999e-07, + "loss": 0.0029, + "ppl": 0.1982421875, + "reward": 0.8901128172874451, + "reward_std": 0.08348097652196884, + "rewards/perpo_ocr_edit_distance_reward": 0.8901128768920898, + "step": 2835, + "temperature": 0.9 + }, + { + "advantages": -3.1403134926222265e-05, + "completion_length": 884.0, + "delta_ref_entropy_loss": 0.06982421875, + "delta_ref_ppl": -0.07373046875, + "entropy_loss": -0.11865234375, + "epoch": 0.5672, + "grad_norm": 1.3988144265458475, + "k1_kl": 0.07373046875, + "k3_kl": 0.046875, + "kimi_kl": 0.10888671875, + "learning_rate": 2.164e-07, + "loss": 0.0019, + "ppl": 0.06396484375, + "reward": 0.9638456702232361, + "reward_std": 0.0028834363911300898, + "rewards/perpo_ocr_edit_distance_reward": 0.9638457298278809, + "step": 2836, + "temperature": 0.9 + }, + { + "advantages": -4.858630200033076e-05, + "completion_length": 789.0, + "delta_ref_entropy_loss": 0.04150390625, + "delta_ref_ppl": -0.031982421875, + "entropy_loss": -0.03515625, + "epoch": 0.5674, + "grad_norm": 0.26935692448366166, + "k1_kl": 0.031982421875, + "k3_kl": 0.0140380859375, + "kimi_kl": 0.025634765625, + "learning_rate": 2.1629999999999997e-07, + "loss": 0.0006, + "ppl": 0.0103759765625, + "reward": 0.9952393174171448, + "reward_std": 0.00025054646539501846, + "rewards/perpo_ocr_edit_distance_reward": 0.9952393174171448, + "step": 2837, + "temperature": 0.9 + }, + { + "advantages": -1.7694064808893017e-05, + "completion_length": 130.0, + "delta_ref_entropy_loss": 0.087890625, + "delta_ref_ppl": -0.181640625, + "entropy_loss": -0.1337890625, + "epoch": 0.5676, + "grad_norm": 2.094130025668324, + "k1_kl": 0.181640625, + "k3_kl": 0.1279296875, + "kimi_kl": 0.341796875, + "learning_rate": 2.162e-07, + "loss": 0.0051, + "ppl": 0.06787109375, + "reward": 0.7904641032218933, + "reward_std": 0.005190785508602858, + "rewards/perpo_ocr_edit_distance_reward": 0.7904641628265381, + "step": 2838, + "temperature": 0.9 + }, + { + "advantages": 6.811959707420101e-08, + "completion_length": 835.0, + "delta_ref_entropy_loss": 0.053466796875, + "delta_ref_ppl": -0.099609375, + "entropy_loss": -0.1591796875, + "epoch": 0.5678, + "grad_norm": 4.193831751486923, + "k1_kl": 0.099609375, + "k3_kl": 0.07373046875, + "kimi_kl": 0.2001953125, + "learning_rate": 2.1609999999999998e-07, + "loss": 0.0029, + "ppl": 0.072265625, + "reward": 0.5627378821372986, + "reward_std": 0.25561758875846863, + "rewards/perpo_ocr_edit_distance_reward": 0.5627378821372986, + "step": 2839, + "temperature": 0.9 + }, + { + "advantages": -1.0405268767499365e-05, + "completion_length": 797.0, + "delta_ref_entropy_loss": 0.041015625, + "delta_ref_ppl": -0.05224609375, + "entropy_loss": -0.142578125, + "epoch": 0.568, + "grad_norm": 0.7395154852717993, + "k1_kl": 0.052001953125, + "k3_kl": 0.03173828125, + "kimi_kl": 0.07373046875, + "learning_rate": 2.1599999999999998e-07, + "loss": 0.0013, + "ppl": 0.05322265625, + "reward": 0.9000911712646484, + "reward_std": 0.004813615698367357, + "rewards/perpo_ocr_edit_distance_reward": 0.900091290473938, + "step": 2840, + "temperature": 0.9 + }, + { + "advantages": -2.1985599232721142e-05, + "completion_length": 477.0, + "delta_ref_entropy_loss": 0.033935546875, + "delta_ref_ppl": -0.059326171875, + "entropy_loss": -0.05029296875, + "epoch": 0.5682, + "grad_norm": 0.8128189758103637, + "k1_kl": 0.059326171875, + "k3_kl": 0.04248046875, + "kimi_kl": 0.1171875, + "learning_rate": 2.159e-07, + "loss": 0.0017, + "ppl": 0.021484375, + "reward": 0.9937981963157654, + "reward_std": 0.001836639828979969, + "rewards/perpo_ocr_edit_distance_reward": 0.9937982559204102, + "step": 2841, + "temperature": 0.9 + }, + { + "advantages": -3.015995207533706e-05, + "completion_length": 1418.0, + "delta_ref_entropy_loss": 0.05419921875, + "delta_ref_ppl": -0.0712890625, + "entropy_loss": -0.2314453125, + "epoch": 0.5684, + "grad_norm": 1.8002613985595888, + "k1_kl": 0.07177734375, + "k3_kl": 0.04736328125, + "kimi_kl": 0.09033203125, + "learning_rate": 2.158e-07, + "loss": 0.0019, + "ppl": 0.12255859375, + "reward": 0.968000590801239, + "reward_std": 0.0015926981577649713, + "rewards/perpo_ocr_edit_distance_reward": 0.9680007100105286, + "step": 2842, + "temperature": 0.9 + }, + { + "advantages": -7.910388376330957e-05, + "completion_length": 458.0, + "delta_ref_entropy_loss": 0.041259765625, + "delta_ref_ppl": -0.07568359375, + "entropy_loss": -0.03955078125, + "epoch": 0.5686, + "grad_norm": 0.7878772866361481, + "k1_kl": 0.0751953125, + "k3_kl": 0.0498046875, + "kimi_kl": 0.158203125, + "learning_rate": 2.157e-07, + "loss": 0.0021, + "ppl": 0.0115966796875, + "reward": 0.9921441078186035, + "reward_std": 0.0006535255233757198, + "rewards/perpo_ocr_edit_distance_reward": 0.9921442270278931, + "step": 2843, + "temperature": 0.9 + }, + { + "advantages": 1.7515250874566846e-05, + "completion_length": 745.0, + "delta_ref_entropy_loss": 0.04443359375, + "delta_ref_ppl": -0.062255859375, + "entropy_loss": -0.05322265625, + "epoch": 0.5688, + "grad_norm": 0.43718317370682785, + "k1_kl": 0.062255859375, + "k3_kl": 0.039306640625, + "kimi_kl": 0.1142578125, + "learning_rate": 2.156e-07, + "loss": 0.0016, + "ppl": 0.021240234375, + "reward": 0.9820958971977234, + "reward_std": 0.0008717746823094785, + "rewards/perpo_ocr_edit_distance_reward": 0.9820958971977234, + "step": 2844, + "temperature": 0.9 + }, + { + "advantages": -2.3007394702290185e-05, + "completion_length": 553.0, + "delta_ref_entropy_loss": 0.06689453125, + "delta_ref_ppl": -0.072265625, + "entropy_loss": -0.041748046875, + "epoch": 0.569, + "grad_norm": 0.35027720007249497, + "k1_kl": 0.07275390625, + "k3_kl": 0.043212890625, + "kimi_kl": 0.134765625, + "learning_rate": 2.155e-07, + "loss": 0.0018, + "ppl": 0.0118408203125, + "reward": 0.9975235462188721, + "reward_std": 0.0006402382859960198, + "rewards/perpo_ocr_edit_distance_reward": 0.9975236058235168, + "step": 2845, + "temperature": 0.9 + }, + { + "advantages": -6.495203706435859e-05, + "completion_length": 693.0, + "delta_ref_entropy_loss": 0.1162109375, + "delta_ref_ppl": -0.11865234375, + "entropy_loss": -0.10302734375, + "epoch": 0.5692, + "grad_norm": 0.8119423963077822, + "k1_kl": 0.11865234375, + "k3_kl": 0.076171875, + "kimi_kl": 0.201171875, + "learning_rate": 2.154e-07, + "loss": 0.0031, + "ppl": 0.05078125, + "reward": 0.9561340808868408, + "reward_std": 0.0008178827702067792, + "rewards/perpo_ocr_edit_distance_reward": 0.9561341404914856, + "step": 2846, + "temperature": 0.9 + }, + { + "advantages": -2.9291426471900195e-05, + "completion_length": 650.0, + "delta_ref_entropy_loss": 0.0242919921875, + "delta_ref_ppl": -0.051025390625, + "entropy_loss": -0.040283203125, + "epoch": 0.5694, + "grad_norm": 0.687425616863842, + "k1_kl": 0.051025390625, + "k3_kl": 0.03369140625, + "kimi_kl": 0.08203125, + "learning_rate": 2.1529999999999998e-07, + "loss": 0.0014, + "ppl": 0.0146484375, + "reward": 0.9748942852020264, + "reward_std": 0.0010627374285832047, + "rewards/perpo_ocr_edit_distance_reward": 0.9748942852020264, + "step": 2847, + "temperature": 0.9 + }, + { + "advantages": -1.687663097982295e-05, + "completion_length": 1458.0, + "delta_ref_entropy_loss": 0.052734375, + "delta_ref_ppl": -0.054443359375, + "entropy_loss": -0.1064453125, + "epoch": 0.5696, + "grad_norm": 0.9229252954686948, + "k1_kl": 0.054443359375, + "k3_kl": 0.033203125, + "kimi_kl": 0.0673828125, + "learning_rate": 2.152e-07, + "loss": 0.0013, + "ppl": 0.042236328125, + "reward": 0.9037835597991943, + "reward_std": 0.0019173282198607922, + "rewards/perpo_ocr_edit_distance_reward": 0.9037836194038391, + "step": 2848, + "temperature": 0.9 + }, + { + "advantages": -5.177089406060986e-06, + "completion_length": 553.0, + "delta_ref_entropy_loss": 0.11767578125, + "delta_ref_ppl": -0.1796875, + "entropy_loss": -0.412109375, + "epoch": 0.5698, + "grad_norm": 2.0186384780560243, + "k1_kl": 0.1787109375, + "k3_kl": 0.11767578125, + "kimi_kl": 0.298828125, + "learning_rate": 2.1510000000000001e-07, + "loss": 0.0047, + "ppl": 0.2109375, + "reward": 0.8574023842811584, + "reward_std": 0.01143658347427845, + "rewards/perpo_ocr_edit_distance_reward": 0.857402503490448, + "step": 2849, + "temperature": 0.9 + }, + { + "advantages": -1.546314888400957e-05, + "completion_length": 971.0, + "delta_ref_entropy_loss": 0.033203125, + "delta_ref_ppl": -0.056884765625, + "entropy_loss": -0.07177734375, + "epoch": 0.57, + "grad_norm": 0.7417673648171739, + "k1_kl": 0.056884765625, + "k3_kl": 0.03466796875, + "kimi_kl": 0.11962890625, + "learning_rate": 2.1499999999999998e-07, + "loss": 0.0014, + "ppl": 0.0322265625, + "reward": 0.9920477867126465, + "reward_std": 0.004858987405896187, + "rewards/perpo_ocr_edit_distance_reward": 0.992047905921936, + "step": 2850, + "temperature": 0.9 + }, + { + "advantages": -2.1287374352141342e-07, + "completion_length": 1093.0, + "delta_ref_entropy_loss": 0.0257568359375, + "delta_ref_ppl": -0.0498046875, + "entropy_loss": -0.1015625, + "epoch": 0.5702, + "grad_norm": 2.7260267912156775, + "k1_kl": 0.0498046875, + "k3_kl": 0.03857421875, + "kimi_kl": 0.07177734375, + "learning_rate": 2.149e-07, + "loss": 0.0015, + "ppl": 0.04931640625, + "reward": 0.9590681195259094, + "reward_std": 0.042695302516222, + "rewards/perpo_ocr_edit_distance_reward": 0.9590681791305542, + "step": 2851, + "temperature": 0.9 + }, + { + "advantages": 3.4059798537100505e-08, + "completion_length": 184.0, + "delta_ref_entropy_loss": 0.07421875, + "delta_ref_ppl": -0.181640625, + "entropy_loss": -0.10302734375, + "epoch": 0.5704, + "grad_norm": 1.9909452126740166, + "k1_kl": 0.181640625, + "k3_kl": 0.119140625, + "kimi_kl": 0.3515625, + "learning_rate": 2.148e-07, + "loss": 0.0048, + "ppl": 0.038330078125, + "reward": 0.9808394312858582, + "reward_std": 0.002454457338899374, + "rewards/perpo_ocr_edit_distance_reward": 0.9808394908905029, + "step": 2852, + "temperature": 0.9 + }, + { + "advantages": -7.876328140810074e-07, + "completion_length": 515.0, + "delta_ref_entropy_loss": 0.041015625, + "delta_ref_ppl": -0.10546875, + "entropy_loss": -0.259765625, + "epoch": 0.5706, + "grad_norm": 1.5773225217420856, + "k1_kl": 0.10546875, + "k3_kl": 0.07275390625, + "kimi_kl": 0.1845703125, + "learning_rate": 2.1469999999999998e-07, + "loss": 0.0029, + "ppl": 0.1171875, + "reward": 0.7802799344062805, + "reward_std": 0.021593239158391953, + "rewards/perpo_ocr_edit_distance_reward": 0.7802799940109253, + "step": 2853, + "temperature": 0.9 + }, + { + "advantages": -2.8610231311176904e-05, + "completion_length": 300.0, + "delta_ref_entropy_loss": 0.0341796875, + "delta_ref_ppl": -0.111328125, + "entropy_loss": -0.068359375, + "epoch": 0.5708, + "grad_norm": 1.1927421566876022, + "k1_kl": 0.11181640625, + "k3_kl": 0.08349609375, + "kimi_kl": 0.333984375, + "learning_rate": 2.146e-07, + "loss": 0.0034, + "ppl": 0.028564453125, + "reward": 0.6610549688339233, + "reward_std": 0.0007936075562611222, + "rewards/perpo_ocr_edit_distance_reward": 0.6610550284385681, + "step": 2854, + "temperature": 0.9 + }, + { + "advantages": 5.841255642735632e-06, + "completion_length": 232.0, + "delta_ref_entropy_loss": 0.038818359375, + "delta_ref_ppl": -0.154296875, + "entropy_loss": -0.0634765625, + "epoch": 0.571, + "grad_norm": 0.9179708256141595, + "k1_kl": 0.154296875, + "k3_kl": 0.1171875, + "kimi_kl": 0.6953125, + "learning_rate": 2.145e-07, + "loss": 0.0047, + "ppl": 0.0230712890625, + "reward": 0.9960248470306396, + "reward_std": 0.001357580884359777, + "rewards/perpo_ocr_edit_distance_reward": 0.9960248470306396, + "step": 2855, + "temperature": 0.9 + }, + { + "advantages": 1.27724248955019e-08, + "completion_length": 896.0, + "delta_ref_entropy_loss": 0.0703125, + "delta_ref_ppl": -0.09765625, + "entropy_loss": -0.4375, + "epoch": 0.5712, + "grad_norm": 2.379634026263767, + "k1_kl": 0.0966796875, + "k3_kl": 0.0634765625, + "kimi_kl": 0.11865234375, + "learning_rate": 2.144e-07, + "loss": 0.0025, + "ppl": 0.240234375, + "reward": 0.7714194059371948, + "reward_std": 0.007020390592515469, + "rewards/perpo_ocr_edit_distance_reward": 0.7714194059371948, + "step": 2856, + "temperature": 0.9 + }, + { + "advantages": -9.179115295410156e-06, + "completion_length": 94.0, + "delta_ref_entropy_loss": 0.099609375, + "delta_ref_ppl": -0.32421875, + "entropy_loss": -0.234375, + "epoch": 0.5714, + "grad_norm": 2.772571894605845, + "k1_kl": 0.32421875, + "k3_kl": 0.240234375, + "kimi_kl": 0.72265625, + "learning_rate": 2.1429999999999998e-07, + "loss": 0.0097, + "ppl": 0.1005859375, + "reward": 0.927376389503479, + "reward_std": 0.00918322429060936, + "rewards/perpo_ocr_edit_distance_reward": 0.9273765087127686, + "step": 2857, + "temperature": 0.9 + }, + { + "advantages": -1.977171268663369e-05, + "completion_length": 630.0, + "delta_ref_entropy_loss": 0.0172119140625, + "delta_ref_ppl": -0.038818359375, + "entropy_loss": -0.0439453125, + "epoch": 0.5716, + "grad_norm": 0.7140338897899667, + "k1_kl": 0.038818359375, + "k3_kl": 0.0279541015625, + "kimi_kl": 0.07666015625, + "learning_rate": 2.142e-07, + "loss": 0.0011, + "ppl": 0.01422119140625, + "reward": 0.9812312126159668, + "reward_std": 0.0029109427705407143, + "rewards/perpo_ocr_edit_distance_reward": 0.9812312126159668, + "step": 2858, + "temperature": 0.9 + }, + { + "advantages": 7.749668839096557e-06, + "completion_length": 991.0, + "delta_ref_entropy_loss": 0.055419921875, + "delta_ref_ppl": -0.06201171875, + "entropy_loss": -0.044921875, + "epoch": 0.5718, + "grad_norm": 4.63428852022808, + "k1_kl": 0.06201171875, + "k3_kl": 0.08056640625, + "kimi_kl": 0.07177734375, + "learning_rate": 2.141e-07, + "loss": 0.0032, + "ppl": 0.0201416015625, + "reward": 0.9924485087394714, + "reward_std": 0.000997391645796597, + "rewards/perpo_ocr_edit_distance_reward": 0.9924485683441162, + "step": 2859, + "temperature": 0.9 + }, + { + "advantages": -9.747062722453848e-05, + "completion_length": 506.0, + "delta_ref_entropy_loss": 0.054443359375, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.05859375, + "epoch": 0.572, + "grad_norm": 0.6410459745936412, + "k1_kl": 0.0703125, + "k3_kl": 0.043212890625, + "kimi_kl": 0.10986328125, + "learning_rate": 2.1399999999999998e-07, + "loss": 0.0018, + "ppl": 0.0244140625, + "reward": 0.9888700246810913, + "reward_std": 0.0008611110970377922, + "rewards/perpo_ocr_edit_distance_reward": 0.9888701438903809, + "step": 2860, + "temperature": 0.9 + }, + { + "advantages": -5.361863804864697e-05, + "completion_length": 934.0, + "delta_ref_entropy_loss": 0.06640625, + "delta_ref_ppl": -0.07373046875, + "entropy_loss": -0.1572265625, + "epoch": 0.5722, + "grad_norm": 1.0525662725728404, + "k1_kl": 0.07373046875, + "k3_kl": 0.04052734375, + "kimi_kl": 0.09375, + "learning_rate": 2.139e-07, + "loss": 0.0017, + "ppl": 0.07421875, + "reward": 0.8861595392227173, + "reward_std": 0.0010115415789186954, + "rewards/perpo_ocr_edit_distance_reward": 0.8861595988273621, + "step": 2861, + "temperature": 0.9 + }, + { + "advantages": -1.8562590412329882e-05, + "completion_length": 1260.0, + "delta_ref_entropy_loss": 0.009765625, + "delta_ref_ppl": -0.0311279296875, + "entropy_loss": -0.06103515625, + "epoch": 0.5724, + "grad_norm": 0.7672332136586364, + "k1_kl": 0.031005859375, + "k3_kl": 0.0244140625, + "kimi_kl": 0.06103515625, + "learning_rate": 2.1379999999999997e-07, + "loss": 0.001, + "ppl": 0.0294189453125, + "reward": 0.9819105267524719, + "reward_std": 0.0026520071551203728, + "rewards/perpo_ocr_edit_distance_reward": 0.9819105863571167, + "step": 2862, + "temperature": 0.9 + }, + { + "advantages": -7.3058267844317015e-06, + "completion_length": 282.0, + "delta_ref_entropy_loss": 0.05322265625, + "delta_ref_ppl": -0.1064453125, + "entropy_loss": -0.04248046875, + "epoch": 0.5726, + "grad_norm": 1.058196139027536, + "k1_kl": 0.10693359375, + "k3_kl": 0.07275390625, + "kimi_kl": 0.251953125, + "learning_rate": 2.137e-07, + "loss": 0.0029, + "ppl": 0.019287109375, + "reward": 0.9092102646827698, + "reward_std": 0.0045697796158492565, + "rewards/perpo_ocr_edit_distance_reward": 0.9092102646827698, + "step": 2863, + "temperature": 0.9 + }, + { + "advantages": -3.773825665120967e-05, + "completion_length": 789.0, + "delta_ref_entropy_loss": 0.03173828125, + "delta_ref_ppl": -0.045654296875, + "entropy_loss": -0.050048828125, + "epoch": 0.5728, + "grad_norm": 0.4299085239455075, + "k1_kl": 0.045654296875, + "k3_kl": 0.0262451171875, + "kimi_kl": 0.064453125, + "learning_rate": 2.136e-07, + "loss": 0.0011, + "ppl": 0.0185546875, + "reward": 0.9805499315261841, + "reward_std": 0.00035139688407070935, + "rewards/perpo_ocr_edit_distance_reward": 0.9805499911308289, + "step": 2864, + "temperature": 0.9 + }, + { + "advantages": -1.704692840576172e-05, + "completion_length": 366.0, + "delta_ref_entropy_loss": 0.07666015625, + "delta_ref_ppl": -0.1201171875, + "entropy_loss": -0.072265625, + "epoch": 0.573, + "grad_norm": 1.5372455664967821, + "k1_kl": 0.1201171875, + "k3_kl": 0.07568359375, + "kimi_kl": 0.2373046875, + "learning_rate": 2.1349999999999997e-07, + "loss": 0.003, + "ppl": 0.0262451171875, + "reward": 0.9824371933937073, + "reward_std": 0.0028995205648243427, + "rewards/perpo_ocr_edit_distance_reward": 0.9824373126029968, + "step": 2865, + "temperature": 0.9 + }, + { + "advantages": -5.5364202125929296e-05, + "completion_length": 491.0, + "delta_ref_entropy_loss": 0.0625, + "delta_ref_ppl": -0.080078125, + "entropy_loss": -0.045166015625, + "epoch": 0.5732, + "grad_norm": 0.639777149832801, + "k1_kl": 0.080078125, + "k3_kl": 0.04443359375, + "kimi_kl": 0.11328125, + "learning_rate": 2.134e-07, + "loss": 0.0018, + "ppl": 0.0157470703125, + "reward": 0.9947063326835632, + "reward_std": 0.0009763152920641005, + "rewards/perpo_ocr_edit_distance_reward": 0.9947064518928528, + "step": 2866, + "temperature": 0.9 + }, + { + "advantages": -0.00016501120990142226, + "completion_length": 770.0, + "delta_ref_entropy_loss": 0.0380859375, + "delta_ref_ppl": -0.0927734375, + "entropy_loss": -0.06884765625, + "epoch": 0.5734, + "grad_norm": 0.8328009328559007, + "k1_kl": 0.09326171875, + "k3_kl": 0.060546875, + "kimi_kl": 0.240234375, + "learning_rate": 2.1329999999999998e-07, + "loss": 0.0026, + "ppl": 0.031494140625, + "reward": 0.9903775453567505, + "reward_std": 0.0005192854441702366, + "rewards/perpo_ocr_edit_distance_reward": 0.9903776049613953, + "step": 2867, + "temperature": 0.9 + }, + { + "advantages": -4.172325134277344e-06, + "completion_length": 379.0, + "delta_ref_entropy_loss": 0.029541015625, + "delta_ref_ppl": -0.08203125, + "entropy_loss": -0.0986328125, + "epoch": 0.5736, + "grad_norm": 1.3207127070083977, + "k1_kl": 0.08203125, + "k3_kl": 0.05712890625, + "kimi_kl": 0.16015625, + "learning_rate": 2.132e-07, + "loss": 0.0023, + "ppl": 0.045166015625, + "reward": 0.8570623993873596, + "reward_std": 0.024177711457014084, + "rewards/perpo_ocr_edit_distance_reward": 0.8570625185966492, + "step": 2868, + "temperature": 0.9 + }, + { + "advantages": -3.6716461181640625e-05, + "completion_length": 210.0, + "delta_ref_entropy_loss": 0.00701904296875, + "delta_ref_ppl": -0.1396484375, + "entropy_loss": -0.06396484375, + "epoch": 0.5738, + "grad_norm": 1.5617785926023926, + "k1_kl": 0.140625, + "k3_kl": 0.11328125, + "kimi_kl": 0.46484375, + "learning_rate": 2.131e-07, + "loss": 0.0046, + "ppl": 0.033447265625, + "reward": 0.9942829012870789, + "reward_std": 0.0010590292513370514, + "rewards/perpo_ocr_edit_distance_reward": 0.9942829608917236, + "step": 2869, + "temperature": 0.9 + }, + { + "advantages": -1.7017126083374023e-05, + "completion_length": 1586.0, + "delta_ref_entropy_loss": 0.0068359375, + "delta_ref_ppl": -0.0294189453125, + "entropy_loss": -0.052490234375, + "epoch": 0.574, + "grad_norm": 2.5685850640149916, + "k1_kl": 0.0294189453125, + "k3_kl": 0.024658203125, + "kimi_kl": 0.058837890625, + "learning_rate": 2.13e-07, + "loss": 0.001, + "ppl": 0.0228271484375, + "reward": 0.9886996746063232, + "reward_std": 0.0019004213390871882, + "rewards/perpo_ocr_edit_distance_reward": 0.988699734210968, + "step": 2870, + "temperature": 0.9 + }, + { + "advantages": -1.3904912520956714e-05, + "completion_length": 687.0, + "delta_ref_entropy_loss": 0.04541015625, + "delta_ref_ppl": -0.061767578125, + "entropy_loss": -0.0615234375, + "epoch": 0.5742, + "grad_norm": 0.467635063484141, + "k1_kl": 0.061767578125, + "k3_kl": 0.03466796875, + "kimi_kl": 0.07861328125, + "learning_rate": 2.129e-07, + "loss": 0.0014, + "ppl": 0.0247802734375, + "reward": 0.6095570921897888, + "reward_std": 0.0005131805664859712, + "rewards/perpo_ocr_edit_distance_reward": 0.6095570921897888, + "step": 2871, + "temperature": 0.9 + }, + { + "advantages": -2.050399962172378e-05, + "completion_length": 561.0, + "delta_ref_entropy_loss": 0.042724609375, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.050048828125, + "epoch": 0.5744, + "grad_norm": 0.5913423855702388, + "k1_kl": 0.06689453125, + "k3_kl": 0.041748046875, + "kimi_kl": 0.10107421875, + "learning_rate": 2.1279999999999997e-07, + "loss": 0.0017, + "ppl": 0.025390625, + "reward": 0.9952742457389832, + "reward_std": 0.002806401811540127, + "rewards/perpo_ocr_edit_distance_reward": 0.9952743053436279, + "step": 2872, + "temperature": 0.9 + }, + { + "advantages": -0.00010447842942085117, + "completion_length": 961.0, + "delta_ref_entropy_loss": 0.0301513671875, + "delta_ref_ppl": -0.04248046875, + "entropy_loss": -0.034423828125, + "epoch": 0.5746, + "grad_norm": 0.827551374158478, + "k1_kl": 0.04248046875, + "k3_kl": 0.0257568359375, + "kimi_kl": 0.0712890625, + "learning_rate": 2.127e-07, + "loss": 0.0011, + "ppl": 0.0111083984375, + "reward": 0.9971861839294434, + "reward_std": 0.0005519766127690673, + "rewards/perpo_ocr_edit_distance_reward": 0.9971863031387329, + "step": 2873, + "temperature": 0.9 + }, + { + "advantages": -1.1410032811909332e-06, + "completion_length": 872.0, + "delta_ref_entropy_loss": 0.01092529296875, + "delta_ref_ppl": -0.04736328125, + "entropy_loss": -0.103515625, + "epoch": 0.5748, + "grad_norm": 2.915818658458379, + "k1_kl": 0.04736328125, + "k3_kl": 0.03173828125, + "kimi_kl": 0.07958984375, + "learning_rate": 2.126e-07, + "loss": 0.0013, + "ppl": 0.04150390625, + "reward": 0.9904397130012512, + "reward_std": 0.01472469698637724, + "rewards/perpo_ocr_edit_distance_reward": 0.990439772605896, + "step": 2874, + "temperature": 0.9 + }, + { + "advantages": 6.811959707420101e-08, + "completion_length": 548.0, + "delta_ref_entropy_loss": -0.1484375, + "delta_ref_ppl": -0.049560546875, + "entropy_loss": -0.416015625, + "epoch": 0.575, + "grad_norm": 11.674383696969423, + "k1_kl": 0.04931640625, + "k3_kl": 0.05810546875, + "kimi_kl": 0.140625, + "learning_rate": 2.1249999999999998e-07, + "loss": 0.0023, + "ppl": 0.1650390625, + "reward": 0.8878657817840576, + "reward_std": 0.28731581568717957, + "rewards/perpo_ocr_edit_distance_reward": 0.8878657817840576, + "step": 2875, + "temperature": 0.9 + }, + { + "advantages": 1.7029899268550253e-08, + "completion_length": 774.0, + "delta_ref_entropy_loss": 0.03515625, + "delta_ref_ppl": -0.053955078125, + "entropy_loss": -0.040771484375, + "epoch": 0.5752, + "grad_norm": 0.3801198628725433, + "k1_kl": 0.053955078125, + "k3_kl": 0.033447265625, + "kimi_kl": 0.09228515625, + "learning_rate": 2.124e-07, + "loss": 0.0013, + "ppl": 0.012451171875, + "reward": 0.992819607257843, + "reward_std": 0.00095668516587466, + "rewards/perpo_ocr_edit_distance_reward": 0.9928196668624878, + "step": 2876, + "temperature": 0.9 + }, + { + "advantages": -3.7806375985383056e-06, + "completion_length": 722.0, + "delta_ref_entropy_loss": 0.053466796875, + "delta_ref_ppl": -0.1123046875, + "entropy_loss": -0.068359375, + "epoch": 0.5754, + "grad_norm": 0.5859611624962028, + "k1_kl": 0.1123046875, + "k3_kl": 0.076171875, + "kimi_kl": 0.240234375, + "learning_rate": 2.123e-07, + "loss": 0.0031, + "ppl": 0.0260009765625, + "reward": 0.921241283416748, + "reward_std": 0.011232679709792137, + "rewards/perpo_ocr_edit_distance_reward": 0.9212413430213928, + "step": 2877, + "temperature": 0.9 + }, + { + "advantages": 8.514949634275126e-09, + "completion_length": 236.0, + "delta_ref_entropy_loss": 0.0576171875, + "delta_ref_ppl": -0.078125, + "entropy_loss": -0.0517578125, + "epoch": 0.5756, + "grad_norm": 0.7262127423714833, + "k1_kl": 0.07763671875, + "k3_kl": 0.046630859375, + "kimi_kl": 0.1240234375, + "learning_rate": 2.1219999999999998e-07, + "loss": 0.0019, + "ppl": 0.026611328125, + "reward": 0.9882497787475586, + "reward_std": 0.0017030228627845645, + "rewards/perpo_ocr_edit_distance_reward": 0.9882498383522034, + "step": 2878, + "temperature": 0.9 + }, + { + "advantages": -5.245209194981726e-06, + "completion_length": 155.0, + "delta_ref_entropy_loss": 0.06689453125, + "delta_ref_ppl": -0.2177734375, + "entropy_loss": -0.1181640625, + "epoch": 0.5758, + "grad_norm": 1.1290805262093229, + "k1_kl": 0.2177734375, + "k3_kl": 0.1611328125, + "kimi_kl": 0.5859375, + "learning_rate": 2.121e-07, + "loss": 0.0065, + "ppl": 0.049560546875, + "reward": 0.967715859413147, + "reward_std": 0.003156617283821106, + "rewards/perpo_ocr_edit_distance_reward": 0.9677159190177917, + "step": 2879, + "temperature": 0.9 + }, + { + "advantages": -2.1593912606476806e-05, + "completion_length": 939.0, + "delta_ref_entropy_loss": 0.03173828125, + "delta_ref_ppl": -0.053955078125, + "entropy_loss": -0.0556640625, + "epoch": 0.576, + "grad_norm": 0.9249399593363058, + "k1_kl": 0.05419921875, + "k3_kl": 0.03125, + "kimi_kl": 0.09765625, + "learning_rate": 2.12e-07, + "loss": 0.0013, + "ppl": 0.0211181640625, + "reward": 0.9898858666419983, + "reward_std": 0.0006887574563734233, + "rewards/perpo_ocr_edit_distance_reward": 0.9898859262466431, + "step": 2880, + "temperature": 0.9 + }, + { + "advantages": -1.3909169865655713e-05, + "completion_length": 152.0, + "delta_ref_entropy_loss": 0.08544921875, + "delta_ref_ppl": -0.294921875, + "entropy_loss": -0.12890625, + "epoch": 0.5762, + "grad_norm": 1.854399622928641, + "k1_kl": 0.294921875, + "k3_kl": 0.2236328125, + "kimi_kl": 0.828125, + "learning_rate": 2.1189999999999999e-07, + "loss": 0.009, + "ppl": 0.0537109375, + "reward": 0.9265412092208862, + "reward_std": 0.001737966318614781, + "rewards/perpo_ocr_edit_distance_reward": 0.9265412092208862, + "step": 2881, + "temperature": 0.9 + }, + { + "advantages": 5.960464477539062e-07, + "completion_length": 154.0, + "delta_ref_entropy_loss": 0.10400390625, + "delta_ref_ppl": -0.2578125, + "entropy_loss": -0.359375, + "epoch": 0.5764, + "grad_norm": 3.1394662053723006, + "k1_kl": 0.2578125, + "k3_kl": 0.193359375, + "kimi_kl": 0.80859375, + "learning_rate": 2.1179999999999998e-07, + "loss": 0.0077, + "ppl": 0.1591796875, + "reward": 0.86634761095047, + "reward_std": 0.014262797310948372, + "rewards/perpo_ocr_edit_distance_reward": 0.86634761095047, + "step": 2882, + "temperature": 0.9 + }, + { + "advantages": -1.748970680637285e-05, + "completion_length": 187.0, + "delta_ref_entropy_loss": 0.1689453125, + "delta_ref_ppl": -0.25, + "entropy_loss": -0.240234375, + "epoch": 0.5766, + "grad_norm": 2.898117496971569, + "k1_kl": 0.25, + "k3_kl": 0.177734375, + "kimi_kl": 0.59765625, + "learning_rate": 2.117e-07, + "loss": 0.0071, + "ppl": 0.08837890625, + "reward": 0.9123015999794006, + "reward_std": 0.003797606797888875, + "rewards/perpo_ocr_edit_distance_reward": 0.9123016595840454, + "step": 2883, + "temperature": 0.9 + }, + { + "advantages": -1.021793991640152e-07, + "completion_length": 68.0, + "delta_ref_entropy_loss": -0.2197265625, + "delta_ref_ppl": -1.046875, + "entropy_loss": -0.953125, + "epoch": 0.5768, + "grad_norm": 21.184787265219928, + "k1_kl": 1.046875, + "k3_kl": 1.1875, + "kimi_kl": 4.03125, + "learning_rate": 2.116e-07, + "loss": 0.0475, + "ppl": 0.388671875, + "reward": 0.2779502868652344, + "reward_std": 0.09535054862499237, + "rewards/perpo_ocr_edit_distance_reward": 0.27795031666755676, + "step": 2884, + "temperature": 0.9 + }, + { + "advantages": -1.2772424270224292e-05, + "completion_length": 283.0, + "delta_ref_entropy_loss": 0.03857421875, + "delta_ref_ppl": -0.1298828125, + "entropy_loss": -0.1259765625, + "epoch": 0.577, + "grad_norm": 1.4095054835127907, + "k1_kl": 0.1298828125, + "k3_kl": 0.09521484375, + "kimi_kl": 0.2294921875, + "learning_rate": 2.1149999999999998e-07, + "loss": 0.0038, + "ppl": 0.06591796875, + "reward": 0.9528856873512268, + "reward_std": 0.003906527068465948, + "rewards/perpo_ocr_edit_distance_reward": 0.9528858065605164, + "step": 2885, + "temperature": 0.9 + }, + { + "advantages": 1.7029899268550253e-08, + "completion_length": 1021.0, + "delta_ref_entropy_loss": 0.054931640625, + "delta_ref_ppl": -0.0810546875, + "entropy_loss": -0.1708984375, + "epoch": 0.5772, + "grad_norm": 1.4662948747659, + "k1_kl": 0.0810546875, + "k3_kl": 0.054443359375, + "kimi_kl": 0.140625, + "learning_rate": 2.114e-07, + "loss": 0.0022, + "ppl": 0.08740234375, + "reward": 0.9144966006278992, + "reward_std": 0.0026643683668226004, + "rewards/perpo_ocr_edit_distance_reward": 0.9144966006278992, + "step": 2886, + "temperature": 0.9 + }, + { + "advantages": -3.916876778475853e-07, + "completion_length": 356.0, + "delta_ref_entropy_loss": -0.035888671875, + "delta_ref_ppl": -0.0927734375, + "entropy_loss": -0.193359375, + "epoch": 0.5774, + "grad_norm": 2.9372425281729244, + "k1_kl": 0.09228515625, + "k3_kl": 0.08154296875, + "kimi_kl": 0.216796875, + "learning_rate": 2.1129999999999997e-07, + "loss": 0.0033, + "ppl": 0.0810546875, + "reward": 0.6655734777450562, + "reward_std": 0.11472414433956146, + "rewards/perpo_ocr_edit_distance_reward": 0.6655735373497009, + "step": 2887, + "temperature": 0.9 + }, + { + "advantages": -0.00010643686982803047, + "completion_length": 486.0, + "delta_ref_entropy_loss": 0.037353515625, + "delta_ref_ppl": -0.09033203125, + "entropy_loss": -0.05126953125, + "epoch": 0.5776, + "grad_norm": 0.5464492833474467, + "k1_kl": 0.09033203125, + "k3_kl": 0.0595703125, + "kimi_kl": 0.2197265625, + "learning_rate": 2.1119999999999999e-07, + "loss": 0.0025, + "ppl": 0.0177001953125, + "reward": 0.9990125298500061, + "reward_std": 0.0006199062336236238, + "rewards/perpo_ocr_edit_distance_reward": 0.9990125894546509, + "step": 2888, + "temperature": 0.9 + }, + { + "advantages": -1.0728836059570312e-06, + "completion_length": 791.0, + "delta_ref_entropy_loss": 0.01336669921875, + "delta_ref_ppl": -0.08154296875, + "entropy_loss": -0.2578125, + "epoch": 0.5778, + "grad_norm": 1.382215728361598, + "k1_kl": 0.08154296875, + "k3_kl": 0.0625, + "kimi_kl": 0.15234375, + "learning_rate": 2.111e-07, + "loss": 0.0025, + "ppl": 0.12890625, + "reward": 0.7606003284454346, + "reward_std": 0.007794068194925785, + "rewards/perpo_ocr_edit_distance_reward": 0.7606003880500793, + "step": 2889, + "temperature": 0.9 + }, + { + "advantages": -4.938671054333099e-07, + "completion_length": 807.0, + "delta_ref_entropy_loss": 0.006683349609375, + "delta_ref_ppl": -0.060791015625, + "entropy_loss": -0.294921875, + "epoch": 0.578, + "grad_norm": 1.9297465331884314, + "k1_kl": 0.060546875, + "k3_kl": 0.0546875, + "kimi_kl": 0.09716796875, + "learning_rate": 2.1099999999999997e-07, + "loss": 0.0022, + "ppl": 0.154296875, + "reward": 0.914929986000061, + "reward_std": 0.016682595014572144, + "rewards/perpo_ocr_edit_distance_reward": 0.914929986000061, + "step": 2890, + "temperature": 0.9 + }, + { + "advantages": -2.5170191293000244e-05, + "completion_length": 221.0, + "delta_ref_entropy_loss": 0.05224609375, + "delta_ref_ppl": -0.162109375, + "entropy_loss": -0.08251953125, + "epoch": 0.5782, + "grad_norm": 2.1647843650670184, + "k1_kl": 0.162109375, + "k3_kl": 0.1181640625, + "kimi_kl": 0.392578125, + "learning_rate": 2.109e-07, + "loss": 0.0047, + "ppl": 0.038818359375, + "reward": 0.8764944672584534, + "reward_std": 0.0026088994927704334, + "rewards/perpo_ocr_edit_distance_reward": 0.8764945864677429, + "step": 2891, + "temperature": 0.9 + }, + { + "advantages": -2.183233118557837e-05, + "completion_length": 646.0, + "delta_ref_entropy_loss": 0.0185546875, + "delta_ref_ppl": -0.08447265625, + "entropy_loss": -0.11083984375, + "epoch": 0.5784, + "grad_norm": 0.8732918461634992, + "k1_kl": 0.083984375, + "k3_kl": 0.06298828125, + "kimi_kl": 0.1455078125, + "learning_rate": 2.1079999999999998e-07, + "loss": 0.0025, + "ppl": 0.051513671875, + "reward": 0.9623818397521973, + "reward_std": 0.004188581369817257, + "rewards/perpo_ocr_edit_distance_reward": 0.9623819589614868, + "step": 2892, + "temperature": 0.9 + }, + { + "advantages": -1.9035169316339307e-05, + "completion_length": 1358.0, + "delta_ref_entropy_loss": 0.0194091796875, + "delta_ref_ppl": -0.0478515625, + "entropy_loss": -0.07470703125, + "epoch": 0.5786, + "grad_norm": 0.812751764462901, + "k1_kl": 0.047607421875, + "k3_kl": 0.0296630859375, + "kimi_kl": 0.0654296875, + "learning_rate": 2.107e-07, + "loss": 0.0012, + "ppl": 0.0322265625, + "reward": 0.9876008033752441, + "reward_std": 0.0030303725507110357, + "rewards/perpo_ocr_edit_distance_reward": 0.9876008629798889, + "step": 2893, + "temperature": 0.9 + }, + { + "advantages": -9.991441766032949e-05, + "completion_length": 562.0, + "delta_ref_entropy_loss": 0.04345703125, + "delta_ref_ppl": -0.072265625, + "entropy_loss": -0.07177734375, + "epoch": 0.5788, + "grad_norm": 0.7082931955237078, + "k1_kl": 0.072265625, + "k3_kl": 0.046630859375, + "kimi_kl": 0.1298828125, + "learning_rate": 2.106e-07, + "loss": 0.002, + "ppl": 0.02685546875, + "reward": 0.9859268665313721, + "reward_std": 0.0005817743949592113, + "rewards/perpo_ocr_edit_distance_reward": 0.9859269261360168, + "step": 2894, + "temperature": 0.9 + }, + { + "advantages": -4.002026230409683e-07, + "completion_length": 1619.0, + "delta_ref_entropy_loss": -0.00732421875, + "delta_ref_ppl": -0.0306396484375, + "entropy_loss": -0.09716796875, + "epoch": 0.579, + "grad_norm": 2.147775717902986, + "k1_kl": 0.0306396484375, + "k3_kl": 0.023681640625, + "kimi_kl": 0.0556640625, + "learning_rate": 2.1049999999999999e-07, + "loss": 0.001, + "ppl": 0.040283203125, + "reward": 0.928967297077179, + "reward_std": 0.04211786389350891, + "rewards/perpo_ocr_edit_distance_reward": 0.928967297077179, + "step": 2895, + "temperature": 0.9 + }, + { + "advantages": -0.00012362003326416016, + "completion_length": 589.0, + "delta_ref_entropy_loss": 0.035888671875, + "delta_ref_ppl": -0.08203125, + "entropy_loss": -0.049072265625, + "epoch": 0.5792, + "grad_norm": 0.35463354835052996, + "k1_kl": 0.08203125, + "k3_kl": 0.0537109375, + "kimi_kl": 0.1953125, + "learning_rate": 2.104e-07, + "loss": 0.0023, + "ppl": 0.01556396484375, + "reward": 0.6614514589309692, + "reward_std": 0.000382114143576473, + "rewards/perpo_ocr_edit_distance_reward": 0.661451518535614, + "step": 2896, + "temperature": 0.9 + }, + { + "advantages": -5.790165573671402e-07, + "completion_length": 436.0, + "delta_ref_entropy_loss": 0.034912109375, + "delta_ref_ppl": -0.060302734375, + "entropy_loss": -0.07763671875, + "epoch": 0.5794, + "grad_norm": 1.0384366814903006, + "k1_kl": 0.060302734375, + "k3_kl": 0.04296875, + "kimi_kl": 0.12890625, + "learning_rate": 2.1029999999999997e-07, + "loss": 0.0017, + "ppl": 0.02685546875, + "reward": 0.8651179075241089, + "reward_std": 0.11776620149612427, + "rewards/perpo_ocr_edit_distance_reward": 0.8651180267333984, + "step": 2897, + "temperature": 0.9 + }, + { + "advantages": -2.7673587510435027e-07, + "completion_length": 602.0, + "delta_ref_entropy_loss": -0.1572265625, + "delta_ref_ppl": -0.1318359375, + "entropy_loss": -1.1640625, + "epoch": 0.5796, + "grad_norm": 5.076445950324685, + "k1_kl": 0.1298828125, + "k3_kl": 0.11962890625, + "kimi_kl": 0.224609375, + "learning_rate": 2.102e-07, + "loss": 0.0048, + "ppl": 0.65625, + "reward": 0.3235325217247009, + "reward_std": 0.030774911865592003, + "rewards/perpo_ocr_edit_distance_reward": 0.3235325515270233, + "step": 2898, + "temperature": 0.9 + }, + { + "advantages": -5.7901655964087695e-05, + "completion_length": 950.0, + "delta_ref_entropy_loss": 0.021240234375, + "delta_ref_ppl": -0.0390625, + "entropy_loss": -0.048095703125, + "epoch": 0.5798, + "grad_norm": 0.7201817335313624, + "k1_kl": 0.0390625, + "k3_kl": 0.0244140625, + "kimi_kl": 0.06298828125, + "learning_rate": 2.101e-07, + "loss": 0.001, + "ppl": 0.021728515625, + "reward": 0.9977491497993469, + "reward_std": 0.000782428658567369, + "rewards/perpo_ocr_edit_distance_reward": 0.9977492094039917, + "step": 2899, + "temperature": 0.9 + }, + { + "advantages": -5.815710665046936e-06, + "completion_length": 670.0, + "delta_ref_entropy_loss": 0.033203125, + "delta_ref_ppl": -0.1025390625, + "entropy_loss": -0.51171875, + "epoch": 0.58, + "grad_norm": 2.302137531781873, + "k1_kl": 0.10205078125, + "k3_kl": 0.076171875, + "kimi_kl": 0.142578125, + "learning_rate": 2.0999999999999997e-07, + "loss": 0.003, + "ppl": 0.26953125, + "reward": 0.7074395418167114, + "reward_std": 0.011612385511398315, + "rewards/perpo_ocr_edit_distance_reward": 0.7074396014213562, + "step": 2900, + "temperature": 0.9 + }, + { + "advantages": -6.226131517905742e-05, + "completion_length": 552.0, + "delta_ref_entropy_loss": 0.047119140625, + "delta_ref_ppl": -0.05322265625, + "entropy_loss": -0.037353515625, + "epoch": 0.5802, + "grad_norm": 0.3968242084689733, + "k1_kl": 0.053466796875, + "k3_kl": 0.038330078125, + "kimi_kl": 0.10986328125, + "learning_rate": 2.099e-07, + "loss": 0.0016, + "ppl": 0.013916015625, + "reward": 0.9780259132385254, + "reward_std": 0.0007204711437225342, + "rewards/perpo_ocr_edit_distance_reward": 0.9780259728431702, + "step": 2901, + "temperature": 0.9 + }, + { + "advantages": -9.310246241511777e-05, + "completion_length": 692.0, + "delta_ref_entropy_loss": 0.0240478515625, + "delta_ref_ppl": -0.057373046875, + "entropy_loss": -0.025390625, + "epoch": 0.5804, + "grad_norm": 0.13850860173153545, + "k1_kl": 0.0576171875, + "k3_kl": 0.04345703125, + "kimi_kl": 0.1796875, + "learning_rate": 2.0979999999999999e-07, + "loss": 0.0018, + "ppl": 0.006805419921875, + "reward": 0.9824867844581604, + "reward_std": 0.0002657795266713947, + "rewards/perpo_ocr_edit_distance_reward": 0.9824868440628052, + "step": 2902, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 104.0, + "delta_ref_entropy_loss": 0.0169677734375, + "delta_ref_ppl": -0.361328125, + "entropy_loss": -0.109375, + "epoch": 0.5806, + "grad_norm": 1.4533106628991281, + "k1_kl": 0.36328125, + "k3_kl": 0.3046875, + "kimi_kl": 1.5, + "learning_rate": 2.0969999999999998e-07, + "loss": 0.0122, + "ppl": 0.0419921875, + "reward": 0.9906976222991943, + "reward_std": 0.003288902109488845, + "rewards/perpo_ocr_edit_distance_reward": 0.9906976819038391, + "step": 2903, + "temperature": 0.9 + }, + { + "advantages": -6.130763949840912e-07, + "completion_length": 480.0, + "delta_ref_entropy_loss": 0.0272216796875, + "delta_ref_ppl": -0.0810546875, + "entropy_loss": -0.061767578125, + "epoch": 0.5808, + "grad_norm": 0.8077391695373183, + "k1_kl": 0.0810546875, + "k3_kl": 0.05224609375, + "kimi_kl": 0.134765625, + "learning_rate": 2.096e-07, + "loss": 0.0021, + "ppl": 0.0235595703125, + "reward": 0.9588239192962646, + "reward_std": 0.027648866176605225, + "rewards/perpo_ocr_edit_distance_reward": 0.9588239192962646, + "step": 2904, + "temperature": 0.9 + }, + { + "advantages": -4.529953366727568e-05, + "completion_length": 461.0, + "delta_ref_entropy_loss": 0.0284423828125, + "delta_ref_ppl": -0.0517578125, + "entropy_loss": -0.0380859375, + "epoch": 0.581, + "grad_norm": 0.3471542038676673, + "k1_kl": 0.0517578125, + "k3_kl": 0.03564453125, + "kimi_kl": 0.111328125, + "learning_rate": 2.095e-07, + "loss": 0.0015, + "ppl": 0.011474609375, + "reward": 0.9834626317024231, + "reward_std": 0.000651758920866996, + "rewards/perpo_ocr_edit_distance_reward": 0.9834626913070679, + "step": 2905, + "temperature": 0.9 + }, + { + "advantages": -3.563506425052765e-06, + "completion_length": 357.0, + "delta_ref_entropy_loss": 0.07763671875, + "delta_ref_ppl": -0.1533203125, + "entropy_loss": -0.341796875, + "epoch": 0.5812, + "grad_norm": 2.2730696948940805, + "k1_kl": 0.154296875, + "k3_kl": 0.1064453125, + "kimi_kl": 0.2294921875, + "learning_rate": 2.0939999999999998e-07, + "loss": 0.0043, + "ppl": 0.173828125, + "reward": 0.6515172123908997, + "reward_std": 0.004675648175179958, + "rewards/perpo_ocr_edit_distance_reward": 0.6515172719955444, + "step": 2906, + "temperature": 0.9 + }, + { + "advantages": -4.792213803739287e-05, + "completion_length": 537.0, + "delta_ref_entropy_loss": 0.04736328125, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.045166015625, + "epoch": 0.5814, + "grad_norm": 0.7019879138808948, + "k1_kl": 0.06591796875, + "k3_kl": 0.038330078125, + "kimi_kl": 0.10595703125, + "learning_rate": 2.093e-07, + "loss": 0.0016, + "ppl": 0.0174560546875, + "reward": 0.9897959232330322, + "reward_std": 0.0009669676655903459, + "rewards/perpo_ocr_edit_distance_reward": 0.989795982837677, + "step": 2907, + "temperature": 0.9 + }, + { + "advantages": 3.4059798537100505e-08, + "completion_length": 342.0, + "delta_ref_entropy_loss": 0.05615234375, + "delta_ref_ppl": -0.07861328125, + "entropy_loss": -0.07861328125, + "epoch": 0.5816, + "grad_norm": 0.6177776179549969, + "k1_kl": 0.078125, + "k3_kl": 0.05517578125, + "kimi_kl": 0.171875, + "learning_rate": 2.092e-07, + "loss": 0.0022, + "ppl": 0.031494140625, + "reward": 0.9892094731330872, + "reward_std": 0.0011044297134503722, + "rewards/perpo_ocr_edit_distance_reward": 0.9892094731330872, + "step": 2908, + "temperature": 0.9 + }, + { + "advantages": -4.942928353557363e-05, + "completion_length": 161.0, + "delta_ref_entropy_loss": 0.01544189453125, + "delta_ref_ppl": -0.1689453125, + "entropy_loss": -0.0703125, + "epoch": 0.5818, + "grad_norm": 0.9647365464565294, + "k1_kl": 0.1689453125, + "k3_kl": 0.13671875, + "kimi_kl": 0.7890625, + "learning_rate": 2.0909999999999999e-07, + "loss": 0.0055, + "ppl": 0.029541015625, + "reward": 0.9984498023986816, + "reward_std": 0.00145062361843884, + "rewards/perpo_ocr_edit_distance_reward": 0.9984499216079712, + "step": 2909, + "temperature": 0.9 + }, + { + "advantages": -1.0711806680774316e-05, + "completion_length": 892.0, + "delta_ref_entropy_loss": 0.03759765625, + "delta_ref_ppl": -0.044921875, + "entropy_loss": -0.06982421875, + "epoch": 0.582, + "grad_norm": 1.0840291440411138, + "k1_kl": 0.044921875, + "k3_kl": 0.0247802734375, + "kimi_kl": 0.052734375, + "learning_rate": 2.0899999999999998e-07, + "loss": 0.001, + "ppl": 0.0341796875, + "reward": 0.9787095189094543, + "reward_std": 0.0014908155426383018, + "rewards/perpo_ocr_edit_distance_reward": 0.9787095785140991, + "step": 2910, + "temperature": 0.9 + }, + { + "advantages": -2.7247838261246216e-06, + "completion_length": 796.0, + "delta_ref_entropy_loss": 0.036376953125, + "delta_ref_ppl": -0.061279296875, + "entropy_loss": -0.0634765625, + "epoch": 0.5822, + "grad_norm": 1.0585683619928965, + "k1_kl": 0.06103515625, + "k3_kl": 0.03759765625, + "kimi_kl": 0.1015625, + "learning_rate": 2.089e-07, + "loss": 0.0015, + "ppl": 0.026123046875, + "reward": 0.9668880701065063, + "reward_std": 0.006108838599175215, + "rewards/perpo_ocr_edit_distance_reward": 0.9668881893157959, + "step": 2911, + "temperature": 0.9 + }, + { + "advantages": -1.5156609833866241e-06, + "completion_length": 25.0, + "delta_ref_entropy_loss": -0.1298828125, + "delta_ref_ppl": -1.484375, + "entropy_loss": -0.62890625, + "epoch": 0.5824, + "grad_norm": 11.515419420830305, + "k1_kl": 1.484375, + "k3_kl": 1.21875, + "kimi_kl": 5.25, + "learning_rate": 2.0880000000000002e-07, + "loss": 0.0488, + "ppl": 0.25, + "reward": 0.39115649461746216, + "reward_std": 0.016968967393040657, + "rewards/perpo_ocr_edit_distance_reward": 0.39115649461746216, + "step": 2912, + "temperature": 0.9 + }, + { + "advantages": -9.953124390449375e-05, + "completion_length": 579.0, + "delta_ref_entropy_loss": 0.037353515625, + "delta_ref_ppl": -0.0634765625, + "entropy_loss": -0.05859375, + "epoch": 0.5826, + "grad_norm": 1.316939784246134, + "k1_kl": 0.06298828125, + "k3_kl": 0.037109375, + "kimi_kl": 0.09326171875, + "learning_rate": 2.0869999999999998e-07, + "loss": 0.0016, + "ppl": 0.0262451171875, + "reward": 0.9962257742881775, + "reward_std": 0.0004988480359315872, + "rewards/perpo_ocr_edit_distance_reward": 0.996225893497467, + "step": 2913, + "temperature": 0.9 + }, + { + "advantages": -1.1120524504804052e-05, + "completion_length": 94.0, + "delta_ref_entropy_loss": 0.043212890625, + "delta_ref_ppl": -0.375, + "entropy_loss": -0.140625, + "epoch": 0.5828, + "grad_norm": 2.2338802021249875, + "k1_kl": 0.375, + "k3_kl": 0.3125, + "kimi_kl": 1.3984375, + "learning_rate": 2.086e-07, + "loss": 0.0125, + "ppl": 0.053466796875, + "reward": 0.9710884094238281, + "reward_std": 0.003730837954208255, + "rewards/perpo_ocr_edit_distance_reward": 0.9710884690284729, + "step": 2914, + "temperature": 0.9 + }, + { + "advantages": -2.8099334485887084e-06, + "completion_length": 1601.0, + "delta_ref_entropy_loss": 0.007659912109375, + "delta_ref_ppl": -0.0177001953125, + "entropy_loss": -0.042724609375, + "epoch": 0.583, + "grad_norm": 1.1660244637904005, + "k1_kl": 0.017822265625, + "k3_kl": 0.0184326171875, + "kimi_kl": 0.037109375, + "learning_rate": 2.085e-07, + "loss": 0.0007, + "ppl": 0.0208740234375, + "reward": 0.973834216594696, + "reward_std": 0.0029343802016228437, + "rewards/perpo_ocr_edit_distance_reward": 0.973834216594696, + "step": 2915, + "temperature": 0.9 + }, + { + "advantages": -6.864752504043281e-05, + "completion_length": 807.0, + "delta_ref_entropy_loss": 0.025634765625, + "delta_ref_ppl": -0.0308837890625, + "entropy_loss": -0.03369140625, + "epoch": 0.5832, + "grad_norm": 0.2944772584451753, + "k1_kl": 0.0308837890625, + "k3_kl": 0.0169677734375, + "kimi_kl": 0.042724609375, + "learning_rate": 2.0839999999999999e-07, + "loss": 0.0007, + "ppl": 0.01092529296875, + "reward": 0.9881284832954407, + "reward_std": 0.0005201110034249723, + "rewards/perpo_ocr_edit_distance_reward": 0.9881285429000854, + "step": 2916, + "temperature": 0.9 + }, + { + "advantages": -9.038618736667559e-05, + "completion_length": 773.0, + "delta_ref_entropy_loss": 0.020751953125, + "delta_ref_ppl": -0.0498046875, + "entropy_loss": -0.052490234375, + "epoch": 0.5834, + "grad_norm": 0.6882477645320655, + "k1_kl": 0.0498046875, + "k3_kl": 0.029052734375, + "kimi_kl": 0.07666015625, + "learning_rate": 2.083e-07, + "loss": 0.0013, + "ppl": 0.02099609375, + "reward": 0.9029383659362793, + "reward_std": 0.0008420062367804348, + "rewards/perpo_ocr_edit_distance_reward": 0.9029384851455688, + "step": 2917, + "temperature": 0.9 + }, + { + "advantages": -3.6435470974538475e-05, + "completion_length": 347.0, + "delta_ref_entropy_loss": 0.0311279296875, + "delta_ref_ppl": -0.046142578125, + "entropy_loss": -0.03515625, + "epoch": 0.5836, + "grad_norm": 0.3653395168926123, + "k1_kl": 0.04638671875, + "k3_kl": 0.0264892578125, + "kimi_kl": 0.076171875, + "learning_rate": 2.082e-07, + "loss": 0.0011, + "ppl": 0.00946044921875, + "reward": 0.9965914487838745, + "reward_std": 0.0006012258236296475, + "rewards/perpo_ocr_edit_distance_reward": 0.9965914487838745, + "step": 2918, + "temperature": 0.9 + }, + { + "advantages": -0.00010567904246272519, + "completion_length": 1015.0, + "delta_ref_entropy_loss": 0.033447265625, + "delta_ref_ppl": -0.054931640625, + "entropy_loss": -0.0703125, + "epoch": 0.5838, + "grad_norm": 0.8824369352216443, + "k1_kl": 0.054931640625, + "k3_kl": 0.036376953125, + "kimi_kl": 0.0986328125, + "learning_rate": 2.081e-07, + "loss": 0.0016, + "ppl": 0.031982421875, + "reward": 0.991950273513794, + "reward_std": 0.0004640463157556951, + "rewards/perpo_ocr_edit_distance_reward": 0.9919503331184387, + "step": 2919, + "temperature": 0.9 + }, + { + "advantages": -2.282006425957661e-05, + "completion_length": 434.0, + "delta_ref_entropy_loss": 0.055419921875, + "delta_ref_ppl": -0.1025390625, + "entropy_loss": -0.08154296875, + "epoch": 0.584, + "grad_norm": 0.4907989881515679, + "k1_kl": 0.1025390625, + "k3_kl": 0.0712890625, + "kimi_kl": 0.21875, + "learning_rate": 2.0799999999999998e-07, + "loss": 0.0029, + "ppl": 0.03466796875, + "reward": 0.9147583246231079, + "reward_std": 0.0006468477658927441, + "rewards/perpo_ocr_edit_distance_reward": 0.9147583842277527, + "step": 2920, + "temperature": 0.9 + }, + { + "advantages": -1.1648450708889868e-05, + "completion_length": 417.0, + "delta_ref_entropy_loss": 0.0235595703125, + "delta_ref_ppl": -0.0615234375, + "entropy_loss": -0.029296875, + "epoch": 0.5842, + "grad_norm": 0.3211764049885739, + "k1_kl": 0.0615234375, + "k3_kl": 0.043701171875, + "kimi_kl": 0.1650390625, + "learning_rate": 2.079e-07, + "loss": 0.0017, + "ppl": 0.01025390625, + "reward": 0.9955445528030396, + "reward_std": 0.0006311890901997685, + "rewards/perpo_ocr_edit_distance_reward": 0.9955446124076843, + "step": 2921, + "temperature": 0.9 + }, + { + "advantages": 9.843281986832153e-06, + "completion_length": 607.0, + "delta_ref_entropy_loss": 0.033447265625, + "delta_ref_ppl": -0.050048828125, + "entropy_loss": -0.060546875, + "epoch": 0.5844, + "grad_norm": 0.6055472498141062, + "k1_kl": 0.050048828125, + "k3_kl": 0.0289306640625, + "kimi_kl": 0.05126953125, + "learning_rate": 2.078e-07, + "loss": 0.0011, + "ppl": 0.025390625, + "reward": 0.9262782335281372, + "reward_std": 0.000766403041779995, + "rewards/perpo_ocr_edit_distance_reward": 0.9262781739234924, + "step": 2922, + "temperature": 0.9 + }, + { + "advantages": -1.4867102436255664e-05, + "completion_length": 717.0, + "delta_ref_entropy_loss": 0.09619140625, + "delta_ref_ppl": -0.09521484375, + "entropy_loss": -0.1787109375, + "epoch": 0.5846, + "grad_norm": 1.1174717661642006, + "k1_kl": 0.0947265625, + "k3_kl": 0.052978515625, + "kimi_kl": 0.142578125, + "learning_rate": 2.077e-07, + "loss": 0.0021, + "ppl": 0.08349609375, + "reward": 0.9580875039100647, + "reward_std": 0.0027641647029668093, + "rewards/perpo_ocr_edit_distance_reward": 0.9580876231193542, + "step": 2923, + "temperature": 0.9 + }, + { + "advantages": -3.3310483559034765e-05, + "completion_length": 237.0, + "delta_ref_entropy_loss": 0.040283203125, + "delta_ref_ppl": -0.1708984375, + "entropy_loss": -0.11474609375, + "epoch": 0.5848, + "grad_norm": 0.8784746426499301, + "k1_kl": 0.1708984375, + "k3_kl": 0.1259765625, + "kimi_kl": 0.423828125, + "learning_rate": 2.076e-07, + "loss": 0.0051, + "ppl": 0.048583984375, + "reward": 0.9818210601806641, + "reward_std": 0.002197953639551997, + "rewards/perpo_ocr_edit_distance_reward": 0.9818211793899536, + "step": 2924, + "temperature": 0.9 + }, + { + "advantages": 6.948198915779358e-06, + "completion_length": 619.0, + "delta_ref_entropy_loss": 0.048583984375, + "delta_ref_ppl": -0.07568359375, + "entropy_loss": -0.0712890625, + "epoch": 0.585, + "grad_norm": 0.684580409024159, + "k1_kl": 0.07568359375, + "k3_kl": 0.049072265625, + "kimi_kl": 0.1455078125, + "learning_rate": 2.0749999999999997e-07, + "loss": 0.002, + "ppl": 0.032958984375, + "reward": 0.986580491065979, + "reward_std": 0.0023487997241318226, + "rewards/perpo_ocr_edit_distance_reward": 0.986580491065979, + "step": 2925, + "temperature": 0.9 + }, + { + "advantages": -1.7029899268550253e-08, + "completion_length": 197.0, + "delta_ref_entropy_loss": 0.059814453125, + "delta_ref_ppl": -0.1494140625, + "entropy_loss": -0.11474609375, + "epoch": 0.5852, + "grad_norm": 2.276238466115497, + "k1_kl": 0.1494140625, + "k3_kl": 0.107421875, + "kimi_kl": 0.361328125, + "learning_rate": 2.074e-07, + "loss": 0.0043, + "ppl": 0.05029296875, + "reward": 0.9389579892158508, + "reward_std": 0.005500722676515579, + "rewards/perpo_ocr_edit_distance_reward": 0.938957929611206, + "step": 2926, + "temperature": 0.9 + }, + { + "advantages": -5.807195520901587e-06, + "completion_length": 582.0, + "delta_ref_entropy_loss": 0.062255859375, + "delta_ref_ppl": -0.09814453125, + "entropy_loss": -0.06787109375, + "epoch": 0.5854, + "grad_norm": 0.4858853498876169, + "k1_kl": 0.09814453125, + "k3_kl": 0.06201171875, + "kimi_kl": 0.2080078125, + "learning_rate": 2.073e-07, + "loss": 0.0025, + "ppl": 0.026123046875, + "reward": 0.9854080677032471, + "reward_std": 0.0013650791952386498, + "rewards/perpo_ocr_edit_distance_reward": 0.9854080080986023, + "step": 2927, + "temperature": 0.9 + }, + { + "advantages": -5.132811565999873e-05, + "completion_length": 1380.0, + "delta_ref_entropy_loss": 0.031494140625, + "delta_ref_ppl": -0.046875, + "entropy_loss": -0.08642578125, + "epoch": 0.5856, + "grad_norm": 5.051665978732241, + "k1_kl": 0.046875, + "k3_kl": 0.0419921875, + "kimi_kl": 0.06298828125, + "learning_rate": 2.0719999999999998e-07, + "loss": 0.0017, + "ppl": 0.044189453125, + "reward": 0.9898175597190857, + "reward_std": 0.0007295502000488341, + "rewards/perpo_ocr_edit_distance_reward": 0.9898176193237305, + "step": 2928, + "temperature": 0.9 + }, + { + "advantages": -1.088210592570249e-05, + "completion_length": 870.0, + "delta_ref_entropy_loss": 0.0087890625, + "delta_ref_ppl": -0.059814453125, + "entropy_loss": -0.1201171875, + "epoch": 0.5858, + "grad_norm": 2.0186504994611245, + "k1_kl": 0.059814453125, + "k3_kl": 0.0419921875, + "kimi_kl": 0.130859375, + "learning_rate": 2.071e-07, + "loss": 0.0017, + "ppl": 0.0537109375, + "reward": 0.948028564453125, + "reward_std": 0.005370710976421833, + "rewards/perpo_ocr_edit_distance_reward": 0.9480286240577698, + "step": 2929, + "temperature": 0.9 + }, + { + "advantages": 5.594321919488721e-06, + "completion_length": 218.0, + "delta_ref_entropy_loss": 0.0751953125, + "delta_ref_ppl": -0.19921875, + "entropy_loss": -0.142578125, + "epoch": 0.586, + "grad_norm": 2.4068296270613554, + "k1_kl": 0.19921875, + "k3_kl": 0.142578125, + "kimi_kl": 0.478515625, + "learning_rate": 2.07e-07, + "loss": 0.0057, + "ppl": 0.056396484375, + "reward": 0.9263838529586792, + "reward_std": 0.002948958659544587, + "rewards/perpo_ocr_edit_distance_reward": 0.9263838529586792, + "step": 2930, + "temperature": 0.9 + }, + { + "advantages": -1.2142318155383691e-05, + "completion_length": 444.0, + "delta_ref_entropy_loss": 0.0267333984375, + "delta_ref_ppl": -0.0537109375, + "entropy_loss": -0.033447265625, + "epoch": 0.5862, + "grad_norm": 0.5399354951791258, + "k1_kl": 0.0537109375, + "k3_kl": 0.0390625, + "kimi_kl": 0.11767578125, + "learning_rate": 2.0689999999999998e-07, + "loss": 0.0016, + "ppl": 0.01202392578125, + "reward": 0.9950738549232483, + "reward_std": 0.0012999456375837326, + "rewards/perpo_ocr_edit_distance_reward": 0.9950739145278931, + "step": 2931, + "temperature": 0.9 + }, + { + "advantages": -1.5812262063263915e-05, + "completion_length": 526.0, + "delta_ref_entropy_loss": 0.0296630859375, + "delta_ref_ppl": -0.05712890625, + "entropy_loss": -0.03466796875, + "epoch": 0.5864, + "grad_norm": 0.3598127666140361, + "k1_kl": 0.05712890625, + "k3_kl": 0.04296875, + "kimi_kl": 0.130859375, + "learning_rate": 2.068e-07, + "loss": 0.0017, + "ppl": 0.01470947265625, + "reward": 0.9962873458862305, + "reward_std": 0.0009782577399164438, + "rewards/perpo_ocr_edit_distance_reward": 0.9962874054908752, + "step": 2932, + "temperature": 0.9 + }, + { + "advantages": -1.8715858459472656e-05, + "completion_length": 885.0, + "delta_ref_entropy_loss": 0.052978515625, + "delta_ref_ppl": -0.048583984375, + "entropy_loss": -0.1533203125, + "epoch": 0.5866, + "grad_norm": 1.9488830607332508, + "k1_kl": 0.048583984375, + "k3_kl": 0.03125, + "kimi_kl": 0.0458984375, + "learning_rate": 2.067e-07, + "loss": 0.0013, + "ppl": 0.0791015625, + "reward": 0.9315990209579468, + "reward_std": 0.002631837036460638, + "rewards/perpo_ocr_edit_distance_reward": 0.9315991401672363, + "step": 2933, + "temperature": 0.9 + }, + { + "advantages": -5.875315309822327e-07, + "completion_length": 85.0, + "delta_ref_entropy_loss": -0.11083984375, + "delta_ref_ppl": -0.373046875, + "entropy_loss": -0.4140625, + "epoch": 0.5868, + "grad_norm": 5.587865642769471, + "k1_kl": 0.37109375, + "k3_kl": 0.310546875, + "kimi_kl": 1.078125, + "learning_rate": 2.0659999999999998e-07, + "loss": 0.0124, + "ppl": 0.1767578125, + "reward": 0.51006680727005, + "reward_std": 0.0871165543794632, + "rewards/perpo_ocr_edit_distance_reward": 0.5100668668746948, + "step": 2934, + "temperature": 0.9 + }, + { + "advantages": -3.5635064705275e-05, + "completion_length": 510.0, + "delta_ref_entropy_loss": 0.050048828125, + "delta_ref_ppl": -0.07958984375, + "entropy_loss": -0.08056640625, + "epoch": 0.587, + "grad_norm": 0.4466043282367405, + "k1_kl": 0.0791015625, + "k3_kl": 0.047119140625, + "kimi_kl": 0.09912109375, + "learning_rate": 2.0649999999999998e-07, + "loss": 0.0019, + "ppl": 0.03564453125, + "reward": 0.9783987402915955, + "reward_std": 0.0015731985913589597, + "rewards/perpo_ocr_edit_distance_reward": 0.9783987998962402, + "step": 2935, + "temperature": 0.9 + }, + { + "advantages": -5.100454654893838e-05, + "completion_length": 584.0, + "delta_ref_entropy_loss": 0.04345703125, + "delta_ref_ppl": -0.056884765625, + "entropy_loss": -0.06005859375, + "epoch": 0.5872, + "grad_norm": 1.1438807239873712, + "k1_kl": 0.05712890625, + "k3_kl": 0.03564453125, + "kimi_kl": 0.1103515625, + "learning_rate": 2.064e-07, + "loss": 0.0015, + "ppl": 0.0301513671875, + "reward": 0.99532550573349, + "reward_std": 0.0010686644818633795, + "rewards/perpo_ocr_edit_distance_reward": 0.9953255653381348, + "step": 2936, + "temperature": 0.9 + }, + { + "advantages": -2.838032742147334e-05, + "completion_length": 241.0, + "delta_ref_entropy_loss": 0.020751953125, + "delta_ref_ppl": -0.1103515625, + "entropy_loss": -0.080078125, + "epoch": 0.5874, + "grad_norm": 1.456082151319905, + "k1_kl": 0.1103515625, + "k3_kl": 0.08154296875, + "kimi_kl": 0.28125, + "learning_rate": 2.0630000000000001e-07, + "loss": 0.0033, + "ppl": 0.0289306640625, + "reward": 0.9703647494316101, + "reward_std": 0.002602694556117058, + "rewards/perpo_ocr_edit_distance_reward": 0.9703648090362549, + "step": 2937, + "temperature": 0.9 + }, + { + "advantages": 1.0098729944729712e-05, + "completion_length": 1568.0, + "delta_ref_entropy_loss": 0.007659912109375, + "delta_ref_ppl": -0.033447265625, + "entropy_loss": -0.037841796875, + "epoch": 0.5876, + "grad_norm": 0.7010527311124788, + "k1_kl": 0.033447265625, + "k3_kl": 0.0218505859375, + "kimi_kl": 0.05712890625, + "learning_rate": 2.0619999999999998e-07, + "loss": 0.0009, + "ppl": 0.01611328125, + "reward": 0.9971016645431519, + "reward_std": 0.0015842622378841043, + "rewards/perpo_ocr_edit_distance_reward": 0.9971016645431519, + "step": 2938, + "temperature": 0.9 + }, + { + "advantages": 1.549720877846994e-06, + "completion_length": 1094.0, + "delta_ref_entropy_loss": 0.007476806640625, + "delta_ref_ppl": -0.031982421875, + "entropy_loss": -0.03857421875, + "epoch": 0.5878, + "grad_norm": 0.6576842276336636, + "k1_kl": 0.031982421875, + "k3_kl": 0.02685546875, + "kimi_kl": 0.068359375, + "learning_rate": 2.061e-07, + "loss": 0.0011, + "ppl": 0.020263671875, + "reward": 0.9905255436897278, + "reward_std": 0.010806618258357048, + "rewards/perpo_ocr_edit_distance_reward": 0.9905255436897278, + "step": 2939, + "temperature": 0.9 + }, + { + "advantages": -6.846019459771924e-06, + "completion_length": 751.0, + "delta_ref_entropy_loss": 0.054443359375, + "delta_ref_ppl": -0.0810546875, + "entropy_loss": -0.1533203125, + "epoch": 0.588, + "grad_norm": 1.6233752212885206, + "k1_kl": 0.0810546875, + "k3_kl": 0.052490234375, + "kimi_kl": 0.1572265625, + "learning_rate": 2.06e-07, + "loss": 0.0021, + "ppl": 0.07177734375, + "reward": 0.9517901539802551, + "reward_std": 0.0011446427088230848, + "rewards/perpo_ocr_edit_distance_reward": 0.9517901539802551, + "step": 2940, + "temperature": 0.9 + }, + { + "advantages": -1.1954989531659521e-05, + "completion_length": 732.0, + "delta_ref_entropy_loss": 0.0252685546875, + "delta_ref_ppl": -0.061767578125, + "entropy_loss": -0.04736328125, + "epoch": 0.5882, + "grad_norm": 1.3478288308704867, + "k1_kl": 0.06201171875, + "k3_kl": 0.038818359375, + "kimi_kl": 0.09716796875, + "learning_rate": 2.0589999999999998e-07, + "loss": 0.0016, + "ppl": 0.0205078125, + "reward": 0.9630420207977295, + "reward_std": 0.0013237816747277975, + "rewards/perpo_ocr_edit_distance_reward": 0.9630420804023743, + "step": 2941, + "temperature": 0.9 + }, + { + "advantages": -1.2866088582086377e-05, + "completion_length": 461.0, + "delta_ref_entropy_loss": 0.036376953125, + "delta_ref_ppl": -0.060546875, + "entropy_loss": -0.12060546875, + "epoch": 0.5884, + "grad_norm": 8.942395516348952, + "k1_kl": 0.060546875, + "k3_kl": 0.03759765625, + "kimi_kl": 0.0810546875, + "learning_rate": 2.058e-07, + "loss": 0.0015, + "ppl": 0.0517578125, + "reward": 0.9711974859237671, + "reward_std": 0.00851286482065916, + "rewards/perpo_ocr_edit_distance_reward": 0.9711976051330566, + "step": 2942, + "temperature": 0.9 + }, + { + "advantages": -3.773825665120967e-05, + "completion_length": 496.0, + "delta_ref_entropy_loss": 0.0478515625, + "delta_ref_ppl": -0.0830078125, + "entropy_loss": -0.078125, + "epoch": 0.5886, + "grad_norm": 0.5066197959754738, + "k1_kl": 0.08349609375, + "k3_kl": 0.0576171875, + "kimi_kl": 0.185546875, + "learning_rate": 2.057e-07, + "loss": 0.0023, + "ppl": 0.02490234375, + "reward": 0.97545325756073, + "reward_std": 0.0014790808781981468, + "rewards/perpo_ocr_edit_distance_reward": 0.9754533767700195, + "step": 2943, + "temperature": 0.9 + }, + { + "advantages": -3.595011730794795e-05, + "completion_length": 550.0, + "delta_ref_entropy_loss": 0.0341796875, + "delta_ref_ppl": -0.06640625, + "entropy_loss": -0.041015625, + "epoch": 0.5888, + "grad_norm": 0.8133287622868273, + "k1_kl": 0.06640625, + "k3_kl": 0.043212890625, + "kimi_kl": 0.11767578125, + "learning_rate": 2.056e-07, + "loss": 0.0018, + "ppl": 0.0164794921875, + "reward": 0.9926605224609375, + "reward_std": 0.0008474916103295982, + "rewards/perpo_ocr_edit_distance_reward": 0.9926605820655823, + "step": 2944, + "temperature": 0.9 + }, + { + "advantages": -2.7852400307892822e-05, + "completion_length": 338.0, + "delta_ref_entropy_loss": 0.0107421875, + "delta_ref_ppl": -0.055419921875, + "entropy_loss": -0.033447265625, + "epoch": 0.589, + "grad_norm": 0.31319567625084194, + "k1_kl": 0.055419921875, + "k3_kl": 0.03857421875, + "kimi_kl": 0.126953125, + "learning_rate": 2.0549999999999998e-07, + "loss": 0.0016, + "ppl": 0.00958251953125, + "reward": 0.9788635969161987, + "reward_std": 0.00020581981516443193, + "rewards/perpo_ocr_edit_distance_reward": 0.9788635969161987, + "step": 2945, + "temperature": 0.9 + }, + { + "advantages": -0.00011600767174968496, + "completion_length": 814.0, + "delta_ref_entropy_loss": 0.0279541015625, + "delta_ref_ppl": -0.033447265625, + "entropy_loss": -0.0308837890625, + "epoch": 0.5892, + "grad_norm": 0.40890858528220503, + "k1_kl": 0.033447265625, + "k3_kl": 0.0196533203125, + "kimi_kl": 0.06689453125, + "learning_rate": 2.054e-07, + "loss": 0.0009, + "ppl": 0.01226806640625, + "reward": 0.9995364546775818, + "reward_std": 0.0004871670389547944, + "rewards/perpo_ocr_edit_distance_reward": 0.9995365142822266, + "step": 2946, + "temperature": 0.9 + }, + { + "advantages": -8.089202310657129e-06, + "completion_length": 475.0, + "delta_ref_entropy_loss": 0.08056640625, + "delta_ref_ppl": -0.107421875, + "entropy_loss": -0.287109375, + "epoch": 0.5894, + "grad_norm": 1.8827783945822094, + "k1_kl": 0.107421875, + "k3_kl": 0.06494140625, + "kimi_kl": 0.12451171875, + "learning_rate": 2.053e-07, + "loss": 0.0026, + "ppl": 0.1435546875, + "reward": 0.8925978541374207, + "reward_std": 0.0030573178082704544, + "rewards/perpo_ocr_edit_distance_reward": 0.8925979137420654, + "step": 2947, + "temperature": 0.9 + }, + { + "advantages": -2.855914135579951e-05, + "completion_length": 441.0, + "delta_ref_entropy_loss": 0.0274658203125, + "delta_ref_ppl": -0.0439453125, + "entropy_loss": -0.0390625, + "epoch": 0.5896, + "grad_norm": 0.5496309163601845, + "k1_kl": 0.0439453125, + "k3_kl": 0.02880859375, + "kimi_kl": 0.0703125, + "learning_rate": 2.0519999999999998e-07, + "loss": 0.0012, + "ppl": 0.0142822265625, + "reward": 0.9839668869972229, + "reward_std": 0.0010912383440881968, + "rewards/perpo_ocr_edit_distance_reward": 0.9839669466018677, + "step": 2948, + "temperature": 0.9 + }, + { + "advantages": -2.176421185140498e-05, + "completion_length": 339.0, + "delta_ref_entropy_loss": 0.0262451171875, + "delta_ref_ppl": -0.1435546875, + "entropy_loss": -0.0654296875, + "epoch": 0.5898, + "grad_norm": 0.549431743634973, + "k1_kl": 0.1435546875, + "k3_kl": 0.107421875, + "kimi_kl": 0.453125, + "learning_rate": 2.051e-07, + "loss": 0.0043, + "ppl": 0.0240478515625, + "reward": 0.9880311489105225, + "reward_std": 0.0010740146972239017, + "rewards/perpo_ocr_edit_distance_reward": 0.9880312085151672, + "step": 2949, + "temperature": 0.9 + }, + { + "advantages": -1.8187933164881542e-05, + "completion_length": 875.0, + "delta_ref_entropy_loss": 0.060791015625, + "delta_ref_ppl": -0.0986328125, + "entropy_loss": -0.169921875, + "epoch": 0.59, + "grad_norm": 4.274547975195318, + "k1_kl": 0.0986328125, + "k3_kl": 0.0615234375, + "kimi_kl": 0.16796875, + "learning_rate": 2.0499999999999997e-07, + "loss": 0.0025, + "ppl": 0.08349609375, + "reward": 0.9085450172424316, + "reward_std": 0.0036406898871064186, + "rewards/perpo_ocr_edit_distance_reward": 0.9085451364517212, + "step": 2950, + "temperature": 0.9 + }, + { + "advantages": -2.259867687826045e-05, + "completion_length": 568.0, + "delta_ref_entropy_loss": 0.033203125, + "delta_ref_ppl": -0.057373046875, + "entropy_loss": -0.0478515625, + "epoch": 0.5902, + "grad_norm": 0.9708348135436948, + "k1_kl": 0.057373046875, + "k3_kl": 0.034423828125, + "kimi_kl": 0.1142578125, + "learning_rate": 2.049e-07, + "loss": 0.0014, + "ppl": 0.016357421875, + "reward": 0.9977964162826538, + "reward_std": 0.0010284420568495989, + "rewards/perpo_ocr_edit_distance_reward": 0.9977964162826538, + "step": 2951, + "temperature": 0.9 + }, + { + "advantages": 2.3952552510309033e-05, + "completion_length": 815.0, + "delta_ref_entropy_loss": 0.039794921875, + "delta_ref_ppl": -0.033447265625, + "entropy_loss": -0.072265625, + "epoch": 0.5904, + "grad_norm": 9919793185.373747, + "k1_kl": 0.033447265625, + "k3_kl": 2998272.0, + "kimi_kl": 0.373046875, + "learning_rate": 2.048e-07, + "loss": 120176.1328, + "ppl": 0.058349609375, + "reward": 0.9843119382858276, + "reward_std": 0.0016775509575381875, + "rewards/perpo_ocr_edit_distance_reward": 0.9843119382858276, + "step": 2952, + "temperature": 0.9 + }, + { + "advantages": -4.0492843254469335e-05, + "completion_length": 293.0, + "delta_ref_entropy_loss": 0.06298828125, + "delta_ref_ppl": -0.1640625, + "entropy_loss": -0.07470703125, + "epoch": 0.5906, + "grad_norm": 0.5761291154020082, + "k1_kl": 0.1640625, + "k3_kl": 0.1279296875, + "kimi_kl": 0.51953125, + "learning_rate": 2.0469999999999997e-07, + "loss": 0.0052, + "ppl": 0.0284423828125, + "reward": 0.30850109457969666, + "reward_std": 0.0008463646518066525, + "rewards/perpo_ocr_edit_distance_reward": 0.30850112438201904, + "step": 2953, + "temperature": 0.9 + }, + { + "advantages": -9.081193638849072e-06, + "completion_length": 160.0, + "delta_ref_entropy_loss": 0.031005859375, + "delta_ref_ppl": -0.2197265625, + "entropy_loss": -0.08837890625, + "epoch": 0.5908, + "grad_norm": 1.2845072767732457, + "k1_kl": 0.220703125, + "k3_kl": 0.173828125, + "kimi_kl": 0.70703125, + "learning_rate": 2.046e-07, + "loss": 0.007, + "ppl": 0.031494140625, + "reward": 0.994886577129364, + "reward_std": 0.0027126085478812456, + "rewards/perpo_ocr_edit_distance_reward": 0.994886577129364, + "step": 2954, + "temperature": 0.9 + }, + { + "advantages": -1.3794218602924957e-06, + "completion_length": 593.0, + "delta_ref_entropy_loss": -0.04833984375, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.1953125, + "epoch": 0.591, + "grad_norm": 1.8968998431263149, + "k1_kl": 0.0703125, + "k3_kl": 0.06396484375, + "kimi_kl": 0.17578125, + "learning_rate": 2.0449999999999998e-07, + "loss": 0.0026, + "ppl": 0.08056640625, + "reward": 0.9252833724021912, + "reward_std": 0.05584995448589325, + "rewards/perpo_ocr_edit_distance_reward": 0.9252834320068359, + "step": 2955, + "temperature": 0.9 + }, + { + "advantages": -3.167561317241052e-06, + "completion_length": 1201.0, + "delta_ref_entropy_loss": 0.00982666015625, + "delta_ref_ppl": -0.027587890625, + "entropy_loss": -0.0361328125, + "epoch": 0.5912, + "grad_norm": 0.4462701684920018, + "k1_kl": 0.027587890625, + "k3_kl": 0.01904296875, + "kimi_kl": 0.054931640625, + "learning_rate": 2.0439999999999998e-07, + "loss": 0.0008, + "ppl": 0.01470947265625, + "reward": 0.9908417463302612, + "reward_std": 0.002590787597000599, + "rewards/perpo_ocr_edit_distance_reward": 0.9908417463302612, + "step": 2956, + "temperature": 0.9 + }, + { + "advantages": -3.647804260253906e-05, + "completion_length": 1750.0, + "delta_ref_entropy_loss": 0.000698089599609375, + "delta_ref_ppl": -0.025390625, + "entropy_loss": -0.1298828125, + "epoch": 0.5914, + "grad_norm": 38.63466859777355, + "k1_kl": 0.0252685546875, + "k3_kl": 0.53515625, + "kimi_kl": 0.07177734375, + "learning_rate": 2.043e-07, + "loss": 0.0215, + "ppl": 0.0751953125, + "reward": 0.9662918448448181, + "reward_std": 0.0013013642746955156, + "rewards/perpo_ocr_edit_distance_reward": 0.9662919044494629, + "step": 2957, + "temperature": 0.9 + }, + { + "advantages": -1.539502954983618e-05, + "completion_length": 142.0, + "delta_ref_entropy_loss": 0.07421875, + "delta_ref_ppl": -0.2490234375, + "entropy_loss": -0.12255859375, + "epoch": 0.5916, + "grad_norm": 3.711638212741253, + "k1_kl": 0.2490234375, + "k3_kl": 0.1953125, + "kimi_kl": 0.796875, + "learning_rate": 2.042e-07, + "loss": 0.0078, + "ppl": 0.054931640625, + "reward": 0.9699515104293823, + "reward_std": 0.0037712915800511837, + "rewards/perpo_ocr_edit_distance_reward": 0.9699516892433167, + "step": 2958, + "temperature": 0.9 + }, + { + "advantages": -1.3998576832818799e-05, + "completion_length": 65.0, + "delta_ref_entropy_loss": -0.055419921875, + "delta_ref_ppl": -0.72265625, + "entropy_loss": -0.33984375, + "epoch": 0.5918, + "grad_norm": 3.729771600609773, + "k1_kl": 0.71875, + "k3_kl": 0.6015625, + "kimi_kl": 2.859375, + "learning_rate": 2.0409999999999998e-07, + "loss": 0.024, + "ppl": 0.140625, + "reward": 0.45062655210494995, + "reward_std": 0.002940635196864605, + "rewards/perpo_ocr_edit_distance_reward": 0.4506266117095947, + "step": 2959, + "temperature": 0.9 + }, + { + "advantages": -0.00016886848607100546, + "completion_length": 916.0, + "delta_ref_entropy_loss": 0.06005859375, + "delta_ref_ppl": -0.053955078125, + "entropy_loss": -0.130859375, + "epoch": 0.592, + "grad_norm": 0.6710032200338183, + "k1_kl": 0.0537109375, + "k3_kl": 0.0264892578125, + "kimi_kl": 0.052978515625, + "learning_rate": 2.0399999999999997e-07, + "loss": 0.0012, + "ppl": 0.054443359375, + "reward": 0.9720396399497986, + "reward_std": 0.00040424527833238244, + "rewards/perpo_ocr_edit_distance_reward": 0.9720398187637329, + "step": 2960, + "temperature": 0.9 + }, + { + "advantages": -8.429800004705612e-07, + "completion_length": 464.0, + "delta_ref_entropy_loss": 0.01434326171875, + "delta_ref_ppl": -0.12158203125, + "entropy_loss": -0.271484375, + "epoch": 0.5922, + "grad_norm": 1.8811448971703584, + "k1_kl": 0.12158203125, + "k3_kl": 0.09130859375, + "kimi_kl": 0.265625, + "learning_rate": 2.039e-07, + "loss": 0.0037, + "ppl": 0.1357421875, + "reward": 0.9174736142158508, + "reward_std": 0.03035896085202694, + "rewards/perpo_ocr_edit_distance_reward": 0.9174736142158508, + "step": 2961, + "temperature": 0.9 + }, + { + "advantages": -2.043587983280304e-06, + "completion_length": 85.0, + "delta_ref_entropy_loss": -0.004730224609375, + "delta_ref_ppl": -0.267578125, + "entropy_loss": -0.12060546875, + "epoch": 0.5924, + "grad_norm": 2.959345283051039, + "k1_kl": 0.267578125, + "k3_kl": 0.224609375, + "kimi_kl": 0.9453125, + "learning_rate": 2.038e-07, + "loss": 0.0089, + "ppl": 0.042724609375, + "reward": 0.9326923489570618, + "reward_std": 0.01665434055030346, + "rewards/perpo_ocr_edit_distance_reward": 0.9326923489570618, + "step": 2962, + "temperature": 0.9 + }, + { + "advantages": -1.881803837022744e-05, + "completion_length": 628.0, + "delta_ref_entropy_loss": 0.0274658203125, + "delta_ref_ppl": -0.0830078125, + "entropy_loss": -0.314453125, + "epoch": 0.5926, + "grad_norm": 1.336019317912692, + "k1_kl": 0.08349609375, + "k3_kl": 0.060302734375, + "kimi_kl": 0.1396484375, + "learning_rate": 2.0369999999999998e-07, + "loss": 0.0024, + "ppl": 0.1484375, + "reward": 0.8465119004249573, + "reward_std": 0.0035169823095202446, + "rewards/perpo_ocr_edit_distance_reward": 0.8465120196342468, + "step": 2963, + "temperature": 0.9 + }, + { + "advantages": -9.48565411817981e-06, + "completion_length": 419.0, + "delta_ref_entropy_loss": 0.02587890625, + "delta_ref_ppl": -0.06640625, + "entropy_loss": -0.0966796875, + "epoch": 0.5928, + "grad_norm": 0.625055048107834, + "k1_kl": 0.06689453125, + "k3_kl": 0.03955078125, + "kimi_kl": 0.0888671875, + "learning_rate": 2.036e-07, + "loss": 0.0016, + "ppl": 0.04541015625, + "reward": 0.9715259075164795, + "reward_std": 0.0043946816585958, + "rewards/perpo_ocr_edit_distance_reward": 0.9715259671211243, + "step": 2964, + "temperature": 0.9 + }, + { + "advantages": -3.695488203447894e-06, + "completion_length": 763.0, + "delta_ref_entropy_loss": 0.0849609375, + "delta_ref_ppl": -0.078125, + "entropy_loss": -0.1328125, + "epoch": 0.593, + "grad_norm": 1.1861586429049833, + "k1_kl": 0.078125, + "k3_kl": 0.04443359375, + "kimi_kl": 0.10546875, + "learning_rate": 2.035e-07, + "loss": 0.0018, + "ppl": 0.0634765625, + "reward": 0.9656631350517273, + "reward_std": 0.0044916728511452675, + "rewards/perpo_ocr_edit_distance_reward": 0.9656631350517273, + "step": 2965, + "temperature": 0.9 + }, + { + "advantages": -7.18235969543457e-05, + "completion_length": 988.0, + "delta_ref_entropy_loss": 0.037109375, + "delta_ref_ppl": -0.055908203125, + "entropy_loss": -0.06640625, + "epoch": 0.5932, + "grad_norm": 0.7827566860799576, + "k1_kl": 0.05615234375, + "k3_kl": 0.037353515625, + "kimi_kl": 0.08251953125, + "learning_rate": 2.0339999999999998e-07, + "loss": 0.0016, + "ppl": 0.027099609375, + "reward": 0.9910259246826172, + "reward_std": 0.0010856561129912734, + "rewards/perpo_ocr_edit_distance_reward": 0.991025984287262, + "step": 2966, + "temperature": 0.9 + }, + { + "advantages": -2.5800296498346142e-05, + "completion_length": 895.0, + "delta_ref_entropy_loss": 0.061279296875, + "delta_ref_ppl": -0.06787109375, + "entropy_loss": -0.08349609375, + "epoch": 0.5934, + "grad_norm": 0.8068892964317563, + "k1_kl": 0.06787109375, + "k3_kl": 0.037109375, + "kimi_kl": 0.1181640625, + "learning_rate": 2.033e-07, + "loss": 0.0015, + "ppl": 0.041015625, + "reward": 0.9915438294410706, + "reward_std": 0.0012198865879327059, + "rewards/perpo_ocr_edit_distance_reward": 0.9915439486503601, + "step": 2967, + "temperature": 0.9 + }, + { + "advantages": -5.859136945218779e-05, + "completion_length": 399.0, + "delta_ref_entropy_loss": 0.053955078125, + "delta_ref_ppl": -0.09228515625, + "entropy_loss": -0.0810546875, + "epoch": 0.5936, + "grad_norm": 0.6330073680129211, + "k1_kl": 0.09228515625, + "k3_kl": 0.06201171875, + "kimi_kl": 0.2021484375, + "learning_rate": 2.032e-07, + "loss": 0.0025, + "ppl": 0.0299072265625, + "reward": 0.9736961722373962, + "reward_std": 0.00120784982573241, + "rewards/perpo_ocr_edit_distance_reward": 0.973696231842041, + "step": 2968, + "temperature": 0.9 + }, + { + "advantages": -0.00010071482392959297, + "completion_length": 1009.0, + "delta_ref_entropy_loss": 0.016845703125, + "delta_ref_ppl": -0.033935546875, + "entropy_loss": -0.03564453125, + "epoch": 0.5938, + "grad_norm": 0.3480009794268032, + "k1_kl": 0.03369140625, + "k3_kl": 0.022216796875, + "kimi_kl": 0.0712890625, + "learning_rate": 2.0309999999999999e-07, + "loss": 0.001, + "ppl": 0.01361083984375, + "reward": 0.9987376928329468, + "reward_std": 0.00040738482493907213, + "rewards/perpo_ocr_edit_distance_reward": 0.9987378120422363, + "step": 2969, + "temperature": 0.9 + }, + { + "advantages": -2.9725688364123926e-05, + "completion_length": 379.0, + "delta_ref_entropy_loss": 0.040771484375, + "delta_ref_ppl": -0.09912109375, + "entropy_loss": -0.0888671875, + "epoch": 0.594, + "grad_norm": 0.6926054342322865, + "k1_kl": 0.09912109375, + "k3_kl": 0.08154296875, + "kimi_kl": 0.3359375, + "learning_rate": 2.03e-07, + "loss": 0.0033, + "ppl": 0.036865234375, + "reward": 0.9815570712089539, + "reward_std": 0.0021906853653490543, + "rewards/perpo_ocr_edit_distance_reward": 0.9815571308135986, + "step": 2970, + "temperature": 0.9 + }, + { + "advantages": -3.4468517696950585e-05, + "completion_length": 835.0, + "delta_ref_entropy_loss": 0.041748046875, + "delta_ref_ppl": -0.061279296875, + "entropy_loss": -0.037109375, + "epoch": 0.5942, + "grad_norm": 0.556218616251867, + "k1_kl": 0.061279296875, + "k3_kl": 0.041015625, + "kimi_kl": 0.1533203125, + "learning_rate": 2.029e-07, + "loss": 0.0017, + "ppl": 0.01416015625, + "reward": 0.9872322678565979, + "reward_std": 0.002121962374076247, + "rewards/perpo_ocr_edit_distance_reward": 0.9872322678565979, + "step": 2971, + "temperature": 0.9 + }, + { + "advantages": -3.405979782655777e-07, + "completion_length": 986.0, + "delta_ref_entropy_loss": -0.09521484375, + "delta_ref_ppl": -0.09716796875, + "entropy_loss": -0.91015625, + "epoch": 0.5944, + "grad_norm": 5.947662642460159, + "k1_kl": 0.09716796875, + "k3_kl": 0.12158203125, + "kimi_kl": 0.181640625, + "learning_rate": 2.028e-07, + "loss": 0.0049, + "ppl": 0.478515625, + "reward": 0.28583067655563354, + "reward_std": 0.02904147282242775, + "rewards/perpo_ocr_edit_distance_reward": 0.28583067655563354, + "step": 2972, + "temperature": 0.9 + }, + { + "advantages": -5.4665975767420605e-05, + "completion_length": 352.0, + "delta_ref_entropy_loss": 0.0478515625, + "delta_ref_ppl": -0.09033203125, + "entropy_loss": -0.03369140625, + "epoch": 0.5946, + "grad_norm": 1.5206284012037552, + "k1_kl": 0.08984375, + "k3_kl": 0.0634765625, + "kimi_kl": 0.228515625, + "learning_rate": 2.0269999999999998e-07, + "loss": 0.0026, + "ppl": 0.0111083984375, + "reward": 0.9971103072166443, + "reward_std": 0.0011458031367510557, + "rewards/perpo_ocr_edit_distance_reward": 0.9971104264259338, + "step": 2973, + "temperature": 0.9 + }, + { + "advantages": -3.424712849664502e-05, + "completion_length": 604.0, + "delta_ref_entropy_loss": 0.04296875, + "delta_ref_ppl": -0.06298828125, + "entropy_loss": -0.09765625, + "epoch": 0.5948, + "grad_norm": 0.4788712273873966, + "k1_kl": 0.06298828125, + "k3_kl": 0.044677734375, + "kimi_kl": 0.1591796875, + "learning_rate": 2.026e-07, + "loss": 0.0018, + "ppl": 0.0380859375, + "reward": 0.9806186556816101, + "reward_std": 0.0011432317551225424, + "rewards/perpo_ocr_edit_distance_reward": 0.9806187152862549, + "step": 2974, + "temperature": 0.9 + }, + { + "advantages": -4.931007424602285e-05, + "completion_length": 1350.0, + "delta_ref_entropy_loss": 0.006744384765625, + "delta_ref_ppl": -0.024658203125, + "entropy_loss": -0.0252685546875, + "epoch": 0.595, + "grad_norm": 0.5288831361906196, + "k1_kl": 0.024658203125, + "k3_kl": 0.0172119140625, + "kimi_kl": 0.043212890625, + "learning_rate": 2.025e-07, + "loss": 0.0007, + "ppl": 0.00994873046875, + "reward": 0.997506856918335, + "reward_std": 0.0009362415876239538, + "rewards/perpo_ocr_edit_distance_reward": 0.9975069761276245, + "step": 2975, + "temperature": 0.9 + }, + { + "advantages": -1.7540796761750244e-05, + "completion_length": 939.0, + "delta_ref_entropy_loss": 0.03271484375, + "delta_ref_ppl": -0.037841796875, + "entropy_loss": -0.0712890625, + "epoch": 0.5952, + "grad_norm": 0.6300769983997305, + "k1_kl": 0.037841796875, + "k3_kl": 0.02197265625, + "kimi_kl": 0.055908203125, + "learning_rate": 2.0239999999999999e-07, + "loss": 0.0009, + "ppl": 0.0322265625, + "reward": 0.9783619046211243, + "reward_std": 0.0037820113357156515, + "rewards/perpo_ocr_edit_distance_reward": 0.978361964225769, + "step": 2976, + "temperature": 0.9 + }, + { + "advantages": -6.605897942790762e-05, + "completion_length": 695.0, + "delta_ref_entropy_loss": 0.03857421875, + "delta_ref_ppl": -0.07373046875, + "entropy_loss": -0.0537109375, + "epoch": 0.5954, + "grad_norm": 0.6411207460869837, + "k1_kl": 0.07373046875, + "k3_kl": 0.0537109375, + "kimi_kl": 0.171875, + "learning_rate": 2.023e-07, + "loss": 0.0022, + "ppl": 0.024658203125, + "reward": 0.9895699620246887, + "reward_std": 0.0011889375746250153, + "rewards/perpo_ocr_edit_distance_reward": 0.989570140838623, + "step": 2977, + "temperature": 0.9 + }, + { + "advantages": -4.4924872781848535e-05, + "completion_length": 450.0, + "delta_ref_entropy_loss": 0.044921875, + "delta_ref_ppl": -0.07421875, + "entropy_loss": -0.03515625, + "epoch": 0.5956, + "grad_norm": 0.4109334737174238, + "k1_kl": 0.07470703125, + "k3_kl": 0.050048828125, + "kimi_kl": 0.2177734375, + "learning_rate": 2.0219999999999997e-07, + "loss": 0.0021, + "ppl": 0.01031494140625, + "reward": 0.9981224536895752, + "reward_std": 0.00046859707799740136, + "rewards/perpo_ocr_edit_distance_reward": 0.9981224536895752, + "step": 2978, + "temperature": 0.9 + }, + { + "advantages": -1.4322145034384448e-05, + "completion_length": 453.0, + "delta_ref_entropy_loss": 0.10009765625, + "delta_ref_ppl": -0.1484375, + "entropy_loss": -0.29296875, + "epoch": 0.5958, + "grad_norm": 2.0147037154703846, + "k1_kl": 0.1484375, + "k3_kl": 0.1015625, + "kimi_kl": 0.306640625, + "learning_rate": 2.021e-07, + "loss": 0.0041, + "ppl": 0.1494140625, + "reward": 0.91262286901474, + "reward_std": 0.005263668019324541, + "rewards/perpo_ocr_edit_distance_reward": 0.9126229882240295, + "step": 2979, + "temperature": 0.9 + }, + { + "advantages": 1.9175666238879785e-05, + "completion_length": 997.0, + "delta_ref_entropy_loss": 0.03173828125, + "delta_ref_ppl": -0.042236328125, + "entropy_loss": -0.044921875, + "epoch": 0.596, + "grad_norm": 0.7573391433755715, + "k1_kl": 0.042236328125, + "k3_kl": 0.0240478515625, + "kimi_kl": 0.056396484375, + "learning_rate": 2.02e-07, + "loss": 0.0009, + "ppl": 0.018310546875, + "reward": 0.9974315166473389, + "reward_std": 0.0007878611795604229, + "rewards/perpo_ocr_edit_distance_reward": 0.9974315166473389, + "step": 2980, + "temperature": 0.9 + }, + { + "advantages": 1.4364720300363842e-05, + "completion_length": 891.0, + "delta_ref_entropy_loss": 0.06689453125, + "delta_ref_ppl": -0.0732421875, + "entropy_loss": -0.061279296875, + "epoch": 0.5962, + "grad_norm": 1.1979814386305059, + "k1_kl": 0.0732421875, + "k3_kl": 0.042236328125, + "kimi_kl": 0.1181640625, + "learning_rate": 2.0189999999999997e-07, + "loss": 0.0017, + "ppl": 0.02685546875, + "reward": 0.9691312909126282, + "reward_std": 0.0016787550412118435, + "rewards/perpo_ocr_edit_distance_reward": 0.969131350517273, + "step": 2981, + "temperature": 0.9 + }, + { + "advantages": -6.789821054553613e-05, + "completion_length": 385.0, + "delta_ref_entropy_loss": 0.0284423828125, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.0230712890625, + "epoch": 0.5964, + "grad_norm": 0.36893673531543836, + "k1_kl": 0.06640625, + "k3_kl": 0.04638671875, + "kimi_kl": 0.1923828125, + "learning_rate": 2.018e-07, + "loss": 0.0019, + "ppl": 0.00823974609375, + "reward": 0.9959635734558105, + "reward_std": 0.0005269875400699675, + "rewards/perpo_ocr_edit_distance_reward": 0.9959636330604553, + "step": 2982, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 759.0, + "delta_ref_entropy_loss": 0.025146484375, + "delta_ref_ppl": -0.048095703125, + "entropy_loss": -0.06884765625, + "epoch": 0.5966, + "grad_norm": 0.7906924660985444, + "k1_kl": 0.048095703125, + "k3_kl": 0.033447265625, + "kimi_kl": 0.08935546875, + "learning_rate": 2.0169999999999999e-07, + "loss": 0.0013, + "ppl": 0.03173828125, + "reward": 0.9517896771430969, + "reward_std": 0.08552895486354828, + "rewards/perpo_ocr_edit_distance_reward": 0.9517897367477417, + "step": 2983, + "temperature": 0.9 + }, + { + "advantages": -1.8026148609351367e-05, + "completion_length": 326.0, + "delta_ref_entropy_loss": 0.07861328125, + "delta_ref_ppl": -0.1708984375, + "entropy_loss": -0.15625, + "epoch": 0.5968, + "grad_norm": 2.161847815137208, + "k1_kl": 0.1708984375, + "k3_kl": 0.11474609375, + "kimi_kl": 0.310546875, + "learning_rate": 2.016e-07, + "loss": 0.0046, + "ppl": 0.07568359375, + "reward": 0.9854139089584351, + "reward_std": 0.003677777945995331, + "rewards/perpo_ocr_edit_distance_reward": 0.9854140281677246, + "step": 2984, + "temperature": 0.9 + }, + { + "advantages": -2.9291427381394897e-06, + "completion_length": 279.0, + "delta_ref_entropy_loss": 0.07373046875, + "delta_ref_ppl": -0.1396484375, + "entropy_loss": -0.07861328125, + "epoch": 0.597, + "grad_norm": 1.9875381197271118, + "k1_kl": 0.1396484375, + "k3_kl": 0.10498046875, + "kimi_kl": 0.322265625, + "learning_rate": 2.015e-07, + "loss": 0.0042, + "ppl": 0.0400390625, + "reward": 0.9769667387008667, + "reward_std": 0.023246359080076218, + "rewards/perpo_ocr_edit_distance_reward": 0.9769668579101562, + "step": 2985, + "temperature": 0.9 + }, + { + "advantages": -0.0001787798828445375, + "completion_length": 492.0, + "delta_ref_entropy_loss": 0.0341796875, + "delta_ref_ppl": -0.04931640625, + "entropy_loss": -0.041748046875, + "epoch": 0.5972, + "grad_norm": 0.7035934970005417, + "k1_kl": 0.04931640625, + "k3_kl": 0.031005859375, + "kimi_kl": 0.0986328125, + "learning_rate": 2.014e-07, + "loss": 0.0014, + "ppl": 0.0169677734375, + "reward": 0.9979012608528137, + "reward_std": 0.0006621418287977576, + "rewards/perpo_ocr_edit_distance_reward": 0.997901439666748, + "step": 2986, + "temperature": 0.9 + }, + { + "advantages": -9.664468052505981e-06, + "completion_length": 838.0, + "delta_ref_entropy_loss": 0.08154296875, + "delta_ref_ppl": -0.08349609375, + "entropy_loss": -0.2265625, + "epoch": 0.5974, + "grad_norm": 1.7179610555083822, + "k1_kl": 0.08349609375, + "k3_kl": 0.05419921875, + "kimi_kl": 0.10009765625, + "learning_rate": 2.013e-07, + "loss": 0.0022, + "ppl": 0.11474609375, + "reward": 0.7706272006034851, + "reward_std": 0.0051902965642511845, + "rewards/perpo_ocr_edit_distance_reward": 0.7706273198127747, + "step": 2987, + "temperature": 0.9 + }, + { + "advantages": -1.27724248955019e-08, + "completion_length": 1234.0, + "delta_ref_entropy_loss": 0.06591796875, + "delta_ref_ppl": -0.0830078125, + "entropy_loss": -0.166015625, + "epoch": 0.5976, + "grad_norm": 1.4202887882995239, + "k1_kl": 0.0830078125, + "k3_kl": 0.050537109375, + "kimi_kl": 0.1279296875, + "learning_rate": 2.0119999999999998e-07, + "loss": 0.002, + "ppl": 0.0830078125, + "reward": 0.9571866393089294, + "reward_std": 0.001838534721173346, + "rewards/perpo_ocr_edit_distance_reward": 0.9571865797042847, + "step": 2988, + "temperature": 0.9 + }, + { + "advantages": -1.1358943083905615e-05, + "completion_length": 931.0, + "delta_ref_entropy_loss": 0.03271484375, + "delta_ref_ppl": -0.03173828125, + "entropy_loss": -0.033203125, + "epoch": 0.5978, + "grad_norm": 0.399441393670723, + "k1_kl": 0.03173828125, + "k3_kl": 0.0169677734375, + "kimi_kl": 0.040283203125, + "learning_rate": 2.011e-07, + "loss": 0.0007, + "ppl": 0.011962890625, + "reward": 0.9980108141899109, + "reward_std": 0.0006491235108114779, + "rewards/perpo_ocr_edit_distance_reward": 0.9980108141899109, + "step": 2989, + "temperature": 0.9 + }, + { + "advantages": -6.709780427627265e-05, + "completion_length": 394.0, + "delta_ref_entropy_loss": 0.0556640625, + "delta_ref_ppl": -0.10595703125, + "entropy_loss": -0.07470703125, + "epoch": 0.598, + "grad_norm": 1.6111191337013568, + "k1_kl": 0.10595703125, + "k3_kl": 0.07080078125, + "kimi_kl": 0.2255859375, + "learning_rate": 2.01e-07, + "loss": 0.0029, + "ppl": 0.0247802734375, + "reward": 0.9941599369049072, + "reward_std": 0.0007884101360104978, + "rewards/perpo_ocr_edit_distance_reward": 0.994159996509552, + "step": 2990, + "temperature": 0.9 + }, + { + "advantages": -1.1239733339607483e-06, + "completion_length": 615.0, + "delta_ref_entropy_loss": 0.00139617919921875, + "delta_ref_ppl": -0.138671875, + "entropy_loss": -0.75, + "epoch": 0.5982, + "grad_norm": 2.5057745023271494, + "k1_kl": 0.138671875, + "k3_kl": 0.11767578125, + "kimi_kl": 0.20703125, + "learning_rate": 2.0089999999999998e-07, + "loss": 0.0047, + "ppl": 0.421875, + "reward": 0.7532273530960083, + "reward_std": 0.014275969006121159, + "rewards/perpo_ocr_edit_distance_reward": 0.7532274127006531, + "step": 2991, + "temperature": 0.9 + }, + { + "advantages": -1.3308866073202807e-05, + "completion_length": 181.0, + "delta_ref_entropy_loss": 0.08349609375, + "delta_ref_ppl": -0.1513671875, + "entropy_loss": -0.0830078125, + "epoch": 0.5984, + "grad_norm": 1.4248572879674535, + "k1_kl": 0.1513671875, + "k3_kl": 0.09716796875, + "kimi_kl": 0.28125, + "learning_rate": 2.008e-07, + "loss": 0.0039, + "ppl": 0.033935546875, + "reward": 0.9785807132720947, + "reward_std": 0.0018217423930764198, + "rewards/perpo_ocr_edit_distance_reward": 0.9785807728767395, + "step": 2992, + "temperature": 0.9 + }, + { + "advantages": -2.964905434055254e-05, + "completion_length": 1601.0, + "delta_ref_entropy_loss": 0.00848388671875, + "delta_ref_ppl": -0.02392578125, + "entropy_loss": -0.043212890625, + "epoch": 0.5986, + "grad_norm": 0.4438641190027823, + "k1_kl": 0.0238037109375, + "k3_kl": 0.0172119140625, + "kimi_kl": 0.042236328125, + "learning_rate": 2.007e-07, + "loss": 0.0007, + "ppl": 0.0203857421875, + "reward": 0.9924041628837585, + "reward_std": 0.0016216989606618881, + "rewards/perpo_ocr_edit_distance_reward": 0.9924042224884033, + "step": 2993, + "temperature": 0.9 + }, + { + "advantages": -2.4591174224042334e-05, + "completion_length": 1188.0, + "delta_ref_entropy_loss": 0.03564453125, + "delta_ref_ppl": -0.0458984375, + "entropy_loss": -0.060546875, + "epoch": 0.5988, + "grad_norm": 1.0670116376196517, + "k1_kl": 0.045654296875, + "k3_kl": 0.02685546875, + "kimi_kl": 0.0712890625, + "learning_rate": 2.0059999999999998e-07, + "loss": 0.0011, + "ppl": 0.0269775390625, + "reward": 0.9852520227432251, + "reward_std": 0.001976652769371867, + "rewards/perpo_ocr_edit_distance_reward": 0.9852520823478699, + "step": 2994, + "temperature": 0.9 + }, + { + "advantages": -2.043587983280304e-07, + "completion_length": 1017.0, + "delta_ref_entropy_loss": 0.046142578125, + "delta_ref_ppl": -0.05224609375, + "entropy_loss": -0.0732421875, + "epoch": 0.599, + "grad_norm": 1.5616406240186322, + "k1_kl": 0.05224609375, + "k3_kl": 0.034423828125, + "kimi_kl": 0.06689453125, + "learning_rate": 2.005e-07, + "loss": 0.0014, + "ppl": 0.041015625, + "reward": 0.9236404299736023, + "reward_std": 0.15969574451446533, + "rewards/perpo_ocr_edit_distance_reward": 0.9236404895782471, + "step": 2995, + "temperature": 0.9 + }, + { + "advantages": -1.5548297596978955e-05, + "completion_length": 731.0, + "delta_ref_entropy_loss": 0.033935546875, + "delta_ref_ppl": -0.043701171875, + "entropy_loss": -0.03076171875, + "epoch": 0.5992, + "grad_norm": 0.46265877325116145, + "k1_kl": 0.043701171875, + "k3_kl": 0.0260009765625, + "kimi_kl": 0.0751953125, + "learning_rate": 2.004e-07, + "loss": 0.0011, + "ppl": 0.0089111328125, + "reward": 0.9966199994087219, + "reward_std": 0.0009947342332452536, + "rewards/perpo_ocr_edit_distance_reward": 0.9966200590133667, + "step": 2996, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 611.0, + "delta_ref_entropy_loss": 0.0264892578125, + "delta_ref_ppl": -0.04345703125, + "entropy_loss": -0.0238037109375, + "epoch": 0.5994, + "grad_norm": 0.008304583531196038, + "k1_kl": 0.04345703125, + "k3_kl": 0.0291748046875, + "kimi_kl": 0.09814453125, + "learning_rate": 2.003e-07, + "loss": 0.0012, + "ppl": 0.005279541015625, + "reward": 0.6521937847137451, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.6521937847137451, + "step": 2997, + "temperature": 0.9 + }, + { + "advantages": -9.877341653918847e-05, + "completion_length": 790.0, + "delta_ref_entropy_loss": 0.0634765625, + "delta_ref_ppl": -0.06005859375, + "entropy_loss": -0.083984375, + "epoch": 0.5996, + "grad_norm": 0.8773413792986935, + "k1_kl": 0.06005859375, + "k3_kl": 0.0341796875, + "kimi_kl": 0.09033203125, + "learning_rate": 2.0019999999999998e-07, + "loss": 0.0015, + "ppl": 0.037841796875, + "reward": 0.9812576174736023, + "reward_std": 0.0006759653333574533, + "rewards/perpo_ocr_edit_distance_reward": 0.9812576174736023, + "step": 2998, + "temperature": 0.9 + }, + { + "advantages": -3.748280869331211e-05, + "completion_length": 1113.0, + "delta_ref_entropy_loss": 0.01177978515625, + "delta_ref_ppl": -0.0308837890625, + "entropy_loss": -0.06005859375, + "epoch": 0.5998, + "grad_norm": 0.942590699397429, + "k1_kl": 0.0308837890625, + "k3_kl": 0.0220947265625, + "kimi_kl": 0.042236328125, + "learning_rate": 2.001e-07, + "loss": 0.0009, + "ppl": 0.025146484375, + "reward": 0.9687137007713318, + "reward_std": 0.0019449427491053939, + "rewards/perpo_ocr_edit_distance_reward": 0.9687138199806213, + "step": 2999, + "temperature": 0.9 + }, + { + "advantages": -1.4722348169016186e-05, + "completion_length": 60.0, + "delta_ref_entropy_loss": 0.1376953125, + "delta_ref_ppl": -0.345703125, + "entropy_loss": -0.1142578125, + "epoch": 0.6, + "grad_norm": 2.735066721283916, + "k1_kl": 0.345703125, + "k3_kl": 0.2734375, + "kimi_kl": 1.0546875, + "learning_rate": 2e-07, + "loss": 0.0109, + "ppl": 0.044677734375, + "reward": 0.984528124332428, + "reward_std": 0.004525520373135805, + "rewards/perpo_ocr_edit_distance_reward": 0.9845281839370728, + "step": 3000, + "temperature": 0.9 + }, + { + "advantages": -0.00016544546815566719, + "completion_length": 536.0, + "delta_ref_entropy_loss": 0.051513671875, + "delta_ref_ppl": -0.049072265625, + "entropy_loss": -0.0277099609375, + "epoch": 0.6002, + "grad_norm": 0.13756673630431726, + "k1_kl": 0.048828125, + "k3_kl": 0.0255126953125, + "kimi_kl": 0.0673828125, + "learning_rate": 1.9989999999999998e-07, + "loss": 0.0012, + "ppl": 0.005645751953125, + "reward": 0.9974352717399597, + "reward_std": 0.00015733861073385924, + "rewards/perpo_ocr_edit_distance_reward": 0.9974353313446045, + "step": 3001, + "temperature": 0.9 + }, + { + "advantages": -1.571859684190713e-05, + "completion_length": 848.0, + "delta_ref_entropy_loss": 0.033203125, + "delta_ref_ppl": -0.055908203125, + "entropy_loss": -0.04345703125, + "epoch": 0.6004, + "grad_norm": 0.532902828240178, + "k1_kl": 0.0556640625, + "k3_kl": 0.03857421875, + "kimi_kl": 0.10302734375, + "learning_rate": 1.998e-07, + "loss": 0.0016, + "ppl": 0.0181884765625, + "reward": 0.9921663999557495, + "reward_std": 0.0026101372204720974, + "rewards/perpo_ocr_edit_distance_reward": 0.9921663999557495, + "step": 3002, + "temperature": 0.9 + }, + { + "advantages": 4.666192580771167e-06, + "completion_length": 98.0, + "delta_ref_entropy_loss": 0.04833984375, + "delta_ref_ppl": -0.23828125, + "entropy_loss": -0.1591796875, + "epoch": 0.6006, + "grad_norm": 3.116692751813697, + "k1_kl": 0.23828125, + "k3_kl": 0.185546875, + "kimi_kl": 0.6796875, + "learning_rate": 1.9969999999999997e-07, + "loss": 0.0074, + "ppl": 0.07177734375, + "reward": 0.9552238583564758, + "reward_std": 0.0017234598053619266, + "rewards/perpo_ocr_edit_distance_reward": 0.9552238583564758, + "step": 3003, + "temperature": 0.9 + }, + { + "advantages": -3.4059798537100505e-08, + "completion_length": 103.0, + "delta_ref_entropy_loss": 0.0184326171875, + "delta_ref_ppl": -0.291015625, + "entropy_loss": -0.1298828125, + "epoch": 0.6008, + "grad_norm": 2.8052219557963434, + "k1_kl": 0.291015625, + "k3_kl": 0.216796875, + "kimi_kl": 0.78515625, + "learning_rate": 1.996e-07, + "loss": 0.0086, + "ppl": 0.06494140625, + "reward": 0.9840515851974487, + "reward_std": 0.0038089596200734377, + "rewards/perpo_ocr_edit_distance_reward": 0.9840515851974487, + "step": 3004, + "temperature": 0.9 + }, + { + "advantages": -4.223415089654736e-05, + "completion_length": 334.0, + "delta_ref_entropy_loss": 0.04248046875, + "delta_ref_ppl": -0.09228515625, + "entropy_loss": -0.07763671875, + "epoch": 0.601, + "grad_norm": 0.9015048755697574, + "k1_kl": 0.091796875, + "k3_kl": 0.06591796875, + "kimi_kl": 0.1923828125, + "learning_rate": 1.995e-07, + "loss": 0.0027, + "ppl": 0.0380859375, + "reward": 0.9828080534934998, + "reward_std": 0.0013107513077557087, + "rewards/perpo_ocr_edit_distance_reward": 0.9828081130981445, + "step": 3005, + "temperature": 0.9 + }, + { + "advantages": 6.709780336677795e-06, + "completion_length": 361.0, + "delta_ref_entropy_loss": 0.0120849609375, + "delta_ref_ppl": -0.134765625, + "entropy_loss": -0.1201171875, + "epoch": 0.6012, + "grad_norm": 1.2881363977523492, + "k1_kl": 0.134765625, + "k3_kl": 0.09521484375, + "kimi_kl": 0.357421875, + "learning_rate": 1.9939999999999997e-07, + "loss": 0.0038, + "ppl": 0.0439453125, + "reward": 0.39124149084091187, + "reward_std": 0.0011762242065742612, + "rewards/perpo_ocr_edit_distance_reward": 0.39124149084091187, + "step": 3006, + "temperature": 0.9 + }, + { + "advantages": -1.7029899268550253e-08, + "completion_length": 730.0, + "delta_ref_entropy_loss": 0.01806640625, + "delta_ref_ppl": -0.044189453125, + "entropy_loss": -0.028564453125, + "epoch": 0.6014, + "grad_norm": 0.5212001288137449, + "k1_kl": 0.04443359375, + "k3_kl": 0.03076171875, + "kimi_kl": 0.095703125, + "learning_rate": 1.993e-07, + "loss": 0.0012, + "ppl": 0.00970458984375, + "reward": 0.9805194735527039, + "reward_std": 0.0016256265807896852, + "rewards/perpo_ocr_edit_distance_reward": 0.9805194735527039, + "step": 3007, + "temperature": 0.9 + }, + { + "advantages": 1.1648450708889868e-05, + "completion_length": 63.0, + "delta_ref_entropy_loss": 0.09130859375, + "delta_ref_ppl": -0.54296875, + "entropy_loss": -0.11962890625, + "epoch": 0.6016, + "grad_norm": 4.520583313024365, + "k1_kl": 0.5390625, + "k3_kl": 0.44140625, + "kimi_kl": 2.6875, + "learning_rate": 1.9919999999999998e-07, + "loss": 0.0177, + "ppl": 0.0400390625, + "reward": 0.9504263401031494, + "reward_std": 0.0028206459246575832, + "rewards/perpo_ocr_edit_distance_reward": 0.9504263997077942, + "step": 3008, + "temperature": 0.9 + }, + { + "advantages": 8.514949634275126e-09, + "completion_length": 998.0, + "delta_ref_entropy_loss": 0.031982421875, + "delta_ref_ppl": -0.050048828125, + "entropy_loss": -0.024169921875, + "epoch": 0.6018, + "grad_norm": 0.23523268484131224, + "k1_kl": 0.050048828125, + "k3_kl": 0.03125, + "kimi_kl": 0.0927734375, + "learning_rate": 1.991e-07, + "loss": 0.0013, + "ppl": 0.006561279296875, + "reward": 0.9884174466133118, + "reward_std": 0.0003902579774148762, + "rewards/perpo_ocr_edit_distance_reward": 0.9884175062179565, + "step": 3009, + "temperature": 0.9 + }, + { + "advantages": -4.747935963678174e-05, + "completion_length": 722.0, + "delta_ref_entropy_loss": 0.0277099609375, + "delta_ref_ppl": -0.030517578125, + "entropy_loss": -0.029296875, + "epoch": 0.602, + "grad_norm": 0.33218657739037105, + "k1_kl": 0.0306396484375, + "k3_kl": 0.0159912109375, + "kimi_kl": 0.042724609375, + "learning_rate": 1.99e-07, + "loss": 0.0007, + "ppl": 0.00714111328125, + "reward": 0.9959151148796082, + "reward_std": 0.0004381293256301433, + "rewards/perpo_ocr_edit_distance_reward": 0.9959152340888977, + "step": 3010, + "temperature": 0.9 + }, + { + "advantages": -2.2138868871479644e-07, + "completion_length": 53.0, + "delta_ref_entropy_loss": -0.173828125, + "delta_ref_ppl": -0.48828125, + "entropy_loss": -0.5859375, + "epoch": 0.6022, + "grad_norm": 7.568705585040809, + "k1_kl": 0.48828125, + "k3_kl": 0.423828125, + "kimi_kl": 1.6953125, + "learning_rate": 1.989e-07, + "loss": 0.0169, + "ppl": 0.279296875, + "reward": 0.8686724305152893, + "reward_std": 0.26497113704681396, + "rewards/perpo_ocr_edit_distance_reward": 0.8686724305152893, + "step": 3011, + "temperature": 0.9 + }, + { + "advantages": -2.3433141905115917e-05, + "completion_length": 58.0, + "delta_ref_entropy_loss": 0.0004177093505859375, + "delta_ref_ppl": -0.50390625, + "entropy_loss": -0.09912109375, + "epoch": 0.6024, + "grad_norm": 3.05151547777207, + "k1_kl": 0.50390625, + "k3_kl": 0.44140625, + "kimi_kl": 3.09375, + "learning_rate": 1.988e-07, + "loss": 0.0177, + "ppl": 0.0322265625, + "reward": 0.9980430603027344, + "reward_std": 0.0024407326709479094, + "rewards/perpo_ocr_edit_distance_reward": 0.9980431199073792, + "step": 3012, + "temperature": 0.9 + }, + { + "advantages": -1.3513224985217676e-05, + "completion_length": 511.0, + "delta_ref_entropy_loss": 0.006561279296875, + "delta_ref_ppl": -0.060791015625, + "entropy_loss": -0.1884765625, + "epoch": 0.6026, + "grad_norm": 1.3979428636871125, + "k1_kl": 0.060791015625, + "k3_kl": 0.050537109375, + "kimi_kl": 0.125, + "learning_rate": 1.9869999999999997e-07, + "loss": 0.002, + "ppl": 0.072265625, + "reward": 0.7136600613594055, + "reward_std": 0.004945475608110428, + "rewards/perpo_ocr_edit_distance_reward": 0.7136601209640503, + "step": 3013, + "temperature": 0.9 + }, + { + "advantages": -1.8860613636206836e-05, + "completion_length": 736.0, + "delta_ref_entropy_loss": 0.06884765625, + "delta_ref_ppl": -0.095703125, + "entropy_loss": -0.1064453125, + "epoch": 0.6028, + "grad_norm": 1.1812664468624363, + "k1_kl": 0.095703125, + "k3_kl": 0.0625, + "kimi_kl": 0.205078125, + "learning_rate": 1.986e-07, + "loss": 0.0025, + "ppl": 0.054931640625, + "reward": 0.967822790145874, + "reward_std": 0.0017059907549992204, + "rewards/perpo_ocr_edit_distance_reward": 0.9678228497505188, + "step": 3014, + "temperature": 0.9 + }, + { + "advantages": -4.58104295830708e-06, + "completion_length": 266.0, + "delta_ref_entropy_loss": 0.06787109375, + "delta_ref_ppl": -0.11083984375, + "entropy_loss": -0.06884765625, + "epoch": 0.603, + "grad_norm": 0.9949404955446056, + "k1_kl": 0.11083984375, + "k3_kl": 0.07568359375, + "kimi_kl": 0.23828125, + "learning_rate": 1.985e-07, + "loss": 0.003, + "ppl": 0.0286865234375, + "reward": 0.9846560955047607, + "reward_std": 0.001761469291523099, + "rewards/perpo_ocr_edit_distance_reward": 0.9846560955047607, + "step": 3015, + "temperature": 0.9 + }, + { + "advantages": -9.770053293323144e-05, + "completion_length": 1018.0, + "delta_ref_entropy_loss": 0.035400390625, + "delta_ref_ppl": -0.060546875, + "entropy_loss": -0.049560546875, + "epoch": 0.6032, + "grad_norm": 0.4907302333726947, + "k1_kl": 0.060546875, + "k3_kl": 0.037841796875, + "kimi_kl": 0.1162109375, + "learning_rate": 1.9839999999999998e-07, + "loss": 0.0016, + "ppl": 0.0198974609375, + "reward": 0.9934545755386353, + "reward_std": 0.0005101450369693339, + "rewards/perpo_ocr_edit_distance_reward": 0.99345463514328, + "step": 3016, + "temperature": 0.9 + }, + { + "advantages": -0.0001355878048343584, + "completion_length": 504.0, + "delta_ref_entropy_loss": 0.043701171875, + "delta_ref_ppl": -0.04638671875, + "entropy_loss": -0.03759765625, + "epoch": 0.6034, + "grad_norm": 0.3537156675851632, + "k1_kl": 0.046630859375, + "k3_kl": 0.027587890625, + "kimi_kl": 0.07763671875, + "learning_rate": 1.983e-07, + "loss": 0.0012, + "ppl": 0.01171875, + "reward": 0.9959825873374939, + "reward_std": 0.0004651696654036641, + "rewards/perpo_ocr_edit_distance_reward": 0.9959826469421387, + "step": 3017, + "temperature": 0.9 + }, + { + "advantages": -6.4458167798875365e-06, + "completion_length": 551.0, + "delta_ref_entropy_loss": 0.11865234375, + "delta_ref_ppl": -0.134765625, + "entropy_loss": -0.29296875, + "epoch": 0.6036, + "grad_norm": 3.433949860752211, + "k1_kl": 0.1357421875, + "k3_kl": 0.08837890625, + "kimi_kl": 0.2734375, + "learning_rate": 1.982e-07, + "loss": 0.0035, + "ppl": 0.14453125, + "reward": 0.8956353068351746, + "reward_std": 0.009110814891755581, + "rewards/perpo_ocr_edit_distance_reward": 0.8956353664398193, + "step": 3018, + "temperature": 0.9 + }, + { + "advantages": -6.641660547757056e-06, + "completion_length": 344.0, + "delta_ref_entropy_loss": 0.041015625, + "delta_ref_ppl": -0.0849609375, + "entropy_loss": -0.1025390625, + "epoch": 0.6038, + "grad_norm": 1.2618949389427705, + "k1_kl": 0.08447265625, + "k3_kl": 0.056884765625, + "kimi_kl": 0.1787109375, + "learning_rate": 1.9809999999999998e-07, + "loss": 0.0023, + "ppl": 0.03515625, + "reward": 0.979345977306366, + "reward_std": 0.012740959413349628, + "rewards/perpo_ocr_edit_distance_reward": 0.9793460369110107, + "step": 3019, + "temperature": 0.9 + }, + { + "advantages": -2.1193709471845068e-05, + "completion_length": 528.0, + "delta_ref_entropy_loss": 0.0458984375, + "delta_ref_ppl": -0.0771484375, + "entropy_loss": -0.0390625, + "epoch": 0.604, + "grad_norm": 0.24804429265413908, + "k1_kl": 0.07666015625, + "k3_kl": 0.04931640625, + "kimi_kl": 0.1611328125, + "learning_rate": 1.98e-07, + "loss": 0.002, + "ppl": 0.0126953125, + "reward": 0.99061518907547, + "reward_std": 0.00030222898931242526, + "rewards/perpo_ocr_edit_distance_reward": 0.9906152486801147, + "step": 3020, + "temperature": 0.9 + }, + { + "advantages": -1.037120910041267e-05, + "completion_length": 677.0, + "delta_ref_entropy_loss": 0.034423828125, + "delta_ref_ppl": -0.09619140625, + "entropy_loss": -0.11669921875, + "epoch": 0.6042, + "grad_norm": 2.47201129107872, + "k1_kl": 0.0966796875, + "k3_kl": 0.060791015625, + "kimi_kl": 0.150390625, + "learning_rate": 1.979e-07, + "loss": 0.0024, + "ppl": 0.057861328125, + "reward": 0.9319071769714355, + "reward_std": 0.010567090474069118, + "rewards/perpo_ocr_edit_distance_reward": 0.9319072961807251, + "step": 3021, + "temperature": 0.9 + }, + { + "advantages": -0.00016007253725547343, + "completion_length": 408.0, + "delta_ref_entropy_loss": 0.083984375, + "delta_ref_ppl": -0.10498046875, + "entropy_loss": -0.060791015625, + "epoch": 0.6044, + "grad_norm": 0.5326773928130537, + "k1_kl": 0.10498046875, + "k3_kl": 0.06396484375, + "kimi_kl": 0.17578125, + "learning_rate": 1.9779999999999998e-07, + "loss": 0.0027, + "ppl": 0.0203857421875, + "reward": 0.9846938848495483, + "reward_std": 0.0008043131674639881, + "rewards/perpo_ocr_edit_distance_reward": 0.9846940040588379, + "step": 3022, + "temperature": 0.9 + }, + { + "advantages": -3.9322036172961816e-05, + "completion_length": 617.0, + "delta_ref_entropy_loss": 0.052490234375, + "delta_ref_ppl": -0.08544921875, + "entropy_loss": -0.07568359375, + "epoch": 0.6046, + "grad_norm": 0.7738033355629982, + "k1_kl": 0.08544921875, + "k3_kl": 0.0537109375, + "kimi_kl": 0.2353515625, + "learning_rate": 1.9769999999999998e-07, + "loss": 0.0022, + "ppl": 0.0299072265625, + "reward": 0.9933398365974426, + "reward_std": 0.0009833576623350382, + "rewards/perpo_ocr_edit_distance_reward": 0.9933398962020874, + "step": 3023, + "temperature": 0.9 + }, + { + "advantages": -7.663455107831396e-06, + "completion_length": 1008.0, + "delta_ref_entropy_loss": 0.025146484375, + "delta_ref_ppl": -0.064453125, + "entropy_loss": -0.1650390625, + "epoch": 0.6048, + "grad_norm": 3.7763615558268024, + "k1_kl": 0.064453125, + "k3_kl": 0.0498046875, + "kimi_kl": 0.11669921875, + "learning_rate": 1.976e-07, + "loss": 0.002, + "ppl": 0.0712890625, + "reward": 0.7950809597969055, + "reward_std": 0.005443853326141834, + "rewards/perpo_ocr_edit_distance_reward": 0.7950810194015503, + "step": 3024, + "temperature": 0.9 + }, + { + "advantages": -2.1202224161243066e-05, + "completion_length": 434.0, + "delta_ref_entropy_loss": 0.052001953125, + "delta_ref_ppl": -0.07275390625, + "entropy_loss": -0.04150390625, + "epoch": 0.605, + "grad_norm": 0.36787291607558303, + "k1_kl": 0.07275390625, + "k3_kl": 0.04736328125, + "kimi_kl": 0.146484375, + "learning_rate": 1.975e-07, + "loss": 0.0019, + "ppl": 0.0126953125, + "reward": 0.9903761744499207, + "reward_std": 0.000702600518707186, + "rewards/perpo_ocr_edit_distance_reward": 0.9903762936592102, + "step": 3025, + "temperature": 0.9 + }, + { + "advantages": -8.514949456639442e-08, + "completion_length": 167.0, + "delta_ref_entropy_loss": -0.7578125, + "delta_ref_ppl": -0.56640625, + "entropy_loss": -2.59375, + "epoch": 0.6052, + "grad_norm": 18.087863476453176, + "k1_kl": 0.5625, + "k3_kl": 0.65625, + "kimi_kl": 2.078125, + "learning_rate": 1.9739999999999998e-07, + "loss": 0.0263, + "ppl": 1.2578125, + "reward": 0.21170085668563843, + "reward_std": 0.06116020306944847, + "rewards/perpo_ocr_edit_distance_reward": 0.21170087158679962, + "step": 3026, + "temperature": 0.9 + }, + { + "advantages": -1.5497207641601562e-05, + "completion_length": 560.0, + "delta_ref_entropy_loss": 0.047119140625, + "delta_ref_ppl": -0.1162109375, + "entropy_loss": -0.494140625, + "epoch": 0.6054, + "grad_norm": 2.154545147487282, + "k1_kl": 0.11669921875, + "k3_kl": 0.0849609375, + "kimi_kl": 0.1806640625, + "learning_rate": 1.973e-07, + "loss": 0.0034, + "ppl": 0.265625, + "reward": 0.7429431080818176, + "reward_std": 0.0037513277493417263, + "rewards/perpo_ocr_edit_distance_reward": 0.7429431676864624, + "step": 3027, + "temperature": 0.9 + }, + { + "advantages": 3.4059798537100505e-08, + "completion_length": 1384.0, + "delta_ref_entropy_loss": 0.04296875, + "delta_ref_ppl": -0.06640625, + "entropy_loss": -0.14453125, + "epoch": 0.6056, + "grad_norm": 1.6623148731770845, + "k1_kl": 0.06640625, + "k3_kl": 0.048583984375, + "kimi_kl": 0.1142578125, + "learning_rate": 1.9719999999999997e-07, + "loss": 0.0019, + "ppl": 0.0712890625, + "reward": 0.6187390685081482, + "reward_std": 0.3356574773788452, + "rewards/perpo_ocr_edit_distance_reward": 0.618739128112793, + "step": 3028, + "temperature": 0.9 + }, + { + "advantages": -4.853521318182175e-07, + "completion_length": 2048.0, + "delta_ref_entropy_loss": -0.03173828125, + "delta_ref_ppl": -0.0306396484375, + "entropy_loss": -0.2890625, + "epoch": 0.6058, + "grad_norm": 32.36864134923674, + "k1_kl": 0.030517578125, + "k3_kl": 0.16796875, + "kimi_kl": 0.10595703125, + "learning_rate": 1.9709999999999998e-07, + "loss": 0.0067, + "ppl": 0.189453125, + "reward": 0.3929673433303833, + "reward_std": 0.026180412620306015, + "rewards/perpo_ocr_edit_distance_reward": 0.3929673433303833, + "step": 3029, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 483.0, + "delta_ref_entropy_loss": 0.060546875, + "delta_ref_ppl": -0.11962890625, + "entropy_loss": -0.271484375, + "epoch": 0.606, + "grad_norm": 2.2277001834200356, + "k1_kl": 0.1201171875, + "k3_kl": 0.08544921875, + "kimi_kl": 0.2177734375, + "learning_rate": 1.97e-07, + "loss": 0.0034, + "ppl": 0.111328125, + "reward": 0.7392891049385071, + "reward_std": 0.011636556126177311, + "rewards/perpo_ocr_edit_distance_reward": 0.7392891049385071, + "step": 3030, + "temperature": 0.9 + }, + { + "advantages": -1.8869128325604834e-05, + "completion_length": 646.0, + "delta_ref_entropy_loss": 0.054931640625, + "delta_ref_ppl": -0.076171875, + "entropy_loss": -0.1171875, + "epoch": 0.6062, + "grad_norm": 1.0230601578192433, + "k1_kl": 0.076171875, + "k3_kl": 0.0478515625, + "kimi_kl": 0.12890625, + "learning_rate": 1.9689999999999997e-07, + "loss": 0.0019, + "ppl": 0.054443359375, + "reward": 0.9505760073661804, + "reward_std": 0.0008027565781958401, + "rewards/perpo_ocr_edit_distance_reward": 0.9505760073661804, + "step": 3031, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 60.0, + "delta_ref_entropy_loss": 0.10986328125, + "delta_ref_ppl": -0.5625, + "entropy_loss": -0.1689453125, + "epoch": 0.6064, + "grad_norm": 2.167444054871082, + "k1_kl": 0.56640625, + "k3_kl": 0.45703125, + "kimi_kl": 1.96875, + "learning_rate": 1.968e-07, + "loss": 0.0183, + "ppl": 0.068359375, + "reward": 0.9487180113792419, + "reward_std": 0.0029607622418552637, + "rewards/perpo_ocr_edit_distance_reward": 0.9487179517745972, + "step": 3032, + "temperature": 0.9 + }, + { + "advantages": -1.915863686008379e-05, + "completion_length": 994.0, + "delta_ref_entropy_loss": 0.0093994140625, + "delta_ref_ppl": -0.0267333984375, + "entropy_loss": -0.036376953125, + "epoch": 0.6066, + "grad_norm": 0.6451956180699283, + "k1_kl": 0.0267333984375, + "k3_kl": 0.017578125, + "kimi_kl": 0.044677734375, + "learning_rate": 1.967e-07, + "loss": 0.0007, + "ppl": 0.0157470703125, + "reward": 0.9952811002731323, + "reward_std": 0.003458564868196845, + "rewards/perpo_ocr_edit_distance_reward": 0.9952812790870667, + "step": 3033, + "temperature": 0.9 + }, + { + "advantages": -7.271767117345007e-06, + "completion_length": 388.0, + "delta_ref_entropy_loss": 0.040771484375, + "delta_ref_ppl": -0.0927734375, + "entropy_loss": -0.0654296875, + "epoch": 0.6068, + "grad_norm": 0.9760716087878825, + "k1_kl": 0.0927734375, + "k3_kl": 0.0654296875, + "kimi_kl": 0.2080078125, + "learning_rate": 1.966e-07, + "loss": 0.0026, + "ppl": 0.0225830078125, + "reward": 0.9928614497184753, + "reward_std": 0.002244937466457486, + "rewards/perpo_ocr_edit_distance_reward": 0.9928615093231201, + "step": 3034, + "temperature": 0.9 + }, + { + "advantages": -2.384185791015625e-07, + "completion_length": 50.0, + "delta_ref_entropy_loss": -0.62890625, + "delta_ref_ppl": -0.6015625, + "entropy_loss": -1.5546875, + "epoch": 0.607, + "grad_norm": 17.50945883069217, + "k1_kl": 0.6015625, + "k3_kl": 0.5703125, + "kimi_kl": 3.03125, + "learning_rate": 1.965e-07, + "loss": 0.0228, + "ppl": 0.57421875, + "reward": 0.29034745693206787, + "reward_std": 0.13436588644981384, + "rewards/perpo_ocr_edit_distance_reward": 0.29034751653671265, + "step": 3035, + "temperature": 0.9 + }, + { + "advantages": -2.4710383513593115e-05, + "completion_length": 590.0, + "delta_ref_entropy_loss": 0.0791015625, + "delta_ref_ppl": -0.08447265625, + "entropy_loss": -0.0859375, + "epoch": 0.6072, + "grad_norm": 0.9314124367167573, + "k1_kl": 0.08447265625, + "k3_kl": 0.050048828125, + "kimi_kl": 0.1357421875, + "learning_rate": 1.9639999999999999e-07, + "loss": 0.002, + "ppl": 0.0380859375, + "reward": 0.8529455661773682, + "reward_std": 0.0016231751069426537, + "rewards/perpo_ocr_edit_distance_reward": 0.8529456257820129, + "step": 3036, + "temperature": 0.9 + }, + { + "advantages": -4.402229023980908e-05, + "completion_length": 335.0, + "delta_ref_entropy_loss": 0.04736328125, + "delta_ref_ppl": -0.13671875, + "entropy_loss": -0.07080078125, + "epoch": 0.6074, + "grad_norm": 1.4855067385807297, + "k1_kl": 0.13671875, + "k3_kl": 0.10107421875, + "kimi_kl": 0.4609375, + "learning_rate": 1.963e-07, + "loss": 0.0041, + "ppl": 0.033447265625, + "reward": 0.9236025214195251, + "reward_std": 0.002028549322858453, + "rewards/perpo_ocr_edit_distance_reward": 0.9236025810241699, + "step": 3037, + "temperature": 0.9 + }, + { + "advantages": -3.916876778475853e-07, + "completion_length": 1364.0, + "delta_ref_entropy_loss": 0.0322265625, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.10498046875, + "epoch": 0.6076, + "grad_norm": 1.3034288396076439, + "k1_kl": 0.06591796875, + "k3_kl": 0.043701171875, + "kimi_kl": 0.09765625, + "learning_rate": 1.962e-07, + "loss": 0.0018, + "ppl": 0.048095703125, + "reward": 0.7038152813911438, + "reward_std": 0.04355667158961296, + "rewards/perpo_ocr_edit_distance_reward": 0.7038153409957886, + "step": 3038, + "temperature": 0.9 + }, + { + "advantages": -5.6607386795803905e-05, + "completion_length": 1202.0, + "delta_ref_entropy_loss": 0.07080078125, + "delta_ref_ppl": -0.07470703125, + "entropy_loss": -0.05908203125, + "epoch": 0.6078, + "grad_norm": 0.8480451024024838, + "k1_kl": 0.07421875, + "k3_kl": 0.04296875, + "kimi_kl": 0.109375, + "learning_rate": 1.961e-07, + "loss": 0.0018, + "ppl": 0.02587890625, + "reward": 0.9775680303573608, + "reward_std": 0.0008023708942346275, + "rewards/perpo_ocr_edit_distance_reward": 0.9775681495666504, + "step": 3039, + "temperature": 0.9 + }, + { + "advantages": -0.00016388723452109843, + "completion_length": 720.0, + "delta_ref_entropy_loss": 0.04638671875, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.0478515625, + "epoch": 0.608, + "grad_norm": 0.6152682930236335, + "k1_kl": 0.0703125, + "k3_kl": 0.046142578125, + "kimi_kl": 0.1298828125, + "learning_rate": 1.96e-07, + "loss": 0.002, + "ppl": 0.0244140625, + "reward": 0.8409462571144104, + "reward_std": 0.00047153810737654567, + "rewards/perpo_ocr_edit_distance_reward": 0.8409463763237, + "step": 3040, + "temperature": 0.9 + }, + { + "advantages": 1.7029899268550253e-08, + "completion_length": 696.0, + "delta_ref_entropy_loss": -0.0033416748046875, + "delta_ref_ppl": -0.07763671875, + "entropy_loss": -0.1845703125, + "epoch": 0.6082, + "grad_norm": 1.8225857194895723, + "k1_kl": 0.078125, + "k3_kl": 0.06005859375, + "kimi_kl": 0.1708984375, + "learning_rate": 1.9589999999999997e-07, + "loss": 0.0024, + "ppl": 0.0830078125, + "reward": 0.8917537927627563, + "reward_std": 0.02847985550761223, + "rewards/perpo_ocr_edit_distance_reward": 0.8917539119720459, + "step": 3041, + "temperature": 0.9 + }, + { + "advantages": -4.8245703510474414e-05, + "completion_length": 664.0, + "delta_ref_entropy_loss": 0.0703125, + "delta_ref_ppl": -0.1162109375, + "entropy_loss": -0.0732421875, + "epoch": 0.6084, + "grad_norm": 24.65497134509828, + "k1_kl": 0.1162109375, + "k3_kl": 0.318359375, + "kimi_kl": 0.2236328125, + "learning_rate": 1.958e-07, + "loss": 0.0128, + "ppl": 0.0380859375, + "reward": 0.9781147241592407, + "reward_std": 0.0011352254077792168, + "rewards/perpo_ocr_edit_distance_reward": 0.9781148433685303, + "step": 3042, + "temperature": 0.9 + }, + { + "advantages": -5.921295814914629e-05, + "completion_length": 1023.0, + "delta_ref_entropy_loss": 0.03759765625, + "delta_ref_ppl": -0.05615234375, + "entropy_loss": -0.05126953125, + "epoch": 0.6086, + "grad_norm": 0.676181713637581, + "k1_kl": 0.05615234375, + "k3_kl": 0.034912109375, + "kimi_kl": 0.0927734375, + "learning_rate": 1.957e-07, + "loss": 0.0015, + "ppl": 0.0242919921875, + "reward": 0.967689037322998, + "reward_std": 0.0011938543757423759, + "rewards/perpo_ocr_edit_distance_reward": 0.9676891565322876, + "step": 3043, + "temperature": 0.9 + }, + { + "advantages": -1.7332180505036376e-05, + "completion_length": 104.0, + "delta_ref_entropy_loss": 0.06103515625, + "delta_ref_ppl": -0.3671875, + "entropy_loss": -0.134765625, + "epoch": 0.6088, + "grad_norm": 1.8100092652437807, + "k1_kl": 0.3671875, + "k3_kl": 0.296875, + "kimi_kl": 1.2734375, + "learning_rate": 1.9559999999999998e-07, + "loss": 0.0119, + "ppl": 0.046630859375, + "reward": 0.9833812713623047, + "reward_std": 0.002846555318683386, + "rewards/perpo_ocr_edit_distance_reward": 0.9833813309669495, + "step": 3044, + "temperature": 0.9 + }, + { + "advantages": -1.3709069207834546e-05, + "completion_length": 276.0, + "delta_ref_entropy_loss": 0.1171875, + "delta_ref_ppl": -0.162109375, + "entropy_loss": -0.251953125, + "epoch": 0.609, + "grad_norm": 1.8236723483494308, + "k1_kl": 0.162109375, + "k3_kl": 0.11328125, + "kimi_kl": 0.322265625, + "learning_rate": 1.955e-07, + "loss": 0.0045, + "ppl": 0.10791015625, + "reward": 0.7834533452987671, + "reward_std": 0.0036269444972276688, + "rewards/perpo_ocr_edit_distance_reward": 0.7834534049034119, + "step": 3045, + "temperature": 0.9 + }, + { + "advantages": -3.8138459785841405e-05, + "completion_length": 908.0, + "delta_ref_entropy_loss": 0.01190185546875, + "delta_ref_ppl": -0.041748046875, + "entropy_loss": -0.0361328125, + "epoch": 0.6092, + "grad_norm": 0.7233392318836901, + "k1_kl": 0.041748046875, + "k3_kl": 0.027587890625, + "kimi_kl": 0.06884765625, + "learning_rate": 1.954e-07, + "loss": 0.0011, + "ppl": 0.0162353515625, + "reward": 0.977758526802063, + "reward_std": 0.0012393114157021046, + "rewards/perpo_ocr_edit_distance_reward": 0.977758526802063, + "step": 3046, + "temperature": 0.9 + }, + { + "advantages": -0.00014899458619765937, + "completion_length": 905.0, + "delta_ref_entropy_loss": 0.057373046875, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.035400390625, + "epoch": 0.6094, + "grad_norm": 0.20486300570187577, + "k1_kl": 0.06640625, + "k3_kl": 0.0380859375, + "kimi_kl": 0.123046875, + "learning_rate": 1.9529999999999998e-07, + "loss": 0.0017, + "ppl": 0.01214599609375, + "reward": 0.998245894908905, + "reward_std": 0.00018576979346107692, + "rewards/perpo_ocr_edit_distance_reward": 0.9982459545135498, + "step": 3047, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 985.0, + "delta_ref_entropy_loss": 0.0162353515625, + "delta_ref_ppl": -0.05126953125, + "entropy_loss": -0.0615234375, + "epoch": 0.6096, + "grad_norm": 1.9187666490226714, + "k1_kl": 0.05126953125, + "k3_kl": 0.03662109375, + "kimi_kl": 0.09423828125, + "learning_rate": 1.952e-07, + "loss": 0.0015, + "ppl": 0.0269775390625, + "reward": 0.9860771894454956, + "reward_std": 0.005180820822715759, + "rewards/perpo_ocr_edit_distance_reward": 0.9860771894454956, + "step": 3048, + "temperature": 0.9 + }, + { + "advantages": -7.70432670833543e-05, + "completion_length": 1316.0, + "delta_ref_entropy_loss": 0.0216064453125, + "delta_ref_ppl": -0.0296630859375, + "entropy_loss": -0.0458984375, + "epoch": 0.6098, + "grad_norm": 0.5156320276794766, + "k1_kl": 0.02978515625, + "k3_kl": 0.017333984375, + "kimi_kl": 0.0419921875, + "learning_rate": 1.951e-07, + "loss": 0.0008, + "ppl": 0.02099609375, + "reward": 0.9966777563095093, + "reward_std": 0.0006738516967743635, + "rewards/perpo_ocr_edit_distance_reward": 0.9966778755187988, + "step": 3049, + "temperature": 0.9 + }, + { + "advantages": -9.79219184955582e-05, + "completion_length": 973.0, + "delta_ref_entropy_loss": 0.035400390625, + "delta_ref_ppl": -0.05615234375, + "entropy_loss": -0.034423828125, + "epoch": 0.61, + "grad_norm": 0.3559849923092482, + "k1_kl": 0.05615234375, + "k3_kl": 0.0322265625, + "kimi_kl": 0.0859375, + "learning_rate": 1.9499999999999999e-07, + "loss": 0.0014, + "ppl": 0.01153564453125, + "reward": 0.9985027313232422, + "reward_std": 0.000247852731263265, + "rewards/perpo_ocr_edit_distance_reward": 0.998502790927887, + "step": 3050, + "temperature": 0.9 + }, + { + "advantages": -2.9802324206684716e-05, + "completion_length": 89.0, + "delta_ref_entropy_loss": 0.0203857421875, + "delta_ref_ppl": -0.328125, + "entropy_loss": -0.09375, + "epoch": 0.6102, + "grad_norm": 1.5200522552622444, + "k1_kl": 0.330078125, + "k3_kl": 0.26953125, + "kimi_kl": 1.28125, + "learning_rate": 1.9489999999999998e-07, + "loss": 0.0108, + "ppl": 0.034423828125, + "reward": 0.9877938032150269, + "reward_std": 0.0021837526001036167, + "rewards/perpo_ocr_edit_distance_reward": 0.9877939224243164, + "step": 3051, + "temperature": 0.9 + }, + { + "advantages": 1.8528529835748486e-05, + "completion_length": 418.0, + "delta_ref_entropy_loss": 0.08740234375, + "delta_ref_ppl": -0.1484375, + "entropy_loss": -0.369140625, + "epoch": 0.6104, + "grad_norm": 2.652900543369394, + "k1_kl": 0.1484375, + "k3_kl": 0.1181640625, + "kimi_kl": 0.2373046875, + "learning_rate": 1.948e-07, + "loss": 0.0047, + "ppl": 0.1865234375, + "reward": 0.8296273350715637, + "reward_std": 0.0021988586522638798, + "rewards/perpo_ocr_edit_distance_reward": 0.8296273350715637, + "step": 3052, + "temperature": 0.9 + }, + { + "advantages": -7.493155749216385e-07, + "completion_length": 1121.0, + "delta_ref_entropy_loss": 0.060302734375, + "delta_ref_ppl": -0.08837890625, + "entropy_loss": -0.26953125, + "epoch": 0.6106, + "grad_norm": 11.525831349292867, + "k1_kl": 0.08837890625, + "k3_kl": 0.0546875, + "kimi_kl": 0.1240234375, + "learning_rate": 1.9470000000000002e-07, + "loss": 0.0022, + "ppl": 0.1318359375, + "reward": 0.7550888061523438, + "reward_std": 0.09099966287612915, + "rewards/perpo_ocr_edit_distance_reward": 0.7550888657569885, + "step": 3053, + "temperature": 0.9 + }, + { + "advantages": -2.069132824544795e-05, + "completion_length": 1387.0, + "delta_ref_entropy_loss": 0.029052734375, + "delta_ref_ppl": -0.051025390625, + "entropy_loss": -0.10009765625, + "epoch": 0.6108, + "grad_norm": 2.5940328810135878, + "k1_kl": 0.051025390625, + "k3_kl": 0.033447265625, + "kimi_kl": 0.0634765625, + "learning_rate": 1.9459999999999998e-07, + "loss": 0.0014, + "ppl": 0.04541015625, + "reward": 0.9870648980140686, + "reward_std": 0.001958449836820364, + "rewards/perpo_ocr_edit_distance_reward": 0.9870648980140686, + "step": 3054, + "temperature": 0.9 + }, + { + "advantages": -3.4059798963426147e-06, + "completion_length": 531.0, + "delta_ref_entropy_loss": 0.0830078125, + "delta_ref_ppl": -0.11767578125, + "entropy_loss": -0.1328125, + "epoch": 0.611, + "grad_norm": 1.0359893728995626, + "k1_kl": 0.1181640625, + "k3_kl": 0.07470703125, + "kimi_kl": 0.240234375, + "learning_rate": 1.945e-07, + "loss": 0.003, + "ppl": 0.05859375, + "reward": 0.9461311101913452, + "reward_std": 0.0023982468992471695, + "rewards/perpo_ocr_edit_distance_reward": 0.9461311101913452, + "step": 3055, + "temperature": 0.9 + }, + { + "advantages": -1.801763391995337e-05, + "completion_length": 551.0, + "delta_ref_entropy_loss": 0.032958984375, + "delta_ref_ppl": -0.057861328125, + "entropy_loss": -0.1474609375, + "epoch": 0.6112, + "grad_norm": 1.1596298504895692, + "k1_kl": 0.05810546875, + "k3_kl": 0.039794921875, + "kimi_kl": 0.10595703125, + "learning_rate": 1.944e-07, + "loss": 0.0016, + "ppl": 0.0869140625, + "reward": 0.5928919315338135, + "reward_std": 0.002734998008236289, + "rewards/perpo_ocr_edit_distance_reward": 0.5928919911384583, + "step": 3056, + "temperature": 0.9 + }, + { + "advantages": -1.3453620340442285e-05, + "completion_length": 232.0, + "delta_ref_entropy_loss": 0.030517578125, + "delta_ref_ppl": -0.1123046875, + "entropy_loss": -0.06982421875, + "epoch": 0.6114, + "grad_norm": 0.8441418178204432, + "k1_kl": 0.11181640625, + "k3_kl": 0.0830078125, + "kimi_kl": 0.302734375, + "learning_rate": 1.9429999999999999e-07, + "loss": 0.0033, + "ppl": 0.022216796875, + "reward": 0.6842054724693298, + "reward_std": 0.0005326967220753431, + "rewards/perpo_ocr_edit_distance_reward": 0.6842054724693298, + "step": 3057, + "temperature": 0.9 + }, + { + "advantages": -1.8170901967096142e-05, + "completion_length": 660.0, + "delta_ref_entropy_loss": 0.068359375, + "delta_ref_ppl": -0.0693359375, + "entropy_loss": -0.1708984375, + "epoch": 0.6116, + "grad_norm": 1.1796488580819346, + "k1_kl": 0.0693359375, + "k3_kl": 0.0341796875, + "kimi_kl": 0.060302734375, + "learning_rate": 1.942e-07, + "loss": 0.0014, + "ppl": 0.07763671875, + "reward": 0.8901922702789307, + "reward_std": 0.0027106108609586954, + "rewards/perpo_ocr_edit_distance_reward": 0.8901923894882202, + "step": 3058, + "temperature": 0.9 + }, + { + "advantages": -8.514949634275126e-09, + "completion_length": 443.0, + "delta_ref_entropy_loss": 0.0498046875, + "delta_ref_ppl": -0.1279296875, + "entropy_loss": -0.07421875, + "epoch": 0.6118, + "grad_norm": 0.5810570206672473, + "k1_kl": 0.1279296875, + "k3_kl": 0.08984375, + "kimi_kl": 0.359375, + "learning_rate": 1.941e-07, + "loss": 0.0036, + "ppl": 0.028076171875, + "reward": 0.9906352162361145, + "reward_std": 0.0009731416939757764, + "rewards/perpo_ocr_edit_distance_reward": 0.9906352162361145, + "step": 3059, + "temperature": 0.9 + }, + { + "advantages": -3.441742592258379e-05, + "completion_length": 168.0, + "delta_ref_entropy_loss": 0.0380859375, + "delta_ref_ppl": -0.1806640625, + "entropy_loss": -0.048583984375, + "epoch": 0.612, + "grad_norm": 1.4408637447788146, + "k1_kl": 0.1806640625, + "k3_kl": 0.14453125, + "kimi_kl": 0.8046875, + "learning_rate": 1.94e-07, + "loss": 0.0058, + "ppl": 0.0206298828125, + "reward": 0.9875550270080566, + "reward_std": 0.0016329215141013265, + "rewards/perpo_ocr_edit_distance_reward": 0.9875551462173462, + "step": 3060, + "temperature": 0.9 + }, + { + "advantages": -5.7101253332803026e-05, + "completion_length": 372.0, + "delta_ref_entropy_loss": 0.050537109375, + "delta_ref_ppl": -0.078125, + "entropy_loss": -0.060791015625, + "epoch": 0.6122, + "grad_norm": 1.2057396252291375, + "k1_kl": 0.078125, + "k3_kl": 0.049072265625, + "kimi_kl": 0.1513671875, + "learning_rate": 1.9389999999999998e-07, + "loss": 0.002, + "ppl": 0.03271484375, + "reward": 0.9529517889022827, + "reward_std": 0.0004964044201187789, + "rewards/perpo_ocr_edit_distance_reward": 0.9529517889022827, + "step": 3061, + "temperature": 0.9 + }, + { + "advantages": -2.843993206624873e-05, + "completion_length": 483.0, + "delta_ref_entropy_loss": 0.0390625, + "delta_ref_ppl": -0.04248046875, + "entropy_loss": -0.053466796875, + "epoch": 0.6124, + "grad_norm": 0.4407140818005528, + "k1_kl": 0.04248046875, + "k3_kl": 0.024658203125, + "kimi_kl": 0.06201171875, + "learning_rate": 1.938e-07, + "loss": 0.001, + "ppl": 0.0228271484375, + "reward": 0.6888888478279114, + "reward_std": 0.000499171728733927, + "rewards/perpo_ocr_edit_distance_reward": 0.6888889074325562, + "step": 3062, + "temperature": 0.9 + }, + { + "advantages": -1.6467913155793212e-05, + "completion_length": 763.0, + "delta_ref_entropy_loss": 0.0301513671875, + "delta_ref_ppl": -0.04736328125, + "entropy_loss": -0.045654296875, + "epoch": 0.6126, + "grad_norm": 0.5629070097834094, + "k1_kl": 0.047607421875, + "k3_kl": 0.0277099609375, + "kimi_kl": 0.06396484375, + "learning_rate": 1.937e-07, + "loss": 0.0011, + "ppl": 0.0184326171875, + "reward": 0.9915596842765808, + "reward_std": 0.001967606134712696, + "rewards/perpo_ocr_edit_distance_reward": 0.9915597438812256, + "step": 3063, + "temperature": 0.9 + }, + { + "advantages": -2.6566642645775573e-06, + "completion_length": 874.0, + "delta_ref_entropy_loss": 0.1328125, + "delta_ref_ppl": -0.1298828125, + "entropy_loss": -0.3203125, + "epoch": 0.6128, + "grad_norm": 1.7900727497786268, + "k1_kl": 0.1298828125, + "k3_kl": 0.0849609375, + "kimi_kl": 0.1962890625, + "learning_rate": 1.9359999999999999e-07, + "loss": 0.0034, + "ppl": 0.1669921875, + "reward": 0.8683425784111023, + "reward_std": 0.0030996431596577168, + "rewards/perpo_ocr_edit_distance_reward": 0.8683425784111023, + "step": 3064, + "temperature": 0.9 + }, + { + "advantages": -9.071614476852119e-05, + "completion_length": 544.0, + "delta_ref_entropy_loss": 0.0693359375, + "delta_ref_ppl": -0.09326171875, + "entropy_loss": -0.1650390625, + "epoch": 0.613, + "grad_norm": 1.2288750983461334, + "k1_kl": 0.09326171875, + "k3_kl": 0.056640625, + "kimi_kl": 0.1337890625, + "learning_rate": 1.935e-07, + "loss": 0.0024, + "ppl": 0.08056640625, + "reward": 0.8418539762496948, + "reward_std": 0.00121412705630064, + "rewards/perpo_ocr_edit_distance_reward": 0.8418541550636292, + "step": 3065, + "temperature": 0.9 + }, + { + "advantages": -0.0001005019512376748, + "completion_length": 972.0, + "delta_ref_entropy_loss": 0.033935546875, + "delta_ref_ppl": -0.04296875, + "entropy_loss": -0.04052734375, + "epoch": 0.6132, + "grad_norm": 0.3991336406516254, + "k1_kl": 0.04296875, + "k3_kl": 0.0264892578125, + "kimi_kl": 0.08056640625, + "learning_rate": 1.9339999999999997e-07, + "loss": 0.0012, + "ppl": 0.01531982421875, + "reward": 0.9881404638290405, + "reward_std": 0.0005777575424872339, + "rewards/perpo_ocr_edit_distance_reward": 0.9881405830383301, + "step": 3066, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 478.0, + "delta_ref_entropy_loss": 0.07177734375, + "delta_ref_ppl": -0.080078125, + "entropy_loss": -0.06884765625, + "epoch": 0.6134, + "grad_norm": 0.9097577511556448, + "k1_kl": 0.08056640625, + "k3_kl": 0.048828125, + "kimi_kl": 0.1591796875, + "learning_rate": 1.933e-07, + "loss": 0.002, + "ppl": 0.024169921875, + "reward": 0.9593923091888428, + "reward_std": 0.08704116940498352, + "rewards/perpo_ocr_edit_distance_reward": 0.9593923091888428, + "step": 3067, + "temperature": 0.9 + }, + { + "advantages": 4.108463144802954e-06, + "completion_length": 244.0, + "delta_ref_entropy_loss": 0.08203125, + "delta_ref_ppl": -0.189453125, + "entropy_loss": -0.10498046875, + "epoch": 0.6136, + "grad_norm": 1.056406244435667, + "k1_kl": 0.189453125, + "k3_kl": 0.1337890625, + "kimi_kl": 0.50390625, + "learning_rate": 1.932e-07, + "loss": 0.0054, + "ppl": 0.04296875, + "reward": 0.44676488637924194, + "reward_std": 0.001966869691386819, + "rewards/perpo_ocr_edit_distance_reward": 0.44676488637924194, + "step": 3068, + "temperature": 0.9 + }, + { + "advantages": -0.00016152198077179492, + "completion_length": 553.0, + "delta_ref_entropy_loss": 0.046875, + "delta_ref_ppl": -0.0673828125, + "entropy_loss": -0.029296875, + "epoch": 0.6138, + "grad_norm": 0.3288431709900852, + "k1_kl": 0.0673828125, + "k3_kl": 0.03955078125, + "kimi_kl": 0.11767578125, + "learning_rate": 1.9309999999999998e-07, + "loss": 0.0017, + "ppl": 0.007232666015625, + "reward": 0.996738076210022, + "reward_std": 0.0002690389519557357, + "rewards/perpo_ocr_edit_distance_reward": 0.9967381954193115, + "step": 3069, + "temperature": 0.9 + }, + { + "advantages": -0.0001075012405635789, + "completion_length": 542.0, + "delta_ref_entropy_loss": 0.042724609375, + "delta_ref_ppl": -0.0556640625, + "entropy_loss": -0.03564453125, + "epoch": 0.614, + "grad_norm": 0.44578789004404257, + "k1_kl": 0.055908203125, + "k3_kl": 0.0341796875, + "kimi_kl": 0.10693359375, + "learning_rate": 1.93e-07, + "loss": 0.0015, + "ppl": 0.013671875, + "reward": 0.9953431487083435, + "reward_std": 0.0006921281456016004, + "rewards/perpo_ocr_edit_distance_reward": 0.9953432679176331, + "step": 3070, + "temperature": 0.9 + }, + { + "advantages": 3.639715214376338e-05, + "completion_length": 634.0, + "delta_ref_entropy_loss": 0.048828125, + "delta_ref_ppl": -0.061767578125, + "entropy_loss": -0.0625, + "epoch": 0.6142, + "grad_norm": 0.5670944491503819, + "k1_kl": 0.061767578125, + "k3_kl": 0.0322265625, + "kimi_kl": 0.06689453125, + "learning_rate": 1.929e-07, + "loss": 0.0013, + "ppl": 0.0269775390625, + "reward": 0.8781799077987671, + "reward_std": 0.00036779927904717624, + "rewards/perpo_ocr_edit_distance_reward": 0.8781798481941223, + "step": 3071, + "temperature": 0.9 + }, + { + "advantages": -7.554463081760332e-05, + "completion_length": 200.0, + "delta_ref_entropy_loss": 0.045166015625, + "delta_ref_ppl": -0.138671875, + "entropy_loss": -0.0712890625, + "epoch": 0.6144, + "grad_norm": 0.9777384292001261, + "k1_kl": 0.138671875, + "k3_kl": 0.10498046875, + "kimi_kl": 0.384765625, + "learning_rate": 1.9279999999999998e-07, + "loss": 0.0043, + "ppl": 0.02490234375, + "reward": 0.9879698157310486, + "reward_std": 0.0010272776708006859, + "rewards/perpo_ocr_edit_distance_reward": 0.9879699945449829, + "step": 3072, + "temperature": 0.9 + }, + { + "advantages": -2.55448497910038e-08, + "completion_length": 123.0, + "delta_ref_entropy_loss": -0.60546875, + "delta_ref_ppl": -1.6640625, + "entropy_loss": -1.3984375, + "epoch": 0.6146, + "grad_norm": 49.254296227273336, + "k1_kl": 1.671875, + "k3_kl": 1.484375, + "kimi_kl": 7.0625, + "learning_rate": 1.927e-07, + "loss": 0.0593, + "ppl": 0.515625, + "reward": 0.24098797142505646, + "reward_std": 0.11004836857318878, + "rewards/perpo_ocr_edit_distance_reward": 0.24098798632621765, + "step": 3073, + "temperature": 0.9 + }, + { + "advantages": -6.811959707420101e-08, + "completion_length": 663.0, + "delta_ref_entropy_loss": -0.78125, + "delta_ref_ppl": -0.0869140625, + "entropy_loss": -1.5859375, + "epoch": 0.6148, + "grad_norm": 9.541763598898267, + "k1_kl": 0.0859375, + "k3_kl": 0.171875, + "kimi_kl": 0.349609375, + "learning_rate": 1.926e-07, + "loss": 0.0069, + "ppl": 0.66015625, + "reward": 0.24645984172821045, + "reward_std": 0.10096569359302521, + "rewards/perpo_ocr_edit_distance_reward": 0.24645987153053284, + "step": 3074, + "temperature": 0.9 + }, + { + "advantages": -1.6697817045496777e-05, + "completion_length": 496.0, + "delta_ref_entropy_loss": 0.02197265625, + "delta_ref_ppl": -0.053466796875, + "entropy_loss": -0.04150390625, + "epoch": 0.615, + "grad_norm": 0.6564364511188091, + "k1_kl": 0.05322265625, + "k3_kl": 0.036376953125, + "kimi_kl": 0.10888671875, + "learning_rate": 1.9249999999999998e-07, + "loss": 0.0015, + "ppl": 0.016845703125, + "reward": 0.9926753044128418, + "reward_std": 0.004494238644838333, + "rewards/perpo_ocr_edit_distance_reward": 0.9926753640174866, + "step": 3075, + "temperature": 0.9 + }, + { + "advantages": -1.4611653568863403e-05, + "completion_length": 327.0, + "delta_ref_entropy_loss": 0.0191650390625, + "delta_ref_ppl": -0.11572265625, + "entropy_loss": -0.05224609375, + "epoch": 0.6152, + "grad_norm": 0.9925990735041229, + "k1_kl": 0.11572265625, + "k3_kl": 0.0927734375, + "kimi_kl": 0.4296875, + "learning_rate": 1.9239999999999998e-07, + "loss": 0.0037, + "ppl": 0.022705078125, + "reward": 0.9829641580581665, + "reward_std": 0.0016445255605503917, + "rewards/perpo_ocr_edit_distance_reward": 0.9829642176628113, + "step": 3076, + "temperature": 0.9 + }, + { + "advantages": -1.5999590686988086e-05, + "completion_length": 654.0, + "delta_ref_entropy_loss": 0.06396484375, + "delta_ref_ppl": -0.07080078125, + "entropy_loss": -0.05078125, + "epoch": 0.6154, + "grad_norm": 0.7142157813417939, + "k1_kl": 0.07080078125, + "k3_kl": 0.0400390625, + "kimi_kl": 0.10400390625, + "learning_rate": 1.923e-07, + "loss": 0.0016, + "ppl": 0.0224609375, + "reward": 0.9868984222412109, + "reward_std": 0.0009644735255278647, + "rewards/perpo_ocr_edit_distance_reward": 0.9868984818458557, + "step": 3077, + "temperature": 0.9 + }, + { + "advantages": -1.106943454942666e-05, + "completion_length": 1175.0, + "delta_ref_entropy_loss": 0.049560546875, + "delta_ref_ppl": -0.06494140625, + "entropy_loss": -0.09521484375, + "epoch": 0.6156, + "grad_norm": 1.0637122447978236, + "k1_kl": 0.06494140625, + "k3_kl": 0.039306640625, + "kimi_kl": 0.080078125, + "learning_rate": 1.9220000000000001e-07, + "loss": 0.0016, + "ppl": 0.04833984375, + "reward": 0.9460561275482178, + "reward_std": 0.003743840381503105, + "rewards/perpo_ocr_edit_distance_reward": 0.9460561275482178, + "step": 3078, + "temperature": 0.9 + }, + { + "advantages": -7.811615068931133e-05, + "completion_length": 1114.0, + "delta_ref_entropy_loss": 0.0286865234375, + "delta_ref_ppl": -0.040283203125, + "entropy_loss": -0.023193359375, + "epoch": 0.6158, + "grad_norm": 0.618785150129311, + "k1_kl": 0.040283203125, + "k3_kl": 0.0242919921875, + "kimi_kl": 0.07421875, + "learning_rate": 1.9209999999999998e-07, + "loss": 0.0011, + "ppl": 0.0086669921875, + "reward": 0.9973082542419434, + "reward_std": 0.0002270136756123975, + "rewards/perpo_ocr_edit_distance_reward": 0.9973083734512329, + "step": 3079, + "temperature": 0.9 + }, + { + "advantages": -0.0001112733589252457, + "completion_length": 599.0, + "delta_ref_entropy_loss": 0.04052734375, + "delta_ref_ppl": -0.0546875, + "entropy_loss": -0.031982421875, + "epoch": 0.616, + "grad_norm": 0.3161886307483224, + "k1_kl": 0.054931640625, + "k3_kl": 0.03271484375, + "kimi_kl": 0.09130859375, + "learning_rate": 1.92e-07, + "loss": 0.0014, + "ppl": 0.0103759765625, + "reward": 0.9976420998573303, + "reward_std": 0.00051209976663813, + "rewards/perpo_ocr_edit_distance_reward": 0.9976421594619751, + "step": 3080, + "temperature": 0.9 + }, + { + "advantages": -1.7370497289448394e-06, + "completion_length": 471.0, + "delta_ref_entropy_loss": -0.111328125, + "delta_ref_ppl": -0.11328125, + "entropy_loss": -0.44140625, + "epoch": 0.6162, + "grad_norm": 2.438649526142226, + "k1_kl": 0.11376953125, + "k3_kl": 0.109375, + "kimi_kl": 0.275390625, + "learning_rate": 1.919e-07, + "loss": 0.0044, + "ppl": 0.2021484375, + "reward": 0.6668271422386169, + "reward_std": 0.058614786714315414, + "rewards/perpo_ocr_edit_distance_reward": 0.6668272614479065, + "step": 3081, + "temperature": 0.9 + }, + { + "advantages": 7.552760507678613e-06, + "completion_length": 1999.0, + "delta_ref_entropy_loss": -0.00020885467529296875, + "delta_ref_ppl": -0.0233154296875, + "entropy_loss": -0.08154296875, + "epoch": 0.6164, + "grad_norm": 8.431623410115591, + "k1_kl": 0.0233154296875, + "k3_kl": 0.018798828125, + "kimi_kl": 0.042724609375, + "learning_rate": 1.9179999999999998e-07, + "loss": 0.0007, + "ppl": 0.0390625, + "reward": 0.9729310274124146, + "reward_std": 0.003284490667283535, + "rewards/perpo_ocr_edit_distance_reward": 0.9729310870170593, + "step": 3082, + "temperature": 0.9 + }, + { + "advantages": 3.4059798537100505e-08, + "completion_length": 382.0, + "delta_ref_entropy_loss": 0.0419921875, + "delta_ref_ppl": -0.115234375, + "entropy_loss": -0.08935546875, + "epoch": 0.6166, + "grad_norm": 0.7828809721156537, + "k1_kl": 0.115234375, + "k3_kl": 0.07763671875, + "kimi_kl": 0.2578125, + "learning_rate": 1.917e-07, + "loss": 0.0031, + "ppl": 0.033203125, + "reward": 0.6644364595413208, + "reward_std": 0.001025959150865674, + "rewards/perpo_ocr_edit_distance_reward": 0.6644365191459656, + "step": 3083, + "temperature": 0.9 + }, + { + "advantages": -6.118843157310039e-05, + "completion_length": 787.0, + "delta_ref_entropy_loss": 0.046875, + "delta_ref_ppl": -0.0478515625, + "entropy_loss": -0.02783203125, + "epoch": 0.6168, + "grad_norm": 0.2531452919462967, + "k1_kl": 0.048095703125, + "k3_kl": 0.028564453125, + "kimi_kl": 0.08251953125, + "learning_rate": 1.916e-07, + "loss": 0.0012, + "ppl": 0.00640869140625, + "reward": 0.9908949136734009, + "reward_std": 0.0003173967415932566, + "rewards/perpo_ocr_edit_distance_reward": 0.9908949732780457, + "step": 3084, + "temperature": 0.9 + }, + { + "advantages": -1.1929444553970825e-05, + "completion_length": 466.0, + "delta_ref_entropy_loss": 0.05712890625, + "delta_ref_ppl": -0.09912109375, + "entropy_loss": -0.06298828125, + "epoch": 0.617, + "grad_norm": 0.6784532977712138, + "k1_kl": 0.09912109375, + "k3_kl": 0.061767578125, + "kimi_kl": 0.158203125, + "learning_rate": 1.915e-07, + "loss": 0.0025, + "ppl": 0.0245361328125, + "reward": 0.9809591770172119, + "reward_std": 0.0006129046087153256, + "rewards/perpo_ocr_edit_distance_reward": 0.9809591770172119, + "step": 3085, + "temperature": 0.9 + }, + { + "advantages": -1.6621181202935986e-05, + "completion_length": 234.0, + "delta_ref_entropy_loss": 0.048828125, + "delta_ref_ppl": -0.1103515625, + "entropy_loss": -0.06494140625, + "epoch": 0.6172, + "grad_norm": 1.2121291535570893, + "k1_kl": 0.1103515625, + "k3_kl": 0.078125, + "kimi_kl": 0.2412109375, + "learning_rate": 1.9139999999999998e-07, + "loss": 0.0031, + "ppl": 0.0235595703125, + "reward": 0.947396457195282, + "reward_std": 0.003484951565042138, + "rewards/perpo_ocr_edit_distance_reward": 0.9473965764045715, + "step": 3086, + "temperature": 0.9 + }, + { + "advantages": -0.00021065732289571315, + "completion_length": 748.0, + "delta_ref_entropy_loss": 0.059814453125, + "delta_ref_ppl": -0.07763671875, + "entropy_loss": -0.032470703125, + "epoch": 0.6174, + "grad_norm": 0.18735707992098996, + "k1_kl": 0.078125, + "k3_kl": 0.045166015625, + "kimi_kl": 0.138671875, + "learning_rate": 1.913e-07, + "loss": 0.002, + "ppl": 0.00872802734375, + "reward": 0.9955639243125916, + "reward_std": 0.00018293820903636515, + "rewards/perpo_ocr_edit_distance_reward": 0.9955640435218811, + "step": 3087, + "temperature": 0.9 + }, + { + "advantages": -6.914139248692663e-06, + "completion_length": 162.0, + "delta_ref_entropy_loss": 0.0419921875, + "delta_ref_ppl": -0.1171875, + "entropy_loss": -0.09716796875, + "epoch": 0.6176, + "grad_norm": 1.731877726815646, + "k1_kl": 0.11669921875, + "k3_kl": 0.080078125, + "kimi_kl": 0.234375, + "learning_rate": 1.912e-07, + "loss": 0.0032, + "ppl": 0.04150390625, + "reward": 0.9805929660797119, + "reward_std": 0.002365212654694915, + "rewards/perpo_ocr_edit_distance_reward": 0.9805930256843567, + "step": 3088, + "temperature": 0.9 + }, + { + "advantages": 8.242471267294604e-06, + "completion_length": 465.0, + "delta_ref_entropy_loss": 0.0286865234375, + "delta_ref_ppl": -0.0751953125, + "entropy_loss": -0.0693359375, + "epoch": 0.6178, + "grad_norm": 0.51282670583464, + "k1_kl": 0.0751953125, + "k3_kl": 0.049072265625, + "kimi_kl": 0.1513671875, + "learning_rate": 1.9109999999999998e-07, + "loss": 0.002, + "ppl": 0.026123046875, + "reward": 0.8770496249198914, + "reward_std": 0.0009328349260613322, + "rewards/perpo_ocr_edit_distance_reward": 0.8770496249198914, + "step": 3089, + "temperature": 0.9 + }, + { + "advantages": -7.615771028213203e-05, + "completion_length": 156.0, + "delta_ref_entropy_loss": 0.0262451171875, + "delta_ref_ppl": -0.1044921875, + "entropy_loss": -0.055908203125, + "epoch": 0.618, + "grad_norm": 0.9072937871761527, + "k1_kl": 0.1044921875, + "k3_kl": 0.07421875, + "kimi_kl": 0.2255859375, + "learning_rate": 1.91e-07, + "loss": 0.003, + "ppl": 0.0185546875, + "reward": 0.9864135980606079, + "reward_std": 0.0006824368028901517, + "rewards/perpo_ocr_edit_distance_reward": 0.9864136576652527, + "step": 3090, + "temperature": 0.9 + }, + { + "advantages": -4.0880273445509374e-05, + "completion_length": 1111.0, + "delta_ref_entropy_loss": 0.0162353515625, + "delta_ref_ppl": -0.03564453125, + "entropy_loss": -0.060546875, + "epoch": 0.6182, + "grad_norm": 0.5500543139096417, + "k1_kl": 0.035400390625, + "k3_kl": 0.0235595703125, + "kimi_kl": 0.061767578125, + "learning_rate": 1.9089999999999997e-07, + "loss": 0.001, + "ppl": 0.026611328125, + "reward": 0.9867404103279114, + "reward_std": 0.0009414033265784383, + "rewards/perpo_ocr_edit_distance_reward": 0.9867404699325562, + "step": 3091, + "temperature": 0.9 + }, + { + "advantages": -4.294940663385205e-05, + "completion_length": 330.0, + "delta_ref_entropy_loss": 0.052490234375, + "delta_ref_ppl": -0.053466796875, + "entropy_loss": -0.033203125, + "epoch": 0.6184, + "grad_norm": 0.4754429075113071, + "k1_kl": 0.053466796875, + "k3_kl": 0.0279541015625, + "kimi_kl": 0.052001953125, + "learning_rate": 1.908e-07, + "loss": 0.0012, + "ppl": 0.01226806640625, + "reward": 0.9989768862724304, + "reward_std": 0.0004951484734192491, + "rewards/perpo_ocr_edit_distance_reward": 0.9989768862724304, + "step": 3092, + "temperature": 0.9 + }, + { + "advantages": -0.00015432495274581015, + "completion_length": 404.0, + "delta_ref_entropy_loss": 0.051025390625, + "delta_ref_ppl": -0.06884765625, + "entropy_loss": -0.0299072265625, + "epoch": 0.6186, + "grad_norm": 0.2791105717123056, + "k1_kl": 0.06884765625, + "k3_kl": 0.04248046875, + "kimi_kl": 0.134765625, + "learning_rate": 1.907e-07, + "loss": 0.0019, + "ppl": 0.01043701171875, + "reward": 0.9911642074584961, + "reward_std": 0.0003414125822018832, + "rewards/perpo_ocr_edit_distance_reward": 0.9911642670631409, + "step": 3093, + "temperature": 0.9 + }, + { + "advantages": -6.454331742133945e-05, + "completion_length": 1210.0, + "delta_ref_entropy_loss": 0.0172119140625, + "delta_ref_ppl": -0.03955078125, + "entropy_loss": -0.044189453125, + "epoch": 0.6188, + "grad_norm": 0.5163339773800356, + "k1_kl": 0.03955078125, + "k3_kl": 0.0252685546875, + "kimi_kl": 0.0712890625, + "learning_rate": 1.9059999999999997e-07, + "loss": 0.0011, + "ppl": 0.0184326171875, + "reward": 0.9854528307914734, + "reward_std": 0.000559658685233444, + "rewards/perpo_ocr_edit_distance_reward": 0.9854528903961182, + "step": 3094, + "temperature": 0.9 + }, + { + "advantages": -4.748787250719033e-05, + "completion_length": 762.0, + "delta_ref_entropy_loss": 0.0255126953125, + "delta_ref_ppl": -0.0380859375, + "entropy_loss": -0.03271484375, + "epoch": 0.619, + "grad_norm": 0.23303256453517912, + "k1_kl": 0.0380859375, + "k3_kl": 0.0250244140625, + "kimi_kl": 0.051513671875, + "learning_rate": 1.905e-07, + "loss": 0.001, + "ppl": 0.00982666015625, + "reward": 0.9939780235290527, + "reward_std": 0.0006170585984364152, + "rewards/perpo_ocr_edit_distance_reward": 0.9939781427383423, + "step": 3095, + "temperature": 0.9 + }, + { + "advantages": 3.4059798537100505e-08, + "completion_length": 113.0, + "delta_ref_entropy_loss": -0.76171875, + "delta_ref_ppl": -0.96875, + "entropy_loss": -1.890625, + "epoch": 0.6192, + "grad_norm": 33.385919597286524, + "k1_kl": 0.97265625, + "k3_kl": 1.234375, + "kimi_kl": 5.84375, + "learning_rate": 1.904e-07, + "loss": 0.0493, + "ppl": 0.91015625, + "reward": 0.11920849233865738, + "reward_std": 0.04859521985054016, + "rewards/perpo_ocr_edit_distance_reward": 0.11920849233865738, + "step": 3096, + "temperature": 0.9 + }, + { + "advantages": -4.855224324273877e-05, + "completion_length": 635.0, + "delta_ref_entropy_loss": 0.04736328125, + "delta_ref_ppl": -0.07080078125, + "entropy_loss": -0.043701171875, + "epoch": 0.6194, + "grad_norm": 0.517648447794618, + "k1_kl": 0.07080078125, + "k3_kl": 0.04150390625, + "kimi_kl": 0.1298828125, + "learning_rate": 1.9029999999999998e-07, + "loss": 0.0017, + "ppl": 0.015869140625, + "reward": 0.9975695610046387, + "reward_std": 0.0007770240190438926, + "rewards/perpo_ocr_edit_distance_reward": 0.9975696206092834, + "step": 3097, + "temperature": 0.9 + }, + { + "advantages": -0.00013084072270430624, + "completion_length": 976.0, + "delta_ref_entropy_loss": 0.049072265625, + "delta_ref_ppl": -0.043701171875, + "entropy_loss": -0.047119140625, + "epoch": 0.6196, + "grad_norm": 0.5172439402077854, + "k1_kl": 0.0439453125, + "k3_kl": 0.02197265625, + "kimi_kl": 0.057861328125, + "learning_rate": 1.902e-07, + "loss": 0.001, + "ppl": 0.019287109375, + "reward": 0.997963011264801, + "reward_std": 0.0004206307348795235, + "rewards/perpo_ocr_edit_distance_reward": 0.9979630708694458, + "step": 3098, + "temperature": 0.9 + }, + { + "advantages": 1.7029898913278885e-07, + "completion_length": 789.0, + "delta_ref_entropy_loss": 0.0654296875, + "delta_ref_ppl": -0.1103515625, + "entropy_loss": -0.1416015625, + "epoch": 0.6198, + "grad_norm": 4.271569844072087, + "k1_kl": 0.10986328125, + "k3_kl": 0.0771484375, + "kimi_kl": 0.220703125, + "learning_rate": 1.901e-07, + "loss": 0.0031, + "ppl": 0.07373046875, + "reward": 0.5189929008483887, + "reward_std": 0.07801329344511032, + "rewards/perpo_ocr_edit_distance_reward": 0.5189929008483887, + "step": 3099, + "temperature": 0.9 + }, + { + "advantages": -0.00013024467625655234, + "completion_length": 712.0, + "delta_ref_entropy_loss": 0.0458984375, + "delta_ref_ppl": -0.07373046875, + "entropy_loss": -0.061279296875, + "epoch": 0.62, + "grad_norm": 0.4289759764287324, + "k1_kl": 0.07373046875, + "k3_kl": 0.04736328125, + "kimi_kl": 0.1396484375, + "learning_rate": 1.8999999999999998e-07, + "loss": 0.002, + "ppl": 0.0250244140625, + "reward": 0.9929500818252563, + "reward_std": 0.0006191028514876962, + "rewards/perpo_ocr_edit_distance_reward": 0.9929501414299011, + "step": 3100, + "temperature": 0.9 + }, + { + "advantages": -4.3971198465442285e-05, + "completion_length": 856.0, + "delta_ref_entropy_loss": 0.04638671875, + "delta_ref_ppl": -0.0693359375, + "entropy_loss": -0.078125, + "epoch": 0.6202, + "grad_norm": 1.0113260399806738, + "k1_kl": 0.0693359375, + "k3_kl": 0.04296875, + "kimi_kl": 0.08837890625, + "learning_rate": 1.899e-07, + "loss": 0.0018, + "ppl": 0.03759765625, + "reward": 0.9753222465515137, + "reward_std": 0.0012551848776638508, + "rewards/perpo_ocr_edit_distance_reward": 0.9753223061561584, + "step": 3101, + "temperature": 0.9 + }, + { + "advantages": -8.089202196970291e-07, + "completion_length": 588.0, + "delta_ref_entropy_loss": 0.027099609375, + "delta_ref_ppl": -0.08056640625, + "entropy_loss": -0.08251953125, + "epoch": 0.6204, + "grad_norm": 7.105713831002217, + "k1_kl": 0.08056640625, + "k3_kl": 0.111328125, + "kimi_kl": 0.19921875, + "learning_rate": 1.898e-07, + "loss": 0.0045, + "ppl": 0.03662109375, + "reward": 0.2740015685558319, + "reward_std": 0.03458355739712715, + "rewards/perpo_ocr_edit_distance_reward": 0.2740015983581543, + "step": 3102, + "temperature": 0.9 + }, + { + "advantages": -0.00015597684250678867, + "completion_length": 441.0, + "delta_ref_entropy_loss": 0.06298828125, + "delta_ref_ppl": -0.10009765625, + "entropy_loss": -0.040283203125, + "epoch": 0.6206, + "grad_norm": 0.2610804503854609, + "k1_kl": 0.099609375, + "k3_kl": 0.06494140625, + "kimi_kl": 0.205078125, + "learning_rate": 1.897e-07, + "loss": 0.0027, + "ppl": 0.012939453125, + "reward": 0.997505784034729, + "reward_std": 0.00022754223027732223, + "rewards/perpo_ocr_edit_distance_reward": 0.9975058436393738, + "step": 3103, + "temperature": 0.9 + }, + { + "advantages": -4.499299393501133e-05, + "completion_length": 545.0, + "delta_ref_entropy_loss": 0.09423828125, + "delta_ref_ppl": -0.1220703125, + "entropy_loss": -0.0693359375, + "epoch": 0.6208, + "grad_norm": 0.6791160058166484, + "k1_kl": 0.1220703125, + "k3_kl": 0.07373046875, + "kimi_kl": 0.24609375, + "learning_rate": 1.8959999999999998e-07, + "loss": 0.003, + "ppl": 0.026123046875, + "reward": 0.9684569239616394, + "reward_std": 0.0008464140119031072, + "rewards/perpo_ocr_edit_distance_reward": 0.968457043170929, + "step": 3104, + "temperature": 0.9 + }, + { + "advantages": -2.1389552784967236e-05, + "completion_length": 768.0, + "delta_ref_entropy_loss": 0.06689453125, + "delta_ref_ppl": -0.061279296875, + "entropy_loss": -0.07373046875, + "epoch": 0.621, + "grad_norm": 1.353130566644689, + "k1_kl": 0.061279296875, + "k3_kl": 0.032958984375, + "kimi_kl": 0.07275390625, + "learning_rate": 1.895e-07, + "loss": 0.0013, + "ppl": 0.0322265625, + "reward": 0.8522258996963501, + "reward_std": 0.001094589359126985, + "rewards/perpo_ocr_edit_distance_reward": 0.8522260189056396, + "step": 3105, + "temperature": 0.9 + }, + { + "advantages": -4.915680256090127e-05, + "completion_length": 216.0, + "delta_ref_entropy_loss": 0.08349609375, + "delta_ref_ppl": -0.1484375, + "entropy_loss": -0.05712890625, + "epoch": 0.6212, + "grad_norm": 1.1781151938569294, + "k1_kl": 0.1484375, + "k3_kl": 0.09912109375, + "kimi_kl": 0.291015625, + "learning_rate": 1.8940000000000002e-07, + "loss": 0.004, + "ppl": 0.0245361328125, + "reward": 0.9808077812194824, + "reward_std": 0.0016323230229318142, + "rewards/perpo_ocr_edit_distance_reward": 0.9808078408241272, + "step": 3106, + "temperature": 0.9 + }, + { + "advantages": -3.1505312563240295e-06, + "completion_length": 304.0, + "delta_ref_entropy_loss": 0.058837890625, + "delta_ref_ppl": -0.109375, + "entropy_loss": -0.0751953125, + "epoch": 0.6214, + "grad_norm": 1.3698309684249779, + "k1_kl": 0.109375, + "k3_kl": 0.07177734375, + "kimi_kl": 0.1787109375, + "learning_rate": 1.8929999999999998e-07, + "loss": 0.0029, + "ppl": 0.03759765625, + "reward": 0.9726357460021973, + "reward_std": 0.0188544113188982, + "rewards/perpo_ocr_edit_distance_reward": 0.9726358652114868, + "step": 3107, + "temperature": 0.9 + }, + { + "advantages": -1.6246523955487646e-05, + "completion_length": 653.0, + "delta_ref_entropy_loss": 0.0225830078125, + "delta_ref_ppl": -0.050537109375, + "entropy_loss": -0.052001953125, + "epoch": 0.6216, + "grad_norm": 0.856500700450923, + "k1_kl": 0.05078125, + "k3_kl": 0.033447265625, + "kimi_kl": 0.0859375, + "learning_rate": 1.892e-07, + "loss": 0.0014, + "ppl": 0.0255126953125, + "reward": 0.9851702451705933, + "reward_std": 0.0004238509282004088, + "rewards/perpo_ocr_edit_distance_reward": 0.9851702451705933, + "step": 3108, + "temperature": 0.9 + }, + { + "advantages": 5.828483153891284e-06, + "completion_length": 1565.0, + "delta_ref_entropy_loss": 0.0272216796875, + "delta_ref_ppl": -0.044677734375, + "entropy_loss": -0.08642578125, + "epoch": 0.6218, + "grad_norm": 176.1373201534853, + "k1_kl": 0.044677734375, + "k3_kl": 0.306640625, + "kimi_kl": 0.1435546875, + "learning_rate": 1.891e-07, + "loss": 0.0123, + "ppl": 0.050048828125, + "reward": 0.9822039008140564, + "reward_std": 0.0013611220056191087, + "rewards/perpo_ocr_edit_distance_reward": 0.9822038412094116, + "step": 3109, + "temperature": 0.9 + }, + { + "advantages": -6.597382889594883e-05, + "completion_length": 943.0, + "delta_ref_entropy_loss": 0.01544189453125, + "delta_ref_ppl": -0.0240478515625, + "entropy_loss": -0.028076171875, + "epoch": 0.622, + "grad_norm": 0.28991322769014616, + "k1_kl": 0.0240478515625, + "k3_kl": 0.01397705078125, + "kimi_kl": 0.033447265625, + "learning_rate": 1.8899999999999999e-07, + "loss": 0.0006, + "ppl": 0.00982666015625, + "reward": 0.9924747347831726, + "reward_std": 0.000544946116860956, + "rewards/perpo_ocr_edit_distance_reward": 0.9924747943878174, + "step": 3110, + "temperature": 0.9 + }, + { + "advantages": 4.427773774295929e-07, + "completion_length": 322.0, + "delta_ref_entropy_loss": 0.0267333984375, + "delta_ref_ppl": -0.140625, + "entropy_loss": -0.1064453125, + "epoch": 0.6222, + "grad_norm": 1.367396655989267, + "k1_kl": 0.140625, + "k3_kl": 0.1025390625, + "kimi_kl": 0.3984375, + "learning_rate": 1.889e-07, + "loss": 0.0041, + "ppl": 0.037841796875, + "reward": 0.8473633527755737, + "reward_std": 0.03450311720371246, + "rewards/perpo_ocr_edit_distance_reward": 0.8473633527755737, + "step": 3111, + "temperature": 0.9 + }, + { + "advantages": -3.065381974920456e-07, + "completion_length": 153.0, + "delta_ref_entropy_loss": 0.00055694580078125, + "delta_ref_ppl": -0.3125, + "entropy_loss": -0.451171875, + "epoch": 0.6224, + "grad_norm": 3.726862558450848, + "k1_kl": 0.314453125, + "k3_kl": 0.2392578125, + "kimi_kl": 0.8671875, + "learning_rate": 1.888e-07, + "loss": 0.0096, + "ppl": 0.162109375, + "reward": 0.3645263612270355, + "reward_std": 0.06617094576358795, + "rewards/perpo_ocr_edit_distance_reward": 0.3645263910293579, + "step": 3112, + "temperature": 0.9 + }, + { + "advantages": -0.00011054533388232812, + "completion_length": 687.0, + "delta_ref_entropy_loss": 0.038818359375, + "delta_ref_ppl": -0.07470703125, + "entropy_loss": -0.041748046875, + "epoch": 0.6226, + "grad_norm": 0.3423196712483351, + "k1_kl": 0.07470703125, + "k3_kl": 0.049560546875, + "kimi_kl": 0.1640625, + "learning_rate": 1.887e-07, + "loss": 0.0021, + "ppl": 0.01708984375, + "reward": 0.9970900416374207, + "reward_std": 0.00028516966267488897, + "rewards/perpo_ocr_edit_distance_reward": 0.9970901012420654, + "step": 3113, + "temperature": 0.9 + }, + { + "advantages": -7.551057206001133e-05, + "completion_length": 797.0, + "delta_ref_entropy_loss": 0.06494140625, + "delta_ref_ppl": -0.080078125, + "entropy_loss": -0.1396484375, + "epoch": 0.6228, + "grad_norm": 1.1150618164594035, + "k1_kl": 0.080078125, + "k3_kl": 0.0439453125, + "kimi_kl": 0.0849609375, + "learning_rate": 1.8859999999999998e-07, + "loss": 0.0018, + "ppl": 0.06005859375, + "reward": 0.9448309540748596, + "reward_std": 0.0011406663106754422, + "rewards/perpo_ocr_edit_distance_reward": 0.9448310732841492, + "step": 3114, + "temperature": 0.9 + }, + { + "advantages": -5.2911898819729686e-05, + "completion_length": 724.0, + "delta_ref_entropy_loss": 0.058349609375, + "delta_ref_ppl": -0.0791015625, + "entropy_loss": -0.1005859375, + "epoch": 0.623, + "grad_norm": 0.7699105440782985, + "k1_kl": 0.0791015625, + "k3_kl": 0.043212890625, + "kimi_kl": 0.083984375, + "learning_rate": 1.885e-07, + "loss": 0.0018, + "ppl": 0.039306640625, + "reward": 0.9766895174980164, + "reward_std": 0.0015092807589098811, + "rewards/perpo_ocr_edit_distance_reward": 0.9766896367073059, + "step": 3115, + "temperature": 0.9 + }, + { + "advantages": -6.205695535754785e-05, + "completion_length": 894.0, + "delta_ref_entropy_loss": 0.0291748046875, + "delta_ref_ppl": -0.0341796875, + "entropy_loss": -0.047119140625, + "epoch": 0.6232, + "grad_norm": 0.5860573232391667, + "k1_kl": 0.0341796875, + "k3_kl": 0.0172119140625, + "kimi_kl": 0.038330078125, + "learning_rate": 1.884e-07, + "loss": 0.0008, + "ppl": 0.0159912109375, + "reward": 0.986665666103363, + "reward_std": 0.0009975893190130591, + "rewards/perpo_ocr_edit_distance_reward": 0.9866657257080078, + "step": 3116, + "temperature": 0.9 + }, + { + "advantages": -2.130440407199785e-05, + "completion_length": 210.0, + "delta_ref_entropy_loss": 0.060791015625, + "delta_ref_ppl": -0.11328125, + "entropy_loss": -0.08154296875, + "epoch": 0.6234, + "grad_norm": 0.9764914606750532, + "k1_kl": 0.11328125, + "k3_kl": 0.076171875, + "kimi_kl": 0.2060546875, + "learning_rate": 1.8829999999999999e-07, + "loss": 0.0031, + "ppl": 0.03125, + "reward": 0.980373203754425, + "reward_std": 0.0010989845031872392, + "rewards/perpo_ocr_edit_distance_reward": 0.9803732633590698, + "step": 3117, + "temperature": 0.9 + }, + { + "advantages": -2.4574144845246337e-05, + "completion_length": 821.0, + "delta_ref_entropy_loss": 0.05322265625, + "delta_ref_ppl": -0.05712890625, + "entropy_loss": -0.0419921875, + "epoch": 0.6236, + "grad_norm": 1.1541325112922363, + "k1_kl": 0.056884765625, + "k3_kl": 0.028564453125, + "kimi_kl": 0.057373046875, + "learning_rate": 1.882e-07, + "loss": 0.0012, + "ppl": 0.018310546875, + "reward": 0.9926053881645203, + "reward_std": 0.0002464933495502919, + "rewards/perpo_ocr_edit_distance_reward": 0.9926053881645203, + "step": 3118, + "temperature": 0.9 + }, + { + "advantages": -9.025846520671621e-05, + "completion_length": 626.0, + "delta_ref_entropy_loss": 0.034912109375, + "delta_ref_ppl": -0.0400390625, + "entropy_loss": -0.028564453125, + "epoch": 0.6238, + "grad_norm": 0.08793129500124794, + "k1_kl": 0.0400390625, + "k3_kl": 0.0198974609375, + "kimi_kl": 0.041259765625, + "learning_rate": 1.8809999999999997e-07, + "loss": 0.0009, + "ppl": 0.00732421875, + "reward": 0.9987679719924927, + "reward_std": 0.0001829767134040594, + "rewards/perpo_ocr_edit_distance_reward": 0.9987680315971375, + "step": 3119, + "temperature": 0.9 + }, + { + "advantages": -0.00023482527467422187, + "completion_length": 378.0, + "delta_ref_entropy_loss": 0.023681640625, + "delta_ref_ppl": -0.0439453125, + "entropy_loss": -0.026123046875, + "epoch": 0.624, + "grad_norm": 0.38099620069615026, + "k1_kl": 0.0439453125, + "k3_kl": 0.0299072265625, + "kimi_kl": 0.10009765625, + "learning_rate": 1.88e-07, + "loss": 0.0014, + "ppl": 0.0089111328125, + "reward": 0.9629400968551636, + "reward_std": 0.00037139543564990163, + "rewards/perpo_ocr_edit_distance_reward": 0.9629402756690979, + "step": 3120, + "temperature": 0.9 + }, + { + "advantages": -1.5667507113903412e-06, + "completion_length": 334.0, + "delta_ref_entropy_loss": 0.036865234375, + "delta_ref_ppl": -0.15234375, + "entropy_loss": -0.1845703125, + "epoch": 0.6242, + "grad_norm": 1.7119722979392966, + "k1_kl": 0.15234375, + "k3_kl": 0.12060546875, + "kimi_kl": 0.48828125, + "learning_rate": 1.879e-07, + "loss": 0.0048, + "ppl": 0.0791015625, + "reward": 0.9386968612670898, + "reward_std": 0.016226718202233315, + "rewards/perpo_ocr_edit_distance_reward": 0.9386969208717346, + "step": 3121, + "temperature": 0.9 + }, + { + "advantages": -4.1059087379835546e-05, + "completion_length": 644.0, + "delta_ref_entropy_loss": 0.045654296875, + "delta_ref_ppl": -0.046630859375, + "entropy_loss": -0.042236328125, + "epoch": 0.6244, + "grad_norm": 0.29067317816195726, + "k1_kl": 0.046630859375, + "k3_kl": 0.0228271484375, + "kimi_kl": 0.06005859375, + "learning_rate": 1.8779999999999997e-07, + "loss": 0.001, + "ppl": 0.0130615234375, + "reward": 0.9960154294967651, + "reward_std": 0.0007290140492841601, + "rewards/perpo_ocr_edit_distance_reward": 0.9960154294967651, + "step": 3122, + "temperature": 0.9 + }, + { + "advantages": 1.5156609833866241e-06, + "completion_length": 843.0, + "delta_ref_entropy_loss": 0.07080078125, + "delta_ref_ppl": -0.115234375, + "entropy_loss": -0.359375, + "epoch": 0.6246, + "grad_norm": 1.981247518766871, + "k1_kl": 0.115234375, + "k3_kl": 0.08154296875, + "kimi_kl": 0.2041015625, + "learning_rate": 1.877e-07, + "loss": 0.0033, + "ppl": 0.1875, + "reward": 0.6189984679222107, + "reward_std": 0.005648460704833269, + "rewards/perpo_ocr_edit_distance_reward": 0.6189984679222107, + "step": 3123, + "temperature": 0.9 + }, + { + "advantages": -0.00010042531357612461, + "completion_length": 621.0, + "delta_ref_entropy_loss": 0.03857421875, + "delta_ref_ppl": -0.05078125, + "entropy_loss": -0.0322265625, + "epoch": 0.6248, + "grad_norm": 0.5613707003923242, + "k1_kl": 0.05078125, + "k3_kl": 0.0299072265625, + "kimi_kl": 0.09814453125, + "learning_rate": 1.8759999999999999e-07, + "loss": 0.0013, + "ppl": 0.01153564453125, + "reward": 0.9957532286643982, + "reward_std": 0.0008326456882059574, + "rewards/perpo_ocr_edit_distance_reward": 0.995753288269043, + "step": 3124, + "temperature": 0.9 + }, + { + "advantages": -5.7816509979602415e-06, + "completion_length": 503.0, + "delta_ref_entropy_loss": 0.072265625, + "delta_ref_ppl": -0.0830078125, + "entropy_loss": -0.1708984375, + "epoch": 0.625, + "grad_norm": 1.6347654269835932, + "k1_kl": 0.0830078125, + "k3_kl": 0.05419921875, + "kimi_kl": 0.126953125, + "learning_rate": 1.875e-07, + "loss": 0.0022, + "ppl": 0.0859375, + "reward": 0.7746954560279846, + "reward_std": 0.004319407977163792, + "rewards/perpo_ocr_edit_distance_reward": 0.7746955156326294, + "step": 3125, + "temperature": 0.9 + }, + { + "advantages": -2.8031214242218994e-05, + "completion_length": 503.0, + "delta_ref_entropy_loss": 0.038330078125, + "delta_ref_ppl": -0.054931640625, + "entropy_loss": -0.04541015625, + "epoch": 0.6252, + "grad_norm": 0.36243849548509816, + "k1_kl": 0.0546875, + "k3_kl": 0.032958984375, + "kimi_kl": 0.09716796875, + "learning_rate": 1.874e-07, + "loss": 0.0013, + "ppl": 0.01287841796875, + "reward": 0.9951120018959045, + "reward_std": 0.0005075963563285768, + "rewards/perpo_ocr_edit_distance_reward": 0.9951121211051941, + "step": 3126, + "temperature": 0.9 + }, + { + "advantages": -2.2479466679214966e-06, + "completion_length": 1695.0, + "delta_ref_entropy_loss": 0.06396484375, + "delta_ref_ppl": -0.0947265625, + "entropy_loss": -0.212890625, + "epoch": 0.6254, + "grad_norm": 17.470956197471075, + "k1_kl": 0.0947265625, + "k3_kl": 0.0908203125, + "kimi_kl": 0.2099609375, + "learning_rate": 1.873e-07, + "loss": 0.0036, + "ppl": 0.10400390625, + "reward": 0.9002105593681335, + "reward_std": 0.007498243357986212, + "rewards/perpo_ocr_edit_distance_reward": 0.9002105593681335, + "step": 3127, + "temperature": 0.9 + }, + { + "advantages": -8.344650836988876e-07, + "completion_length": 589.0, + "delta_ref_entropy_loss": 0.058349609375, + "delta_ref_ppl": -0.11376953125, + "entropy_loss": -0.263671875, + "epoch": 0.6256, + "grad_norm": 1.7051891249653726, + "k1_kl": 0.11376953125, + "k3_kl": 0.080078125, + "kimi_kl": 0.171875, + "learning_rate": 1.872e-07, + "loss": 0.0032, + "ppl": 0.130859375, + "reward": 0.8567682504653931, + "reward_std": 0.020428435876965523, + "rewards/perpo_ocr_edit_distance_reward": 0.8567682504653931, + "step": 3128, + "temperature": 0.9 + }, + { + "advantages": -0.0002858809311874211, + "completion_length": 687.0, + "delta_ref_entropy_loss": 0.033203125, + "delta_ref_ppl": -0.03662109375, + "entropy_loss": -0.0235595703125, + "epoch": 0.6258, + "grad_norm": 0.25905494085252295, + "k1_kl": 0.03662109375, + "k3_kl": 0.021728515625, + "kimi_kl": 0.05859375, + "learning_rate": 1.8709999999999997e-07, + "loss": 0.0012, + "ppl": 0.00689697265625, + "reward": 0.9968805909156799, + "reward_std": 0.00019785073527600616, + "rewards/perpo_ocr_edit_distance_reward": 0.9968807101249695, + "step": 3129, + "temperature": 0.9 + }, + { + "advantages": -0.0003391674836166203, + "completion_length": 639.0, + "delta_ref_entropy_loss": 0.0308837890625, + "delta_ref_ppl": -0.044921875, + "entropy_loss": -0.03173828125, + "epoch": 0.626, + "grad_norm": 0.17154990979601015, + "k1_kl": 0.044921875, + "k3_kl": 0.025390625, + "kimi_kl": 0.0654296875, + "learning_rate": 1.87e-07, + "loss": 0.0014, + "ppl": 0.00994873046875, + "reward": 0.9968923926353455, + "reward_std": 0.0001761482999427244, + "rewards/perpo_ocr_edit_distance_reward": 0.9968924522399902, + "step": 3130, + "temperature": 0.9 + }, + { + "advantages": -6.1563087001559325e-06, + "completion_length": 672.0, + "delta_ref_entropy_loss": 0.06884765625, + "delta_ref_ppl": -0.09521484375, + "entropy_loss": -0.17578125, + "epoch": 0.6262, + "grad_norm": 1.5154356592557245, + "k1_kl": 0.095703125, + "k3_kl": 0.059326171875, + "kimi_kl": 0.1494140625, + "learning_rate": 1.869e-07, + "loss": 0.0024, + "ppl": 0.08154296875, + "reward": 0.8160496950149536, + "reward_std": 0.008226138539612293, + "rewards/perpo_ocr_edit_distance_reward": 0.8160498142242432, + "step": 3131, + "temperature": 0.9 + }, + { + "advantages": -6.42027180219884e-06, + "completion_length": 492.0, + "delta_ref_entropy_loss": 0.0791015625, + "delta_ref_ppl": -0.09375, + "entropy_loss": -0.1494140625, + "epoch": 0.6264, + "grad_norm": 1.455540747410472, + "k1_kl": 0.09423828125, + "k3_kl": 0.053955078125, + "kimi_kl": 0.11962890625, + "learning_rate": 1.8679999999999998e-07, + "loss": 0.0022, + "ppl": 0.06884765625, + "reward": 0.9537292718887329, + "reward_std": 0.006551661994308233, + "rewards/perpo_ocr_edit_distance_reward": 0.9537293314933777, + "step": 3132, + "temperature": 0.9 + }, + { + "advantages": -5.926404810452368e-06, + "completion_length": 620.0, + "delta_ref_entropy_loss": 0.00592041015625, + "delta_ref_ppl": -0.04833984375, + "entropy_loss": -0.0908203125, + "epoch": 0.6266, + "grad_norm": 0.5119012955414067, + "k1_kl": 0.048095703125, + "k3_kl": 0.03515625, + "kimi_kl": 0.126953125, + "learning_rate": 1.867e-07, + "loss": 0.0014, + "ppl": 0.0233154296875, + "reward": 0.9675247669219971, + "reward_std": 0.001335565815679729, + "rewards/perpo_ocr_edit_distance_reward": 0.9675248265266418, + "step": 3133, + "temperature": 0.9 + }, + { + "advantages": -1.8749919036054052e-05, + "completion_length": 858.0, + "delta_ref_entropy_loss": 0.06591796875, + "delta_ref_ppl": -0.08837890625, + "entropy_loss": -0.271484375, + "epoch": 0.6268, + "grad_norm": 1.5213437228158078, + "k1_kl": 0.08837890625, + "k3_kl": 0.0625, + "kimi_kl": 0.126953125, + "learning_rate": 1.866e-07, + "loss": 0.0025, + "ppl": 0.1357421875, + "reward": 0.8994945287704468, + "reward_std": 0.004902387969195843, + "rewards/perpo_ocr_edit_distance_reward": 0.8994946479797363, + "step": 3134, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 442.0, + "delta_ref_entropy_loss": -0.01556396484375, + "delta_ref_ppl": -0.06884765625, + "entropy_loss": -0.162109375, + "epoch": 0.627, + "grad_norm": 1.028888294409679, + "k1_kl": 0.06884765625, + "k3_kl": 0.052734375, + "kimi_kl": 0.1376953125, + "learning_rate": 1.8649999999999998e-07, + "loss": 0.0021, + "ppl": 0.058349609375, + "reward": 0.785141110420227, + "reward_std": 0.03870750218629837, + "rewards/perpo_ocr_edit_distance_reward": 0.7851411700248718, + "step": 3135, + "temperature": 0.9 + }, + { + "advantages": -1.2900148249173071e-05, + "completion_length": 269.0, + "delta_ref_entropy_loss": 0.07763671875, + "delta_ref_ppl": -0.1357421875, + "entropy_loss": -0.1279296875, + "epoch": 0.6272, + "grad_norm": 1.3183105406255535, + "k1_kl": 0.134765625, + "k3_kl": 0.095703125, + "kimi_kl": 0.30078125, + "learning_rate": 1.864e-07, + "loss": 0.0038, + "ppl": 0.053466796875, + "reward": 0.9486356377601624, + "reward_std": 0.0051831211894750595, + "rewards/perpo_ocr_edit_distance_reward": 0.9486356973648071, + "step": 3136, + "temperature": 0.9 + }, + { + "advantages": -4.9148289690492675e-05, + "completion_length": 536.0, + "delta_ref_entropy_loss": 0.04541015625, + "delta_ref_ppl": -0.10791015625, + "entropy_loss": -0.054931640625, + "epoch": 0.6274, + "grad_norm": 0.5147187905915984, + "k1_kl": 0.10791015625, + "k3_kl": 0.072265625, + "kimi_kl": 0.283203125, + "learning_rate": 1.863e-07, + "loss": 0.0029, + "ppl": 0.022216796875, + "reward": 0.9816531538963318, + "reward_std": 0.0007660578121431172, + "rewards/perpo_ocr_edit_distance_reward": 0.9816532135009766, + "step": 3137, + "temperature": 0.9 + }, + { + "advantages": -3.5268920328235254e-05, + "completion_length": 810.0, + "delta_ref_entropy_loss": 0.04248046875, + "delta_ref_ppl": -0.046630859375, + "entropy_loss": -0.04345703125, + "epoch": 0.6276, + "grad_norm": 0.45817000601753927, + "k1_kl": 0.046630859375, + "k3_kl": 0.0289306640625, + "kimi_kl": 0.0751953125, + "learning_rate": 1.8619999999999999e-07, + "loss": 0.0012, + "ppl": 0.016357421875, + "reward": 0.990283727645874, + "reward_std": 0.000624098174739629, + "rewards/perpo_ocr_edit_distance_reward": 0.9902837872505188, + "step": 3138, + "temperature": 0.9 + }, + { + "advantages": -1.8681799701880664e-05, + "completion_length": 208.0, + "delta_ref_entropy_loss": 0.0162353515625, + "delta_ref_ppl": -0.1552734375, + "entropy_loss": -0.1689453125, + "epoch": 0.6278, + "grad_norm": 1.7637071834849556, + "k1_kl": 0.1552734375, + "k3_kl": 0.10595703125, + "kimi_kl": 0.314453125, + "learning_rate": 1.8609999999999998e-07, + "loss": 0.0043, + "ppl": 0.06396484375, + "reward": 0.9646048545837402, + "reward_std": 0.003544826526194811, + "rewards/perpo_ocr_edit_distance_reward": 0.9646049737930298, + "step": 3139, + "temperature": 0.9 + }, + { + "advantages": -2.341611207157257e-07, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.0009765625, + "delta_ref_ppl": -0.05712890625, + "entropy_loss": -0.14453125, + "epoch": 0.628, + "grad_norm": 1.3456594973619254, + "k1_kl": 0.05712890625, + "k3_kl": 0.048583984375, + "kimi_kl": 0.1005859375, + "learning_rate": 1.86e-07, + "loss": 0.0019, + "ppl": 0.06689453125, + "reward": 0.4532660245895386, + "reward_std": 0.05678971856832504, + "rewards/perpo_ocr_edit_distance_reward": 0.4532660245895386, + "step": 3140, + "temperature": 0.9 + }, + { + "advantages": -1.6961779692792334e-05, + "completion_length": 365.0, + "delta_ref_entropy_loss": 0.0289306640625, + "delta_ref_ppl": -0.0810546875, + "entropy_loss": -0.051513671875, + "epoch": 0.6282, + "grad_norm": 0.768669214267872, + "k1_kl": 0.0810546875, + "k3_kl": 0.052734375, + "kimi_kl": 0.126953125, + "learning_rate": 1.859e-07, + "loss": 0.0021, + "ppl": 0.01495361328125, + "reward": 0.9817531108856201, + "reward_std": 0.00040232876199297607, + "rewards/perpo_ocr_edit_distance_reward": 0.9817532300949097, + "step": 3141, + "temperature": 0.9 + }, + { + "advantages": 4.598072791850427e-06, + "completion_length": 68.0, + "delta_ref_entropy_loss": -0.0257568359375, + "delta_ref_ppl": -0.419921875, + "entropy_loss": -0.1982421875, + "epoch": 0.6284, + "grad_norm": 4.799368835675703, + "k1_kl": 0.419921875, + "k3_kl": 0.337890625, + "kimi_kl": 1.625, + "learning_rate": 1.8579999999999998e-07, + "loss": 0.0135, + "ppl": 0.0908203125, + "reward": 0.9761075377464294, + "reward_std": 0.005482898559421301, + "rewards/perpo_ocr_edit_distance_reward": 0.9761074781417847, + "step": 3142, + "temperature": 0.9 + }, + { + "advantages": -3.395761814317666e-05, + "completion_length": 1607.0, + "delta_ref_entropy_loss": 0.01483154296875, + "delta_ref_ppl": -0.037109375, + "entropy_loss": -0.04736328125, + "epoch": 0.6286, + "grad_norm": 162.88404933323665, + "k1_kl": 0.037109375, + "k3_kl": 0.83984375, + "kimi_kl": 0.06298828125, + "learning_rate": 1.857e-07, + "loss": 0.0335, + "ppl": 0.021484375, + "reward": 0.9943508505821228, + "reward_std": 0.0009027839405462146, + "rewards/perpo_ocr_edit_distance_reward": 0.9943509697914124, + "step": 3143, + "temperature": 0.9 + }, + { + "advantages": -1.3623919414840202e-07, + "completion_length": 387.0, + "delta_ref_entropy_loss": -0.04443359375, + "delta_ref_ppl": -0.16015625, + "entropy_loss": -0.53125, + "epoch": 0.6288, + "grad_norm": 2.993199285419558, + "k1_kl": 0.16015625, + "k3_kl": 0.1279296875, + "kimi_kl": 0.33984375, + "learning_rate": 1.8559999999999997e-07, + "loss": 0.0051, + "ppl": 0.25390625, + "reward": 0.7745969891548157, + "reward_std": 0.11781632155179977, + "rewards/perpo_ocr_edit_distance_reward": 0.7745970487594604, + "step": 3144, + "temperature": 0.9 + }, + { + "advantages": -0.00010001660120906308, + "completion_length": 529.0, + "delta_ref_entropy_loss": 0.00139617919921875, + "delta_ref_ppl": -0.03271484375, + "entropy_loss": -0.0311279296875, + "epoch": 0.629, + "grad_norm": 0.20578717111526748, + "k1_kl": 0.03271484375, + "k3_kl": 0.0250244140625, + "kimi_kl": 0.0810546875, + "learning_rate": 1.855e-07, + "loss": 0.0011, + "ppl": 0.00933837890625, + "reward": 0.9958273768424988, + "reward_std": 0.00041086506098508835, + "rewards/perpo_ocr_edit_distance_reward": 0.9958274364471436, + "step": 3145, + "temperature": 0.9 + }, + { + "advantages": -1.1818749953818042e-05, + "completion_length": 362.0, + "delta_ref_entropy_loss": 0.06982421875, + "delta_ref_ppl": -0.12890625, + "entropy_loss": -0.0654296875, + "epoch": 0.6292, + "grad_norm": 1.0694763937033689, + "k1_kl": 0.1279296875, + "k3_kl": 0.0947265625, + "kimi_kl": 0.29296875, + "learning_rate": 1.854e-07, + "loss": 0.0038, + "ppl": 0.02880859375, + "reward": 0.982876181602478, + "reward_std": 0.0013375584967434406, + "rewards/perpo_ocr_edit_distance_reward": 0.982876181602478, + "step": 3146, + "temperature": 0.9 + }, + { + "advantages": 1.6433853033959167e-06, + "completion_length": 237.0, + "delta_ref_entropy_loss": 0.0439453125, + "delta_ref_ppl": -0.162109375, + "entropy_loss": -0.055419921875, + "epoch": 0.6294, + "grad_norm": 1.9368967308913323, + "k1_kl": 0.162109375, + "k3_kl": 0.12158203125, + "kimi_kl": 0.5, + "learning_rate": 1.8529999999999997e-07, + "loss": 0.0049, + "ppl": 0.0213623046875, + "reward": 0.9878138303756714, + "reward_std": 0.005084626842290163, + "rewards/perpo_ocr_edit_distance_reward": 0.9878138303756714, + "step": 3147, + "temperature": 0.9 + }, + { + "advantages": -5.0876824388979e-05, + "completion_length": 513.0, + "delta_ref_entropy_loss": 0.05615234375, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.1083984375, + "epoch": 0.6296, + "grad_norm": 1.690551636066666, + "k1_kl": 0.0703125, + "k3_kl": 0.040283203125, + "kimi_kl": 0.09033203125, + "learning_rate": 1.852e-07, + "loss": 0.0017, + "ppl": 0.0439453125, + "reward": 0.9441395998001099, + "reward_std": 0.0014064435381442308, + "rewards/perpo_ocr_edit_distance_reward": 0.9441396594047546, + "step": 3148, + "temperature": 0.9 + }, + { + "advantages": -5.9519497881410643e-05, + "completion_length": 640.0, + "delta_ref_entropy_loss": 0.04248046875, + "delta_ref_ppl": -0.0869140625, + "entropy_loss": -0.0615234375, + "epoch": 0.6298, + "grad_norm": 1.5233981194694521, + "k1_kl": 0.08740234375, + "k3_kl": 0.056640625, + "kimi_kl": 0.150390625, + "learning_rate": 1.8509999999999998e-07, + "loss": 0.0023, + "ppl": 0.02734375, + "reward": 0.9235844612121582, + "reward_std": 0.000615194090642035, + "rewards/perpo_ocr_edit_distance_reward": 0.923584520816803, + "step": 3149, + "temperature": 0.9 + }, + { + "advantages": -5.3542004025075585e-05, + "completion_length": 494.0, + "delta_ref_entropy_loss": 0.05419921875, + "delta_ref_ppl": -0.0732421875, + "entropy_loss": -0.057373046875, + "epoch": 0.63, + "grad_norm": 0.7547750546846413, + "k1_kl": 0.07373046875, + "k3_kl": 0.0498046875, + "kimi_kl": 0.177734375, + "learning_rate": 1.85e-07, + "loss": 0.002, + "ppl": 0.023681640625, + "reward": 0.9880355596542358, + "reward_std": 0.0005361072835512459, + "rewards/perpo_ocr_edit_distance_reward": 0.9880355596542358, + "step": 3150, + "temperature": 0.9 + }, + { + "advantages": -6.977150042075664e-05, + "completion_length": 258.0, + "delta_ref_entropy_loss": 0.035888671875, + "delta_ref_ppl": -0.1640625, + "entropy_loss": -0.059326171875, + "epoch": 0.6302, + "grad_norm": 1.1848946327633458, + "k1_kl": 0.1640625, + "k3_kl": 0.1279296875, + "kimi_kl": 0.5625, + "learning_rate": 1.849e-07, + "loss": 0.0052, + "ppl": 0.022705078125, + "reward": 0.9916420578956604, + "reward_std": 0.0012422464787960052, + "rewards/perpo_ocr_edit_distance_reward": 0.9916421175003052, + "step": 3151, + "temperature": 0.9 + }, + { + "advantages": -1.7540796761750244e-05, + "completion_length": 602.0, + "delta_ref_entropy_loss": 0.043701171875, + "delta_ref_ppl": -0.07763671875, + "entropy_loss": -0.08642578125, + "epoch": 0.6304, + "grad_norm": 1.0176417369832587, + "k1_kl": 0.07763671875, + "k3_kl": 0.0419921875, + "kimi_kl": 0.0869140625, + "learning_rate": 1.848e-07, + "loss": 0.0017, + "ppl": 0.035888671875, + "reward": 0.980579137802124, + "reward_std": 0.0018407166935503483, + "rewards/perpo_ocr_edit_distance_reward": 0.9805791974067688, + "step": 3152, + "temperature": 0.9 + }, + { + "advantages": -1.120567412726814e-05, + "completion_length": 817.0, + "delta_ref_entropy_loss": 0.039794921875, + "delta_ref_ppl": -0.05810546875, + "entropy_loss": -0.0673828125, + "epoch": 0.6306, + "grad_norm": 0.5967578111686677, + "k1_kl": 0.057861328125, + "k3_kl": 0.02978515625, + "kimi_kl": 0.060546875, + "learning_rate": 1.847e-07, + "loss": 0.0012, + "ppl": 0.0281982421875, + "reward": 0.9877502918243408, + "reward_std": 0.0006588621181435883, + "rewards/perpo_ocr_edit_distance_reward": 0.9877503514289856, + "step": 3153, + "temperature": 0.9 + }, + { + "advantages": -4.361357059678994e-05, + "completion_length": 427.0, + "delta_ref_entropy_loss": 0.07421875, + "delta_ref_ppl": -0.09765625, + "entropy_loss": -0.0693359375, + "epoch": 0.6308, + "grad_norm": 0.6509538557638376, + "k1_kl": 0.09765625, + "k3_kl": 0.06103515625, + "kimi_kl": 0.18359375, + "learning_rate": 1.8459999999999997e-07, + "loss": 0.0025, + "ppl": 0.027587890625, + "reward": 0.9888878464698792, + "reward_std": 0.0012666049879044294, + "rewards/perpo_ocr_edit_distance_reward": 0.9888879656791687, + "step": 3154, + "temperature": 0.9 + }, + { + "advantages": -3.669943180284463e-05, + "completion_length": 447.0, + "delta_ref_entropy_loss": 0.057373046875, + "delta_ref_ppl": -0.080078125, + "entropy_loss": -0.171875, + "epoch": 0.631, + "grad_norm": 2.377565098385559, + "k1_kl": 0.07958984375, + "k3_kl": 0.052978515625, + "kimi_kl": 0.12451171875, + "learning_rate": 1.845e-07, + "loss": 0.0022, + "ppl": 0.0771484375, + "reward": 0.9233808517456055, + "reward_std": 0.0015247022965922952, + "rewards/perpo_ocr_edit_distance_reward": 0.9233809113502502, + "step": 3155, + "temperature": 0.9 + }, + { + "advantages": -7.115092012099922e-05, + "completion_length": 544.0, + "delta_ref_entropy_loss": 0.0224609375, + "delta_ref_ppl": -0.0537109375, + "entropy_loss": -0.03662109375, + "epoch": 0.6312, + "grad_norm": 0.48749251201137045, + "k1_kl": 0.0537109375, + "k3_kl": 0.034423828125, + "kimi_kl": 0.0888671875, + "learning_rate": 1.844e-07, + "loss": 0.0014, + "ppl": 0.0133056640625, + "reward": 0.9991602897644043, + "reward_std": 0.000977157847955823, + "rewards/perpo_ocr_edit_distance_reward": 0.9991603493690491, + "step": 3156, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 442.0, + "delta_ref_entropy_loss": 0.03369140625, + "delta_ref_ppl": -0.05712890625, + "entropy_loss": -0.033447265625, + "epoch": 0.6314, + "grad_norm": 0.31350158589516014, + "k1_kl": 0.05712890625, + "k3_kl": 0.03662109375, + "kimi_kl": 0.1064453125, + "learning_rate": 1.8429999999999998e-07, + "loss": 0.0015, + "ppl": 0.00970458984375, + "reward": 0.9969053864479065, + "reward_std": 0.0003609853738453239, + "rewards/perpo_ocr_edit_distance_reward": 0.9969053864479065, + "step": 3157, + "temperature": 0.9 + }, + { + "advantages": 1.7029899268550253e-08, + "completion_length": 130.0, + "delta_ref_entropy_loss": -0.05615234375, + "delta_ref_ppl": -0.380859375, + "entropy_loss": -0.40234375, + "epoch": 0.6316, + "grad_norm": 6.802905078112929, + "k1_kl": 0.380859375, + "k3_kl": 0.333984375, + "kimi_kl": 1.2265625, + "learning_rate": 1.842e-07, + "loss": 0.0134, + "ppl": 0.1640625, + "reward": 0.8920543193817139, + "reward_std": 0.11084788292646408, + "rewards/perpo_ocr_edit_distance_reward": 0.8920543789863586, + "step": 3158, + "temperature": 0.9 + }, + { + "advantages": -1.6280584532069042e-05, + "completion_length": 129.0, + "delta_ref_entropy_loss": 0.0308837890625, + "delta_ref_ppl": -0.306640625, + "entropy_loss": -0.0625, + "epoch": 0.6318, + "grad_norm": 1.6906484187229975, + "k1_kl": 0.306640625, + "k3_kl": 0.25390625, + "kimi_kl": 1.4921875, + "learning_rate": 1.8410000000000001e-07, + "loss": 0.0102, + "ppl": 0.021240234375, + "reward": 0.9775807857513428, + "reward_std": 0.0025184934493154287, + "rewards/perpo_ocr_edit_distance_reward": 0.9775808453559875, + "step": 3159, + "temperature": 0.9 + }, + { + "advantages": -4.3545453081605956e-05, + "completion_length": 228.0, + "delta_ref_entropy_loss": 0.05419921875, + "delta_ref_ppl": -0.21484375, + "entropy_loss": -0.12451171875, + "epoch": 0.632, + "grad_norm": 1.1786390272216543, + "k1_kl": 0.2158203125, + "k3_kl": 0.1591796875, + "kimi_kl": 0.609375, + "learning_rate": 1.8399999999999998e-07, + "loss": 0.0064, + "ppl": 0.04345703125, + "reward": 0.9666156768798828, + "reward_std": 0.0016595497727394104, + "rewards/perpo_ocr_edit_distance_reward": 0.9666157364845276, + "step": 3160, + "temperature": 0.9 + }, + { + "advantages": -8.428097498835996e-05, + "completion_length": 568.0, + "delta_ref_entropy_loss": 0.050048828125, + "delta_ref_ppl": -0.052001953125, + "entropy_loss": -0.04638671875, + "epoch": 0.6322, + "grad_norm": 0.5095663809413521, + "k1_kl": 0.052001953125, + "k3_kl": 0.0281982421875, + "kimi_kl": 0.076171875, + "learning_rate": 1.839e-07, + "loss": 0.0012, + "ppl": 0.01708984375, + "reward": 0.9876577854156494, + "reward_std": 0.0008092848001979291, + "rewards/perpo_ocr_edit_distance_reward": 0.9876578450202942, + "step": 3161, + "temperature": 0.9 + }, + { + "advantages": -6.23507221462205e-05, + "completion_length": 586.0, + "delta_ref_entropy_loss": 0.0230712890625, + "delta_ref_ppl": -0.04443359375, + "entropy_loss": -0.031982421875, + "epoch": 0.6324, + "grad_norm": 0.31771970990772985, + "k1_kl": 0.04443359375, + "k3_kl": 0.0267333984375, + "kimi_kl": 0.06494140625, + "learning_rate": 1.838e-07, + "loss": 0.0011, + "ppl": 0.01141357421875, + "reward": 0.9979137182235718, + "reward_std": 0.0003098046872764826, + "rewards/perpo_ocr_edit_distance_reward": 0.9979138374328613, + "step": 3162, + "temperature": 0.9 + }, + { + "advantages": -9.105887147597969e-05, + "completion_length": 434.0, + "delta_ref_entropy_loss": 0.03857421875, + "delta_ref_ppl": -0.0732421875, + "entropy_loss": -0.0439453125, + "epoch": 0.6326, + "grad_norm": 0.2444182033188289, + "k1_kl": 0.0732421875, + "k3_kl": 0.050048828125, + "kimi_kl": 0.1796875, + "learning_rate": 1.8369999999999998e-07, + "loss": 0.0021, + "ppl": 0.013427734375, + "reward": 0.991949737071991, + "reward_std": 0.0001805028587114066, + "rewards/perpo_ocr_edit_distance_reward": 0.9919497966766357, + "step": 3163, + "temperature": 0.9 + }, + { + "advantages": -5.743333531427197e-06, + "completion_length": 396.0, + "delta_ref_entropy_loss": 0.0546875, + "delta_ref_ppl": -0.10302734375, + "entropy_loss": -0.0673828125, + "epoch": 0.6328, + "grad_norm": 0.6979099609721583, + "k1_kl": 0.10302734375, + "k3_kl": 0.06982421875, + "kimi_kl": 0.2373046875, + "learning_rate": 1.836e-07, + "loss": 0.0028, + "ppl": 0.0262451171875, + "reward": 0.9882888793945312, + "reward_std": 0.0013858970487490296, + "rewards/perpo_ocr_edit_distance_reward": 0.988288938999176, + "step": 3164, + "temperature": 0.9 + }, + { + "advantages": -1.84433811227791e-05, + "completion_length": 120.0, + "delta_ref_entropy_loss": 0.00860595703125, + "delta_ref_ppl": -0.1826171875, + "entropy_loss": -0.0673828125, + "epoch": 0.633, + "grad_norm": 1.3896932375600541, + "k1_kl": 0.1826171875, + "k3_kl": 0.146484375, + "kimi_kl": 0.5390625, + "learning_rate": 1.835e-07, + "loss": 0.0059, + "ppl": 0.0279541015625, + "reward": 0.9895104765892029, + "reward_std": 0.0026704880874603987, + "rewards/perpo_ocr_edit_distance_reward": 0.9895105361938477, + "step": 3165, + "temperature": 0.9 + }, + { + "advantages": -2.9163702492951415e-05, + "completion_length": 368.0, + "delta_ref_entropy_loss": 0.08642578125, + "delta_ref_ppl": -0.119140625, + "entropy_loss": -0.1484375, + "epoch": 0.6332, + "grad_norm": 1.0694118408041369, + "k1_kl": 0.119140625, + "k3_kl": 0.078125, + "kimi_kl": 0.232421875, + "learning_rate": 1.834e-07, + "loss": 0.0032, + "ppl": 0.06103515625, + "reward": 0.8986883759498596, + "reward_std": 0.0016521969810128212, + "rewards/perpo_ocr_edit_distance_reward": 0.8986884951591492, + "step": 3166, + "temperature": 0.9 + }, + { + "advantages": -1.4496701624011621e-05, + "completion_length": 1401.0, + "delta_ref_entropy_loss": 0.044677734375, + "delta_ref_ppl": -0.057373046875, + "entropy_loss": -0.16796875, + "epoch": 0.6334, + "grad_norm": 1.5752094437448365, + "k1_kl": 0.057373046875, + "k3_kl": 0.033935546875, + "kimi_kl": 0.062255859375, + "learning_rate": 1.8329999999999998e-07, + "loss": 0.0014, + "ppl": 0.087890625, + "reward": 0.951678454875946, + "reward_std": 0.0016624446725472808, + "rewards/perpo_ocr_edit_distance_reward": 0.951678454875946, + "step": 3167, + "temperature": 0.9 + }, + { + "advantages": -2.213887000834802e-06, + "completion_length": 449.0, + "delta_ref_entropy_loss": 0.044921875, + "delta_ref_ppl": -0.1171875, + "entropy_loss": -0.51171875, + "epoch": 0.6336, + "grad_norm": 2.9878504277880413, + "k1_kl": 0.11669921875, + "k3_kl": 0.08642578125, + "kimi_kl": 0.177734375, + "learning_rate": 1.832e-07, + "loss": 0.0035, + "ppl": 0.271484375, + "reward": 0.6895906925201416, + "reward_std": 0.02277779020369053, + "rewards/perpo_ocr_edit_distance_reward": 0.6895907521247864, + "step": 3168, + "temperature": 0.9 + }, + { + "advantages": -4.495893335842993e-06, + "completion_length": 1851.0, + "delta_ref_entropy_loss": 0.022216796875, + "delta_ref_ppl": -0.04931640625, + "entropy_loss": -0.1298828125, + "epoch": 0.6338, + "grad_norm": 2.026907308500868, + "k1_kl": 0.04931640625, + "k3_kl": 0.035400390625, + "kimi_kl": 0.06982421875, + "learning_rate": 1.831e-07, + "loss": 0.0014, + "ppl": 0.0673828125, + "reward": 0.9730702638626099, + "reward_std": 0.00558475311845541, + "rewards/perpo_ocr_edit_distance_reward": 0.9730703234672546, + "step": 3169, + "temperature": 0.9 + }, + { + "advantages": -1.0234969522571191e-05, + "completion_length": 1116.0, + "delta_ref_entropy_loss": 0.043212890625, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.1884765625, + "epoch": 0.634, + "grad_norm": 4.589580160766446, + "k1_kl": 0.06689453125, + "k3_kl": 0.056640625, + "kimi_kl": 0.08642578125, + "learning_rate": 1.8299999999999998e-07, + "loss": 0.0023, + "ppl": 0.09130859375, + "reward": 0.8799601793289185, + "reward_std": 0.009044564329087734, + "rewards/perpo_ocr_edit_distance_reward": 0.8799602389335632, + "step": 3170, + "temperature": 0.9 + }, + { + "advantages": 4.5554980943052215e-07, + "completion_length": 1425.0, + "delta_ref_entropy_loss": 0.0390625, + "delta_ref_ppl": -0.056640625, + "entropy_loss": -0.08056640625, + "epoch": 0.6342, + "grad_norm": 0.9476337959236348, + "k1_kl": 0.056640625, + "k3_kl": 0.031982421875, + "kimi_kl": 0.0810546875, + "learning_rate": 1.829e-07, + "loss": 0.0013, + "ppl": 0.033203125, + "reward": 0.7281614542007446, + "reward_std": 0.01838352531194687, + "rewards/perpo_ocr_edit_distance_reward": 0.7281614542007446, + "step": 3171, + "temperature": 0.9 + }, + { + "advantages": -1.0907650903391186e-05, + "completion_length": 260.0, + "delta_ref_entropy_loss": 0.07421875, + "delta_ref_ppl": -0.134765625, + "entropy_loss": -0.138671875, + "epoch": 0.6344, + "grad_norm": 1.2325313977694636, + "k1_kl": 0.134765625, + "k3_kl": 0.08203125, + "kimi_kl": 0.21484375, + "learning_rate": 1.8279999999999997e-07, + "loss": 0.0033, + "ppl": 0.050048828125, + "reward": 0.9939728379249573, + "reward_std": 0.0022420103196054697, + "rewards/perpo_ocr_edit_distance_reward": 0.9939728379249573, + "step": 3172, + "temperature": 0.9 + }, + { + "advantages": -1.8273081877850927e-05, + "completion_length": 1070.0, + "delta_ref_entropy_loss": 0.016845703125, + "delta_ref_ppl": -0.0380859375, + "entropy_loss": -0.0311279296875, + "epoch": 0.6346, + "grad_norm": 0.5712559525333585, + "k1_kl": 0.0380859375, + "k3_kl": 0.02392578125, + "kimi_kl": 0.0703125, + "learning_rate": 1.827e-07, + "loss": 0.001, + "ppl": 0.0107421875, + "reward": 0.9936714768409729, + "reward_std": 0.0012979537714272738, + "rewards/perpo_ocr_edit_distance_reward": 0.9936715364456177, + "step": 3173, + "temperature": 0.9 + }, + { + "advantages": -2.7077539925812744e-05, + "completion_length": 480.0, + "delta_ref_entropy_loss": 0.07568359375, + "delta_ref_ppl": -0.146484375, + "entropy_loss": -0.2041015625, + "epoch": 0.6348, + "grad_norm": 1.0513823169769911, + "k1_kl": 0.1474609375, + "k3_kl": 0.09521484375, + "kimi_kl": 0.34375, + "learning_rate": 1.826e-07, + "loss": 0.0038, + "ppl": 0.08984375, + "reward": 0.686959981918335, + "reward_std": 0.0008436384960077703, + "rewards/perpo_ocr_edit_distance_reward": 0.6869600415229797, + "step": 3174, + "temperature": 0.9 + }, + { + "advantages": -9.452445374336094e-05, + "completion_length": 678.0, + "delta_ref_entropy_loss": 0.06640625, + "delta_ref_ppl": -0.07568359375, + "entropy_loss": -0.06298828125, + "epoch": 0.635, + "grad_norm": 0.6718723815861902, + "k1_kl": 0.07568359375, + "k3_kl": 0.045166015625, + "kimi_kl": 0.1328125, + "learning_rate": 1.825e-07, + "loss": 0.0019, + "ppl": 0.0252685546875, + "reward": 0.9863809943199158, + "reward_std": 0.0008908712188713253, + "rewards/perpo_ocr_edit_distance_reward": 0.9863810539245605, + "step": 3175, + "temperature": 0.9 + }, + { + "advantages": -3.604378071031533e-05, + "completion_length": 396.0, + "delta_ref_entropy_loss": 0.05224609375, + "delta_ref_ppl": -0.109375, + "entropy_loss": -0.048095703125, + "epoch": 0.6352, + "grad_norm": 0.7632938703288403, + "k1_kl": 0.109375, + "k3_kl": 0.07958984375, + "kimi_kl": 0.28125, + "learning_rate": 1.824e-07, + "loss": 0.0032, + "ppl": 0.01904296875, + "reward": 0.9952985644340515, + "reward_std": 0.0008451850153505802, + "rewards/perpo_ocr_edit_distance_reward": 0.9952986240386963, + "step": 3176, + "temperature": 0.9 + }, + { + "advantages": -1.3257777027320117e-05, + "completion_length": 249.0, + "delta_ref_entropy_loss": 0.03125, + "delta_ref_ppl": -0.123046875, + "entropy_loss": -0.1103515625, + "epoch": 0.6354, + "grad_norm": 2.4921260183994, + "k1_kl": 0.12255859375, + "k3_kl": 0.08984375, + "kimi_kl": 0.26171875, + "learning_rate": 1.8229999999999998e-07, + "loss": 0.0036, + "ppl": 0.038330078125, + "reward": 0.969330906867981, + "reward_std": 0.0043920231983065605, + "rewards/perpo_ocr_edit_distance_reward": 0.969330906867981, + "step": 3177, + "temperature": 0.9 + }, + { + "advantages": -4.938671054333099e-07, + "completion_length": 2048.0, + "delta_ref_entropy_loss": -0.55078125, + "delta_ref_ppl": -0.11181640625, + "entropy_loss": -1.3359375, + "epoch": 0.6356, + "grad_norm": 8.218694373618415, + "k1_kl": 0.111328125, + "k3_kl": 0.19140625, + "kimi_kl": 0.470703125, + "learning_rate": 1.822e-07, + "loss": 0.0077, + "ppl": 0.6875, + "reward": 0.29179540276527405, + "reward_std": 0.0348871573805809, + "rewards/perpo_ocr_edit_distance_reward": 0.29179543256759644, + "step": 3178, + "temperature": 0.9 + }, + { + "advantages": -5.194119125917496e-07, + "completion_length": 638.0, + "delta_ref_entropy_loss": -0.043701171875, + "delta_ref_ppl": -0.1162109375, + "entropy_loss": -0.53515625, + "epoch": 0.6358, + "grad_norm": 4.711170665290434, + "k1_kl": 0.1162109375, + "k3_kl": 0.095703125, + "kimi_kl": 0.201171875, + "learning_rate": 1.821e-07, + "loss": 0.0038, + "ppl": 0.291015625, + "reward": 0.8923490047454834, + "reward_std": 0.031547896564006805, + "rewards/perpo_ocr_edit_distance_reward": 0.8923490643501282, + "step": 3179, + "temperature": 0.9 + }, + { + "advantages": 1.8869128325604834e-05, + "completion_length": 458.0, + "delta_ref_entropy_loss": 0.0198974609375, + "delta_ref_ppl": -0.045166015625, + "entropy_loss": -0.032958984375, + "epoch": 0.636, + "grad_norm": 0.7847363925548556, + "k1_kl": 0.045166015625, + "k3_kl": 0.031005859375, + "kimi_kl": 0.0966796875, + "learning_rate": 1.82e-07, + "loss": 0.0012, + "ppl": 0.01141357421875, + "reward": 0.99113929271698, + "reward_std": 0.0017035620985552669, + "rewards/perpo_ocr_edit_distance_reward": 0.99113929271698, + "step": 3180, + "temperature": 0.9 + }, + { + "advantages": 1.0541507435846142e-05, + "completion_length": 585.0, + "delta_ref_entropy_loss": 0.109375, + "delta_ref_ppl": -0.11181640625, + "entropy_loss": -0.150390625, + "epoch": 0.6362, + "grad_norm": 1.0432731006608027, + "k1_kl": 0.11181640625, + "k3_kl": 0.06689453125, + "kimi_kl": 0.1806640625, + "learning_rate": 1.819e-07, + "loss": 0.0027, + "ppl": 0.06640625, + "reward": 0.9432470798492432, + "reward_std": 0.0015169188845902681, + "rewards/perpo_ocr_edit_distance_reward": 0.9432471394538879, + "step": 3181, + "temperature": 0.9 + }, + { + "advantages": -0.00011416844790801406, + "completion_length": 552.0, + "delta_ref_entropy_loss": 0.043212890625, + "delta_ref_ppl": -0.08642578125, + "entropy_loss": -0.027587890625, + "epoch": 0.6364, + "grad_norm": 0.16439951927955682, + "k1_kl": 0.0859375, + "k3_kl": 0.059326171875, + "kimi_kl": 0.1806640625, + "learning_rate": 1.8179999999999997e-07, + "loss": 0.0025, + "ppl": 0.00811767578125, + "reward": 0.9979663491249084, + "reward_std": 0.00027285696705803275, + "rewards/perpo_ocr_edit_distance_reward": 0.997966468334198, + "step": 3182, + "temperature": 0.9 + }, + { + "advantages": -5.909374976909021e-06, + "completion_length": 622.0, + "delta_ref_entropy_loss": 0.05712890625, + "delta_ref_ppl": -0.12109375, + "entropy_loss": -0.404296875, + "epoch": 0.6366, + "grad_norm": 2.327325658023765, + "k1_kl": 0.12109375, + "k3_kl": 0.07275390625, + "kimi_kl": 0.169921875, + "learning_rate": 1.817e-07, + "loss": 0.0029, + "ppl": 0.185546875, + "reward": 0.7122841477394104, + "reward_std": 0.008549327962100506, + "rewards/perpo_ocr_edit_distance_reward": 0.7122842669487, + "step": 3183, + "temperature": 0.9 + }, + { + "advantages": -1.805169267754536e-05, + "completion_length": 465.0, + "delta_ref_entropy_loss": 0.0220947265625, + "delta_ref_ppl": -0.07861328125, + "entropy_loss": -0.045654296875, + "epoch": 0.6368, + "grad_norm": 0.4975058231635728, + "k1_kl": 0.07861328125, + "k3_kl": 0.05615234375, + "kimi_kl": 0.1953125, + "learning_rate": 1.816e-07, + "loss": 0.0023, + "ppl": 0.0174560546875, + "reward": 0.9943949580192566, + "reward_std": 0.0008431421010755002, + "rewards/perpo_ocr_edit_distance_reward": 0.9943949580192566, + "step": 3184, + "temperature": 0.9 + }, + { + "advantages": -4.775183697347529e-05, + "completion_length": 869.0, + "delta_ref_entropy_loss": 0.000244140625, + "delta_ref_ppl": -0.028076171875, + "entropy_loss": -0.039794921875, + "epoch": 0.637, + "grad_norm": 0.48846875061337375, + "k1_kl": 0.028076171875, + "k3_kl": 0.0233154296875, + "kimi_kl": 0.0751953125, + "learning_rate": 1.8149999999999998e-07, + "loss": 0.001, + "ppl": 0.0164794921875, + "reward": 0.9829445481300354, + "reward_std": 0.0009706040727905929, + "rewards/perpo_ocr_edit_distance_reward": 0.982944667339325, + "step": 3185, + "temperature": 0.9 + }, + { + "advantages": -0.00011433874169597402, + "completion_length": 583.0, + "delta_ref_entropy_loss": 0.023193359375, + "delta_ref_ppl": -0.04833984375, + "entropy_loss": -0.04345703125, + "epoch": 0.6372, + "grad_norm": 0.5301731648181605, + "k1_kl": 0.048583984375, + "k3_kl": 0.0296630859375, + "kimi_kl": 0.08203125, + "learning_rate": 1.814e-07, + "loss": 0.0013, + "ppl": 0.016357421875, + "reward": 0.9957332015037537, + "reward_std": 0.000644636107608676, + "rewards/perpo_ocr_edit_distance_reward": 0.9957333207130432, + "step": 3186, + "temperature": 0.9 + }, + { + "advantages": -6.781306001357734e-05, + "completion_length": 749.0, + "delta_ref_entropy_loss": 0.0228271484375, + "delta_ref_ppl": -0.041015625, + "entropy_loss": -0.03662109375, + "epoch": 0.6374, + "grad_norm": 0.6900889311731008, + "k1_kl": 0.041015625, + "k3_kl": 0.028564453125, + "kimi_kl": 0.083984375, + "learning_rate": 1.813e-07, + "loss": 0.0012, + "ppl": 0.0145263671875, + "reward": 0.9908614158630371, + "reward_std": 0.0014073620550334454, + "rewards/perpo_ocr_edit_distance_reward": 0.9908615350723267, + "step": 3187, + "temperature": 0.9 + }, + { + "advantages": -3.593308792915195e-05, + "completion_length": 923.0, + "delta_ref_entropy_loss": 0.04345703125, + "delta_ref_ppl": -0.07080078125, + "entropy_loss": -0.048095703125, + "epoch": 0.6376, + "grad_norm": 1.4296268618127272, + "k1_kl": 0.0712890625, + "k3_kl": 0.041015625, + "kimi_kl": 0.1142578125, + "learning_rate": 1.8119999999999998e-07, + "loss": 0.0017, + "ppl": 0.0234375, + "reward": 0.9915797710418701, + "reward_std": 0.0010849195532500744, + "rewards/perpo_ocr_edit_distance_reward": 0.9915797710418701, + "step": 3188, + "temperature": 0.9 + }, + { + "advantages": -5.1634655392263085e-05, + "completion_length": 299.0, + "delta_ref_entropy_loss": 0.017822265625, + "delta_ref_ppl": -0.083984375, + "entropy_loss": -0.04541015625, + "epoch": 0.6378, + "grad_norm": 0.5516731068477221, + "k1_kl": 0.083984375, + "k3_kl": 0.07666015625, + "kimi_kl": 0.26171875, + "learning_rate": 1.811e-07, + "loss": 0.0031, + "ppl": 0.017578125, + "reward": 0.9926352500915527, + "reward_std": 0.0008890674216672778, + "rewards/perpo_ocr_edit_distance_reward": 0.9926353096961975, + "step": 3189, + "temperature": 0.9 + }, + { + "advantages": -1.4798982192587573e-05, + "completion_length": 537.0, + "delta_ref_entropy_loss": 0.047607421875, + "delta_ref_ppl": -0.10400390625, + "entropy_loss": -0.11572265625, + "epoch": 0.638, + "grad_norm": 0.8538407709039502, + "k1_kl": 0.10400390625, + "k3_kl": 0.072265625, + "kimi_kl": 0.265625, + "learning_rate": 1.81e-07, + "loss": 0.0029, + "ppl": 0.045166015625, + "reward": 0.9306226372718811, + "reward_std": 0.002199068432673812, + "rewards/perpo_ocr_edit_distance_reward": 0.9306226968765259, + "step": 3190, + "temperature": 0.9 + }, + { + "advantages": -8.229699233197607e-06, + "completion_length": 471.0, + "delta_ref_entropy_loss": 0.034423828125, + "delta_ref_ppl": -0.076171875, + "entropy_loss": -0.1884765625, + "epoch": 0.6382, + "grad_norm": 1.5787289063265255, + "k1_kl": 0.076171875, + "k3_kl": 0.04736328125, + "kimi_kl": 0.1005859375, + "learning_rate": 1.8089999999999999e-07, + "loss": 0.0019, + "ppl": 0.09619140625, + "reward": 0.972836971282959, + "reward_std": 0.0061057317070662975, + "rewards/perpo_ocr_edit_distance_reward": 0.9728370904922485, + "step": 3191, + "temperature": 0.9 + }, + { + "advantages": 8.514949456639442e-08, + "completion_length": 386.0, + "delta_ref_entropy_loss": -0.01043701171875, + "delta_ref_ppl": -0.08056640625, + "entropy_loss": -0.1259765625, + "epoch": 0.6384, + "grad_norm": 6.861270066441821, + "k1_kl": 0.08056640625, + "k3_kl": 0.0595703125, + "kimi_kl": 0.19140625, + "learning_rate": 1.8079999999999998e-07, + "loss": 0.0024, + "ppl": 0.04638671875, + "reward": 0.9511672854423523, + "reward_std": 0.09720387309789658, + "rewards/perpo_ocr_edit_distance_reward": 0.9511672854423523, + "step": 3192, + "temperature": 0.9 + }, + { + "advantages": -2.155985202989541e-05, + "completion_length": 230.0, + "delta_ref_entropy_loss": 0.02001953125, + "delta_ref_ppl": -0.12158203125, + "entropy_loss": -0.050537109375, + "epoch": 0.6386, + "grad_norm": 0.5551142074866745, + "k1_kl": 0.12158203125, + "k3_kl": 0.099609375, + "kimi_kl": 0.400390625, + "learning_rate": 1.807e-07, + "loss": 0.004, + "ppl": 0.0147705078125, + "reward": 0.9318594932556152, + "reward_std": 0.0014807538827881217, + "rewards/perpo_ocr_edit_distance_reward": 0.93185955286026, + "step": 3193, + "temperature": 0.9 + }, + { + "advantages": -1.934596548380796e-05, + "completion_length": 353.0, + "delta_ref_entropy_loss": 0.0211181640625, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.10009765625, + "epoch": 0.6388, + "grad_norm": 2.0228160297283146, + "k1_kl": 0.06591796875, + "k3_kl": 0.047607421875, + "kimi_kl": 0.1376953125, + "learning_rate": 1.806e-07, + "loss": 0.0019, + "ppl": 0.04931640625, + "reward": 0.9410560727119446, + "reward_std": 0.002097195480018854, + "rewards/perpo_ocr_edit_distance_reward": 0.9410561323165894, + "step": 3194, + "temperature": 0.9 + }, + { + "advantages": -7.063150405883789e-05, + "completion_length": 316.0, + "delta_ref_entropy_loss": 0.054443359375, + "delta_ref_ppl": -0.1220703125, + "entropy_loss": -0.049560546875, + "epoch": 0.639, + "grad_norm": 0.6068286255780223, + "k1_kl": 0.1220703125, + "k3_kl": 0.07861328125, + "kimi_kl": 0.2392578125, + "learning_rate": 1.8049999999999998e-07, + "loss": 0.0032, + "ppl": 0.020751953125, + "reward": 0.9723848700523376, + "reward_std": 0.0011054431088268757, + "rewards/perpo_ocr_edit_distance_reward": 0.9723849296569824, + "step": 3195, + "temperature": 0.9 + }, + { + "advantages": -1.8221991922473535e-05, + "completion_length": 509.0, + "delta_ref_entropy_loss": 0.0181884765625, + "delta_ref_ppl": -0.061279296875, + "entropy_loss": -0.053955078125, + "epoch": 0.6392, + "grad_norm": 0.7498703286206205, + "k1_kl": 0.061279296875, + "k3_kl": 0.044921875, + "kimi_kl": 0.177734375, + "learning_rate": 1.804e-07, + "loss": 0.0018, + "ppl": 0.021240234375, + "reward": 0.9854380488395691, + "reward_std": 0.0031739058904349804, + "rewards/perpo_ocr_edit_distance_reward": 0.9854381084442139, + "step": 3196, + "temperature": 0.9 + }, + { + "advantages": -9.02584633877268e-06, + "completion_length": 524.0, + "delta_ref_entropy_loss": 0.062255859375, + "delta_ref_ppl": -0.0810546875, + "entropy_loss": -0.1279296875, + "epoch": 0.6394, + "grad_norm": 1.1560281621287554, + "k1_kl": 0.0810546875, + "k3_kl": 0.048583984375, + "kimi_kl": 0.1337890625, + "learning_rate": 1.8029999999999997e-07, + "loss": 0.0019, + "ppl": 0.050537109375, + "reward": 0.9880130290985107, + "reward_std": 0.003664715215563774, + "rewards/perpo_ocr_edit_distance_reward": 0.9880130887031555, + "step": 3197, + "temperature": 0.9 + }, + { + "advantages": -9.533337288303301e-05, + "completion_length": 1401.0, + "delta_ref_entropy_loss": 0.0201416015625, + "delta_ref_ppl": -0.0419921875, + "entropy_loss": -0.0458984375, + "epoch": 0.6396, + "grad_norm": 0.389254801852999, + "k1_kl": 0.0419921875, + "k3_kl": 0.02685546875, + "kimi_kl": 0.0634765625, + "learning_rate": 1.8019999999999999e-07, + "loss": 0.0012, + "ppl": 0.0159912109375, + "reward": 0.9973602890968323, + "reward_std": 0.00043584094964899123, + "rewards/perpo_ocr_edit_distance_reward": 0.997360348701477, + "step": 3198, + "temperature": 0.9 + }, + { + "advantages": -8.038112355279736e-06, + "completion_length": 155.0, + "delta_ref_entropy_loss": 0.07177734375, + "delta_ref_ppl": -0.2119140625, + "entropy_loss": -0.10205078125, + "epoch": 0.6398, + "grad_norm": 1.5298201998336778, + "k1_kl": 0.2119140625, + "k3_kl": 0.15234375, + "kimi_kl": 0.54296875, + "learning_rate": 1.801e-07, + "loss": 0.0061, + "ppl": 0.039306640625, + "reward": 0.9940700531005859, + "reward_std": 0.002017055405303836, + "rewards/perpo_ocr_edit_distance_reward": 0.9940701127052307, + "step": 3199, + "temperature": 0.9 + }, + { + "advantages": -4.078661004314199e-05, + "completion_length": 749.0, + "delta_ref_entropy_loss": 0.05224609375, + "delta_ref_ppl": -0.06298828125, + "entropy_loss": -0.07080078125, + "epoch": 0.64, + "grad_norm": 0.5422854309976209, + "k1_kl": 0.06298828125, + "k3_kl": 0.041748046875, + "kimi_kl": 0.1025390625, + "learning_rate": 1.8e-07, + "loss": 0.0017, + "ppl": 0.025634765625, + "reward": 0.8638960719108582, + "reward_std": 0.0007353525143116713, + "rewards/perpo_ocr_edit_distance_reward": 0.8638961911201477, + "step": 3200, + "temperature": 0.9 + }, + { + "advantages": -2.588544703030493e-05, + "completion_length": 196.0, + "delta_ref_entropy_loss": 0.06982421875, + "delta_ref_ppl": -0.1767578125, + "entropy_loss": -0.06884765625, + "epoch": 0.6402, + "grad_norm": 0.5945737598488884, + "k1_kl": 0.1767578125, + "k3_kl": 0.1328125, + "kimi_kl": 0.52734375, + "learning_rate": 1.799e-07, + "loss": 0.0053, + "ppl": 0.0203857421875, + "reward": 0.9850649237632751, + "reward_std": 0.0012148227542638779, + "rewards/perpo_ocr_edit_distance_reward": 0.9850649833679199, + "step": 3201, + "temperature": 0.9 + }, + { + "advantages": -7.234301301650703e-05, + "completion_length": 1080.0, + "delta_ref_entropy_loss": 0.015869140625, + "delta_ref_ppl": -0.034423828125, + "entropy_loss": -0.0537109375, + "epoch": 0.6404, + "grad_norm": 0.33823156953996414, + "k1_kl": 0.034423828125, + "k3_kl": 0.0225830078125, + "kimi_kl": 0.0712890625, + "learning_rate": 1.7979999999999998e-07, + "loss": 0.001, + "ppl": 0.018310546875, + "reward": 0.8724910616874695, + "reward_std": 0.0004883929505012929, + "rewards/perpo_ocr_edit_distance_reward": 0.872491180896759, + "step": 3202, + "temperature": 0.9 + }, + { + "advantages": -5.187307397136465e-05, + "completion_length": 679.0, + "delta_ref_entropy_loss": 0.040771484375, + "delta_ref_ppl": -0.0390625, + "entropy_loss": -0.043212890625, + "epoch": 0.6406, + "grad_norm": 1.2723896536986243, + "k1_kl": 0.0390625, + "k3_kl": 0.02099609375, + "kimi_kl": 0.0517578125, + "learning_rate": 1.797e-07, + "loss": 0.0009, + "ppl": 0.017578125, + "reward": 0.9963207244873047, + "reward_std": 0.0005566519103012979, + "rewards/perpo_ocr_edit_distance_reward": 0.9963207840919495, + "step": 3203, + "temperature": 0.9 + }, + { + "advantages": -0.00013046605454292148, + "completion_length": 629.0, + "delta_ref_entropy_loss": 0.023193359375, + "delta_ref_ppl": -0.051025390625, + "entropy_loss": -0.0301513671875, + "epoch": 0.6408, + "grad_norm": 0.4143344314184982, + "k1_kl": 0.051025390625, + "k3_kl": 0.032470703125, + "kimi_kl": 0.09765625, + "learning_rate": 1.796e-07, + "loss": 0.0014, + "ppl": 0.01153564453125, + "reward": 0.9955341219902039, + "reward_std": 0.0004220191913191229, + "rewards/perpo_ocr_edit_distance_reward": 0.9955342411994934, + "step": 3204, + "temperature": 0.9 + }, + { + "advantages": 8.932182026910596e-06, + "completion_length": 962.0, + "delta_ref_entropy_loss": 0.0341796875, + "delta_ref_ppl": -0.045654296875, + "entropy_loss": -0.07470703125, + "epoch": 0.641, + "grad_norm": 0.6503194149146473, + "k1_kl": 0.04541015625, + "k3_kl": 0.02880859375, + "kimi_kl": 0.07470703125, + "learning_rate": 1.7949999999999999e-07, + "loss": 0.0011, + "ppl": 0.03369140625, + "reward": 0.9645659327507019, + "reward_std": 0.002755634021013975, + "rewards/perpo_ocr_edit_distance_reward": 0.9645659923553467, + "step": 3205, + "temperature": 0.9 + }, + { + "advantages": -8.511544001521543e-05, + "completion_length": 812.0, + "delta_ref_entropy_loss": 0.024658203125, + "delta_ref_ppl": -0.03271484375, + "entropy_loss": -0.042236328125, + "epoch": 0.6412, + "grad_norm": 0.756947066003112, + "k1_kl": 0.03271484375, + "k3_kl": 0.018310546875, + "kimi_kl": 0.04345703125, + "learning_rate": 1.794e-07, + "loss": 0.0008, + "ppl": 0.017822265625, + "reward": 0.9972343444824219, + "reward_std": 0.0008001584210433066, + "rewards/perpo_ocr_edit_distance_reward": 0.9972344040870667, + "step": 3206, + "temperature": 0.9 + }, + { + "advantages": -1.183578024210874e-05, + "completion_length": 1998.0, + "delta_ref_entropy_loss": 0.00799560546875, + "delta_ref_ppl": -0.04052734375, + "entropy_loss": -0.1298828125, + "epoch": 0.6414, + "grad_norm": 1.3776172526774584, + "k1_kl": 0.04052734375, + "k3_kl": 0.03759765625, + "kimi_kl": 0.076171875, + "learning_rate": 1.7929999999999997e-07, + "loss": 0.0015, + "ppl": 0.0751953125, + "reward": 0.9725078344345093, + "reward_std": 0.003495451994240284, + "rewards/perpo_ocr_edit_distance_reward": 0.9725079536437988, + "step": 3207, + "temperature": 0.9 + }, + { + "advantages": -1.9431114196777344e-05, + "completion_length": 506.0, + "delta_ref_entropy_loss": 0.01458740234375, + "delta_ref_ppl": -0.0947265625, + "entropy_loss": -0.0625, + "epoch": 0.6416, + "grad_norm": 1.090726804328655, + "k1_kl": 0.09423828125, + "k3_kl": 0.07470703125, + "kimi_kl": 0.255859375, + "learning_rate": 1.792e-07, + "loss": 0.003, + "ppl": 0.0264892578125, + "reward": 0.9910579919815063, + "reward_std": 0.0016538893105462193, + "rewards/perpo_ocr_edit_distance_reward": 0.9910579919815063, + "step": 3208, + "temperature": 0.9 + }, + { + "advantages": -1.2559550668811426e-05, + "completion_length": 386.0, + "delta_ref_entropy_loss": 0.115234375, + "delta_ref_ppl": -0.1435546875, + "entropy_loss": -0.2021484375, + "epoch": 0.6418, + "grad_norm": 2.0772675416436304, + "k1_kl": 0.1435546875, + "k3_kl": 0.09375, + "kimi_kl": 0.21484375, + "learning_rate": 1.791e-07, + "loss": 0.0038, + "ppl": 0.09130859375, + "reward": 0.9046960473060608, + "reward_std": 0.0032879551872611046, + "rewards/perpo_ocr_edit_distance_reward": 0.9046961665153503, + "step": 3209, + "temperature": 0.9 + }, + { + "advantages": -1.5216214706015307e-05, + "completion_length": 1469.0, + "delta_ref_entropy_loss": 0.000698089599609375, + "delta_ref_ppl": -0.06494140625, + "entropy_loss": -0.2099609375, + "epoch": 0.642, + "grad_norm": 1.736319729749555, + "k1_kl": 0.06494140625, + "k3_kl": 0.05224609375, + "kimi_kl": 0.11083984375, + "learning_rate": 1.7899999999999997e-07, + "loss": 0.0021, + "ppl": 0.09521484375, + "reward": 0.8822174072265625, + "reward_std": 0.0060558198019862175, + "rewards/perpo_ocr_edit_distance_reward": 0.8822174668312073, + "step": 3210, + "temperature": 0.9 + }, + { + "advantages": -8.489404535794165e-06, + "completion_length": 398.0, + "delta_ref_entropy_loss": 0.042236328125, + "delta_ref_ppl": -0.07177734375, + "entropy_loss": -0.058837890625, + "epoch": 0.6422, + "grad_norm": 0.47660814260352136, + "k1_kl": 0.07177734375, + "k3_kl": 0.051513671875, + "kimi_kl": 0.1630859375, + "learning_rate": 1.789e-07, + "loss": 0.0021, + "ppl": 0.0218505859375, + "reward": 0.9886940121650696, + "reward_std": 0.0009022032609209418, + "rewards/perpo_ocr_edit_distance_reward": 0.9886940717697144, + "step": 3211, + "temperature": 0.9 + }, + { + "advantages": -2.711160050239414e-05, + "completion_length": 755.0, + "delta_ref_entropy_loss": 0.0732421875, + "delta_ref_ppl": -0.10205078125, + "entropy_loss": -0.189453125, + "epoch": 0.6424, + "grad_norm": 1.3651909615295035, + "k1_kl": 0.10205078125, + "k3_kl": 0.0634765625, + "kimi_kl": 0.1748046875, + "learning_rate": 1.7879999999999999e-07, + "loss": 0.0026, + "ppl": 0.10595703125, + "reward": 0.9581789970397949, + "reward_std": 0.002099759876728058, + "rewards/perpo_ocr_edit_distance_reward": 0.9581790566444397, + "step": 3212, + "temperature": 0.9 + }, + { + "advantages": -2.384185791015625e-07, + "completion_length": 1550.0, + "delta_ref_entropy_loss": -0.546875, + "delta_ref_ppl": -0.000835418701171875, + "entropy_loss": -2.25, + "epoch": 0.6426, + "grad_norm": 6.992388265512887, + "k1_kl": -0.000751495361328125, + "k3_kl": 0.10205078125, + "kimi_kl": 0.125, + "learning_rate": 1.7869999999999998e-07, + "loss": 0.0041, + "ppl": 1.3671875, + "reward": 0.22660064697265625, + "reward_std": 0.10032425820827484, + "rewards/perpo_ocr_edit_distance_reward": 0.22660066187381744, + "step": 3213, + "temperature": 0.9 + }, + { + "advantages": -2.3160662294685608e-06, + "completion_length": 637.0, + "delta_ref_entropy_loss": 0.0458984375, + "delta_ref_ppl": -0.0771484375, + "entropy_loss": -0.2734375, + "epoch": 0.6428, + "grad_norm": 1.5463915487765172, + "k1_kl": 0.0771484375, + "k3_kl": 0.053466796875, + "kimi_kl": 0.1103515625, + "learning_rate": 1.786e-07, + "loss": 0.0021, + "ppl": 0.126953125, + "reward": 0.9120923280715942, + "reward_std": 0.00356305786408484, + "rewards/perpo_ocr_edit_distance_reward": 0.912092387676239, + "step": 3214, + "temperature": 0.9 + }, + { + "advantages": -4.427773774295929e-07, + "completion_length": 749.0, + "delta_ref_entropy_loss": -0.033447265625, + "delta_ref_ppl": -0.049560546875, + "entropy_loss": -0.1923828125, + "epoch": 0.643, + "grad_norm": 1.0362206285075464, + "k1_kl": 0.0498046875, + "k3_kl": 0.0439453125, + "kimi_kl": 0.09228515625, + "learning_rate": 1.785e-07, + "loss": 0.0018, + "ppl": 0.10107421875, + "reward": 0.6466525197029114, + "reward_std": 0.0396084301173687, + "rewards/perpo_ocr_edit_distance_reward": 0.6466525197029114, + "step": 3215, + "temperature": 0.9 + }, + { + "advantages": -3.065381974920456e-07, + "completion_length": 195.0, + "delta_ref_entropy_loss": -0.1689453125, + "delta_ref_ppl": -0.5, + "entropy_loss": -0.439453125, + "epoch": 0.6432, + "grad_norm": 6.547848233001411, + "k1_kl": 0.5, + "k3_kl": 0.423828125, + "kimi_kl": 1.8828125, + "learning_rate": 1.7839999999999998e-07, + "loss": 0.0169, + "ppl": 0.1826171875, + "reward": 0.8601794838905334, + "reward_std": 0.15338116884231567, + "rewards/perpo_ocr_edit_distance_reward": 0.8601795434951782, + "step": 3216, + "temperature": 0.9 + }, + { + "advantages": -2.360344115004409e-05, + "completion_length": 292.0, + "delta_ref_entropy_loss": -0.036376953125, + "delta_ref_ppl": -0.061767578125, + "entropy_loss": -0.1826171875, + "epoch": 0.6434, + "grad_norm": 1.2624189234853174, + "k1_kl": 0.061767578125, + "k3_kl": 0.04833984375, + "kimi_kl": 0.1328125, + "learning_rate": 1.7829999999999998e-07, + "loss": 0.002, + "ppl": 0.064453125, + "reward": 0.9682163000106812, + "reward_std": 0.004230587277561426, + "rewards/perpo_ocr_edit_distance_reward": 0.9682164192199707, + "step": 3217, + "temperature": 0.9 + }, + { + "advantages": -0.0004255227104295045, + "completion_length": 724.0, + "delta_ref_entropy_loss": 0.0220947265625, + "delta_ref_ppl": -0.02587890625, + "entropy_loss": -0.016845703125, + "epoch": 0.6436, + "grad_norm": 0.02242059679117111, + "k1_kl": 0.02587890625, + "k3_kl": 0.013427734375, + "kimi_kl": 0.03369140625, + "learning_rate": 1.782e-07, + "loss": 0.001, + "ppl": 0.005126953125, + "reward": 0.9997131824493408, + "reward_std": 5.2833872388191594e-08, + "rewards/perpo_ocr_edit_distance_reward": 0.9997131824493408, + "step": 3218, + "temperature": 0.9 + }, + { + "advantages": -8.450236055068672e-05, + "completion_length": 630.0, + "delta_ref_entropy_loss": 0.051513671875, + "delta_ref_ppl": -0.076171875, + "entropy_loss": -0.07275390625, + "epoch": 0.6438, + "grad_norm": 0.713864609984913, + "k1_kl": 0.076171875, + "k3_kl": 0.04345703125, + "kimi_kl": 0.10400390625, + "learning_rate": 1.781e-07, + "loss": 0.0018, + "ppl": 0.032958984375, + "reward": 0.9898921251296997, + "reward_std": 0.0008068343158811331, + "rewards/perpo_ocr_edit_distance_reward": 0.9898921847343445, + "step": 3219, + "temperature": 0.9 + }, + { + "advantages": -4.332406388130039e-05, + "completion_length": 638.0, + "delta_ref_entropy_loss": 0.0205078125, + "delta_ref_ppl": -0.048828125, + "entropy_loss": -0.036865234375, + "epoch": 0.644, + "grad_norm": 0.45627982497525293, + "k1_kl": 0.048828125, + "k3_kl": 0.0390625, + "kimi_kl": 0.12451171875, + "learning_rate": 1.7799999999999998e-07, + "loss": 0.0016, + "ppl": 0.016845703125, + "reward": 0.9952921867370605, + "reward_std": 0.0006866090116091073, + "rewards/perpo_ocr_edit_distance_reward": 0.9952922463417053, + "step": 3220, + "temperature": 0.9 + }, + { + "advantages": -5.10896995820076e-08, + "completion_length": 1232.0, + "delta_ref_entropy_loss": 0.0223388671875, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.1650390625, + "epoch": 0.6442, + "grad_norm": 1.997036499829671, + "k1_kl": 0.06591796875, + "k3_kl": 0.044189453125, + "kimi_kl": 0.1298828125, + "learning_rate": 1.779e-07, + "loss": 0.0018, + "ppl": 0.06494140625, + "reward": 0.7497548460960388, + "reward_std": 0.11883895099163055, + "rewards/perpo_ocr_edit_distance_reward": 0.7497548460960388, + "step": 3221, + "temperature": 0.9 + }, + { + "advantages": -3.811291389865801e-05, + "completion_length": 324.0, + "delta_ref_entropy_loss": 0.035888671875, + "delta_ref_ppl": -0.0849609375, + "entropy_loss": -0.087890625, + "epoch": 0.6444, + "grad_norm": 0.9838165204693721, + "k1_kl": 0.0849609375, + "k3_kl": 0.05517578125, + "kimi_kl": 0.1552734375, + "learning_rate": 1.7780000000000002e-07, + "loss": 0.0022, + "ppl": 0.031982421875, + "reward": 0.9827477335929871, + "reward_std": 0.0014639608561992645, + "rewards/perpo_ocr_edit_distance_reward": 0.9827477931976318, + "step": 3222, + "temperature": 0.9 + }, + { + "advantages": -3.482614556560293e-05, + "completion_length": 270.0, + "delta_ref_entropy_loss": 0.04052734375, + "delta_ref_ppl": -0.1337890625, + "entropy_loss": -0.08154296875, + "epoch": 0.6446, + "grad_norm": 1.1374709394436757, + "k1_kl": 0.1328125, + "k3_kl": 0.09814453125, + "kimi_kl": 0.515625, + "learning_rate": 1.7769999999999998e-07, + "loss": 0.004, + "ppl": 0.0322265625, + "reward": 0.9827539920806885, + "reward_std": 0.001855329843237996, + "rewards/perpo_ocr_edit_distance_reward": 0.9827539920806885, + "step": 3223, + "temperature": 0.9 + }, + { + "advantages": -0.00012549333041533828, + "completion_length": 724.0, + "delta_ref_entropy_loss": 0.0269775390625, + "delta_ref_ppl": -0.043212890625, + "entropy_loss": -0.0299072265625, + "epoch": 0.6448, + "grad_norm": 0.3142343855604335, + "k1_kl": 0.04296875, + "k3_kl": 0.0286865234375, + "kimi_kl": 0.08837890625, + "learning_rate": 1.776e-07, + "loss": 0.0013, + "ppl": 0.01080322265625, + "reward": 0.9938311576843262, + "reward_std": 0.0003748868184629828, + "rewards/perpo_ocr_edit_distance_reward": 0.993831217288971, + "step": 3224, + "temperature": 0.9 + }, + { + "advantages": -4.366466237115674e-05, + "completion_length": 988.0, + "delta_ref_entropy_loss": 0.040771484375, + "delta_ref_ppl": -0.06201171875, + "entropy_loss": -0.0517578125, + "epoch": 0.645, + "grad_norm": 0.4887463527439679, + "k1_kl": 0.061767578125, + "k3_kl": 0.040771484375, + "kimi_kl": 0.13671875, + "learning_rate": 1.775e-07, + "loss": 0.0017, + "ppl": 0.02197265625, + "reward": 0.9847159385681152, + "reward_std": 0.000680066179484129, + "rewards/perpo_ocr_edit_distance_reward": 0.9847160577774048, + "step": 3225, + "temperature": 0.9 + }, + { + "advantages": -8.344650996150449e-05, + "completion_length": 821.0, + "delta_ref_entropy_loss": 0.033447265625, + "delta_ref_ppl": -0.04541015625, + "entropy_loss": -0.044677734375, + "epoch": 0.6452, + "grad_norm": 0.5453503001323626, + "k1_kl": 0.04541015625, + "k3_kl": 0.02685546875, + "kimi_kl": 0.0625, + "learning_rate": 1.774e-07, + "loss": 0.0012, + "ppl": 0.0172119140625, + "reward": 0.9951500296592712, + "reward_std": 0.0006142249330878258, + "rewards/perpo_ocr_edit_distance_reward": 0.995150089263916, + "step": 3226, + "temperature": 0.9 + }, + { + "advantages": 3.4059798537100505e-08, + "completion_length": 439.0, + "delta_ref_entropy_loss": 0.046630859375, + "delta_ref_ppl": -0.134765625, + "entropy_loss": -0.072265625, + "epoch": 0.6454, + "grad_norm": 0.6619298561214776, + "k1_kl": 0.134765625, + "k3_kl": 0.0908203125, + "kimi_kl": 0.328125, + "learning_rate": 1.773e-07, + "loss": 0.0037, + "ppl": 0.029296875, + "reward": 0.9959908127784729, + "reward_std": 0.0012911406811326742, + "rewards/perpo_ocr_edit_distance_reward": 0.9959908127784729, + "step": 3227, + "temperature": 0.9 + }, + { + "advantages": -2.9042364985798486e-05, + "completion_length": 795.0, + "delta_ref_entropy_loss": 0.029541015625, + "delta_ref_ppl": -0.06396484375, + "entropy_loss": -0.034912109375, + "epoch": 0.6456, + "grad_norm": 0.40201202690335, + "k1_kl": 0.06396484375, + "k3_kl": 0.0400390625, + "kimi_kl": 0.1318359375, + "learning_rate": 1.772e-07, + "loss": 0.0016, + "ppl": 0.01422119140625, + "reward": 0.9959616661071777, + "reward_std": 0.0007792412070557475, + "rewards/perpo_ocr_edit_distance_reward": 0.9959617853164673, + "step": 3228, + "temperature": 0.9 + }, + { + "advantages": -1.1333398106216919e-05, + "completion_length": 185.0, + "delta_ref_entropy_loss": 0.09375, + "delta_ref_ppl": -0.21484375, + "entropy_loss": -0.146484375, + "epoch": 0.6458, + "grad_norm": 2.004469573688461, + "k1_kl": 0.2138671875, + "k3_kl": 0.1611328125, + "kimi_kl": 0.5234375, + "learning_rate": 1.771e-07, + "loss": 0.0065, + "ppl": 0.06103515625, + "reward": 0.9889183640480042, + "reward_std": 0.003657742403447628, + "rewards/perpo_ocr_edit_distance_reward": 0.9889184236526489, + "step": 3229, + "temperature": 0.9 + }, + { + "advantages": -1.1580331147342804e-06, + "completion_length": 1299.0, + "delta_ref_entropy_loss": 0.03857421875, + "delta_ref_ppl": -0.08349609375, + "entropy_loss": -0.1455078125, + "epoch": 0.646, + "grad_norm": 1.7705312035117735, + "k1_kl": 0.08349609375, + "k3_kl": 0.05859375, + "kimi_kl": 0.19140625, + "learning_rate": 1.7699999999999998e-07, + "loss": 0.0023, + "ppl": 0.07275390625, + "reward": 0.9601808190345764, + "reward_std": 0.00739016430452466, + "rewards/perpo_ocr_edit_distance_reward": 0.9601808786392212, + "step": 3230, + "temperature": 0.9 + }, + { + "advantages": 1.774515476427041e-05, + "completion_length": 1074.0, + "delta_ref_entropy_loss": 0.037841796875, + "delta_ref_ppl": -0.043701171875, + "entropy_loss": -0.0947265625, + "epoch": 0.6462, + "grad_norm": 0.8470670478935534, + "k1_kl": 0.043701171875, + "k3_kl": 0.02734375, + "kimi_kl": 0.06640625, + "learning_rate": 1.769e-07, + "loss": 0.0011, + "ppl": 0.04052734375, + "reward": 0.9024742841720581, + "reward_std": 0.0013379198499023914, + "rewards/perpo_ocr_edit_distance_reward": 0.9024742841720581, + "step": 3231, + "temperature": 0.9 + }, + { + "advantages": -1.2014594176434912e-05, + "completion_length": 750.0, + "delta_ref_entropy_loss": 0.0177001953125, + "delta_ref_ppl": -0.03076171875, + "entropy_loss": -0.060302734375, + "epoch": 0.6464, + "grad_norm": 1.0331520838176036, + "k1_kl": 0.03076171875, + "k3_kl": 0.0198974609375, + "kimi_kl": 0.047119140625, + "learning_rate": 1.768e-07, + "loss": 0.0008, + "ppl": 0.026123046875, + "reward": 0.9877859354019165, + "reward_std": 0.00415780209004879, + "rewards/perpo_ocr_edit_distance_reward": 0.9877859950065613, + "step": 3232, + "temperature": 0.9 + }, + { + "advantages": -4.07184888899792e-05, + "completion_length": 365.0, + "delta_ref_entropy_loss": 0.0947265625, + "delta_ref_ppl": -0.1494140625, + "entropy_loss": -0.1552734375, + "epoch": 0.6466, + "grad_norm": 1.1635420559141727, + "k1_kl": 0.1494140625, + "k3_kl": 0.095703125, + "kimi_kl": 0.341796875, + "learning_rate": 1.767e-07, + "loss": 0.0039, + "ppl": 0.0615234375, + "reward": 0.9745544791221619, + "reward_std": 0.001782901119440794, + "rewards/perpo_ocr_edit_distance_reward": 0.9745545387268066, + "step": 3233, + "temperature": 0.9 + }, + { + "advantages": -2.174718065361958e-05, + "completion_length": 191.0, + "delta_ref_entropy_loss": 0.013671875, + "delta_ref_ppl": -0.279296875, + "entropy_loss": -0.154296875, + "epoch": 0.6468, + "grad_norm": 1.9266309704689022, + "k1_kl": 0.279296875, + "k3_kl": 0.22265625, + "kimi_kl": 0.82421875, + "learning_rate": 1.766e-07, + "loss": 0.0089, + "ppl": 0.0576171875, + "reward": 0.9689860343933105, + "reward_std": 0.002249360317364335, + "rewards/perpo_ocr_edit_distance_reward": 0.9689860343933105, + "step": 3234, + "temperature": 0.9 + }, + { + "advantages": -8.07217202236643e-06, + "completion_length": 293.0, + "delta_ref_entropy_loss": -0.000732421875, + "delta_ref_ppl": -0.0869140625, + "entropy_loss": -0.07861328125, + "epoch": 0.647, + "grad_norm": 1.5937770175933734, + "k1_kl": 0.0869140625, + "k3_kl": 0.06787109375, + "kimi_kl": 0.259765625, + "learning_rate": 1.7649999999999997e-07, + "loss": 0.0027, + "ppl": 0.044921875, + "reward": 0.9904252290725708, + "reward_std": 0.006222426891326904, + "rewards/perpo_ocr_edit_distance_reward": 0.9904253482818604, + "step": 3235, + "temperature": 0.9 + }, + { + "advantages": -5.040850010118447e-05, + "completion_length": 563.0, + "delta_ref_entropy_loss": 0.033935546875, + "delta_ref_ppl": -0.060546875, + "entropy_loss": -0.07177734375, + "epoch": 0.6472, + "grad_norm": 0.743465738737531, + "k1_kl": 0.060546875, + "k3_kl": 0.037353515625, + "kimi_kl": 0.103515625, + "learning_rate": 1.764e-07, + "loss": 0.0015, + "ppl": 0.031494140625, + "reward": 0.989844560623169, + "reward_std": 0.0007448000251315534, + "rewards/perpo_ocr_edit_distance_reward": 0.9898446202278137, + "step": 3236, + "temperature": 0.9 + }, + { + "advantages": -3.329345418023877e-05, + "completion_length": 550.0, + "delta_ref_entropy_loss": 0.041748046875, + "delta_ref_ppl": -0.07666015625, + "entropy_loss": -0.0751953125, + "epoch": 0.6474, + "grad_norm": 4.185386229229406, + "k1_kl": 0.07666015625, + "k3_kl": 0.05517578125, + "kimi_kl": 0.1689453125, + "learning_rate": 1.763e-07, + "loss": 0.0022, + "ppl": 0.03125, + "reward": 0.9791079163551331, + "reward_std": 0.0019452343694865704, + "rewards/perpo_ocr_edit_distance_reward": 0.9791079163551331, + "step": 3237, + "temperature": 0.9 + }, + { + "advantages": -3.358296089572832e-05, + "completion_length": 570.0, + "delta_ref_entropy_loss": 0.032470703125, + "delta_ref_ppl": -0.044677734375, + "entropy_loss": -0.03955078125, + "epoch": 0.6476, + "grad_norm": 0.2034612135739937, + "k1_kl": 0.044677734375, + "k3_kl": 0.023193359375, + "kimi_kl": 0.051513671875, + "learning_rate": 1.7619999999999998e-07, + "loss": 0.001, + "ppl": 0.01190185546875, + "reward": 0.99358069896698, + "reward_std": 0.0004071146540809423, + "rewards/perpo_ocr_edit_distance_reward": 0.99358069896698, + "step": 3238, + "temperature": 0.9 + }, + { + "advantages": -2.213887000834802e-06, + "completion_length": 645.0, + "delta_ref_entropy_loss": 0.07763671875, + "delta_ref_ppl": -0.119140625, + "entropy_loss": -0.2255859375, + "epoch": 0.6478, + "grad_norm": 1.5684248625054178, + "k1_kl": 0.11865234375, + "k3_kl": 0.07080078125, + "kimi_kl": 0.1572265625, + "learning_rate": 1.761e-07, + "loss": 0.0028, + "ppl": 0.1044921875, + "reward": 0.9075630307197571, + "reward_std": 0.0037761309649795294, + "rewards/perpo_ocr_edit_distance_reward": 0.9075630307197571, + "step": 3239, + "temperature": 0.9 + }, + { + "advantages": 1.7029899268550253e-08, + "completion_length": 193.0, + "delta_ref_entropy_loss": 0.0230712890625, + "delta_ref_ppl": -0.19140625, + "entropy_loss": -0.07763671875, + "epoch": 0.648, + "grad_norm": 1.0695325372490834, + "k1_kl": 0.1923828125, + "k3_kl": 0.1494140625, + "kimi_kl": 0.6328125, + "learning_rate": 1.76e-07, + "loss": 0.006, + "ppl": 0.02783203125, + "reward": 0.9960835576057434, + "reward_std": 0.0016853568376973271, + "rewards/perpo_ocr_edit_distance_reward": 0.9960835576057434, + "step": 3240, + "temperature": 0.9 + }, + { + "advantages": -1.3453620795189636e-06, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.004669189453125, + "delta_ref_ppl": -0.03857421875, + "entropy_loss": -0.08740234375, + "epoch": 0.6482, + "grad_norm": 6.40251886439729, + "k1_kl": 0.038330078125, + "k3_kl": 0.0291748046875, + "kimi_kl": 0.06884765625, + "learning_rate": 1.7589999999999998e-07, + "loss": 0.0012, + "ppl": 0.038330078125, + "reward": 0.8032123446464539, + "reward_std": 0.006239388138055801, + "rewards/perpo_ocr_edit_distance_reward": 0.8032124042510986, + "step": 3241, + "temperature": 0.9 + }, + { + "advantages": -3.955194188165478e-06, + "completion_length": 259.0, + "delta_ref_entropy_loss": 0.048095703125, + "delta_ref_ppl": -0.2080078125, + "entropy_loss": -0.11279296875, + "epoch": 0.6484, + "grad_norm": 1.109852800888222, + "k1_kl": 0.2080078125, + "k3_kl": 0.1650390625, + "kimi_kl": 0.81640625, + "learning_rate": 1.758e-07, + "loss": 0.0066, + "ppl": 0.046142578125, + "reward": 0.9868505001068115, + "reward_std": 0.002045302651822567, + "rewards/perpo_ocr_edit_distance_reward": 0.9868505001068115, + "step": 3242, + "temperature": 0.9 + }, + { + "advantages": 1.6553061868762597e-05, + "completion_length": 525.0, + "delta_ref_entropy_loss": 0.040283203125, + "delta_ref_ppl": -0.052001953125, + "entropy_loss": -0.06298828125, + "epoch": 0.6486, + "grad_norm": 0.6316533561412642, + "k1_kl": 0.05224609375, + "k3_kl": 0.03076171875, + "kimi_kl": 0.0693359375, + "learning_rate": 1.757e-07, + "loss": 0.0012, + "ppl": 0.023681640625, + "reward": 0.9974945783615112, + "reward_std": 0.000929530942812562, + "rewards/perpo_ocr_edit_distance_reward": 0.9974945783615112, + "step": 3243, + "temperature": 0.9 + }, + { + "advantages": -7.850783731555566e-06, + "completion_length": 941.0, + "delta_ref_entropy_loss": 0.04736328125, + "delta_ref_ppl": -0.07177734375, + "entropy_loss": -0.0673828125, + "epoch": 0.6488, + "grad_norm": 1.539330895721437, + "k1_kl": 0.07177734375, + "k3_kl": 0.05126953125, + "kimi_kl": 0.095703125, + "learning_rate": 1.756e-07, + "loss": 0.0021, + "ppl": 0.0303955078125, + "reward": 0.9684090614318848, + "reward_std": 0.008576387539505959, + "rewards/perpo_ocr_edit_distance_reward": 0.9684091210365295, + "step": 3244, + "temperature": 0.9 + }, + { + "advantages": -8.514949740856537e-07, + "completion_length": 680.0, + "delta_ref_entropy_loss": 0.034912109375, + "delta_ref_ppl": -0.08251953125, + "entropy_loss": -0.0830078125, + "epoch": 0.649, + "grad_norm": 0.7675918616993146, + "k1_kl": 0.08251953125, + "k3_kl": 0.053955078125, + "kimi_kl": 0.154296875, + "learning_rate": 1.7549999999999998e-07, + "loss": 0.0022, + "ppl": 0.02978515625, + "reward": 0.9515274167060852, + "reward_std": 0.0607360303401947, + "rewards/perpo_ocr_edit_distance_reward": 0.95152747631073, + "step": 3245, + "temperature": 0.9 + }, + { + "advantages": -4.460130730876699e-05, + "completion_length": 266.0, + "delta_ref_entropy_loss": 0.06640625, + "delta_ref_ppl": -0.181640625, + "entropy_loss": -0.0791015625, + "epoch": 0.6492, + "grad_norm": 1.8632644898063688, + "k1_kl": 0.181640625, + "k3_kl": 0.140625, + "kimi_kl": 0.55078125, + "learning_rate": 1.754e-07, + "loss": 0.0057, + "ppl": 0.0400390625, + "reward": 0.9723663330078125, + "reward_std": 0.0021910632494837046, + "rewards/perpo_ocr_edit_distance_reward": 0.972366452217102, + "step": 3246, + "temperature": 0.9 + }, + { + "advantages": -1.7029899268550253e-08, + "completion_length": 1100.0, + "delta_ref_entropy_loss": 0.07958984375, + "delta_ref_ppl": -0.09765625, + "entropy_loss": -0.27734375, + "epoch": 0.6494, + "grad_norm": 2.3587821767586297, + "k1_kl": 0.09814453125, + "k3_kl": 0.07177734375, + "kimi_kl": 0.15234375, + "learning_rate": 1.7530000000000001e-07, + "loss": 0.0029, + "ppl": 0.1416015625, + "reward": 0.8542259335517883, + "reward_std": 0.003369885729625821, + "rewards/perpo_ocr_edit_distance_reward": 0.8542259931564331, + "step": 3247, + "temperature": 0.9 + }, + { + "advantages": -6.461143493652344e-05, + "completion_length": 835.0, + "delta_ref_entropy_loss": 0.0439453125, + "delta_ref_ppl": -0.07421875, + "entropy_loss": -0.05810546875, + "epoch": 0.6496, + "grad_norm": 0.8638807416523429, + "k1_kl": 0.07421875, + "k3_kl": 0.043212890625, + "kimi_kl": 0.09814453125, + "learning_rate": 1.7519999999999998e-07, + "loss": 0.0018, + "ppl": 0.02099609375, + "reward": 0.988800048828125, + "reward_std": 0.0005589551292359829, + "rewards/perpo_ocr_edit_distance_reward": 0.9888001084327698, + "step": 3248, + "temperature": 0.9 + }, + { + "advantages": -6.214210588950664e-05, + "completion_length": 873.0, + "delta_ref_entropy_loss": 0.025634765625, + "delta_ref_ppl": -0.03515625, + "entropy_loss": -0.04736328125, + "epoch": 0.6498, + "grad_norm": 0.3176486989852211, + "k1_kl": 0.03515625, + "k3_kl": 0.02001953125, + "kimi_kl": 0.035888671875, + "learning_rate": 1.751e-07, + "loss": 0.0009, + "ppl": 0.0235595703125, + "reward": 0.9935115575790405, + "reward_std": 0.000585220055654645, + "rewards/perpo_ocr_edit_distance_reward": 0.9935115575790405, + "step": 3249, + "temperature": 0.9 + }, + { + "advantages": -0.00011496884690131992, + "completion_length": 385.0, + "delta_ref_entropy_loss": 0.039306640625, + "delta_ref_ppl": -0.1083984375, + "entropy_loss": -0.044189453125, + "epoch": 0.65, + "grad_norm": 0.7101031467146295, + "k1_kl": 0.1083984375, + "k3_kl": 0.08447265625, + "kimi_kl": 0.37109375, + "learning_rate": 1.75e-07, + "loss": 0.0035, + "ppl": 0.015625, + "reward": 0.9654539823532104, + "reward_std": 0.0008628491195850074, + "rewards/perpo_ocr_edit_distance_reward": 0.9654541611671448, + "step": 3250, + "temperature": 0.9 + }, + { + "advantages": -4.124641782254912e-05, + "completion_length": 1962.0, + "delta_ref_entropy_loss": 0.0216064453125, + "delta_ref_ppl": -0.04296875, + "entropy_loss": -0.11181640625, + "epoch": 0.6502, + "grad_norm": 2.6127413912849904, + "k1_kl": 0.043212890625, + "k3_kl": 0.04296875, + "kimi_kl": 0.055908203125, + "learning_rate": 1.7489999999999998e-07, + "loss": 0.0018, + "ppl": 0.0712890625, + "reward": 0.9767670631408691, + "reward_std": 0.0019623495172709227, + "rewards/perpo_ocr_edit_distance_reward": 0.9767671823501587, + "step": 3251, + "temperature": 0.9 + }, + { + "advantages": -1.97206227312563e-05, + "completion_length": 1278.0, + "delta_ref_entropy_loss": 0.00885009765625, + "delta_ref_ppl": -0.029296875, + "entropy_loss": -0.027587890625, + "epoch": 0.6504, + "grad_norm": 0.21743429583425303, + "k1_kl": 0.0294189453125, + "k3_kl": 0.01953125, + "kimi_kl": 0.04736328125, + "learning_rate": 1.748e-07, + "loss": 0.0008, + "ppl": 0.00830078125, + "reward": 0.9986733198165894, + "reward_std": 0.0003316534566693008, + "rewards/perpo_ocr_edit_distance_reward": 0.9986733198165894, + "step": 3252, + "temperature": 0.9 + }, + { + "advantages": -1.4236995411920361e-05, + "completion_length": 234.0, + "delta_ref_entropy_loss": 0.009765625, + "delta_ref_ppl": -0.09765625, + "entropy_loss": -0.1259765625, + "epoch": 0.6506, + "grad_norm": 1.4889023182842294, + "k1_kl": 0.09765625, + "k3_kl": 0.087890625, + "kimi_kl": 0.216796875, + "learning_rate": 1.747e-07, + "loss": 0.0035, + "ppl": 0.07275390625, + "reward": 0.9509890079498291, + "reward_std": 0.003488890826702118, + "rewards/perpo_ocr_edit_distance_reward": 0.9509890675544739, + "step": 3253, + "temperature": 0.9 + }, + { + "advantages": -0.00014391967852134258, + "completion_length": 527.0, + "delta_ref_entropy_loss": 0.01483154296875, + "delta_ref_ppl": -0.03857421875, + "entropy_loss": -0.037109375, + "epoch": 0.6508, + "grad_norm": 0.5522379836125678, + "k1_kl": 0.03857421875, + "k3_kl": 0.0262451171875, + "kimi_kl": 0.0791015625, + "learning_rate": 1.746e-07, + "loss": 0.0012, + "ppl": 0.01275634765625, + "reward": 0.9934158325195312, + "reward_std": 0.00037327376776374876, + "rewards/perpo_ocr_edit_distance_reward": 0.993415892124176, + "step": 3254, + "temperature": 0.9 + }, + { + "advantages": -1.005189824354602e-05, + "completion_length": 772.0, + "delta_ref_entropy_loss": 0.049560546875, + "delta_ref_ppl": -0.0771484375, + "entropy_loss": -0.1220703125, + "epoch": 0.651, + "grad_norm": 1.658444379559472, + "k1_kl": 0.0771484375, + "k3_kl": 0.0498046875, + "kimi_kl": 0.10791015625, + "learning_rate": 1.7449999999999998e-07, + "loss": 0.002, + "ppl": 0.062255859375, + "reward": 0.9665653109550476, + "reward_std": 0.0024415473453700542, + "rewards/perpo_ocr_edit_distance_reward": 0.9665653705596924, + "step": 3255, + "temperature": 0.9 + }, + { + "advantages": -3.68995351891499e-05, + "completion_length": 775.0, + "delta_ref_entropy_loss": 0.0634765625, + "delta_ref_ppl": -0.080078125, + "entropy_loss": -0.0537109375, + "epoch": 0.6512, + "grad_norm": 0.6399118793559199, + "k1_kl": 0.080078125, + "k3_kl": 0.052490234375, + "kimi_kl": 0.1884765625, + "learning_rate": 1.744e-07, + "loss": 0.0021, + "ppl": 0.02392578125, + "reward": 0.985939085483551, + "reward_std": 0.001284693251363933, + "rewards/perpo_ocr_edit_distance_reward": 0.985939085483551, + "step": 3256, + "temperature": 0.9 + }, + { + "advantages": -1.3692038919543847e-05, + "completion_length": 244.0, + "delta_ref_entropy_loss": 0.001953125, + "delta_ref_ppl": -0.142578125, + "entropy_loss": -0.236328125, + "epoch": 0.6514, + "grad_norm": 1.4732948827631311, + "k1_kl": 0.1435546875, + "k3_kl": 0.1064453125, + "kimi_kl": 0.322265625, + "learning_rate": 1.743e-07, + "loss": 0.0043, + "ppl": 0.083984375, + "reward": 0.8964829444885254, + "reward_std": 0.0030081754084676504, + "rewards/perpo_ocr_edit_distance_reward": 0.8964830040931702, + "step": 3257, + "temperature": 0.9 + }, + { + "advantages": -9.366444828629028e-07, + "completion_length": 343.0, + "delta_ref_entropy_loss": 0.047607421875, + "delta_ref_ppl": -0.10009765625, + "entropy_loss": -0.0791015625, + "epoch": 0.6516, + "grad_norm": 1.9589164692329473, + "k1_kl": 0.10009765625, + "k3_kl": 0.06982421875, + "kimi_kl": 0.1943359375, + "learning_rate": 1.7419999999999998e-07, + "loss": 0.0028, + "ppl": 0.03466796875, + "reward": 0.883047342300415, + "reward_std": 0.0177654679864645, + "rewards/perpo_ocr_edit_distance_reward": 0.8830474019050598, + "step": 3258, + "temperature": 0.9 + }, + { + "advantages": -5.311625500326045e-05, + "completion_length": 226.0, + "delta_ref_entropy_loss": 0.01556396484375, + "delta_ref_ppl": -0.11669921875, + "entropy_loss": -0.06787109375, + "epoch": 0.6518, + "grad_norm": 0.6533551504818889, + "k1_kl": 0.11669921875, + "k3_kl": 0.09130859375, + "kimi_kl": 0.357421875, + "learning_rate": 1.741e-07, + "loss": 0.0037, + "ppl": 0.0263671875, + "reward": 0.9843840003013611, + "reward_std": 0.0005410274607129395, + "rewards/perpo_ocr_edit_distance_reward": 0.9843840599060059, + "step": 3259, + "temperature": 0.9 + }, + { + "advantages": 8.957727004599292e-06, + "completion_length": 687.0, + "delta_ref_entropy_loss": 0.03955078125, + "delta_ref_ppl": -0.08642578125, + "entropy_loss": -0.123046875, + "epoch": 0.652, + "grad_norm": 0.8657758678633957, + "k1_kl": 0.08642578125, + "k3_kl": 0.05859375, + "kimi_kl": 0.1640625, + "learning_rate": 1.7399999999999997e-07, + "loss": 0.0023, + "ppl": 0.05810546875, + "reward": 0.9620903134346008, + "reward_std": 0.000849980628117919, + "rewards/perpo_ocr_edit_distance_reward": 0.9620903730392456, + "step": 3260, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 737.0, + "delta_ref_entropy_loss": 0.052978515625, + "delta_ref_ppl": -0.064453125, + "entropy_loss": -0.0732421875, + "epoch": 0.6522, + "grad_norm": 2.5751353004606266, + "k1_kl": 0.064453125, + "k3_kl": 0.035888671875, + "kimi_kl": 0.09423828125, + "learning_rate": 1.739e-07, + "loss": 0.0014, + "ppl": 0.031982421875, + "reward": 0.9941557049751282, + "reward_std": 0.001039993017911911, + "rewards/perpo_ocr_edit_distance_reward": 0.994155764579773, + "step": 3261, + "temperature": 0.9 + }, + { + "advantages": -2.653258343343623e-05, + "completion_length": 254.0, + "delta_ref_entropy_loss": 0.02587890625, + "delta_ref_ppl": -0.10107421875, + "entropy_loss": -0.05029296875, + "epoch": 0.6524, + "grad_norm": 0.9217856036126787, + "k1_kl": 0.1005859375, + "k3_kl": 0.0732421875, + "kimi_kl": 0.322265625, + "learning_rate": 1.738e-07, + "loss": 0.003, + "ppl": 0.016357421875, + "reward": 0.9957242608070374, + "reward_std": 0.0011841553496196866, + "rewards/perpo_ocr_edit_distance_reward": 0.9957243800163269, + "step": 3262, + "temperature": 0.9 + }, + { + "advantages": -1.3879367770641693e-06, + "completion_length": 696.0, + "delta_ref_entropy_loss": 0.0240478515625, + "delta_ref_ppl": -0.07861328125, + "entropy_loss": -0.08447265625, + "epoch": 0.6526, + "grad_norm": 0.8779170930471091, + "k1_kl": 0.07861328125, + "k3_kl": 0.05029296875, + "kimi_kl": 0.1396484375, + "learning_rate": 1.7369999999999997e-07, + "loss": 0.002, + "ppl": 0.0380859375, + "reward": 0.9598317742347717, + "reward_std": 0.042912885546684265, + "rewards/perpo_ocr_edit_distance_reward": 0.9598318338394165, + "step": 3263, + "temperature": 0.9 + }, + { + "advantages": -5.27926886206842e-07, + "completion_length": 1369.0, + "delta_ref_entropy_loss": 0.0230712890625, + "delta_ref_ppl": -0.09228515625, + "entropy_loss": -0.2021484375, + "epoch": 0.6528, + "grad_norm": 3.505162766136471, + "k1_kl": 0.09228515625, + "k3_kl": 0.0703125, + "kimi_kl": 0.1455078125, + "learning_rate": 1.736e-07, + "loss": 0.0028, + "ppl": 0.09814453125, + "reward": 0.8827309608459473, + "reward_std": 0.16541382670402527, + "rewards/perpo_ocr_edit_distance_reward": 0.8827310800552368, + "step": 3264, + "temperature": 0.9 + }, + { + "advantages": -7.846525841159746e-05, + "completion_length": 841.0, + "delta_ref_entropy_loss": 0.033203125, + "delta_ref_ppl": -0.05322265625, + "entropy_loss": -0.05126953125, + "epoch": 0.653, + "grad_norm": 4.675827401436543, + "k1_kl": 0.05322265625, + "k3_kl": 0.032470703125, + "kimi_kl": 0.0966796875, + "learning_rate": 1.7349999999999999e-07, + "loss": 0.0014, + "ppl": 0.0230712890625, + "reward": 0.9952154159545898, + "reward_std": 0.0007682265131734312, + "rewards/perpo_ocr_edit_distance_reward": 0.9952155351638794, + "step": 3265, + "temperature": 0.9 + }, + { + "advantages": -5.449567765936081e-07, + "completion_length": 589.0, + "delta_ref_entropy_loss": -0.0947265625, + "delta_ref_ppl": -0.1220703125, + "entropy_loss": -0.81640625, + "epoch": 0.6532, + "grad_norm": 4.067969607908174, + "k1_kl": 0.1220703125, + "k3_kl": 0.10791015625, + "kimi_kl": 0.2177734375, + "learning_rate": 1.7339999999999998e-07, + "loss": 0.0043, + "ppl": 0.412109375, + "reward": 0.3390761613845825, + "reward_std": 0.022848520427942276, + "rewards/perpo_ocr_edit_distance_reward": 0.3390761911869049, + "step": 3266, + "temperature": 0.9 + }, + { + "advantages": -5.007641811971553e-05, + "completion_length": 760.0, + "delta_ref_entropy_loss": 0.0286865234375, + "delta_ref_ppl": -0.045654296875, + "entropy_loss": -0.0390625, + "epoch": 0.6534, + "grad_norm": 1.5260929933649303, + "k1_kl": 0.04541015625, + "k3_kl": 0.0301513671875, + "kimi_kl": 0.05859375, + "learning_rate": 1.733e-07, + "loss": 0.0013, + "ppl": 0.01544189453125, + "reward": 0.9985896944999695, + "reward_std": 0.0005800433573313057, + "rewards/perpo_ocr_edit_distance_reward": 0.9985897541046143, + "step": 3267, + "temperature": 0.9 + }, + { + "advantages": -1.3215201761340722e-05, + "completion_length": 522.0, + "delta_ref_entropy_loss": 0.025634765625, + "delta_ref_ppl": -0.04931640625, + "entropy_loss": -0.045654296875, + "epoch": 0.6536, + "grad_norm": 0.7614409042999637, + "k1_kl": 0.049072265625, + "k3_kl": 0.034912109375, + "kimi_kl": 0.0966796875, + "learning_rate": 1.732e-07, + "loss": 0.0014, + "ppl": 0.019775390625, + "reward": 0.9978150129318237, + "reward_std": 0.0011893694754689932, + "rewards/perpo_ocr_edit_distance_reward": 0.9978150129318237, + "step": 3268, + "temperature": 0.9 + }, + { + "advantages": -0.0001087401615222916, + "completion_length": 470.0, + "delta_ref_entropy_loss": 0.043212890625, + "delta_ref_ppl": -0.1142578125, + "entropy_loss": -0.1142578125, + "epoch": 0.6538, + "grad_norm": 0.8646104475873861, + "k1_kl": 0.1142578125, + "k3_kl": 0.08251953125, + "kimi_kl": 0.26171875, + "learning_rate": 1.731e-07, + "loss": 0.0034, + "ppl": 0.049072265625, + "reward": 0.9917348623275757, + "reward_std": 0.0009179618209600449, + "rewards/perpo_ocr_edit_distance_reward": 0.9917349815368652, + "step": 3269, + "temperature": 0.9 + }, + { + "advantages": 2.9802322387695312e-08, + "completion_length": 579.0, + "delta_ref_entropy_loss": 0.0194091796875, + "delta_ref_ppl": -0.056640625, + "entropy_loss": -0.036865234375, + "epoch": 0.654, + "grad_norm": 0.33011004749864997, + "k1_kl": 0.056640625, + "k3_kl": 0.04150390625, + "kimi_kl": 0.1513671875, + "learning_rate": 1.7299999999999997e-07, + "loss": 0.0017, + "ppl": 0.01483154296875, + "reward": 0.9981669187545776, + "reward_std": 0.0005934516666457057, + "rewards/perpo_ocr_edit_distance_reward": 0.9981669187545776, + "step": 3270, + "temperature": 0.9 + }, + { + "advantages": -1.7472677427576855e-05, + "completion_length": 469.0, + "delta_ref_entropy_loss": 0.0272216796875, + "delta_ref_ppl": -0.09521484375, + "entropy_loss": -0.07373046875, + "epoch": 0.6542, + "grad_norm": 0.49800016474596914, + "k1_kl": 0.09521484375, + "k3_kl": 0.06982421875, + "kimi_kl": 0.2412109375, + "learning_rate": 1.729e-07, + "loss": 0.0028, + "ppl": 0.03125, + "reward": 0.9648303985595703, + "reward_std": 0.0013641300611197948, + "rewards/perpo_ocr_edit_distance_reward": 0.9648304581642151, + "step": 3271, + "temperature": 0.9 + }, + { + "advantages": -3.7465778746081924e-07, + "completion_length": 442.0, + "delta_ref_entropy_loss": 0.006500244140625, + "delta_ref_ppl": -0.10546875, + "entropy_loss": -0.12890625, + "epoch": 0.6544, + "grad_norm": 1.8764658464862989, + "k1_kl": 0.10595703125, + "k3_kl": 0.07421875, + "kimi_kl": 0.265625, + "learning_rate": 1.728e-07, + "loss": 0.003, + "ppl": 0.05615234375, + "reward": 0.9342113733291626, + "reward_std": 0.022122200578451157, + "rewards/perpo_ocr_edit_distance_reward": 0.9342114925384521, + "step": 3272, + "temperature": 0.9 + }, + { + "advantages": 2.895082786835701e-07, + "completion_length": 546.0, + "delta_ref_entropy_loss": -0.1611328125, + "delta_ref_ppl": -0.0732421875, + "entropy_loss": -0.65234375, + "epoch": 0.6546, + "grad_norm": 5.323502140971498, + "k1_kl": 0.07373046875, + "k3_kl": 0.09521484375, + "kimi_kl": 0.1669921875, + "learning_rate": 1.7269999999999998e-07, + "loss": 0.0038, + "ppl": 0.353515625, + "reward": 0.7893990278244019, + "reward_std": 0.14831188321113586, + "rewards/perpo_ocr_edit_distance_reward": 0.7893990278244019, + "step": 3273, + "temperature": 0.9 + }, + { + "advantages": -2.576623774075415e-05, + "completion_length": 172.0, + "delta_ref_entropy_loss": 0.043701171875, + "delta_ref_ppl": -0.23046875, + "entropy_loss": -0.1474609375, + "epoch": 0.6548, + "grad_norm": 1.7132112981560732, + "k1_kl": 0.2314453125, + "k3_kl": 0.17578125, + "kimi_kl": 0.54296875, + "learning_rate": 1.726e-07, + "loss": 0.0071, + "ppl": 0.06298828125, + "reward": 0.9882818460464478, + "reward_std": 0.003866403829306364, + "rewards/perpo_ocr_edit_distance_reward": 0.9882819652557373, + "step": 3274, + "temperature": 0.9 + }, + { + "advantages": -9.877342108666198e-07, + "completion_length": 1501.0, + "delta_ref_entropy_loss": 0.0147705078125, + "delta_ref_ppl": -0.06396484375, + "entropy_loss": -0.28125, + "epoch": 0.655, + "grad_norm": 1.7925885533900836, + "k1_kl": 0.06396484375, + "k3_kl": 0.044189453125, + "kimi_kl": 0.08984375, + "learning_rate": 1.725e-07, + "loss": 0.0018, + "ppl": 0.14453125, + "reward": 0.921824038028717, + "reward_std": 0.09391395002603531, + "rewards/perpo_ocr_edit_distance_reward": 0.9218242168426514, + "step": 3275, + "temperature": 0.9 + }, + { + "advantages": -3.440039654378779e-05, + "completion_length": 501.0, + "delta_ref_entropy_loss": 0.031982421875, + "delta_ref_ppl": -0.053955078125, + "entropy_loss": -0.0419921875, + "epoch": 0.6552, + "grad_norm": 0.8407575654174859, + "k1_kl": 0.053955078125, + "k3_kl": 0.0390625, + "kimi_kl": 0.11328125, + "learning_rate": 1.7239999999999998e-07, + "loss": 0.0016, + "ppl": 0.015380859375, + "reward": 0.9971507787704468, + "reward_std": 0.0016338026616722345, + "rewards/perpo_ocr_edit_distance_reward": 0.9971508979797363, + "step": 3276, + "temperature": 0.9 + }, + { + "advantages": -2.384185791015625e-07, + "completion_length": 28.0, + "delta_ref_entropy_loss": -0.609375, + "delta_ref_ppl": -1.609375, + "entropy_loss": -1.125, + "epoch": 0.6554, + "grad_norm": 14.418348522058068, + "k1_kl": 1.609375, + "k3_kl": 1.4765625, + "kimi_kl": 7.25, + "learning_rate": 1.723e-07, + "loss": 0.0591, + "ppl": 0.4375, + "reward": 0.8394423127174377, + "reward_std": 0.1751878708600998, + "rewards/perpo_ocr_edit_distance_reward": 0.8394424319267273, + "step": 3277, + "temperature": 0.9 + }, + { + "advantages": -1.0324376944481628e-06, + "completion_length": 461.0, + "delta_ref_entropy_loss": -0.00897216796875, + "delta_ref_ppl": -0.078125, + "entropy_loss": -0.16796875, + "epoch": 0.6556, + "grad_norm": 1.5005178687966096, + "k1_kl": 0.07763671875, + "k3_kl": 0.058349609375, + "kimi_kl": 0.150390625, + "learning_rate": 1.722e-07, + "loss": 0.0023, + "ppl": 0.076171875, + "reward": 0.9027596116065979, + "reward_std": 0.04037705063819885, + "rewards/perpo_ocr_edit_distance_reward": 0.9027596116065979, + "step": 3278, + "temperature": 0.9 + }, + { + "advantages": -5.747590876126196e-06, + "completion_length": 852.0, + "delta_ref_entropy_loss": 0.0625, + "delta_ref_ppl": -0.07373046875, + "entropy_loss": -0.1298828125, + "epoch": 0.6558, + "grad_norm": 1.2297766602996507, + "k1_kl": 0.07421875, + "k3_kl": 0.04736328125, + "kimi_kl": 0.12255859375, + "learning_rate": 1.7209999999999999e-07, + "loss": 0.0019, + "ppl": 0.0615234375, + "reward": 0.9272991418838501, + "reward_std": 0.004351933486759663, + "rewards/perpo_ocr_edit_distance_reward": 0.9272992014884949, + "step": 3279, + "temperature": 0.9 + }, + { + "advantages": -7.459095741069177e-06, + "completion_length": 1713.0, + "delta_ref_entropy_loss": 0.00341796875, + "delta_ref_ppl": -0.0272216796875, + "entropy_loss": -0.11376953125, + "epoch": 0.656, + "grad_norm": 2.050732011668434, + "k1_kl": 0.0272216796875, + "k3_kl": 0.0286865234375, + "kimi_kl": 0.039794921875, + "learning_rate": 1.7199999999999998e-07, + "loss": 0.0012, + "ppl": 0.064453125, + "reward": 0.973778247833252, + "reward_std": 0.00677450792863965, + "rewards/perpo_ocr_edit_distance_reward": 0.9737783074378967, + "step": 3280, + "temperature": 0.9 + }, + { + "advantages": -2.963202405226184e-06, + "completion_length": 177.0, + "delta_ref_entropy_loss": 0.03173828125, + "delta_ref_ppl": -0.193359375, + "entropy_loss": -0.18359375, + "epoch": 0.6562, + "grad_norm": 2.179789171111758, + "k1_kl": 0.193359375, + "k3_kl": 0.1416015625, + "kimi_kl": 0.470703125, + "learning_rate": 1.719e-07, + "loss": 0.0057, + "ppl": 0.0732421875, + "reward": 0.9665524959564209, + "reward_std": 0.002787104807794094, + "rewards/perpo_ocr_edit_distance_reward": 0.9665524959564209, + "step": 3281, + "temperature": 0.9 + }, + { + "advantages": -8.685248758411035e-06, + "completion_length": 397.0, + "delta_ref_entropy_loss": 0.06689453125, + "delta_ref_ppl": -0.1181640625, + "entropy_loss": -0.08349609375, + "epoch": 0.6564, + "grad_norm": 1.0776157563728326, + "k1_kl": 0.1181640625, + "k3_kl": 0.078125, + "kimi_kl": 0.24609375, + "learning_rate": 1.718e-07, + "loss": 0.0031, + "ppl": 0.0390625, + "reward": 0.9625738859176636, + "reward_std": 0.0018571376567706466, + "rewards/perpo_ocr_edit_distance_reward": 0.9625738859176636, + "step": 3282, + "temperature": 0.9 + }, + { + "advantages": -0.00013336114352568984, + "completion_length": 783.0, + "delta_ref_entropy_loss": 0.0279541015625, + "delta_ref_ppl": -0.05322265625, + "entropy_loss": -0.03271484375, + "epoch": 0.6566, + "grad_norm": 0.23105381790904295, + "k1_kl": 0.052978515625, + "k3_kl": 0.036865234375, + "kimi_kl": 0.12890625, + "learning_rate": 1.7169999999999998e-07, + "loss": 0.0016, + "ppl": 0.0106201171875, + "reward": 0.996974527835846, + "reward_std": 0.0002831115562003106, + "rewards/perpo_ocr_edit_distance_reward": 0.9969746470451355, + "step": 3283, + "temperature": 0.9 + }, + { + "advantages": -1.4424324945139233e-05, + "completion_length": 646.0, + "delta_ref_entropy_loss": 0.048095703125, + "delta_ref_ppl": -0.08154296875, + "entropy_loss": -0.0751953125, + "epoch": 0.6568, + "grad_norm": 0.8487611757000002, + "k1_kl": 0.08154296875, + "k3_kl": 0.05419921875, + "kimi_kl": 0.1943359375, + "learning_rate": 1.716e-07, + "loss": 0.0022, + "ppl": 0.029296875, + "reward": 0.9839919805526733, + "reward_std": 0.0010827223304659128, + "rewards/perpo_ocr_edit_distance_reward": 0.9839919805526733, + "step": 3284, + "temperature": 0.9 + }, + { + "advantages": -1.021793991640152e-07, + "completion_length": 97.0, + "delta_ref_entropy_loss": -0.1455078125, + "delta_ref_ppl": -0.302734375, + "entropy_loss": -0.6015625, + "epoch": 0.657, + "grad_norm": 6.1677586553673285, + "k1_kl": 0.302734375, + "k3_kl": 0.28515625, + "kimi_kl": 1.7421875, + "learning_rate": 1.715e-07, + "loss": 0.0114, + "ppl": 0.306640625, + "reward": 0.7203911542892456, + "reward_std": 0.35058850049972534, + "rewards/perpo_ocr_edit_distance_reward": 0.7203912138938904, + "step": 3285, + "temperature": 0.9 + }, + { + "advantages": -7.944447861518711e-05, + "completion_length": 609.0, + "delta_ref_entropy_loss": 0.0277099609375, + "delta_ref_ppl": -0.0654296875, + "entropy_loss": -0.052734375, + "epoch": 0.6572, + "grad_norm": 0.33156440838388623, + "k1_kl": 0.0654296875, + "k3_kl": 0.044921875, + "kimi_kl": 0.15234375, + "learning_rate": 1.7139999999999999e-07, + "loss": 0.0019, + "ppl": 0.017578125, + "reward": 0.9878401160240173, + "reward_std": 0.0005431485478766263, + "rewards/perpo_ocr_edit_distance_reward": 0.9878401756286621, + "step": 3286, + "temperature": 0.9 + }, + { + "advantages": -3.2356808787881164e-07, + "completion_length": 455.0, + "delta_ref_entropy_loss": -0.00125885009765625, + "delta_ref_ppl": -0.10791015625, + "entropy_loss": -0.3125, + "epoch": 0.6574, + "grad_norm": 4.181301592380546, + "k1_kl": 0.107421875, + "k3_kl": 0.07568359375, + "kimi_kl": 0.283203125, + "learning_rate": 1.713e-07, + "loss": 0.003, + "ppl": 0.150390625, + "reward": 0.5694430470466614, + "reward_std": 0.07720377296209335, + "rewards/perpo_ocr_edit_distance_reward": 0.5694431066513062, + "step": 3287, + "temperature": 0.9 + }, + { + "advantages": 2.0844596292590722e-05, + "completion_length": 366.0, + "delta_ref_entropy_loss": 0.036376953125, + "delta_ref_ppl": -0.06494140625, + "entropy_loss": -0.0654296875, + "epoch": 0.6576, + "grad_norm": 0.9125756000385968, + "k1_kl": 0.0654296875, + "k3_kl": 0.044189453125, + "kimi_kl": 0.1435546875, + "learning_rate": 1.7119999999999997e-07, + "loss": 0.0018, + "ppl": 0.0250244140625, + "reward": 0.9957941770553589, + "reward_std": 0.000716696958988905, + "rewards/perpo_ocr_edit_distance_reward": 0.9957941770553589, + "step": 3288, + "temperature": 0.9 + }, + { + "advantages": -2.7537347705219872e-05, + "completion_length": 990.0, + "delta_ref_entropy_loss": 0.02490234375, + "delta_ref_ppl": -0.0595703125, + "entropy_loss": -0.058349609375, + "epoch": 0.6578, + "grad_norm": 2.3535472004445364, + "k1_kl": 0.0595703125, + "k3_kl": 0.04052734375, + "kimi_kl": 0.12890625, + "learning_rate": 1.711e-07, + "loss": 0.0016, + "ppl": 0.023681640625, + "reward": 0.9949532151222229, + "reward_std": 0.0020621195435523987, + "rewards/perpo_ocr_edit_distance_reward": 0.9949532747268677, + "step": 3289, + "temperature": 0.9 + }, + { + "advantages": -1.3862338164472021e-05, + "completion_length": 802.0, + "delta_ref_entropy_loss": 0.0220947265625, + "delta_ref_ppl": -0.059326171875, + "entropy_loss": -0.05615234375, + "epoch": 0.658, + "grad_norm": 0.9121748491980962, + "k1_kl": 0.059326171875, + "k3_kl": 0.03564453125, + "kimi_kl": 0.0908203125, + "learning_rate": 1.71e-07, + "loss": 0.0014, + "ppl": 0.0238037109375, + "reward": 0.9961349368095398, + "reward_std": 0.002355561126023531, + "rewards/perpo_ocr_edit_distance_reward": 0.9961349964141846, + "step": 3290, + "temperature": 0.9 + }, + { + "advantages": -1.2338162378000561e-05, + "completion_length": 770.0, + "delta_ref_entropy_loss": 0.03173828125, + "delta_ref_ppl": -0.05615234375, + "entropy_loss": -0.050537109375, + "epoch": 0.6582, + "grad_norm": 0.37516722538650105, + "k1_kl": 0.05615234375, + "k3_kl": 0.0361328125, + "kimi_kl": 0.11669921875, + "learning_rate": 1.709e-07, + "loss": 0.0015, + "ppl": 0.0189208984375, + "reward": 0.983845591545105, + "reward_std": 0.0005897781811654568, + "rewards/perpo_ocr_edit_distance_reward": 0.983845591545105, + "step": 3291, + "temperature": 0.9 + }, + { + "advantages": -1.9618444639490917e-05, + "completion_length": 507.0, + "delta_ref_entropy_loss": 0.034912109375, + "delta_ref_ppl": -0.1005859375, + "entropy_loss": -0.0966796875, + "epoch": 0.6584, + "grad_norm": 0.8055453352606199, + "k1_kl": 0.1005859375, + "k3_kl": 0.06689453125, + "kimi_kl": 0.1904296875, + "learning_rate": 1.708e-07, + "loss": 0.0027, + "ppl": 0.04345703125, + "reward": 0.9773366451263428, + "reward_std": 0.0016367969801649451, + "rewards/perpo_ocr_edit_distance_reward": 0.9773367047309875, + "step": 3292, + "temperature": 0.9 + }, + { + "advantages": -4.938671054333099e-07, + "completion_length": 474.0, + "delta_ref_entropy_loss": -0.2001953125, + "delta_ref_ppl": -0.10302734375, + "entropy_loss": -0.421875, + "epoch": 0.6586, + "grad_norm": 1.9859280810340945, + "k1_kl": 0.10302734375, + "k3_kl": 0.12060546875, + "kimi_kl": 0.34375, + "learning_rate": 1.7069999999999999e-07, + "loss": 0.0048, + "ppl": 0.16796875, + "reward": 0.7843724489212036, + "reward_std": 0.16029545664787292, + "rewards/perpo_ocr_edit_distance_reward": 0.7843725085258484, + "step": 3293, + "temperature": 0.9 + }, + { + "advantages": -0.00011843443644465879, + "completion_length": 1062.0, + "delta_ref_entropy_loss": 0.0220947265625, + "delta_ref_ppl": -0.039306640625, + "entropy_loss": -0.035400390625, + "epoch": 0.6588, + "grad_norm": 0.2399011992941665, + "k1_kl": 0.039306640625, + "k3_kl": 0.0252685546875, + "kimi_kl": 0.07763671875, + "learning_rate": 1.706e-07, + "loss": 0.0011, + "ppl": 0.01141357421875, + "reward": 0.9985895752906799, + "reward_std": 0.0003314840141683817, + "rewards/perpo_ocr_edit_distance_reward": 0.9985895752906799, + "step": 3294, + "temperature": 0.9 + }, + { + "advantages": -6.448796921176836e-05, + "completion_length": 877.0, + "delta_ref_entropy_loss": 0.017333984375, + "delta_ref_ppl": -0.039794921875, + "entropy_loss": -0.045654296875, + "epoch": 0.659, + "grad_norm": 0.4990551143513372, + "k1_kl": 0.039794921875, + "k3_kl": 0.0252685546875, + "kimi_kl": 0.06591796875, + "learning_rate": 1.705e-07, + "loss": 0.0011, + "ppl": 0.0167236328125, + "reward": 0.9963854551315308, + "reward_std": 0.0004282205190975219, + "rewards/perpo_ocr_edit_distance_reward": 0.9963854551315308, + "step": 3295, + "temperature": 0.9 + }, + { + "advantages": 2.6566642645775573e-06, + "completion_length": 114.0, + "delta_ref_entropy_loss": 0.07421875, + "delta_ref_ppl": -0.2197265625, + "entropy_loss": -0.16796875, + "epoch": 0.6592, + "grad_norm": 1.5432193171825992, + "k1_kl": 0.2197265625, + "k3_kl": 0.1865234375, + "kimi_kl": 0.53125, + "learning_rate": 1.704e-07, + "loss": 0.0075, + "ppl": 0.06884765625, + "reward": 0.9123809933662415, + "reward_std": 0.00310504250228405, + "rewards/perpo_ocr_edit_distance_reward": 0.9123809337615967, + "step": 3296, + "temperature": 0.9 + }, + { + "advantages": -3.187145557603799e-05, + "completion_length": 380.0, + "delta_ref_entropy_loss": 0.00872802734375, + "delta_ref_ppl": -0.044921875, + "entropy_loss": -0.038330078125, + "epoch": 0.6594, + "grad_norm": 0.9731719740657817, + "k1_kl": 0.045166015625, + "k3_kl": 0.035400390625, + "kimi_kl": 0.11669921875, + "learning_rate": 1.703e-07, + "loss": 0.0014, + "ppl": 0.016357421875, + "reward": 0.9907805323600769, + "reward_std": 0.0020372262224555016, + "rewards/perpo_ocr_edit_distance_reward": 0.9907805919647217, + "step": 3297, + "temperature": 0.9 + }, + { + "advantages": -1.5173640349530615e-05, + "completion_length": 448.0, + "delta_ref_entropy_loss": 0.01104736328125, + "delta_ref_ppl": -0.080078125, + "entropy_loss": -0.04296875, + "epoch": 0.6596, + "grad_norm": 0.9746710497123485, + "k1_kl": 0.080078125, + "k3_kl": 0.056396484375, + "kimi_kl": 0.220703125, + "learning_rate": 1.7019999999999998e-07, + "loss": 0.0023, + "ppl": 0.0150146484375, + "reward": 0.9983551502227783, + "reward_std": 0.0010220922995358706, + "rewards/perpo_ocr_edit_distance_reward": 0.9983552098274231, + "step": 3298, + "temperature": 0.9 + }, + { + "advantages": -2.2138868871479644e-07, + "completion_length": 148.0, + "delta_ref_entropy_loss": -0.134765625, + "delta_ref_ppl": -0.484375, + "entropy_loss": -0.921875, + "epoch": 0.6598, + "grad_norm": 10.873051202185698, + "k1_kl": 0.484375, + "k3_kl": 0.40234375, + "kimi_kl": 1.4453125, + "learning_rate": 1.701e-07, + "loss": 0.0161, + "ppl": 0.486328125, + "reward": 0.6646859049797058, + "reward_std": 0.3012255132198334, + "rewards/perpo_ocr_edit_distance_reward": 0.6646860241889954, + "step": 3299, + "temperature": 0.9 + }, + { + "advantages": -4.257474817137563e-09, + "completion_length": 552.0, + "delta_ref_entropy_loss": 0.055908203125, + "delta_ref_ppl": -0.087890625, + "entropy_loss": -0.05322265625, + "epoch": 0.66, + "grad_norm": 0.27790884099552743, + "k1_kl": 0.087890625, + "k3_kl": 0.056640625, + "kimi_kl": 0.240234375, + "learning_rate": 1.7000000000000001e-07, + "loss": 0.0023, + "ppl": 0.0194091796875, + "reward": 0.9713292717933655, + "reward_std": 0.0002917454403359443, + "rewards/perpo_ocr_edit_distance_reward": 0.9713293313980103, + "step": 3300, + "temperature": 0.9 + }, + { + "advantages": -3.334454231662676e-05, + "completion_length": 586.0, + "delta_ref_entropy_loss": 0.058837890625, + "delta_ref_ppl": -0.09521484375, + "entropy_loss": -0.1064453125, + "epoch": 0.6602, + "grad_norm": 1.1712682773754803, + "k1_kl": 0.095703125, + "k3_kl": 0.057861328125, + "kimi_kl": 0.1474609375, + "learning_rate": 1.6989999999999998e-07, + "loss": 0.0023, + "ppl": 0.05078125, + "reward": 0.9531126618385315, + "reward_std": 0.0024533341638743877, + "rewards/perpo_ocr_edit_distance_reward": 0.953112781047821, + "step": 3301, + "temperature": 0.9 + }, + { + "advantages": -5.960464477539062e-07, + "completion_length": 551.0, + "delta_ref_entropy_loss": 0.06298828125, + "delta_ref_ppl": -0.1484375, + "entropy_loss": -0.19921875, + "epoch": 0.6604, + "grad_norm": 1.003069409838016, + "k1_kl": 0.1484375, + "k3_kl": 0.09326171875, + "kimi_kl": 0.2421875, + "learning_rate": 1.698e-07, + "loss": 0.0037, + "ppl": 0.09033203125, + "reward": 0.017956901341676712, + "reward_std": 0.002155838767066598, + "rewards/perpo_ocr_edit_distance_reward": 0.01795690320432186, + "step": 3302, + "temperature": 0.9 + }, + { + "advantages": -7.944448043417651e-06, + "completion_length": 304.0, + "delta_ref_entropy_loss": 0.053466796875, + "delta_ref_ppl": -0.1767578125, + "entropy_loss": -0.294921875, + "epoch": 0.6606, + "grad_norm": 2.309949756020677, + "k1_kl": 0.1767578125, + "k3_kl": 0.1376953125, + "kimi_kl": 0.47265625, + "learning_rate": 1.697e-07, + "loss": 0.0055, + "ppl": 0.12353515625, + "reward": 0.9600411057472229, + "reward_std": 0.008480580523610115, + "rewards/perpo_ocr_edit_distance_reward": 0.9600412249565125, + "step": 3303, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 576.0, + "delta_ref_entropy_loss": 0.046630859375, + "delta_ref_ppl": -0.052734375, + "entropy_loss": -0.04296875, + "epoch": 0.6608, + "grad_norm": 0.754473111513462, + "k1_kl": 0.052734375, + "k3_kl": 0.03125, + "kimi_kl": 0.07373046875, + "learning_rate": 1.6959999999999998e-07, + "loss": 0.0013, + "ppl": 0.0157470703125, + "reward": 0.9975140690803528, + "reward_std": 0.0010260078124701977, + "rewards/perpo_ocr_edit_distance_reward": 0.9975140690803528, + "step": 3304, + "temperature": 0.9 + }, + { + "advantages": -4.3307034502504393e-05, + "completion_length": 527.0, + "delta_ref_entropy_loss": 0.055908203125, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.061279296875, + "epoch": 0.661, + "grad_norm": 0.6389926503287621, + "k1_kl": 0.06982421875, + "k3_kl": 0.043212890625, + "kimi_kl": 0.1220703125, + "learning_rate": 1.695e-07, + "loss": 0.0018, + "ppl": 0.0211181640625, + "reward": 0.9885802268981934, + "reward_std": 0.0016692881472408772, + "rewards/perpo_ocr_edit_distance_reward": 0.9885803461074829, + "step": 3305, + "temperature": 0.9 + }, + { + "advantages": -1.5939986042212695e-05, + "completion_length": 308.0, + "delta_ref_entropy_loss": 0.01556396484375, + "delta_ref_ppl": -0.10888671875, + "entropy_loss": -0.055419921875, + "epoch": 0.6612, + "grad_norm": 0.7703428288050543, + "k1_kl": 0.10888671875, + "k3_kl": 0.08251953125, + "kimi_kl": 0.2890625, + "learning_rate": 1.694e-07, + "loss": 0.0033, + "ppl": 0.0224609375, + "reward": 0.9952235817909241, + "reward_std": 0.001501533086411655, + "rewards/perpo_ocr_edit_distance_reward": 0.9952236413955688, + "step": 3306, + "temperature": 0.9 + }, + { + "advantages": -2.939360638265498e-05, + "completion_length": 1663.0, + "delta_ref_entropy_loss": 0.00970458984375, + "delta_ref_ppl": -0.0322265625, + "entropy_loss": -0.0576171875, + "epoch": 0.6614, + "grad_norm": 2.127799942638608, + "k1_kl": 0.0322265625, + "k3_kl": 0.0284423828125, + "kimi_kl": 0.06689453125, + "learning_rate": 1.693e-07, + "loss": 0.0012, + "ppl": 0.0294189453125, + "reward": 0.9803398847579956, + "reward_std": 0.003666957374662161, + "rewards/perpo_ocr_edit_distance_reward": 0.9803400039672852, + "step": 3307, + "temperature": 0.9 + }, + { + "advantages": -6.370459595927969e-05, + "completion_length": 538.0, + "delta_ref_entropy_loss": 0.043212890625, + "delta_ref_ppl": -0.07177734375, + "entropy_loss": -0.03759765625, + "epoch": 0.6616, + "grad_norm": 0.25489392839424857, + "k1_kl": 0.07177734375, + "k3_kl": 0.04638671875, + "kimi_kl": 0.15234375, + "learning_rate": 1.6919999999999998e-07, + "loss": 0.0019, + "ppl": 0.0111083984375, + "reward": 0.9978579878807068, + "reward_std": 0.0005682491464540362, + "rewards/perpo_ocr_edit_distance_reward": 0.9978581070899963, + "step": 3308, + "temperature": 0.9 + }, + { + "advantages": 7.646424819540698e-06, + "completion_length": 477.0, + "delta_ref_entropy_loss": 0.050048828125, + "delta_ref_ppl": -0.0791015625, + "entropy_loss": -0.1064453125, + "epoch": 0.6618, + "grad_norm": 1.2302041068357628, + "k1_kl": 0.0791015625, + "k3_kl": 0.05126953125, + "kimi_kl": 0.126953125, + "learning_rate": 1.691e-07, + "loss": 0.002, + "ppl": 0.048095703125, + "reward": 0.9731415510177612, + "reward_std": 0.002127352636307478, + "rewards/perpo_ocr_edit_distance_reward": 0.9731415510177612, + "step": 3309, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 286.0, + "delta_ref_entropy_loss": 0.02587890625, + "delta_ref_ppl": -0.10791015625, + "entropy_loss": -0.0458984375, + "epoch": 0.662, + "grad_norm": 0.6241648904114595, + "k1_kl": 0.1083984375, + "k3_kl": 0.07568359375, + "kimi_kl": 0.2490234375, + "learning_rate": 1.69e-07, + "loss": 0.003, + "ppl": 0.0174560546875, + "reward": 0.9947546720504761, + "reward_std": 0.0010686402674764395, + "rewards/perpo_ocr_edit_distance_reward": 0.9947546720504761, + "step": 3310, + "temperature": 0.9 + }, + { + "advantages": 1.0090215255331714e-05, + "completion_length": 213.0, + "delta_ref_entropy_loss": 0.021484375, + "delta_ref_ppl": -0.1376953125, + "entropy_loss": -0.0615234375, + "epoch": 0.6622, + "grad_norm": 1.0658909921280604, + "k1_kl": 0.1376953125, + "k3_kl": 0.1103515625, + "kimi_kl": 0.4296875, + "learning_rate": 1.6889999999999998e-07, + "loss": 0.0044, + "ppl": 0.0260009765625, + "reward": 0.9918187856674194, + "reward_std": 0.0015854369848966599, + "rewards/perpo_ocr_edit_distance_reward": 0.9918187260627747, + "step": 3311, + "temperature": 0.9 + }, + { + "advantages": -0.00013979844516143203, + "completion_length": 439.0, + "delta_ref_entropy_loss": 0.036865234375, + "delta_ref_ppl": -0.06787109375, + "entropy_loss": -0.039794921875, + "epoch": 0.6624, + "grad_norm": 0.3967225373763382, + "k1_kl": 0.06787109375, + "k3_kl": 0.0478515625, + "kimi_kl": 0.1640625, + "learning_rate": 1.688e-07, + "loss": 0.0021, + "ppl": 0.0147705078125, + "reward": 0.9961248636245728, + "reward_std": 0.0005090624326840043, + "rewards/perpo_ocr_edit_distance_reward": 0.9961249828338623, + "step": 3312, + "temperature": 0.9 + }, + { + "advantages": -5.156653423910029e-05, + "completion_length": 622.0, + "delta_ref_entropy_loss": 0.03515625, + "delta_ref_ppl": -0.06982421875, + "entropy_loss": -0.0361328125, + "epoch": 0.6626, + "grad_norm": 0.5536873635895821, + "k1_kl": 0.06982421875, + "k3_kl": 0.046142578125, + "kimi_kl": 0.1611328125, + "learning_rate": 1.6869999999999997e-07, + "loss": 0.0019, + "ppl": 0.0152587890625, + "reward": 0.9974870085716248, + "reward_std": 0.0007254123338498175, + "rewards/perpo_ocr_edit_distance_reward": 0.9974871277809143, + "step": 3313, + "temperature": 0.9 + }, + { + "advantages": 2.711160050239414e-05, + "completion_length": 408.0, + "delta_ref_entropy_loss": 0.080078125, + "delta_ref_ppl": -0.1103515625, + "entropy_loss": -0.08251953125, + "epoch": 0.6628, + "grad_norm": 0.7806666280977922, + "k1_kl": 0.1103515625, + "k3_kl": 0.0703125, + "kimi_kl": 0.2490234375, + "learning_rate": 1.686e-07, + "loss": 0.0028, + "ppl": 0.03466796875, + "reward": 0.975831925868988, + "reward_std": 0.0008414539624936879, + "rewards/perpo_ocr_edit_distance_reward": 0.9758318662643433, + "step": 3314, + "temperature": 0.9 + }, + { + "advantages": -9.170600606012158e-06, + "completion_length": 486.0, + "delta_ref_entropy_loss": 0.057861328125, + "delta_ref_ppl": -0.1123046875, + "entropy_loss": -0.439453125, + "epoch": 0.663, + "grad_norm": 1.8995787051090658, + "k1_kl": 0.1123046875, + "k3_kl": 0.08349609375, + "kimi_kl": 0.1962890625, + "learning_rate": 1.685e-07, + "loss": 0.0034, + "ppl": 0.2275390625, + "reward": 0.8483556509017944, + "reward_std": 0.0045438543893396854, + "rewards/perpo_ocr_edit_distance_reward": 0.8483557105064392, + "step": 3315, + "temperature": 0.9 + }, + { + "advantages": -5.635193519992754e-05, + "completion_length": 461.0, + "delta_ref_entropy_loss": 0.095703125, + "delta_ref_ppl": -0.1181640625, + "entropy_loss": -0.10986328125, + "epoch": 0.6632, + "grad_norm": 1.6538912402566919, + "k1_kl": 0.11865234375, + "k3_kl": 0.06982421875, + "kimi_kl": 0.146484375, + "learning_rate": 1.684e-07, + "loss": 0.0029, + "ppl": 0.0478515625, + "reward": 0.9569314122200012, + "reward_std": 0.0018642055802047253, + "rewards/perpo_ocr_edit_distance_reward": 0.9569315910339355, + "step": 3316, + "temperature": 0.9 + }, + { + "advantages": -6.309577656793408e-06, + "completion_length": 612.0, + "delta_ref_entropy_loss": 0.0145263671875, + "delta_ref_ppl": -0.1328125, + "entropy_loss": -0.6328125, + "epoch": 0.6634, + "grad_norm": 3.4235486585253825, + "k1_kl": 0.1318359375, + "k3_kl": 0.10546875, + "kimi_kl": 0.1806640625, + "learning_rate": 1.683e-07, + "loss": 0.0042, + "ppl": 0.337890625, + "reward": 0.5848340392112732, + "reward_std": 0.005301459692418575, + "rewards/perpo_ocr_edit_distance_reward": 0.584834098815918, + "step": 3317, + "temperature": 0.9 + }, + { + "advantages": -1.7029899268550253e-08, + "completion_length": 271.0, + "delta_ref_entropy_loss": 0.052490234375, + "delta_ref_ppl": -0.1357421875, + "entropy_loss": -0.1015625, + "epoch": 0.6636, + "grad_norm": 1.1375114061002853, + "k1_kl": 0.134765625, + "k3_kl": 0.1015625, + "kimi_kl": 0.3828125, + "learning_rate": 1.6819999999999998e-07, + "loss": 0.0041, + "ppl": 0.04150390625, + "reward": 0.7042460441589355, + "reward_std": 0.0011182369198650122, + "rewards/perpo_ocr_edit_distance_reward": 0.7042459845542908, + "step": 3318, + "temperature": 0.9 + }, + { + "advantages": -9.624447557143867e-05, + "completion_length": 286.0, + "delta_ref_entropy_loss": 0.055908203125, + "delta_ref_ppl": -0.177734375, + "entropy_loss": -0.0712890625, + "epoch": 0.6638, + "grad_norm": 0.8351598889043572, + "k1_kl": 0.177734375, + "k3_kl": 0.1328125, + "kimi_kl": 0.474609375, + "learning_rate": 1.681e-07, + "loss": 0.0054, + "ppl": 0.033203125, + "reward": 0.9961522817611694, + "reward_std": 0.001050198683515191, + "rewards/perpo_ocr_edit_distance_reward": 0.996152400970459, + "step": 3319, + "temperature": 0.9 + }, + { + "advantages": 7.527215529989917e-06, + "completion_length": 529.0, + "delta_ref_entropy_loss": 0.058349609375, + "delta_ref_ppl": -0.1318359375, + "entropy_loss": -0.111328125, + "epoch": 0.664, + "grad_norm": 1.0938794875605469, + "k1_kl": 0.1318359375, + "k3_kl": 0.083984375, + "kimi_kl": 0.20703125, + "learning_rate": 1.68e-07, + "loss": 0.0034, + "ppl": 0.046630859375, + "reward": 0.9687861800193787, + "reward_std": 0.00216281833127141, + "rewards/perpo_ocr_edit_distance_reward": 0.9687861204147339, + "step": 3320, + "temperature": 0.9 + }, + { + "advantages": -5.447864896268584e-05, + "completion_length": 976.0, + "delta_ref_entropy_loss": 0.00823974609375, + "delta_ref_ppl": -0.034423828125, + "entropy_loss": -0.0390625, + "epoch": 0.6642, + "grad_norm": 0.617304783411672, + "k1_kl": 0.034423828125, + "k3_kl": 0.0311279296875, + "kimi_kl": 0.07958984375, + "learning_rate": 1.679e-07, + "loss": 0.0013, + "ppl": 0.0166015625, + "reward": 0.9809728860855103, + "reward_std": 0.0008378551574423909, + "rewards/perpo_ocr_edit_distance_reward": 0.9809728860855103, + "step": 3321, + "temperature": 0.9 + }, + { + "advantages": 8.514949740856537e-07, + "completion_length": 317.0, + "delta_ref_entropy_loss": 0.00946044921875, + "delta_ref_ppl": -0.17578125, + "entropy_loss": -0.74609375, + "epoch": 0.6644, + "grad_norm": 2.999870173176203, + "k1_kl": 0.1767578125, + "k3_kl": 0.1279296875, + "kimi_kl": 0.265625, + "learning_rate": 1.678e-07, + "loss": 0.0051, + "ppl": 0.3671875, + "reward": 0.6382513046264648, + "reward_std": 0.019724586978554726, + "rewards/perpo_ocr_edit_distance_reward": 0.6382513046264648, + "step": 3322, + "temperature": 0.9 + }, + { + "advantages": -1.6774451069068164e-05, + "completion_length": 418.0, + "delta_ref_entropy_loss": 0.032958984375, + "delta_ref_ppl": -0.072265625, + "entropy_loss": -0.054931640625, + "epoch": 0.6646, + "grad_norm": 0.830825254986902, + "k1_kl": 0.072265625, + "k3_kl": 0.051513671875, + "kimi_kl": 0.19921875, + "learning_rate": 1.6769999999999997e-07, + "loss": 0.0021, + "ppl": 0.0244140625, + "reward": 0.9971433877944946, + "reward_std": 0.001933324383571744, + "rewards/perpo_ocr_edit_distance_reward": 0.9971434473991394, + "step": 3323, + "temperature": 0.9 + }, + { + "advantages": -7.474848825950176e-05, + "completion_length": 517.0, + "delta_ref_entropy_loss": 0.054443359375, + "delta_ref_ppl": -0.0712890625, + "entropy_loss": -0.0439453125, + "epoch": 0.6648, + "grad_norm": 0.3354271337740802, + "k1_kl": 0.07080078125, + "k3_kl": 0.0439453125, + "kimi_kl": 0.126953125, + "learning_rate": 1.676e-07, + "loss": 0.0018, + "ppl": 0.0167236328125, + "reward": 0.9961534142494202, + "reward_std": 0.00035570672480389476, + "rewards/perpo_ocr_edit_distance_reward": 0.9961534738540649, + "step": 3324, + "temperature": 0.9 + }, + { + "advantages": -3.167561226291582e-05, + "completion_length": 770.0, + "delta_ref_entropy_loss": 0.08544921875, + "delta_ref_ppl": -0.08447265625, + "entropy_loss": -0.057861328125, + "epoch": 0.665, + "grad_norm": 0.3304850475596103, + "k1_kl": 0.08447265625, + "k3_kl": 0.046630859375, + "kimi_kl": 0.1298828125, + "learning_rate": 1.675e-07, + "loss": 0.0019, + "ppl": 0.0213623046875, + "reward": 0.9484683275222778, + "reward_std": 0.0004373200354166329, + "rewards/perpo_ocr_edit_distance_reward": 0.9484683871269226, + "step": 3325, + "temperature": 0.9 + }, + { + "advantages": -4.040343628730625e-05, + "completion_length": 543.0, + "delta_ref_entropy_loss": 0.043701171875, + "delta_ref_ppl": -0.07421875, + "entropy_loss": -0.091796875, + "epoch": 0.6652, + "grad_norm": 0.7512989528103658, + "k1_kl": 0.07421875, + "k3_kl": 0.05419921875, + "kimi_kl": 0.138671875, + "learning_rate": 1.6739999999999998e-07, + "loss": 0.0022, + "ppl": 0.033203125, + "reward": 0.9667424559593201, + "reward_std": 0.0015859405975788832, + "rewards/perpo_ocr_edit_distance_reward": 0.9667425155639648, + "step": 3326, + "temperature": 0.9 + }, + { + "advantages": -5.728858013753779e-05, + "completion_length": 701.0, + "delta_ref_entropy_loss": 0.0240478515625, + "delta_ref_ppl": -0.042236328125, + "entropy_loss": -0.04638671875, + "epoch": 0.6654, + "grad_norm": 0.35221614823330083, + "k1_kl": 0.042236328125, + "k3_kl": 0.026123046875, + "kimi_kl": 0.0751953125, + "learning_rate": 1.673e-07, + "loss": 0.0011, + "ppl": 0.01556396484375, + "reward": 0.9961158633232117, + "reward_std": 0.0007918050978332758, + "rewards/perpo_ocr_edit_distance_reward": 0.9961159229278564, + "step": 3327, + "temperature": 0.9 + }, + { + "advantages": -1.7200198271893896e-05, + "completion_length": 522.0, + "delta_ref_entropy_loss": 0.0615234375, + "delta_ref_ppl": -0.06982421875, + "entropy_loss": -0.0888671875, + "epoch": 0.6656, + "grad_norm": 0.8265886809361592, + "k1_kl": 0.0693359375, + "k3_kl": 0.041259765625, + "kimi_kl": 0.119140625, + "learning_rate": 1.672e-07, + "loss": 0.0017, + "ppl": 0.03564453125, + "reward": 0.9858518242835999, + "reward_std": 0.001879852614365518, + "rewards/perpo_ocr_edit_distance_reward": 0.9858518838882446, + "step": 3328, + "temperature": 0.9 + }, + { + "advantages": -6.406648026313633e-05, + "completion_length": 1260.0, + "delta_ref_entropy_loss": 0.031494140625, + "delta_ref_ppl": -0.05517578125, + "entropy_loss": -0.0419921875, + "epoch": 0.6658, + "grad_norm": 1.5473888864333447, + "k1_kl": 0.05517578125, + "k3_kl": 0.0380859375, + "kimi_kl": 0.189453125, + "learning_rate": 1.6709999999999998e-07, + "loss": 0.0016, + "ppl": 0.016845703125, + "reward": 0.9941332340240479, + "reward_std": 0.0006972816772758961, + "rewards/perpo_ocr_edit_distance_reward": 0.9941332340240479, + "step": 3329, + "temperature": 0.9 + }, + { + "advantages": -1.5220472050714307e-05, + "completion_length": 86.0, + "delta_ref_entropy_loss": 0.08740234375, + "delta_ref_ppl": -0.361328125, + "entropy_loss": -0.146484375, + "epoch": 0.666, + "grad_norm": 3.292765619434748, + "k1_kl": 0.361328125, + "k3_kl": 0.29296875, + "kimi_kl": 1.2109375, + "learning_rate": 1.67e-07, + "loss": 0.0117, + "ppl": 0.07080078125, + "reward": 0.8913480639457703, + "reward_std": 0.006054066587239504, + "rewards/perpo_ocr_edit_distance_reward": 0.8913481831550598, + "step": 3330, + "temperature": 0.9 + }, + { + "advantages": -3.9662634662818164e-05, + "completion_length": 614.0, + "delta_ref_entropy_loss": 0.02197265625, + "delta_ref_ppl": -0.047607421875, + "entropy_loss": -0.033935546875, + "epoch": 0.6662, + "grad_norm": 0.39207860742937684, + "k1_kl": 0.047607421875, + "k3_kl": 0.03173828125, + "kimi_kl": 0.0927734375, + "learning_rate": 1.669e-07, + "loss": 0.0013, + "ppl": 0.012939453125, + "reward": 0.9979093074798584, + "reward_std": 0.0009735948406159878, + "rewards/perpo_ocr_edit_distance_reward": 0.9979093670845032, + "step": 3331, + "temperature": 0.9 + }, + { + "advantages": -5.41039917152375e-05, + "completion_length": 1381.0, + "delta_ref_entropy_loss": 0.050537109375, + "delta_ref_ppl": -0.07421875, + "entropy_loss": -0.2119140625, + "epoch": 0.6664, + "grad_norm": 1.616929245426968, + "k1_kl": 0.07373046875, + "k3_kl": 0.048095703125, + "kimi_kl": 0.10693359375, + "learning_rate": 1.6679999999999998e-07, + "loss": 0.002, + "ppl": 0.107421875, + "reward": 0.9201688170433044, + "reward_std": 0.001631156774237752, + "rewards/perpo_ocr_edit_distance_reward": 0.920168936252594, + "step": 3332, + "temperature": 0.9 + }, + { + "advantages": -7.033348538243445e-06, + "completion_length": 284.0, + "delta_ref_entropy_loss": 0.049072265625, + "delta_ref_ppl": -0.10546875, + "entropy_loss": -0.054443359375, + "epoch": 0.6666, + "grad_norm": 0.8498465686919683, + "k1_kl": 0.10498046875, + "k3_kl": 0.07421875, + "kimi_kl": 0.25390625, + "learning_rate": 1.6669999999999998e-07, + "loss": 0.003, + "ppl": 0.02197265625, + "reward": 0.9889663457870483, + "reward_std": 0.003538761753588915, + "rewards/perpo_ocr_edit_distance_reward": 0.9889662861824036, + "step": 3333, + "temperature": 0.9 + }, + { + "advantages": -1.1920929864572827e-05, + "completion_length": 749.0, + "delta_ref_entropy_loss": 0.0458984375, + "delta_ref_ppl": -0.05712890625, + "entropy_loss": -0.05322265625, + "epoch": 0.6668, + "grad_norm": 0.35394763012675406, + "k1_kl": 0.056884765625, + "k3_kl": 0.02880859375, + "kimi_kl": 0.050048828125, + "learning_rate": 1.666e-07, + "loss": 0.0012, + "ppl": 0.01544189453125, + "reward": 0.9950119256973267, + "reward_std": 0.000612603616900742, + "rewards/perpo_ocr_edit_distance_reward": 0.9950119853019714, + "step": 3334, + "temperature": 0.9 + }, + { + "advantages": -6.709780336677795e-06, + "completion_length": 877.0, + "delta_ref_entropy_loss": 0.00750732421875, + "delta_ref_ppl": -0.039794921875, + "entropy_loss": -0.042724609375, + "epoch": 0.667, + "grad_norm": 0.4467614667613096, + "k1_kl": 0.039794921875, + "k3_kl": 0.030517578125, + "kimi_kl": 0.09521484375, + "learning_rate": 1.665e-07, + "loss": 0.0012, + "ppl": 0.014892578125, + "reward": 0.9884400367736816, + "reward_std": 0.006222826894372702, + "rewards/perpo_ocr_edit_distance_reward": 0.9884401559829712, + "step": 3335, + "temperature": 0.9 + }, + { + "advantages": -0.00010655607911758125, + "completion_length": 1086.0, + "delta_ref_entropy_loss": 0.048095703125, + "delta_ref_ppl": -0.0556640625, + "entropy_loss": -0.03857421875, + "epoch": 0.6672, + "grad_norm": 1.1051677052644062, + "k1_kl": 0.0556640625, + "k3_kl": 0.0281982421875, + "kimi_kl": 0.06640625, + "learning_rate": 1.6639999999999998e-07, + "loss": 0.0012, + "ppl": 0.01251220703125, + "reward": 0.970346212387085, + "reward_std": 0.0002996172697748989, + "rewards/perpo_ocr_edit_distance_reward": 0.9703463315963745, + "step": 3336, + "temperature": 0.9 + }, + { + "advantages": -3.041540003323462e-05, + "completion_length": 580.0, + "delta_ref_entropy_loss": 0.0089111328125, + "delta_ref_ppl": -0.1220703125, + "entropy_loss": -0.361328125, + "epoch": 0.6674, + "grad_norm": 1.732703937763607, + "k1_kl": 0.12158203125, + "k3_kl": 0.08935546875, + "kimi_kl": 0.294921875, + "learning_rate": 1.663e-07, + "loss": 0.0036, + "ppl": 0.1845703125, + "reward": 0.8911862969398499, + "reward_std": 0.0018602462951093912, + "rewards/perpo_ocr_edit_distance_reward": 0.8911863565444946, + "step": 3337, + "temperature": 0.9 + }, + { + "advantages": -3.639289570855908e-05, + "completion_length": 1567.0, + "delta_ref_entropy_loss": 0.0025177001953125, + "delta_ref_ppl": -0.028076171875, + "entropy_loss": -0.1669921875, + "epoch": 0.6676, + "grad_norm": 8.34367725200598, + "k1_kl": 0.0279541015625, + "k3_kl": 0.03271484375, + "kimi_kl": 0.047607421875, + "learning_rate": 1.6619999999999997e-07, + "loss": 0.0013, + "ppl": 0.09912109375, + "reward": 0.9793523550033569, + "reward_std": 0.002239976543933153, + "rewards/perpo_ocr_edit_distance_reward": 0.9793524742126465, + "step": 3338, + "temperature": 0.9 + }, + { + "advantages": 1.163993601949187e-05, + "completion_length": 545.0, + "delta_ref_entropy_loss": 0.0189208984375, + "delta_ref_ppl": -0.041259765625, + "entropy_loss": -0.034912109375, + "epoch": 0.6678, + "grad_norm": 0.35451837445163653, + "k1_kl": 0.041259765625, + "k3_kl": 0.0286865234375, + "kimi_kl": 0.083984375, + "learning_rate": 1.6609999999999999e-07, + "loss": 0.0011, + "ppl": 0.01336669921875, + "reward": 0.99692702293396, + "reward_std": 0.0006316489307209849, + "rewards/perpo_ocr_edit_distance_reward": 0.9969270825386047, + "step": 3339, + "temperature": 0.9 + }, + { + "advantages": -7.82012939453125e-05, + "completion_length": 985.0, + "delta_ref_entropy_loss": 0.0126953125, + "delta_ref_ppl": -0.029052734375, + "entropy_loss": -0.0341796875, + "epoch": 0.668, + "grad_norm": 0.39014090434144355, + "k1_kl": 0.0291748046875, + "k3_kl": 0.021240234375, + "kimi_kl": 0.06494140625, + "learning_rate": 1.66e-07, + "loss": 0.0009, + "ppl": 0.0146484375, + "reward": 0.9900346398353577, + "reward_std": 0.0007710309000685811, + "rewards/perpo_ocr_edit_distance_reward": 0.9900346994400024, + "step": 3340, + "temperature": 0.9 + }, + { + "advantages": -8.634158803033642e-06, + "completion_length": 533.0, + "delta_ref_entropy_loss": 0.039306640625, + "delta_ref_ppl": -0.11376953125, + "entropy_loss": -0.390625, + "epoch": 0.6682, + "grad_norm": 2.2876870129944016, + "k1_kl": 0.11376953125, + "k3_kl": 0.08154296875, + "kimi_kl": 0.234375, + "learning_rate": 1.659e-07, + "loss": 0.0033, + "ppl": 0.1689453125, + "reward": 0.8590614795684814, + "reward_std": 0.007783735170960426, + "rewards/perpo_ocr_edit_distance_reward": 0.8590615391731262, + "step": 3341, + "temperature": 0.9 + }, + { + "advantages": -4.3749812903115526e-05, + "completion_length": 320.0, + "delta_ref_entropy_loss": 0.046630859375, + "delta_ref_ppl": -0.123046875, + "entropy_loss": -0.07470703125, + "epoch": 0.6684, + "grad_norm": 0.6312317063290078, + "k1_kl": 0.123046875, + "k3_kl": 0.09033203125, + "kimi_kl": 0.35546875, + "learning_rate": 1.658e-07, + "loss": 0.0037, + "ppl": 0.0272216796875, + "reward": 0.9701383709907532, + "reward_std": 0.001261503784917295, + "rewards/perpo_ocr_edit_distance_reward": 0.970138430595398, + "step": 3342, + "temperature": 0.9 + }, + { + "advantages": -7.322856845348724e-07, + "completion_length": 511.0, + "delta_ref_entropy_loss": -0.046875, + "delta_ref_ppl": -0.06787109375, + "entropy_loss": -0.291015625, + "epoch": 0.6686, + "grad_norm": 4.835866165969751, + "k1_kl": 0.0673828125, + "k3_kl": 0.08056640625, + "kimi_kl": 0.1708984375, + "learning_rate": 1.6569999999999998e-07, + "loss": 0.0032, + "ppl": 0.1533203125, + "reward": 0.9601204991340637, + "reward_std": 0.05805868282914162, + "rewards/perpo_ocr_edit_distance_reward": 0.9601204991340637, + "step": 3343, + "temperature": 0.9 + }, + { + "advantages": -3.6597251892089844e-05, + "completion_length": 624.0, + "delta_ref_entropy_loss": 0.06787109375, + "delta_ref_ppl": -0.07666015625, + "entropy_loss": -0.11083984375, + "epoch": 0.6688, + "grad_norm": 1.3891208666951784, + "k1_kl": 0.07666015625, + "k3_kl": 0.04052734375, + "kimi_kl": 0.0888671875, + "learning_rate": 1.656e-07, + "loss": 0.0017, + "ppl": 0.04931640625, + "reward": 0.6519041657447815, + "reward_std": 0.0012970245443284512, + "rewards/perpo_ocr_edit_distance_reward": 0.651904284954071, + "step": 3344, + "temperature": 0.9 + }, + { + "advantages": -3.3378603347955504e-06, + "completion_length": 689.0, + "delta_ref_entropy_loss": 0.0400390625, + "delta_ref_ppl": -0.072265625, + "entropy_loss": -0.07080078125, + "epoch": 0.669, + "grad_norm": 0.9462375690684617, + "k1_kl": 0.072265625, + "k3_kl": 0.0498046875, + "kimi_kl": 0.1357421875, + "learning_rate": 1.655e-07, + "loss": 0.002, + "ppl": 0.0240478515625, + "reward": 0.9289167523384094, + "reward_std": 0.007568973116576672, + "rewards/perpo_ocr_edit_distance_reward": 0.9289167523384094, + "step": 3345, + "temperature": 0.9 + }, + { + "advantages": -1.7029899268550253e-08, + "completion_length": 469.0, + "delta_ref_entropy_loss": -0.0206298828125, + "delta_ref_ppl": -0.0458984375, + "entropy_loss": -0.453125, + "epoch": 0.6692, + "grad_norm": 4.6925204744845965, + "k1_kl": 0.046142578125, + "k3_kl": 0.03955078125, + "kimi_kl": 0.10302734375, + "learning_rate": 1.6539999999999999e-07, + "loss": 0.0016, + "ppl": 0.3125, + "reward": 0.8405764698982239, + "reward_std": 0.3224744200706482, + "rewards/perpo_ocr_edit_distance_reward": 0.8405764698982239, + "step": 3346, + "temperature": 0.9 + }, + { + "advantages": -8.685248644724197e-07, + "completion_length": 644.0, + "delta_ref_entropy_loss": -0.04931640625, + "delta_ref_ppl": -0.11279296875, + "entropy_loss": -0.6484375, + "epoch": 0.6694, + "grad_norm": 2.9771655174906666, + "k1_kl": 0.11279296875, + "k3_kl": 0.09619140625, + "kimi_kl": 0.2265625, + "learning_rate": 1.653e-07, + "loss": 0.0038, + "ppl": 0.349609375, + "reward": 0.5777047276496887, + "reward_std": 0.009669942781329155, + "rewards/perpo_ocr_edit_distance_reward": 0.5777047276496887, + "step": 3347, + "temperature": 0.9 + }, + { + "advantages": -3.595863381633535e-05, + "completion_length": 576.0, + "delta_ref_entropy_loss": 0.0230712890625, + "delta_ref_ppl": -0.057861328125, + "entropy_loss": -0.035888671875, + "epoch": 0.6696, + "grad_norm": 0.8680231227881103, + "k1_kl": 0.057861328125, + "k3_kl": 0.039306640625, + "kimi_kl": 0.1318359375, + "learning_rate": 1.652e-07, + "loss": 0.0016, + "ppl": 0.0113525390625, + "reward": 0.9920861124992371, + "reward_std": 0.0006105475476942956, + "rewards/perpo_ocr_edit_distance_reward": 0.9920861721038818, + "step": 3348, + "temperature": 0.9 + }, + { + "advantages": -5.803789827041328e-05, + "completion_length": 625.0, + "delta_ref_entropy_loss": 0.0322265625, + "delta_ref_ppl": -0.049072265625, + "entropy_loss": -0.03564453125, + "epoch": 0.6698, + "grad_norm": 0.31515738088499173, + "k1_kl": 0.049072265625, + "k3_kl": 0.03369140625, + "kimi_kl": 0.119140625, + "learning_rate": 1.651e-07, + "loss": 0.0014, + "ppl": 0.01239013671875, + "reward": 0.9779879450798035, + "reward_std": 0.00048687384696677327, + "rewards/perpo_ocr_edit_distance_reward": 0.9779880046844482, + "step": 3349, + "temperature": 0.9 + }, + { + "advantages": -3.445148468017578e-05, + "completion_length": 482.0, + "delta_ref_entropy_loss": 0.03662109375, + "delta_ref_ppl": -0.05517578125, + "entropy_loss": -0.0380859375, + "epoch": 0.67, + "grad_norm": 0.44420428126042194, + "k1_kl": 0.05517578125, + "k3_kl": 0.03369140625, + "kimi_kl": 0.0810546875, + "learning_rate": 1.65e-07, + "loss": 0.0014, + "ppl": 0.01483154296875, + "reward": 0.9990603923797607, + "reward_std": 0.00039425634895451367, + "rewards/perpo_ocr_edit_distance_reward": 0.9990604519844055, + "step": 3350, + "temperature": 0.9 + }, + { + "advantages": -4.829679528484121e-05, + "completion_length": 1074.0, + "delta_ref_entropy_loss": 0.0279541015625, + "delta_ref_ppl": -0.04345703125, + "entropy_loss": -0.06201171875, + "epoch": 0.6702, + "grad_norm": 1.7105263286660097, + "k1_kl": 0.04345703125, + "k3_kl": 0.0283203125, + "kimi_kl": 0.072265625, + "learning_rate": 1.6489999999999997e-07, + "loss": 0.0012, + "ppl": 0.0283203125, + "reward": 0.9884918928146362, + "reward_std": 0.0011339696357026696, + "rewards/perpo_ocr_edit_distance_reward": 0.9884920120239258, + "step": 3351, + "temperature": 0.9 + }, + { + "advantages": -1.6178404393940582e-07, + "completion_length": 10.0, + "delta_ref_entropy_loss": -0.99609375, + "delta_ref_ppl": -3.546875, + "entropy_loss": -1.8515625, + "epoch": 0.6704, + "grad_norm": 26.39486955933952, + "k1_kl": 3.546875, + "k3_kl": 3.15625, + "kimi_kl": 13.9375, + "learning_rate": 1.648e-07, + "loss": 0.1263, + "ppl": 0.73046875, + "reward": 0.12868046760559082, + "reward_std": 0.01423247717320919, + "rewards/perpo_ocr_edit_distance_reward": 0.12868048250675201, + "step": 3352, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 63.0, + "delta_ref_entropy_loss": 0.005584716796875, + "delta_ref_ppl": -0.46484375, + "entropy_loss": -0.296875, + "epoch": 0.6706, + "grad_norm": 3.166462523905822, + "k1_kl": 0.46484375, + "k3_kl": 0.357421875, + "kimi_kl": 1.1484375, + "learning_rate": 1.647e-07, + "loss": 0.0143, + "ppl": 0.111328125, + "reward": 0.7800830006599426, + "reward_std": 0.006775907706469297, + "rewards/perpo_ocr_edit_distance_reward": 0.7800830006599426, + "step": 3353, + "temperature": 0.9 + }, + { + "advantages": -2.4906228190957336e-06, + "completion_length": 48.0, + "delta_ref_entropy_loss": 0.0220947265625, + "delta_ref_ppl": -0.443359375, + "entropy_loss": -0.140625, + "epoch": 0.6708, + "grad_norm": 5.697403109678509, + "k1_kl": 0.44140625, + "k3_kl": 0.375, + "kimi_kl": 1.7421875, + "learning_rate": 1.6459999999999998e-07, + "loss": 0.015, + "ppl": 0.07080078125, + "reward": 0.9511754512786865, + "reward_std": 0.013532213866710663, + "rewards/perpo_ocr_edit_distance_reward": 0.9511754512786865, + "step": 3354, + "temperature": 0.9 + }, + { + "advantages": 2.975549068651162e-05, + "completion_length": 754.0, + "delta_ref_entropy_loss": 0.026123046875, + "delta_ref_ppl": -0.04638671875, + "entropy_loss": -0.044921875, + "epoch": 0.671, + "grad_norm": 0.4390344578082426, + "k1_kl": 0.04638671875, + "k3_kl": 0.029541015625, + "kimi_kl": 0.07568359375, + "learning_rate": 1.645e-07, + "loss": 0.0011, + "ppl": 0.016357421875, + "reward": 0.9947183132171631, + "reward_std": 0.0004721368313767016, + "rewards/perpo_ocr_edit_distance_reward": 0.9947183728218079, + "step": 3355, + "temperature": 0.9 + }, + { + "advantages": -1.268727487513388e-06, + "completion_length": 2048.0, + "delta_ref_entropy_loss": -0.0303955078125, + "delta_ref_ppl": -0.04541015625, + "entropy_loss": -0.40234375, + "epoch": 0.6712, + "grad_norm": 12.59334023335218, + "k1_kl": 0.04541015625, + "k3_kl": 0.0771484375, + "kimi_kl": 0.10009765625, + "learning_rate": 1.644e-07, + "loss": 0.0031, + "ppl": 0.255859375, + "reward": 0.5325482487678528, + "reward_std": 0.026820240542292595, + "rewards/perpo_ocr_edit_distance_reward": 0.5325483083724976, + "step": 3356, + "temperature": 0.9 + }, + { + "advantages": -1.0388238479208667e-05, + "completion_length": 412.0, + "delta_ref_entropy_loss": 0.0634765625, + "delta_ref_ppl": -0.08642578125, + "entropy_loss": -0.1494140625, + "epoch": 0.6714, + "grad_norm": 1.1740391567783899, + "k1_kl": 0.0869140625, + "k3_kl": 0.054931640625, + "kimi_kl": 0.1474609375, + "learning_rate": 1.6429999999999998e-07, + "loss": 0.0022, + "ppl": 0.064453125, + "reward": 0.9828222990036011, + "reward_std": 0.008060228079557419, + "rewards/perpo_ocr_edit_distance_reward": 0.9828224778175354, + "step": 3357, + "temperature": 0.9 + }, + { + "advantages": -5.415507985162549e-06, + "completion_length": 670.0, + "delta_ref_entropy_loss": 0.03759765625, + "delta_ref_ppl": -0.076171875, + "entropy_loss": -0.17578125, + "epoch": 0.6716, + "grad_norm": 1.152816662238795, + "k1_kl": 0.076171875, + "k3_kl": 0.04833984375, + "kimi_kl": 0.11083984375, + "learning_rate": 1.642e-07, + "loss": 0.0019, + "ppl": 0.072265625, + "reward": 0.2798471450805664, + "reward_std": 0.00460538174957037, + "rewards/perpo_ocr_edit_distance_reward": 0.2798472046852112, + "step": 3358, + "temperature": 0.9 + }, + { + "advantages": -2.2309168343781494e-06, + "completion_length": 1183.0, + "delta_ref_entropy_loss": 0.0196533203125, + "delta_ref_ppl": -0.042724609375, + "entropy_loss": -0.06982421875, + "epoch": 0.6718, + "grad_norm": 1.2520993509432823, + "k1_kl": 0.042724609375, + "k3_kl": 0.0294189453125, + "kimi_kl": 0.0615234375, + "learning_rate": 1.641e-07, + "loss": 0.0012, + "ppl": 0.0296630859375, + "reward": 0.9171380400657654, + "reward_std": 0.03797828406095505, + "rewards/perpo_ocr_edit_distance_reward": 0.9171380996704102, + "step": 3359, + "temperature": 0.9 + }, + { + "advantages": 1.3794218602924957e-06, + "completion_length": 743.0, + "delta_ref_entropy_loss": 0.0380859375, + "delta_ref_ppl": -0.08056640625, + "entropy_loss": -0.06640625, + "epoch": 0.672, + "grad_norm": 1.8676039432406606, + "k1_kl": 0.08056640625, + "k3_kl": 0.052490234375, + "kimi_kl": 0.162109375, + "learning_rate": 1.64e-07, + "loss": 0.0021, + "ppl": 0.0306396484375, + "reward": 0.9823710918426514, + "reward_std": 0.018327292054891586, + "rewards/perpo_ocr_edit_distance_reward": 0.9823710918426514, + "step": 3360, + "temperature": 0.9 + }, + { + "advantages": -0.00016381060413550586, + "completion_length": 984.0, + "delta_ref_entropy_loss": 0.0296630859375, + "delta_ref_ppl": -0.041015625, + "entropy_loss": -0.03369140625, + "epoch": 0.6722, + "grad_norm": 0.39760726781581024, + "k1_kl": 0.041015625, + "k3_kl": 0.02783203125, + "kimi_kl": 0.08642578125, + "learning_rate": 1.6389999999999998e-07, + "loss": 0.0013, + "ppl": 0.01544189453125, + "reward": 0.9962450861930847, + "reward_std": 0.0004198041860945523, + "rewards/perpo_ocr_edit_distance_reward": 0.9962451457977295, + "step": 3361, + "temperature": 0.9 + }, + { + "advantages": -9.70704263636435e-07, + "completion_length": 756.0, + "delta_ref_entropy_loss": 0.007659912109375, + "delta_ref_ppl": -0.056884765625, + "entropy_loss": -0.2158203125, + "epoch": 0.6724, + "grad_norm": 11.203597272661222, + "k1_kl": 0.05712890625, + "k3_kl": 0.048583984375, + "kimi_kl": 0.09423828125, + "learning_rate": 1.638e-07, + "loss": 0.0019, + "ppl": 0.11279296875, + "reward": 0.9483262300491333, + "reward_std": 0.07013890892267227, + "rewards/perpo_ocr_edit_distance_reward": 0.9483262896537781, + "step": 3362, + "temperature": 0.9 + }, + { + "advantages": -0.0001005104641080834, + "completion_length": 639.0, + "delta_ref_entropy_loss": 0.0390625, + "delta_ref_ppl": -0.046875, + "entropy_loss": -0.04541015625, + "epoch": 0.6726, + "grad_norm": 0.3318561204062793, + "k1_kl": 0.046630859375, + "k3_kl": 0.03125, + "kimi_kl": 0.076171875, + "learning_rate": 1.6370000000000002e-07, + "loss": 0.0014, + "ppl": 0.01373291015625, + "reward": 0.9776734113693237, + "reward_std": 0.0005776845500804484, + "rewards/perpo_ocr_edit_distance_reward": 0.9776735305786133, + "step": 3363, + "temperature": 0.9 + }, + { + "advantages": -1.97121080418583e-05, + "completion_length": 106.0, + "delta_ref_entropy_loss": 0.04443359375, + "delta_ref_ppl": -0.173828125, + "entropy_loss": -0.0791015625, + "epoch": 0.6728, + "grad_norm": 2.2574476671113293, + "k1_kl": 0.173828125, + "k3_kl": 0.1572265625, + "kimi_kl": 0.43359375, + "learning_rate": 1.6359999999999998e-07, + "loss": 0.0063, + "ppl": 0.0439453125, + "reward": 0.9793997406959534, + "reward_std": 0.0020606559701263905, + "rewards/perpo_ocr_edit_distance_reward": 0.9793998599052429, + "step": 3364, + "temperature": 0.9 + }, + { + "advantages": -1.5667507113903412e-06, + "completion_length": 2048.0, + "delta_ref_entropy_loss": -0.0201416015625, + "delta_ref_ppl": -0.01324462890625, + "entropy_loss": -0.1123046875, + "epoch": 0.673, + "grad_norm": 1.7412147520023882, + "k1_kl": 0.01312255859375, + "k3_kl": 0.0128173828125, + "kimi_kl": 0.0230712890625, + "learning_rate": 1.635e-07, + "loss": 0.0005, + "ppl": 0.0576171875, + "reward": 0.7286651134490967, + "reward_std": 0.02657831832766533, + "rewards/perpo_ocr_edit_distance_reward": 0.7286651730537415, + "step": 3365, + "temperature": 0.9 + }, + { + "advantages": -4.286425610189326e-05, + "completion_length": 523.0, + "delta_ref_entropy_loss": 0.0299072265625, + "delta_ref_ppl": -0.07861328125, + "entropy_loss": -0.05517578125, + "epoch": 0.6732, + "grad_norm": 0.46482691738375903, + "k1_kl": 0.07861328125, + "k3_kl": 0.0556640625, + "kimi_kl": 0.1650390625, + "learning_rate": 1.634e-07, + "loss": 0.0023, + "ppl": 0.01904296875, + "reward": 0.9953992366790771, + "reward_std": 0.0006948186201043427, + "rewards/perpo_ocr_edit_distance_reward": 0.9953992366790771, + "step": 3366, + "temperature": 0.9 + }, + { + "advantages": -2.3143633370636962e-05, + "completion_length": 930.0, + "delta_ref_entropy_loss": 0.008056640625, + "delta_ref_ppl": -0.0294189453125, + "entropy_loss": -0.03662109375, + "epoch": 0.6734, + "grad_norm": 0.5475074478515375, + "k1_kl": 0.029296875, + "k3_kl": 0.021484375, + "kimi_kl": 0.0634765625, + "learning_rate": 1.6329999999999999e-07, + "loss": 0.0009, + "ppl": 0.0167236328125, + "reward": 0.9883061647415161, + "reward_std": 0.0028435394633561373, + "rewards/perpo_ocr_edit_distance_reward": 0.9883061647415161, + "step": 3367, + "temperature": 0.9 + }, + { + "advantages": -6.352152558974922e-05, + "completion_length": 753.0, + "delta_ref_entropy_loss": 0.08251953125, + "delta_ref_ppl": -0.07373046875, + "entropy_loss": -0.07666015625, + "epoch": 0.6736, + "grad_norm": 3.0121343062279387, + "k1_kl": 0.07373046875, + "k3_kl": 0.056640625, + "kimi_kl": 0.09619140625, + "learning_rate": 1.632e-07, + "loss": 0.0023, + "ppl": 0.037109375, + "reward": 0.981688916683197, + "reward_std": 0.000837827508803457, + "rewards/perpo_ocr_edit_distance_reward": 0.9816889762878418, + "step": 3368, + "temperature": 0.9 + }, + { + "advantages": -9.451594451093115e-06, + "completion_length": 128.0, + "delta_ref_entropy_loss": 0.029052734375, + "delta_ref_ppl": -0.263671875, + "entropy_loss": -0.0947265625, + "epoch": 0.6738, + "grad_norm": 1.2379448017739645, + "k1_kl": 0.263671875, + "k3_kl": 0.240234375, + "kimi_kl": 0.95703125, + "learning_rate": 1.631e-07, + "loss": 0.0096, + "ppl": 0.042724609375, + "reward": 0.9906725287437439, + "reward_std": 0.0025977103505283594, + "rewards/perpo_ocr_edit_distance_reward": 0.9906725883483887, + "step": 3369, + "temperature": 0.9 + }, + { + "advantages": -9.461811714572832e-05, + "completion_length": 505.0, + "delta_ref_entropy_loss": 0.03369140625, + "delta_ref_ppl": -0.076171875, + "entropy_loss": -0.039306640625, + "epoch": 0.674, + "grad_norm": 0.3788187573410626, + "k1_kl": 0.076171875, + "k3_kl": 0.05419921875, + "kimi_kl": 0.2216796875, + "learning_rate": 1.63e-07, + "loss": 0.0023, + "ppl": 0.0162353515625, + "reward": 0.9636492133140564, + "reward_std": 0.0006199887720867991, + "rewards/perpo_ocr_edit_distance_reward": 0.963649332523346, + "step": 3370, + "temperature": 0.9 + }, + { + "advantages": -4.204682045383379e-05, + "completion_length": 198.0, + "delta_ref_entropy_loss": 0.055908203125, + "delta_ref_ppl": -0.224609375, + "entropy_loss": -0.08447265625, + "epoch": 0.6742, + "grad_norm": 0.94289914226097, + "k1_kl": 0.224609375, + "k3_kl": 0.173828125, + "kimi_kl": 0.6171875, + "learning_rate": 1.6289999999999998e-07, + "loss": 0.007, + "ppl": 0.036376953125, + "reward": 0.9931984543800354, + "reward_std": 0.0009131113765761256, + "rewards/perpo_ocr_edit_distance_reward": 0.993198573589325, + "step": 3371, + "temperature": 0.9 + }, + { + "advantages": -5.5040633014868945e-05, + "completion_length": 680.0, + "delta_ref_entropy_loss": 0.026123046875, + "delta_ref_ppl": -0.06787109375, + "entropy_loss": -0.1474609375, + "epoch": 0.6744, + "grad_norm": 1.0051378343394213, + "k1_kl": 0.0673828125, + "k3_kl": 0.04443359375, + "kimi_kl": 0.09033203125, + "learning_rate": 1.628e-07, + "loss": 0.0018, + "ppl": 0.06298828125, + "reward": 0.8470612168312073, + "reward_std": 0.001292258151806891, + "rewards/perpo_ocr_edit_distance_reward": 0.8470613360404968, + "step": 3372, + "temperature": 0.9 + }, + { + "advantages": 1.1920928955078125e-07, + "completion_length": 319.0, + "delta_ref_entropy_loss": -1.25, + "delta_ref_ppl": -0.208984375, + "entropy_loss": -3.140625, + "epoch": 0.6746, + "grad_norm": 6.061419113380516, + "k1_kl": 0.2099609375, + "k3_kl": 0.431640625, + "kimi_kl": 1.03125, + "learning_rate": 1.627e-07, + "loss": 0.0172, + "ppl": 1.6796875, + "reward": 0.2597465515136719, + "reward_std": 0.06636738777160645, + "rewards/perpo_ocr_edit_distance_reward": 0.2597465515136719, + "step": 3373, + "temperature": 0.9 + }, + { + "advantages": -4.087175966560608e-07, + "completion_length": 2048.0, + "delta_ref_entropy_loss": -0.7421875, + "delta_ref_ppl": -0.1455078125, + "entropy_loss": -1.4296875, + "epoch": 0.6748, + "grad_norm": 8.149153023631548, + "k1_kl": 0.1455078125, + "k3_kl": 0.2353515625, + "kimi_kl": 0.54296875, + "learning_rate": 1.626e-07, + "loss": 0.0094, + "ppl": 0.625, + "reward": 0.1041768491268158, + "reward_std": 0.01585511490702629, + "rewards/perpo_ocr_edit_distance_reward": 0.1041768491268158, + "step": 3374, + "temperature": 0.9 + }, + { + "advantages": -1.380273351969663e-05, + "completion_length": 1508.0, + "delta_ref_entropy_loss": 0.07373046875, + "delta_ref_ppl": -0.091796875, + "entropy_loss": -0.2470703125, + "epoch": 0.675, + "grad_norm": 1.9919006607658067, + "k1_kl": 0.091796875, + "k3_kl": 0.0595703125, + "kimi_kl": 0.1396484375, + "learning_rate": 1.625e-07, + "loss": 0.0024, + "ppl": 0.125, + "reward": 0.879555344581604, + "reward_std": 0.005456628277897835, + "rewards/perpo_ocr_edit_distance_reward": 0.8795554637908936, + "step": 3375, + "temperature": 0.9 + }, + { + "advantages": 1.4015607121109497e-05, + "completion_length": 406.0, + "delta_ref_entropy_loss": 0.0262451171875, + "delta_ref_ppl": -0.0654296875, + "entropy_loss": -0.07080078125, + "epoch": 0.6752, + "grad_norm": 0.7454703172260962, + "k1_kl": 0.06591796875, + "k3_kl": 0.04296875, + "kimi_kl": 0.11279296875, + "learning_rate": 1.6239999999999997e-07, + "loss": 0.0017, + "ppl": 0.0308837890625, + "reward": 0.980941116809845, + "reward_std": 0.0011156080290675163, + "rewards/perpo_ocr_edit_distance_reward": 0.980941116809845, + "step": 3376, + "temperature": 0.9 + }, + { + "advantages": -7.418224413413554e-05, + "completion_length": 888.0, + "delta_ref_entropy_loss": 0.0213623046875, + "delta_ref_ppl": -0.039794921875, + "entropy_loss": -0.036376953125, + "epoch": 0.6754, + "grad_norm": 0.4383732142900898, + "k1_kl": 0.03955078125, + "k3_kl": 0.026123046875, + "kimi_kl": 0.072265625, + "learning_rate": 1.623e-07, + "loss": 0.0011, + "ppl": 0.01397705078125, + "reward": 0.9983699321746826, + "reward_std": 0.00035911196027882397, + "rewards/perpo_ocr_edit_distance_reward": 0.9983699917793274, + "step": 3377, + "temperature": 0.9 + }, + { + "advantages": -3.520080281305127e-05, + "completion_length": 517.0, + "delta_ref_entropy_loss": 0.030029296875, + "delta_ref_ppl": -0.042236328125, + "entropy_loss": -0.035888671875, + "epoch": 0.6756, + "grad_norm": 0.3214938895635475, + "k1_kl": 0.042236328125, + "k3_kl": 0.0230712890625, + "kimi_kl": 0.052978515625, + "learning_rate": 1.622e-07, + "loss": 0.001, + "ppl": 0.01080322265625, + "reward": 0.9982151389122009, + "reward_std": 0.001836446113884449, + "rewards/perpo_ocr_edit_distance_reward": 0.9982151985168457, + "step": 3378, + "temperature": 0.9 + }, + { + "advantages": -1.614434404473286e-05, + "completion_length": 263.0, + "delta_ref_entropy_loss": 0.059814453125, + "delta_ref_ppl": -0.12890625, + "entropy_loss": -0.0693359375, + "epoch": 0.6758, + "grad_norm": 0.9428427929431173, + "k1_kl": 0.12890625, + "k3_kl": 0.103515625, + "kimi_kl": 0.333984375, + "learning_rate": 1.6209999999999998e-07, + "loss": 0.0042, + "ppl": 0.031982421875, + "reward": 0.9914475679397583, + "reward_std": 0.00253734621219337, + "rewards/perpo_ocr_edit_distance_reward": 0.9914476275444031, + "step": 3379, + "temperature": 0.9 + }, + { + "advantages": -2.384185791015625e-06, + "completion_length": 776.0, + "delta_ref_entropy_loss": -0.0137939453125, + "delta_ref_ppl": -0.05517578125, + "entropy_loss": -0.1376953125, + "epoch": 0.676, + "grad_norm": 1.2071174018313424, + "k1_kl": 0.05517578125, + "k3_kl": 0.041015625, + "kimi_kl": 0.1123046875, + "learning_rate": 1.62e-07, + "loss": 0.0016, + "ppl": 0.049072265625, + "reward": 0.9865829944610596, + "reward_std": 0.010627727024257183, + "rewards/perpo_ocr_edit_distance_reward": 0.9865830540657043, + "step": 3380, + "temperature": 0.9 + }, + { + "advantages": 6.318092800938757e-06, + "completion_length": 816.0, + "delta_ref_entropy_loss": 0.0576171875, + "delta_ref_ppl": -0.064453125, + "entropy_loss": -0.1025390625, + "epoch": 0.6762, + "grad_norm": 0.957020827209389, + "k1_kl": 0.064453125, + "k3_kl": 0.03466796875, + "kimi_kl": 0.0810546875, + "learning_rate": 1.619e-07, + "loss": 0.0014, + "ppl": 0.045654296875, + "reward": 0.9885697960853577, + "reward_std": 0.0012457042466849089, + "rewards/perpo_ocr_edit_distance_reward": 0.9885698556900024, + "step": 3381, + "temperature": 0.9 + }, + { + "advantages": -1.1920928955078125e-06, + "completion_length": 1423.0, + "delta_ref_entropy_loss": -0.01470947265625, + "delta_ref_ppl": -0.0478515625, + "entropy_loss": -0.16796875, + "epoch": 0.6764, + "grad_norm": 1.380884409196743, + "k1_kl": 0.048095703125, + "k3_kl": 0.03662109375, + "kimi_kl": 0.0791015625, + "learning_rate": 1.6179999999999998e-07, + "loss": 0.0015, + "ppl": 0.07373046875, + "reward": 0.940021276473999, + "reward_std": 0.02840997464954853, + "rewards/perpo_ocr_edit_distance_reward": 0.9400213360786438, + "step": 3382, + "temperature": 0.9 + }, + { + "advantages": 9.664468052505981e-06, + "completion_length": 616.0, + "delta_ref_entropy_loss": 0.0301513671875, + "delta_ref_ppl": -0.06103515625, + "entropy_loss": -0.0380859375, + "epoch": 0.6766, + "grad_norm": 0.3805309585991989, + "k1_kl": 0.06103515625, + "k3_kl": 0.039306640625, + "kimi_kl": 0.123046875, + "learning_rate": 1.617e-07, + "loss": 0.0016, + "ppl": 0.0125732421875, + "reward": 0.9982274174690247, + "reward_std": 0.0007816511788405478, + "rewards/perpo_ocr_edit_distance_reward": 0.9982274174690247, + "step": 3383, + "temperature": 0.9 + }, + { + "advantages": -7.322856845348724e-07, + "completion_length": 586.0, + "delta_ref_entropy_loss": 0.072265625, + "delta_ref_ppl": -0.12890625, + "entropy_loss": -0.28125, + "epoch": 0.6768, + "grad_norm": 1.7791535676211452, + "k1_kl": 0.12890625, + "k3_kl": 0.087890625, + "kimi_kl": 0.1669921875, + "learning_rate": 1.616e-07, + "loss": 0.0035, + "ppl": 0.15234375, + "reward": 0.8152939081192017, + "reward_std": 0.01156909391283989, + "rewards/perpo_ocr_edit_distance_reward": 0.8152939081192017, + "step": 3384, + "temperature": 0.9 + }, + { + "advantages": -0.00022708519827574492, + "completion_length": 513.0, + "delta_ref_entropy_loss": 0.033447265625, + "delta_ref_ppl": -0.068359375, + "entropy_loss": -0.068359375, + "epoch": 0.677, + "grad_norm": 0.31112431586146, + "k1_kl": 0.068359375, + "k3_kl": 0.0400390625, + "kimi_kl": 0.10791015625, + "learning_rate": 1.615e-07, + "loss": 0.0018, + "ppl": 0.0205078125, + "reward": 0.9847384095191956, + "reward_std": 0.00031247077276930213, + "rewards/perpo_ocr_edit_distance_reward": 0.9847385287284851, + "step": 3385, + "temperature": 0.9 + }, + { + "advantages": -7.957220077514648e-06, + "completion_length": 382.0, + "delta_ref_entropy_loss": 0.039306640625, + "delta_ref_ppl": -0.1181640625, + "entropy_loss": -0.0986328125, + "epoch": 0.6772, + "grad_norm": 1.1044785920946982, + "k1_kl": 0.1181640625, + "k3_kl": 0.08349609375, + "kimi_kl": 0.259765625, + "learning_rate": 1.6139999999999998e-07, + "loss": 0.0033, + "ppl": 0.046142578125, + "reward": 0.9595887660980225, + "reward_std": 0.0031106050591915846, + "rewards/perpo_ocr_edit_distance_reward": 0.9595888257026672, + "step": 3386, + "temperature": 0.9 + }, + { + "advantages": 2.5987626941059716e-05, + "completion_length": 480.0, + "delta_ref_entropy_loss": 0.0240478515625, + "delta_ref_ppl": -0.062255859375, + "entropy_loss": -0.0308837890625, + "epoch": 0.6774, + "grad_norm": 0.163606027865366, + "k1_kl": 0.062255859375, + "k3_kl": 0.046630859375, + "kimi_kl": 0.22265625, + "learning_rate": 1.613e-07, + "loss": 0.0018, + "ppl": 0.00823974609375, + "reward": 0.9989345073699951, + "reward_std": 0.00022757639817427844, + "rewards/perpo_ocr_edit_distance_reward": 0.9989345073699951, + "step": 3387, + "temperature": 0.9 + }, + { + "advantages": -4.870551038038684e-06, + "completion_length": 123.0, + "delta_ref_entropy_loss": 0.03369140625, + "delta_ref_ppl": -0.29296875, + "entropy_loss": -0.1435546875, + "epoch": 0.6776, + "grad_norm": 2.6379510832350523, + "k1_kl": 0.29296875, + "k3_kl": 0.2197265625, + "kimi_kl": 0.7109375, + "learning_rate": 1.6120000000000001e-07, + "loss": 0.0088, + "ppl": 0.0712890625, + "reward": 0.951935887336731, + "reward_std": 0.008598921820521355, + "rewards/perpo_ocr_edit_distance_reward": 0.9519359469413757, + "step": 3388, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 784.0, + "delta_ref_entropy_loss": 0.0184326171875, + "delta_ref_ppl": -0.032470703125, + "entropy_loss": -0.032958984375, + "epoch": 0.6778, + "grad_norm": 0.32309075125763176, + "k1_kl": 0.032470703125, + "k3_kl": 0.019287109375, + "kimi_kl": 0.054443359375, + "learning_rate": 1.6109999999999998e-07, + "loss": 0.0008, + "ppl": 0.01214599609375, + "reward": 0.9950516819953918, + "reward_std": 0.0003672938619274646, + "rewards/perpo_ocr_edit_distance_reward": 0.9950517416000366, + "step": 3389, + "temperature": 0.9 + }, + { + "advantages": -2.506801138224546e-05, + "completion_length": 175.0, + "delta_ref_entropy_loss": 0.024169921875, + "delta_ref_ppl": -0.2421875, + "entropy_loss": -0.07421875, + "epoch": 0.678, + "grad_norm": 1.4407152799757208, + "k1_kl": 0.2412109375, + "k3_kl": 0.1953125, + "kimi_kl": 0.93359375, + "learning_rate": 1.61e-07, + "loss": 0.0078, + "ppl": 0.0361328125, + "reward": 0.9881644248962402, + "reward_std": 0.003296217881143093, + "rewards/perpo_ocr_edit_distance_reward": 0.9881645441055298, + "step": 3390, + "temperature": 0.9 + }, + { + "advantages": -7.069110870361328e-05, + "completion_length": 521.0, + "delta_ref_entropy_loss": 0.040283203125, + "delta_ref_ppl": -0.0517578125, + "entropy_loss": -0.04345703125, + "epoch": 0.6782, + "grad_norm": 0.2971143842888635, + "k1_kl": 0.0517578125, + "k3_kl": 0.0301513671875, + "kimi_kl": 0.09033203125, + "learning_rate": 1.609e-07, + "loss": 0.0013, + "ppl": 0.013427734375, + "reward": 0.9933428168296814, + "reward_std": 0.0012250222498551011, + "rewards/perpo_ocr_edit_distance_reward": 0.9933428764343262, + "step": 3391, + "temperature": 0.9 + }, + { + "advantages": -4.1433744627283886e-05, + "completion_length": 793.0, + "delta_ref_entropy_loss": 0.0264892578125, + "delta_ref_ppl": -0.043212890625, + "entropy_loss": -0.050537109375, + "epoch": 0.6784, + "grad_norm": 0.39754785179501434, + "k1_kl": 0.04345703125, + "k3_kl": 0.0255126953125, + "kimi_kl": 0.05712890625, + "learning_rate": 1.6079999999999998e-07, + "loss": 0.0011, + "ppl": 0.0185546875, + "reward": 0.8204553127288818, + "reward_std": 0.0009277334320358932, + "rewards/perpo_ocr_edit_distance_reward": 0.8204553723335266, + "step": 3392, + "temperature": 0.9 + }, + { + "advantages": -0.00016675677034072578, + "completion_length": 611.0, + "delta_ref_entropy_loss": 0.0284423828125, + "delta_ref_ppl": -0.056396484375, + "entropy_loss": -0.037109375, + "epoch": 0.6786, + "grad_norm": 0.22535522374497316, + "k1_kl": 0.05615234375, + "k3_kl": 0.036376953125, + "kimi_kl": 0.109375, + "learning_rate": 1.607e-07, + "loss": 0.0016, + "ppl": 0.009765625, + "reward": 0.998081624507904, + "reward_std": 0.0002574015816207975, + "rewards/perpo_ocr_edit_distance_reward": 0.9980818033218384, + "step": 3393, + "temperature": 0.9 + }, + { + "advantages": -4.308564712118823e-06, + "completion_length": 445.0, + "delta_ref_entropy_loss": 0.035888671875, + "delta_ref_ppl": -0.08544921875, + "entropy_loss": -0.09375, + "epoch": 0.6788, + "grad_norm": 0.7367242011111506, + "k1_kl": 0.08544921875, + "k3_kl": 0.055908203125, + "kimi_kl": 0.189453125, + "learning_rate": 1.606e-07, + "loss": 0.0022, + "ppl": 0.03466796875, + "reward": 0.9916087985038757, + "reward_std": 0.0018906376790255308, + "rewards/perpo_ocr_edit_distance_reward": 0.9916087985038757, + "step": 3394, + "temperature": 0.9 + }, + { + "advantages": -6.731067696819082e-05, + "completion_length": 717.0, + "delta_ref_entropy_loss": 0.01068115234375, + "delta_ref_ppl": -0.0390625, + "entropy_loss": -0.048583984375, + "epoch": 0.679, + "grad_norm": 0.36353357715975115, + "k1_kl": 0.0390625, + "k3_kl": 0.028076171875, + "kimi_kl": 0.091796875, + "learning_rate": 1.605e-07, + "loss": 0.0012, + "ppl": 0.0198974609375, + "reward": 0.9926685690879822, + "reward_std": 0.0011648988584056497, + "rewards/perpo_ocr_edit_distance_reward": 0.9926687479019165, + "step": 3395, + "temperature": 0.9 + }, + { + "advantages": -2.5544848085701233e-06, + "completion_length": 451.0, + "delta_ref_entropy_loss": 0.07470703125, + "delta_ref_ppl": -0.15625, + "entropy_loss": -0.1513671875, + "epoch": 0.6792, + "grad_norm": 1.2079271562207088, + "k1_kl": 0.15625, + "k3_kl": 0.109375, + "kimi_kl": 0.326171875, + "learning_rate": 1.6039999999999998e-07, + "loss": 0.0044, + "ppl": 0.06298828125, + "reward": 0.8486641645431519, + "reward_std": 0.003228012705221772, + "rewards/perpo_ocr_edit_distance_reward": 0.8486642241477966, + "step": 3396, + "temperature": 0.9 + }, + { + "advantages": -0.0001736538833938539, + "completion_length": 606.0, + "delta_ref_entropy_loss": 0.01953125, + "delta_ref_ppl": -0.042236328125, + "entropy_loss": -0.022705078125, + "epoch": 0.6794, + "grad_norm": 0.1766394164952927, + "k1_kl": 0.0419921875, + "k3_kl": 0.0289306640625, + "kimi_kl": 0.12060546875, + "learning_rate": 1.603e-07, + "loss": 0.0013, + "ppl": 0.00518798828125, + "reward": 0.9989957213401794, + "reward_std": 0.0003413500089664012, + "rewards/perpo_ocr_edit_distance_reward": 0.9989957809448242, + "step": 3397, + "temperature": 0.9 + }, + { + "advantages": 3.4059798537100505e-08, + "completion_length": 198.0, + "delta_ref_entropy_loss": -1.234375, + "delta_ref_ppl": -1.0, + "entropy_loss": -2.578125, + "epoch": 0.6796, + "grad_norm": 18.199367352285908, + "k1_kl": 1.0, + "k3_kl": 1.7421875, + "kimi_kl": 5.03125, + "learning_rate": 1.602e-07, + "loss": 0.0696, + "ppl": 1.1875, + "reward": 0.09968671947717667, + "reward_std": 0.09630334377288818, + "rewards/perpo_ocr_edit_distance_reward": 0.09968671947717667, + "step": 3398, + "temperature": 0.9 + }, + { + "advantages": -5.435943967313506e-05, + "completion_length": 452.0, + "delta_ref_entropy_loss": 0.047119140625, + "delta_ref_ppl": -0.10546875, + "entropy_loss": -0.0810546875, + "epoch": 0.6798, + "grad_norm": 2.461649107880324, + "k1_kl": 0.10498046875, + "k3_kl": 0.07861328125, + "kimi_kl": 0.3203125, + "learning_rate": 1.6009999999999998e-07, + "loss": 0.0032, + "ppl": 0.03662109375, + "reward": 0.9921488761901855, + "reward_std": 0.0011532059870660305, + "rewards/perpo_ocr_edit_distance_reward": 0.9921489953994751, + "step": 3399, + "temperature": 0.9 + }, + { + "advantages": -0.00012452263035811484, + "completion_length": 460.0, + "delta_ref_entropy_loss": 0.029296875, + "delta_ref_ppl": -0.083984375, + "entropy_loss": -0.05322265625, + "epoch": 0.68, + "grad_norm": 0.8277518222962563, + "k1_kl": 0.083984375, + "k3_kl": 0.06787109375, + "kimi_kl": 0.23046875, + "learning_rate": 1.6e-07, + "loss": 0.0028, + "ppl": 0.021728515625, + "reward": 0.9957069158554077, + "reward_std": 0.0006522452458739281, + "rewards/perpo_ocr_edit_distance_reward": 0.9957070350646973, + "step": 3400, + "temperature": 0.9 + }, + { + "advantages": -3.065381974920456e-07, + "completion_length": 1986.0, + "delta_ref_entropy_loss": -0.125, + "delta_ref_ppl": -0.0264892578125, + "entropy_loss": -0.390625, + "epoch": 0.6802, + "grad_norm": 7.20541695206614, + "k1_kl": 0.0262451171875, + "k3_kl": 0.03759765625, + "kimi_kl": 0.06689453125, + "learning_rate": 1.5989999999999997e-07, + "loss": 0.0015, + "ppl": 0.2109375, + "reward": 0.8049660325050354, + "reward_std": 0.20625649392604828, + "rewards/perpo_ocr_edit_distance_reward": 0.804966151714325, + "step": 3401, + "temperature": 0.9 + }, + { + "advantages": -5.674362546415068e-05, + "completion_length": 461.0, + "delta_ref_entropy_loss": 0.0654296875, + "delta_ref_ppl": -0.0966796875, + "entropy_loss": -0.0859375, + "epoch": 0.6804, + "grad_norm": 0.46370482969241106, + "k1_kl": 0.0966796875, + "k3_kl": 0.057861328125, + "kimi_kl": 0.1474609375, + "learning_rate": 1.598e-07, + "loss": 0.0024, + "ppl": 0.03125, + "reward": 0.6785522699356079, + "reward_std": 0.0006503775948658586, + "rewards/perpo_ocr_edit_distance_reward": 0.6785523295402527, + "step": 3402, + "temperature": 0.9 + }, + { + "advantages": -7.455689774360508e-05, + "completion_length": 842.0, + "delta_ref_entropy_loss": 0.03955078125, + "delta_ref_ppl": -0.06396484375, + "entropy_loss": -0.037353515625, + "epoch": 0.6806, + "grad_norm": 0.5512840082437078, + "k1_kl": 0.06396484375, + "k3_kl": 0.041748046875, + "kimi_kl": 0.1474609375, + "learning_rate": 1.597e-07, + "loss": 0.0017, + "ppl": 0.0133056640625, + "reward": 0.9953542351722717, + "reward_std": 0.0005851965979672968, + "rewards/perpo_ocr_edit_distance_reward": 0.9953542947769165, + "step": 3403, + "temperature": 0.9 + }, + { + "advantages": -0.00010232414933852851, + "completion_length": 538.0, + "delta_ref_entropy_loss": 0.055419921875, + "delta_ref_ppl": -0.06787109375, + "entropy_loss": -0.0634765625, + "epoch": 0.6808, + "grad_norm": 0.9755346794477971, + "k1_kl": 0.06787109375, + "k3_kl": 0.0400390625, + "kimi_kl": 0.10986328125, + "learning_rate": 1.5959999999999997e-07, + "loss": 0.0017, + "ppl": 0.0279541015625, + "reward": 0.9943145513534546, + "reward_std": 0.0008985447930172086, + "rewards/perpo_ocr_edit_distance_reward": 0.9943146705627441, + "step": 3404, + "temperature": 0.9 + }, + { + "advantages": 1.1103494216513354e-05, + "completion_length": 779.0, + "delta_ref_entropy_loss": 0.0198974609375, + "delta_ref_ppl": -0.0341796875, + "entropy_loss": -0.039306640625, + "epoch": 0.681, + "grad_norm": 0.383927802699907, + "k1_kl": 0.0341796875, + "k3_kl": 0.02001953125, + "kimi_kl": 0.0595703125, + "learning_rate": 1.595e-07, + "loss": 0.0008, + "ppl": 0.011962890625, + "reward": 0.9972954392433167, + "reward_std": 0.0006698347278870642, + "rewards/perpo_ocr_edit_distance_reward": 0.9972954392433167, + "step": 3405, + "temperature": 0.9 + }, + { + "advantages": 6.123951607150957e-05, + "completion_length": 398.0, + "delta_ref_entropy_loss": 0.056640625, + "delta_ref_ppl": -0.10400390625, + "entropy_loss": -0.04296875, + "epoch": 0.6812, + "grad_norm": 0.45462158916598866, + "k1_kl": 0.10400390625, + "k3_kl": 0.0732421875, + "kimi_kl": 0.2490234375, + "learning_rate": 1.5939999999999998e-07, + "loss": 0.0029, + "ppl": 0.0172119140625, + "reward": 0.9935277700424194, + "reward_std": 0.0007342671742662787, + "rewards/perpo_ocr_edit_distance_reward": 0.9935277104377747, + "step": 3406, + "temperature": 0.9 + }, + { + "advantages": -1.7029899268550253e-08, + "completion_length": 622.0, + "delta_ref_entropy_loss": 0.0233154296875, + "delta_ref_ppl": -0.041748046875, + "entropy_loss": -0.0361328125, + "epoch": 0.6814, + "grad_norm": 0.287053866907503, + "k1_kl": 0.041748046875, + "k3_kl": 0.0255126953125, + "kimi_kl": 0.064453125, + "learning_rate": 1.5929999999999998e-07, + "loss": 0.001, + "ppl": 0.01177978515625, + "reward": 0.9971191883087158, + "reward_std": 0.000563415524084121, + "rewards/perpo_ocr_edit_distance_reward": 0.9971192479133606, + "step": 3407, + "temperature": 0.9 + }, + { + "advantages": -4.364763299236074e-05, + "completion_length": 504.0, + "delta_ref_entropy_loss": 0.044677734375, + "delta_ref_ppl": -0.0595703125, + "entropy_loss": -0.0419921875, + "epoch": 0.6816, + "grad_norm": 0.6479596161229131, + "k1_kl": 0.0595703125, + "k3_kl": 0.033935546875, + "kimi_kl": 0.0888671875, + "learning_rate": 1.592e-07, + "loss": 0.0014, + "ppl": 0.013916015625, + "reward": 0.9977877736091614, + "reward_std": 0.00048530337517149746, + "rewards/perpo_ocr_edit_distance_reward": 0.9977877736091614, + "step": 3408, + "temperature": 0.9 + }, + { + "advantages": -3.4894263080786914e-05, + "completion_length": 267.0, + "delta_ref_entropy_loss": 0.1123046875, + "delta_ref_ppl": -0.1904296875, + "entropy_loss": -0.298828125, + "epoch": 0.6818, + "grad_norm": 4.047539537052404, + "k1_kl": 0.1904296875, + "k3_kl": 0.130859375, + "kimi_kl": 0.482421875, + "learning_rate": 1.591e-07, + "loss": 0.0053, + "ppl": 0.150390625, + "reward": 0.9458287954330444, + "reward_std": 0.002340438077226281, + "rewards/perpo_ocr_edit_distance_reward": 0.945828914642334, + "step": 3409, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 606.0, + "delta_ref_entropy_loss": 0.035400390625, + "delta_ref_ppl": -0.049072265625, + "entropy_loss": -0.06103515625, + "epoch": 0.682, + "grad_norm": 0.8887564163531192, + "k1_kl": 0.049072265625, + "k3_kl": 0.026611328125, + "kimi_kl": 0.068359375, + "learning_rate": 1.59e-07, + "loss": 0.0011, + "ppl": 0.02197265625, + "reward": 0.955611526966095, + "reward_std": 0.0009939405135810375, + "rewards/perpo_ocr_edit_distance_reward": 0.9556115865707397, + "step": 3410, + "temperature": 0.9 + }, + { + "advantages": -2.486365247023059e-06, + "completion_length": 355.0, + "delta_ref_entropy_loss": 0.08251953125, + "delta_ref_ppl": -0.1826171875, + "entropy_loss": -0.5859375, + "epoch": 0.6822, + "grad_norm": 3.433994568613095, + "k1_kl": 0.1826171875, + "k3_kl": 0.13671875, + "kimi_kl": 0.40234375, + "learning_rate": 1.589e-07, + "loss": 0.0055, + "ppl": 0.3046875, + "reward": 0.7862300276756287, + "reward_std": 0.006722245831042528, + "rewards/perpo_ocr_edit_distance_reward": 0.7862300276756287, + "step": 3411, + "temperature": 0.9 + }, + { + "advantages": -7.40800601306546e-07, + "completion_length": 258.0, + "delta_ref_entropy_loss": -0.09912109375, + "delta_ref_ppl": -0.072265625, + "entropy_loss": -0.7421875, + "epoch": 0.6824, + "grad_norm": 3.32112799501516, + "k1_kl": 0.0732421875, + "k3_kl": 0.0751953125, + "kimi_kl": 0.1787109375, + "learning_rate": 1.588e-07, + "loss": 0.003, + "ppl": 0.416015625, + "reward": 0.2961522936820984, + "reward_std": 0.03983399644494057, + "rewards/perpo_ocr_edit_distance_reward": 0.2961523234844208, + "step": 3412, + "temperature": 0.9 + }, + { + "advantages": -2.7450068955658935e-05, + "completion_length": 1098.0, + "delta_ref_entropy_loss": 0.01055908203125, + "delta_ref_ppl": -0.046875, + "entropy_loss": -0.04296875, + "epoch": 0.6826, + "grad_norm": 0.547511916052681, + "k1_kl": 0.047119140625, + "k3_kl": 0.03466796875, + "kimi_kl": 0.087890625, + "learning_rate": 1.587e-07, + "loss": 0.0014, + "ppl": 0.0172119140625, + "reward": 0.9746757745742798, + "reward_std": 0.0014507152372971177, + "rewards/perpo_ocr_edit_distance_reward": 0.9746758341789246, + "step": 3413, + "temperature": 0.9 + }, + { + "advantages": -0.00013595819473266602, + "completion_length": 671.0, + "delta_ref_entropy_loss": 0.035400390625, + "delta_ref_ppl": -0.05078125, + "entropy_loss": -0.043212890625, + "epoch": 0.6828, + "grad_norm": 0.313406459439613, + "k1_kl": 0.050537109375, + "k3_kl": 0.0289306640625, + "kimi_kl": 0.07373046875, + "learning_rate": 1.5859999999999998e-07, + "loss": 0.0013, + "ppl": 0.0169677734375, + "reward": 0.9946398735046387, + "reward_std": 0.0004010105039924383, + "rewards/perpo_ocr_edit_distance_reward": 0.9946399331092834, + "step": 3414, + "temperature": 0.9 + }, + { + "advantages": -1.2380736734485254e-05, + "completion_length": 366.0, + "delta_ref_entropy_loss": 0.055908203125, + "delta_ref_ppl": -0.10302734375, + "entropy_loss": -0.060302734375, + "epoch": 0.683, + "grad_norm": 0.5978950232243228, + "k1_kl": 0.10302734375, + "k3_kl": 0.06640625, + "kimi_kl": 0.1875, + "learning_rate": 1.585e-07, + "loss": 0.0027, + "ppl": 0.0228271484375, + "reward": 0.9953314065933228, + "reward_std": 0.0005880560493096709, + "rewards/perpo_ocr_edit_distance_reward": 0.9953314661979675, + "step": 3415, + "temperature": 0.9 + }, + { + "advantages": -1.291717853746377e-05, + "completion_length": 319.0, + "delta_ref_entropy_loss": 0.055908203125, + "delta_ref_ppl": -0.1455078125, + "entropy_loss": -0.1953125, + "epoch": 0.6832, + "grad_norm": 1.7396218801154189, + "k1_kl": 0.1455078125, + "k3_kl": 0.103515625, + "kimi_kl": 0.275390625, + "learning_rate": 1.5840000000000002e-07, + "loss": 0.0041, + "ppl": 0.08740234375, + "reward": 0.8018943667411804, + "reward_std": 0.002540633315220475, + "rewards/perpo_ocr_edit_distance_reward": 0.8018943667411804, + "step": 3416, + "temperature": 0.9 + }, + { + "advantages": -1.043081283569336e-06, + "completion_length": 480.0, + "delta_ref_entropy_loss": -0.04150390625, + "delta_ref_ppl": -0.1376953125, + "entropy_loss": -0.37109375, + "epoch": 0.6834, + "grad_norm": 3.7259128748126593, + "k1_kl": 0.1376953125, + "k3_kl": 0.1162109375, + "kimi_kl": 0.42578125, + "learning_rate": 1.5829999999999998e-07, + "loss": 0.0047, + "ppl": 0.1474609375, + "reward": 0.7390894889831543, + "reward_std": 0.10449808090925217, + "rewards/perpo_ocr_edit_distance_reward": 0.7390896081924438, + "step": 3417, + "temperature": 0.9 + }, + { + "advantages": -0.00015247668488882482, + "completion_length": 860.0, + "delta_ref_entropy_loss": 0.0198974609375, + "delta_ref_ppl": -0.041015625, + "entropy_loss": -0.0537109375, + "epoch": 0.6836, + "grad_norm": 0.35714836496220304, + "k1_kl": 0.041015625, + "k3_kl": 0.0272216796875, + "kimi_kl": 0.060791015625, + "learning_rate": 1.582e-07, + "loss": 0.0012, + "ppl": 0.0191650390625, + "reward": 0.9971500039100647, + "reward_std": 0.0002908825990743935, + "rewards/perpo_ocr_edit_distance_reward": 0.9971500635147095, + "step": 3418, + "temperature": 0.9 + }, + { + "advantages": -3.136907616863027e-05, + "completion_length": 1208.0, + "delta_ref_entropy_loss": 0.0322265625, + "delta_ref_ppl": -0.038818359375, + "entropy_loss": -0.06640625, + "epoch": 0.6838, + "grad_norm": 0.6266446827173443, + "k1_kl": 0.038818359375, + "k3_kl": 0.0218505859375, + "kimi_kl": 0.0537109375, + "learning_rate": 1.581e-07, + "loss": 0.0009, + "ppl": 0.0289306640625, + "reward": 0.9800387024879456, + "reward_std": 0.0009853239171206951, + "rewards/perpo_ocr_edit_distance_reward": 0.9800387620925903, + "step": 3419, + "temperature": 0.9 + }, + { + "advantages": -2.384185791015625e-07, + "completion_length": 762.0, + "delta_ref_entropy_loss": -0.09130859375, + "delta_ref_ppl": -0.08642578125, + "entropy_loss": -0.39453125, + "epoch": 0.684, + "grad_norm": 3.754731781945414, + "k1_kl": 0.0859375, + "k3_kl": 0.07958984375, + "kimi_kl": 0.189453125, + "learning_rate": 1.5799999999999999e-07, + "loss": 0.0032, + "ppl": 0.177734375, + "reward": 0.6236670017242432, + "reward_std": 0.2222256064414978, + "rewards/perpo_ocr_edit_distance_reward": 0.6236671209335327, + "step": 3420, + "temperature": 0.9 + }, + { + "advantages": 9.238720508619735e-07, + "completion_length": 63.0, + "delta_ref_entropy_loss": 0.1259765625, + "delta_ref_ppl": -0.63671875, + "entropy_loss": -0.33203125, + "epoch": 0.6842, + "grad_norm": 6.431714642026962, + "k1_kl": 0.63671875, + "k3_kl": 0.49609375, + "kimi_kl": 1.9375, + "learning_rate": 1.579e-07, + "loss": 0.0198, + "ppl": 0.12353515625, + "reward": 0.9398682117462158, + "reward_std": 0.00904919020831585, + "rewards/perpo_ocr_edit_distance_reward": 0.9398682713508606, + "step": 3421, + "temperature": 0.9 + }, + { + "advantages": -2.741813887041644e-06, + "completion_length": 599.0, + "delta_ref_entropy_loss": 0.035888671875, + "delta_ref_ppl": -0.078125, + "entropy_loss": -0.232421875, + "epoch": 0.6844, + "grad_norm": 1.5730867472172831, + "k1_kl": 0.078125, + "k3_kl": 0.056396484375, + "kimi_kl": 0.1376953125, + "learning_rate": 1.578e-07, + "loss": 0.0023, + "ppl": 0.123046875, + "reward": 0.9512845873832703, + "reward_std": 0.003006379120051861, + "rewards/perpo_ocr_edit_distance_reward": 0.951284646987915, + "step": 3422, + "temperature": 0.9 + }, + { + "advantages": -1.7029899268550253e-08, + "completion_length": 359.0, + "delta_ref_entropy_loss": 0.05322265625, + "delta_ref_ppl": -0.1337890625, + "entropy_loss": -0.0810546875, + "epoch": 0.6846, + "grad_norm": 1.1382186849483908, + "k1_kl": 0.134765625, + "k3_kl": 0.09912109375, + "kimi_kl": 0.31640625, + "learning_rate": 1.577e-07, + "loss": 0.004, + "ppl": 0.037109375, + "reward": 0.9822876453399658, + "reward_std": 0.008570464327931404, + "rewards/perpo_ocr_edit_distance_reward": 0.9822877049446106, + "step": 3423, + "temperature": 0.9 + }, + { + "advantages": -5.538123150472529e-05, + "completion_length": 1214.0, + "delta_ref_entropy_loss": 0.044677734375, + "delta_ref_ppl": -0.08203125, + "entropy_loss": -0.0830078125, + "epoch": 0.6848, + "grad_norm": 0.6533534366938051, + "k1_kl": 0.08203125, + "k3_kl": 0.06103515625, + "kimi_kl": 0.12451171875, + "learning_rate": 1.5759999999999998e-07, + "loss": 0.0025, + "ppl": 0.037353515625, + "reward": 0.9436575174331665, + "reward_std": 0.0008223854820244014, + "rewards/perpo_ocr_edit_distance_reward": 0.9436575770378113, + "step": 3424, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 975.0, + "delta_ref_entropy_loss": 0.028564453125, + "delta_ref_ppl": -0.03369140625, + "entropy_loss": -0.035888671875, + "epoch": 0.685, + "grad_norm": 2.5443863543158436, + "k1_kl": 0.033447265625, + "k3_kl": 0.017578125, + "kimi_kl": 0.04052734375, + "learning_rate": 1.575e-07, + "loss": 0.0007, + "ppl": 0.0133056640625, + "reward": 0.9948180913925171, + "reward_std": 0.0003199756029061973, + "rewards/perpo_ocr_edit_distance_reward": 0.9948180913925171, + "step": 3425, + "temperature": 0.9 + }, + { + "advantages": -1.2602125707417144e-06, + "completion_length": 1010.0, + "delta_ref_entropy_loss": 0.0035858154296875, + "delta_ref_ppl": -0.027587890625, + "entropy_loss": -0.06494140625, + "epoch": 0.6852, + "grad_norm": 0.6100789966648846, + "k1_kl": 0.027587890625, + "k3_kl": 0.019287109375, + "kimi_kl": 0.054931640625, + "learning_rate": 1.574e-07, + "loss": 0.0008, + "ppl": 0.0191650390625, + "reward": 0.9749366641044617, + "reward_std": 0.048096008598804474, + "rewards/perpo_ocr_edit_distance_reward": 0.9749366641044617, + "step": 3426, + "temperature": 0.9 + }, + { + "advantages": -7.74860438923497e-07, + "completion_length": 398.0, + "delta_ref_entropy_loss": 0.004058837890625, + "delta_ref_ppl": -0.06787109375, + "entropy_loss": -0.11181640625, + "epoch": 0.6854, + "grad_norm": 1.1094803714025674, + "k1_kl": 0.068359375, + "k3_kl": 0.053466796875, + "kimi_kl": 0.12255859375, + "learning_rate": 1.5729999999999999e-07, + "loss": 0.0021, + "ppl": 0.045166015625, + "reward": 0.854506254196167, + "reward_std": 0.07664426416158676, + "rewards/perpo_ocr_edit_distance_reward": 0.8545063138008118, + "step": 3427, + "temperature": 0.9 + }, + { + "advantages": -7.19172676326707e-05, + "completion_length": 938.0, + "delta_ref_entropy_loss": 0.007568359375, + "delta_ref_ppl": -0.03955078125, + "entropy_loss": -0.0400390625, + "epoch": 0.6856, + "grad_norm": 0.36216172183386214, + "k1_kl": 0.039306640625, + "k3_kl": 0.02978515625, + "kimi_kl": 0.10498046875, + "learning_rate": 1.572e-07, + "loss": 0.0013, + "ppl": 0.016845703125, + "reward": 0.9861342310905457, + "reward_std": 0.0008472228073514998, + "rewards/perpo_ocr_edit_distance_reward": 0.9861342906951904, + "step": 3428, + "temperature": 0.9 + }, + { + "advantages": -2.0095281797694042e-05, + "completion_length": 1217.0, + "delta_ref_entropy_loss": 0.018310546875, + "delta_ref_ppl": -0.03076171875, + "entropy_loss": -0.08984375, + "epoch": 0.6858, + "grad_norm": 1.2257822368047497, + "k1_kl": 0.0308837890625, + "k3_kl": 0.0233154296875, + "kimi_kl": 0.04296875, + "learning_rate": 1.5709999999999997e-07, + "loss": 0.001, + "ppl": 0.0390625, + "reward": 0.9889686703681946, + "reward_std": 0.001595254405401647, + "rewards/perpo_ocr_edit_distance_reward": 0.9889687299728394, + "step": 3429, + "temperature": 0.9 + }, + { + "advantages": -2.418245685475995e-06, + "completion_length": 722.0, + "delta_ref_entropy_loss": 0.017333984375, + "delta_ref_ppl": -0.041748046875, + "entropy_loss": -0.04541015625, + "epoch": 0.686, + "grad_norm": 0.6843533138768455, + "k1_kl": 0.04150390625, + "k3_kl": 0.031494140625, + "kimi_kl": 0.09033203125, + "learning_rate": 1.57e-07, + "loss": 0.0013, + "ppl": 0.0181884765625, + "reward": 0.9865135550498962, + "reward_std": 0.010469142347574234, + "rewards/perpo_ocr_edit_distance_reward": 0.986513614654541, + "step": 3430, + "temperature": 0.9 + }, + { + "advantages": -6.931169082236011e-06, + "completion_length": 468.0, + "delta_ref_entropy_loss": 0.09228515625, + "delta_ref_ppl": -0.10498046875, + "entropy_loss": -0.11962890625, + "epoch": 0.6862, + "grad_norm": 1.12881183565663, + "k1_kl": 0.10498046875, + "k3_kl": 0.061767578125, + "kimi_kl": 0.1533203125, + "learning_rate": 1.569e-07, + "loss": 0.0025, + "ppl": 0.054931640625, + "reward": 0.9105949997901917, + "reward_std": 0.0035905237309634686, + "rewards/perpo_ocr_edit_distance_reward": 0.9105950593948364, + "step": 3431, + "temperature": 0.9 + }, + { + "advantages": 7.808208465576172e-06, + "completion_length": 703.0, + "delta_ref_entropy_loss": 0.046630859375, + "delta_ref_ppl": -0.08251953125, + "entropy_loss": -0.1708984375, + "epoch": 0.6864, + "grad_norm": 1.1395412414968653, + "k1_kl": 0.08251953125, + "k3_kl": 0.05224609375, + "kimi_kl": 0.1640625, + "learning_rate": 1.5679999999999997e-07, + "loss": 0.0021, + "ppl": 0.07421875, + "reward": 0.9491336941719055, + "reward_std": 0.004266089294105768, + "rewards/perpo_ocr_edit_distance_reward": 0.9491337537765503, + "step": 3432, + "temperature": 0.9 + }, + { + "advantages": -1.6178404621314257e-05, + "completion_length": 578.0, + "delta_ref_entropy_loss": 0.08642578125, + "delta_ref_ppl": -0.08251953125, + "entropy_loss": -0.25, + "epoch": 0.6866, + "grad_norm": 1.1781863331204907, + "k1_kl": 0.08203125, + "k3_kl": 0.048583984375, + "kimi_kl": 0.08251953125, + "learning_rate": 1.567e-07, + "loss": 0.002, + "ppl": 0.10595703125, + "reward": 0.8799965977668762, + "reward_std": 0.0020080641843378544, + "rewards/perpo_ocr_edit_distance_reward": 0.879996657371521, + "step": 3433, + "temperature": 0.9 + }, + { + "advantages": -1.7029899268550253e-08, + "completion_length": 197.0, + "delta_ref_entropy_loss": 0.060546875, + "delta_ref_ppl": -0.166015625, + "entropy_loss": -0.1474609375, + "epoch": 0.6868, + "grad_norm": 0.8533442221313281, + "k1_kl": 0.166015625, + "k3_kl": 0.11865234375, + "kimi_kl": 0.392578125, + "learning_rate": 1.5659999999999999e-07, + "loss": 0.0047, + "ppl": 0.050537109375, + "reward": 0.9519274234771729, + "reward_std": 0.003632847685366869, + "rewards/perpo_ocr_edit_distance_reward": 0.9519274234771729, + "step": 3434, + "temperature": 0.9 + }, + { + "advantages": -0.00011500291293486953, + "completion_length": 687.0, + "delta_ref_entropy_loss": 0.04541015625, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.060791015625, + "epoch": 0.687, + "grad_norm": 0.5252352715417573, + "k1_kl": 0.06689453125, + "k3_kl": 0.03857421875, + "kimi_kl": 0.1201171875, + "learning_rate": 1.565e-07, + "loss": 0.0017, + "ppl": 0.0208740234375, + "reward": 0.8403076529502869, + "reward_std": 0.0006405194872058928, + "rewards/perpo_ocr_edit_distance_reward": 0.8403077721595764, + "step": 3435, + "temperature": 0.9 + }, + { + "advantages": -3.451108932495117e-05, + "completion_length": 180.0, + "delta_ref_entropy_loss": 0.03076171875, + "delta_ref_ppl": -0.11376953125, + "entropy_loss": -0.07080078125, + "epoch": 0.6872, + "grad_norm": 1.0785238181284738, + "k1_kl": 0.11376953125, + "k3_kl": 0.0927734375, + "kimi_kl": 0.2392578125, + "learning_rate": 1.564e-07, + "loss": 0.0038, + "ppl": 0.0284423828125, + "reward": 0.9844686388969421, + "reward_std": 0.0016274163499474525, + "rewards/perpo_ocr_edit_distance_reward": 0.9844687581062317, + "step": 3436, + "temperature": 0.9 + }, + { + "advantages": -3.193106022081338e-05, + "completion_length": 501.0, + "delta_ref_entropy_loss": 0.036376953125, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.10888671875, + "epoch": 0.6874, + "grad_norm": 0.8509395440218318, + "k1_kl": 0.0703125, + "k3_kl": 0.04833984375, + "kimi_kl": 0.12060546875, + "learning_rate": 1.563e-07, + "loss": 0.002, + "ppl": 0.043212890625, + "reward": 0.9556910395622253, + "reward_std": 0.0012333656195551157, + "rewards/perpo_ocr_edit_distance_reward": 0.9556910395622253, + "step": 3437, + "temperature": 0.9 + }, + { + "advantages": -5.500657607626636e-06, + "completion_length": 1065.0, + "delta_ref_entropy_loss": 0.0419921875, + "delta_ref_ppl": -0.06640625, + "entropy_loss": -0.08544921875, + "epoch": 0.6876, + "grad_norm": 4.337205789496146, + "k1_kl": 0.06640625, + "k3_kl": 0.049072265625, + "kimi_kl": 0.10107421875, + "learning_rate": 1.562e-07, + "loss": 0.002, + "ppl": 0.0439453125, + "reward": 0.9571397304534912, + "reward_std": 0.009206713177263737, + "rewards/perpo_ocr_edit_distance_reward": 0.9571398496627808, + "step": 3438, + "temperature": 0.9 + }, + { + "advantages": -2.0776476503669983e-06, + "completion_length": 747.0, + "delta_ref_entropy_loss": 0.0228271484375, + "delta_ref_ppl": -0.1650390625, + "entropy_loss": -0.8125, + "epoch": 0.6878, + "grad_norm": 3.3491347327648997, + "k1_kl": 0.1640625, + "k3_kl": 0.1376953125, + "kimi_kl": 0.279296875, + "learning_rate": 1.5609999999999998e-07, + "loss": 0.0055, + "ppl": 0.46484375, + "reward": 0.6597687602043152, + "reward_std": 0.008164102211594582, + "rewards/perpo_ocr_edit_distance_reward": 0.65976881980896, + "step": 3439, + "temperature": 0.9 + }, + { + "advantages": -0.0002423184341751039, + "completion_length": 1159.0, + "delta_ref_entropy_loss": 0.052734375, + "delta_ref_ppl": -0.054443359375, + "entropy_loss": -0.053955078125, + "epoch": 0.688, + "grad_norm": 0.4466818556656486, + "k1_kl": 0.0546875, + "k3_kl": 0.031494140625, + "kimi_kl": 0.07666015625, + "learning_rate": 1.56e-07, + "loss": 0.0015, + "ppl": 0.021240234375, + "reward": 0.9930791258811951, + "reward_std": 0.0003568078391253948, + "rewards/perpo_ocr_edit_distance_reward": 0.9930792450904846, + "step": 3440, + "temperature": 0.9 + }, + { + "advantages": -4.2932377255056053e-05, + "completion_length": 544.0, + "delta_ref_entropy_loss": 0.01092529296875, + "delta_ref_ppl": -0.050537109375, + "entropy_loss": -0.056396484375, + "epoch": 0.6882, + "grad_norm": 0.626090681575527, + "k1_kl": 0.05029296875, + "k3_kl": 0.03857421875, + "kimi_kl": 0.11279296875, + "learning_rate": 1.559e-07, + "loss": 0.0016, + "ppl": 0.01513671875, + "reward": 0.9802250266075134, + "reward_std": 0.0012884229654446244, + "rewards/perpo_ocr_edit_distance_reward": 0.9802250862121582, + "step": 3441, + "temperature": 0.9 + }, + { + "advantages": -5.236694050836377e-05, + "completion_length": 191.0, + "delta_ref_entropy_loss": 0.0150146484375, + "delta_ref_ppl": -0.1484375, + "entropy_loss": -0.07275390625, + "epoch": 0.6884, + "grad_norm": 1.284871991909272, + "k1_kl": 0.1484375, + "k3_kl": 0.11865234375, + "kimi_kl": 0.58984375, + "learning_rate": 1.5579999999999998e-07, + "loss": 0.0048, + "ppl": 0.03515625, + "reward": 0.9937475919723511, + "reward_std": 0.001363306655548513, + "rewards/perpo_ocr_edit_distance_reward": 0.9937477707862854, + "step": 3442, + "temperature": 0.9 + }, + { + "advantages": -8.048330346355215e-05, + "completion_length": 477.0, + "delta_ref_entropy_loss": 0.024658203125, + "delta_ref_ppl": -0.0654296875, + "entropy_loss": -0.045166015625, + "epoch": 0.6886, + "grad_norm": 0.4476470061981648, + "k1_kl": 0.0654296875, + "k3_kl": 0.04541015625, + "kimi_kl": 0.171875, + "learning_rate": 1.557e-07, + "loss": 0.0019, + "ppl": 0.0146484375, + "reward": 0.9978536367416382, + "reward_std": 0.0008522068383172154, + "rewards/perpo_ocr_edit_distance_reward": 0.9978537559509277, + "step": 3443, + "temperature": 0.9 + }, + { + "advantages": 1.5548297596978955e-05, + "completion_length": 672.0, + "delta_ref_entropy_loss": 0.037841796875, + "delta_ref_ppl": -0.042724609375, + "entropy_loss": -0.038330078125, + "epoch": 0.6888, + "grad_norm": 0.41855925670619826, + "k1_kl": 0.042724609375, + "k3_kl": 0.02197265625, + "kimi_kl": 0.06689453125, + "learning_rate": 1.556e-07, + "loss": 0.0009, + "ppl": 0.012451171875, + "reward": 0.9978569746017456, + "reward_std": 0.0015433378284797072, + "rewards/perpo_ocr_edit_distance_reward": 0.9978569746017456, + "step": 3444, + "temperature": 0.9 + }, + { + "advantages": -2.1287373783707153e-06, + "completion_length": 185.0, + "delta_ref_entropy_loss": 0.061279296875, + "delta_ref_ppl": -0.267578125, + "entropy_loss": -0.671875, + "epoch": 0.689, + "grad_norm": 4.128296192189152, + "k1_kl": 0.267578125, + "k3_kl": 0.2041015625, + "kimi_kl": 0.61328125, + "learning_rate": 1.5549999999999998e-07, + "loss": 0.0081, + "ppl": 0.349609375, + "reward": 0.6181597709655762, + "reward_std": 0.03585744649171829, + "rewards/perpo_ocr_edit_distance_reward": 0.6181598901748657, + "step": 3445, + "temperature": 0.9 + }, + { + "advantages": -8.174351933121216e-07, + "completion_length": 1156.0, + "delta_ref_entropy_loss": 0.033935546875, + "delta_ref_ppl": -0.060546875, + "entropy_loss": -0.1162109375, + "epoch": 0.6892, + "grad_norm": 2.82721606609643, + "k1_kl": 0.060546875, + "k3_kl": 0.0439453125, + "kimi_kl": 0.09521484375, + "learning_rate": 1.554e-07, + "loss": 0.0018, + "ppl": 0.060302734375, + "reward": 0.9832804203033447, + "reward_std": 0.0207622442394495, + "rewards/perpo_ocr_edit_distance_reward": 0.9832804203033447, + "step": 3446, + "temperature": 0.9 + }, + { + "advantages": -4.6713012125110254e-05, + "completion_length": 700.0, + "delta_ref_entropy_loss": 0.0255126953125, + "delta_ref_ppl": -0.05810546875, + "entropy_loss": -0.05859375, + "epoch": 0.6894, + "grad_norm": 0.8163071266045624, + "k1_kl": 0.05810546875, + "k3_kl": 0.041259765625, + "kimi_kl": 0.1279296875, + "learning_rate": 1.553e-07, + "loss": 0.0017, + "ppl": 0.022705078125, + "reward": 0.9962738752365112, + "reward_std": 0.0017224631737917662, + "rewards/perpo_ocr_edit_distance_reward": 0.9962739944458008, + "step": 3447, + "temperature": 0.9 + }, + { + "advantages": -6.731918983859941e-05, + "completion_length": 758.0, + "delta_ref_entropy_loss": 0.023193359375, + "delta_ref_ppl": -0.045166015625, + "entropy_loss": -0.027099609375, + "epoch": 0.6896, + "grad_norm": 0.15969904200347695, + "k1_kl": 0.045166015625, + "k3_kl": 0.03173828125, + "kimi_kl": 0.09130859375, + "learning_rate": 1.552e-07, + "loss": 0.0013, + "ppl": 0.0069580078125, + "reward": 0.9978910088539124, + "reward_std": 0.0004058956110384315, + "rewards/perpo_ocr_edit_distance_reward": 0.9978910684585571, + "step": 3448, + "temperature": 0.9 + }, + { + "advantages": -6.147793556010583e-06, + "completion_length": 852.0, + "delta_ref_entropy_loss": 0.0308837890625, + "delta_ref_ppl": -0.057861328125, + "entropy_loss": -0.12109375, + "epoch": 0.6898, + "grad_norm": 1.4162193628931112, + "k1_kl": 0.057861328125, + "k3_kl": 0.03515625, + "kimi_kl": 0.07666015625, + "learning_rate": 1.5509999999999998e-07, + "loss": 0.0014, + "ppl": 0.044921875, + "reward": 0.8983936309814453, + "reward_std": 0.005445471499115229, + "rewards/perpo_ocr_edit_distance_reward": 0.8983936905860901, + "step": 3449, + "temperature": 0.9 + }, + { + "advantages": -4.817758599529043e-05, + "completion_length": 1050.0, + "delta_ref_entropy_loss": 0.033447265625, + "delta_ref_ppl": -0.06787109375, + "entropy_loss": -0.056640625, + "epoch": 0.69, + "grad_norm": 1.2172938766706558, + "k1_kl": 0.06787109375, + "k3_kl": 0.0458984375, + "kimi_kl": 0.11279296875, + "learning_rate": 1.55e-07, + "loss": 0.0019, + "ppl": 0.024658203125, + "reward": 0.97457355260849, + "reward_std": 0.0013139430666342378, + "rewards/perpo_ocr_edit_distance_reward": 0.9745736122131348, + "step": 3450, + "temperature": 0.9 + }, + { + "advantages": -5.069801045465283e-05, + "completion_length": 861.0, + "delta_ref_entropy_loss": 0.044677734375, + "delta_ref_ppl": -0.06494140625, + "entropy_loss": -0.0458984375, + "epoch": 0.6902, + "grad_norm": 0.6750344965081603, + "k1_kl": 0.0654296875, + "k3_kl": 0.04150390625, + "kimi_kl": 0.109375, + "learning_rate": 1.549e-07, + "loss": 0.0017, + "ppl": 0.024169921875, + "reward": 0.9822673797607422, + "reward_std": 0.0014119968982413411, + "rewards/perpo_ocr_edit_distance_reward": 0.982267439365387, + "step": 3451, + "temperature": 0.9 + }, + { + "advantages": -4.4873784645460546e-05, + "completion_length": 535.0, + "delta_ref_entropy_loss": 0.05322265625, + "delta_ref_ppl": -0.07568359375, + "entropy_loss": -0.052490234375, + "epoch": 0.6904, + "grad_norm": 0.6479278230424984, + "k1_kl": 0.076171875, + "k3_kl": 0.056396484375, + "kimi_kl": 0.14453125, + "learning_rate": 1.5479999999999998e-07, + "loss": 0.0023, + "ppl": 0.0220947265625, + "reward": 0.9941996335983276, + "reward_std": 0.0006586700910702348, + "rewards/perpo_ocr_edit_distance_reward": 0.9941996335983276, + "step": 3452, + "temperature": 0.9 + }, + { + "advantages": -2.5808813006733544e-05, + "completion_length": 434.0, + "delta_ref_entropy_loss": 0.039306640625, + "delta_ref_ppl": -0.04638671875, + "entropy_loss": -0.0458984375, + "epoch": 0.6906, + "grad_norm": 0.6012293508778334, + "k1_kl": 0.046142578125, + "k3_kl": 0.0279541015625, + "kimi_kl": 0.06494140625, + "learning_rate": 1.547e-07, + "loss": 0.0011, + "ppl": 0.0166015625, + "reward": 0.9981194734573364, + "reward_std": 0.0012200119672343135, + "rewards/perpo_ocr_edit_distance_reward": 0.9981195330619812, + "step": 3453, + "temperature": 0.9 + }, + { + "advantages": 6.037099410605151e-06, + "completion_length": 456.0, + "delta_ref_entropy_loss": 0.08154296875, + "delta_ref_ppl": -0.10400390625, + "entropy_loss": -0.08154296875, + "epoch": 0.6908, + "grad_norm": 1.062496220645971, + "k1_kl": 0.10400390625, + "k3_kl": 0.064453125, + "kimi_kl": 0.173828125, + "learning_rate": 1.5459999999999997e-07, + "loss": 0.0026, + "ppl": 0.0341796875, + "reward": 0.9769176244735718, + "reward_std": 0.0013161810347810388, + "rewards/perpo_ocr_edit_distance_reward": 0.976917564868927, + "step": 3454, + "temperature": 0.9 + }, + { + "advantages": -3.713369369506836e-05, + "completion_length": 899.0, + "delta_ref_entropy_loss": 0.04052734375, + "delta_ref_ppl": -0.087890625, + "entropy_loss": -0.0498046875, + "epoch": 0.691, + "grad_norm": 0.41604364278000255, + "k1_kl": 0.087890625, + "k3_kl": 0.058837890625, + "kimi_kl": 0.1875, + "learning_rate": 1.545e-07, + "loss": 0.0024, + "ppl": 0.0203857421875, + "reward": 0.9959009885787964, + "reward_std": 0.0005881139077246189, + "rewards/perpo_ocr_edit_distance_reward": 0.9959009885787964, + "step": 3455, + "temperature": 0.9 + }, + { + "advantages": -3.1909774406813085e-05, + "completion_length": 149.0, + "delta_ref_entropy_loss": 0.11279296875, + "delta_ref_ppl": -0.2421875, + "entropy_loss": -0.1953125, + "epoch": 0.6912, + "grad_norm": 2.6758130327250056, + "k1_kl": 0.2412109375, + "k3_kl": 0.171875, + "kimi_kl": 0.498046875, + "learning_rate": 1.544e-07, + "loss": 0.0069, + "ppl": 0.0888671875, + "reward": 0.9541751742362976, + "reward_std": 0.002568611642345786, + "rewards/perpo_ocr_edit_distance_reward": 0.9541752934455872, + "step": 3456, + "temperature": 0.9 + }, + { + "advantages": -2.7963094908045605e-05, + "completion_length": 1271.0, + "delta_ref_entropy_loss": 0.034912109375, + "delta_ref_ppl": -0.03955078125, + "entropy_loss": -0.05126953125, + "epoch": 0.6914, + "grad_norm": 1217.7643708817077, + "k1_kl": 0.039794921875, + "k3_kl": 26.125, + "kimi_kl": 0.10791015625, + "learning_rate": 1.543e-07, + "loss": 1.0423, + "ppl": 0.0291748046875, + "reward": 0.9947072267532349, + "reward_std": 0.0011175759136676788, + "rewards/perpo_ocr_edit_distance_reward": 0.9947072863578796, + "step": 3457, + "temperature": 0.9 + }, + { + "advantages": -6.011554432916455e-06, + "completion_length": 953.0, + "delta_ref_entropy_loss": 0.048095703125, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.095703125, + "epoch": 0.6916, + "grad_norm": 1.0047587287052206, + "k1_kl": 0.0703125, + "k3_kl": 0.0439453125, + "kimi_kl": 0.115234375, + "learning_rate": 1.542e-07, + "loss": 0.0018, + "ppl": 0.046875, + "reward": 0.9574227929115295, + "reward_std": 0.0069908094592392445, + "rewards/perpo_ocr_edit_distance_reward": 0.9574228525161743, + "step": 3458, + "temperature": 0.9 + }, + { + "advantages": -1.7029899268550253e-08, + "completion_length": 589.0, + "delta_ref_entropy_loss": 0.058349609375, + "delta_ref_ppl": -0.09326171875, + "entropy_loss": -0.384765625, + "epoch": 0.6918, + "grad_norm": 1.7401967970482926, + "k1_kl": 0.09326171875, + "k3_kl": 0.0556640625, + "kimi_kl": 0.11572265625, + "learning_rate": 1.5409999999999998e-07, + "loss": 0.0022, + "ppl": 0.2001953125, + "reward": 0.9070252180099487, + "reward_std": 0.0030841680709272623, + "rewards/perpo_ocr_edit_distance_reward": 0.9070252180099487, + "step": 3459, + "temperature": 0.9 + }, + { + "advantages": -3.950936843466479e-06, + "completion_length": 238.0, + "delta_ref_entropy_loss": 0.041748046875, + "delta_ref_ppl": -0.1494140625, + "entropy_loss": -0.06640625, + "epoch": 0.692, + "grad_norm": 1.6599707284172835, + "k1_kl": 0.1494140625, + "k3_kl": 0.11669921875, + "kimi_kl": 0.53125, + "learning_rate": 1.54e-07, + "loss": 0.0047, + "ppl": 0.0244140625, + "reward": 0.9867452383041382, + "reward_std": 0.002055363031104207, + "rewards/perpo_ocr_edit_distance_reward": 0.9867452383041382, + "step": 3460, + "temperature": 0.9 + }, + { + "advantages": -3.4349308407399803e-05, + "completion_length": 411.0, + "delta_ref_entropy_loss": 0.031982421875, + "delta_ref_ppl": -0.0634765625, + "entropy_loss": -0.057861328125, + "epoch": 0.6922, + "grad_norm": 0.8324197752019254, + "k1_kl": 0.06396484375, + "k3_kl": 0.0458984375, + "kimi_kl": 0.11767578125, + "learning_rate": 1.539e-07, + "loss": 0.0019, + "ppl": 0.0341796875, + "reward": 0.9844850897789001, + "reward_std": 0.003618698799982667, + "rewards/perpo_ocr_edit_distance_reward": 0.9844851493835449, + "step": 3461, + "temperature": 0.9 + }, + { + "advantages": -7.385867502307519e-05, + "completion_length": 1596.0, + "delta_ref_entropy_loss": -0.011474609375, + "delta_ref_ppl": -0.0107421875, + "entropy_loss": -0.0250244140625, + "epoch": 0.6924, + "grad_norm": 0.08540126909119564, + "k1_kl": 0.01080322265625, + "k3_kl": 0.00909423828125, + "kimi_kl": 0.032958984375, + "learning_rate": 1.538e-07, + "loss": 0.0004, + "ppl": 0.004974365234375, + "reward": 0.9982607364654541, + "reward_std": 0.0005916060763411224, + "rewards/perpo_ocr_edit_distance_reward": 0.9982607960700989, + "step": 3462, + "temperature": 0.9 + }, + { + "advantages": 4.393713879835559e-06, + "completion_length": 693.0, + "delta_ref_entropy_loss": 0.04443359375, + "delta_ref_ppl": -0.09033203125, + "entropy_loss": -0.083984375, + "epoch": 0.6926, + "grad_norm": 1.5157038250155837, + "k1_kl": 0.08984375, + "k3_kl": 0.058837890625, + "kimi_kl": 0.1611328125, + "learning_rate": 1.537e-07, + "loss": 0.0023, + "ppl": 0.036376953125, + "reward": 0.9325546026229858, + "reward_std": 0.0018468867056071758, + "rewards/perpo_ocr_edit_distance_reward": 0.9325546026229858, + "step": 3463, + "temperature": 0.9 + }, + { + "advantages": -3.9986203773878515e-05, + "completion_length": 1243.0, + "delta_ref_entropy_loss": 0.0133056640625, + "delta_ref_ppl": -0.0289306640625, + "entropy_loss": -0.05029296875, + "epoch": 0.6928, + "grad_norm": 1.4370909232488642, + "k1_kl": 0.0289306640625, + "k3_kl": 0.0189208984375, + "kimi_kl": 0.03955078125, + "learning_rate": 1.5359999999999997e-07, + "loss": 0.0008, + "ppl": 0.022216796875, + "reward": 0.996246337890625, + "reward_std": 0.0018168054521083832, + "rewards/perpo_ocr_edit_distance_reward": 0.9962463974952698, + "step": 3464, + "temperature": 0.9 + }, + { + "advantages": -4.158701267442666e-05, + "completion_length": 116.0, + "delta_ref_entropy_loss": 0.12060546875, + "delta_ref_ppl": -0.24609375, + "entropy_loss": -0.12451171875, + "epoch": 0.693, + "grad_norm": 1.768875616376082, + "k1_kl": 0.2470703125, + "k3_kl": 0.1748046875, + "kimi_kl": 0.48828125, + "learning_rate": 1.535e-07, + "loss": 0.0071, + "ppl": 0.0556640625, + "reward": 0.9487488865852356, + "reward_std": 0.0019493461586534977, + "rewards/perpo_ocr_edit_distance_reward": 0.9487490653991699, + "step": 3465, + "temperature": 0.9 + }, + { + "advantages": -3.0943327146815136e-05, + "completion_length": 351.0, + "delta_ref_entropy_loss": 0.032958984375, + "delta_ref_ppl": -0.10986328125, + "entropy_loss": -0.0556640625, + "epoch": 0.6932, + "grad_norm": 0.607557651446616, + "k1_kl": 0.10986328125, + "k3_kl": 0.0869140625, + "kimi_kl": 0.267578125, + "learning_rate": 1.534e-07, + "loss": 0.0035, + "ppl": 0.0194091796875, + "reward": 0.9674611687660217, + "reward_std": 0.003203925909474492, + "rewards/perpo_ocr_edit_distance_reward": 0.9674612283706665, + "step": 3466, + "temperature": 0.9 + }, + { + "advantages": -2.6583673388813622e-05, + "completion_length": 1126.0, + "delta_ref_entropy_loss": 0.051025390625, + "delta_ref_ppl": -0.0693359375, + "entropy_loss": -0.11474609375, + "epoch": 0.6934, + "grad_norm": 1.2305855371990069, + "k1_kl": 0.0693359375, + "k3_kl": 0.04443359375, + "kimi_kl": 0.1064453125, + "learning_rate": 1.5329999999999998e-07, + "loss": 0.0018, + "ppl": 0.055419921875, + "reward": 0.9281664490699768, + "reward_std": 0.002142968587577343, + "rewards/perpo_ocr_edit_distance_reward": 0.9281665086746216, + "step": 3467, + "temperature": 0.9 + }, + { + "advantages": -8.262055780505762e-05, + "completion_length": 547.0, + "delta_ref_entropy_loss": 0.03515625, + "delta_ref_ppl": -0.0400390625, + "entropy_loss": -0.036865234375, + "epoch": 0.6936, + "grad_norm": 0.31367162280048005, + "k1_kl": 0.0400390625, + "k3_kl": 0.02587890625, + "kimi_kl": 0.0537109375, + "learning_rate": 1.532e-07, + "loss": 0.0011, + "ppl": 0.01116943359375, + "reward": 0.9937945008277893, + "reward_std": 0.000518463202752173, + "rewards/perpo_ocr_edit_distance_reward": 0.9937945604324341, + "step": 3468, + "temperature": 0.9 + }, + { + "advantages": -3.618853588704951e-05, + "completion_length": 913.0, + "delta_ref_entropy_loss": 0.061279296875, + "delta_ref_ppl": -0.07275390625, + "entropy_loss": -0.07275390625, + "epoch": 0.6938, + "grad_norm": 14.703759381422511, + "k1_kl": 0.07275390625, + "k3_kl": 0.045654296875, + "kimi_kl": 0.09912109375, + "learning_rate": 1.5310000000000001e-07, + "loss": 0.0019, + "ppl": 0.0311279296875, + "reward": 0.97005295753479, + "reward_std": 0.0008411880116909742, + "rewards/perpo_ocr_edit_distance_reward": 0.9700530171394348, + "step": 3469, + "temperature": 0.9 + }, + { + "advantages": -7.94487350503914e-05, + "completion_length": 632.0, + "delta_ref_entropy_loss": 0.032958984375, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.04541015625, + "epoch": 0.694, + "grad_norm": 0.4546388107620903, + "k1_kl": 0.06689453125, + "k3_kl": 0.044189453125, + "kimi_kl": 0.1279296875, + "learning_rate": 1.5299999999999998e-07, + "loss": 0.0018, + "ppl": 0.0179443359375, + "reward": 0.939574658870697, + "reward_std": 0.0010788076324388385, + "rewards/perpo_ocr_edit_distance_reward": 0.9395747184753418, + "step": 3470, + "temperature": 0.9 + }, + { + "advantages": -3.423009729885962e-06, + "completion_length": 536.0, + "delta_ref_entropy_loss": 0.0703125, + "delta_ref_ppl": -0.1162109375, + "entropy_loss": -0.267578125, + "epoch": 0.6942, + "grad_norm": 2.3010405603682353, + "k1_kl": 0.11669921875, + "k3_kl": 0.08203125, + "kimi_kl": 0.2255859375, + "learning_rate": 1.529e-07, + "loss": 0.0033, + "ppl": 0.12890625, + "reward": 0.8372589349746704, + "reward_std": 0.004863569978624582, + "rewards/perpo_ocr_edit_distance_reward": 0.8372589349746704, + "step": 3471, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 548.0, + "delta_ref_entropy_loss": 0.021728515625, + "delta_ref_ppl": -0.028076171875, + "entropy_loss": -0.032470703125, + "epoch": 0.6944, + "grad_norm": 0.46261075305401206, + "k1_kl": 0.0281982421875, + "k3_kl": 0.0233154296875, + "kimi_kl": 0.04296875, + "learning_rate": 1.528e-07, + "loss": 0.0009, + "ppl": 0.0107421875, + "reward": 0.9960243105888367, + "reward_std": 0.0007012057467363775, + "rewards/perpo_ocr_edit_distance_reward": 0.9960243701934814, + "step": 3472, + "temperature": 0.9 + }, + { + "advantages": -2.588544703030493e-06, + "completion_length": 426.0, + "delta_ref_entropy_loss": 0.041015625, + "delta_ref_ppl": -0.1162109375, + "entropy_loss": -0.1748046875, + "epoch": 0.6946, + "grad_norm": 1.2850288865941797, + "k1_kl": 0.1162109375, + "k3_kl": 0.08251953125, + "kimi_kl": 0.26171875, + "learning_rate": 1.5269999999999998e-07, + "loss": 0.0033, + "ppl": 0.07666015625, + "reward": 0.961449921131134, + "reward_std": 0.0031873539555817842, + "rewards/perpo_ocr_edit_distance_reward": 0.9614499807357788, + "step": 3473, + "temperature": 0.9 + }, + { + "advantages": -1.2125287867092993e-05, + "completion_length": 543.0, + "delta_ref_entropy_loss": 0.01214599609375, + "delta_ref_ppl": -0.07373046875, + "entropy_loss": -0.17578125, + "epoch": 0.6948, + "grad_norm": 1.2871866021049667, + "k1_kl": 0.07373046875, + "k3_kl": 0.055908203125, + "kimi_kl": 0.1865234375, + "learning_rate": 1.526e-07, + "loss": 0.0022, + "ppl": 0.07666015625, + "reward": 0.9880110621452332, + "reward_std": 0.0027093072421848774, + "rewards/perpo_ocr_edit_distance_reward": 0.9880111217498779, + "step": 3474, + "temperature": 0.9 + }, + { + "advantages": -6.093297997722402e-05, + "completion_length": 622.0, + "delta_ref_entropy_loss": 0.041015625, + "delta_ref_ppl": -0.068359375, + "entropy_loss": -0.06103515625, + "epoch": 0.695, + "grad_norm": 0.6394345920319385, + "k1_kl": 0.068359375, + "k3_kl": 0.040283203125, + "kimi_kl": 0.091796875, + "learning_rate": 1.525e-07, + "loss": 0.0017, + "ppl": 0.0244140625, + "reward": 0.9875919222831726, + "reward_std": 0.0010176629293709993, + "rewards/perpo_ocr_edit_distance_reward": 0.9875920414924622, + "step": 3475, + "temperature": 0.9 + }, + { + "advantages": -6.880079126858618e-06, + "completion_length": 1549.0, + "delta_ref_entropy_loss": -0.0023651123046875, + "delta_ref_ppl": -0.048828125, + "entropy_loss": -0.203125, + "epoch": 0.6952, + "grad_norm": 2.3067341805431374, + "k1_kl": 0.048828125, + "k3_kl": 0.034912109375, + "kimi_kl": 0.06396484375, + "learning_rate": 1.524e-07, + "loss": 0.0014, + "ppl": 0.09423828125, + "reward": 0.9368981719017029, + "reward_std": 0.011048806831240654, + "rewards/perpo_ocr_edit_distance_reward": 0.9368982911109924, + "step": 3476, + "temperature": 0.9 + }, + { + "advantages": -1.3402530385064892e-05, + "completion_length": 249.0, + "delta_ref_entropy_loss": 0.054931640625, + "delta_ref_ppl": -0.1494140625, + "entropy_loss": -0.07275390625, + "epoch": 0.6954, + "grad_norm": 1.3246757870901518, + "k1_kl": 0.1494140625, + "k3_kl": 0.10986328125, + "kimi_kl": 0.36328125, + "learning_rate": 1.5229999999999998e-07, + "loss": 0.0044, + "ppl": 0.0303955078125, + "reward": 0.9909923076629639, + "reward_std": 0.0024418688844889402, + "rewards/perpo_ocr_edit_distance_reward": 0.9909923076629639, + "step": 3477, + "temperature": 0.9 + }, + { + "advantages": -2.9563905627583154e-05, + "completion_length": 695.0, + "delta_ref_entropy_loss": 0.055908203125, + "delta_ref_ppl": -0.068359375, + "entropy_loss": -0.0830078125, + "epoch": 0.6956, + "grad_norm": 1.0455180634922725, + "k1_kl": 0.068359375, + "k3_kl": 0.04052734375, + "kimi_kl": 0.115234375, + "learning_rate": 1.522e-07, + "loss": 0.0017, + "ppl": 0.033447265625, + "reward": 0.9871857762336731, + "reward_std": 0.0027804458513855934, + "rewards/perpo_ocr_edit_distance_reward": 0.9871859550476074, + "step": 3478, + "temperature": 0.9 + }, + { + "advantages": -2.3952552510309033e-05, + "completion_length": 403.0, + "delta_ref_entropy_loss": 0.0262451171875, + "delta_ref_ppl": -0.06640625, + "entropy_loss": -0.0625, + "epoch": 0.6958, + "grad_norm": 0.46575114004668305, + "k1_kl": 0.06640625, + "k3_kl": 0.04443359375, + "kimi_kl": 0.15234375, + "learning_rate": 1.5210000000000002e-07, + "loss": 0.0018, + "ppl": 0.019775390625, + "reward": 0.878756582736969, + "reward_std": 0.0006111478433012962, + "rewards/perpo_ocr_edit_distance_reward": 0.8787567019462585, + "step": 3479, + "temperature": 0.9 + }, + { + "advantages": -2.8451107937144116e-05, + "completion_length": 135.0, + "delta_ref_entropy_loss": 0.028564453125, + "delta_ref_ppl": -0.1611328125, + "entropy_loss": -0.06103515625, + "epoch": 0.696, + "grad_norm": 1.1413656564398316, + "k1_kl": 0.1611328125, + "k3_kl": 0.125, + "kimi_kl": 0.45703125, + "learning_rate": 1.5199999999999998e-07, + "loss": 0.005, + "ppl": 0.0272216796875, + "reward": 0.9842051267623901, + "reward_std": 0.0025933694560080767, + "rewards/perpo_ocr_edit_distance_reward": 0.9842052459716797, + "step": 3480, + "temperature": 0.9 + }, + { + "advantages": -3.058569927816279e-05, + "completion_length": 703.0, + "delta_ref_entropy_loss": 0.049560546875, + "delta_ref_ppl": -0.08740234375, + "entropy_loss": -0.1572265625, + "epoch": 0.6962, + "grad_norm": 1.1376736623648693, + "k1_kl": 0.0869140625, + "k3_kl": 0.0556640625, + "kimi_kl": 0.142578125, + "learning_rate": 1.519e-07, + "loss": 0.0023, + "ppl": 0.064453125, + "reward": 0.9256200194358826, + "reward_std": 0.0029611929785460234, + "rewards/perpo_ocr_edit_distance_reward": 0.9256201386451721, + "step": 3481, + "temperature": 0.9 + }, + { + "advantages": -3.549030952854082e-05, + "completion_length": 1268.0, + "delta_ref_entropy_loss": 0.02490234375, + "delta_ref_ppl": -0.046875, + "entropy_loss": -0.033447265625, + "epoch": 0.6964, + "grad_norm": 0.33912112702915315, + "k1_kl": 0.046875, + "k3_kl": 0.02685546875, + "kimi_kl": 0.080078125, + "learning_rate": 1.518e-07, + "loss": 0.0011, + "ppl": 0.0087890625, + "reward": 0.9603711366653442, + "reward_std": 0.0011002449318766594, + "rewards/perpo_ocr_edit_distance_reward": 0.960371196269989, + "step": 3482, + "temperature": 0.9 + }, + { + "advantages": -4.447358151082881e-05, + "completion_length": 178.0, + "delta_ref_entropy_loss": 0.053955078125, + "delta_ref_ppl": -0.1337890625, + "entropy_loss": -0.0771484375, + "epoch": 0.6966, + "grad_norm": 0.9631018925065564, + "k1_kl": 0.1337890625, + "k3_kl": 0.1044921875, + "kimi_kl": 0.2236328125, + "learning_rate": 1.517e-07, + "loss": 0.0042, + "ppl": 0.0211181640625, + "reward": 0.9814686179161072, + "reward_std": 0.0014312817947939038, + "rewards/perpo_ocr_edit_distance_reward": 0.9814687371253967, + "step": 3483, + "temperature": 0.9 + }, + { + "advantages": 1.7029899268550253e-08, + "completion_length": 66.0, + "delta_ref_entropy_loss": -0.26171875, + "delta_ref_ppl": -0.12890625, + "entropy_loss": -0.5703125, + "epoch": 0.6968, + "grad_norm": 6.467918479208474, + "k1_kl": 0.1279296875, + "k3_kl": 0.1689453125, + "kimi_kl": 0.298828125, + "learning_rate": 1.516e-07, + "loss": 0.0067, + "ppl": 0.2138671875, + "reward": 0.37023675441741943, + "reward_std": 0.06916555017232895, + "rewards/perpo_ocr_edit_distance_reward": 0.3702367842197418, + "step": 3484, + "temperature": 0.9 + }, + { + "advantages": -2.7852400307892822e-05, + "completion_length": 319.0, + "delta_ref_entropy_loss": 0.07568359375, + "delta_ref_ppl": -0.10693359375, + "entropy_loss": -0.076171875, + "epoch": 0.697, + "grad_norm": 1.4117167764296719, + "k1_kl": 0.10693359375, + "k3_kl": 0.07177734375, + "kimi_kl": 0.234375, + "learning_rate": 1.515e-07, + "loss": 0.0029, + "ppl": 0.02978515625, + "reward": 0.9945363998413086, + "reward_std": 0.0017344561638310552, + "rewards/perpo_ocr_edit_distance_reward": 0.9945364594459534, + "step": 3485, + "temperature": 0.9 + }, + { + "advantages": -1.355580025119707e-05, + "completion_length": 105.0, + "delta_ref_entropy_loss": 0.03564453125, + "delta_ref_ppl": -0.1318359375, + "entropy_loss": -0.06201171875, + "epoch": 0.6972, + "grad_norm": 0.7239661590024438, + "k1_kl": 0.1318359375, + "k3_kl": 0.12158203125, + "kimi_kl": 0.337890625, + "learning_rate": 1.514e-07, + "loss": 0.0049, + "ppl": 0.0224609375, + "reward": 0.9835312366485596, + "reward_std": 0.002412142464891076, + "rewards/perpo_ocr_edit_distance_reward": 0.9835312366485596, + "step": 3486, + "temperature": 0.9 + }, + { + "advantages": 0.00014102458953857422, + "completion_length": 709.0, + "delta_ref_entropy_loss": 0.044189453125, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.10693359375, + "epoch": 0.6974, + "grad_norm": 0.9199029697162913, + "k1_kl": 0.06591796875, + "k3_kl": 0.03857421875, + "kimi_kl": 0.103515625, + "learning_rate": 1.5129999999999999e-07, + "loss": 0.0014, + "ppl": 0.0419921875, + "reward": 0.9608935117721558, + "reward_std": 0.0003226737608201802, + "rewards/perpo_ocr_edit_distance_reward": 0.9608935117721558, + "step": 3487, + "temperature": 0.9 + }, + { + "advantages": -7.980210648383945e-05, + "completion_length": 557.0, + "delta_ref_entropy_loss": 0.040283203125, + "delta_ref_ppl": -0.060546875, + "entropy_loss": -0.050048828125, + "epoch": 0.6976, + "grad_norm": 0.2792573383780924, + "k1_kl": 0.060546875, + "k3_kl": 0.034912109375, + "kimi_kl": 0.103515625, + "learning_rate": 1.512e-07, + "loss": 0.0015, + "ppl": 0.01275634765625, + "reward": 0.9979079961776733, + "reward_std": 0.0005401569069363177, + "rewards/perpo_ocr_edit_distance_reward": 0.9979079961776733, + "step": 3488, + "temperature": 0.9 + }, + { + "advantages": -3.7465779314516112e-06, + "completion_length": 27.0, + "delta_ref_entropy_loss": -0.0693359375, + "delta_ref_ppl": -1.0546875, + "entropy_loss": -0.248046875, + "epoch": 0.6978, + "grad_norm": 7.71633933091303, + "k1_kl": 1.0546875, + "k3_kl": 0.94140625, + "kimi_kl": 7.40625, + "learning_rate": 1.511e-07, + "loss": 0.0378, + "ppl": 0.10498046875, + "reward": 0.9390048384666443, + "reward_std": 0.018183670938014984, + "rewards/perpo_ocr_edit_distance_reward": 0.9390048980712891, + "step": 3489, + "temperature": 0.9 + }, + { + "advantages": -3.440039563429309e-06, + "completion_length": 677.0, + "delta_ref_entropy_loss": 0.08447265625, + "delta_ref_ppl": -0.10693359375, + "entropy_loss": -0.1591796875, + "epoch": 0.698, + "grad_norm": 1.671267332046192, + "k1_kl": 0.10693359375, + "k3_kl": 0.0703125, + "kimi_kl": 0.1669921875, + "learning_rate": 1.51e-07, + "loss": 0.0028, + "ppl": 0.08154296875, + "reward": 0.8449753522872925, + "reward_std": 0.0023639975115656853, + "rewards/perpo_ocr_edit_distance_reward": 0.8449754118919373, + "step": 3490, + "temperature": 0.9 + }, + { + "advantages": -1.4645713690697448e-06, + "completion_length": 1567.0, + "delta_ref_entropy_loss": 0.03759765625, + "delta_ref_ppl": -0.068359375, + "entropy_loss": -0.1181640625, + "epoch": 0.6982, + "grad_norm": 2.461513056310304, + "k1_kl": 0.068359375, + "k3_kl": 0.048583984375, + "kimi_kl": 0.142578125, + "learning_rate": 1.509e-07, + "loss": 0.0019, + "ppl": 0.058837890625, + "reward": 0.9334625601768494, + "reward_std": 0.05248190462589264, + "rewards/perpo_ocr_edit_distance_reward": 0.9334626197814941, + "step": 3491, + "temperature": 0.9 + }, + { + "advantages": -8.58562343637459e-05, + "completion_length": 412.0, + "delta_ref_entropy_loss": 0.052734375, + "delta_ref_ppl": -0.052490234375, + "entropy_loss": -0.026611328125, + "epoch": 0.6984, + "grad_norm": 0.2999107942324559, + "k1_kl": 0.052490234375, + "k3_kl": 0.0299072265625, + "kimi_kl": 0.1064453125, + "learning_rate": 1.5079999999999997e-07, + "loss": 0.0013, + "ppl": 0.0057373046875, + "reward": 0.9988032579421997, + "reward_std": 0.0003957795852329582, + "rewards/perpo_ocr_edit_distance_reward": 0.9988033175468445, + "step": 3492, + "temperature": 0.9 + }, + { + "advantages": -1.5803747373865917e-05, + "completion_length": 1123.0, + "delta_ref_entropy_loss": 0.035888671875, + "delta_ref_ppl": -0.044921875, + "entropy_loss": -0.072265625, + "epoch": 0.6986, + "grad_norm": 1.3633090233678593, + "k1_kl": 0.044921875, + "k3_kl": 0.0283203125, + "kimi_kl": 0.0634765625, + "learning_rate": 1.507e-07, + "loss": 0.0012, + "ppl": 0.0294189453125, + "reward": 0.9921250343322754, + "reward_std": 0.0009773449273779988, + "rewards/perpo_ocr_edit_distance_reward": 0.9921251535415649, + "step": 3493, + "temperature": 0.9 + }, + { + "advantages": -1.0796956303238403e-05, + "completion_length": 591.0, + "delta_ref_entropy_loss": 0.028076171875, + "delta_ref_ppl": -0.060791015625, + "entropy_loss": -0.03369140625, + "epoch": 0.6988, + "grad_norm": 0.4386151232300383, + "k1_kl": 0.060791015625, + "k3_kl": 0.04296875, + "kimi_kl": 0.1494140625, + "learning_rate": 1.506e-07, + "loss": 0.0017, + "ppl": 0.01373291015625, + "reward": 0.9978640675544739, + "reward_std": 0.002265119692310691, + "rewards/perpo_ocr_edit_distance_reward": 0.9978640675544739, + "step": 3494, + "temperature": 0.9 + }, + { + "advantages": -7.988725701579824e-05, + "completion_length": 576.0, + "delta_ref_entropy_loss": 0.039794921875, + "delta_ref_ppl": -0.060791015625, + "entropy_loss": -0.035400390625, + "epoch": 0.699, + "grad_norm": 0.3427579395616828, + "k1_kl": 0.060791015625, + "k3_kl": 0.036376953125, + "kimi_kl": 0.10498046875, + "learning_rate": 1.5049999999999998e-07, + "loss": 0.0015, + "ppl": 0.00848388671875, + "reward": 0.9873589873313904, + "reward_std": 0.00032625370658934116, + "rewards/perpo_ocr_edit_distance_reward": 0.9873591065406799, + "step": 3495, + "temperature": 0.9 + }, + { + "advantages": -0.00022940125199966133, + "completion_length": 984.0, + "delta_ref_entropy_loss": 0.0281982421875, + "delta_ref_ppl": -0.029052734375, + "entropy_loss": -0.041748046875, + "epoch": 0.6992, + "grad_norm": 4.165524233315343, + "k1_kl": 0.029052734375, + "k3_kl": 0.0262451171875, + "kimi_kl": 0.0478515625, + "learning_rate": 1.504e-07, + "loss": 0.0013, + "ppl": 0.0198974609375, + "reward": 0.9991875290870667, + "reward_std": 0.00030831238836981356, + "rewards/perpo_ocr_edit_distance_reward": 0.9991876482963562, + "step": 3496, + "temperature": 0.9 + }, + { + "advantages": -5.647966099786572e-05, + "completion_length": 995.0, + "delta_ref_entropy_loss": 0.017822265625, + "delta_ref_ppl": -0.0439453125, + "entropy_loss": -0.052978515625, + "epoch": 0.6994, + "grad_norm": 0.8953672648472462, + "k1_kl": 0.0439453125, + "k3_kl": 0.02783203125, + "kimi_kl": 0.0673828125, + "learning_rate": 1.503e-07, + "loss": 0.0012, + "ppl": 0.02978515625, + "reward": 0.9970828890800476, + "reward_std": 0.0008046205039136112, + "rewards/perpo_ocr_edit_distance_reward": 0.9970829486846924, + "step": 3497, + "temperature": 0.9 + }, + { + "advantages": -9.959936869563535e-05, + "completion_length": 407.0, + "delta_ref_entropy_loss": 0.055419921875, + "delta_ref_ppl": -0.07177734375, + "entropy_loss": -0.03564453125, + "epoch": 0.6996, + "grad_norm": 0.21744586974400296, + "k1_kl": 0.07177734375, + "k3_kl": 0.04345703125, + "kimi_kl": 0.12890625, + "learning_rate": 1.5019999999999998e-07, + "loss": 0.0018, + "ppl": 0.00823974609375, + "reward": 0.9983680844306946, + "reward_std": 0.00024192385899368674, + "rewards/perpo_ocr_edit_distance_reward": 0.9983680844306946, + "step": 3498, + "temperature": 0.9 + }, + { + "advantages": -5.551747108256677e-06, + "completion_length": 1202.0, + "delta_ref_entropy_loss": 0.0089111328125, + "delta_ref_ppl": -0.029296875, + "entropy_loss": -0.038818359375, + "epoch": 0.6998, + "grad_norm": 0.7628237265417014, + "k1_kl": 0.029296875, + "k3_kl": 0.020751953125, + "kimi_kl": 0.06298828125, + "learning_rate": 1.501e-07, + "loss": 0.0008, + "ppl": 0.0145263671875, + "reward": 0.9902692437171936, + "reward_std": 0.00601578364148736, + "rewards/perpo_ocr_edit_distance_reward": 0.9902692437171936, + "step": 3499, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 1019.0, + "delta_ref_entropy_loss": -0.2060546875, + "delta_ref_ppl": -0.035888671875, + "entropy_loss": -0.4765625, + "epoch": 0.7, + "grad_norm": 5.488089754113947, + "k1_kl": 0.03564453125, + "k3_kl": 0.060302734375, + "kimi_kl": 0.1259765625, + "learning_rate": 1.5e-07, + "loss": 0.0024, + "ppl": 0.2216796875, + "reward": 0.40691789984703064, + "reward_std": 0.04048814997076988, + "rewards/perpo_ocr_edit_distance_reward": 0.406917929649353, + "step": 3500, + "temperature": 0.9 + }, + { + "advantages": -3.199705315637402e-05, + "completion_length": 248.0, + "delta_ref_entropy_loss": 0.050537109375, + "delta_ref_ppl": -0.169921875, + "entropy_loss": -0.07666015625, + "epoch": 0.7002, + "grad_norm": 1.7826251606674806, + "k1_kl": 0.169921875, + "k3_kl": 0.1318359375, + "kimi_kl": 0.5625, + "learning_rate": 1.4989999999999999e-07, + "loss": 0.0053, + "ppl": 0.03466796875, + "reward": 0.9855782389640808, + "reward_std": 0.0022955245804041624, + "rewards/perpo_ocr_edit_distance_reward": 0.9855782985687256, + "step": 3501, + "temperature": 0.9 + }, + { + "advantages": -1.6467913155793212e-05, + "completion_length": 418.0, + "delta_ref_entropy_loss": 0.054443359375, + "delta_ref_ppl": -0.125, + "entropy_loss": -0.07861328125, + "epoch": 0.7004, + "grad_norm": 0.9685099788316364, + "k1_kl": 0.125, + "k3_kl": 0.09033203125, + "kimi_kl": 0.34765625, + "learning_rate": 1.4979999999999998e-07, + "loss": 0.0036, + "ppl": 0.041259765625, + "reward": 0.9942607879638672, + "reward_std": 0.0009328132728114724, + "rewards/perpo_ocr_edit_distance_reward": 0.9942609071731567, + "step": 3502, + "temperature": 0.9 + }, + { + "advantages": -6.760869837307837e-06, + "completion_length": 752.0, + "delta_ref_entropy_loss": 0.07666015625, + "delta_ref_ppl": -0.11962890625, + "entropy_loss": -0.23828125, + "epoch": 0.7006, + "grad_norm": 1.7415471541240612, + "k1_kl": 0.119140625, + "k3_kl": 0.07763671875, + "kimi_kl": 0.1689453125, + "learning_rate": 1.497e-07, + "loss": 0.0031, + "ppl": 0.11279296875, + "reward": 0.7860608696937561, + "reward_std": 0.007442456670105457, + "rewards/perpo_ocr_edit_distance_reward": 0.7860609292984009, + "step": 3503, + "temperature": 0.9 + }, + { + "advantages": -0.00019727434846572578, + "completion_length": 524.0, + "delta_ref_entropy_loss": 0.064453125, + "delta_ref_ppl": -0.08984375, + "entropy_loss": -0.035400390625, + "epoch": 0.7008, + "grad_norm": 1.2345612579019383, + "k1_kl": 0.08984375, + "k3_kl": 0.057373046875, + "kimi_kl": 0.1962890625, + "learning_rate": 1.4960000000000002e-07, + "loss": 0.0025, + "ppl": 0.0096435546875, + "reward": 0.9979286789894104, + "reward_std": 0.0004179270763415843, + "rewards/perpo_ocr_edit_distance_reward": 0.9979287981987, + "step": 3504, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 1151.0, + "delta_ref_entropy_loss": 0.016845703125, + "delta_ref_ppl": -0.035888671875, + "entropy_loss": -0.043701171875, + "epoch": 0.701, + "grad_norm": 0.2681623012817487, + "k1_kl": 0.035888671875, + "k3_kl": 0.0234375, + "kimi_kl": 0.0546875, + "learning_rate": 1.4949999999999998e-07, + "loss": 0.0009, + "ppl": 0.01544189453125, + "reward": 0.995190441608429, + "reward_std": 0.0002729018160607666, + "rewards/perpo_ocr_edit_distance_reward": 0.995190441608429, + "step": 3505, + "temperature": 0.9 + }, + { + "advantages": -0.00012929951481055468, + "completion_length": 920.0, + "delta_ref_entropy_loss": 0.018310546875, + "delta_ref_ppl": -0.0380859375, + "entropy_loss": -0.03515625, + "epoch": 0.7012, + "grad_norm": 0.18660465472602972, + "k1_kl": 0.037841796875, + "k3_kl": 0.02392578125, + "kimi_kl": 0.06494140625, + "learning_rate": 1.494e-07, + "loss": 0.0011, + "ppl": 0.01007080078125, + "reward": 0.9985291361808777, + "reward_std": 0.00022929746774025261, + "rewards/perpo_ocr_edit_distance_reward": 0.9985291957855225, + "step": 3506, + "temperature": 0.9 + }, + { + "advantages": -1.4134816410660278e-05, + "completion_length": 854.0, + "delta_ref_entropy_loss": 0.028076171875, + "delta_ref_ppl": -0.051513671875, + "entropy_loss": -0.0380859375, + "epoch": 0.7014, + "grad_norm": 0.40974060095291337, + "k1_kl": 0.0517578125, + "k3_kl": 0.031982421875, + "kimi_kl": 0.1083984375, + "learning_rate": 1.493e-07, + "loss": 0.0013, + "ppl": 0.01300048828125, + "reward": 0.9730472564697266, + "reward_std": 0.0005021296674385667, + "rewards/perpo_ocr_edit_distance_reward": 0.9730473160743713, + "step": 3507, + "temperature": 0.9 + }, + { + "advantages": -0.00011331694986438379, + "completion_length": 827.0, + "delta_ref_entropy_loss": 0.040283203125, + "delta_ref_ppl": -0.0634765625, + "entropy_loss": -0.04296875, + "epoch": 0.7016, + "grad_norm": 0.22477353447465595, + "k1_kl": 0.0634765625, + "k3_kl": 0.038818359375, + "kimi_kl": 0.09912109375, + "learning_rate": 1.4919999999999999e-07, + "loss": 0.0017, + "ppl": 0.0140380859375, + "reward": 0.9970229864120483, + "reward_std": 0.00035083855618722737, + "rewards/perpo_ocr_edit_distance_reward": 0.9970231056213379, + "step": 3508, + "temperature": 0.9 + }, + { + "advantages": 4.495893335842993e-06, + "completion_length": 916.0, + "delta_ref_entropy_loss": 0.06689453125, + "delta_ref_ppl": -0.09521484375, + "entropy_loss": -0.34375, + "epoch": 0.7018, + "grad_norm": 1.8773299914457746, + "k1_kl": 0.0947265625, + "k3_kl": 0.05615234375, + "kimi_kl": 0.09765625, + "learning_rate": 1.491e-07, + "loss": 0.0022, + "ppl": 0.171875, + "reward": 0.7708497047424316, + "reward_std": 0.005570414010435343, + "rewards/perpo_ocr_edit_distance_reward": 0.7708496451377869, + "step": 3509, + "temperature": 0.9 + }, + { + "advantages": -0.0005960464477539062, + "completion_length": 418.0, + "delta_ref_entropy_loss": 0.023193359375, + "delta_ref_ppl": -0.028076171875, + "entropy_loss": -0.0252685546875, + "epoch": 0.702, + "grad_norm": 0.019381821454264606, + "k1_kl": 0.028076171875, + "k3_kl": 0.0166015625, + "kimi_kl": 0.042236328125, + "learning_rate": 1.49e-07, + "loss": 0.0013, + "ppl": 0.00640869140625, + "reward": 0.9994107484817505, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9994108080863953, + "step": 3510, + "temperature": 0.9 + }, + { + "advantages": -2.9666083719348535e-05, + "completion_length": 432.0, + "delta_ref_entropy_loss": 0.04541015625, + "delta_ref_ppl": -0.0791015625, + "entropy_loss": -0.0751953125, + "epoch": 0.7022, + "grad_norm": 0.7854439906846712, + "k1_kl": 0.0791015625, + "k3_kl": 0.052490234375, + "kimi_kl": 0.1455078125, + "learning_rate": 1.489e-07, + "loss": 0.0021, + "ppl": 0.033447265625, + "reward": 0.9855824708938599, + "reward_std": 0.0016225929139181972, + "rewards/perpo_ocr_edit_distance_reward": 0.9855824708938599, + "step": 3511, + "temperature": 0.9 + }, + { + "advantages": -5.028929081163369e-05, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.0027923583984375, + "delta_ref_ppl": -0.017333984375, + "entropy_loss": -0.08935546875, + "epoch": 0.7024, + "grad_norm": 3.7129318964072726, + "k1_kl": 0.0172119140625, + "k3_kl": 0.037353515625, + "kimi_kl": 0.035400390625, + "learning_rate": 1.4879999999999998e-07, + "loss": 0.0015, + "ppl": 0.0517578125, + "reward": 0.8301171064376831, + "reward_std": 0.002101696329191327, + "rewards/perpo_ocr_edit_distance_reward": 0.8301172852516174, + "step": 3512, + "temperature": 0.9 + }, + { + "advantages": -0.00014267649385146797, + "completion_length": 389.0, + "delta_ref_entropy_loss": 0.0439453125, + "delta_ref_ppl": -0.051513671875, + "entropy_loss": -0.037353515625, + "epoch": 0.7026, + "grad_norm": 0.2078824736236175, + "k1_kl": 0.051513671875, + "k3_kl": 0.03125, + "kimi_kl": 0.099609375, + "learning_rate": 1.487e-07, + "loss": 0.0014, + "ppl": 0.00994873046875, + "reward": 0.9956183433532715, + "reward_std": 0.0002580330765340477, + "rewards/perpo_ocr_edit_distance_reward": 0.9956184029579163, + "step": 3513, + "temperature": 0.9 + }, + { + "advantages": -2.602168569865171e-05, + "completion_length": 818.0, + "delta_ref_entropy_loss": 0.050537109375, + "delta_ref_ppl": -0.0751953125, + "entropy_loss": -0.080078125, + "epoch": 0.7028, + "grad_norm": 1.1951836629954204, + "k1_kl": 0.0751953125, + "k3_kl": 0.0439453125, + "kimi_kl": 0.11865234375, + "learning_rate": 1.486e-07, + "loss": 0.0018, + "ppl": 0.03369140625, + "reward": 0.9916614294052124, + "reward_std": 0.0012093273689970374, + "rewards/perpo_ocr_edit_distance_reward": 0.991661548614502, + "step": 3514, + "temperature": 0.9 + }, + { + "advantages": -1.3938972188043408e-05, + "completion_length": 723.0, + "delta_ref_entropy_loss": 0.031494140625, + "delta_ref_ppl": -0.040771484375, + "entropy_loss": -0.0380859375, + "epoch": 0.703, + "grad_norm": 0.49264406150030193, + "k1_kl": 0.040771484375, + "k3_kl": 0.0240478515625, + "kimi_kl": 0.05859375, + "learning_rate": 1.4849999999999999e-07, + "loss": 0.001, + "ppl": 0.0133056640625, + "reward": 0.9956225156784058, + "reward_std": 0.001120803295634687, + "rewards/perpo_ocr_edit_distance_reward": 0.9956225156784058, + "step": 3515, + "temperature": 0.9 + }, + { + "advantages": -1.0524478057050146e-05, + "completion_length": 1250.0, + "delta_ref_entropy_loss": 0.037841796875, + "delta_ref_ppl": -0.07568359375, + "entropy_loss": -0.107421875, + "epoch": 0.7032, + "grad_norm": 1.5406010187673809, + "k1_kl": 0.076171875, + "k3_kl": 0.05419921875, + "kimi_kl": 0.1279296875, + "learning_rate": 1.484e-07, + "loss": 0.0022, + "ppl": 0.051025390625, + "reward": 0.9519656300544739, + "reward_std": 0.002325961831957102, + "rewards/perpo_ocr_edit_distance_reward": 0.9519656300544739, + "step": 3516, + "temperature": 0.9 + }, + { + "advantages": 4.83649137095199e-06, + "completion_length": 1016.0, + "delta_ref_entropy_loss": 0.043212890625, + "delta_ref_ppl": -0.07666015625, + "entropy_loss": -0.169921875, + "epoch": 0.7034, + "grad_norm": 3.500538657085914, + "k1_kl": 0.076171875, + "k3_kl": 0.043212890625, + "kimi_kl": 0.09228515625, + "learning_rate": 1.4829999999999997e-07, + "loss": 0.0017, + "ppl": 0.0791015625, + "reward": 0.7883867621421814, + "reward_std": 0.005180486477911472, + "rewards/perpo_ocr_edit_distance_reward": 0.7883867621421814, + "step": 3517, + "temperature": 0.9 + }, + { + "advantages": -1.3623919414840202e-07, + "completion_length": 707.0, + "delta_ref_entropy_loss": -0.000629425048828125, + "delta_ref_ppl": -0.0517578125, + "entropy_loss": -0.09375, + "epoch": 0.7036, + "grad_norm": 1.6601464336579228, + "k1_kl": 0.052001953125, + "k3_kl": 0.047607421875, + "kimi_kl": 0.1142578125, + "learning_rate": 1.482e-07, + "loss": 0.0019, + "ppl": 0.0322265625, + "reward": 0.8909399509429932, + "reward_std": 0.11022107303142548, + "rewards/perpo_ocr_edit_distance_reward": 0.8909400701522827, + "step": 3518, + "temperature": 0.9 + }, + { + "advantages": -0.00016496438183821738, + "completion_length": 561.0, + "delta_ref_entropy_loss": 0.060546875, + "delta_ref_ppl": -0.055908203125, + "entropy_loss": -0.05419921875, + "epoch": 0.7038, + "grad_norm": 0.3616318143047236, + "k1_kl": 0.055908203125, + "k3_kl": 0.03369140625, + "kimi_kl": 0.091796875, + "learning_rate": 1.481e-07, + "loss": 0.0015, + "ppl": 0.019775390625, + "reward": 0.9819063544273376, + "reward_std": 0.00031289560138247907, + "rewards/perpo_ocr_edit_distance_reward": 0.9819064140319824, + "step": 3519, + "temperature": 0.9 + }, + { + "advantages": -2.0333700376795605e-05, + "completion_length": 921.0, + "delta_ref_entropy_loss": 0.07568359375, + "delta_ref_ppl": -0.08447265625, + "entropy_loss": -0.08154296875, + "epoch": 0.704, + "grad_norm": 1.2398136015837946, + "k1_kl": 0.08447265625, + "k3_kl": 0.04541015625, + "kimi_kl": 0.087890625, + "learning_rate": 1.4799999999999998e-07, + "loss": 0.0018, + "ppl": 0.034912109375, + "reward": 0.9536051750183105, + "reward_std": 0.00073719781357795, + "rewards/perpo_ocr_edit_distance_reward": 0.9536052346229553, + "step": 3520, + "temperature": 0.9 + }, + { + "advantages": -6.931169082236011e-06, + "completion_length": 658.0, + "delta_ref_entropy_loss": 0.0240478515625, + "delta_ref_ppl": -0.11767578125, + "entropy_loss": -0.353515625, + "epoch": 0.7042, + "grad_norm": 1.997113963943459, + "k1_kl": 0.11767578125, + "k3_kl": 0.087890625, + "kimi_kl": 0.193359375, + "learning_rate": 1.479e-07, + "loss": 0.0035, + "ppl": 0.1689453125, + "reward": 0.7329906225204468, + "reward_std": 0.006036542356014252, + "rewards/perpo_ocr_edit_distance_reward": 0.7329906821250916, + "step": 3521, + "temperature": 0.9 + }, + { + "advantages": -7.008654938545078e-05, + "completion_length": 567.0, + "delta_ref_entropy_loss": 0.03173828125, + "delta_ref_ppl": -0.0625, + "entropy_loss": -0.062255859375, + "epoch": 0.7044, + "grad_norm": 0.3081093272480977, + "k1_kl": 0.06298828125, + "k3_kl": 0.037109375, + "kimi_kl": 0.0947265625, + "learning_rate": 1.4779999999999999e-07, + "loss": 0.0016, + "ppl": 0.0220947265625, + "reward": 0.9377260804176331, + "reward_std": 0.00038604813744314015, + "rewards/perpo_ocr_edit_distance_reward": 0.9377260804176331, + "step": 3522, + "temperature": 0.9 + }, + { + "advantages": -4.6142511564539745e-05, + "completion_length": 1154.0, + "delta_ref_entropy_loss": 0.0498046875, + "delta_ref_ppl": -0.06982421875, + "entropy_loss": -0.1171875, + "epoch": 0.7046, + "grad_norm": 1.3523770435640534, + "k1_kl": 0.06982421875, + "k3_kl": 0.041748046875, + "kimi_kl": 0.1005859375, + "learning_rate": 1.4769999999999998e-07, + "loss": 0.0017, + "ppl": 0.05224609375, + "reward": 0.8557513952255249, + "reward_std": 0.0010078820632770658, + "rewards/perpo_ocr_edit_distance_reward": 0.8557514548301697, + "step": 3523, + "temperature": 0.9 + }, + { + "advantages": -5.782502194051631e-05, + "completion_length": 527.0, + "delta_ref_entropy_loss": 0.02978515625, + "delta_ref_ppl": -0.047119140625, + "entropy_loss": -0.02587890625, + "epoch": 0.7048, + "grad_norm": 0.5041204145075756, + "k1_kl": 0.047119140625, + "k3_kl": 0.030029296875, + "kimi_kl": 0.0849609375, + "learning_rate": 1.476e-07, + "loss": 0.0013, + "ppl": 0.007598876953125, + "reward": 0.9977161288261414, + "reward_std": 0.0006360603729262948, + "rewards/perpo_ocr_edit_distance_reward": 0.9977162480354309, + "step": 3524, + "temperature": 0.9 + }, + { + "advantages": -0.0001661266724113375, + "completion_length": 441.0, + "delta_ref_entropy_loss": 0.03271484375, + "delta_ref_ppl": -0.02490234375, + "entropy_loss": -0.031494140625, + "epoch": 0.705, + "grad_norm": 0.3961947361283883, + "k1_kl": 0.02490234375, + "k3_kl": 0.0108642578125, + "kimi_kl": 0.0189208984375, + "learning_rate": 1.475e-07, + "loss": 0.0006, + "ppl": 0.00848388671875, + "reward": 0.9986023902893066, + "reward_std": 0.00020753630087710917, + "rewards/perpo_ocr_edit_distance_reward": 0.9986024498939514, + "step": 3525, + "temperature": 0.9 + }, + { + "advantages": -5.415507985162549e-06, + "completion_length": 399.0, + "delta_ref_entropy_loss": 0.03466796875, + "delta_ref_ppl": -0.08447265625, + "entropy_loss": -0.0712890625, + "epoch": 0.7052, + "grad_norm": 0.8684092486326058, + "k1_kl": 0.08447265625, + "k3_kl": 0.06591796875, + "kimi_kl": 0.232421875, + "learning_rate": 1.474e-07, + "loss": 0.0026, + "ppl": 0.0296630859375, + "reward": 0.9776322841644287, + "reward_std": 0.0014769185800105333, + "rewards/perpo_ocr_edit_distance_reward": 0.9776323437690735, + "step": 3526, + "temperature": 0.9 + }, + { + "advantages": -1.2176377822470386e-05, + "completion_length": 1232.0, + "delta_ref_entropy_loss": 0.017578125, + "delta_ref_ppl": -0.0308837890625, + "entropy_loss": -0.032958984375, + "epoch": 0.7054, + "grad_norm": 1.0507840295327566, + "k1_kl": 0.03076171875, + "k3_kl": 0.0184326171875, + "kimi_kl": 0.041748046875, + "learning_rate": 1.4729999999999998e-07, + "loss": 0.0007, + "ppl": 0.01025390625, + "reward": 0.9899147748947144, + "reward_std": 0.0033889575861394405, + "rewards/perpo_ocr_edit_distance_reward": 0.9899148941040039, + "step": 3527, + "temperature": 0.9 + }, + { + "advantages": -2.106598549289629e-05, + "completion_length": 568.0, + "delta_ref_entropy_loss": 0.03515625, + "delta_ref_ppl": -0.08544921875, + "entropy_loss": -0.09521484375, + "epoch": 0.7056, + "grad_norm": 0.7881062627958851, + "k1_kl": 0.08544921875, + "k3_kl": 0.059326171875, + "kimi_kl": 0.21484375, + "learning_rate": 1.472e-07, + "loss": 0.0024, + "ppl": 0.035888671875, + "reward": 0.9797070622444153, + "reward_std": 0.001111126970499754, + "rewards/perpo_ocr_edit_distance_reward": 0.9797070622444153, + "step": 3528, + "temperature": 0.9 + }, + { + "advantages": -1.2516975402832031e-06, + "completion_length": 109.0, + "delta_ref_entropy_loss": 0.030029296875, + "delta_ref_ppl": -0.2451171875, + "entropy_loss": -0.1494140625, + "epoch": 0.7058, + "grad_norm": 3.9592611813815024, + "k1_kl": 0.2451171875, + "k3_kl": 0.189453125, + "kimi_kl": 0.9296875, + "learning_rate": 1.4710000000000001e-07, + "loss": 0.0076, + "ppl": 0.08056640625, + "reward": 0.9467486143112183, + "reward_std": 0.00668263528496027, + "rewards/perpo_ocr_edit_distance_reward": 0.9467486143112183, + "step": 3529, + "temperature": 0.9 + }, + { + "advantages": -4.5980726781635894e-07, + "completion_length": 666.0, + "delta_ref_entropy_loss": -0.345703125, + "delta_ref_ppl": -0.057861328125, + "entropy_loss": -0.85546875, + "epoch": 0.706, + "grad_norm": 4.508782753642746, + "k1_kl": 0.05859375, + "k3_kl": 0.0966796875, + "kimi_kl": 0.177734375, + "learning_rate": 1.4699999999999998e-07, + "loss": 0.0039, + "ppl": 0.486328125, + "reward": 0.38386526703834534, + "reward_std": 0.0740952342748642, + "rewards/perpo_ocr_edit_distance_reward": 0.3838652968406677, + "step": 3530, + "temperature": 0.9 + }, + { + "advantages": -8.889607670425903e-06, + "completion_length": 387.0, + "delta_ref_entropy_loss": 0.0220947265625, + "delta_ref_ppl": -0.05126953125, + "entropy_loss": -0.040283203125, + "epoch": 0.7062, + "grad_norm": 0.7732056097675984, + "k1_kl": 0.05126953125, + "k3_kl": 0.03173828125, + "kimi_kl": 0.08056640625, + "learning_rate": 1.469e-07, + "loss": 0.0013, + "ppl": 0.0130615234375, + "reward": 0.9887282252311707, + "reward_std": 0.0027758849319070578, + "rewards/perpo_ocr_edit_distance_reward": 0.9887282848358154, + "step": 3531, + "temperature": 0.9 + }, + { + "advantages": -2.7077539925812744e-06, + "completion_length": 71.0, + "delta_ref_entropy_loss": 0.009765625, + "delta_ref_ppl": -0.4765625, + "entropy_loss": -0.140625, + "epoch": 0.7064, + "grad_norm": 3.193523861100184, + "k1_kl": 0.4765625, + "k3_kl": 0.39453125, + "kimi_kl": 1.765625, + "learning_rate": 1.4680000000000002e-07, + "loss": 0.0158, + "ppl": 0.06494140625, + "reward": 0.8378862738609314, + "reward_std": 0.006194503977894783, + "rewards/perpo_ocr_edit_distance_reward": 0.8378862738609314, + "step": 3532, + "temperature": 0.9 + }, + { + "advantages": 1.7029899268550253e-08, + "completion_length": 128.0, + "delta_ref_entropy_loss": 0.055419921875, + "delta_ref_ppl": -0.38671875, + "entropy_loss": -0.189453125, + "epoch": 0.7066, + "grad_norm": 2.429002705624467, + "k1_kl": 0.38671875, + "k3_kl": 0.296875, + "kimi_kl": 1.140625, + "learning_rate": 1.4669999999999998e-07, + "loss": 0.0119, + "ppl": 0.07763671875, + "reward": 0.9487734436988831, + "reward_std": 0.0047722794115543365, + "rewards/perpo_ocr_edit_distance_reward": 0.9487734436988831, + "step": 3533, + "temperature": 0.9 + }, + { + "advantages": -3.3974647521972656e-05, + "completion_length": 685.0, + "delta_ref_entropy_loss": 0.050048828125, + "delta_ref_ppl": -0.0712890625, + "entropy_loss": -0.1455078125, + "epoch": 0.7068, + "grad_norm": 0.7493376874702112, + "k1_kl": 0.0712890625, + "k3_kl": 0.044921875, + "kimi_kl": 0.10791015625, + "learning_rate": 1.466e-07, + "loss": 0.0018, + "ppl": 0.07470703125, + "reward": 0.9792508482933044, + "reward_std": 0.002406257903203368, + "rewards/perpo_ocr_edit_distance_reward": 0.979250967502594, + "step": 3534, + "temperature": 0.9 + }, + { + "advantages": -1.7029899481713073e-06, + "completion_length": 358.0, + "delta_ref_entropy_loss": -0.002227783203125, + "delta_ref_ppl": -0.12890625, + "entropy_loss": -0.37109375, + "epoch": 0.707, + "grad_norm": 2.6571981797499036, + "k1_kl": 0.12890625, + "k3_kl": 0.10595703125, + "kimi_kl": 0.24609375, + "learning_rate": 1.465e-07, + "loss": 0.0042, + "ppl": 0.150390625, + "reward": 0.6675474047660828, + "reward_std": 0.025788865983486176, + "rewards/perpo_ocr_edit_distance_reward": 0.6675475239753723, + "step": 3535, + "temperature": 0.9 + }, + { + "advantages": -4.225969678373076e-05, + "completion_length": 420.0, + "delta_ref_entropy_loss": 0.060791015625, + "delta_ref_ppl": -0.07763671875, + "entropy_loss": -0.05029296875, + "epoch": 0.7072, + "grad_norm": 0.9594609732270554, + "k1_kl": 0.0771484375, + "k3_kl": 0.046630859375, + "kimi_kl": 0.11279296875, + "learning_rate": 1.464e-07, + "loss": 0.0019, + "ppl": 0.0181884765625, + "reward": 0.9915084838867188, + "reward_std": 0.0011086229933425784, + "rewards/perpo_ocr_edit_distance_reward": 0.9915085434913635, + "step": 3536, + "temperature": 0.9 + }, + { + "advantages": -3.218651045244769e-06, + "completion_length": 427.0, + "delta_ref_entropy_loss": -0.06787109375, + "delta_ref_ppl": -0.05078125, + "entropy_loss": -0.2373046875, + "epoch": 0.7074, + "grad_norm": 3.3156201914570738, + "k1_kl": 0.05126953125, + "k3_kl": 0.047607421875, + "kimi_kl": 0.107421875, + "learning_rate": 1.463e-07, + "loss": 0.0019, + "ppl": 0.095703125, + "reward": 0.8178874850273132, + "reward_std": 0.018424641340970993, + "rewards/perpo_ocr_edit_distance_reward": 0.817887544631958, + "step": 3537, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 396.0, + "delta_ref_entropy_loss": 0.0255126953125, + "delta_ref_ppl": -0.0576171875, + "entropy_loss": -0.044677734375, + "epoch": 0.7076, + "grad_norm": 0.5453428744597847, + "k1_kl": 0.0576171875, + "k3_kl": 0.03759765625, + "kimi_kl": 0.1240234375, + "learning_rate": 1.462e-07, + "loss": 0.0015, + "ppl": 0.01507568359375, + "reward": 0.997186005115509, + "reward_std": 0.0007678347756154835, + "rewards/perpo_ocr_edit_distance_reward": 0.9971860647201538, + "step": 3538, + "temperature": 0.9 + }, + { + "advantages": -2.6566642645775573e-06, + "completion_length": 487.0, + "delta_ref_entropy_loss": 0.01397705078125, + "delta_ref_ppl": -0.11474609375, + "entropy_loss": -0.33203125, + "epoch": 0.7078, + "grad_norm": 1.7351548759122493, + "k1_kl": 0.11474609375, + "k3_kl": 0.0810546875, + "kimi_kl": 0.166015625, + "learning_rate": 1.461e-07, + "loss": 0.0033, + "ppl": 0.1572265625, + "reward": 0.6573065519332886, + "reward_std": 0.006286322139203548, + "rewards/perpo_ocr_edit_distance_reward": 0.6573066115379333, + "step": 3539, + "temperature": 0.9 + }, + { + "advantages": -6.066050264053047e-05, + "completion_length": 684.0, + "delta_ref_entropy_loss": 0.038818359375, + "delta_ref_ppl": -0.050537109375, + "entropy_loss": -0.038818359375, + "epoch": 0.708, + "grad_norm": 0.38693560067131055, + "k1_kl": 0.050537109375, + "k3_kl": 0.0302734375, + "kimi_kl": 0.083984375, + "learning_rate": 1.4599999999999998e-07, + "loss": 0.0013, + "ppl": 0.01104736328125, + "reward": 0.9914407134056091, + "reward_std": 0.0007423826609738171, + "rewards/perpo_ocr_edit_distance_reward": 0.9914408326148987, + "step": 3540, + "temperature": 0.9 + }, + { + "advantages": -5.098751717014238e-05, + "completion_length": 716.0, + "delta_ref_entropy_loss": 0.0213623046875, + "delta_ref_ppl": -0.033203125, + "entropy_loss": -0.036376953125, + "epoch": 0.7082, + "grad_norm": 0.5715962391792244, + "k1_kl": 0.033203125, + "k3_kl": 0.022216796875, + "kimi_kl": 0.055419921875, + "learning_rate": 1.459e-07, + "loss": 0.0009, + "ppl": 0.01544189453125, + "reward": 0.9959073662757874, + "reward_std": 0.0014026003191247582, + "rewards/perpo_ocr_edit_distance_reward": 0.9959074258804321, + "step": 3541, + "temperature": 0.9 + }, + { + "advantages": -1.5292849639081396e-05, + "completion_length": 288.0, + "delta_ref_entropy_loss": 0.0220947265625, + "delta_ref_ppl": -0.08251953125, + "entropy_loss": -0.046875, + "epoch": 0.7084, + "grad_norm": 1.154416712260392, + "k1_kl": 0.08251953125, + "k3_kl": 0.061279296875, + "kimi_kl": 0.208984375, + "learning_rate": 1.458e-07, + "loss": 0.0025, + "ppl": 0.0179443359375, + "reward": 0.97206050157547, + "reward_std": 0.0021254578605294228, + "rewards/perpo_ocr_edit_distance_reward": 0.9720605611801147, + "step": 3542, + "temperature": 0.9 + }, + { + "advantages": -5.645411420118762e-06, + "completion_length": 42.0, + "delta_ref_entropy_loss": -0.005035400390625, + "delta_ref_ppl": -0.78125, + "entropy_loss": -0.310546875, + "epoch": 0.7086, + "grad_norm": 6.156634504648383, + "k1_kl": 0.78125, + "k3_kl": 0.6484375, + "kimi_kl": 2.765625, + "learning_rate": 1.457e-07, + "loss": 0.026, + "ppl": 0.125, + "reward": 0.9867256879806519, + "reward_std": 0.004424780607223511, + "rewards/perpo_ocr_edit_distance_reward": 0.9867256879806519, + "step": 3543, + "temperature": 0.9 + }, + { + "advantages": -1.6076224710559472e-05, + "completion_length": 1002.0, + "delta_ref_entropy_loss": 0.0159912109375, + "delta_ref_ppl": -0.0296630859375, + "entropy_loss": -0.0284423828125, + "epoch": 0.7088, + "grad_norm": 0.3621198030727199, + "k1_kl": 0.02978515625, + "k3_kl": 0.01904296875, + "kimi_kl": 0.05517578125, + "learning_rate": 1.456e-07, + "loss": 0.0008, + "ppl": 0.00982666015625, + "reward": 0.9973770976066589, + "reward_std": 0.002018703380599618, + "rewards/perpo_ocr_edit_distance_reward": 0.9973771572113037, + "step": 3544, + "temperature": 0.9 + }, + { + "advantages": -1.1018344594049267e-05, + "completion_length": 941.0, + "delta_ref_entropy_loss": 0.034912109375, + "delta_ref_ppl": -0.055908203125, + "entropy_loss": -0.076171875, + "epoch": 0.709, + "grad_norm": 0.5552418944651077, + "k1_kl": 0.055908203125, + "k3_kl": 0.03369140625, + "kimi_kl": 0.08056640625, + "learning_rate": 1.4549999999999997e-07, + "loss": 0.0014, + "ppl": 0.03515625, + "reward": 0.9959854483604431, + "reward_std": 0.0022142387460917234, + "rewards/perpo_ocr_edit_distance_reward": 0.9959855079650879, + "step": 3545, + "temperature": 0.9 + }, + { + "advantages": -8.3982951764483e-05, + "completion_length": 289.0, + "delta_ref_entropy_loss": 0.0250244140625, + "delta_ref_ppl": -0.12890625, + "entropy_loss": -0.052490234375, + "epoch": 0.7092, + "grad_norm": 1.313114338132688, + "k1_kl": 0.12890625, + "k3_kl": 0.09912109375, + "kimi_kl": 0.51953125, + "learning_rate": 1.454e-07, + "loss": 0.004, + "ppl": 0.0150146484375, + "reward": 0.9807981848716736, + "reward_std": 0.0010150508023798466, + "rewards/perpo_ocr_edit_distance_reward": 0.9807982444763184, + "step": 3546, + "temperature": 0.9 + }, + { + "advantages": -5.72289754927624e-05, + "completion_length": 539.0, + "delta_ref_entropy_loss": 0.04345703125, + "delta_ref_ppl": -0.06396484375, + "entropy_loss": -0.033203125, + "epoch": 0.7094, + "grad_norm": 0.5026562163908157, + "k1_kl": 0.06396484375, + "k3_kl": 0.041748046875, + "kimi_kl": 0.1357421875, + "learning_rate": 1.453e-07, + "loss": 0.0017, + "ppl": 0.01239013671875, + "reward": 0.9985861778259277, + "reward_std": 0.0004954258911311626, + "rewards/perpo_ocr_edit_distance_reward": 0.9985863566398621, + "step": 3547, + "temperature": 0.9 + }, + { + "advantages": -0.00020013537141494453, + "completion_length": 843.0, + "delta_ref_entropy_loss": 0.0277099609375, + "delta_ref_ppl": -0.032958984375, + "entropy_loss": -0.032958984375, + "epoch": 0.7096, + "grad_norm": 0.25022401679502587, + "k1_kl": 0.032958984375, + "k3_kl": 0.0196533203125, + "kimi_kl": 0.0556640625, + "learning_rate": 1.4519999999999998e-07, + "loss": 0.001, + "ppl": 0.009521484375, + "reward": 0.9812530279159546, + "reward_std": 0.0004105380503460765, + "rewards/perpo_ocr_edit_distance_reward": 0.9812531471252441, + "step": 3548, + "temperature": 0.9 + }, + { + "advantages": -1.8732889657258056e-06, + "completion_length": 135.0, + "delta_ref_entropy_loss": -0.1962890625, + "delta_ref_ppl": -0.23828125, + "entropy_loss": -0.80078125, + "epoch": 0.7098, + "grad_norm": 5.792637731761867, + "k1_kl": 0.2392578125, + "k3_kl": 0.2451171875, + "kimi_kl": 0.7265625, + "learning_rate": 1.451e-07, + "loss": 0.0098, + "ppl": 0.380859375, + "reward": 0.8539420962333679, + "reward_std": 0.03608677536249161, + "rewards/perpo_ocr_edit_distance_reward": 0.8539422154426575, + "step": 3549, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 858.0, + "delta_ref_entropy_loss": 0.0281982421875, + "delta_ref_ppl": -0.059326171875, + "entropy_loss": -0.08740234375, + "epoch": 0.71, + "grad_norm": 0.6351627056871052, + "k1_kl": 0.0595703125, + "k3_kl": 0.03759765625, + "kimi_kl": 0.09033203125, + "learning_rate": 1.45e-07, + "loss": 0.0015, + "ppl": 0.033935546875, + "reward": 0.9257451891899109, + "reward_std": 0.0015477826818823814, + "rewards/perpo_ocr_edit_distance_reward": 0.9257451891899109, + "step": 3550, + "temperature": 0.9 + }, + { + "advantages": -2.9444696338032372e-05, + "completion_length": 503.0, + "delta_ref_entropy_loss": 0.041259765625, + "delta_ref_ppl": -0.06982421875, + "entropy_loss": -0.052001953125, + "epoch": 0.7102, + "grad_norm": 0.6610689902397622, + "k1_kl": 0.06982421875, + "k3_kl": 0.046630859375, + "kimi_kl": 0.154296875, + "learning_rate": 1.449e-07, + "loss": 0.0019, + "ppl": 0.0224609375, + "reward": 0.9886003136634827, + "reward_std": 0.001635211636312306, + "rewards/perpo_ocr_edit_distance_reward": 0.9886004328727722, + "step": 3551, + "temperature": 0.9 + }, + { + "advantages": -3.797667432081653e-06, + "completion_length": 741.0, + "delta_ref_entropy_loss": -0.10986328125, + "delta_ref_ppl": -0.0546875, + "entropy_loss": -0.2001953125, + "epoch": 0.7104, + "grad_norm": 1.1655222728869312, + "k1_kl": 0.0546875, + "k3_kl": 0.05712890625, + "kimi_kl": 0.205078125, + "learning_rate": 1.448e-07, + "loss": 0.0023, + "ppl": 0.058837890625, + "reward": 0.9918925762176514, + "reward_std": 0.004386726766824722, + "rewards/perpo_ocr_edit_distance_reward": 0.9918925762176514, + "step": 3552, + "temperature": 0.9 + }, + { + "advantages": -0.00010308197670383379, + "completion_length": 481.0, + "delta_ref_entropy_loss": 0.0303955078125, + "delta_ref_ppl": -0.0927734375, + "entropy_loss": -0.08349609375, + "epoch": 0.7106, + "grad_norm": 0.6831229981174005, + "k1_kl": 0.09326171875, + "k3_kl": 0.06494140625, + "kimi_kl": 0.20703125, + "learning_rate": 1.447e-07, + "loss": 0.0027, + "ppl": 0.0299072265625, + "reward": 0.9068845510482788, + "reward_std": 0.0010565241100266576, + "rewards/perpo_ocr_edit_distance_reward": 0.9068846702575684, + "step": 3553, + "temperature": 0.9 + }, + { + "advantages": -2.8703894713544287e-05, + "completion_length": 167.0, + "delta_ref_entropy_loss": 0.06103515625, + "delta_ref_ppl": -0.1298828125, + "entropy_loss": -0.0595703125, + "epoch": 0.7108, + "grad_norm": 0.9156705529976303, + "k1_kl": 0.1298828125, + "k3_kl": 0.0849609375, + "kimi_kl": 0.283203125, + "learning_rate": 1.446e-07, + "loss": 0.0034, + "ppl": 0.0196533203125, + "reward": 0.994848906993866, + "reward_std": 0.002569818403571844, + "rewards/perpo_ocr_edit_distance_reward": 0.9948489665985107, + "step": 3554, + "temperature": 0.9 + }, + { + "advantages": -7.56808731239289e-05, + "completion_length": 1044.0, + "delta_ref_entropy_loss": 0.04736328125, + "delta_ref_ppl": -0.0615234375, + "entropy_loss": -0.0654296875, + "epoch": 0.711, + "grad_norm": 0.6427200158726519, + "k1_kl": 0.0615234375, + "k3_kl": 0.035400390625, + "kimi_kl": 0.0712890625, + "learning_rate": 1.4449999999999998e-07, + "loss": 0.0015, + "ppl": 0.0308837890625, + "reward": 0.970380961894989, + "reward_std": 0.0003499865997582674, + "rewards/perpo_ocr_edit_distance_reward": 0.9703810214996338, + "step": 3555, + "temperature": 0.9 + }, + { + "advantages": -4.087175966560608e-07, + "completion_length": 316.0, + "delta_ref_entropy_loss": 0.029052734375, + "delta_ref_ppl": -0.109375, + "entropy_loss": -0.140625, + "epoch": 0.7112, + "grad_norm": 2.687642834565586, + "k1_kl": 0.109375, + "k3_kl": 0.0751953125, + "kimi_kl": 0.2392578125, + "learning_rate": 1.444e-07, + "loss": 0.003, + "ppl": 0.06689453125, + "reward": 0.9204559922218323, + "reward_std": 0.08831307291984558, + "rewards/perpo_ocr_edit_distance_reward": 0.920456051826477, + "step": 3556, + "temperature": 0.9 + }, + { + "advantages": -0.00010228004248347133, + "completion_length": 487.0, + "delta_ref_entropy_loss": 0.0400390625, + "delta_ref_ppl": -0.07666015625, + "entropy_loss": -0.03466796875, + "epoch": 0.7114, + "grad_norm": 0.7639224036899113, + "k1_kl": 0.07666015625, + "k3_kl": 0.050537109375, + "kimi_kl": 0.1591796875, + "learning_rate": 1.4430000000000001e-07, + "loss": 0.0021, + "ppl": 0.0145263671875, + "reward": 0.9959926605224609, + "reward_std": 0.0008157018455676734, + "rewards/perpo_ocr_edit_distance_reward": 0.9959928393363953, + "step": 3557, + "temperature": 0.9 + }, + { + "advantages": -1.0762896636151709e-05, + "completion_length": 451.0, + "delta_ref_entropy_loss": 0.051025390625, + "delta_ref_ppl": -0.08349609375, + "entropy_loss": -0.0673828125, + "epoch": 0.7116, + "grad_norm": 0.6012026201603305, + "k1_kl": 0.08349609375, + "k3_kl": 0.052978515625, + "kimi_kl": 0.1669921875, + "learning_rate": 1.4419999999999998e-07, + "loss": 0.0021, + "ppl": 0.0262451171875, + "reward": 0.9952635169029236, + "reward_std": 0.0014797578332945704, + "rewards/perpo_ocr_edit_distance_reward": 0.9952635765075684, + "step": 3558, + "temperature": 0.9 + }, + { + "advantages": -1.534393959445879e-05, + "completion_length": 719.0, + "delta_ref_entropy_loss": -0.0174560546875, + "delta_ref_ppl": -0.06298828125, + "entropy_loss": -0.0888671875, + "epoch": 0.7118, + "grad_norm": 0.7152452507965066, + "k1_kl": 0.0634765625, + "k3_kl": 0.05126953125, + "kimi_kl": 0.1953125, + "learning_rate": 1.441e-07, + "loss": 0.0021, + "ppl": 0.026123046875, + "reward": 0.9964385032653809, + "reward_std": 0.0054572029039263725, + "rewards/perpo_ocr_edit_distance_reward": 0.9964386224746704, + "step": 3559, + "temperature": 0.9 + }, + { + "advantages": -0.00027075837715528905, + "completion_length": 1138.0, + "delta_ref_entropy_loss": 0.029541015625, + "delta_ref_ppl": -0.044921875, + "entropy_loss": -0.0478515625, + "epoch": 0.712, + "grad_norm": 0.9179409449418238, + "k1_kl": 0.044921875, + "k3_kl": 0.0263671875, + "kimi_kl": 0.0625, + "learning_rate": 1.44e-07, + "loss": 0.0013, + "ppl": 0.0184326171875, + "reward": 0.9933351874351501, + "reward_std": 0.0002773916639853269, + "rewards/perpo_ocr_edit_distance_reward": 0.9933353066444397, + "step": 3560, + "temperature": 0.9 + }, + { + "advantages": -1.7404556274414062e-05, + "completion_length": 623.0, + "delta_ref_entropy_loss": 0.03515625, + "delta_ref_ppl": -0.0732421875, + "entropy_loss": -0.0576171875, + "epoch": 0.7122, + "grad_norm": 3.715593910364734, + "k1_kl": 0.0732421875, + "k3_kl": 0.04443359375, + "kimi_kl": 0.126953125, + "learning_rate": 1.4389999999999998e-07, + "loss": 0.0018, + "ppl": 0.0211181640625, + "reward": 0.8740217089653015, + "reward_std": 0.0013662123819813132, + "rewards/perpo_ocr_edit_distance_reward": 0.8740217685699463, + "step": 3561, + "temperature": 0.9 + }, + { + "advantages": -4.058225022163242e-05, + "completion_length": 755.0, + "delta_ref_entropy_loss": 0.032958984375, + "delta_ref_ppl": -0.08203125, + "entropy_loss": -0.208984375, + "epoch": 0.7124, + "grad_norm": 1.143241802064055, + "k1_kl": 0.08203125, + "k3_kl": 0.05517578125, + "kimi_kl": 0.150390625, + "learning_rate": 1.438e-07, + "loss": 0.0022, + "ppl": 0.1025390625, + "reward": 0.9429984092712402, + "reward_std": 0.0017872765893116593, + "rewards/perpo_ocr_edit_distance_reward": 0.942998468875885, + "step": 3562, + "temperature": 0.9 + }, + { + "advantages": -4.7313318646047264e-05, + "completion_length": 622.0, + "delta_ref_entropy_loss": 0.043701171875, + "delta_ref_ppl": -0.05810546875, + "entropy_loss": -0.053466796875, + "epoch": 0.7126, + "grad_norm": 0.38729045993315153, + "k1_kl": 0.058349609375, + "k3_kl": 0.036865234375, + "kimi_kl": 0.150390625, + "learning_rate": 1.437e-07, + "loss": 0.0015, + "ppl": 0.01611328125, + "reward": 0.9670291543006897, + "reward_std": 0.0006198784103617072, + "rewards/perpo_ocr_edit_distance_reward": 0.9670292139053345, + "step": 3563, + "temperature": 0.9 + }, + { + "advantages": -4.472902946872637e-05, + "completion_length": 815.0, + "delta_ref_entropy_loss": 0.0184326171875, + "delta_ref_ppl": -0.0439453125, + "entropy_loss": -0.047607421875, + "epoch": 0.7128, + "grad_norm": 0.4943482468172891, + "k1_kl": 0.044189453125, + "k3_kl": 0.0279541015625, + "kimi_kl": 0.0771484375, + "learning_rate": 1.436e-07, + "loss": 0.0012, + "ppl": 0.01507568359375, + "reward": 0.9788383841514587, + "reward_std": 0.0010417068842798471, + "rewards/perpo_ocr_edit_distance_reward": 0.9788385033607483, + "step": 3564, + "temperature": 0.9 + }, + { + "advantages": -4.340921441325918e-05, + "completion_length": 437.0, + "delta_ref_entropy_loss": 0.054443359375, + "delta_ref_ppl": -0.0791015625, + "entropy_loss": -0.04833984375, + "epoch": 0.713, + "grad_norm": 0.33430053045104424, + "k1_kl": 0.0791015625, + "k3_kl": 0.05126953125, + "kimi_kl": 0.13671875, + "learning_rate": 1.4349999999999998e-07, + "loss": 0.0021, + "ppl": 0.0186767578125, + "reward": 0.9783108830451965, + "reward_std": 0.0006843736628070474, + "rewards/perpo_ocr_edit_distance_reward": 0.9783108830451965, + "step": 3565, + "temperature": 0.9 + }, + { + "advantages": -2.8516566089820117e-05, + "completion_length": 713.0, + "delta_ref_entropy_loss": 0.0252685546875, + "delta_ref_ppl": -0.0303955078125, + "entropy_loss": -0.03759765625, + "epoch": 0.7132, + "grad_norm": 0.20567268720362344, + "k1_kl": 0.030517578125, + "k3_kl": 0.0157470703125, + "kimi_kl": 0.041259765625, + "learning_rate": 1.434e-07, + "loss": 0.0007, + "ppl": 0.01397705078125, + "reward": 0.9974489808082581, + "reward_std": 0.00019852058903779835, + "rewards/perpo_ocr_edit_distance_reward": 0.9974489808082581, + "step": 3566, + "temperature": 0.9 + }, + { + "advantages": -2.1108560758875683e-05, + "completion_length": 336.0, + "delta_ref_entropy_loss": 0.061279296875, + "delta_ref_ppl": -0.1474609375, + "entropy_loss": -0.388671875, + "epoch": 0.7134, + "grad_norm": 2.6890653155291573, + "k1_kl": 0.146484375, + "k3_kl": 0.10791015625, + "kimi_kl": 0.26953125, + "learning_rate": 1.433e-07, + "loss": 0.0043, + "ppl": 0.201171875, + "reward": 0.8690375089645386, + "reward_std": 0.003936301451176405, + "rewards/perpo_ocr_edit_distance_reward": 0.8690376877784729, + "step": 3567, + "temperature": 0.9 + }, + { + "advantages": -2.213887000834802e-06, + "completion_length": 651.0, + "delta_ref_entropy_loss": 0.0016021728515625, + "delta_ref_ppl": -0.0595703125, + "entropy_loss": -0.08349609375, + "epoch": 0.7136, + "grad_norm": 0.3827585424426301, + "k1_kl": 0.0595703125, + "k3_kl": 0.05029296875, + "kimi_kl": 0.10986328125, + "learning_rate": 1.4319999999999999e-07, + "loss": 0.002, + "ppl": 0.029296875, + "reward": 0.9697362184524536, + "reward_std": 0.019417455419898033, + "rewards/perpo_ocr_edit_distance_reward": 0.9697362184524536, + "step": 3568, + "temperature": 0.9 + }, + { + "advantages": -1.44754142183956e-06, + "completion_length": 479.0, + "delta_ref_entropy_loss": 0.08447265625, + "delta_ref_ppl": -0.0888671875, + "entropy_loss": -0.220703125, + "epoch": 0.7138, + "grad_norm": 1.457611633612517, + "k1_kl": 0.0888671875, + "k3_kl": 0.052978515625, + "kimi_kl": 0.11279296875, + "learning_rate": 1.431e-07, + "loss": 0.0021, + "ppl": 0.09765625, + "reward": 0.2132451981306076, + "reward_std": 0.0028172945603728294, + "rewards/perpo_ocr_edit_distance_reward": 0.2132451981306076, + "step": 3569, + "temperature": 0.9 + }, + { + "advantages": -3.354890213813633e-05, + "completion_length": 516.0, + "delta_ref_entropy_loss": 0.0283203125, + "delta_ref_ppl": -0.05126953125, + "entropy_loss": -0.0263671875, + "epoch": 0.714, + "grad_norm": 0.24205278330840124, + "k1_kl": 0.05126953125, + "k3_kl": 0.033203125, + "kimi_kl": 0.095703125, + "learning_rate": 1.4299999999999997e-07, + "loss": 0.0014, + "ppl": 0.005767822265625, + "reward": 0.9991282224655151, + "reward_std": 0.00015377411909867078, + "rewards/perpo_ocr_edit_distance_reward": 0.9991282224655151, + "step": 3570, + "temperature": 0.9 + }, + { + "advantages": -1.7029899268550253e-08, + "completion_length": 441.0, + "delta_ref_entropy_loss": 0.0263671875, + "delta_ref_ppl": -0.08935546875, + "entropy_loss": -0.09423828125, + "epoch": 0.7142, + "grad_norm": 0.7341874933762866, + "k1_kl": 0.0888671875, + "k3_kl": 0.060302734375, + "kimi_kl": 0.1796875, + "learning_rate": 1.429e-07, + "loss": 0.0024, + "ppl": 0.044921875, + "reward": 0.9921329617500305, + "reward_std": 0.0017997599206864834, + "rewards/perpo_ocr_edit_distance_reward": 0.9921330213546753, + "step": 3571, + "temperature": 0.9 + }, + { + "advantages": -8.9151526481146e-06, + "completion_length": 648.0, + "delta_ref_entropy_loss": 0.054931640625, + "delta_ref_ppl": -0.09716796875, + "entropy_loss": -0.162109375, + "epoch": 0.7144, + "grad_norm": 1.6393279436127086, + "k1_kl": 0.0966796875, + "k3_kl": 0.062255859375, + "kimi_kl": 0.177734375, + "learning_rate": 1.428e-07, + "loss": 0.0025, + "ppl": 0.0703125, + "reward": 0.9618961215019226, + "reward_std": 0.003722195979207754, + "rewards/perpo_ocr_edit_distance_reward": 0.9618961811065674, + "step": 3572, + "temperature": 0.9 + }, + { + "advantages": -1.7004354958771728e-05, + "completion_length": 140.0, + "delta_ref_entropy_loss": 0.06640625, + "delta_ref_ppl": -0.279296875, + "entropy_loss": -0.11669921875, + "epoch": 0.7146, + "grad_norm": 1.1449386909169343, + "k1_kl": 0.279296875, + "k3_kl": 0.212890625, + "kimi_kl": 0.85546875, + "learning_rate": 1.4269999999999997e-07, + "loss": 0.0085, + "ppl": 0.0419921875, + "reward": 0.975289523601532, + "reward_std": 0.0034050855319947004, + "rewards/perpo_ocr_edit_distance_reward": 0.9752896428108215, + "step": 3573, + "temperature": 0.9 + }, + { + "advantages": 1.7029899268550253e-08, + "completion_length": 335.0, + "delta_ref_entropy_loss": 0.09423828125, + "delta_ref_ppl": -0.138671875, + "entropy_loss": -0.123046875, + "epoch": 0.7148, + "grad_norm": 1.3369312246768252, + "k1_kl": 0.1396484375, + "k3_kl": 0.0869140625, + "kimi_kl": 0.259765625, + "learning_rate": 1.426e-07, + "loss": 0.0035, + "ppl": 0.04248046875, + "reward": 0.7342522740364075, + "reward_std": 0.08368513733148575, + "rewards/perpo_ocr_edit_distance_reward": 0.7342523336410522, + "step": 3574, + "temperature": 0.9 + }, + { + "advantages": -3.88281705454574e-06, + "completion_length": 503.0, + "delta_ref_entropy_loss": 0.055419921875, + "delta_ref_ppl": -0.0849609375, + "entropy_loss": -0.08642578125, + "epoch": 0.715, + "grad_norm": 0.6822911597776752, + "k1_kl": 0.0849609375, + "k3_kl": 0.052490234375, + "kimi_kl": 0.1259765625, + "learning_rate": 1.4249999999999999e-07, + "loss": 0.0021, + "ppl": 0.034423828125, + "reward": 0.9624383449554443, + "reward_std": 0.0020970841869711876, + "rewards/perpo_ocr_edit_distance_reward": 0.9624384045600891, + "step": 3575, + "temperature": 0.9 + }, + { + "advantages": -1.3027873137616552e-05, + "completion_length": 629.0, + "delta_ref_entropy_loss": 0.033935546875, + "delta_ref_ppl": -0.0908203125, + "entropy_loss": -0.0771484375, + "epoch": 0.7152, + "grad_norm": 0.9009350964416836, + "k1_kl": 0.0908203125, + "k3_kl": 0.06689453125, + "kimi_kl": 0.203125, + "learning_rate": 1.424e-07, + "loss": 0.0027, + "ppl": 0.027099609375, + "reward": 0.9818549156188965, + "reward_std": 0.003819925943389535, + "rewards/perpo_ocr_edit_distance_reward": 0.981855034828186, + "step": 3576, + "temperature": 0.9 + }, + { + "advantages": -9.308542939834297e-05, + "completion_length": 874.0, + "delta_ref_entropy_loss": 0.039306640625, + "delta_ref_ppl": -0.04443359375, + "entropy_loss": -0.052001953125, + "epoch": 0.7154, + "grad_norm": 0.3899112212319614, + "k1_kl": 0.04443359375, + "k3_kl": 0.0264892578125, + "kimi_kl": 0.052001953125, + "learning_rate": 1.423e-07, + "loss": 0.0012, + "ppl": 0.02099609375, + "reward": 0.9965109825134277, + "reward_std": 0.0005403168615885079, + "rewards/perpo_ocr_edit_distance_reward": 0.9965111017227173, + "step": 3577, + "temperature": 0.9 + }, + { + "advantages": -5.103860894450918e-05, + "completion_length": 318.0, + "delta_ref_entropy_loss": 0.0262451171875, + "delta_ref_ppl": -0.126953125, + "entropy_loss": -0.08203125, + "epoch": 0.7156, + "grad_norm": 0.5155657537576577, + "k1_kl": 0.1279296875, + "k3_kl": 0.0927734375, + "kimi_kl": 0.37109375, + "learning_rate": 1.422e-07, + "loss": 0.0038, + "ppl": 0.026611328125, + "reward": 0.9628499746322632, + "reward_std": 0.0015677866758778691, + "rewards/perpo_ocr_edit_distance_reward": 0.9628500938415527, + "step": 3578, + "temperature": 0.9 + }, + { + "advantages": -3.2356808787881164e-06, + "completion_length": 1162.0, + "delta_ref_entropy_loss": 0.0240478515625, + "delta_ref_ppl": -0.04443359375, + "entropy_loss": -0.04736328125, + "epoch": 0.7158, + "grad_norm": 0.45021794389190783, + "k1_kl": 0.04443359375, + "k3_kl": 0.0286865234375, + "kimi_kl": 0.0712890625, + "learning_rate": 1.421e-07, + "loss": 0.0011, + "ppl": 0.018798828125, + "reward": 0.9943565726280212, + "reward_std": 0.0025307557079941034, + "rewards/perpo_ocr_edit_distance_reward": 0.9943565130233765, + "step": 3579, + "temperature": 0.9 + }, + { + "advantages": -1.0149819900107104e-05, + "completion_length": 312.0, + "delta_ref_entropy_loss": 0.056640625, + "delta_ref_ppl": -0.12060546875, + "entropy_loss": -0.09912109375, + "epoch": 0.716, + "grad_norm": 1.0480366295680823, + "k1_kl": 0.12109375, + "k3_kl": 0.0703125, + "kimi_kl": 0.162109375, + "learning_rate": 1.4199999999999997e-07, + "loss": 0.0028, + "ppl": 0.0478515625, + "reward": 0.9565714597702026, + "reward_std": 0.0032530068419873714, + "rewards/perpo_ocr_edit_distance_reward": 0.9565714597702026, + "step": 3580, + "temperature": 0.9 + }, + { + "advantages": -3.067084981012158e-05, + "completion_length": 331.0, + "delta_ref_entropy_loss": 0.02099609375, + "delta_ref_ppl": -0.07421875, + "entropy_loss": -0.045654296875, + "epoch": 0.7162, + "grad_norm": 0.8215850447174847, + "k1_kl": 0.07421875, + "k3_kl": 0.0546875, + "kimi_kl": 0.1865234375, + "learning_rate": 1.419e-07, + "loss": 0.0022, + "ppl": 0.0223388671875, + "reward": 0.9847961664199829, + "reward_std": 0.0018433149671182036, + "rewards/perpo_ocr_edit_distance_reward": 0.9847961664199829, + "step": 3581, + "temperature": 0.9 + }, + { + "advantages": -6.256785127334297e-05, + "completion_length": 1052.0, + "delta_ref_entropy_loss": 0.05029296875, + "delta_ref_ppl": -0.09423828125, + "entropy_loss": -0.0791015625, + "epoch": 0.7164, + "grad_norm": 2.1544802738014126, + "k1_kl": 0.09423828125, + "k3_kl": 0.06005859375, + "kimi_kl": 0.162109375, + "learning_rate": 1.418e-07, + "loss": 0.0025, + "ppl": 0.03515625, + "reward": 0.9926270842552185, + "reward_std": 0.0005804114625789225, + "rewards/perpo_ocr_edit_distance_reward": 0.9926271438598633, + "step": 3582, + "temperature": 0.9 + }, + { + "advantages": -1.4134816410660278e-06, + "completion_length": 1364.0, + "delta_ref_entropy_loss": -0.0033416748046875, + "delta_ref_ppl": -0.07568359375, + "entropy_loss": -0.337890625, + "epoch": 0.7166, + "grad_norm": 6.76416990176639, + "k1_kl": 0.07568359375, + "k3_kl": 0.06298828125, + "kimi_kl": 0.1298828125, + "learning_rate": 1.4169999999999998e-07, + "loss": 0.0025, + "ppl": 0.1806640625, + "reward": 0.6857280731201172, + "reward_std": 0.04750361293554306, + "rewards/perpo_ocr_edit_distance_reward": 0.6857281923294067, + "step": 3583, + "temperature": 0.9 + }, + { + "advantages": -1.7029899268550253e-08, + "completion_length": 375.0, + "delta_ref_entropy_loss": 0.0439453125, + "delta_ref_ppl": -0.083984375, + "entropy_loss": -0.0927734375, + "epoch": 0.7168, + "grad_norm": 0.8278219979268238, + "k1_kl": 0.083984375, + "k3_kl": 0.0625, + "kimi_kl": 0.1650390625, + "learning_rate": 1.416e-07, + "loss": 0.0025, + "ppl": 0.053955078125, + "reward": 0.9142659902572632, + "reward_std": 0.0032741082832217216, + "rewards/perpo_ocr_edit_distance_reward": 0.9142659902572632, + "step": 3584, + "temperature": 0.9 + }, + { + "advantages": -9.494168807577807e-06, + "completion_length": 290.0, + "delta_ref_entropy_loss": 0.1484375, + "delta_ref_ppl": -0.20703125, + "entropy_loss": -0.2275390625, + "epoch": 0.717, + "grad_norm": 2.386052553255098, + "k1_kl": 0.20703125, + "k3_kl": 0.1474609375, + "kimi_kl": 0.498046875, + "learning_rate": 1.415e-07, + "loss": 0.0059, + "ppl": 0.10546875, + "reward": 0.9230888485908508, + "reward_std": 0.00438444409519434, + "rewards/perpo_ocr_edit_distance_reward": 0.9230889081954956, + "step": 3585, + "temperature": 0.9 + }, + { + "advantages": -8.747407991904765e-05, + "completion_length": 350.0, + "delta_ref_entropy_loss": 0.042724609375, + "delta_ref_ppl": -0.07275390625, + "entropy_loss": -0.036865234375, + "epoch": 0.7172, + "grad_norm": 0.3180682249576778, + "k1_kl": 0.07275390625, + "k3_kl": 0.04541015625, + "kimi_kl": 0.1240234375, + "learning_rate": 1.4139999999999998e-07, + "loss": 0.0019, + "ppl": 0.0145263671875, + "reward": 0.9993850588798523, + "reward_std": 0.0004840620094910264, + "rewards/perpo_ocr_edit_distance_reward": 0.9993851184844971, + "step": 3586, + "temperature": 0.9 + }, + { + "advantages": -3.823212318820879e-05, + "completion_length": 377.0, + "delta_ref_entropy_loss": 0.032470703125, + "delta_ref_ppl": -0.09375, + "entropy_loss": -0.1220703125, + "epoch": 0.7174, + "grad_norm": 1.7443770723547214, + "k1_kl": 0.09375, + "k3_kl": 0.0654296875, + "kimi_kl": 0.173828125, + "learning_rate": 1.413e-07, + "loss": 0.0026, + "ppl": 0.0625, + "reward": 0.8637770414352417, + "reward_std": 0.0014594622189179063, + "rewards/perpo_ocr_edit_distance_reward": 0.8637771606445312, + "step": 3587, + "temperature": 0.9 + }, + { + "advantages": -4.211494160699658e-05, + "completion_length": 575.0, + "delta_ref_entropy_loss": 0.033203125, + "delta_ref_ppl": -0.07763671875, + "entropy_loss": -0.0849609375, + "epoch": 0.7176, + "grad_norm": 0.3879840285566244, + "k1_kl": 0.07763671875, + "k3_kl": 0.05322265625, + "kimi_kl": 0.1611328125, + "learning_rate": 1.412e-07, + "loss": 0.0022, + "ppl": 0.0284423828125, + "reward": 0.9958118796348572, + "reward_std": 0.001518083387054503, + "rewards/perpo_ocr_edit_distance_reward": 0.9958119988441467, + "step": 3588, + "temperature": 0.9 + }, + { + "advantages": -1.0984284699588898e-06, + "completion_length": 157.0, + "delta_ref_entropy_loss": 0.00836181640625, + "delta_ref_ppl": -0.287109375, + "entropy_loss": -0.171875, + "epoch": 0.7178, + "grad_norm": 3.1065622807316395, + "k1_kl": 0.287109375, + "k3_kl": 0.228515625, + "kimi_kl": 1.0078125, + "learning_rate": 1.4109999999999999e-07, + "loss": 0.0091, + "ppl": 0.0751953125, + "reward": 0.7690201997756958, + "reward_std": 0.007651420775800943, + "rewards/perpo_ocr_edit_distance_reward": 0.7690202593803406, + "step": 3589, + "temperature": 0.9 + }, + { + "advantages": -1.0984284926962573e-05, + "completion_length": 96.0, + "delta_ref_entropy_loss": 0.06396484375, + "delta_ref_ppl": -0.251953125, + "entropy_loss": -0.07470703125, + "epoch": 0.718, + "grad_norm": 1.7698804261818317, + "k1_kl": 0.25390625, + "k3_kl": 0.1904296875, + "kimi_kl": 0.75390625, + "learning_rate": 1.4099999999999998e-07, + "loss": 0.0076, + "ppl": 0.035888671875, + "reward": 0.9961290955543518, + "reward_std": 0.0022267892491072416, + "rewards/perpo_ocr_edit_distance_reward": 0.9961290955543518, + "step": 3590, + "temperature": 0.9 + }, + { + "advantages": -3.704003120219568e-06, + "completion_length": 395.0, + "delta_ref_entropy_loss": -0.0174560546875, + "delta_ref_ppl": -0.12109375, + "entropy_loss": -0.1630859375, + "epoch": 0.7182, + "grad_norm": 1.1328377625803543, + "k1_kl": 0.12109375, + "k3_kl": 0.10302734375, + "kimi_kl": 0.490234375, + "learning_rate": 1.409e-07, + "loss": 0.0041, + "ppl": 0.055908203125, + "reward": 0.9162396788597107, + "reward_std": 0.018328743055462837, + "rewards/perpo_ocr_edit_distance_reward": 0.9162397980690002, + "step": 3591, + "temperature": 0.9 + }, + { + "advantages": -4.117829666938633e-05, + "completion_length": 403.0, + "delta_ref_entropy_loss": 0.053466796875, + "delta_ref_ppl": -0.05859375, + "entropy_loss": -0.07470703125, + "epoch": 0.7184, + "grad_norm": 0.43317951196016524, + "k1_kl": 0.05859375, + "k3_kl": 0.03466796875, + "kimi_kl": 0.08544921875, + "learning_rate": 1.408e-07, + "loss": 0.0014, + "ppl": 0.03173828125, + "reward": 0.9771319031715393, + "reward_std": 0.0009335739887319505, + "rewards/perpo_ocr_edit_distance_reward": 0.9771319627761841, + "step": 3592, + "temperature": 0.9 + }, + { + "advantages": -1.1410032811909332e-06, + "completion_length": 1754.0, + "delta_ref_entropy_loss": -0.0220947265625, + "delta_ref_ppl": -0.0159912109375, + "entropy_loss": -0.154296875, + "epoch": 0.7186, + "grad_norm": 4.493301258393019, + "k1_kl": 0.015869140625, + "k3_kl": 0.0595703125, + "kimi_kl": 0.0546875, + "learning_rate": 1.4069999999999998e-07, + "loss": 0.0024, + "ppl": 0.0712890625, + "reward": 0.7882763743400574, + "reward_std": 0.014716633595526218, + "rewards/perpo_ocr_edit_distance_reward": 0.7882764339447021, + "step": 3593, + "temperature": 0.9 + }, + { + "advantages": -2.895082786835701e-07, + "completion_length": 645.0, + "delta_ref_entropy_loss": 0.046142578125, + "delta_ref_ppl": -0.10546875, + "entropy_loss": -0.275390625, + "epoch": 0.7188, + "grad_norm": 1.872861066708932, + "k1_kl": 0.10595703125, + "k3_kl": 0.07080078125, + "kimi_kl": 0.173828125, + "learning_rate": 1.406e-07, + "loss": 0.0028, + "ppl": 0.1328125, + "reward": 0.8547529578208923, + "reward_std": 0.2194664180278778, + "rewards/perpo_ocr_edit_distance_reward": 0.8547530770301819, + "step": 3594, + "temperature": 0.9 + }, + { + "advantages": -2.7218036848353222e-05, + "completion_length": 228.0, + "delta_ref_entropy_loss": 0.05419921875, + "delta_ref_ppl": -0.1123046875, + "entropy_loss": -0.10107421875, + "epoch": 0.719, + "grad_norm": 1.1795635716317852, + "k1_kl": 0.1123046875, + "k3_kl": 0.0771484375, + "kimi_kl": 0.224609375, + "learning_rate": 1.4050000000000002e-07, + "loss": 0.0031, + "ppl": 0.04052734375, + "reward": 0.9800495505332947, + "reward_std": 0.002403025049716234, + "rewards/perpo_ocr_edit_distance_reward": 0.9800496697425842, + "step": 3595, + "temperature": 0.9 + }, + { + "advantages": 8.514949634275126e-09, + "completion_length": 752.0, + "delta_ref_entropy_loss": -0.01507568359375, + "delta_ref_ppl": -0.08447265625, + "entropy_loss": -0.375, + "epoch": 0.7192, + "grad_norm": 2.1578858633597644, + "k1_kl": 0.08447265625, + "k3_kl": 0.064453125, + "kimi_kl": 0.126953125, + "learning_rate": 1.4039999999999999e-07, + "loss": 0.0026, + "ppl": 0.17578125, + "reward": 0.8022596836090088, + "reward_std": 0.036070317029953, + "rewards/perpo_ocr_edit_distance_reward": 0.8022597432136536, + "step": 3596, + "temperature": 0.9 + }, + { + "advantages": -3.2731466490076855e-05, + "completion_length": 682.0, + "delta_ref_entropy_loss": 0.024658203125, + "delta_ref_ppl": -0.05712890625, + "entropy_loss": -0.0732421875, + "epoch": 0.7194, + "grad_norm": 1.0621913931368325, + "k1_kl": 0.05712890625, + "k3_kl": 0.037841796875, + "kimi_kl": 0.09326171875, + "learning_rate": 1.403e-07, + "loss": 0.0016, + "ppl": 0.037353515625, + "reward": 0.975184440612793, + "reward_std": 0.0014617351116612554, + "rewards/perpo_ocr_edit_distance_reward": 0.9751845002174377, + "step": 3597, + "temperature": 0.9 + }, + { + "advantages": -6.785563164157793e-05, + "completion_length": 519.0, + "delta_ref_entropy_loss": 0.03173828125, + "delta_ref_ppl": -0.07080078125, + "entropy_loss": -0.047119140625, + "epoch": 0.7196, + "grad_norm": 0.4540188494735964, + "k1_kl": 0.07080078125, + "k3_kl": 0.048095703125, + "kimi_kl": 0.150390625, + "learning_rate": 1.402e-07, + "loss": 0.002, + "ppl": 0.0223388671875, + "reward": 0.9922499060630798, + "reward_std": 0.0009037574054673314, + "rewards/perpo_ocr_edit_distance_reward": 0.9922500252723694, + "step": 3598, + "temperature": 0.9 + }, + { + "advantages": -8.405958215007558e-05, + "completion_length": 253.0, + "delta_ref_entropy_loss": 0.058349609375, + "delta_ref_ppl": -0.2119140625, + "entropy_loss": -0.06787109375, + "epoch": 0.7198, + "grad_norm": 0.8890511131421964, + "k1_kl": 0.2119140625, + "k3_kl": 0.169921875, + "kimi_kl": 0.76171875, + "learning_rate": 1.401e-07, + "loss": 0.0069, + "ppl": 0.0289306640625, + "reward": 0.97429358959198, + "reward_std": 0.0014194803079590201, + "rewards/perpo_ocr_edit_distance_reward": 0.9742937684059143, + "step": 3599, + "temperature": 0.9 + }, + { + "advantages": -4.162533150520176e-05, + "completion_length": 1279.0, + "delta_ref_entropy_loss": 0.02587890625, + "delta_ref_ppl": -0.0634765625, + "entropy_loss": -0.0830078125, + "epoch": 0.72, + "grad_norm": 0.7869210100402796, + "k1_kl": 0.0634765625, + "k3_kl": 0.041748046875, + "kimi_kl": 0.099609375, + "learning_rate": 1.4e-07, + "loss": 0.0017, + "ppl": 0.04150390625, + "reward": 0.9905241131782532, + "reward_std": 0.0015364992432296276, + "rewards/perpo_ocr_edit_distance_reward": 0.990524172782898, + "step": 3600, + "temperature": 0.9 + }, + { + "advantages": -1.7029899268550253e-08, + "completion_length": 2048.0, + "delta_ref_entropy_loss": -0.2734375, + "delta_ref_ppl": -0.08251953125, + "entropy_loss": -0.953125, + "epoch": 0.7202, + "grad_norm": 8.679257640791775, + "k1_kl": 0.0810546875, + "k3_kl": 0.10302734375, + "kimi_kl": 0.1884765625, + "learning_rate": 1.399e-07, + "loss": 0.0041, + "ppl": 0.49609375, + "reward": 0.29501697421073914, + "reward_std": 0.0955210030078888, + "rewards/perpo_ocr_edit_distance_reward": 0.29501697421073914, + "step": 3601, + "temperature": 0.9 + }, + { + "advantages": -9.075233538169414e-05, + "completion_length": 928.0, + "delta_ref_entropy_loss": 0.053955078125, + "delta_ref_ppl": -0.08056640625, + "entropy_loss": -0.07275390625, + "epoch": 0.7204, + "grad_norm": 0.8098467274109578, + "k1_kl": 0.08056640625, + "k3_kl": 0.048583984375, + "kimi_kl": 0.10400390625, + "learning_rate": 1.398e-07, + "loss": 0.002, + "ppl": 0.031982421875, + "reward": 0.9720622301101685, + "reward_std": 0.0008383156964555383, + "rewards/perpo_ocr_edit_distance_reward": 0.9720624089241028, + "step": 3602, + "temperature": 0.9 + }, + { + "advantages": -5.764407978858799e-05, + "completion_length": 836.0, + "delta_ref_entropy_loss": 0.06689453125, + "delta_ref_ppl": -0.0771484375, + "entropy_loss": -0.043701171875, + "epoch": 0.7206, + "grad_norm": 0.47772974260851736, + "k1_kl": 0.07666015625, + "k3_kl": 0.04541015625, + "kimi_kl": 0.130859375, + "learning_rate": 1.397e-07, + "loss": 0.0019, + "ppl": 0.0174560546875, + "reward": 0.9926869869232178, + "reward_std": 0.0003431613149587065, + "rewards/perpo_ocr_edit_distance_reward": 0.9926870465278625, + "step": 3603, + "temperature": 0.9 + }, + { + "advantages": 2.1713121896027587e-05, + "completion_length": 538.0, + "delta_ref_entropy_loss": 0.046142578125, + "delta_ref_ppl": -0.08544921875, + "entropy_loss": -0.06640625, + "epoch": 0.7208, + "grad_norm": 0.9291665836154263, + "k1_kl": 0.08544921875, + "k3_kl": 0.060546875, + "kimi_kl": 0.1845703125, + "learning_rate": 1.396e-07, + "loss": 0.0024, + "ppl": 0.03369140625, + "reward": 0.9741773009300232, + "reward_std": 0.0014669517986476421, + "rewards/perpo_ocr_edit_distance_reward": 0.9741772413253784, + "step": 3604, + "temperature": 0.9 + }, + { + "advantages": -1.7583370208740234e-05, + "completion_length": 899.0, + "delta_ref_entropy_loss": 0.021240234375, + "delta_ref_ppl": -0.0517578125, + "entropy_loss": -0.060546875, + "epoch": 0.721, + "grad_norm": 0.8840604932601992, + "k1_kl": 0.051513671875, + "k3_kl": 0.03271484375, + "kimi_kl": 0.08203125, + "learning_rate": 1.395e-07, + "loss": 0.0013, + "ppl": 0.0301513671875, + "reward": 0.6516519784927368, + "reward_std": 0.0023213259410113096, + "rewards/perpo_ocr_edit_distance_reward": 0.6516519784927368, + "step": 3605, + "temperature": 0.9 + }, + { + "advantages": -8.514949634275126e-09, + "completion_length": 1229.0, + "delta_ref_entropy_loss": 0.028076171875, + "delta_ref_ppl": -0.0771484375, + "entropy_loss": -0.1435546875, + "epoch": 0.7212, + "grad_norm": 1.1292042783908174, + "k1_kl": 0.0771484375, + "k3_kl": 0.05322265625, + "kimi_kl": 0.1552734375, + "learning_rate": 1.394e-07, + "loss": 0.0021, + "ppl": 0.06201171875, + "reward": 0.914246678352356, + "reward_std": 0.05839132145047188, + "rewards/perpo_ocr_edit_distance_reward": 0.9142467379570007, + "step": 3606, + "temperature": 0.9 + }, + { + "advantages": -3.1198775104712695e-05, + "completion_length": 644.0, + "delta_ref_entropy_loss": 0.05517578125, + "delta_ref_ppl": -0.0986328125, + "entropy_loss": -0.09326171875, + "epoch": 0.7214, + "grad_norm": 0.9685426089615434, + "k1_kl": 0.0986328125, + "k3_kl": 0.0654296875, + "kimi_kl": 0.2275390625, + "learning_rate": 1.393e-07, + "loss": 0.0026, + "ppl": 0.045166015625, + "reward": 0.9649488925933838, + "reward_std": 0.0012642174260690808, + "rewards/perpo_ocr_edit_distance_reward": 0.9649489521980286, + "step": 3607, + "temperature": 0.9 + }, + { + "advantages": -1.8732889373040962e-07, + "completion_length": 1559.0, + "delta_ref_entropy_loss": -0.01171875, + "delta_ref_ppl": -0.0390625, + "entropy_loss": -0.146484375, + "epoch": 0.7216, + "grad_norm": 3.3826478097030166, + "k1_kl": 0.03857421875, + "k3_kl": 0.0341796875, + "kimi_kl": 0.07373046875, + "learning_rate": 1.3919999999999998e-07, + "loss": 0.0014, + "ppl": 0.06884765625, + "reward": 0.8887465000152588, + "reward_std": 0.20687174797058105, + "rewards/perpo_ocr_edit_distance_reward": 0.8887465596199036, + "step": 3608, + "temperature": 0.9 + }, + { + "advantages": -2.5868417651508935e-05, + "completion_length": 946.0, + "delta_ref_entropy_loss": 0.07421875, + "delta_ref_ppl": -0.06640625, + "entropy_loss": -0.1474609375, + "epoch": 0.7218, + "grad_norm": 1.4922420943639971, + "k1_kl": 0.06640625, + "k3_kl": 0.03271484375, + "kimi_kl": 0.061279296875, + "learning_rate": 1.391e-07, + "loss": 0.0013, + "ppl": 0.06689453125, + "reward": 0.9663398861885071, + "reward_std": 0.0012173473369330168, + "rewards/perpo_ocr_edit_distance_reward": 0.9663399457931519, + "step": 3609, + "temperature": 0.9 + }, + { + "advantages": -1.9746168618439697e-05, + "completion_length": 868.0, + "delta_ref_entropy_loss": 0.0191650390625, + "delta_ref_ppl": -0.0439453125, + "entropy_loss": -0.052001953125, + "epoch": 0.722, + "grad_norm": 0.5795429418587961, + "k1_kl": 0.0439453125, + "k3_kl": 0.029541015625, + "kimi_kl": 0.0849609375, + "learning_rate": 1.3900000000000001e-07, + "loss": 0.0012, + "ppl": 0.0201416015625, + "reward": 0.9834641814231873, + "reward_std": 0.004214141983538866, + "rewards/perpo_ocr_edit_distance_reward": 0.9834643006324768, + "step": 3610, + "temperature": 0.9 + }, + { + "advantages": -3.7806375985383056e-06, + "completion_length": 239.0, + "delta_ref_entropy_loss": 0.06201171875, + "delta_ref_ppl": -0.1533203125, + "entropy_loss": -0.166015625, + "epoch": 0.7222, + "grad_norm": 1.6411402164994502, + "k1_kl": 0.1533203125, + "k3_kl": 0.111328125, + "kimi_kl": 0.326171875, + "learning_rate": 1.3889999999999998e-07, + "loss": 0.0045, + "ppl": 0.0908203125, + "reward": 0.9604083299636841, + "reward_std": 0.004396864213049412, + "rewards/perpo_ocr_edit_distance_reward": 0.9604083895683289, + "step": 3611, + "temperature": 0.9 + }, + { + "advantages": 1.9822802642011084e-05, + "completion_length": 792.0, + "delta_ref_entropy_loss": 0.037109375, + "delta_ref_ppl": -0.04345703125, + "entropy_loss": -0.022705078125, + "epoch": 0.7224, + "grad_norm": 0.26975497368481155, + "k1_kl": 0.043701171875, + "k3_kl": 0.0235595703125, + "kimi_kl": 0.0859375, + "learning_rate": 1.388e-07, + "loss": 0.0009, + "ppl": 0.0076904296875, + "reward": 0.9985721707344055, + "reward_std": 0.0003297214861959219, + "rewards/perpo_ocr_edit_distance_reward": 0.9985722303390503, + "step": 3612, + "temperature": 0.9 + }, + { + "advantages": 2.4412360289716162e-05, + "completion_length": 351.0, + "delta_ref_entropy_loss": 0.04638671875, + "delta_ref_ppl": -0.08203125, + "entropy_loss": -0.056396484375, + "epoch": 0.7226, + "grad_norm": 0.27702160801992726, + "k1_kl": 0.08203125, + "k3_kl": 0.05859375, + "kimi_kl": 0.1962890625, + "learning_rate": 1.387e-07, + "loss": 0.0023, + "ppl": 0.017578125, + "reward": 0.9857872128486633, + "reward_std": 0.0009461721638217568, + "rewards/perpo_ocr_edit_distance_reward": 0.9857871532440186, + "step": 3613, + "temperature": 0.9 + }, + { + "advantages": -2.0197459889459424e-05, + "completion_length": 365.0, + "delta_ref_entropy_loss": 0.044921875, + "delta_ref_ppl": -0.1044921875, + "entropy_loss": -0.0654296875, + "epoch": 0.7228, + "grad_norm": 2.007415722907602, + "k1_kl": 0.10400390625, + "k3_kl": 0.06982421875, + "kimi_kl": 0.302734375, + "learning_rate": 1.3859999999999998e-07, + "loss": 0.0028, + "ppl": 0.0262451171875, + "reward": 0.9939881563186646, + "reward_std": 0.0015872996300458908, + "rewards/perpo_ocr_edit_distance_reward": 0.9939882159233093, + "step": 3614, + "temperature": 0.9 + }, + { + "advantages": -1.7506736185168847e-05, + "completion_length": 1523.0, + "delta_ref_entropy_loss": 0.01153564453125, + "delta_ref_ppl": -0.032958984375, + "entropy_loss": -0.07421875, + "epoch": 0.723, + "grad_norm": 0.6006870000191631, + "k1_kl": 0.032958984375, + "k3_kl": 0.024169921875, + "kimi_kl": 0.054443359375, + "learning_rate": 1.385e-07, + "loss": 0.001, + "ppl": 0.029541015625, + "reward": 0.8555425405502319, + "reward_std": 0.001358034205622971, + "rewards/perpo_ocr_edit_distance_reward": 0.8555425405502319, + "step": 3615, + "temperature": 0.9 + }, + { + "advantages": -6.014960308675654e-05, + "completion_length": 340.0, + "delta_ref_entropy_loss": 0.064453125, + "delta_ref_ppl": -0.0771484375, + "entropy_loss": -0.06396484375, + "epoch": 0.7232, + "grad_norm": 0.9688999313316258, + "k1_kl": 0.0771484375, + "k3_kl": 0.0458984375, + "kimi_kl": 0.1220703125, + "learning_rate": 1.384e-07, + "loss": 0.0019, + "ppl": 0.0223388671875, + "reward": 0.9936127662658691, + "reward_std": 0.001174095319584012, + "rewards/perpo_ocr_edit_distance_reward": 0.9936128854751587, + "step": 3616, + "temperature": 0.9 + }, + { + "advantages": -2.0350730665086303e-06, + "completion_length": 2048.0, + "delta_ref_entropy_loss": -0.01324462890625, + "delta_ref_ppl": -0.020751953125, + "entropy_loss": -0.236328125, + "epoch": 0.7234, + "grad_norm": 225.30373994427313, + "k1_kl": 0.020751953125, + "k3_kl": 1.9375, + "kimi_kl": 0.06396484375, + "learning_rate": 1.383e-07, + "loss": 0.0777, + "ppl": 0.14453125, + "reward": 0.8794937133789062, + "reward_std": 0.033073388040065765, + "rewards/perpo_ocr_edit_distance_reward": 0.8794938325881958, + "step": 3617, + "temperature": 0.9 + }, + { + "advantages": -1.4679772903036792e-05, + "completion_length": 698.0, + "delta_ref_entropy_loss": 0.0308837890625, + "delta_ref_ppl": -0.08984375, + "entropy_loss": -0.1669921875, + "epoch": 0.7236, + "grad_norm": 1.2808323321364996, + "k1_kl": 0.08984375, + "k3_kl": 0.06787109375, + "kimi_kl": 0.1728515625, + "learning_rate": 1.3819999999999998e-07, + "loss": 0.0027, + "ppl": 0.087890625, + "reward": 0.9579831957817078, + "reward_std": 0.003956078086048365, + "rewards/perpo_ocr_edit_distance_reward": 0.9579832553863525, + "step": 3618, + "temperature": 0.9 + }, + { + "advantages": -4.58104295830708e-06, + "completion_length": 219.0, + "delta_ref_entropy_loss": 0.033447265625, + "delta_ref_ppl": -0.2041015625, + "entropy_loss": -0.318359375, + "epoch": 0.7238, + "grad_norm": 6.688603529022061, + "k1_kl": 0.2041015625, + "k3_kl": 0.1533203125, + "kimi_kl": 0.6640625, + "learning_rate": 1.381e-07, + "loss": 0.0061, + "ppl": 0.126953125, + "reward": 0.9419105648994446, + "reward_std": 0.009222730994224548, + "rewards/perpo_ocr_edit_distance_reward": 0.9419106245040894, + "step": 3619, + "temperature": 0.9 + }, + { + "advantages": 1.7029899268550253e-08, + "completion_length": 1223.0, + "delta_ref_entropy_loss": 0.0191650390625, + "delta_ref_ppl": -0.0274658203125, + "entropy_loss": -0.03759765625, + "epoch": 0.724, + "grad_norm": 0.5019186014012548, + "k1_kl": 0.02734375, + "k3_kl": 0.01708984375, + "kimi_kl": 0.0419921875, + "learning_rate": 1.3800000000000002e-07, + "loss": 0.0007, + "ppl": 0.0152587890625, + "reward": 0.9932762980461121, + "reward_std": 0.0034481340553611517, + "rewards/perpo_ocr_edit_distance_reward": 0.9932763576507568, + "step": 3620, + "temperature": 0.9 + }, + { + "advantages": -0.00014209747314453125, + "completion_length": 615.0, + "delta_ref_entropy_loss": 0.06787109375, + "delta_ref_ppl": -0.0947265625, + "entropy_loss": -0.06298828125, + "epoch": 0.7242, + "grad_norm": 0.5560451457207061, + "k1_kl": 0.0947265625, + "k3_kl": 0.05859375, + "kimi_kl": 0.2021484375, + "learning_rate": 1.3789999999999998e-07, + "loss": 0.0025, + "ppl": 0.0174560546875, + "reward": 0.9524946808815002, + "reward_std": 0.0004992239992134273, + "rewards/perpo_ocr_edit_distance_reward": 0.9524948000907898, + "step": 3621, + "temperature": 0.9 + }, + { + "advantages": -2.4591174224042334e-05, + "completion_length": 831.0, + "delta_ref_entropy_loss": 0.0233154296875, + "delta_ref_ppl": -0.061767578125, + "entropy_loss": -0.1123046875, + "epoch": 0.7244, + "grad_norm": 0.8665021810117585, + "k1_kl": 0.061767578125, + "k3_kl": 0.03955078125, + "kimi_kl": 0.09326171875, + "learning_rate": 1.378e-07, + "loss": 0.0016, + "ppl": 0.048095703125, + "reward": 0.9921157956123352, + "reward_std": 0.0012856475077569485, + "rewards/perpo_ocr_edit_distance_reward": 0.9921157956123352, + "step": 3622, + "temperature": 0.9 + }, + { + "advantages": 7.66345493730114e-08, + "completion_length": 535.0, + "delta_ref_entropy_loss": -0.1982421875, + "delta_ref_ppl": -0.11572265625, + "entropy_loss": -1.0546875, + "epoch": 0.7246, + "grad_norm": 5.162112625707997, + "k1_kl": 0.11669921875, + "k3_kl": 0.1328125, + "kimi_kl": 0.248046875, + "learning_rate": 1.377e-07, + "loss": 0.0053, + "ppl": 0.578125, + "reward": 0.391684889793396, + "reward_std": 0.05886281654238701, + "rewards/perpo_ocr_edit_distance_reward": 0.391684889793396, + "step": 3623, + "temperature": 0.9 + }, + { + "advantages": -5.34483406227082e-05, + "completion_length": 539.0, + "delta_ref_entropy_loss": 0.0174560546875, + "delta_ref_ppl": -0.045166015625, + "entropy_loss": -0.0498046875, + "epoch": 0.7248, + "grad_norm": 0.575459198131389, + "k1_kl": 0.045166015625, + "k3_kl": 0.03076171875, + "kimi_kl": 0.0849609375, + "learning_rate": 1.376e-07, + "loss": 0.0013, + "ppl": 0.0189208984375, + "reward": 0.9968487024307251, + "reward_std": 0.0006962029146961868, + "rewards/perpo_ocr_edit_distance_reward": 0.9968487620353699, + "step": 3624, + "temperature": 0.9 + }, + { + "advantages": -0.00014776844182051718, + "completion_length": 446.0, + "delta_ref_entropy_loss": 0.046142578125, + "delta_ref_ppl": -0.0908203125, + "entropy_loss": -0.050048828125, + "epoch": 0.725, + "grad_norm": 0.3915184007047923, + "k1_kl": 0.0908203125, + "k3_kl": 0.06103515625, + "kimi_kl": 0.2451171875, + "learning_rate": 1.375e-07, + "loss": 0.0026, + "ppl": 0.0191650390625, + "reward": 0.9860144257545471, + "reward_std": 0.0003609956766013056, + "rewards/perpo_ocr_edit_distance_reward": 0.9860145449638367, + "step": 3625, + "temperature": 0.9 + }, + { + "advantages": -3.995214501628652e-05, + "completion_length": 989.0, + "delta_ref_entropy_loss": 0.003021240234375, + "delta_ref_ppl": -0.028076171875, + "entropy_loss": -0.033447265625, + "epoch": 0.7252, + "grad_norm": 0.533621282345669, + "k1_kl": 0.028076171875, + "k3_kl": 0.019287109375, + "kimi_kl": 0.052490234375, + "learning_rate": 1.374e-07, + "loss": 0.0008, + "ppl": 0.013427734375, + "reward": 0.9976255297660828, + "reward_std": 0.0007527558482252061, + "rewards/perpo_ocr_edit_distance_reward": 0.9976255893707275, + "step": 3626, + "temperature": 0.9 + }, + { + "advantages": -4.1833947761915624e-05, + "completion_length": 619.0, + "delta_ref_entropy_loss": 0.0018157958984375, + "delta_ref_ppl": -0.07861328125, + "entropy_loss": -0.15625, + "epoch": 0.7254, + "grad_norm": 0.6312698247458339, + "k1_kl": 0.07861328125, + "k3_kl": 0.05615234375, + "kimi_kl": 0.1455078125, + "learning_rate": 1.373e-07, + "loss": 0.0023, + "ppl": 0.058837890625, + "reward": 0.9756807684898376, + "reward_std": 0.0021387608721852303, + "rewards/perpo_ocr_edit_distance_reward": 0.9756808280944824, + "step": 3627, + "temperature": 0.9 + }, + { + "advantages": -1.0064670277643017e-05, + "completion_length": 491.0, + "delta_ref_entropy_loss": -0.00885009765625, + "delta_ref_ppl": -0.049560546875, + "entropy_loss": -0.08203125, + "epoch": 0.7256, + "grad_norm": 0.6360423615854767, + "k1_kl": 0.0498046875, + "k3_kl": 0.050537109375, + "kimi_kl": 0.1435546875, + "learning_rate": 1.3719999999999998e-07, + "loss": 0.002, + "ppl": 0.025146484375, + "reward": 0.922311544418335, + "reward_std": 0.0007442649221047759, + "rewards/perpo_ocr_edit_distance_reward": 0.9223116636276245, + "step": 3628, + "temperature": 0.9 + }, + { + "advantages": -2.55448497910038e-07, + "completion_length": 187.0, + "delta_ref_entropy_loss": -0.033935546875, + "delta_ref_ppl": -0.255859375, + "entropy_loss": -0.416015625, + "epoch": 0.7258, + "grad_norm": 5.282088190328145, + "k1_kl": 0.255859375, + "k3_kl": 0.2255859375, + "kimi_kl": 0.7890625, + "learning_rate": 1.371e-07, + "loss": 0.009, + "ppl": 0.1767578125, + "reward": 0.3722405731678009, + "reward_std": 0.1151689887046814, + "rewards/perpo_ocr_edit_distance_reward": 0.3722406029701233, + "step": 3629, + "temperature": 0.9 + }, + { + "advantages": -3.528594970703125e-05, + "completion_length": 775.0, + "delta_ref_entropy_loss": -0.029052734375, + "delta_ref_ppl": -0.072265625, + "entropy_loss": -0.470703125, + "epoch": 0.726, + "grad_norm": 2.305687806511618, + "k1_kl": 0.07177734375, + "k3_kl": 0.05859375, + "kimi_kl": 0.130859375, + "learning_rate": 1.37e-07, + "loss": 0.0024, + "ppl": 0.2353515625, + "reward": 0.9015998840332031, + "reward_std": 0.0015892450464889407, + "rewards/perpo_ocr_edit_distance_reward": 0.9015999436378479, + "step": 3630, + "temperature": 0.9 + }, + { + "advantages": -8.806160622043535e-05, + "completion_length": 446.0, + "delta_ref_entropy_loss": 0.05908203125, + "delta_ref_ppl": -0.10009765625, + "entropy_loss": -0.068359375, + "epoch": 0.7262, + "grad_norm": 0.793191879738387, + "k1_kl": 0.1005859375, + "k3_kl": 0.0654296875, + "kimi_kl": 0.177734375, + "learning_rate": 1.369e-07, + "loss": 0.0027, + "ppl": 0.0201416015625, + "reward": 0.9982966780662537, + "reward_std": 0.0005768190021626651, + "rewards/perpo_ocr_edit_distance_reward": 0.9982967376708984, + "step": 3631, + "temperature": 0.9 + }, + { + "advantages": -8.630753291072324e-05, + "completion_length": 553.0, + "delta_ref_entropy_loss": 0.042236328125, + "delta_ref_ppl": -0.068359375, + "entropy_loss": -0.0498046875, + "epoch": 0.7264, + "grad_norm": 0.6062843035214117, + "k1_kl": 0.068359375, + "k3_kl": 0.04296875, + "kimi_kl": 0.12158203125, + "learning_rate": 1.368e-07, + "loss": 0.0018, + "ppl": 0.01513671875, + "reward": 0.9964376091957092, + "reward_std": 0.0005905781290493906, + "rewards/perpo_ocr_edit_distance_reward": 0.996437668800354, + "step": 3632, + "temperature": 0.9 + }, + { + "advantages": -0.00021810192265547812, + "completion_length": 785.0, + "delta_ref_entropy_loss": 0.0277099609375, + "delta_ref_ppl": -0.04150390625, + "entropy_loss": -0.0260009765625, + "epoch": 0.7266, + "grad_norm": 0.10694253970163958, + "k1_kl": 0.04150390625, + "k3_kl": 0.023681640625, + "kimi_kl": 0.07763671875, + "learning_rate": 1.3669999999999997e-07, + "loss": 0.0012, + "ppl": 0.004913330078125, + "reward": 0.9994602203369141, + "reward_std": 9.519954619463533e-05, + "rewards/perpo_ocr_edit_distance_reward": 0.9994602799415588, + "step": 3633, + "temperature": 0.9 + }, + { + "advantages": -3.759350147447549e-05, + "completion_length": 651.0, + "delta_ref_entropy_loss": 0.06298828125, + "delta_ref_ppl": -0.11865234375, + "entropy_loss": -0.30859375, + "epoch": 0.7268, + "grad_norm": 1.7896348231723358, + "k1_kl": 0.1181640625, + "k3_kl": 0.08447265625, + "kimi_kl": 0.1533203125, + "learning_rate": 1.366e-07, + "loss": 0.0034, + "ppl": 0.1474609375, + "reward": 0.869612991809845, + "reward_std": 0.0028460482135415077, + "rewards/perpo_ocr_edit_distance_reward": 0.8696131706237793, + "step": 3634, + "temperature": 0.9 + }, + { + "advantages": -3.867915802402422e-05, + "completion_length": 630.0, + "delta_ref_entropy_loss": 0.047119140625, + "delta_ref_ppl": -0.08056640625, + "entropy_loss": -0.11767578125, + "epoch": 0.727, + "grad_norm": 1.440901421118049, + "k1_kl": 0.08056640625, + "k3_kl": 0.052001953125, + "kimi_kl": 0.1416015625, + "learning_rate": 1.365e-07, + "loss": 0.0021, + "ppl": 0.049072265625, + "reward": 0.8992680907249451, + "reward_std": 0.0010006519732996821, + "rewards/perpo_ocr_edit_distance_reward": 0.8992681503295898, + "step": 3635, + "temperature": 0.9 + }, + { + "advantages": -0.00013218607637099922, + "completion_length": 884.0, + "delta_ref_entropy_loss": 0.0322265625, + "delta_ref_ppl": -0.043701171875, + "entropy_loss": -0.03857421875, + "epoch": 0.7272, + "grad_norm": 0.5449700910918098, + "k1_kl": 0.04345703125, + "k3_kl": 0.0255126953125, + "kimi_kl": 0.06982421875, + "learning_rate": 1.3639999999999998e-07, + "loss": 0.0012, + "ppl": 0.01251220703125, + "reward": 0.9947121143341064, + "reward_std": 0.00047974090557545424, + "rewards/perpo_ocr_edit_distance_reward": 0.9947121739387512, + "step": 3636, + "temperature": 0.9 + }, + { + "advantages": -9.499277803115547e-05, + "completion_length": 693.0, + "delta_ref_entropy_loss": 0.034912109375, + "delta_ref_ppl": -0.060546875, + "entropy_loss": -0.07275390625, + "epoch": 0.7274, + "grad_norm": 0.612286876608002, + "k1_kl": 0.060546875, + "k3_kl": 0.035400390625, + "kimi_kl": 0.08251953125, + "learning_rate": 1.363e-07, + "loss": 0.0015, + "ppl": 0.021728515625, + "reward": 0.9857097864151001, + "reward_std": 0.0007066893740557134, + "rewards/perpo_ocr_edit_distance_reward": 0.9857099056243896, + "step": 3637, + "temperature": 0.9 + }, + { + "advantages": 5.551747108256677e-06, + "completion_length": 737.0, + "delta_ref_entropy_loss": 0.07177734375, + "delta_ref_ppl": -0.0859375, + "entropy_loss": -0.1435546875, + "epoch": 0.7276, + "grad_norm": 0.9926760133013184, + "k1_kl": 0.0859375, + "k3_kl": 0.049560546875, + "kimi_kl": 0.115234375, + "learning_rate": 1.362e-07, + "loss": 0.002, + "ppl": 0.056884765625, + "reward": 0.8137223720550537, + "reward_std": 0.0014362584333866835, + "rewards/perpo_ocr_edit_distance_reward": 0.8137223720550537, + "step": 3638, + "temperature": 0.9 + }, + { + "advantages": -2.588544703030493e-05, + "completion_length": 477.0, + "delta_ref_entropy_loss": 0.047119140625, + "delta_ref_ppl": -0.0712890625, + "entropy_loss": -0.06884765625, + "epoch": 0.7278, + "grad_norm": 0.7170023905787084, + "k1_kl": 0.07177734375, + "k3_kl": 0.0458984375, + "kimi_kl": 0.169921875, + "learning_rate": 1.3609999999999998e-07, + "loss": 0.0019, + "ppl": 0.02197265625, + "reward": 0.9972929358482361, + "reward_std": 0.0008865774143487215, + "rewards/perpo_ocr_edit_distance_reward": 0.9972929954528809, + "step": 3639, + "temperature": 0.9 + }, + { + "advantages": -1.6757421690272167e-05, + "completion_length": 434.0, + "delta_ref_entropy_loss": 0.055908203125, + "delta_ref_ppl": -0.09423828125, + "entropy_loss": -0.07080078125, + "epoch": 0.728, + "grad_norm": 0.982553253231195, + "k1_kl": 0.09423828125, + "k3_kl": 0.058837890625, + "kimi_kl": 0.1787109375, + "learning_rate": 1.36e-07, + "loss": 0.0024, + "ppl": 0.0255126953125, + "reward": 0.9778844714164734, + "reward_std": 0.0019339901627972722, + "rewards/perpo_ocr_edit_distance_reward": 0.9778844714164734, + "step": 3640, + "temperature": 0.9 + }, + { + "advantages": -9.16208591661416e-06, + "completion_length": 1689.0, + "delta_ref_entropy_loss": 0.01312255859375, + "delta_ref_ppl": -0.04052734375, + "entropy_loss": -0.1220703125, + "epoch": 0.7282, + "grad_norm": 0.9926134599902151, + "k1_kl": 0.04052734375, + "k3_kl": 0.04248046875, + "kimi_kl": 0.08349609375, + "learning_rate": 1.359e-07, + "loss": 0.0017, + "ppl": 0.06494140625, + "reward": 0.9326367378234863, + "reward_std": 0.0017606116598472, + "rewards/perpo_ocr_edit_distance_reward": 0.9326367378234863, + "step": 3641, + "temperature": 0.9 + }, + { + "advantages": 3.6103385809838073e-06, + "completion_length": 825.0, + "delta_ref_entropy_loss": 0.017333984375, + "delta_ref_ppl": -0.03369140625, + "entropy_loss": -0.039306640625, + "epoch": 0.7284, + "grad_norm": 0.6870388428727574, + "k1_kl": 0.03369140625, + "k3_kl": 0.0185546875, + "kimi_kl": 0.04541015625, + "learning_rate": 1.3579999999999999e-07, + "loss": 0.0007, + "ppl": 0.01165771484375, + "reward": 0.993622899055481, + "reward_std": 0.0022558222990483046, + "rewards/perpo_ocr_edit_distance_reward": 0.9936229586601257, + "step": 3642, + "temperature": 0.9 + }, + { + "advantages": -1.4015607121109497e-05, + "completion_length": 511.0, + "delta_ref_entropy_loss": -0.037841796875, + "delta_ref_ppl": -0.07177734375, + "entropy_loss": -0.2470703125, + "epoch": 0.7286, + "grad_norm": 1.5340408113419428, + "k1_kl": 0.07177734375, + "k3_kl": 0.060791015625, + "kimi_kl": 0.18359375, + "learning_rate": 1.3569999999999998e-07, + "loss": 0.0024, + "ppl": 0.07861328125, + "reward": 0.8469638228416443, + "reward_std": 0.00537087582051754, + "rewards/perpo_ocr_edit_distance_reward": 0.8469639420509338, + "step": 3643, + "temperature": 0.9 + }, + { + "advantages": -2.392700844211504e-05, + "completion_length": 945.0, + "delta_ref_entropy_loss": 0.046142578125, + "delta_ref_ppl": -0.09716796875, + "entropy_loss": -0.11962890625, + "epoch": 0.7288, + "grad_norm": 2.2800467308539223, + "k1_kl": 0.0966796875, + "k3_kl": 0.0654296875, + "kimi_kl": 0.2109375, + "learning_rate": 1.356e-07, + "loss": 0.0026, + "ppl": 0.0595703125, + "reward": 0.9644089937210083, + "reward_std": 0.0009671343141235411, + "rewards/perpo_ocr_edit_distance_reward": 0.9644090533256531, + "step": 3644, + "temperature": 0.9 + }, + { + "advantages": -3.477505379123613e-05, + "completion_length": 619.0, + "delta_ref_entropy_loss": 0.038818359375, + "delta_ref_ppl": -0.0908203125, + "entropy_loss": -0.064453125, + "epoch": 0.729, + "grad_norm": 0.5548043664826978, + "k1_kl": 0.0908203125, + "k3_kl": 0.061767578125, + "kimi_kl": 0.2216796875, + "learning_rate": 1.3550000000000002e-07, + "loss": 0.0025, + "ppl": 0.02685546875, + "reward": 0.4627690613269806, + "reward_std": 0.0008790649590082467, + "rewards/perpo_ocr_edit_distance_reward": 0.462769091129303, + "step": 3645, + "temperature": 0.9 + }, + { + "advantages": -5.5040633014868945e-05, + "completion_length": 456.0, + "delta_ref_entropy_loss": 0.09375, + "delta_ref_ppl": -0.11328125, + "entropy_loss": -0.2216796875, + "epoch": 0.7292, + "grad_norm": 1.8687097444193652, + "k1_kl": 0.11279296875, + "k3_kl": 0.0693359375, + "kimi_kl": 0.1494140625, + "learning_rate": 1.3539999999999998e-07, + "loss": 0.0028, + "ppl": 0.10986328125, + "reward": 0.8957992196083069, + "reward_std": 0.0012922262540087104, + "rewards/perpo_ocr_edit_distance_reward": 0.8957992792129517, + "step": 3646, + "temperature": 0.9 + }, + { + "advantages": -5.364844037103467e-05, + "completion_length": 539.0, + "delta_ref_entropy_loss": 0.047607421875, + "delta_ref_ppl": -0.08251953125, + "entropy_loss": -0.0908203125, + "epoch": 0.7294, + "grad_norm": 0.7713438334818796, + "k1_kl": 0.08251953125, + "k3_kl": 0.049072265625, + "kimi_kl": 0.1328125, + "learning_rate": 1.353e-07, + "loss": 0.002, + "ppl": 0.033203125, + "reward": 0.9521428942680359, + "reward_std": 0.0008521616691723466, + "rewards/perpo_ocr_edit_distance_reward": 0.9521428942680359, + "step": 3647, + "temperature": 0.9 + }, + { + "advantages": -1.9073486328125e-06, + "completion_length": 66.0, + "delta_ref_entropy_loss": -0.033935546875, + "delta_ref_ppl": -0.5234375, + "entropy_loss": -0.46875, + "epoch": 0.7296, + "grad_norm": 7.425894242507479, + "k1_kl": 0.52734375, + "k3_kl": 0.443359375, + "kimi_kl": 1.6953125, + "learning_rate": 1.352e-07, + "loss": 0.0177, + "ppl": 0.2001953125, + "reward": 0.7492877840995789, + "reward_std": 0.017792029306292534, + "rewards/perpo_ocr_edit_distance_reward": 0.7492877840995789, + "step": 3648, + "temperature": 0.9 + }, + { + "advantages": -3.349781036376953e-05, + "completion_length": 881.0, + "delta_ref_entropy_loss": 0.054443359375, + "delta_ref_ppl": -0.0791015625, + "entropy_loss": -0.033447265625, + "epoch": 0.7298, + "grad_norm": 0.7848385976816524, + "k1_kl": 0.0791015625, + "k3_kl": 0.046875, + "kimi_kl": 0.16015625, + "learning_rate": 1.3509999999999999e-07, + "loss": 0.0019, + "ppl": 0.0098876953125, + "reward": 0.9770053029060364, + "reward_std": 0.0004084752581547946, + "rewards/perpo_ocr_edit_distance_reward": 0.9770053029060364, + "step": 3649, + "temperature": 0.9 + }, + { + "advantages": 3.2527107123314636e-06, + "completion_length": 548.0, + "delta_ref_entropy_loss": 0.0234375, + "delta_ref_ppl": -0.10107421875, + "entropy_loss": -0.63671875, + "epoch": 0.73, + "grad_norm": 2.436583158462591, + "k1_kl": 0.1005859375, + "k3_kl": 0.07373046875, + "kimi_kl": 0.12255859375, + "learning_rate": 1.35e-07, + "loss": 0.0029, + "ppl": 0.33984375, + "reward": 0.7851746678352356, + "reward_std": 0.0025264101568609476, + "rewards/perpo_ocr_edit_distance_reward": 0.7851747274398804, + "step": 3650, + "temperature": 0.9 + }, + { + "advantages": -2.22665930778021e-05, + "completion_length": 319.0, + "delta_ref_entropy_loss": 0.050537109375, + "delta_ref_ppl": -0.15234375, + "entropy_loss": -0.078125, + "epoch": 0.7302, + "grad_norm": 0.9212768733423398, + "k1_kl": 0.1533203125, + "k3_kl": 0.1123046875, + "kimi_kl": 0.48046875, + "learning_rate": 1.349e-07, + "loss": 0.0045, + "ppl": 0.03076171875, + "reward": 0.9943153262138367, + "reward_std": 0.0021960726007819176, + "rewards/perpo_ocr_edit_distance_reward": 0.9943153262138367, + "step": 3651, + "temperature": 0.9 + }, + { + "advantages": -1.7472677427576855e-05, + "completion_length": 508.0, + "delta_ref_entropy_loss": 0.0157470703125, + "delta_ref_ppl": -0.0830078125, + "entropy_loss": -0.115234375, + "epoch": 0.7304, + "grad_norm": 0.5376215725822538, + "k1_kl": 0.08251953125, + "k3_kl": 0.06591796875, + "kimi_kl": 0.2119140625, + "learning_rate": 1.348e-07, + "loss": 0.0026, + "ppl": 0.042236328125, + "reward": 0.9757347702980042, + "reward_std": 0.0008760601631365716, + "rewards/perpo_ocr_edit_distance_reward": 0.9757348299026489, + "step": 3652, + "temperature": 0.9 + }, + { + "advantages": -6.471361848525703e-05, + "completion_length": 1368.0, + "delta_ref_entropy_loss": 0.040283203125, + "delta_ref_ppl": -0.06103515625, + "entropy_loss": -0.0703125, + "epoch": 0.7306, + "grad_norm": 1.0997643071578451, + "k1_kl": 0.06103515625, + "k3_kl": 0.036376953125, + "kimi_kl": 0.08203125, + "learning_rate": 1.3469999999999998e-07, + "loss": 0.0015, + "ppl": 0.03564453125, + "reward": 0.9943922758102417, + "reward_std": 0.0012157870223745704, + "rewards/perpo_ocr_edit_distance_reward": 0.9943923950195312, + "step": 3653, + "temperature": 0.9 + }, + { + "advantages": -8.514949634275126e-09, + "completion_length": 873.0, + "delta_ref_entropy_loss": 0.0322265625, + "delta_ref_ppl": -0.0615234375, + "entropy_loss": -0.046142578125, + "epoch": 0.7308, + "grad_norm": 0.5771667010377515, + "k1_kl": 0.0615234375, + "k3_kl": 0.038818359375, + "kimi_kl": 0.1259765625, + "learning_rate": 1.346e-07, + "loss": 0.0016, + "ppl": 0.0169677734375, + "reward": 0.9912087917327881, + "reward_std": 0.0006552304839715362, + "rewards/perpo_ocr_edit_distance_reward": 0.9912087917327881, + "step": 3654, + "temperature": 0.9 + }, + { + "advantages": -4.3353866203688085e-05, + "completion_length": 426.0, + "delta_ref_entropy_loss": 0.06982421875, + "delta_ref_ppl": -0.15625, + "entropy_loss": -0.1513671875, + "epoch": 0.731, + "grad_norm": 1.5993534599702335, + "k1_kl": 0.15625, + "k3_kl": 0.10791015625, + "kimi_kl": 0.37109375, + "learning_rate": 1.345e-07, + "loss": 0.0044, + "ppl": 0.06640625, + "reward": 0.9105000495910645, + "reward_std": 0.0014710458926856518, + "rewards/perpo_ocr_edit_distance_reward": 0.9105001091957092, + "step": 3655, + "temperature": 0.9 + }, + { + "advantages": 3.1335014227806823e-06, + "completion_length": 30.0, + "delta_ref_entropy_loss": 0.003631591796875, + "delta_ref_ppl": -0.34765625, + "entropy_loss": -0.1923828125, + "epoch": 0.7312, + "grad_norm": 11.611077133221515, + "k1_kl": 0.34765625, + "k3_kl": 0.2451171875, + "kimi_kl": 0.765625, + "learning_rate": 1.3439999999999999e-07, + "loss": 0.0098, + "ppl": 0.140625, + "reward": 0.9139783978462219, + "reward_std": 0.010752706788480282, + "rewards/perpo_ocr_edit_distance_reward": 0.9139784574508667, + "step": 3656, + "temperature": 0.9 + }, + { + "advantages": 2.913815842475742e-05, + "completion_length": 403.0, + "delta_ref_entropy_loss": 0.038818359375, + "delta_ref_ppl": -0.0595703125, + "entropy_loss": -0.03759765625, + "epoch": 0.7314, + "grad_norm": 0.6767513705735542, + "k1_kl": 0.0595703125, + "k3_kl": 0.036376953125, + "kimi_kl": 0.10595703125, + "learning_rate": 1.343e-07, + "loss": 0.0014, + "ppl": 0.01495361328125, + "reward": 0.9989998936653137, + "reward_std": 0.00048450531903654337, + "rewards/perpo_ocr_edit_distance_reward": 0.9989998936653137, + "step": 3657, + "temperature": 0.9 + }, + { + "advantages": -2.660070276760962e-05, + "completion_length": 920.0, + "delta_ref_entropy_loss": 0.01458740234375, + "delta_ref_ppl": -0.060302734375, + "entropy_loss": -0.103515625, + "epoch": 0.7316, + "grad_norm": 2.9545016196031306, + "k1_kl": 0.06005859375, + "k3_kl": 0.04248046875, + "kimi_kl": 0.10009765625, + "learning_rate": 1.342e-07, + "loss": 0.0017, + "ppl": 0.050048828125, + "reward": 0.9786914587020874, + "reward_std": 0.0037414308171719313, + "rewards/perpo_ocr_edit_distance_reward": 0.978691577911377, + "step": 3658, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 265.0, + "delta_ref_entropy_loss": 0.03466796875, + "delta_ref_ppl": -0.10400390625, + "entropy_loss": -0.09912109375, + "epoch": 0.7318, + "grad_norm": 1.1297343062857412, + "k1_kl": 0.10400390625, + "k3_kl": 0.08447265625, + "kimi_kl": 0.21484375, + "learning_rate": 1.341e-07, + "loss": 0.0034, + "ppl": 0.0478515625, + "reward": 0.9772454500198364, + "reward_std": 0.000977813033387065, + "rewards/perpo_ocr_edit_distance_reward": 0.9772455096244812, + "step": 3659, + "temperature": 0.9 + }, + { + "advantages": -1.2193408110761084e-05, + "completion_length": 374.0, + "delta_ref_entropy_loss": 0.04931640625, + "delta_ref_ppl": -0.0927734375, + "entropy_loss": -0.059326171875, + "epoch": 0.732, + "grad_norm": 0.6010832970325514, + "k1_kl": 0.09228515625, + "k3_kl": 0.062255859375, + "kimi_kl": 0.2021484375, + "learning_rate": 1.34e-07, + "loss": 0.0025, + "ppl": 0.02734375, + "reward": 0.8230507969856262, + "reward_std": 0.0012981918407604098, + "rewards/perpo_ocr_edit_distance_reward": 0.823050856590271, + "step": 3660, + "temperature": 0.9 + }, + { + "advantages": -5.514281292562373e-05, + "completion_length": 647.0, + "delta_ref_entropy_loss": 0.0306396484375, + "delta_ref_ppl": -0.046875, + "entropy_loss": -0.038818359375, + "epoch": 0.7322, + "grad_norm": 0.2822270661324258, + "k1_kl": 0.046875, + "k3_kl": 0.0294189453125, + "kimi_kl": 0.09814453125, + "learning_rate": 1.3389999999999997e-07, + "loss": 0.0012, + "ppl": 0.0103759765625, + "reward": 0.993540346622467, + "reward_std": 0.0003633407468441874, + "rewards/perpo_ocr_edit_distance_reward": 0.9935404062271118, + "step": 3661, + "temperature": 0.9 + }, + { + "advantages": -2.7196749215363525e-05, + "completion_length": 337.0, + "delta_ref_entropy_loss": 0.046875, + "delta_ref_ppl": -0.09033203125, + "entropy_loss": -0.1318359375, + "epoch": 0.7324, + "grad_norm": 1.5697576960193493, + "k1_kl": 0.09033203125, + "k3_kl": 0.057861328125, + "kimi_kl": 0.1103515625, + "learning_rate": 1.338e-07, + "loss": 0.0023, + "ppl": 0.0712890625, + "reward": 0.8814311623573303, + "reward_std": 0.001465782755985856, + "rewards/perpo_ocr_edit_distance_reward": 0.8814312219619751, + "step": 3662, + "temperature": 0.9 + }, + { + "advantages": -5.28267482877709e-05, + "completion_length": 526.0, + "delta_ref_entropy_loss": 0.042724609375, + "delta_ref_ppl": -0.08203125, + "entropy_loss": -0.06298828125, + "epoch": 0.7326, + "grad_norm": 0.5254906131986603, + "k1_kl": 0.08154296875, + "k3_kl": 0.052490234375, + "kimi_kl": 0.150390625, + "learning_rate": 1.337e-07, + "loss": 0.0022, + "ppl": 0.022705078125, + "reward": 0.9971388578414917, + "reward_std": 0.0011895106872543693, + "rewards/perpo_ocr_edit_distance_reward": 0.9971389770507812, + "step": 3663, + "temperature": 0.9 + }, + { + "advantages": -1.0183879567193799e-05, + "completion_length": 1067.0, + "delta_ref_entropy_loss": -0.0027923583984375, + "delta_ref_ppl": -0.0654296875, + "entropy_loss": -0.369140625, + "epoch": 0.7328, + "grad_norm": 1.441058658552119, + "k1_kl": 0.06494140625, + "k3_kl": 0.0478515625, + "kimi_kl": 0.09765625, + "learning_rate": 1.3359999999999998e-07, + "loss": 0.0019, + "ppl": 0.1611328125, + "reward": 0.9230026602745056, + "reward_std": 0.006574866361916065, + "rewards/perpo_ocr_edit_distance_reward": 0.9230027198791504, + "step": 3664, + "temperature": 0.9 + }, + { + "advantages": -3.4059798537100505e-08, + "completion_length": 209.0, + "delta_ref_entropy_loss": -0.2392578125, + "delta_ref_ppl": -0.349609375, + "entropy_loss": -0.71484375, + "epoch": 0.733, + "grad_norm": 5.531908768161475, + "k1_kl": 0.349609375, + "k3_kl": 0.33203125, + "kimi_kl": 1.34375, + "learning_rate": 1.335e-07, + "loss": 0.0133, + "ppl": 0.34375, + "reward": 0.5669644474983215, + "reward_std": 0.16095691919326782, + "rewards/perpo_ocr_edit_distance_reward": 0.5669644474983215, + "step": 3665, + "temperature": 0.9 + }, + { + "advantages": -2.09978661587229e-05, + "completion_length": 457.0, + "delta_ref_entropy_loss": 0.061767578125, + "delta_ref_ppl": -0.09619140625, + "entropy_loss": -0.1796875, + "epoch": 0.7332, + "grad_norm": 1.383926996594221, + "k1_kl": 0.09619140625, + "k3_kl": 0.058349609375, + "kimi_kl": 0.126953125, + "learning_rate": 1.334e-07, + "loss": 0.0024, + "ppl": 0.078125, + "reward": 0.9110826849937439, + "reward_std": 0.004358563106507063, + "rewards/perpo_ocr_edit_distance_reward": 0.9110827445983887, + "step": 3666, + "temperature": 0.9 + }, + { + "advantages": -4.76837158203125e-07, + "completion_length": 229.0, + "delta_ref_entropy_loss": -0.2197265625, + "delta_ref_ppl": -0.1328125, + "entropy_loss": -0.478515625, + "epoch": 0.7334, + "grad_norm": 3.3449010340817, + "k1_kl": 0.1318359375, + "k3_kl": 0.1484375, + "kimi_kl": 0.4140625, + "learning_rate": 1.3329999999999998e-07, + "loss": 0.0059, + "ppl": 0.181640625, + "reward": 0.49797841906547546, + "reward_std": 0.027630571275949478, + "rewards/perpo_ocr_edit_distance_reward": 0.49797844886779785, + "step": 3667, + "temperature": 0.9 + }, + { + "advantages": -1.5565328794764355e-05, + "completion_length": 413.0, + "delta_ref_entropy_loss": 0.060302734375, + "delta_ref_ppl": -0.166015625, + "entropy_loss": -0.1689453125, + "epoch": 0.7336, + "grad_norm": 1.3362953891307285, + "k1_kl": 0.166015625, + "k3_kl": 0.1142578125, + "kimi_kl": 0.390625, + "learning_rate": 1.332e-07, + "loss": 0.0046, + "ppl": 0.07421875, + "reward": 0.9473186731338501, + "reward_std": 0.0026343080680817366, + "rewards/perpo_ocr_edit_distance_reward": 0.9473187923431396, + "step": 3668, + "temperature": 0.9 + }, + { + "advantages": -6.159714394016191e-05, + "completion_length": 257.0, + "delta_ref_entropy_loss": 0.08447265625, + "delta_ref_ppl": -0.1875, + "entropy_loss": -0.0869140625, + "epoch": 0.7338, + "grad_norm": 1.1024644370305159, + "k1_kl": 0.1875, + "k3_kl": 0.140625, + "kimi_kl": 0.5234375, + "learning_rate": 1.331e-07, + "loss": 0.0057, + "ppl": 0.0341796875, + "reward": 0.9952744245529175, + "reward_std": 0.001282318844459951, + "rewards/perpo_ocr_edit_distance_reward": 0.995274543762207, + "step": 3669, + "temperature": 0.9 + }, + { + "advantages": -7.935933354019653e-06, + "completion_length": 807.0, + "delta_ref_entropy_loss": 0.007049560546875, + "delta_ref_ppl": -0.04150390625, + "entropy_loss": -0.123046875, + "epoch": 0.734, + "grad_norm": 0.7323058901683238, + "k1_kl": 0.04150390625, + "k3_kl": 0.0267333984375, + "kimi_kl": 0.06982421875, + "learning_rate": 1.33e-07, + "loss": 0.0011, + "ppl": 0.0439453125, + "reward": 0.97526615858078, + "reward_std": 0.003119412576779723, + "rewards/perpo_ocr_edit_distance_reward": 0.9752662181854248, + "step": 3670, + "temperature": 0.9 + }, + { + "advantages": -7.580859528388828e-05, + "completion_length": 747.0, + "delta_ref_entropy_loss": 0.022705078125, + "delta_ref_ppl": -0.0419921875, + "entropy_loss": -0.04150390625, + "epoch": 0.7342, + "grad_norm": 0.7365250397371733, + "k1_kl": 0.0419921875, + "k3_kl": 0.0284423828125, + "kimi_kl": 0.06689453125, + "learning_rate": 1.3289999999999998e-07, + "loss": 0.0012, + "ppl": 0.0166015625, + "reward": 0.9974709749221802, + "reward_std": 0.0006862832815386355, + "rewards/perpo_ocr_edit_distance_reward": 0.997471034526825, + "step": 3671, + "temperature": 0.9 + }, + { + "advantages": -6.917545397300273e-05, + "completion_length": 1289.0, + "delta_ref_entropy_loss": 0.0478515625, + "delta_ref_ppl": -0.06982421875, + "entropy_loss": -0.09814453125, + "epoch": 0.7344, + "grad_norm": 0.8082030982140845, + "k1_kl": 0.06982421875, + "k3_kl": 0.039306640625, + "kimi_kl": 0.1083984375, + "learning_rate": 1.328e-07, + "loss": 0.0016, + "ppl": 0.03857421875, + "reward": 0.9898175597190857, + "reward_std": 0.0012541782343760133, + "rewards/perpo_ocr_edit_distance_reward": 0.9898176789283752, + "step": 3672, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 810.0, + "delta_ref_entropy_loss": 0.03857421875, + "delta_ref_ppl": -0.10595703125, + "entropy_loss": -0.3046875, + "epoch": 0.7346, + "grad_norm": 1.606874821709833, + "k1_kl": 0.1064453125, + "k3_kl": 0.06982421875, + "kimi_kl": 0.1630859375, + "learning_rate": 1.3270000000000002e-07, + "loss": 0.0028, + "ppl": 0.142578125, + "reward": 0.7956922054290771, + "reward_std": 0.002979683456942439, + "rewards/perpo_ocr_edit_distance_reward": 0.7956922650337219, + "step": 3673, + "temperature": 0.9 + }, + { + "advantages": -1.958438360816217e-06, + "completion_length": 1223.0, + "delta_ref_entropy_loss": 0.0108642578125, + "delta_ref_ppl": -0.0322265625, + "entropy_loss": -0.047607421875, + "epoch": 0.7348, + "grad_norm": 0.3349863149283842, + "k1_kl": 0.0322265625, + "k3_kl": 0.0211181640625, + "kimi_kl": 0.048583984375, + "learning_rate": 1.3259999999999998e-07, + "loss": 0.0008, + "ppl": 0.01806640625, + "reward": 0.9921784400939941, + "reward_std": 0.0042415945790708065, + "rewards/perpo_ocr_edit_distance_reward": 0.9921784400939941, + "step": 3674, + "temperature": 0.9 + }, + { + "advantages": -0.00012899297871626914, + "completion_length": 334.0, + "delta_ref_entropy_loss": 0.04638671875, + "delta_ref_ppl": -0.0849609375, + "entropy_loss": -0.05859375, + "epoch": 0.735, + "grad_norm": 0.3780173365210474, + "k1_kl": 0.0849609375, + "k3_kl": 0.05419921875, + "kimi_kl": 0.166015625, + "learning_rate": 1.325e-07, + "loss": 0.0023, + "ppl": 0.0213623046875, + "reward": 0.9965131878852844, + "reward_std": 0.0006261797971092165, + "rewards/perpo_ocr_edit_distance_reward": 0.9965133666992188, + "step": 3675, + "temperature": 0.9 + }, + { + "advantages": -1.5471663573407568e-05, + "completion_length": 257.0, + "delta_ref_entropy_loss": 0.042236328125, + "delta_ref_ppl": -0.130859375, + "entropy_loss": -0.04736328125, + "epoch": 0.7352, + "grad_norm": 0.7310533061391776, + "k1_kl": 0.130859375, + "k3_kl": 0.0966796875, + "kimi_kl": 0.39453125, + "learning_rate": 1.324e-07, + "loss": 0.0039, + "ppl": 0.019775390625, + "reward": 0.997044026851654, + "reward_std": 0.000999801093712449, + "rewards/perpo_ocr_edit_distance_reward": 0.9970440864562988, + "step": 3676, + "temperature": 0.9 + }, + { + "advantages": -1.863070974650327e-05, + "completion_length": 277.0, + "delta_ref_entropy_loss": 0.138671875, + "delta_ref_ppl": -0.154296875, + "entropy_loss": -0.189453125, + "epoch": 0.7354, + "grad_norm": 1.9828198629661082, + "k1_kl": 0.154296875, + "k3_kl": 0.09326171875, + "kimi_kl": 0.2333984375, + "learning_rate": 1.323e-07, + "loss": 0.0038, + "ppl": 0.10302734375, + "reward": 0.9567165374755859, + "reward_std": 0.002184250159189105, + "rewards/perpo_ocr_edit_distance_reward": 0.9567165970802307, + "step": 3677, + "temperature": 0.9 + }, + { + "advantages": -3.899846888089087e-06, + "completion_length": 469.0, + "delta_ref_entropy_loss": 0.057373046875, + "delta_ref_ppl": -0.09130859375, + "entropy_loss": -0.1328125, + "epoch": 0.7356, + "grad_norm": 0.9115431320205942, + "k1_kl": 0.09130859375, + "k3_kl": 0.06005859375, + "kimi_kl": 0.2060546875, + "learning_rate": 1.322e-07, + "loss": 0.0024, + "ppl": 0.055419921875, + "reward": 0.9706719517707825, + "reward_std": 0.0020879742223769426, + "rewards/perpo_ocr_edit_distance_reward": 0.9706720113754272, + "step": 3678, + "temperature": 0.9 + }, + { + "advantages": -3.218651045244769e-06, + "completion_length": 889.0, + "delta_ref_entropy_loss": -0.0181884765625, + "delta_ref_ppl": -0.0537109375, + "entropy_loss": -0.1904296875, + "epoch": 0.7358, + "grad_norm": 1.2244338517390314, + "k1_kl": 0.0537109375, + "k3_kl": 0.042724609375, + "kimi_kl": 0.130859375, + "learning_rate": 1.321e-07, + "loss": 0.0017, + "ppl": 0.0732421875, + "reward": 0.8310493230819702, + "reward_std": 0.005193273536860943, + "rewards/perpo_ocr_edit_distance_reward": 0.831049382686615, + "step": 3679, + "temperature": 0.9 + }, + { + "advantages": -2.7145659259986132e-05, + "completion_length": 432.0, + "delta_ref_entropy_loss": 0.0341796875, + "delta_ref_ppl": -0.08740234375, + "entropy_loss": -0.08251953125, + "epoch": 0.736, + "grad_norm": 0.8985149539289801, + "k1_kl": 0.0869140625, + "k3_kl": 0.058837890625, + "kimi_kl": 0.173828125, + "learning_rate": 1.32e-07, + "loss": 0.0024, + "ppl": 0.031005859375, + "reward": 0.9973053932189941, + "reward_std": 0.002096426673233509, + "rewards/perpo_ocr_edit_distance_reward": 0.9973054528236389, + "step": 3680, + "temperature": 0.9 + }, + { + "advantages": -5.062137461209204e-06, + "completion_length": 190.0, + "delta_ref_entropy_loss": 0.039794921875, + "delta_ref_ppl": -0.2099609375, + "entropy_loss": -0.0693359375, + "epoch": 0.7362, + "grad_norm": 0.9693447872828701, + "k1_kl": 0.2099609375, + "k3_kl": 0.1669921875, + "kimi_kl": 0.703125, + "learning_rate": 1.3189999999999998e-07, + "loss": 0.0067, + "ppl": 0.0286865234375, + "reward": 0.9937001466751099, + "reward_std": 0.001575853442773223, + "rewards/perpo_ocr_edit_distance_reward": 0.9937002062797546, + "step": 3681, + "temperature": 0.9 + }, + { + "advantages": -9.5367431640625e-06, + "completion_length": 434.0, + "delta_ref_entropy_loss": 0.01953125, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.064453125, + "epoch": 0.7364, + "grad_norm": 0.7254888176781026, + "k1_kl": 0.06591796875, + "k3_kl": 0.048095703125, + "kimi_kl": 0.1474609375, + "learning_rate": 1.318e-07, + "loss": 0.0019, + "ppl": 0.0274658203125, + "reward": 0.9837157130241394, + "reward_std": 0.0016854852437973022, + "rewards/perpo_ocr_edit_distance_reward": 0.9837157726287842, + "step": 3682, + "temperature": 0.9 + }, + { + "advantages": -1.6008105376386084e-05, + "completion_length": 494.0, + "delta_ref_entropy_loss": -0.0191650390625, + "delta_ref_ppl": -0.049560546875, + "entropy_loss": -0.1845703125, + "epoch": 0.7366, + "grad_norm": 1.3518558061388133, + "k1_kl": 0.049560546875, + "k3_kl": 0.033447265625, + "kimi_kl": 0.08203125, + "learning_rate": 1.317e-07, + "loss": 0.0014, + "ppl": 0.062255859375, + "reward": 0.7238264679908752, + "reward_std": 0.003092806087806821, + "rewards/perpo_ocr_edit_distance_reward": 0.72382652759552, + "step": 3683, + "temperature": 0.9 + }, + { + "advantages": -0.00010446140368003398, + "completion_length": 472.0, + "delta_ref_entropy_loss": 0.0291748046875, + "delta_ref_ppl": -0.052978515625, + "entropy_loss": -0.05859375, + "epoch": 0.7368, + "grad_norm": 0.796888237797566, + "k1_kl": 0.052978515625, + "k3_kl": 0.03759765625, + "kimi_kl": 0.126953125, + "learning_rate": 1.316e-07, + "loss": 0.0016, + "ppl": 0.01904296875, + "reward": 0.9712036848068237, + "reward_std": 0.0006336052319966257, + "rewards/perpo_ocr_edit_distance_reward": 0.9712038040161133, + "step": 3684, + "temperature": 0.9 + }, + { + "advantages": -8.320809138240293e-05, + "completion_length": 1033.0, + "delta_ref_entropy_loss": 0.0228271484375, + "delta_ref_ppl": -0.0322265625, + "entropy_loss": -0.035400390625, + "epoch": 0.737, + "grad_norm": 0.22836242169694185, + "k1_kl": 0.0322265625, + "k3_kl": 0.019287109375, + "kimi_kl": 0.047607421875, + "learning_rate": 1.315e-07, + "loss": 0.0009, + "ppl": 0.01129150390625, + "reward": 0.99233478307724, + "reward_std": 0.0004116080526728183, + "rewards/perpo_ocr_edit_distance_reward": 0.9923348426818848, + "step": 3685, + "temperature": 0.9 + }, + { + "advantages": -2.73670484602917e-05, + "completion_length": 1157.0, + "delta_ref_entropy_loss": 0.01025390625, + "delta_ref_ppl": -0.038818359375, + "entropy_loss": -0.0458984375, + "epoch": 0.7372, + "grad_norm": 0.530315298396784, + "k1_kl": 0.03857421875, + "k3_kl": 0.0283203125, + "kimi_kl": 0.07080078125, + "learning_rate": 1.3139999999999997e-07, + "loss": 0.0012, + "ppl": 0.0157470703125, + "reward": 0.996888279914856, + "reward_std": 0.0027004829607903957, + "rewards/perpo_ocr_edit_distance_reward": 0.9968883395195007, + "step": 3686, + "temperature": 0.9 + }, + { + "advantages": -5.497677193488926e-05, + "completion_length": 1443.0, + "delta_ref_entropy_loss": 0.0167236328125, + "delta_ref_ppl": -0.0281982421875, + "entropy_loss": -0.056396484375, + "epoch": 0.7374, + "grad_norm": 14.826233926111632, + "k1_kl": 0.0281982421875, + "k3_kl": 0.0185546875, + "kimi_kl": 0.03662109375, + "learning_rate": 1.313e-07, + "loss": 0.0008, + "ppl": 0.02294921875, + "reward": 0.9931949973106384, + "reward_std": 0.0006745622958987951, + "rewards/perpo_ocr_edit_distance_reward": 0.993195116519928, + "step": 3687, + "temperature": 0.9 + }, + { + "advantages": -4.638944665202871e-05, + "completion_length": 499.0, + "delta_ref_entropy_loss": 0.0634765625, + "delta_ref_ppl": -0.08203125, + "entropy_loss": -0.10595703125, + "epoch": 0.7376, + "grad_norm": 0.9992355419988543, + "k1_kl": 0.08154296875, + "k3_kl": 0.052734375, + "kimi_kl": 0.115234375, + "learning_rate": 1.312e-07, + "loss": 0.0022, + "ppl": 0.04931640625, + "reward": 0.895233690738678, + "reward_std": 0.0011845976114273071, + "rewards/perpo_ocr_edit_distance_reward": 0.8952337503433228, + "step": 3688, + "temperature": 0.9 + }, + { + "advantages": -0.0001504663232481107, + "completion_length": 1000.0, + "delta_ref_entropy_loss": 0.0177001953125, + "delta_ref_ppl": -0.0303955078125, + "entropy_loss": -0.04150390625, + "epoch": 0.7378, + "grad_norm": 0.23175217020701122, + "k1_kl": 0.0303955078125, + "k3_kl": 0.0213623046875, + "kimi_kl": 0.05078125, + "learning_rate": 1.3109999999999998e-07, + "loss": 0.001, + "ppl": 0.0206298828125, + "reward": 0.994730532169342, + "reward_std": 0.00023952624178491533, + "rewards/perpo_ocr_edit_distance_reward": 0.9947305917739868, + "step": 3689, + "temperature": 0.9 + }, + { + "advantages": -6.729364395141602e-05, + "completion_length": 526.0, + "delta_ref_entropy_loss": 0.0419921875, + "delta_ref_ppl": -0.05224609375, + "entropy_loss": -0.05908203125, + "epoch": 0.738, + "grad_norm": 0.7371788172798686, + "k1_kl": 0.05224609375, + "k3_kl": 0.033203125, + "kimi_kl": 0.1015625, + "learning_rate": 1.31e-07, + "loss": 0.0014, + "ppl": 0.0218505859375, + "reward": 0.9557018280029297, + "reward_std": 0.0014180269790813327, + "rewards/perpo_ocr_edit_distance_reward": 0.955702006816864, + "step": 3690, + "temperature": 0.9 + }, + { + "advantages": 9.877341653918847e-06, + "completion_length": 352.0, + "delta_ref_entropy_loss": 0.0244140625, + "delta_ref_ppl": -0.11767578125, + "entropy_loss": -0.0654296875, + "epoch": 0.7382, + "grad_norm": 0.7591705462978725, + "k1_kl": 0.11767578125, + "k3_kl": 0.0966796875, + "kimi_kl": 0.349609375, + "learning_rate": 1.309e-07, + "loss": 0.0038, + "ppl": 0.025634765625, + "reward": 0.9952830076217651, + "reward_std": 0.0007611897890456021, + "rewards/perpo_ocr_edit_distance_reward": 0.9952830076217651, + "step": 3691, + "temperature": 0.9 + }, + { + "advantages": -2.7384077839087695e-05, + "completion_length": 1139.0, + "delta_ref_entropy_loss": 0.022216796875, + "delta_ref_ppl": -0.05029296875, + "entropy_loss": -0.212890625, + "epoch": 0.7384, + "grad_norm": 7.453358145347474, + "k1_kl": 0.05029296875, + "k3_kl": 0.05078125, + "kimi_kl": 0.07568359375, + "learning_rate": 1.308e-07, + "loss": 0.0021, + "ppl": 0.11083984375, + "reward": 0.9504581093788147, + "reward_std": 0.0036298870109021664, + "rewards/perpo_ocr_edit_distance_reward": 0.9504581689834595, + "step": 3692, + "temperature": 0.9 + }, + { + "advantages": 1.0962997293972876e-05, + "completion_length": 442.0, + "delta_ref_entropy_loss": 0.061767578125, + "delta_ref_ppl": -0.10498046875, + "entropy_loss": -0.08837890625, + "epoch": 0.7386, + "grad_norm": 0.9747478689212391, + "k1_kl": 0.1044921875, + "k3_kl": 0.068359375, + "kimi_kl": 0.2109375, + "learning_rate": 1.307e-07, + "loss": 0.0027, + "ppl": 0.03564453125, + "reward": 0.9791139960289001, + "reward_std": 0.0006775921792723238, + "rewards/perpo_ocr_edit_distance_reward": 0.9791139364242554, + "step": 3693, + "temperature": 0.9 + }, + { + "advantages": -2.1346979337977245e-05, + "completion_length": 2048.0, + "delta_ref_entropy_loss": -0.006683349609375, + "delta_ref_ppl": -0.0247802734375, + "entropy_loss": -0.08837890625, + "epoch": 0.7388, + "grad_norm": 2.5264419411309693, + "k1_kl": 0.0247802734375, + "k3_kl": 0.019287109375, + "kimi_kl": 0.0478515625, + "learning_rate": 1.306e-07, + "loss": 0.0008, + "ppl": 0.04248046875, + "reward": 0.7753071784973145, + "reward_std": 0.005885693244636059, + "rewards/perpo_ocr_edit_distance_reward": 0.775307297706604, + "step": 3694, + "temperature": 0.9 + }, + { + "advantages": -1.234667706739856e-05, + "completion_length": 392.0, + "delta_ref_entropy_loss": 0.11328125, + "delta_ref_ppl": -0.1259765625, + "entropy_loss": -0.1240234375, + "epoch": 0.739, + "grad_norm": 1.144786534616353, + "k1_kl": 0.1259765625, + "k3_kl": 0.06982421875, + "kimi_kl": 0.1923828125, + "learning_rate": 1.305e-07, + "loss": 0.0028, + "ppl": 0.042724609375, + "reward": 0.8298922181129456, + "reward_std": 0.0012807055609300733, + "rewards/perpo_ocr_edit_distance_reward": 0.8298922181129456, + "step": 3695, + "temperature": 0.9 + }, + { + "advantages": -2.111707544827368e-06, + "completion_length": 583.0, + "delta_ref_entropy_loss": 0.078125, + "delta_ref_ppl": -0.09521484375, + "entropy_loss": -0.23046875, + "epoch": 0.7392, + "grad_norm": 1.3826865268647417, + "k1_kl": 0.09521484375, + "k3_kl": 0.06396484375, + "kimi_kl": 0.173828125, + "learning_rate": 1.3039999999999998e-07, + "loss": 0.0026, + "ppl": 0.0966796875, + "reward": 0.3578636050224304, + "reward_std": 0.003931677900254726, + "rewards/perpo_ocr_edit_distance_reward": 0.3578636050224304, + "step": 3696, + "temperature": 0.9 + }, + { + "advantages": -5.5875098041724414e-05, + "completion_length": 425.0, + "delta_ref_entropy_loss": 0.058349609375, + "delta_ref_ppl": -0.11572265625, + "entropy_loss": -0.06884765625, + "epoch": 0.7394, + "grad_norm": 0.3356602767301951, + "k1_kl": 0.1162109375, + "k3_kl": 0.080078125, + "kimi_kl": 0.3125, + "learning_rate": 1.303e-07, + "loss": 0.0033, + "ppl": 0.024169921875, + "reward": 0.9957205057144165, + "reward_std": 0.0006617571343667805, + "rewards/perpo_ocr_edit_distance_reward": 0.9957205653190613, + "step": 3697, + "temperature": 0.9 + }, + { + "advantages": -5.074909950053552e-06, + "completion_length": 869.0, + "delta_ref_entropy_loss": 0.00799560546875, + "delta_ref_ppl": -0.059326171875, + "entropy_loss": -0.09765625, + "epoch": 0.7396, + "grad_norm": 0.962514011904731, + "k1_kl": 0.05908203125, + "k3_kl": 0.046875, + "kimi_kl": 0.115234375, + "learning_rate": 1.3020000000000001e-07, + "loss": 0.0019, + "ppl": 0.04052734375, + "reward": 0.9591398239135742, + "reward_std": 0.004953776951879263, + "rewards/perpo_ocr_edit_distance_reward": 0.959139883518219, + "step": 3698, + "temperature": 0.9 + }, + { + "advantages": -1.6033649444580078e-05, + "completion_length": 596.0, + "delta_ref_entropy_loss": 0.056884765625, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.09814453125, + "epoch": 0.7398, + "grad_norm": 0.6992790746413134, + "k1_kl": 0.06591796875, + "k3_kl": 0.04052734375, + "kimi_kl": 0.087890625, + "learning_rate": 1.3009999999999998e-07, + "loss": 0.0016, + "ppl": 0.039306640625, + "reward": 0.9448791742324829, + "reward_std": 0.0025564900133758783, + "rewards/perpo_ocr_edit_distance_reward": 0.9448791742324829, + "step": 3699, + "temperature": 0.9 + }, + { + "advantages": -7.089546852512285e-05, + "completion_length": 728.0, + "delta_ref_entropy_loss": 0.033447265625, + "delta_ref_ppl": -0.05712890625, + "entropy_loss": -0.04150390625, + "epoch": 0.74, + "grad_norm": 0.3669336676911578, + "k1_kl": 0.05712890625, + "k3_kl": 0.036376953125, + "kimi_kl": 0.1376953125, + "learning_rate": 1.3e-07, + "loss": 0.0015, + "ppl": 0.01263427734375, + "reward": 0.9754705429077148, + "reward_std": 0.0006206798134371638, + "rewards/perpo_ocr_edit_distance_reward": 0.9754705429077148, + "step": 3700, + "temperature": 0.9 + }, + { + "advantages": -0.00012920584413222969, + "completion_length": 1150.0, + "delta_ref_entropy_loss": 0.01080322265625, + "delta_ref_ppl": -0.0281982421875, + "entropy_loss": -0.045166015625, + "epoch": 0.7402, + "grad_norm": 0.8272637862035221, + "k1_kl": 0.028076171875, + "k3_kl": 0.0179443359375, + "kimi_kl": 0.041015625, + "learning_rate": 1.299e-07, + "loss": 0.0008, + "ppl": 0.018310546875, + "reward": 0.9925097823143005, + "reward_std": 0.00036138147697784007, + "rewards/perpo_ocr_edit_distance_reward": 0.9925098419189453, + "step": 3701, + "temperature": 0.9 + }, + { + "advantages": -1.2602125707417144e-06, + "completion_length": 891.0, + "delta_ref_entropy_loss": -0.03076171875, + "delta_ref_ppl": -0.042236328125, + "entropy_loss": -0.169921875, + "epoch": 0.7404, + "grad_norm": 1.3139328652340163, + "k1_kl": 0.042236328125, + "k3_kl": 0.039306640625, + "kimi_kl": 0.09521484375, + "learning_rate": 1.2979999999999998e-07, + "loss": 0.0016, + "ppl": 0.06591796875, + "reward": 0.9127475023269653, + "reward_std": 0.05315650254487991, + "rewards/perpo_ocr_edit_distance_reward": 0.9127475619316101, + "step": 3702, + "temperature": 0.9 + }, + { + "advantages": -4.3681691749952734e-05, + "completion_length": 672.0, + "delta_ref_entropy_loss": 0.03515625, + "delta_ref_ppl": -0.05810546875, + "entropy_loss": -0.039794921875, + "epoch": 0.7406, + "grad_norm": 0.2031893298316771, + "k1_kl": 0.05810546875, + "k3_kl": 0.036376953125, + "kimi_kl": 0.1181640625, + "learning_rate": 1.297e-07, + "loss": 0.0015, + "ppl": 0.0081787109375, + "reward": 0.993951141834259, + "reward_std": 0.0002896826190408319, + "rewards/perpo_ocr_edit_distance_reward": 0.9939512014389038, + "step": 3703, + "temperature": 0.9 + }, + { + "advantages": -3.007480154337827e-05, + "completion_length": 855.0, + "delta_ref_entropy_loss": 0.03271484375, + "delta_ref_ppl": -0.0810546875, + "entropy_loss": -0.384765625, + "epoch": 0.7408, + "grad_norm": 4.289912755303195, + "k1_kl": 0.0810546875, + "k3_kl": 0.06640625, + "kimi_kl": 0.14453125, + "learning_rate": 1.296e-07, + "loss": 0.0027, + "ppl": 0.2109375, + "reward": 0.8917941451072693, + "reward_std": 0.003297165036201477, + "rewards/perpo_ocr_edit_distance_reward": 0.8917942643165588, + "step": 3704, + "temperature": 0.9 + }, + { + "advantages": -8.983271982287988e-06, + "completion_length": 134.0, + "delta_ref_entropy_loss": 0.05517578125, + "delta_ref_ppl": -0.275390625, + "entropy_loss": -0.1669921875, + "epoch": 0.741, + "grad_norm": 2.7523889794912173, + "k1_kl": 0.275390625, + "k3_kl": 0.2177734375, + "kimi_kl": 0.80078125, + "learning_rate": 1.295e-07, + "loss": 0.0087, + "ppl": 0.08154296875, + "reward": 0.9808750152587891, + "reward_std": 0.003676775610074401, + "rewards/perpo_ocr_edit_distance_reward": 0.9808751344680786, + "step": 3705, + "temperature": 0.9 + }, + { + "advantages": -0.00019049645925406367, + "completion_length": 743.0, + "delta_ref_entropy_loss": 0.0308837890625, + "delta_ref_ppl": -0.03662109375, + "entropy_loss": -0.040283203125, + "epoch": 0.7412, + "grad_norm": 0.47135504490751584, + "k1_kl": 0.036376953125, + "k3_kl": 0.0181884765625, + "kimi_kl": 0.038818359375, + "learning_rate": 1.2939999999999998e-07, + "loss": 0.0009, + "ppl": 0.01007080078125, + "reward": 0.9987834692001343, + "reward_std": 0.00039172518881969154, + "rewards/perpo_ocr_edit_distance_reward": 0.9987835884094238, + "step": 3706, + "temperature": 0.9 + }, + { + "advantages": -6.880079126858618e-06, + "completion_length": 257.0, + "delta_ref_entropy_loss": -0.04296875, + "delta_ref_ppl": -0.142578125, + "entropy_loss": -0.349609375, + "epoch": 0.7414, + "grad_norm": 3.6971054723059695, + "k1_kl": 0.142578125, + "k3_kl": 0.126953125, + "kimi_kl": 0.310546875, + "learning_rate": 1.293e-07, + "loss": 0.0051, + "ppl": 0.1298828125, + "reward": 0.9299311637878418, + "reward_std": 0.011026277206838131, + "rewards/perpo_ocr_edit_distance_reward": 0.9299312829971313, + "step": 3707, + "temperature": 0.9 + }, + { + "advantages": -4.257474817137563e-09, + "completion_length": 1559.0, + "delta_ref_entropy_loss": 0.011962890625, + "delta_ref_ppl": -0.021484375, + "entropy_loss": -0.0390625, + "epoch": 0.7416, + "grad_norm": 2.840508638360652, + "k1_kl": 0.021484375, + "k3_kl": 0.01409912109375, + "kimi_kl": 0.03466796875, + "learning_rate": 1.292e-07, + "loss": 0.0006, + "ppl": 0.027099609375, + "reward": 0.9939824938774109, + "reward_std": 0.0013767200289294124, + "rewards/perpo_ocr_edit_distance_reward": 0.9939824938774109, + "step": 3708, + "temperature": 0.9 + }, + { + "advantages": -6.849425699329004e-05, + "completion_length": 709.0, + "delta_ref_entropy_loss": 0.0712890625, + "delta_ref_ppl": -0.0625, + "entropy_loss": -0.087890625, + "epoch": 0.7418, + "grad_norm": 0.7972398626279897, + "k1_kl": 0.062255859375, + "k3_kl": 0.02978515625, + "kimi_kl": 0.06640625, + "learning_rate": 1.2909999999999998e-07, + "loss": 0.0013, + "ppl": 0.034423828125, + "reward": 0.8878931999206543, + "reward_std": 0.0005216765566729009, + "rewards/perpo_ocr_edit_distance_reward": 0.8878931999206543, + "step": 3709, + "temperature": 0.9 + }, + { + "advantages": -0.00011743818322429433, + "completion_length": 892.0, + "delta_ref_entropy_loss": 0.00946044921875, + "delta_ref_ppl": -0.028564453125, + "entropy_loss": -0.025146484375, + "epoch": 0.742, + "grad_norm": 0.3119069056829161, + "k1_kl": 0.028564453125, + "k3_kl": 0.0205078125, + "kimi_kl": 0.059814453125, + "learning_rate": 1.29e-07, + "loss": 0.0009, + "ppl": 0.0084228515625, + "reward": 0.9984169006347656, + "reward_std": 0.0006975200958549976, + "rewards/perpo_ocr_edit_distance_reward": 0.9984169602394104, + "step": 3710, + "temperature": 0.9 + }, + { + "advantages": -0.00012436084216460586, + "completion_length": 351.0, + "delta_ref_entropy_loss": 0.0498046875, + "delta_ref_ppl": -0.0537109375, + "entropy_loss": -0.043212890625, + "epoch": 0.7422, + "grad_norm": 0.8662318067467007, + "k1_kl": 0.0537109375, + "k3_kl": 0.027587890625, + "kimi_kl": 0.053955078125, + "learning_rate": 1.2889999999999997e-07, + "loss": 0.0012, + "ppl": 0.01165771484375, + "reward": 0.9772664904594421, + "reward_std": 0.00031079220934771, + "rewards/perpo_ocr_edit_distance_reward": 0.9772665500640869, + "step": 3711, + "temperature": 0.9 + }, + { + "advantages": -2.5102071958826855e-05, + "completion_length": 926.0, + "delta_ref_entropy_loss": 0.02294921875, + "delta_ref_ppl": -0.044677734375, + "entropy_loss": -0.08251953125, + "epoch": 0.7424, + "grad_norm": 0.6917217607092834, + "k1_kl": 0.044677734375, + "k3_kl": 0.0264892578125, + "kimi_kl": 0.05859375, + "learning_rate": 1.288e-07, + "loss": 0.0011, + "ppl": 0.0308837890625, + "reward": 0.9541459083557129, + "reward_std": 0.0029522301629185677, + "rewards/perpo_ocr_edit_distance_reward": 0.9541460275650024, + "step": 3712, + "temperature": 0.9 + }, + { + "advantages": -3.993511199951172e-05, + "completion_length": 663.0, + "delta_ref_entropy_loss": 0.07958984375, + "delta_ref_ppl": -0.07421875, + "entropy_loss": -0.1083984375, + "epoch": 0.7426, + "grad_norm": 0.9251017290959962, + "k1_kl": 0.07421875, + "k3_kl": 0.040283203125, + "kimi_kl": 0.09814453125, + "learning_rate": 1.287e-07, + "loss": 0.0016, + "ppl": 0.04736328125, + "reward": 0.3760311007499695, + "reward_std": 0.00043308662134222686, + "rewards/perpo_ocr_edit_distance_reward": 0.3760311007499695, + "step": 3713, + "temperature": 0.9 + }, + { + "advantages": -1.120567412726814e-05, + "completion_length": 450.0, + "delta_ref_entropy_loss": 0.06640625, + "delta_ref_ppl": -0.1064453125, + "entropy_loss": -0.1298828125, + "epoch": 0.7428, + "grad_norm": 1.2017396953732267, + "k1_kl": 0.1064453125, + "k3_kl": 0.0712890625, + "kimi_kl": 0.189453125, + "learning_rate": 1.2859999999999997e-07, + "loss": 0.0029, + "ppl": 0.05908203125, + "reward": 0.5400727391242981, + "reward_std": 0.003692991565912962, + "rewards/perpo_ocr_edit_distance_reward": 0.5400727987289429, + "step": 3714, + "temperature": 0.9 + }, + { + "advantages": -4.5299530029296875e-06, + "completion_length": 668.0, + "delta_ref_entropy_loss": 0.03564453125, + "delta_ref_ppl": -0.08642578125, + "entropy_loss": -0.291015625, + "epoch": 0.743, + "grad_norm": 2.024878765582958, + "k1_kl": 0.08642578125, + "k3_kl": 0.056884765625, + "kimi_kl": 0.111328125, + "learning_rate": 1.285e-07, + "loss": 0.0023, + "ppl": 0.1376953125, + "reward": 0.8014423251152039, + "reward_std": 0.01113544125109911, + "rewards/perpo_ocr_edit_distance_reward": 0.8014424443244934, + "step": 3715, + "temperature": 0.9 + }, + { + "advantages": -2.4523055799363647e-06, + "completion_length": 508.0, + "delta_ref_entropy_loss": -0.03173828125, + "delta_ref_ppl": -0.16015625, + "entropy_loss": -0.25, + "epoch": 0.7432, + "grad_norm": 3.156117054116375, + "k1_kl": 0.16015625, + "k3_kl": 0.1337890625, + "kimi_kl": 0.41015625, + "learning_rate": 1.2839999999999999e-07, + "loss": 0.0054, + "ppl": 0.10595703125, + "reward": 0.8473432064056396, + "reward_std": 0.01366451196372509, + "rewards/perpo_ocr_edit_distance_reward": 0.8473432660102844, + "step": 3716, + "temperature": 0.9 + }, + { + "advantages": -2.6123865609406494e-05, + "completion_length": 231.0, + "delta_ref_entropy_loss": 0.0908203125, + "delta_ref_ppl": -0.16015625, + "entropy_loss": -0.232421875, + "epoch": 0.7434, + "grad_norm": 1.5627908449543, + "k1_kl": 0.16015625, + "k3_kl": 0.111328125, + "kimi_kl": 0.322265625, + "learning_rate": 1.283e-07, + "loss": 0.0045, + "ppl": 0.08935546875, + "reward": 0.5307930111885071, + "reward_std": 0.0021798054222017527, + "rewards/perpo_ocr_edit_distance_reward": 0.5307930707931519, + "step": 3717, + "temperature": 0.9 + }, + { + "advantages": -0.00011655263369902968, + "completion_length": 826.0, + "delta_ref_entropy_loss": 0.048095703125, + "delta_ref_ppl": -0.06982421875, + "entropy_loss": -0.05224609375, + "epoch": 0.7436, + "grad_norm": 0.4465687911649077, + "k1_kl": 0.06982421875, + "k3_kl": 0.045654296875, + "kimi_kl": 0.1259765625, + "learning_rate": 1.282e-07, + "loss": 0.0019, + "ppl": 0.0206298828125, + "reward": 0.9919442534446716, + "reward_std": 0.0006305912975221872, + "rewards/perpo_ocr_edit_distance_reward": 0.9919443726539612, + "step": 3718, + "temperature": 0.9 + }, + { + "advantages": -1.9141607481287792e-05, + "completion_length": 662.0, + "delta_ref_entropy_loss": 0.03955078125, + "delta_ref_ppl": -0.08642578125, + "entropy_loss": -0.20703125, + "epoch": 0.7438, + "grad_norm": 6.1025540590877245, + "k1_kl": 0.08642578125, + "k3_kl": 0.10791015625, + "kimi_kl": 0.177734375, + "learning_rate": 1.281e-07, + "loss": 0.0043, + "ppl": 0.1083984375, + "reward": 0.8994029760360718, + "reward_std": 0.00523931160569191, + "rewards/perpo_ocr_edit_distance_reward": 0.8994030952453613, + "step": 3719, + "temperature": 0.9 + }, + { + "advantages": -5.562603837461211e-05, + "completion_length": 382.0, + "delta_ref_entropy_loss": 0.043701171875, + "delta_ref_ppl": -0.099609375, + "entropy_loss": -0.08056640625, + "epoch": 0.744, + "grad_norm": 1.162359541559891, + "k1_kl": 0.09912109375, + "k3_kl": 0.078125, + "kimi_kl": 0.283203125, + "learning_rate": 1.28e-07, + "loss": 0.0032, + "ppl": 0.04296875, + "reward": 0.9963535666465759, + "reward_std": 0.0011247828369960189, + "rewards/perpo_ocr_edit_distance_reward": 0.9963536262512207, + "step": 3720, + "temperature": 0.9 + }, + { + "advantages": 3.4059798537100505e-08, + "completion_length": 223.0, + "delta_ref_entropy_loss": -0.408203125, + "delta_ref_ppl": -0.1494140625, + "entropy_loss": -0.84765625, + "epoch": 0.7442, + "grad_norm": 4.2164628493497425, + "k1_kl": 0.150390625, + "k3_kl": 0.185546875, + "kimi_kl": 0.466796875, + "learning_rate": 1.279e-07, + "loss": 0.0074, + "ppl": 0.384765625, + "reward": 0.35950273275375366, + "reward_std": 0.06677419692277908, + "rewards/perpo_ocr_edit_distance_reward": 0.35950276255607605, + "step": 3721, + "temperature": 0.9 + }, + { + "advantages": -8.83851771504851e-06, + "completion_length": 1090.0, + "delta_ref_entropy_loss": 0.042724609375, + "delta_ref_ppl": -0.0693359375, + "entropy_loss": -0.07763671875, + "epoch": 0.7444, + "grad_norm": 2.646662186635487, + "k1_kl": 0.0693359375, + "k3_kl": 0.044921875, + "kimi_kl": 0.123046875, + "learning_rate": 1.278e-07, + "loss": 0.0018, + "ppl": 0.037353515625, + "reward": 0.5061216354370117, + "reward_std": 0.0018299994990229607, + "rewards/perpo_ocr_edit_distance_reward": 0.5061216354370117, + "step": 3722, + "temperature": 0.9 + }, + { + "advantages": -5.10896995820076e-07, + "completion_length": 120.0, + "delta_ref_entropy_loss": -0.203125, + "delta_ref_ppl": -0.53125, + "entropy_loss": -0.46484375, + "epoch": 0.7446, + "grad_norm": 8.566086827384845, + "k1_kl": 0.53125, + "k3_kl": 0.4765625, + "kimi_kl": 2.625, + "learning_rate": 1.277e-07, + "loss": 0.0191, + "ppl": 0.1884765625, + "reward": 0.3089700937271118, + "reward_std": 0.03805656358599663, + "rewards/perpo_ocr_edit_distance_reward": 0.3089701235294342, + "step": 3723, + "temperature": 0.9 + }, + { + "advantages": -3.0313220122479834e-05, + "completion_length": 922.0, + "delta_ref_entropy_loss": 0.0159912109375, + "delta_ref_ppl": -0.0262451171875, + "entropy_loss": -0.0294189453125, + "epoch": 0.7448, + "grad_norm": 0.4090747209030235, + "k1_kl": 0.0262451171875, + "k3_kl": 0.0179443359375, + "kimi_kl": 0.04296875, + "learning_rate": 1.2759999999999998e-07, + "loss": 0.0007, + "ppl": 0.00927734375, + "reward": 0.993658721446991, + "reward_std": 0.0007423735805787146, + "rewards/perpo_ocr_edit_distance_reward": 0.9936587810516357, + "step": 3724, + "temperature": 0.9 + }, + { + "advantages": -4.233633080730215e-05, + "completion_length": 687.0, + "delta_ref_entropy_loss": 0.042236328125, + "delta_ref_ppl": -0.0546875, + "entropy_loss": -0.050048828125, + "epoch": 0.745, + "grad_norm": 0.5877039422191687, + "k1_kl": 0.054931640625, + "k3_kl": 0.033447265625, + "kimi_kl": 0.1005859375, + "learning_rate": 1.275e-07, + "loss": 0.0014, + "ppl": 0.016845703125, + "reward": 0.9933314919471741, + "reward_std": 0.0009056134731508791, + "rewards/perpo_ocr_edit_distance_reward": 0.9933315515518188, + "step": 3725, + "temperature": 0.9 + }, + { + "advantages": -6.399836274795234e-05, + "completion_length": 239.0, + "delta_ref_entropy_loss": 0.03173828125, + "delta_ref_ppl": -0.2158203125, + "entropy_loss": -0.18359375, + "epoch": 0.7452, + "grad_norm": 1.7238185711044887, + "k1_kl": 0.216796875, + "k3_kl": 0.16015625, + "kimi_kl": 0.6953125, + "learning_rate": 1.2740000000000002e-07, + "loss": 0.0065, + "ppl": 0.0654296875, + "reward": 0.7964639067649841, + "reward_std": 0.0014968853211030364, + "rewards/perpo_ocr_edit_distance_reward": 0.7964640259742737, + "step": 3726, + "temperature": 0.9 + }, + { + "advantages": -1.2568065358209424e-05, + "completion_length": 66.0, + "delta_ref_entropy_loss": 0.046142578125, + "delta_ref_ppl": -0.484375, + "entropy_loss": -0.201171875, + "epoch": 0.7454, + "grad_norm": 7.783444248187183, + "k1_kl": 0.484375, + "k3_kl": 0.40234375, + "kimi_kl": 1.671875, + "learning_rate": 1.2729999999999998e-07, + "loss": 0.0161, + "ppl": 0.06640625, + "reward": 0.8689006567001343, + "reward_std": 0.008706592954695225, + "rewards/perpo_ocr_edit_distance_reward": 0.8689007759094238, + "step": 3727, + "temperature": 0.9 + }, + { + "advantages": -1.8153872588300146e-05, + "completion_length": 450.0, + "delta_ref_entropy_loss": 0.046630859375, + "delta_ref_ppl": -0.11181640625, + "entropy_loss": -0.064453125, + "epoch": 0.7456, + "grad_norm": 0.7774215497971546, + "k1_kl": 0.11181640625, + "k3_kl": 0.08154296875, + "kimi_kl": 0.28515625, + "learning_rate": 1.272e-07, + "loss": 0.0033, + "ppl": 0.02734375, + "reward": 0.9914261698722839, + "reward_std": 0.003178960643708706, + "rewards/perpo_ocr_edit_distance_reward": 0.9914262294769287, + "step": 3728, + "temperature": 0.9 + }, + { + "advantages": 1.4730862858414184e-05, + "completion_length": 1091.0, + "delta_ref_entropy_loss": 0.01348876953125, + "delta_ref_ppl": -0.0235595703125, + "entropy_loss": -0.026611328125, + "epoch": 0.7458, + "grad_norm": 0.2549319322459351, + "k1_kl": 0.0235595703125, + "k3_kl": 0.01556396484375, + "kimi_kl": 0.035400390625, + "learning_rate": 1.271e-07, + "loss": 0.0006, + "ppl": 0.00830078125, + "reward": 0.9177544713020325, + "reward_std": 0.0004773649852722883, + "rewards/perpo_ocr_edit_distance_reward": 0.9177544713020325, + "step": 3729, + "temperature": 0.9 + }, + { + "advantages": -4.546983109321445e-05, + "completion_length": 598.0, + "delta_ref_entropy_loss": 0.033935546875, + "delta_ref_ppl": -0.05810546875, + "entropy_loss": -0.06103515625, + "epoch": 0.746, + "grad_norm": 0.5663090971902793, + "k1_kl": 0.05810546875, + "k3_kl": 0.037353515625, + "kimi_kl": 0.1259765625, + "learning_rate": 1.2699999999999999e-07, + "loss": 0.0015, + "ppl": 0.0228271484375, + "reward": 0.988034725189209, + "reward_std": 0.0008358903578482568, + "rewards/perpo_ocr_edit_distance_reward": 0.9880348443984985, + "step": 3730, + "temperature": 0.9 + }, + { + "advantages": 2.1423613361548632e-05, + "completion_length": 358.0, + "delta_ref_entropy_loss": 0.045166015625, + "delta_ref_ppl": -0.07861328125, + "entropy_loss": -0.04833984375, + "epoch": 0.7462, + "grad_norm": 0.863222404829009, + "k1_kl": 0.0791015625, + "k3_kl": 0.0498046875, + "kimi_kl": 0.158203125, + "learning_rate": 1.269e-07, + "loss": 0.002, + "ppl": 0.02001953125, + "reward": 0.9888880252838135, + "reward_std": 0.001092202146537602, + "rewards/perpo_ocr_edit_distance_reward": 0.9888879656791687, + "step": 3731, + "temperature": 0.9 + }, + { + "advantages": 8.514949634275126e-09, + "completion_length": 383.0, + "delta_ref_entropy_loss": 0.0294189453125, + "delta_ref_ppl": -0.06298828125, + "entropy_loss": -0.053955078125, + "epoch": 0.7464, + "grad_norm": 0.638588141542679, + "k1_kl": 0.0634765625, + "k3_kl": 0.0634765625, + "kimi_kl": 0.13671875, + "learning_rate": 1.268e-07, + "loss": 0.0026, + "ppl": 0.020263671875, + "reward": 0.9931486248970032, + "reward_std": 0.0011353857116773725, + "rewards/perpo_ocr_edit_distance_reward": 0.993148684501648, + "step": 3732, + "temperature": 0.9 + }, + { + "advantages": -3.004074278578628e-05, + "completion_length": 1122.0, + "delta_ref_entropy_loss": 0.06396484375, + "delta_ref_ppl": -0.09326171875, + "entropy_loss": -0.09033203125, + "epoch": 0.7466, + "grad_norm": 0.7342761842968609, + "k1_kl": 0.09326171875, + "k3_kl": 0.054443359375, + "kimi_kl": 0.138671875, + "learning_rate": 1.267e-07, + "loss": 0.0022, + "ppl": 0.03466796875, + "reward": 0.9792352318763733, + "reward_std": 0.0013168557779863477, + "rewards/perpo_ocr_edit_distance_reward": 0.9792353510856628, + "step": 3733, + "temperature": 0.9 + }, + { + "advantages": -5.4819247452542186e-05, + "completion_length": 340.0, + "delta_ref_entropy_loss": 0.0218505859375, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.044677734375, + "epoch": 0.7468, + "grad_norm": 0.4403692495660645, + "k1_kl": 0.0703125, + "k3_kl": 0.052734375, + "kimi_kl": 0.2001953125, + "learning_rate": 1.2659999999999998e-07, + "loss": 0.0022, + "ppl": 0.0147705078125, + "reward": 0.9929063320159912, + "reward_std": 0.0005213312688283622, + "rewards/perpo_ocr_edit_distance_reward": 0.9929064512252808, + "step": 3734, + "temperature": 0.9 + }, + { + "advantages": 8.514949634275126e-09, + "completion_length": 531.0, + "delta_ref_entropy_loss": -0.06298828125, + "delta_ref_ppl": -0.1015625, + "entropy_loss": -0.7109375, + "epoch": 0.747, + "grad_norm": 3.290520851821094, + "k1_kl": 0.1015625, + "k3_kl": 0.0947265625, + "kimi_kl": 0.18359375, + "learning_rate": 1.265e-07, + "loss": 0.0038, + "ppl": 0.35546875, + "reward": 0.49968764185905457, + "reward_std": 0.0381438322365284, + "rewards/perpo_ocr_edit_distance_reward": 0.49968770146369934, + "step": 3735, + "temperature": 0.9 + }, + { + "advantages": -8.633307152194902e-05, + "completion_length": 724.0, + "delta_ref_entropy_loss": 0.038818359375, + "delta_ref_ppl": -0.045654296875, + "entropy_loss": -0.04931640625, + "epoch": 0.7472, + "grad_norm": 0.5030962799414054, + "k1_kl": 0.0458984375, + "k3_kl": 0.0269775390625, + "kimi_kl": 0.080078125, + "learning_rate": 1.264e-07, + "loss": 0.0012, + "ppl": 0.017822265625, + "reward": 0.9902405738830566, + "reward_std": 0.0006890854565426707, + "rewards/perpo_ocr_edit_distance_reward": 0.9902406930923462, + "step": 3736, + "temperature": 0.9 + }, + { + "advantages": -4.991463356418535e-05, + "completion_length": 625.0, + "delta_ref_entropy_loss": 0.04052734375, + "delta_ref_ppl": -0.06494140625, + "entropy_loss": -0.044189453125, + "epoch": 0.7474, + "grad_norm": 0.5083121555276083, + "k1_kl": 0.06494140625, + "k3_kl": 0.040283203125, + "kimi_kl": 0.10791015625, + "learning_rate": 1.2629999999999999e-07, + "loss": 0.0017, + "ppl": 0.01904296875, + "reward": 0.9920417666435242, + "reward_std": 0.0005824666004627943, + "rewards/perpo_ocr_edit_distance_reward": 0.992041826248169, + "step": 3737, + "temperature": 0.9 + }, + { + "advantages": -3.635883331298828e-05, + "completion_length": 738.0, + "delta_ref_entropy_loss": 0.01251220703125, + "delta_ref_ppl": -0.046630859375, + "entropy_loss": -0.0576171875, + "epoch": 0.7476, + "grad_norm": 0.6312567238476494, + "k1_kl": 0.046630859375, + "k3_kl": 0.035400390625, + "kimi_kl": 0.0908203125, + "learning_rate": 1.262e-07, + "loss": 0.0014, + "ppl": 0.0272216796875, + "reward": 0.9737163186073303, + "reward_std": 0.0015382618876174092, + "rewards/perpo_ocr_edit_distance_reward": 0.9737164378166199, + "step": 3738, + "temperature": 0.9 + }, + { + "advantages": -0.00010914462473010644, + "completion_length": 587.0, + "delta_ref_entropy_loss": 0.051513671875, + "delta_ref_ppl": -0.060791015625, + "entropy_loss": -0.044677734375, + "epoch": 0.7478, + "grad_norm": 0.2727627836093507, + "k1_kl": 0.060791015625, + "k3_kl": 0.033203125, + "kimi_kl": 0.07861328125, + "learning_rate": 1.2609999999999997e-07, + "loss": 0.0014, + "ppl": 0.01220703125, + "reward": 0.9957782030105591, + "reward_std": 0.00021207339887041599, + "rewards/perpo_ocr_edit_distance_reward": 0.9957782626152039, + "step": 3739, + "temperature": 0.9 + }, + { + "advantages": -3.808736801147461e-05, + "completion_length": 634.0, + "delta_ref_entropy_loss": 0.06396484375, + "delta_ref_ppl": -0.07568359375, + "entropy_loss": -0.09716796875, + "epoch": 0.748, + "grad_norm": 0.6883228715131301, + "k1_kl": 0.07568359375, + "k3_kl": 0.047607421875, + "kimi_kl": 0.1123046875, + "learning_rate": 1.26e-07, + "loss": 0.0019, + "ppl": 0.03662109375, + "reward": 0.9628329873085022, + "reward_std": 0.0016882122727110982, + "rewards/perpo_ocr_edit_distance_reward": 0.962833046913147, + "step": 3740, + "temperature": 0.9 + }, + { + "advantages": -0.00011429617006797343, + "completion_length": 576.0, + "delta_ref_entropy_loss": 0.0235595703125, + "delta_ref_ppl": -0.046630859375, + "entropy_loss": -0.0498046875, + "epoch": 0.7482, + "grad_norm": 1.03309007398309, + "k1_kl": 0.046630859375, + "k3_kl": 0.031494140625, + "kimi_kl": 0.087890625, + "learning_rate": 1.259e-07, + "loss": 0.0014, + "ppl": 0.0228271484375, + "reward": 0.9947516322135925, + "reward_std": 0.0005706620286218822, + "rewards/perpo_ocr_edit_distance_reward": 0.9947516918182373, + "step": 3741, + "temperature": 0.9 + }, + { + "advantages": -3.320830273878528e-06, + "completion_length": 1360.0, + "delta_ref_entropy_loss": 0.0137939453125, + "delta_ref_ppl": -0.0517578125, + "entropy_loss": -0.1103515625, + "epoch": 0.7484, + "grad_norm": 10.878094114442016, + "k1_kl": 0.051513671875, + "k3_kl": 0.040771484375, + "kimi_kl": 0.078125, + "learning_rate": 1.258e-07, + "loss": 0.0016, + "ppl": 0.052001953125, + "reward": 0.9626017212867737, + "reward_std": 0.012589714489877224, + "rewards/perpo_ocr_edit_distance_reward": 0.9626017808914185, + "step": 3742, + "temperature": 0.9 + }, + { + "advantages": -1.2261527444934472e-05, + "completion_length": 700.0, + "delta_ref_entropy_loss": 0.0279541015625, + "delta_ref_ppl": -0.038330078125, + "entropy_loss": -0.04443359375, + "epoch": 0.7486, + "grad_norm": 0.3947162456897831, + "k1_kl": 0.0380859375, + "k3_kl": 0.02392578125, + "kimi_kl": 0.060791015625, + "learning_rate": 1.257e-07, + "loss": 0.001, + "ppl": 0.017822265625, + "reward": 0.9950206279754639, + "reward_std": 0.0019878847524523735, + "rewards/perpo_ocr_edit_distance_reward": 0.9950206279754639, + "step": 3743, + "temperature": 0.9 + }, + { + "advantages": -2.339908132853452e-05, + "completion_length": 483.0, + "delta_ref_entropy_loss": 0.023193359375, + "delta_ref_ppl": -0.123046875, + "entropy_loss": -0.14453125, + "epoch": 0.7488, + "grad_norm": 1.6121364497487851, + "k1_kl": 0.12353515625, + "k3_kl": 0.08251953125, + "kimi_kl": 0.23046875, + "learning_rate": 1.2559999999999999e-07, + "loss": 0.0033, + "ppl": 0.06298828125, + "reward": 0.8516566753387451, + "reward_std": 0.0028112907893955708, + "rewards/perpo_ocr_edit_distance_reward": 0.8516567349433899, + "step": 3744, + "temperature": 0.9 + }, + { + "advantages": -3.0108862119959667e-05, + "completion_length": 670.0, + "delta_ref_entropy_loss": 0.0115966796875, + "delta_ref_ppl": -0.04638671875, + "entropy_loss": -0.0419921875, + "epoch": 0.749, + "grad_norm": 0.5249067024732965, + "k1_kl": 0.04638671875, + "k3_kl": 0.033447265625, + "kimi_kl": 0.10693359375, + "learning_rate": 1.255e-07, + "loss": 0.0014, + "ppl": 0.010986328125, + "reward": 0.9494797587394714, + "reward_std": 0.0007486490067094564, + "rewards/perpo_ocr_edit_distance_reward": 0.9494798183441162, + "step": 3745, + "temperature": 0.9 + }, + { + "advantages": 1.021793991640152e-06, + "completion_length": 693.0, + "delta_ref_entropy_loss": 0.029296875, + "delta_ref_ppl": -0.10205078125, + "entropy_loss": -0.26171875, + "epoch": 0.7492, + "grad_norm": 2.501477307916114, + "k1_kl": 0.1015625, + "k3_kl": 0.07177734375, + "kimi_kl": 0.1494140625, + "learning_rate": 1.254e-07, + "loss": 0.0029, + "ppl": 0.11328125, + "reward": 0.7638605833053589, + "reward_std": 0.0328478142619133, + "rewards/perpo_ocr_edit_distance_reward": 0.7638605833053589, + "step": 3746, + "temperature": 0.9 + }, + { + "advantages": -4.495893335842993e-06, + "completion_length": 426.0, + "delta_ref_entropy_loss": 0.0155029296875, + "delta_ref_ppl": -0.0400390625, + "entropy_loss": -0.032958984375, + "epoch": 0.7494, + "grad_norm": 0.7201286948685516, + "k1_kl": 0.0400390625, + "k3_kl": 0.0279541015625, + "kimi_kl": 0.0859375, + "learning_rate": 1.253e-07, + "loss": 0.0011, + "ppl": 0.01483154296875, + "reward": 0.9921529293060303, + "reward_std": 0.0017921144608408213, + "rewards/perpo_ocr_edit_distance_reward": 0.9921529293060303, + "step": 3747, + "temperature": 0.9 + }, + { + "advantages": -2.6379313567304052e-05, + "completion_length": 890.0, + "delta_ref_entropy_loss": 0.05419921875, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.06787109375, + "epoch": 0.7496, + "grad_norm": 0.6158450380086389, + "k1_kl": 0.06689453125, + "k3_kl": 0.03662109375, + "kimi_kl": 0.09765625, + "learning_rate": 1.252e-07, + "loss": 0.0015, + "ppl": 0.027099609375, + "reward": 0.9860020875930786, + "reward_std": 0.0005454136990010738, + "rewards/perpo_ocr_edit_distance_reward": 0.9860021471977234, + "step": 3748, + "temperature": 0.9 + }, + { + "advantages": -3.913470936822705e-05, + "completion_length": 766.0, + "delta_ref_entropy_loss": 0.021240234375, + "delta_ref_ppl": -0.04638671875, + "entropy_loss": -0.05029296875, + "epoch": 0.7498, + "grad_norm": 0.4694991133038816, + "k1_kl": 0.04638671875, + "k3_kl": 0.0341796875, + "kimi_kl": 0.095703125, + "learning_rate": 1.2509999999999998e-07, + "loss": 0.0014, + "ppl": 0.018310546875, + "reward": 0.9825645685195923, + "reward_std": 0.000770106038544327, + "rewards/perpo_ocr_edit_distance_reward": 0.9825646281242371, + "step": 3749, + "temperature": 0.9 + }, + { + "advantages": -3.317424489068799e-05, + "completion_length": 183.0, + "delta_ref_entropy_loss": 0.05859375, + "delta_ref_ppl": -0.2109375, + "entropy_loss": -0.091796875, + "epoch": 0.75, + "grad_norm": 1.1411159834925821, + "k1_kl": 0.2109375, + "k3_kl": 0.1533203125, + "kimi_kl": 0.5703125, + "learning_rate": 1.25e-07, + "loss": 0.0062, + "ppl": 0.0390625, + "reward": 0.9859485030174255, + "reward_std": 0.0027249674312770367, + "rewards/perpo_ocr_edit_distance_reward": 0.9859485626220703, + "step": 3750, + "temperature": 0.9 + }, + { + "advantages": -3.274849586887285e-05, + "completion_length": 154.0, + "delta_ref_entropy_loss": 0.040283203125, + "delta_ref_ppl": -0.1494140625, + "entropy_loss": -0.052734375, + "epoch": 0.7502, + "grad_norm": 1.6472498801441577, + "k1_kl": 0.1494140625, + "k3_kl": 0.10888671875, + "kimi_kl": 0.3359375, + "learning_rate": 1.249e-07, + "loss": 0.0044, + "ppl": 0.02490234375, + "reward": 0.9961749315261841, + "reward_std": 0.0027596158906817436, + "rewards/perpo_ocr_edit_distance_reward": 0.9961749911308289, + "step": 3751, + "temperature": 0.9 + }, + { + "advantages": -2.4778502847766504e-05, + "completion_length": 1465.0, + "delta_ref_entropy_loss": 0.0308837890625, + "delta_ref_ppl": -0.048583984375, + "entropy_loss": -0.203125, + "epoch": 0.7504, + "grad_norm": 4.383644855755843, + "k1_kl": 0.04833984375, + "k3_kl": 0.040771484375, + "kimi_kl": 0.054443359375, + "learning_rate": 1.2479999999999998e-07, + "loss": 0.0017, + "ppl": 0.10595703125, + "reward": 0.9014565348625183, + "reward_std": 0.001274973968975246, + "rewards/perpo_ocr_edit_distance_reward": 0.9014565944671631, + "step": 3752, + "temperature": 0.9 + }, + { + "advantages": -6.437302090489538e-06, + "completion_length": 172.0, + "delta_ref_entropy_loss": -0.004608154296875, + "delta_ref_ppl": -0.2490234375, + "entropy_loss": -0.2138671875, + "epoch": 0.7506, + "grad_norm": 2.391508418116304, + "k1_kl": 0.25, + "k3_kl": 0.1904296875, + "kimi_kl": 0.8359375, + "learning_rate": 1.247e-07, + "loss": 0.0076, + "ppl": 0.087890625, + "reward": 0.9027354717254639, + "reward_std": 0.0038796337321400642, + "rewards/perpo_ocr_edit_distance_reward": 0.9027355909347534, + "step": 3753, + "temperature": 0.9 + }, + { + "advantages": -6.256785127334297e-05, + "completion_length": 524.0, + "delta_ref_entropy_loss": 0.0556640625, + "delta_ref_ppl": -0.0859375, + "entropy_loss": -0.08642578125, + "epoch": 0.7508, + "grad_norm": 1.0880254470475657, + "k1_kl": 0.08642578125, + "k3_kl": 0.05615234375, + "kimi_kl": 0.1494140625, + "learning_rate": 1.246e-07, + "loss": 0.0023, + "ppl": 0.032958984375, + "reward": 0.9886845946311951, + "reward_std": 0.0018056079279631376, + "rewards/perpo_ocr_edit_distance_reward": 0.9886847138404846, + "step": 3754, + "temperature": 0.9 + }, + { + "advantages": 8.773804438533261e-05, + "completion_length": 747.0, + "delta_ref_entropy_loss": 0.029052734375, + "delta_ref_ppl": -0.0400390625, + "entropy_loss": -0.02197265625, + "epoch": 0.751, + "grad_norm": 0.06732168280929773, + "k1_kl": 0.040283203125, + "k3_kl": 0.021484375, + "kimi_kl": 0.052001953125, + "learning_rate": 1.2449999999999998e-07, + "loss": 0.0008, + "ppl": 0.00421142578125, + "reward": 0.9992173314094543, + "reward_std": 9.41163016250357e-05, + "rewards/perpo_ocr_edit_distance_reward": 0.9992173910140991, + "step": 3755, + "temperature": 0.9 + }, + { + "advantages": -8.514949634275126e-09, + "completion_length": 866.0, + "delta_ref_entropy_loss": 0.013916015625, + "delta_ref_ppl": -0.0439453125, + "entropy_loss": -0.05322265625, + "epoch": 0.7512, + "grad_norm": 3.7881687493906187, + "k1_kl": 0.0439453125, + "k3_kl": 0.025634765625, + "kimi_kl": 0.09033203125, + "learning_rate": 1.244e-07, + "loss": 0.001, + "ppl": 0.019287109375, + "reward": 0.8375838994979858, + "reward_std": 0.0004649889888241887, + "rewards/perpo_ocr_edit_distance_reward": 0.8375838994979858, + "step": 3756, + "temperature": 0.9 + }, + { + "advantages": -5.3593092161463574e-05, + "completion_length": 639.0, + "delta_ref_entropy_loss": 0.043701171875, + "delta_ref_ppl": -0.053955078125, + "entropy_loss": -0.08056640625, + "epoch": 0.7514, + "grad_norm": 0.7642246944709639, + "k1_kl": 0.05419921875, + "k3_kl": 0.035400390625, + "kimi_kl": 0.078125, + "learning_rate": 1.243e-07, + "loss": 0.0015, + "ppl": 0.03564453125, + "reward": 0.9967479705810547, + "reward_std": 0.0006945111090317369, + "rewards/perpo_ocr_edit_distance_reward": 0.9967480301856995, + "step": 3757, + "temperature": 0.9 + }, + { + "advantages": -6.164823571452871e-05, + "completion_length": 446.0, + "delta_ref_entropy_loss": -0.016357421875, + "delta_ref_ppl": -0.036376953125, + "entropy_loss": -0.11328125, + "epoch": 0.7516, + "grad_norm": 0.5740055877236054, + "k1_kl": 0.036376953125, + "k3_kl": 0.0250244140625, + "kimi_kl": 0.06689453125, + "learning_rate": 1.242e-07, + "loss": 0.0011, + "ppl": 0.0255126953125, + "reward": 0.9640860557556152, + "reward_std": 0.001143394154496491, + "rewards/perpo_ocr_edit_distance_reward": 0.9640861749649048, + "step": 3758, + "temperature": 0.9 + }, + { + "advantages": -2.7588437205849914e-06, + "completion_length": 679.0, + "delta_ref_entropy_loss": 0.050048828125, + "delta_ref_ppl": -0.09619140625, + "entropy_loss": -0.21875, + "epoch": 0.7518, + "grad_norm": 1.6050854239047165, + "k1_kl": 0.09619140625, + "k3_kl": 0.06640625, + "kimi_kl": 0.169921875, + "learning_rate": 1.241e-07, + "loss": 0.0027, + "ppl": 0.1103515625, + "reward": 0.9302188158035278, + "reward_std": 0.018294906243681908, + "rewards/perpo_ocr_edit_distance_reward": 0.9302188754081726, + "step": 3759, + "temperature": 0.9 + }, + { + "advantages": -1.947794771695044e-05, + "completion_length": 476.0, + "delta_ref_entropy_loss": 0.06689453125, + "delta_ref_ppl": -0.0947265625, + "entropy_loss": -0.1845703125, + "epoch": 0.752, + "grad_norm": 1.1668005384838027, + "k1_kl": 0.0947265625, + "k3_kl": 0.058837890625, + "kimi_kl": 0.1416015625, + "learning_rate": 1.24e-07, + "loss": 0.0024, + "ppl": 0.08544921875, + "reward": 0.7520953416824341, + "reward_std": 0.0020858589559793472, + "rewards/perpo_ocr_edit_distance_reward": 0.7520954012870789, + "step": 3760, + "temperature": 0.9 + }, + { + "advantages": -3.0909268389223143e-05, + "completion_length": 771.0, + "delta_ref_entropy_loss": 0.0146484375, + "delta_ref_ppl": -0.047119140625, + "entropy_loss": -0.0732421875, + "epoch": 0.7522, + "grad_norm": 1.2306988504375775, + "k1_kl": 0.04736328125, + "k3_kl": 0.0294189453125, + "kimi_kl": 0.0703125, + "learning_rate": 1.239e-07, + "loss": 0.0012, + "ppl": 0.0262451171875, + "reward": 0.9944700002670288, + "reward_std": 0.0021044358145445585, + "rewards/perpo_ocr_edit_distance_reward": 0.9944701194763184, + "step": 3761, + "temperature": 0.9 + }, + { + "advantages": -1.0643687346600927e-05, + "completion_length": 1157.0, + "delta_ref_entropy_loss": 0.0230712890625, + "delta_ref_ppl": -0.036865234375, + "entropy_loss": -0.08740234375, + "epoch": 0.7524, + "grad_norm": 0.9270145141929174, + "k1_kl": 0.036865234375, + "k3_kl": 0.025146484375, + "kimi_kl": 0.0478515625, + "learning_rate": 1.2379999999999998e-07, + "loss": 0.001, + "ppl": 0.044677734375, + "reward": 0.9908735156059265, + "reward_std": 0.0038994878996163607, + "rewards/perpo_ocr_edit_distance_reward": 0.9908735752105713, + "step": 3762, + "temperature": 0.9 + }, + { + "advantages": -1.8903187992691528e-06, + "completion_length": 625.0, + "delta_ref_entropy_loss": 0.00174713134765625, + "delta_ref_ppl": -0.06787109375, + "entropy_loss": -0.111328125, + "epoch": 0.7526, + "grad_norm": 0.5684481910550364, + "k1_kl": 0.06787109375, + "k3_kl": 0.052001953125, + "kimi_kl": 0.1337890625, + "learning_rate": 1.237e-07, + "loss": 0.0021, + "ppl": 0.048583984375, + "reward": 0.9741415977478027, + "reward_std": 0.00443923519924283, + "rewards/perpo_ocr_edit_distance_reward": 0.9741415977478027, + "step": 3763, + "temperature": 0.9 + }, + { + "advantages": -2.6932786568067968e-05, + "completion_length": 525.0, + "delta_ref_entropy_loss": 0.11865234375, + "delta_ref_ppl": -0.1474609375, + "entropy_loss": -0.40234375, + "epoch": 0.7528, + "grad_norm": 2.533485710935382, + "k1_kl": 0.146484375, + "k3_kl": 0.095703125, + "kimi_kl": 0.2255859375, + "learning_rate": 1.236e-07, + "loss": 0.0038, + "ppl": 0.19921875, + "reward": 0.9165683388710022, + "reward_std": 0.004324935842305422, + "rewards/perpo_ocr_edit_distance_reward": 0.916568398475647, + "step": 3764, + "temperature": 0.9 + }, + { + "advantages": -6.462846795329824e-05, + "completion_length": 1013.0, + "delta_ref_entropy_loss": 0.04736328125, + "delta_ref_ppl": -0.064453125, + "entropy_loss": -0.0537109375, + "epoch": 0.753, + "grad_norm": 1.0397689749839707, + "k1_kl": 0.064453125, + "k3_kl": 0.03466796875, + "kimi_kl": 0.0830078125, + "learning_rate": 1.235e-07, + "loss": 0.0014, + "ppl": 0.019287109375, + "reward": 0.6445516347885132, + "reward_std": 0.0006905616028234363, + "rewards/perpo_ocr_edit_distance_reward": 0.644551694393158, + "step": 3765, + "temperature": 0.9 + }, + { + "advantages": -6.0473172197816893e-05, + "completion_length": 783.0, + "delta_ref_entropy_loss": 0.03759765625, + "delta_ref_ppl": -0.0771484375, + "entropy_loss": -0.0302734375, + "epoch": 0.7532, + "grad_norm": 0.3163954006155483, + "k1_kl": 0.0771484375, + "k3_kl": 0.052734375, + "kimi_kl": 0.1953125, + "learning_rate": 1.2339999999999998e-07, + "loss": 0.0022, + "ppl": 0.01043701171875, + "reward": 0.9972450137138367, + "reward_std": 0.00032233507954515517, + "rewards/perpo_ocr_edit_distance_reward": 0.9972450137138367, + "step": 3766, + "temperature": 0.9 + }, + { + "advantages": 8.089202196970291e-07, + "completion_length": 743.0, + "delta_ref_entropy_loss": -0.029052734375, + "delta_ref_ppl": -0.09619140625, + "entropy_loss": -0.72265625, + "epoch": 0.7534, + "grad_norm": 2.8450998608473768, + "k1_kl": 0.09716796875, + "k3_kl": 0.09033203125, + "kimi_kl": 0.15234375, + "learning_rate": 1.233e-07, + "loss": 0.0036, + "ppl": 0.35546875, + "reward": 0.3694974482059479, + "reward_std": 0.010286315344274044, + "rewards/perpo_ocr_edit_distance_reward": 0.3694974482059479, + "step": 3767, + "temperature": 0.9 + }, + { + "advantages": -6.011554432916455e-06, + "completion_length": 888.0, + "delta_ref_entropy_loss": 0.04833984375, + "delta_ref_ppl": -0.0634765625, + "entropy_loss": -0.05322265625, + "epoch": 0.7536, + "grad_norm": 0.7029731105387735, + "k1_kl": 0.0634765625, + "k3_kl": 0.039794921875, + "kimi_kl": 0.1494140625, + "learning_rate": 1.232e-07, + "loss": 0.0016, + "ppl": 0.02294921875, + "reward": 0.9912399053573608, + "reward_std": 0.0013145992998033762, + "rewards/perpo_ocr_edit_distance_reward": 0.9912399053573608, + "step": 3768, + "temperature": 0.9 + }, + { + "advantages": -1.3964516938358429e-06, + "completion_length": 627.0, + "delta_ref_entropy_loss": 0.036376953125, + "delta_ref_ppl": -0.068359375, + "entropy_loss": -0.126953125, + "epoch": 0.7538, + "grad_norm": 2.1503137308924405, + "k1_kl": 0.068359375, + "k3_kl": 0.04638671875, + "kimi_kl": 0.1181640625, + "learning_rate": 1.2309999999999998e-07, + "loss": 0.0019, + "ppl": 0.057373046875, + "reward": 0.9726420640945435, + "reward_std": 0.061699360609054565, + "rewards/perpo_ocr_edit_distance_reward": 0.9726421236991882, + "step": 3769, + "temperature": 0.9 + }, + { + "advantages": -3.779786129598506e-05, + "completion_length": 902.0, + "delta_ref_entropy_loss": 0.047607421875, + "delta_ref_ppl": -0.0810546875, + "entropy_loss": -0.0693359375, + "epoch": 0.754, + "grad_norm": 0.49800732720810265, + "k1_kl": 0.0810546875, + "k3_kl": 0.04736328125, + "kimi_kl": 0.12109375, + "learning_rate": 1.23e-07, + "loss": 0.0019, + "ppl": 0.0283203125, + "reward": 0.9908905029296875, + "reward_std": 0.0014775675954297185, + "rewards/perpo_ocr_edit_distance_reward": 0.9908905625343323, + "step": 3770, + "temperature": 0.9 + }, + { + "advantages": -7.561275197076611e-06, + "completion_length": 330.0, + "delta_ref_entropy_loss": 0.0279541015625, + "delta_ref_ppl": -0.12060546875, + "entropy_loss": -0.0517578125, + "epoch": 0.7542, + "grad_norm": 0.639102204396539, + "k1_kl": 0.12109375, + "k3_kl": 0.0908203125, + "kimi_kl": 0.40625, + "learning_rate": 1.229e-07, + "loss": 0.0036, + "ppl": 0.018798828125, + "reward": 0.9942272305488586, + "reward_std": 0.0021480133291333914, + "rewards/perpo_ocr_edit_distance_reward": 0.9942272901535034, + "step": 3771, + "temperature": 0.9 + }, + { + "advantages": -4.00543212890625e-05, + "completion_length": 841.0, + "delta_ref_entropy_loss": 0.018798828125, + "delta_ref_ppl": -0.030029296875, + "entropy_loss": -0.0301513671875, + "epoch": 0.7544, + "grad_norm": 0.40705202935303453, + "k1_kl": 0.0301513671875, + "k3_kl": 0.016357421875, + "kimi_kl": 0.03759765625, + "learning_rate": 1.228e-07, + "loss": 0.0007, + "ppl": 0.0101318359375, + "reward": 0.9927770495414734, + "reward_std": 0.0005380490329116583, + "rewards/perpo_ocr_edit_distance_reward": 0.9927771091461182, + "step": 3772, + "temperature": 0.9 + }, + { + "advantages": -8.310590601467993e-06, + "completion_length": 709.0, + "delta_ref_entropy_loss": 0.055908203125, + "delta_ref_ppl": -0.0771484375, + "entropy_loss": -0.142578125, + "epoch": 0.7546, + "grad_norm": 2.5146168332128975, + "k1_kl": 0.07666015625, + "k3_kl": 0.052734375, + "kimi_kl": 0.140625, + "learning_rate": 1.227e-07, + "loss": 0.0021, + "ppl": 0.0654296875, + "reward": 0.9609957337379456, + "reward_std": 0.0019443010678514838, + "rewards/perpo_ocr_edit_distance_reward": 0.9609957337379456, + "step": 3773, + "temperature": 0.9 + }, + { + "advantages": 2.207074976467993e-05, + "completion_length": 285.0, + "delta_ref_entropy_loss": 0.04150390625, + "delta_ref_ppl": -0.08984375, + "entropy_loss": -0.05908203125, + "epoch": 0.7548, + "grad_norm": 0.6552299442691257, + "k1_kl": 0.08984375, + "k3_kl": 0.0654296875, + "kimi_kl": 0.201171875, + "learning_rate": 1.226e-07, + "loss": 0.0026, + "ppl": 0.0216064453125, + "reward": 0.8341497778892517, + "reward_std": 0.0006720171659253538, + "rewards/perpo_ocr_edit_distance_reward": 0.8341497778892517, + "step": 3774, + "temperature": 0.9 + }, + { + "advantages": -0.00020431620941963047, + "completion_length": 652.0, + "delta_ref_entropy_loss": 0.039306640625, + "delta_ref_ppl": -0.058349609375, + "entropy_loss": -0.0203857421875, + "epoch": 0.755, + "grad_norm": 0.26544898318518423, + "k1_kl": 0.058349609375, + "k3_kl": 0.0341796875, + "kimi_kl": 0.08740234375, + "learning_rate": 1.225e-07, + "loss": 0.0016, + "ppl": 0.005126953125, + "reward": 0.9993996024131775, + "reward_std": 0.0004834221617784351, + "rewards/perpo_ocr_edit_distance_reward": 0.9993996620178223, + "step": 3775, + "temperature": 0.9 + }, + { + "advantages": -0.00020650881924666464, + "completion_length": 1081.0, + "delta_ref_entropy_loss": 0.0272216796875, + "delta_ref_ppl": -0.0272216796875, + "entropy_loss": -0.02880859375, + "epoch": 0.7552, + "grad_norm": 0.6148332696948454, + "k1_kl": 0.0272216796875, + "k3_kl": 0.0120849609375, + "kimi_kl": 0.02880859375, + "learning_rate": 1.2239999999999998e-07, + "loss": 0.0007, + "ppl": 0.00689697265625, + "reward": 0.9987684488296509, + "reward_std": 0.00027106504421681166, + "rewards/perpo_ocr_edit_distance_reward": 0.9987685680389404, + "step": 3776, + "temperature": 0.9 + }, + { + "advantages": -1.3027872682869202e-06, + "completion_length": 672.0, + "delta_ref_entropy_loss": 0.0556640625, + "delta_ref_ppl": -0.061767578125, + "entropy_loss": -0.146484375, + "epoch": 0.7554, + "grad_norm": 1.1789640295737396, + "k1_kl": 0.06201171875, + "k3_kl": 0.03759765625, + "kimi_kl": 0.0810546875, + "learning_rate": 1.223e-07, + "loss": 0.0015, + "ppl": 0.0654296875, + "reward": 0.7663345336914062, + "reward_std": 0.006403573323041201, + "rewards/perpo_ocr_edit_distance_reward": 0.766334593296051, + "step": 3777, + "temperature": 0.9 + }, + { + "advantages": -1.6399793821619824e-05, + "completion_length": 915.0, + "delta_ref_entropy_loss": 0.0186767578125, + "delta_ref_ppl": -0.039794921875, + "entropy_loss": -0.09619140625, + "epoch": 0.7556, + "grad_norm": 1.1979953350749135, + "k1_kl": 0.039794921875, + "k3_kl": 0.029052734375, + "kimi_kl": 0.0615234375, + "learning_rate": 1.222e-07, + "loss": 0.0012, + "ppl": 0.04345703125, + "reward": 0.9675832986831665, + "reward_std": 0.0014576215762645006, + "rewards/perpo_ocr_edit_distance_reward": 0.967583417892456, + "step": 3778, + "temperature": 0.9 + }, + { + "advantages": -7.18832015991211e-05, + "completion_length": 912.0, + "delta_ref_entropy_loss": 0.04736328125, + "delta_ref_ppl": -0.056640625, + "entropy_loss": -0.044677734375, + "epoch": 0.7558, + "grad_norm": 1.7940478121932684, + "k1_kl": 0.056640625, + "k3_kl": 0.0281982421875, + "kimi_kl": 0.06298828125, + "learning_rate": 1.221e-07, + "loss": 0.0012, + "ppl": 0.02197265625, + "reward": 0.9919780492782593, + "reward_std": 0.001321584451943636, + "rewards/perpo_ocr_edit_distance_reward": 0.9919781684875488, + "step": 3779, + "temperature": 0.9 + }, + { + "advantages": -3.2356808787881164e-07, + "completion_length": 777.0, + "delta_ref_entropy_loss": -0.0274658203125, + "delta_ref_ppl": -0.059326171875, + "entropy_loss": -0.1953125, + "epoch": 0.756, + "grad_norm": 2.4908421970939996, + "k1_kl": 0.059326171875, + "k3_kl": 0.06982421875, + "kimi_kl": 0.1474609375, + "learning_rate": 1.2199999999999998e-07, + "loss": 0.0028, + "ppl": 0.080078125, + "reward": 0.6743637919425964, + "reward_std": 0.1473783552646637, + "rewards/perpo_ocr_edit_distance_reward": 0.6743638515472412, + "step": 3780, + "temperature": 0.9 + }, + { + "advantages": -7.722208101768047e-05, + "completion_length": 641.0, + "delta_ref_entropy_loss": 0.037353515625, + "delta_ref_ppl": -0.06494140625, + "entropy_loss": -0.03759765625, + "epoch": 0.7562, + "grad_norm": 0.3778677243836941, + "k1_kl": 0.0654296875, + "k3_kl": 0.040283203125, + "kimi_kl": 0.12158203125, + "learning_rate": 1.219e-07, + "loss": 0.0017, + "ppl": 0.01220703125, + "reward": 0.9966525435447693, + "reward_std": 0.0006717305514030159, + "rewards/perpo_ocr_edit_distance_reward": 0.9966526031494141, + "step": 3781, + "temperature": 0.9 + }, + { + "advantages": -2.0861627490376122e-05, + "completion_length": 416.0, + "delta_ref_entropy_loss": 0.080078125, + "delta_ref_ppl": -0.09375, + "entropy_loss": -0.052978515625, + "epoch": 0.7564, + "grad_norm": 0.28673666159565764, + "k1_kl": 0.09375, + "k3_kl": 0.05810546875, + "kimi_kl": 0.1787109375, + "learning_rate": 1.218e-07, + "loss": 0.0023, + "ppl": 0.0137939453125, + "reward": 0.9914575219154358, + "reward_std": 0.0003082666080445051, + "rewards/perpo_ocr_edit_distance_reward": 0.9914575815200806, + "step": 3782, + "temperature": 0.9 + }, + { + "advantages": -4.783698750543408e-05, + "completion_length": 799.0, + "delta_ref_entropy_loss": 0.0546875, + "delta_ref_ppl": -0.08056640625, + "entropy_loss": -0.146484375, + "epoch": 0.7566, + "grad_norm": 1.188888388888365, + "k1_kl": 0.08056640625, + "k3_kl": 0.048095703125, + "kimi_kl": 0.1640625, + "learning_rate": 1.2169999999999998e-07, + "loss": 0.002, + "ppl": 0.06787109375, + "reward": 0.9735403060913086, + "reward_std": 0.0011458642547950149, + "rewards/perpo_ocr_edit_distance_reward": 0.9735403656959534, + "step": 3783, + "temperature": 0.9 + }, + { + "advantages": -1.284054360439768e-05, + "completion_length": 578.0, + "delta_ref_entropy_loss": 0.07470703125, + "delta_ref_ppl": -0.09228515625, + "entropy_loss": -0.06494140625, + "epoch": 0.7568, + "grad_norm": 0.8105076637246231, + "k1_kl": 0.0927734375, + "k3_kl": 0.05908203125, + "kimi_kl": 0.189453125, + "learning_rate": 1.216e-07, + "loss": 0.0024, + "ppl": 0.02099609375, + "reward": 0.9940344095230103, + "reward_std": 0.001226235181093216, + "rewards/perpo_ocr_edit_distance_reward": 0.9940344095230103, + "step": 3784, + "temperature": 0.9 + }, + { + "advantages": -0.00010604518320178613, + "completion_length": 244.0, + "delta_ref_entropy_loss": 0.01080322265625, + "delta_ref_ppl": -0.09716796875, + "entropy_loss": -0.041015625, + "epoch": 0.757, + "grad_norm": 0.47380653291641917, + "k1_kl": 0.09716796875, + "k3_kl": 0.072265625, + "kimi_kl": 0.259765625, + "learning_rate": 1.215e-07, + "loss": 0.003, + "ppl": 0.0166015625, + "reward": 0.9466848373413086, + "reward_std": 0.0003014173707924783, + "rewards/perpo_ocr_edit_distance_reward": 0.9466849565505981, + "step": 3785, + "temperature": 0.9 + }, + { + "advantages": -1.8409320546197705e-05, + "completion_length": 143.0, + "delta_ref_entropy_loss": 0.03173828125, + "delta_ref_ppl": -0.1728515625, + "entropy_loss": -0.06298828125, + "epoch": 0.7572, + "grad_norm": 1.306490864468825, + "k1_kl": 0.1728515625, + "k3_kl": 0.1298828125, + "kimi_kl": 0.47265625, + "learning_rate": 1.214e-07, + "loss": 0.0052, + "ppl": 0.0281982421875, + "reward": 0.9777820706367493, + "reward_std": 0.0022176753263920546, + "rewards/perpo_ocr_edit_distance_reward": 0.977782130241394, + "step": 3786, + "temperature": 0.9 + }, + { + "advantages": -4.077438643435016e-05, + "completion_length": 116.0, + "delta_ref_entropy_loss": 0.0250244140625, + "delta_ref_ppl": -0.2578125, + "entropy_loss": -0.1240234375, + "epoch": 0.7574, + "grad_norm": 1.454298092544686, + "k1_kl": 0.2578125, + "k3_kl": 0.2138671875, + "kimi_kl": 0.88671875, + "learning_rate": 1.213e-07, + "loss": 0.0086, + "ppl": 0.0419921875, + "reward": 0.9764150381088257, + "reward_std": 0.0013616869691759348, + "rewards/perpo_ocr_edit_distance_reward": 0.9764151573181152, + "step": 3787, + "temperature": 0.9 + }, + { + "advantages": -6.130763722467236e-06, + "completion_length": 930.0, + "delta_ref_entropy_loss": 0.005218505859375, + "delta_ref_ppl": -0.0400390625, + "entropy_loss": -0.06591796875, + "epoch": 0.7576, + "grad_norm": 0.5685585105208291, + "k1_kl": 0.040283203125, + "k3_kl": 0.0269775390625, + "kimi_kl": 0.0625, + "learning_rate": 1.212e-07, + "loss": 0.0011, + "ppl": 0.0247802734375, + "reward": 0.9791260957717896, + "reward_std": 0.0026854805182665586, + "rewards/perpo_ocr_edit_distance_reward": 0.9791261553764343, + "step": 3788, + "temperature": 0.9 + }, + { + "advantages": -8.387225420847244e-07, + "completion_length": 1186.0, + "delta_ref_entropy_loss": 0.01129150390625, + "delta_ref_ppl": -0.057861328125, + "entropy_loss": -0.1220703125, + "epoch": 0.7578, + "grad_norm": 6.650433938894017, + "k1_kl": 0.057861328125, + "k3_kl": 0.0419921875, + "kimi_kl": 0.1298828125, + "learning_rate": 1.211e-07, + "loss": 0.0017, + "ppl": 0.052001953125, + "reward": 0.9042596220970154, + "reward_std": 0.04018401354551315, + "rewards/perpo_ocr_edit_distance_reward": 0.9042597413063049, + "step": 3789, + "temperature": 0.9 + }, + { + "advantages": -3.1607494747731835e-05, + "completion_length": 894.0, + "delta_ref_entropy_loss": 0.05615234375, + "delta_ref_ppl": -0.0615234375, + "entropy_loss": -0.03515625, + "epoch": 0.758, + "grad_norm": 0.32153054961386823, + "k1_kl": 0.0615234375, + "k3_kl": 0.034423828125, + "kimi_kl": 0.09521484375, + "learning_rate": 1.2099999999999998e-07, + "loss": 0.0014, + "ppl": 0.0123291015625, + "reward": 0.9951277375221252, + "reward_std": 0.0007079889765009284, + "rewards/perpo_ocr_edit_distance_reward": 0.9951277375221252, + "step": 3790, + "temperature": 0.9 + }, + { + "advantages": -7.769039802951738e-05, + "completion_length": 568.0, + "delta_ref_entropy_loss": 0.03857421875, + "delta_ref_ppl": -0.08984375, + "entropy_loss": -0.07861328125, + "epoch": 0.7582, + "grad_norm": 0.6711144548065465, + "k1_kl": 0.08984375, + "k3_kl": 0.0625, + "kimi_kl": 0.2373046875, + "learning_rate": 1.2089999999999998e-07, + "loss": 0.0026, + "ppl": 0.029052734375, + "reward": 0.7139476537704468, + "reward_std": 0.0013246851740404963, + "rewards/perpo_ocr_edit_distance_reward": 0.7139478325843811, + "step": 3791, + "temperature": 0.9 + }, + { + "advantages": -6.948198915779358e-06, + "completion_length": 98.0, + "delta_ref_entropy_loss": 0.09912109375, + "delta_ref_ppl": -0.2890625, + "entropy_loss": -0.1337890625, + "epoch": 0.7584, + "grad_norm": 2.3668542982560634, + "k1_kl": 0.2890625, + "k3_kl": 0.2158203125, + "kimi_kl": 0.81640625, + "learning_rate": 1.208e-07, + "loss": 0.0086, + "ppl": 0.0634765625, + "reward": 0.991431713104248, + "reward_std": 0.006025122012943029, + "rewards/perpo_ocr_edit_distance_reward": 0.991431713104248, + "step": 3792, + "temperature": 0.9 + }, + { + "advantages": -0.00017150811618193984, + "completion_length": 835.0, + "delta_ref_entropy_loss": 0.03662109375, + "delta_ref_ppl": -0.041015625, + "entropy_loss": -0.0284423828125, + "epoch": 0.7586, + "grad_norm": 0.20713392736228306, + "k1_kl": 0.041015625, + "k3_kl": 0.02197265625, + "kimi_kl": 0.06298828125, + "learning_rate": 1.207e-07, + "loss": 0.0011, + "ppl": 0.0078125, + "reward": 0.9980927109718323, + "reward_std": 0.0001978897926164791, + "rewards/perpo_ocr_edit_distance_reward": 0.998092770576477, + "step": 3793, + "temperature": 0.9 + }, + { + "advantages": -3.448554707574658e-06, + "completion_length": 566.0, + "delta_ref_entropy_loss": 0.08056640625, + "delta_ref_ppl": -0.10546875, + "entropy_loss": -0.0947265625, + "epoch": 0.7588, + "grad_norm": 1.1200467645130312, + "k1_kl": 0.10498046875, + "k3_kl": 0.0634765625, + "kimi_kl": 0.166015625, + "learning_rate": 1.2059999999999998e-07, + "loss": 0.0026, + "ppl": 0.037109375, + "reward": 0.9075462818145752, + "reward_std": 0.007318785414099693, + "rewards/perpo_ocr_edit_distance_reward": 0.90754634141922, + "step": 3794, + "temperature": 0.9 + }, + { + "advantages": 7.76563410909148e-06, + "completion_length": 247.0, + "delta_ref_entropy_loss": 0.0458984375, + "delta_ref_ppl": -0.10986328125, + "entropy_loss": -0.10107421875, + "epoch": 0.759, + "grad_norm": 2.0409458082249263, + "k1_kl": 0.10986328125, + "k3_kl": 0.07861328125, + "kimi_kl": 0.205078125, + "learning_rate": 1.205e-07, + "loss": 0.0031, + "ppl": 0.045166015625, + "reward": 0.9772919416427612, + "reward_std": 0.004300492350012064, + "rewards/perpo_ocr_edit_distance_reward": 0.9772919416427612, + "step": 3795, + "temperature": 0.9 + }, + { + "advantages": -9.559733734931797e-05, + "completion_length": 637.0, + "delta_ref_entropy_loss": 0.05322265625, + "delta_ref_ppl": -0.0654296875, + "entropy_loss": -0.04931640625, + "epoch": 0.7592, + "grad_norm": 0.5476898987260309, + "k1_kl": 0.0654296875, + "k3_kl": 0.0380859375, + "kimi_kl": 0.10205078125, + "learning_rate": 1.204e-07, + "loss": 0.0016, + "ppl": 0.0145263671875, + "reward": 0.9902572631835938, + "reward_std": 0.00043443756294436753, + "rewards/perpo_ocr_edit_distance_reward": 0.9902573227882385, + "step": 3796, + "temperature": 0.9 + }, + { + "advantages": 7.833753443264868e-06, + "completion_length": 173.0, + "delta_ref_entropy_loss": 0.056396484375, + "delta_ref_ppl": -0.169921875, + "entropy_loss": -0.09130859375, + "epoch": 0.7594, + "grad_norm": 1.8341204198216625, + "k1_kl": 0.1689453125, + "k3_kl": 0.125, + "kimi_kl": 0.408203125, + "learning_rate": 1.203e-07, + "loss": 0.005, + "ppl": 0.03564453125, + "reward": 0.863519012928009, + "reward_std": 0.003179667517542839, + "rewards/perpo_ocr_edit_distance_reward": 0.863519012928009, + "step": 3797, + "temperature": 0.9 + }, + { + "advantages": -1.2908663848065771e-05, + "completion_length": 135.0, + "delta_ref_entropy_loss": 0.058837890625, + "delta_ref_ppl": -0.240234375, + "entropy_loss": -0.07568359375, + "epoch": 0.7596, + "grad_norm": 1.2232329600608176, + "k1_kl": 0.240234375, + "k3_kl": 0.18359375, + "kimi_kl": 0.98046875, + "learning_rate": 1.202e-07, + "loss": 0.0073, + "ppl": 0.025390625, + "reward": 0.9317525029182434, + "reward_std": 0.0025375825352966785, + "rewards/perpo_ocr_edit_distance_reward": 0.9317525029182434, + "step": 3798, + "temperature": 0.9 + }, + { + "advantages": -6.658690836047754e-05, + "completion_length": 170.0, + "delta_ref_entropy_loss": 0.028076171875, + "delta_ref_ppl": -0.1728515625, + "entropy_loss": -0.0693359375, + "epoch": 0.7598, + "grad_norm": 0.7385557583863566, + "k1_kl": 0.173828125, + "k3_kl": 0.1484375, + "kimi_kl": 0.62890625, + "learning_rate": 1.201e-07, + "loss": 0.006, + "ppl": 0.0296630859375, + "reward": 0.9917354583740234, + "reward_std": 0.0007952444138936698, + "rewards/perpo_ocr_edit_distance_reward": 0.991735577583313, + "step": 3799, + "temperature": 0.9 + }, + { + "advantages": -7.987022399902344e-06, + "completion_length": 63.0, + "delta_ref_entropy_loss": 0.08447265625, + "delta_ref_ppl": -0.32421875, + "entropy_loss": -0.314453125, + "epoch": 0.76, + "grad_norm": 4.142921998203069, + "k1_kl": 0.32421875, + "k3_kl": 0.232421875, + "kimi_kl": 0.63671875, + "learning_rate": 1.2e-07, + "loss": 0.0093, + "ppl": 0.10205078125, + "reward": 0.938377857208252, + "reward_std": 0.0041678776033222675, + "rewards/perpo_ocr_edit_distance_reward": 0.938377857208252, + "step": 3800, + "temperature": 0.9 + }, + { + "advantages": -4.8339370550820604e-05, + "completion_length": 242.0, + "delta_ref_entropy_loss": 0.05322265625, + "delta_ref_ppl": -0.1875, + "entropy_loss": -0.076171875, + "epoch": 0.7602, + "grad_norm": 0.9930400881898709, + "k1_kl": 0.1875, + "k3_kl": 0.146484375, + "kimi_kl": 0.57421875, + "learning_rate": 1.199e-07, + "loss": 0.0059, + "ppl": 0.0302734375, + "reward": 0.9251532554626465, + "reward_std": 0.0014847067650407553, + "rewards/perpo_ocr_edit_distance_reward": 0.9251533150672913, + "step": 3801, + "temperature": 0.9 + }, + { + "advantages": 1.2261527444934472e-05, + "completion_length": 1122.0, + "delta_ref_entropy_loss": 0.0087890625, + "delta_ref_ppl": -0.0301513671875, + "entropy_loss": -0.08447265625, + "epoch": 0.7604, + "grad_norm": 0.9022004838330215, + "k1_kl": 0.0302734375, + "k3_kl": 0.022216796875, + "kimi_kl": 0.06396484375, + "learning_rate": 1.198e-07, + "loss": 0.0009, + "ppl": 0.037841796875, + "reward": 0.9870855808258057, + "reward_std": 0.0019883017521351576, + "rewards/perpo_ocr_edit_distance_reward": 0.9870856404304504, + "step": 3802, + "temperature": 0.9 + }, + { + "advantages": -7.891654968261719e-05, + "completion_length": 590.0, + "delta_ref_entropy_loss": 0.049072265625, + "delta_ref_ppl": -0.064453125, + "entropy_loss": -0.0693359375, + "epoch": 0.7606, + "grad_norm": 0.5698734226930374, + "k1_kl": 0.06494140625, + "k3_kl": 0.0380859375, + "kimi_kl": 0.10595703125, + "learning_rate": 1.197e-07, + "loss": 0.0016, + "ppl": 0.0252685546875, + "reward": 0.988258957862854, + "reward_std": 0.0005473795463331044, + "rewards/perpo_ocr_edit_distance_reward": 0.9882590770721436, + "step": 3803, + "temperature": 0.9 + }, + { + "advantages": -1.3623919414840202e-07, + "completion_length": 386.0, + "delta_ref_entropy_loss": 0.024169921875, + "delta_ref_ppl": -0.09765625, + "entropy_loss": -0.17578125, + "epoch": 0.7608, + "grad_norm": 1.8004267117781878, + "k1_kl": 0.09765625, + "k3_kl": 0.06494140625, + "kimi_kl": 0.2236328125, + "learning_rate": 1.1959999999999999e-07, + "loss": 0.0026, + "ppl": 0.0673828125, + "reward": 0.7831132411956787, + "reward_std": 0.07105609029531479, + "rewards/perpo_ocr_edit_distance_reward": 0.7831132411956787, + "step": 3804, + "temperature": 0.9 + }, + { + "advantages": -4.967621498508379e-05, + "completion_length": 667.0, + "delta_ref_entropy_loss": 0.0458984375, + "delta_ref_ppl": -0.07421875, + "entropy_loss": -0.027099609375, + "epoch": 0.761, + "grad_norm": 0.4013241670447762, + "k1_kl": 0.07421875, + "k3_kl": 0.048583984375, + "kimi_kl": 0.177734375, + "learning_rate": 1.1949999999999998e-07, + "loss": 0.002, + "ppl": 0.0084228515625, + "reward": 0.996357262134552, + "reward_std": 0.0005854789051227272, + "rewards/perpo_ocr_edit_distance_reward": 0.9963573217391968, + "step": 3805, + "temperature": 0.9 + }, + { + "advantages": -1.44754142183956e-06, + "completion_length": 81.0, + "delta_ref_entropy_loss": -0.3203125, + "delta_ref_ppl": -0.43359375, + "entropy_loss": -1.0859375, + "epoch": 0.7612, + "grad_norm": 7.07945176595189, + "k1_kl": 0.43359375, + "k3_kl": 0.390625, + "kimi_kl": 1.953125, + "learning_rate": 1.194e-07, + "loss": 0.0156, + "ppl": 0.451171875, + "reward": 0.2280634492635727, + "reward_std": 0.014459305442869663, + "rewards/perpo_ocr_edit_distance_reward": 0.2280634641647339, + "step": 3806, + "temperature": 0.9 + }, + { + "advantages": -2.2309168343781494e-05, + "completion_length": 397.0, + "delta_ref_entropy_loss": 0.033935546875, + "delta_ref_ppl": -0.0771484375, + "entropy_loss": -0.05322265625, + "epoch": 0.7614, + "grad_norm": 0.6622984165927933, + "k1_kl": 0.07666015625, + "k3_kl": 0.045654296875, + "kimi_kl": 0.1298828125, + "learning_rate": 1.193e-07, + "loss": 0.0018, + "ppl": 0.0140380859375, + "reward": 0.9978933930397034, + "reward_std": 0.0006635368335992098, + "rewards/perpo_ocr_edit_distance_reward": 0.9978934526443481, + "step": 3807, + "temperature": 0.9 + }, + { + "advantages": -2.009528088819934e-06, + "completion_length": 823.0, + "delta_ref_entropy_loss": 0.025146484375, + "delta_ref_ppl": -0.05859375, + "entropy_loss": -0.05419921875, + "epoch": 0.7616, + "grad_norm": 0.37872295709782816, + "k1_kl": 0.058349609375, + "k3_kl": 0.034423828125, + "kimi_kl": 0.08447265625, + "learning_rate": 1.192e-07, + "loss": 0.0014, + "ppl": 0.0196533203125, + "reward": 0.039177075028419495, + "reward_std": 0.00016398975276388228, + "rewards/perpo_ocr_edit_distance_reward": 0.03917707875370979, + "step": 3808, + "temperature": 0.9 + }, + { + "advantages": -8.102825813693926e-05, + "completion_length": 473.0, + "delta_ref_entropy_loss": 0.0233154296875, + "delta_ref_ppl": -0.0556640625, + "entropy_loss": -0.03271484375, + "epoch": 0.7618, + "grad_norm": 0.3493755768765612, + "k1_kl": 0.0556640625, + "k3_kl": 0.041748046875, + "kimi_kl": 0.1513671875, + "learning_rate": 1.1909999999999999e-07, + "loss": 0.0018, + "ppl": 0.01129150390625, + "reward": 0.9981763362884521, + "reward_std": 0.00042539156856946647, + "rewards/perpo_ocr_edit_distance_reward": 0.9981763362884521, + "step": 3809, + "temperature": 0.9 + }, + { + "advantages": -7.482937508029863e-05, + "completion_length": 458.0, + "delta_ref_entropy_loss": 0.03125, + "delta_ref_ppl": -0.07421875, + "entropy_loss": -0.052001953125, + "epoch": 0.762, + "grad_norm": 0.39427541068097866, + "k1_kl": 0.07470703125, + "k3_kl": 0.050537109375, + "kimi_kl": 0.1337890625, + "learning_rate": 1.19e-07, + "loss": 0.0021, + "ppl": 0.0177001953125, + "reward": 0.9749240279197693, + "reward_std": 0.0005825560656376183, + "rewards/perpo_ocr_edit_distance_reward": 0.9749240875244141, + "step": 3810, + "temperature": 0.9 + }, + { + "advantages": -1.7029899268550253e-08, + "completion_length": 201.0, + "delta_ref_entropy_loss": 0.01153564453125, + "delta_ref_ppl": -0.162109375, + "entropy_loss": -0.0537109375, + "epoch": 0.7622, + "grad_norm": 0.4789803993651014, + "k1_kl": 0.162109375, + "k3_kl": 0.130859375, + "kimi_kl": 0.60546875, + "learning_rate": 1.189e-07, + "loss": 0.0052, + "ppl": 0.01708984375, + "reward": 0.8742206692695618, + "reward_std": 0.0007465059170499444, + "rewards/perpo_ocr_edit_distance_reward": 0.8742206692695618, + "step": 3811, + "temperature": 0.9 + }, + { + "advantages": -1.253400569112273e-05, + "completion_length": 94.0, + "delta_ref_entropy_loss": 0.0284423828125, + "delta_ref_ppl": -0.349609375, + "entropy_loss": -0.1318359375, + "epoch": 0.7624, + "grad_norm": 1.524219448602819, + "k1_kl": 0.34765625, + "k3_kl": 0.28125, + "kimi_kl": 1.078125, + "learning_rate": 1.1879999999999999e-07, + "loss": 0.0112, + "ppl": 0.0390625, + "reward": 0.9935970306396484, + "reward_std": 0.0026190944481641054, + "rewards/perpo_ocr_edit_distance_reward": 0.9935970902442932, + "step": 3812, + "temperature": 0.9 + }, + { + "advantages": -0.0001247780746780336, + "completion_length": 1928.0, + "delta_ref_entropy_loss": 0.033203125, + "delta_ref_ppl": -0.06494140625, + "entropy_loss": -0.09912109375, + "epoch": 0.7626, + "grad_norm": 2.393249651107677, + "k1_kl": 0.06494140625, + "k3_kl": 0.046142578125, + "kimi_kl": 0.1171875, + "learning_rate": 1.187e-07, + "loss": 0.002, + "ppl": 0.047119140625, + "reward": 0.9678983092308044, + "reward_std": 0.000855375430546701, + "rewards/perpo_ocr_edit_distance_reward": 0.967898428440094, + "step": 3813, + "temperature": 0.9 + }, + { + "advantages": -8.94069742685133e-08, + "completion_length": 43.0, + "delta_ref_entropy_loss": -0.255859375, + "delta_ref_ppl": -1.0625, + "entropy_loss": -1.2421875, + "epoch": 0.7628, + "grad_norm": 10.37709464246535, + "k1_kl": 1.0625, + "k3_kl": 0.875, + "kimi_kl": 3.359375, + "learning_rate": 1.1859999999999999e-07, + "loss": 0.035, + "ppl": 0.57421875, + "reward": 0.17241378128528595, + "reward_std": 0.06196511536836624, + "rewards/perpo_ocr_edit_distance_reward": 0.17241379618644714, + "step": 3814, + "temperature": 0.9 + }, + { + "advantages": -3.6554680264089257e-05, + "completion_length": 513.0, + "delta_ref_entropy_loss": 0.03271484375, + "delta_ref_ppl": -0.059326171875, + "entropy_loss": -0.0380859375, + "epoch": 0.763, + "grad_norm": 0.6627132739130759, + "k1_kl": 0.05908203125, + "k3_kl": 0.043212890625, + "kimi_kl": 0.126953125, + "learning_rate": 1.1849999999999998e-07, + "loss": 0.0018, + "ppl": 0.01190185546875, + "reward": 0.9973910450935364, + "reward_std": 0.0005991885554976761, + "rewards/perpo_ocr_edit_distance_reward": 0.9973910450935364, + "step": 3815, + "temperature": 0.9 + }, + { + "advantages": 8.617129424237646e-06, + "completion_length": 613.0, + "delta_ref_entropy_loss": 0.080078125, + "delta_ref_ppl": -0.107421875, + "entropy_loss": -0.1494140625, + "epoch": 0.7632, + "grad_norm": 1.2794039892838833, + "k1_kl": 0.107421875, + "k3_kl": 0.06494140625, + "kimi_kl": 0.140625, + "learning_rate": 1.184e-07, + "loss": 0.0026, + "ppl": 0.07470703125, + "reward": 0.9125526547431946, + "reward_std": 0.0018777366494759917, + "rewards/perpo_ocr_edit_distance_reward": 0.9125527143478394, + "step": 3816, + "temperature": 0.9 + }, + { + "advantages": -7.408006240439136e-06, + "completion_length": 754.0, + "delta_ref_entropy_loss": 0.0458984375, + "delta_ref_ppl": -0.0859375, + "entropy_loss": -0.12353515625, + "epoch": 0.7634, + "grad_norm": 1.1852583295569568, + "k1_kl": 0.08642578125, + "k3_kl": 0.0625, + "kimi_kl": 0.171875, + "learning_rate": 1.183e-07, + "loss": 0.0025, + "ppl": 0.059814453125, + "reward": 0.9635467529296875, + "reward_std": 0.002205727156251669, + "rewards/perpo_ocr_edit_distance_reward": 0.9635468125343323, + "step": 3817, + "temperature": 0.9 + }, + { + "advantages": -0.0001334973785560578, + "completion_length": 1249.0, + "delta_ref_entropy_loss": 0.03369140625, + "delta_ref_ppl": -0.037353515625, + "entropy_loss": -0.044189453125, + "epoch": 0.7636, + "grad_norm": 0.5105220802362367, + "k1_kl": 0.037353515625, + "k3_kl": 0.0228271484375, + "kimi_kl": 0.04931640625, + "learning_rate": 1.182e-07, + "loss": 0.001, + "ppl": 0.017333984375, + "reward": 0.9833583831787109, + "reward_std": 0.0004740200820378959, + "rewards/perpo_ocr_edit_distance_reward": 0.9833584427833557, + "step": 3818, + "temperature": 0.9 + }, + { + "advantages": -6.756612492608838e-06, + "completion_length": 111.0, + "delta_ref_entropy_loss": 0.057373046875, + "delta_ref_ppl": -0.2353515625, + "entropy_loss": -0.134765625, + "epoch": 0.7638, + "grad_norm": 2.184013138747771, + "k1_kl": 0.2353515625, + "k3_kl": 0.1884765625, + "kimi_kl": 0.65234375, + "learning_rate": 1.1809999999999999e-07, + "loss": 0.0075, + "ppl": 0.059814453125, + "reward": 0.9587416052818298, + "reward_std": 0.00620305398479104, + "rewards/perpo_ocr_edit_distance_reward": 0.9587416648864746, + "step": 3819, + "temperature": 0.9 + }, + { + "advantages": -1.4305115882962127e-06, + "completion_length": 129.0, + "delta_ref_entropy_loss": -0.1552734375, + "delta_ref_ppl": -0.333984375, + "entropy_loss": -0.458984375, + "epoch": 0.764, + "grad_norm": 3.3604464246636647, + "k1_kl": 0.333984375, + "k3_kl": 0.302734375, + "kimi_kl": 1.0625, + "learning_rate": 1.1799999999999998e-07, + "loss": 0.0121, + "ppl": 0.2001953125, + "reward": 0.39680102467536926, + "reward_std": 0.01768164150416851, + "rewards/perpo_ocr_edit_distance_reward": 0.39680108428001404, + "step": 3820, + "temperature": 0.9 + }, + { + "advantages": -9.877342108666198e-07, + "completion_length": 622.0, + "delta_ref_entropy_loss": 0.0048828125, + "delta_ref_ppl": -0.045166015625, + "entropy_loss": -0.1337890625, + "epoch": 0.7642, + "grad_norm": 2.032031721259013, + "k1_kl": 0.04541015625, + "k3_kl": 0.033935546875, + "kimi_kl": 0.0791015625, + "learning_rate": 1.179e-07, + "loss": 0.0014, + "ppl": 0.04736328125, + "reward": 0.9415451288223267, + "reward_std": 0.060283076018095016, + "rewards/perpo_ocr_edit_distance_reward": 0.9415451884269714, + "step": 3821, + "temperature": 0.9 + }, + { + "advantages": -3.107956558778824e-07, + "completion_length": 101.0, + "delta_ref_entropy_loss": -0.345703125, + "delta_ref_ppl": -0.51171875, + "entropy_loss": -0.90625, + "epoch": 0.7644, + "grad_norm": 4.521280182156832, + "k1_kl": 0.51171875, + "k3_kl": 0.47265625, + "kimi_kl": 1.7578125, + "learning_rate": 1.178e-07, + "loss": 0.0189, + "ppl": 0.357421875, + "reward": 0.1448040008544922, + "reward_std": 0.044342897832393646, + "rewards/perpo_ocr_edit_distance_reward": 0.14480403065681458, + "step": 3822, + "temperature": 0.9 + }, + { + "advantages": 1.7370497289448394e-06, + "completion_length": 69.0, + "delta_ref_entropy_loss": 0.06591796875, + "delta_ref_ppl": -0.51953125, + "entropy_loss": -0.283203125, + "epoch": 0.7646, + "grad_norm": 3.5196481468482346, + "k1_kl": 0.5234375, + "k3_kl": 0.435546875, + "kimi_kl": 1.8125, + "learning_rate": 1.1769999999999999e-07, + "loss": 0.0174, + "ppl": 0.10546875, + "reward": 0.96211177110672, + "reward_std": 0.009629002772271633, + "rewards/perpo_ocr_edit_distance_reward": 0.96211177110672, + "step": 3823, + "temperature": 0.9 + }, + { + "advantages": -0.00011220149463042617, + "completion_length": 494.0, + "delta_ref_entropy_loss": 0.04443359375, + "delta_ref_ppl": -0.07958984375, + "entropy_loss": -0.041748046875, + "epoch": 0.7648, + "grad_norm": 0.5123013331592515, + "k1_kl": 0.07958984375, + "k3_kl": 0.048828125, + "kimi_kl": 0.142578125, + "learning_rate": 1.176e-07, + "loss": 0.0021, + "ppl": 0.0166015625, + "reward": 0.9952276945114136, + "reward_std": 0.0005830413429066539, + "rewards/perpo_ocr_edit_distance_reward": 0.9952278137207031, + "step": 3824, + "temperature": 0.9 + }, + { + "advantages": -4.972730494046118e-06, + "completion_length": 936.0, + "delta_ref_entropy_loss": 0.052001953125, + "delta_ref_ppl": -0.068359375, + "entropy_loss": -0.146484375, + "epoch": 0.765, + "grad_norm": 1.5885024420638303, + "k1_kl": 0.06884765625, + "k3_kl": 0.04443359375, + "kimi_kl": 0.1015625, + "learning_rate": 1.1749999999999999e-07, + "loss": 0.0018, + "ppl": 0.0703125, + "reward": 0.9340053796768188, + "reward_std": 0.0016162117244675756, + "rewards/perpo_ocr_edit_distance_reward": 0.9340054988861084, + "step": 3825, + "temperature": 0.9 + }, + { + "advantages": -7.680484486627392e-06, + "completion_length": 82.0, + "delta_ref_entropy_loss": 0.03662109375, + "delta_ref_ppl": -0.42578125, + "entropy_loss": -0.1552734375, + "epoch": 0.7652, + "grad_norm": 2.4617059092924563, + "k1_kl": 0.42578125, + "k3_kl": 0.345703125, + "kimi_kl": 1.4140625, + "learning_rate": 1.1739999999999999e-07, + "loss": 0.0138, + "ppl": 0.052734375, + "reward": 0.9853842258453369, + "reward_std": 0.0032208047341555357, + "rewards/perpo_ocr_edit_distance_reward": 0.9853842854499817, + "step": 3826, + "temperature": 0.9 + }, + { + "advantages": -0.0001526985870441422, + "completion_length": 711.0, + "delta_ref_entropy_loss": 0.0306396484375, + "delta_ref_ppl": -0.048583984375, + "entropy_loss": -0.029296875, + "epoch": 0.7654, + "grad_norm": 0.2587556253160212, + "k1_kl": 0.04833984375, + "k3_kl": 0.0247802734375, + "kimi_kl": 0.061279296875, + "learning_rate": 1.173e-07, + "loss": 0.0011, + "ppl": 0.0089111328125, + "reward": 0.9992333054542542, + "reward_std": 0.00023457272618543357, + "rewards/perpo_ocr_edit_distance_reward": 0.9992334246635437, + "step": 3827, + "temperature": 0.9 + }, + { + "advantages": -9.567397501086816e-05, + "completion_length": 949.0, + "delta_ref_entropy_loss": 0.034912109375, + "delta_ref_ppl": -0.0673828125, + "entropy_loss": -0.05322265625, + "epoch": 0.7656, + "grad_norm": 0.56398882859453, + "k1_kl": 0.0673828125, + "k3_kl": 0.0458984375, + "kimi_kl": 0.125, + "learning_rate": 1.1719999999999999e-07, + "loss": 0.0019, + "ppl": 0.0228271484375, + "reward": 0.9749951362609863, + "reward_std": 0.0008791542495600879, + "rewards/perpo_ocr_edit_distance_reward": 0.9749952554702759, + "step": 3828, + "temperature": 0.9 + }, + { + "advantages": -4.829679528484121e-05, + "completion_length": 443.0, + "delta_ref_entropy_loss": 0.03857421875, + "delta_ref_ppl": -0.0498046875, + "entropy_loss": -0.04150390625, + "epoch": 0.7658, + "grad_norm": 0.6782967187170278, + "k1_kl": 0.0498046875, + "k3_kl": 0.027587890625, + "kimi_kl": 0.06640625, + "learning_rate": 1.171e-07, + "loss": 0.0012, + "ppl": 0.0146484375, + "reward": 0.9985751509666443, + "reward_std": 0.0006053149700164795, + "rewards/perpo_ocr_edit_distance_reward": 0.9985752701759338, + "step": 3829, + "temperature": 0.9 + }, + { + "advantages": -0.00012227467959746718, + "completion_length": 752.0, + "delta_ref_entropy_loss": 0.036865234375, + "delta_ref_ppl": -0.05126953125, + "entropy_loss": -0.0751953125, + "epoch": 0.766, + "grad_norm": 0.5038937140107405, + "k1_kl": 0.051025390625, + "k3_kl": 0.03173828125, + "kimi_kl": 0.07958984375, + "learning_rate": 1.17e-07, + "loss": 0.0014, + "ppl": 0.023681640625, + "reward": 0.9832537770271301, + "reward_std": 0.0003874439571518451, + "rewards/perpo_ocr_edit_distance_reward": 0.9832538366317749, + "step": 3830, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 1058.0, + "delta_ref_entropy_loss": 0.0322265625, + "delta_ref_ppl": -0.0478515625, + "entropy_loss": -0.046875, + "epoch": 0.7662, + "grad_norm": 0.6245183480593602, + "k1_kl": 0.0478515625, + "k3_kl": 0.0262451171875, + "kimi_kl": 0.05908203125, + "learning_rate": 1.169e-07, + "loss": 0.001, + "ppl": 0.017822265625, + "reward": 0.9980281591415405, + "reward_std": 0.0007452858262695372, + "rewards/perpo_ocr_edit_distance_reward": 0.9980281591415405, + "step": 3831, + "temperature": 0.9 + }, + { + "advantages": -2.1730151274823584e-05, + "completion_length": 437.0, + "delta_ref_entropy_loss": 0.056396484375, + "delta_ref_ppl": -0.1025390625, + "entropy_loss": -0.041748046875, + "epoch": 0.7664, + "grad_norm": 0.24626250432523816, + "k1_kl": 0.1025390625, + "k3_kl": 0.06591796875, + "kimi_kl": 0.19921875, + "learning_rate": 1.168e-07, + "loss": 0.0027, + "ppl": 0.01385498046875, + "reward": 0.9802831411361694, + "reward_std": 0.0002918574318755418, + "rewards/perpo_ocr_edit_distance_reward": 0.9802831411361694, + "step": 3832, + "temperature": 0.9 + }, + { + "advantages": -3.899846888089087e-06, + "completion_length": 545.0, + "delta_ref_entropy_loss": 0.06494140625, + "delta_ref_ppl": -0.1201171875, + "entropy_loss": -0.330078125, + "epoch": 0.7666, + "grad_norm": 2.4616805926057443, + "k1_kl": 0.1201171875, + "k3_kl": 0.07666015625, + "kimi_kl": 0.1533203125, + "learning_rate": 1.1669999999999999e-07, + "loss": 0.0031, + "ppl": 0.1591796875, + "reward": 0.7901641726493835, + "reward_std": 0.0042771026492118835, + "rewards/perpo_ocr_edit_distance_reward": 0.7901641726493835, + "step": 3833, + "temperature": 0.9 + }, + { + "advantages": -0.00013945784303359687, + "completion_length": 608.0, + "delta_ref_entropy_loss": 0.03515625, + "delta_ref_ppl": -0.053466796875, + "entropy_loss": -0.045654296875, + "epoch": 0.7668, + "grad_norm": 0.5679554450196356, + "k1_kl": 0.053466796875, + "k3_kl": 0.036865234375, + "kimi_kl": 0.1279296875, + "learning_rate": 1.1659999999999999e-07, + "loss": 0.0016, + "ppl": 0.015380859375, + "reward": 0.9937707185745239, + "reward_std": 0.000388451328035444, + "rewards/perpo_ocr_edit_distance_reward": 0.9937708377838135, + "step": 3834, + "temperature": 0.9 + }, + { + "advantages": -1.4032637409400195e-05, + "completion_length": 728.0, + "delta_ref_entropy_loss": 0.062255859375, + "delta_ref_ppl": -0.10888671875, + "entropy_loss": -0.181640625, + "epoch": 0.767, + "grad_norm": 2.663770136797217, + "k1_kl": 0.10888671875, + "k3_kl": 0.064453125, + "kimi_kl": 0.1748046875, + "learning_rate": 1.165e-07, + "loss": 0.0026, + "ppl": 0.08056640625, + "reward": 0.9263731241226196, + "reward_std": 0.0023265972267836332, + "rewards/perpo_ocr_edit_distance_reward": 0.9263731837272644, + "step": 3835, + "temperature": 0.9 + }, + { + "advantages": -6.261042290134355e-05, + "completion_length": 955.0, + "delta_ref_entropy_loss": 0.014892578125, + "delta_ref_ppl": -0.053955078125, + "entropy_loss": -0.0537109375, + "epoch": 0.7672, + "grad_norm": 0.6000284931847766, + "k1_kl": 0.0537109375, + "k3_kl": 0.037841796875, + "kimi_kl": 0.10986328125, + "learning_rate": 1.164e-07, + "loss": 0.0016, + "ppl": 0.021484375, + "reward": 0.991498589515686, + "reward_std": 0.0008518336690030992, + "rewards/perpo_ocr_edit_distance_reward": 0.9914986491203308, + "step": 3836, + "temperature": 0.9 + }, + { + "advantages": -0.00017639569705352187, + "completion_length": 474.0, + "delta_ref_entropy_loss": 0.022705078125, + "delta_ref_ppl": -0.0693359375, + "entropy_loss": -0.03759765625, + "epoch": 0.7674, + "grad_norm": 0.4038147859804289, + "k1_kl": 0.0693359375, + "k3_kl": 0.0498046875, + "kimi_kl": 0.1875, + "learning_rate": 1.1629999999999999e-07, + "loss": 0.0022, + "ppl": 0.01129150390625, + "reward": 0.9923223257064819, + "reward_std": 0.0005758196348324418, + "rewards/perpo_ocr_edit_distance_reward": 0.9923224449157715, + "step": 3837, + "temperature": 0.9 + }, + { + "advantages": -7.18661749488092e-06, + "completion_length": 988.0, + "delta_ref_entropy_loss": 0.04345703125, + "delta_ref_ppl": -0.0556640625, + "entropy_loss": -0.10205078125, + "epoch": 0.7676, + "grad_norm": 2.995591275831814, + "k1_kl": 0.0556640625, + "k3_kl": 0.033447265625, + "kimi_kl": 0.083984375, + "learning_rate": 1.162e-07, + "loss": 0.0013, + "ppl": 0.04345703125, + "reward": 0.9642888903617859, + "reward_std": 0.007010527886450291, + "rewards/perpo_ocr_edit_distance_reward": 0.9642889499664307, + "step": 3838, + "temperature": 0.9 + }, + { + "advantages": -1.0290316822647583e-05, + "completion_length": 448.0, + "delta_ref_entropy_loss": 0.095703125, + "delta_ref_ppl": -0.10107421875, + "entropy_loss": -0.1923828125, + "epoch": 0.7678, + "grad_norm": 6.929575550256634, + "k1_kl": 0.10107421875, + "k3_kl": 0.06005859375, + "kimi_kl": 0.134765625, + "learning_rate": 1.1609999999999999e-07, + "loss": 0.0024, + "ppl": 0.09326171875, + "reward": 0.7041000127792358, + "reward_std": 0.002381921513006091, + "rewards/perpo_ocr_edit_distance_reward": 0.7041000723838806, + "step": 3839, + "temperature": 0.9 + }, + { + "advantages": -9.460109140491113e-05, + "completion_length": 839.0, + "delta_ref_entropy_loss": 0.0284423828125, + "delta_ref_ppl": -0.042724609375, + "entropy_loss": -0.0361328125, + "epoch": 0.768, + "grad_norm": 0.2202612575110514, + "k1_kl": 0.042724609375, + "k3_kl": 0.025146484375, + "kimi_kl": 0.07373046875, + "learning_rate": 1.16e-07, + "loss": 0.0011, + "ppl": 0.00994873046875, + "reward": 0.9958898425102234, + "reward_std": 0.0003501552273519337, + "rewards/perpo_ocr_edit_distance_reward": 0.9958899021148682, + "step": 3840, + "temperature": 0.9 + }, + { + "advantages": -4.70663835585583e-05, + "completion_length": 494.0, + "delta_ref_entropy_loss": 0.044921875, + "delta_ref_ppl": -0.08349609375, + "entropy_loss": -0.03955078125, + "epoch": 0.7682, + "grad_norm": 0.32696214268661067, + "k1_kl": 0.08349609375, + "k3_kl": 0.055908203125, + "kimi_kl": 0.19140625, + "learning_rate": 1.159e-07, + "loss": 0.0023, + "ppl": 0.01361083984375, + "reward": 0.9978696703910828, + "reward_std": 0.0004427175153978169, + "rewards/perpo_ocr_edit_distance_reward": 0.9978697299957275, + "step": 3841, + "temperature": 0.9 + }, + { + "advantages": 8.514949634275126e-09, + "completion_length": 257.0, + "delta_ref_entropy_loss": -0.291015625, + "delta_ref_ppl": -0.1748046875, + "entropy_loss": -1.015625, + "epoch": 0.7684, + "grad_norm": 6.873651617637291, + "k1_kl": 0.173828125, + "k3_kl": 0.1953125, + "kimi_kl": 0.58984375, + "learning_rate": 1.1579999999999999e-07, + "loss": 0.0078, + "ppl": 0.44140625, + "reward": 0.45661959052085876, + "reward_std": 0.04519391432404518, + "rewards/perpo_ocr_edit_distance_reward": 0.45661962032318115, + "step": 3842, + "temperature": 0.9 + }, + { + "advantages": -9.931837121257558e-05, + "completion_length": 1618.0, + "delta_ref_entropy_loss": 0.034912109375, + "delta_ref_ppl": -0.05029296875, + "entropy_loss": -0.08447265625, + "epoch": 0.7686, + "grad_norm": 0.8623644803338782, + "k1_kl": 0.05029296875, + "k3_kl": 0.033447265625, + "kimi_kl": 0.06591796875, + "learning_rate": 1.157e-07, + "loss": 0.0014, + "ppl": 0.0361328125, + "reward": 0.9900620579719543, + "reward_std": 0.0010999308433383703, + "rewards/perpo_ocr_edit_distance_reward": 0.9900622367858887, + "step": 3843, + "temperature": 0.9 + }, + { + "advantages": -8.753368092584424e-06, + "completion_length": 829.0, + "delta_ref_entropy_loss": 0.00775146484375, + "delta_ref_ppl": -0.02783203125, + "entropy_loss": -0.0301513671875, + "epoch": 0.7688, + "grad_norm": 0.43756335659887957, + "k1_kl": 0.02783203125, + "k3_kl": 0.0198974609375, + "kimi_kl": 0.05615234375, + "learning_rate": 1.1559999999999999e-07, + "loss": 0.0008, + "ppl": 0.00958251953125, + "reward": 0.9879471659660339, + "reward_std": 0.0037740792613476515, + "rewards/perpo_ocr_edit_distance_reward": 0.9879472255706787, + "step": 3844, + "temperature": 0.9 + }, + { + "advantages": -1.616988993191626e-05, + "completion_length": 861.0, + "delta_ref_entropy_loss": 0.01495361328125, + "delta_ref_ppl": -0.032470703125, + "entropy_loss": -0.0634765625, + "epoch": 0.769, + "grad_norm": 0.7286475022968624, + "k1_kl": 0.032470703125, + "k3_kl": 0.0250244140625, + "kimi_kl": 0.0615234375, + "learning_rate": 1.155e-07, + "loss": 0.001, + "ppl": 0.026123046875, + "reward": 0.9969297051429749, + "reward_std": 0.0020049705635756254, + "rewards/perpo_ocr_edit_distance_reward": 0.9969297647476196, + "step": 3845, + "temperature": 0.9 + }, + { + "advantages": -0.0001030904968502, + "completion_length": 726.0, + "delta_ref_entropy_loss": 0.034912109375, + "delta_ref_ppl": -0.057861328125, + "entropy_loss": -0.038818359375, + "epoch": 0.7692, + "grad_norm": 0.46881034313565306, + "k1_kl": 0.0576171875, + "k3_kl": 0.0361328125, + "kimi_kl": 0.10595703125, + "learning_rate": 1.154e-07, + "loss": 0.0015, + "ppl": 0.01324462890625, + "reward": 0.9957848191261292, + "reward_std": 0.0004781893512699753, + "rewards/perpo_ocr_edit_distance_reward": 0.9957849383354187, + "step": 3846, + "temperature": 0.9 + }, + { + "advantages": -2.55448497910038e-07, + "completion_length": 1226.0, + "delta_ref_entropy_loss": 0.0252685546875, + "delta_ref_ppl": -0.05322265625, + "entropy_loss": -0.259765625, + "epoch": 0.7694, + "grad_norm": 3.3552899393353757, + "k1_kl": 0.052978515625, + "k3_kl": 0.03857421875, + "kimi_kl": 0.0869140625, + "learning_rate": 1.153e-07, + "loss": 0.0015, + "ppl": 0.1357421875, + "reward": 0.9143692851066589, + "reward_std": 0.1724671721458435, + "rewards/perpo_ocr_edit_distance_reward": 0.9143693447113037, + "step": 3847, + "temperature": 0.9 + }, + { + "advantages": -0.00022291287314146757, + "completion_length": 760.0, + "delta_ref_entropy_loss": 0.030029296875, + "delta_ref_ppl": -0.034912109375, + "entropy_loss": -0.027587890625, + "epoch": 0.7696, + "grad_norm": 0.22895609081734916, + "k1_kl": 0.034912109375, + "k3_kl": 0.0169677734375, + "kimi_kl": 0.0380859375, + "learning_rate": 1.1519999999999999e-07, + "loss": 0.0009, + "ppl": 0.006011962890625, + "reward": 0.9972376823425293, + "reward_std": 0.00020559625409077853, + "rewards/perpo_ocr_edit_distance_reward": 0.9972377419471741, + "step": 3848, + "temperature": 0.9 + }, + { + "advantages": -0.00015607051318511367, + "completion_length": 853.0, + "delta_ref_entropy_loss": 0.050537109375, + "delta_ref_ppl": -0.06298828125, + "entropy_loss": -0.076171875, + "epoch": 0.7698, + "grad_norm": 0.5431860755927542, + "k1_kl": 0.06298828125, + "k3_kl": 0.03466796875, + "kimi_kl": 0.08447265625, + "learning_rate": 1.1509999999999999e-07, + "loss": 0.0015, + "ppl": 0.0283203125, + "reward": 0.9275336265563965, + "reward_std": 0.0005547640612348914, + "rewards/perpo_ocr_edit_distance_reward": 0.927533745765686, + "step": 3849, + "temperature": 0.9 + }, + { + "advantages": -5.211149073147681e-06, + "completion_length": 71.0, + "delta_ref_entropy_loss": 0.046875, + "delta_ref_ppl": -0.59375, + "entropy_loss": -0.322265625, + "epoch": 0.77, + "grad_norm": 6.593687033035483, + "k1_kl": 0.59375, + "k3_kl": 0.478515625, + "kimi_kl": 2.15625, + "learning_rate": 1.15e-07, + "loss": 0.0191, + "ppl": 0.1171875, + "reward": 0.9740034341812134, + "reward_std": 0.00967330764979124, + "rewards/perpo_ocr_edit_distance_reward": 0.9740034937858582, + "step": 3850, + "temperature": 0.9 + }, + { + "advantages": -1.456056361348601e-05, + "completion_length": 806.0, + "delta_ref_entropy_loss": 0.022216796875, + "delta_ref_ppl": -0.03173828125, + "entropy_loss": -0.041748046875, + "epoch": 0.7702, + "grad_norm": 0.43004731032558174, + "k1_kl": 0.03173828125, + "k3_kl": 0.0184326171875, + "kimi_kl": 0.04931640625, + "learning_rate": 1.149e-07, + "loss": 0.0008, + "ppl": 0.01507568359375, + "reward": 0.9870799779891968, + "reward_std": 0.00107066601049155, + "rewards/perpo_ocr_edit_distance_reward": 0.9870799779891968, + "step": 3851, + "temperature": 0.9 + }, + { + "advantages": 2.0282610421418212e-05, + "completion_length": 979.0, + "delta_ref_entropy_loss": 0.0303955078125, + "delta_ref_ppl": -0.05029296875, + "entropy_loss": -0.038818359375, + "epoch": 0.7704, + "grad_norm": 0.487747258767975, + "k1_kl": 0.050048828125, + "k3_kl": 0.029541015625, + "kimi_kl": 0.0791015625, + "learning_rate": 1.148e-07, + "loss": 0.0012, + "ppl": 0.01495361328125, + "reward": 0.994491457939148, + "reward_std": 0.0011594591196626425, + "rewards/perpo_ocr_edit_distance_reward": 0.994491457939148, + "step": 3852, + "temperature": 0.9 + }, + { + "advantages": -2.919776306953281e-05, + "completion_length": 348.0, + "delta_ref_entropy_loss": 0.08544921875, + "delta_ref_ppl": -0.13671875, + "entropy_loss": -0.1943359375, + "epoch": 0.7706, + "grad_norm": 1.69999226687736, + "k1_kl": 0.13671875, + "k3_kl": 0.08984375, + "kimi_kl": 0.2373046875, + "learning_rate": 1.1469999999999999e-07, + "loss": 0.0036, + "ppl": 0.10498046875, + "reward": 0.9522457122802734, + "reward_std": 0.002233628649264574, + "rewards/perpo_ocr_edit_distance_reward": 0.952245831489563, + "step": 3853, + "temperature": 0.9 + }, + { + "advantages": -7.717098924331367e-05, + "completion_length": 496.0, + "delta_ref_entropy_loss": 0.06005859375, + "delta_ref_ppl": -0.0634765625, + "entropy_loss": -0.10595703125, + "epoch": 0.7708, + "grad_norm": 1.3856469413485302, + "k1_kl": 0.0634765625, + "k3_kl": 0.04443359375, + "kimi_kl": 0.08544921875, + "learning_rate": 1.146e-07, + "loss": 0.0019, + "ppl": 0.0478515625, + "reward": 0.9682749509811401, + "reward_std": 0.0007827031658962369, + "rewards/perpo_ocr_edit_distance_reward": 0.9682750701904297, + "step": 3854, + "temperature": 0.9 + }, + { + "advantages": -3.122432099189609e-05, + "completion_length": 527.0, + "delta_ref_entropy_loss": 0.0458984375, + "delta_ref_ppl": -0.08740234375, + "entropy_loss": -0.07763671875, + "epoch": 0.771, + "grad_norm": 0.8362252516772893, + "k1_kl": 0.08740234375, + "k3_kl": 0.06201171875, + "kimi_kl": 0.2197265625, + "learning_rate": 1.145e-07, + "loss": 0.0025, + "ppl": 0.03271484375, + "reward": 0.9817948341369629, + "reward_std": 0.0020809725392609835, + "rewards/perpo_ocr_edit_distance_reward": 0.9817948937416077, + "step": 3855, + "temperature": 0.9 + }, + { + "advantages": -6.7455432144925e-05, + "completion_length": 778.0, + "delta_ref_entropy_loss": 0.0291748046875, + "delta_ref_ppl": -0.03955078125, + "entropy_loss": -0.047119140625, + "epoch": 0.7712, + "grad_norm": 0.7278314638223079, + "k1_kl": 0.03955078125, + "k3_kl": 0.0216064453125, + "kimi_kl": 0.046142578125, + "learning_rate": 1.1439999999999999e-07, + "loss": 0.0009, + "ppl": 0.01531982421875, + "reward": 0.9993211030960083, + "reward_std": 0.001162125961855054, + "rewards/perpo_ocr_edit_distance_reward": 0.9993211627006531, + "step": 3856, + "temperature": 0.9 + }, + { + "advantages": -6.444114114856347e-05, + "completion_length": 714.0, + "delta_ref_entropy_loss": 0.028076171875, + "delta_ref_ppl": -0.06005859375, + "entropy_loss": -0.07958984375, + "epoch": 0.7714, + "grad_norm": 0.6263194593225075, + "k1_kl": 0.059814453125, + "k3_kl": 0.031982421875, + "kimi_kl": 0.0673828125, + "learning_rate": 1.143e-07, + "loss": 0.0013, + "ppl": 0.03466796875, + "reward": 0.7530273199081421, + "reward_std": 0.0005605060141533613, + "rewards/perpo_ocr_edit_distance_reward": 0.7530273795127869, + "step": 3857, + "temperature": 0.9 + }, + { + "advantages": -6.553066486958414e-05, + "completion_length": 329.0, + "delta_ref_entropy_loss": 0.060791015625, + "delta_ref_ppl": -0.08837890625, + "entropy_loss": -0.08203125, + "epoch": 0.7716, + "grad_norm": 0.6396194616367459, + "k1_kl": 0.08837890625, + "k3_kl": 0.0537109375, + "kimi_kl": 0.154296875, + "learning_rate": 1.1419999999999999e-07, + "loss": 0.0022, + "ppl": 0.0267333984375, + "reward": 0.9975590109825134, + "reward_std": 0.0009395500528626144, + "rewards/perpo_ocr_edit_distance_reward": 0.9975590705871582, + "step": 3858, + "temperature": 0.9 + }, + { + "advantages": -4.764965706272051e-05, + "completion_length": 1068.0, + "delta_ref_entropy_loss": 0.01409912109375, + "delta_ref_ppl": -0.026123046875, + "entropy_loss": -0.033447265625, + "epoch": 0.7718, + "grad_norm": 0.43975197543726324, + "k1_kl": 0.026123046875, + "k3_kl": 0.0155029296875, + "kimi_kl": 0.042236328125, + "learning_rate": 1.1409999999999998e-07, + "loss": 0.0007, + "ppl": 0.0130615234375, + "reward": 0.9985430836677551, + "reward_std": 0.0007931401487439871, + "rewards/perpo_ocr_edit_distance_reward": 0.9985430836677551, + "step": 3859, + "temperature": 0.9 + }, + { + "advantages": -0.00011906454165000468, + "completion_length": 952.0, + "delta_ref_entropy_loss": 0.0274658203125, + "delta_ref_ppl": -0.033203125, + "entropy_loss": -0.03662109375, + "epoch": 0.772, + "grad_norm": 0.36337241654562535, + "k1_kl": 0.033203125, + "k3_kl": 0.018798828125, + "kimi_kl": 0.044189453125, + "learning_rate": 1.14e-07, + "loss": 0.0009, + "ppl": 0.01348876953125, + "reward": 0.9976670742034912, + "reward_std": 0.00040062516927719116, + "rewards/perpo_ocr_edit_distance_reward": 0.9976671934127808, + "step": 3860, + "temperature": 0.9 + }, + { + "advantages": -8.595841791247949e-05, + "completion_length": 969.0, + "delta_ref_entropy_loss": 0.015869140625, + "delta_ref_ppl": -0.046875, + "entropy_loss": -0.052490234375, + "epoch": 0.7722, + "grad_norm": 0.44192922455206857, + "k1_kl": 0.046875, + "k3_kl": 0.034912109375, + "kimi_kl": 0.08935546875, + "learning_rate": 1.139e-07, + "loss": 0.0015, + "ppl": 0.0216064453125, + "reward": 0.9936297535896301, + "reward_std": 0.0010886668460443616, + "rewards/perpo_ocr_edit_distance_reward": 0.9936299324035645, + "step": 3861, + "temperature": 0.9 + }, + { + "advantages": -2.0384790332172997e-05, + "completion_length": 435.0, + "delta_ref_entropy_loss": 0.056884765625, + "delta_ref_ppl": -0.1318359375, + "entropy_loss": -0.2158203125, + "epoch": 0.7724, + "grad_norm": 1.5404475432759859, + "k1_kl": 0.130859375, + "k3_kl": 0.0947265625, + "kimi_kl": 0.244140625, + "learning_rate": 1.1379999999999999e-07, + "loss": 0.0038, + "ppl": 0.091796875, + "reward": 0.855226457118988, + "reward_std": 0.0036561130546033382, + "rewards/perpo_ocr_edit_distance_reward": 0.8552265763282776, + "step": 3862, + "temperature": 0.9 + }, + { + "advantages": -1.1920929864572827e-05, + "completion_length": 53.0, + "delta_ref_entropy_loss": 0.09912109375, + "delta_ref_ppl": -0.59765625, + "entropy_loss": -0.1875, + "epoch": 0.7726, + "grad_norm": 2.285567901985545, + "k1_kl": 0.59765625, + "k3_kl": 0.48046875, + "kimi_kl": 2.1875, + "learning_rate": 1.1369999999999999e-07, + "loss": 0.0192, + "ppl": 0.06689453125, + "reward": 0.7910635471343994, + "reward_std": 0.0034660568926483393, + "rewards/perpo_ocr_edit_distance_reward": 0.7910636067390442, + "step": 3863, + "temperature": 0.9 + }, + { + "advantages": -4.3341092350601684e-06, + "completion_length": 387.0, + "delta_ref_entropy_loss": 0.1328125, + "delta_ref_ppl": -0.11376953125, + "entropy_loss": -0.263671875, + "epoch": 0.7728, + "grad_norm": 1.3484308801812723, + "k1_kl": 0.11328125, + "k3_kl": 0.06396484375, + "kimi_kl": 0.13671875, + "learning_rate": 1.136e-07, + "loss": 0.0026, + "ppl": 0.1142578125, + "reward": 0.8427696824073792, + "reward_std": 0.001870593405328691, + "rewards/perpo_ocr_edit_distance_reward": 0.8427697420120239, + "step": 3864, + "temperature": 0.9 + }, + { + "advantages": -3.075599670410156e-05, + "completion_length": 386.0, + "delta_ref_entropy_loss": 0.064453125, + "delta_ref_ppl": -0.1201171875, + "entropy_loss": -0.05224609375, + "epoch": 0.773, + "grad_norm": 0.5373439342286301, + "k1_kl": 0.1201171875, + "k3_kl": 0.08642578125, + "kimi_kl": 0.33984375, + "learning_rate": 1.135e-07, + "loss": 0.0035, + "ppl": 0.0194091796875, + "reward": 0.9973196983337402, + "reward_std": 0.0007303054444491863, + "rewards/perpo_ocr_edit_distance_reward": 0.9973198175430298, + "step": 3865, + "temperature": 0.9 + }, + { + "advantages": 8.514949634275126e-09, + "completion_length": 472.0, + "delta_ref_entropy_loss": 0.04248046875, + "delta_ref_ppl": -0.1005859375, + "entropy_loss": -0.076171875, + "epoch": 0.7732, + "grad_norm": 0.9751702281814028, + "k1_kl": 0.1005859375, + "k3_kl": 0.06494140625, + "kimi_kl": 0.1748046875, + "learning_rate": 1.134e-07, + "loss": 0.0026, + "ppl": 0.0272216796875, + "reward": 0.9918768405914307, + "reward_std": 0.0018788925372064114, + "rewards/perpo_ocr_edit_distance_reward": 0.9918769001960754, + "step": 3866, + "temperature": 0.9 + }, + { + "advantages": -2.2309168343781494e-06, + "completion_length": 628.0, + "delta_ref_entropy_loss": 0.053955078125, + "delta_ref_ppl": -0.0771484375, + "entropy_loss": -0.045654296875, + "epoch": 0.7734, + "grad_norm": 0.8833242265554849, + "k1_kl": 0.07666015625, + "k3_kl": 0.048828125, + "kimi_kl": 0.18359375, + "learning_rate": 1.1329999999999999e-07, + "loss": 0.002, + "ppl": 0.01495361328125, + "reward": 0.7835803627967834, + "reward_std": 0.038081731647253036, + "rewards/perpo_ocr_edit_distance_reward": 0.783580482006073, + "step": 3867, + "temperature": 0.9 + }, + { + "advantages": -6.495203706435859e-05, + "completion_length": 329.0, + "delta_ref_entropy_loss": 0.049560546875, + "delta_ref_ppl": -0.115234375, + "entropy_loss": -0.047119140625, + "epoch": 0.7736, + "grad_norm": 1.1807524736800303, + "k1_kl": 0.115234375, + "k3_kl": 0.0859375, + "kimi_kl": 0.3828125, + "learning_rate": 1.132e-07, + "loss": 0.0035, + "ppl": 0.0157470703125, + "reward": 0.9335863590240479, + "reward_std": 0.001211183494888246, + "rewards/perpo_ocr_edit_distance_reward": 0.9335864186286926, + "step": 3868, + "temperature": 0.9 + }, + { + "advantages": -1.3130052138876636e-05, + "completion_length": 258.0, + "delta_ref_entropy_loss": 0.0137939453125, + "delta_ref_ppl": -0.11767578125, + "entropy_loss": -0.046875, + "epoch": 0.7738, + "grad_norm": 1.1958820022938534, + "k1_kl": 0.1171875, + "k3_kl": 0.09912109375, + "kimi_kl": 0.5390625, + "learning_rate": 1.131e-07, + "loss": 0.004, + "ppl": 0.018798828125, + "reward": 0.9956994652748108, + "reward_std": 0.0011969818733632565, + "rewards/perpo_ocr_edit_distance_reward": 0.9956995248794556, + "step": 3869, + "temperature": 0.9 + }, + { + "advantages": 7.152557941481064e-07, + "completion_length": 588.0, + "delta_ref_entropy_loss": -0.16015625, + "delta_ref_ppl": -0.09130859375, + "entropy_loss": -0.453125, + "epoch": 0.774, + "grad_norm": 3.3927455415941394, + "k1_kl": 0.0908203125, + "k3_kl": 0.09521484375, + "kimi_kl": 0.216796875, + "learning_rate": 1.1299999999999999e-07, + "loss": 0.0038, + "ppl": 0.1845703125, + "reward": 0.7760905623435974, + "reward_std": 0.024683188647031784, + "rewards/perpo_ocr_edit_distance_reward": 0.7760905623435974, + "step": 3870, + "temperature": 0.9 + }, + { + "advantages": -4.427773774295929e-07, + "completion_length": 799.0, + "delta_ref_entropy_loss": 0.02099609375, + "delta_ref_ppl": -0.08203125, + "entropy_loss": -0.0966796875, + "epoch": 0.7742, + "grad_norm": 0.9709734789637635, + "k1_kl": 0.08203125, + "k3_kl": 0.056640625, + "kimi_kl": 0.1416015625, + "learning_rate": 1.129e-07, + "loss": 0.0023, + "ppl": 0.041015625, + "reward": 0.9652283191680908, + "reward_std": 0.01917167939245701, + "rewards/perpo_ocr_edit_distance_reward": 0.9652283191680908, + "step": 3871, + "temperature": 0.9 + }, + { + "advantages": 8.514949634275126e-09, + "completion_length": 1941.0, + "delta_ref_entropy_loss": -0.03125, + "delta_ref_ppl": -0.033203125, + "entropy_loss": -0.17578125, + "epoch": 0.7744, + "grad_norm": 1.542850150451821, + "k1_kl": 0.033203125, + "k3_kl": 0.035400390625, + "kimi_kl": 0.0751953125, + "learning_rate": 1.1279999999999999e-07, + "loss": 0.0014, + "ppl": 0.08251953125, + "reward": 0.07860742509365082, + "reward_std": 0.006988765671849251, + "rewards/perpo_ocr_edit_distance_reward": 0.07860743254423141, + "step": 3872, + "temperature": 0.9 + }, + { + "advantages": -4.666192580771167e-06, + "completion_length": 901.0, + "delta_ref_entropy_loss": 0.0272216796875, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.07958984375, + "epoch": 0.7746, + "grad_norm": 1.8734603274989095, + "k1_kl": 0.06689453125, + "k3_kl": 0.0478515625, + "kimi_kl": 0.1572265625, + "learning_rate": 1.1269999999999998e-07, + "loss": 0.0019, + "ppl": 0.042236328125, + "reward": 0.986207127571106, + "reward_std": 0.0090365931391716, + "rewards/perpo_ocr_edit_distance_reward": 0.9862071871757507, + "step": 3873, + "temperature": 0.9 + }, + { + "advantages": -2.67199120571604e-05, + "completion_length": 453.0, + "delta_ref_entropy_loss": 0.0206298828125, + "delta_ref_ppl": -0.07177734375, + "entropy_loss": -0.047607421875, + "epoch": 0.7748, + "grad_norm": 0.7700634307917353, + "k1_kl": 0.072265625, + "k3_kl": 0.048095703125, + "kimi_kl": 0.1494140625, + "learning_rate": 1.126e-07, + "loss": 0.0019, + "ppl": 0.02099609375, + "reward": 0.9973666071891785, + "reward_std": 0.0018091615056619048, + "rewards/perpo_ocr_edit_distance_reward": 0.9973666667938232, + "step": 3874, + "temperature": 0.9 + }, + { + "advantages": -1.3215201761340722e-05, + "completion_length": 1370.0, + "delta_ref_entropy_loss": 0.03857421875, + "delta_ref_ppl": -0.041259765625, + "entropy_loss": -0.05419921875, + "epoch": 0.775, + "grad_norm": 3.9925482713057816, + "k1_kl": 0.041015625, + "k3_kl": 0.0238037109375, + "kimi_kl": 0.044189453125, + "learning_rate": 1.125e-07, + "loss": 0.001, + "ppl": 0.0216064453125, + "reward": 0.9851382374763489, + "reward_std": 0.0011889169691130519, + "rewards/perpo_ocr_edit_distance_reward": 0.9851382374763489, + "step": 3875, + "temperature": 0.9 + }, + { + "advantages": -6.849000055808574e-05, + "completion_length": 278.0, + "delta_ref_entropy_loss": 0.05908203125, + "delta_ref_ppl": -0.0908203125, + "entropy_loss": -0.03125, + "epoch": 0.7752, + "grad_norm": 0.4714525189870945, + "k1_kl": 0.0908203125, + "k3_kl": 0.057373046875, + "kimi_kl": 0.181640625, + "learning_rate": 1.124e-07, + "loss": 0.0024, + "ppl": 0.0107421875, + "reward": 0.9953466653823853, + "reward_std": 0.00039734828169457614, + "rewards/perpo_ocr_edit_distance_reward": 0.99534672498703, + "step": 3876, + "temperature": 0.9 + }, + { + "advantages": -3.417049447307363e-05, + "completion_length": 1279.0, + "delta_ref_entropy_loss": 0.028076171875, + "delta_ref_ppl": -0.0439453125, + "entropy_loss": -0.15234375, + "epoch": 0.7754, + "grad_norm": 23.2016533147966, + "k1_kl": 0.044189453125, + "k3_kl": 0.080078125, + "kimi_kl": 0.06884765625, + "learning_rate": 1.1229999999999999e-07, + "loss": 0.0032, + "ppl": 0.07470703125, + "reward": 0.93784499168396, + "reward_std": 0.0006473446846939623, + "rewards/perpo_ocr_edit_distance_reward": 0.9378451108932495, + "step": 3877, + "temperature": 0.9 + }, + { + "advantages": -6.443262100219727e-05, + "completion_length": 461.0, + "delta_ref_entropy_loss": 0.02783203125, + "delta_ref_ppl": -0.08154296875, + "entropy_loss": -0.03466796875, + "epoch": 0.7756, + "grad_norm": 0.5174701383089175, + "k1_kl": 0.08154296875, + "k3_kl": 0.0576171875, + "kimi_kl": 0.185546875, + "learning_rate": 1.1219999999999999e-07, + "loss": 0.0024, + "ppl": 0.01422119140625, + "reward": 0.991621732711792, + "reward_std": 0.0006928057409822941, + "rewards/perpo_ocr_edit_distance_reward": 0.9916218519210815, + "step": 3878, + "temperature": 0.9 + }, + { + "advantages": -1.4305115882962127e-06, + "completion_length": 448.0, + "delta_ref_entropy_loss": 0.056396484375, + "delta_ref_ppl": -0.1044921875, + "entropy_loss": -0.1591796875, + "epoch": 0.7758, + "grad_norm": 1.2256652530403662, + "k1_kl": 0.10498046875, + "k3_kl": 0.078125, + "kimi_kl": 0.2001953125, + "learning_rate": 1.121e-07, + "loss": 0.0031, + "ppl": 0.07421875, + "reward": 0.9162206649780273, + "reward_std": 0.0117756687104702, + "rewards/perpo_ocr_edit_distance_reward": 0.9162207245826721, + "step": 3879, + "temperature": 0.9 + }, + { + "advantages": -3.0824116947769653e-06, + "completion_length": 1172.0, + "delta_ref_entropy_loss": -0.00732421875, + "delta_ref_ppl": -0.0267333984375, + "entropy_loss": -0.0849609375, + "epoch": 0.776, + "grad_norm": 1.4626932680770417, + "k1_kl": 0.0267333984375, + "k3_kl": 0.024658203125, + "kimi_kl": 0.055419921875, + "learning_rate": 1.12e-07, + "loss": 0.001, + "ppl": 0.03564453125, + "reward": 0.8854919672012329, + "reward_std": 0.013749368488788605, + "rewards/perpo_ocr_edit_distance_reward": 0.8854920268058777, + "step": 3880, + "temperature": 0.9 + }, + { + "advantages": -1.3027873137616552e-05, + "completion_length": 1216.0, + "delta_ref_entropy_loss": 0.0162353515625, + "delta_ref_ppl": -0.0546875, + "entropy_loss": -0.2080078125, + "epoch": 0.7762, + "grad_norm": 1.3292247501936583, + "k1_kl": 0.0546875, + "k3_kl": 0.03662109375, + "kimi_kl": 0.072265625, + "learning_rate": 1.1189999999999999e-07, + "loss": 0.0015, + "ppl": 0.099609375, + "reward": 0.751753032207489, + "reward_std": 0.001858686562627554, + "rewards/perpo_ocr_edit_distance_reward": 0.7517530918121338, + "step": 3881, + "temperature": 0.9 + }, + { + "advantages": -4.402229023980908e-05, + "completion_length": 247.0, + "delta_ref_entropy_loss": 0.06640625, + "delta_ref_ppl": -0.1806640625, + "entropy_loss": -0.087890625, + "epoch": 0.7764, + "grad_norm": 1.5686870881007406, + "k1_kl": 0.1806640625, + "k3_kl": 0.1513671875, + "kimi_kl": 0.478515625, + "learning_rate": 1.118e-07, + "loss": 0.0061, + "ppl": 0.044677734375, + "reward": 0.9113731384277344, + "reward_std": 0.0022220194805413485, + "rewards/perpo_ocr_edit_distance_reward": 0.9113731980323792, + "step": 3882, + "temperature": 0.9 + }, + { + "advantages": -2.60812903434271e-05, + "completion_length": 914.0, + "delta_ref_entropy_loss": 0.0223388671875, + "delta_ref_ppl": -0.031494140625, + "entropy_loss": -0.134765625, + "epoch": 0.7766, + "grad_norm": 3.5767613775769425, + "k1_kl": 0.03173828125, + "k3_kl": 0.0211181640625, + "kimi_kl": 0.047607421875, + "learning_rate": 1.1169999999999999e-07, + "loss": 0.0009, + "ppl": 0.07177734375, + "reward": 0.9908556342124939, + "reward_std": 0.0031636925414204597, + "rewards/perpo_ocr_edit_distance_reward": 0.9908556938171387, + "step": 3883, + "temperature": 0.9 + }, + { + "advantages": 8.199896547012031e-05, + "completion_length": 1034.0, + "delta_ref_entropy_loss": 0.017578125, + "delta_ref_ppl": -0.032470703125, + "entropy_loss": -0.0291748046875, + "epoch": 0.7768, + "grad_norm": 0.2035509018093363, + "k1_kl": 0.032470703125, + "k3_kl": 0.0179443359375, + "kimi_kl": 0.043701171875, + "learning_rate": 1.116e-07, + "loss": 0.0006, + "ppl": 0.0107421875, + "reward": 0.9990366697311401, + "reward_std": 0.00021151338296476752, + "rewards/perpo_ocr_edit_distance_reward": 0.9990366697311401, + "step": 3884, + "temperature": 0.9 + }, + { + "advantages": -2.3143633370636962e-05, + "completion_length": 581.0, + "delta_ref_entropy_loss": 0.037109375, + "delta_ref_ppl": -0.05126953125, + "entropy_loss": -0.056884765625, + "epoch": 0.777, + "grad_norm": 0.843916597735881, + "k1_kl": 0.051025390625, + "k3_kl": 0.032958984375, + "kimi_kl": 0.09521484375, + "learning_rate": 1.115e-07, + "loss": 0.0013, + "ppl": 0.022705078125, + "reward": 0.9963058829307556, + "reward_std": 0.0006357760285027325, + "rewards/perpo_ocr_edit_distance_reward": 0.9963059425354004, + "step": 3885, + "temperature": 0.9 + }, + { + "advantages": -6.999288416409399e-06, + "completion_length": 82.0, + "delta_ref_entropy_loss": 0.12158203125, + "delta_ref_ppl": -0.5234375, + "entropy_loss": -0.1796875, + "epoch": 0.7772, + "grad_norm": 3.4772453107019654, + "k1_kl": 0.5234375, + "k3_kl": 0.427734375, + "kimi_kl": 2.03125, + "learning_rate": 1.1139999999999999e-07, + "loss": 0.0171, + "ppl": 0.06591796875, + "reward": 0.9876334071159363, + "reward_std": 0.0035562103148549795, + "rewards/perpo_ocr_edit_distance_reward": 0.987633466720581, + "step": 3886, + "temperature": 0.9 + }, + { + "advantages": -1.1597361663007177e-05, + "completion_length": 536.0, + "delta_ref_entropy_loss": 0.0286865234375, + "delta_ref_ppl": -0.052001953125, + "entropy_loss": -0.06494140625, + "epoch": 0.7774, + "grad_norm": 0.5186470197790943, + "k1_kl": 0.052001953125, + "k3_kl": 0.03271484375, + "kimi_kl": 0.08837890625, + "learning_rate": 1.113e-07, + "loss": 0.0013, + "ppl": 0.0250244140625, + "reward": 0.9957878589630127, + "reward_std": 0.0013653523055836558, + "rewards/perpo_ocr_edit_distance_reward": 0.9957878589630127, + "step": 3887, + "temperature": 0.9 + }, + { + "advantages": -1.0831015970325097e-05, + "completion_length": 517.0, + "delta_ref_entropy_loss": 0.0245361328125, + "delta_ref_ppl": -0.09033203125, + "entropy_loss": -0.1181640625, + "epoch": 0.7776, + "grad_norm": 2.065815637073564, + "k1_kl": 0.09033203125, + "k3_kl": 0.06640625, + "kimi_kl": 0.2275390625, + "learning_rate": 1.1119999999999999e-07, + "loss": 0.0027, + "ppl": 0.04345703125, + "reward": 0.8394831418991089, + "reward_std": 0.00540171330794692, + "rewards/perpo_ocr_edit_distance_reward": 0.8394831418991089, + "step": 3888, + "temperature": 0.9 + }, + { + "advantages": -3.674200706882402e-05, + "completion_length": 579.0, + "delta_ref_entropy_loss": 0.06005859375, + "delta_ref_ppl": -0.08935546875, + "entropy_loss": -0.06689453125, + "epoch": 0.7778, + "grad_norm": 0.6369887319697383, + "k1_kl": 0.08935546875, + "k3_kl": 0.058349609375, + "kimi_kl": 0.1728515625, + "learning_rate": 1.111e-07, + "loss": 0.0024, + "ppl": 0.023193359375, + "reward": 0.9487367272377014, + "reward_std": 0.0008272569393739104, + "rewards/perpo_ocr_edit_distance_reward": 0.9487367868423462, + "step": 3889, + "temperature": 0.9 + }, + { + "advantages": -6.113733888923889e-06, + "completion_length": 875.0, + "delta_ref_entropy_loss": 0.060302734375, + "delta_ref_ppl": -0.049072265625, + "entropy_loss": -0.12890625, + "epoch": 0.778, + "grad_norm": 1.6417895034704448, + "k1_kl": 0.049072265625, + "k3_kl": 0.028076171875, + "kimi_kl": 0.05078125, + "learning_rate": 1.11e-07, + "loss": 0.0011, + "ppl": 0.05810546875, + "reward": 0.9734755158424377, + "reward_std": 0.0026861189398914576, + "rewards/perpo_ocr_edit_distance_reward": 0.9734755158424377, + "step": 3890, + "temperature": 0.9 + }, + { + "advantages": -2.0248549844836816e-05, + "completion_length": 645.0, + "delta_ref_entropy_loss": 0.08349609375, + "delta_ref_ppl": -0.11181640625, + "entropy_loss": -0.20703125, + "epoch": 0.7782, + "grad_norm": 2.3625069006408888, + "k1_kl": 0.111328125, + "k3_kl": 0.08056640625, + "kimi_kl": 0.2109375, + "learning_rate": 1.109e-07, + "loss": 0.0032, + "ppl": 0.11279296875, + "reward": 0.9279032945632935, + "reward_std": 0.003685971722006798, + "rewards/perpo_ocr_edit_distance_reward": 0.9279033541679382, + "step": 3891, + "temperature": 0.9 + }, + { + "advantages": -2.740110721788369e-05, + "completion_length": 396.0, + "delta_ref_entropy_loss": -0.00153350830078125, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.11181640625, + "epoch": 0.7784, + "grad_norm": 1.4081447973070829, + "k1_kl": 0.06689453125, + "k3_kl": 0.05126953125, + "kimi_kl": 0.185546875, + "learning_rate": 1.1079999999999999e-07, + "loss": 0.0021, + "ppl": 0.035888671875, + "reward": 0.8869390487670898, + "reward_std": 0.002385744359344244, + "rewards/perpo_ocr_edit_distance_reward": 0.8869391083717346, + "step": 3892, + "temperature": 0.9 + }, + { + "advantages": -1.4926707081031054e-05, + "completion_length": 557.0, + "delta_ref_entropy_loss": 0.055908203125, + "delta_ref_ppl": -0.0791015625, + "entropy_loss": -0.0771484375, + "epoch": 0.7786, + "grad_norm": 0.9033219642690009, + "k1_kl": 0.0791015625, + "k3_kl": 0.05078125, + "kimi_kl": 0.1591796875, + "learning_rate": 1.107e-07, + "loss": 0.002, + "ppl": 0.02685546875, + "reward": 0.9966250061988831, + "reward_std": 0.0016113610472530127, + "rewards/perpo_ocr_edit_distance_reward": 0.9966250061988831, + "step": 3893, + "temperature": 0.9 + }, + { + "advantages": -8.344650268554688e-06, + "completion_length": 870.0, + "delta_ref_entropy_loss": 0.000278472900390625, + "delta_ref_ppl": -0.0576171875, + "entropy_loss": -0.251953125, + "epoch": 0.7788, + "grad_norm": 53.02566963504063, + "k1_kl": 0.057861328125, + "k3_kl": 0.31640625, + "kimi_kl": 0.11474609375, + "learning_rate": 1.106e-07, + "loss": 0.0127, + "ppl": 0.11328125, + "reward": 0.9454814195632935, + "reward_std": 0.003964782226830721, + "rewards/perpo_ocr_edit_distance_reward": 0.9454814791679382, + "step": 3894, + "temperature": 0.9 + }, + { + "advantages": -2.7384077839087695e-05, + "completion_length": 429.0, + "delta_ref_entropy_loss": 0.025634765625, + "delta_ref_ppl": -0.058349609375, + "entropy_loss": -0.046630859375, + "epoch": 0.779, + "grad_norm": 0.7242553896585364, + "k1_kl": 0.058349609375, + "k3_kl": 0.043701171875, + "kimi_kl": 0.142578125, + "learning_rate": 1.1049999999999999e-07, + "loss": 0.0018, + "ppl": 0.0162353515625, + "reward": 0.9935299158096313, + "reward_std": 0.0008333919104188681, + "rewards/perpo_ocr_edit_distance_reward": 0.9935300350189209, + "step": 3895, + "temperature": 0.9 + }, + { + "advantages": -1.2993813470529858e-05, + "completion_length": 581.0, + "delta_ref_entropy_loss": 0.031005859375, + "delta_ref_ppl": -0.053955078125, + "entropy_loss": -0.056884765625, + "epoch": 0.7792, + "grad_norm": 0.6514642109733023, + "k1_kl": 0.053955078125, + "k3_kl": 0.034912109375, + "kimi_kl": 0.09375, + "learning_rate": 1.104e-07, + "loss": 0.0014, + "ppl": 0.0301513671875, + "reward": 0.9928767681121826, + "reward_std": 0.0018649031408131123, + "rewards/perpo_ocr_edit_distance_reward": 0.9928767681121826, + "step": 3896, + "temperature": 0.9 + }, + { + "advantages": 1.3113022987454315e-06, + "completion_length": 627.0, + "delta_ref_entropy_loss": -0.0228271484375, + "delta_ref_ppl": -0.1162109375, + "entropy_loss": -0.470703125, + "epoch": 0.7794, + "grad_norm": 3.3418836775766363, + "k1_kl": 0.115234375, + "k3_kl": 0.0986328125, + "kimi_kl": 0.1943359375, + "learning_rate": 1.1029999999999999e-07, + "loss": 0.0039, + "ppl": 0.236328125, + "reward": 0.7978796362876892, + "reward_std": 0.006383475847542286, + "rewards/perpo_ocr_edit_distance_reward": 0.7978796362876892, + "step": 3897, + "temperature": 0.9 + }, + { + "advantages": -3.084966374444775e-05, + "completion_length": 1218.0, + "delta_ref_entropy_loss": 0.039794921875, + "delta_ref_ppl": -0.056640625, + "entropy_loss": -0.103515625, + "epoch": 0.7796, + "grad_norm": 1.3071198354472953, + "k1_kl": 0.056640625, + "k3_kl": 0.03466796875, + "kimi_kl": 0.07373046875, + "learning_rate": 1.1020000000000001e-07, + "loss": 0.0014, + "ppl": 0.05029296875, + "reward": 0.984645426273346, + "reward_std": 0.0021078467834740877, + "rewards/perpo_ocr_edit_distance_reward": 0.9846455454826355, + "step": 3898, + "temperature": 0.9 + }, + { + "advantages": -4.1459290514467284e-05, + "completion_length": 169.0, + "delta_ref_entropy_loss": 0.03564453125, + "delta_ref_ppl": -0.2041015625, + "entropy_loss": -0.052734375, + "epoch": 0.7798, + "grad_norm": 2.0445694157509116, + "k1_kl": 0.2041015625, + "k3_kl": 0.154296875, + "kimi_kl": 0.890625, + "learning_rate": 1.101e-07, + "loss": 0.0062, + "ppl": 0.0189208984375, + "reward": 0.9614197015762329, + "reward_std": 0.0015432039508596063, + "rewards/perpo_ocr_edit_distance_reward": 0.9614198207855225, + "step": 3899, + "temperature": 0.9 + }, + { + "advantages": -1.9124576283502392e-05, + "completion_length": 1240.0, + "delta_ref_entropy_loss": 0.0289306640625, + "delta_ref_ppl": -0.053955078125, + "entropy_loss": -0.076171875, + "epoch": 0.78, + "grad_norm": 0.9644157342242448, + "k1_kl": 0.053955078125, + "k3_kl": 0.032470703125, + "kimi_kl": 0.0693359375, + "learning_rate": 1.0999999999999999e-07, + "loss": 0.0013, + "ppl": 0.03173828125, + "reward": 0.9282771348953247, + "reward_std": 0.0012360234977677464, + "rewards/perpo_ocr_edit_distance_reward": 0.9282771944999695, + "step": 3900, + "temperature": 0.9 + }, + { + "advantages": -5.3133284382056445e-05, + "completion_length": 531.0, + "delta_ref_entropy_loss": 0.0262451171875, + "delta_ref_ppl": -0.06982421875, + "entropy_loss": -0.03564453125, + "epoch": 0.7802, + "grad_norm": 0.587886340246124, + "k1_kl": 0.06982421875, + "k3_kl": 0.049072265625, + "kimi_kl": 0.185546875, + "learning_rate": 1.099e-07, + "loss": 0.002, + "ppl": 0.0135498046875, + "reward": 0.9966007471084595, + "reward_std": 0.0010217156959697604, + "rewards/perpo_ocr_edit_distance_reward": 0.996600866317749, + "step": 3901, + "temperature": 0.9 + }, + { + "advantages": -2.1406583982752636e-05, + "completion_length": 554.0, + "delta_ref_entropy_loss": 0.06494140625, + "delta_ref_ppl": -0.10205078125, + "entropy_loss": -0.173828125, + "epoch": 0.7804, + "grad_norm": 1.4060263021884059, + "k1_kl": 0.10205078125, + "k3_kl": 0.06884765625, + "kimi_kl": 0.197265625, + "learning_rate": 1.0979999999999999e-07, + "loss": 0.0028, + "ppl": 0.08154296875, + "reward": 0.9400147199630737, + "reward_std": 0.003082711948081851, + "rewards/perpo_ocr_edit_distance_reward": 0.9400148391723633, + "step": 3902, + "temperature": 0.9 + }, + { + "advantages": -1.8434866433381103e-05, + "completion_length": 1275.0, + "delta_ref_entropy_loss": 0.0419921875, + "delta_ref_ppl": -0.041748046875, + "entropy_loss": -0.0771484375, + "epoch": 0.7806, + "grad_norm": 201.8549324277344, + "k1_kl": 0.041748046875, + "k3_kl": 2.96875, + "kimi_kl": 0.06005859375, + "learning_rate": 1.097e-07, + "loss": 0.1187, + "ppl": 0.0390625, + "reward": 0.7544909715652466, + "reward_std": 0.005902908276766539, + "rewards/perpo_ocr_edit_distance_reward": 0.7544910311698914, + "step": 3903, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 709.0, + "delta_ref_entropy_loss": 0.031005859375, + "delta_ref_ppl": -0.05029296875, + "entropy_loss": -0.08251953125, + "epoch": 0.7808, + "grad_norm": 0.8822947724300811, + "k1_kl": 0.05029296875, + "k3_kl": 0.0289306640625, + "kimi_kl": 0.06689453125, + "learning_rate": 1.096e-07, + "loss": 0.0012, + "ppl": 0.03564453125, + "reward": 0.9952604174613953, + "reward_std": 0.0048350319266319275, + "rewards/perpo_ocr_edit_distance_reward": 0.9952604174613953, + "step": 3904, + "temperature": 0.9 + }, + { + "advantages": -2.0018645955133252e-05, + "completion_length": 520.0, + "delta_ref_entropy_loss": 0.0238037109375, + "delta_ref_ppl": -0.059326171875, + "entropy_loss": -0.06005859375, + "epoch": 0.781, + "grad_norm": 0.7057875824235903, + "k1_kl": 0.059326171875, + "k3_kl": 0.037109375, + "kimi_kl": 0.11767578125, + "learning_rate": 1.095e-07, + "loss": 0.0015, + "ppl": 0.025146484375, + "reward": 0.9876230955123901, + "reward_std": 0.002877668710425496, + "rewards/perpo_ocr_edit_distance_reward": 0.9876232147216797, + "step": 3905, + "temperature": 0.9 + }, + { + "advantages": -3.3829895983217284e-05, + "completion_length": 1209.0, + "delta_ref_entropy_loss": 0.01904296875, + "delta_ref_ppl": -0.030029296875, + "entropy_loss": -0.039794921875, + "epoch": 0.7812, + "grad_norm": 0.6319367275704803, + "k1_kl": 0.0301513671875, + "k3_kl": 0.017822265625, + "kimi_kl": 0.047119140625, + "learning_rate": 1.0939999999999999e-07, + "loss": 0.0007, + "ppl": 0.0145263671875, + "reward": 0.9968146085739136, + "reward_std": 0.0016606288263574243, + "rewards/perpo_ocr_edit_distance_reward": 0.9968147277832031, + "step": 3906, + "temperature": 0.9 + }, + { + "advantages": -5.200505620450713e-05, + "completion_length": 519.0, + "delta_ref_entropy_loss": 0.02392578125, + "delta_ref_ppl": -0.048095703125, + "entropy_loss": -0.032958984375, + "epoch": 0.7814, + "grad_norm": 0.4359489429725886, + "k1_kl": 0.048095703125, + "k3_kl": 0.0341796875, + "kimi_kl": 0.130859375, + "learning_rate": 1.0929999999999999e-07, + "loss": 0.0014, + "ppl": 0.00982666015625, + "reward": 0.9990354180335999, + "reward_std": 0.0005549233756028116, + "rewards/perpo_ocr_edit_distance_reward": 0.9990355372428894, + "step": 3907, + "temperature": 0.9 + }, + { + "advantages": -2.7120115191792138e-05, + "completion_length": 884.0, + "delta_ref_entropy_loss": 0.05615234375, + "delta_ref_ppl": -0.07861328125, + "entropy_loss": -0.083984375, + "epoch": 0.7816, + "grad_norm": 2.0152118965815746, + "k1_kl": 0.078125, + "k3_kl": 0.05126953125, + "kimi_kl": 0.119140625, + "learning_rate": 1.092e-07, + "loss": 0.0021, + "ppl": 0.041748046875, + "reward": 0.9311883449554443, + "reward_std": 0.0011561354622244835, + "rewards/perpo_ocr_edit_distance_reward": 0.9311884045600891, + "step": 3908, + "temperature": 0.9 + }, + { + "advantages": -0.00016074308950919658, + "completion_length": 551.0, + "delta_ref_entropy_loss": 0.031005859375, + "delta_ref_ppl": -0.07470703125, + "entropy_loss": -0.028076171875, + "epoch": 0.7818, + "grad_norm": 0.3235001636637544, + "k1_kl": 0.07470703125, + "k3_kl": 0.0546875, + "kimi_kl": 0.1708984375, + "learning_rate": 1.091e-07, + "loss": 0.0024, + "ppl": 0.00970458984375, + "reward": 0.9975106120109558, + "reward_std": 0.00037673121551051736, + "rewards/perpo_ocr_edit_distance_reward": 0.9975106716156006, + "step": 3909, + "temperature": 0.9 + }, + { + "advantages": -5.10896995820076e-08, + "completion_length": 41.0, + "delta_ref_entropy_loss": -0.55078125, + "delta_ref_ppl": -0.58203125, + "entropy_loss": -1.375, + "epoch": 0.782, + "grad_norm": 14.80657092829634, + "k1_kl": 0.58203125, + "k3_kl": 0.80859375, + "kimi_kl": 1.8984375, + "learning_rate": 1.09e-07, + "loss": 0.0323, + "ppl": 0.6796875, + "reward": 0.4268762171268463, + "reward_std": 0.17251528799533844, + "rewards/perpo_ocr_edit_distance_reward": 0.4268762171268463, + "step": 3910, + "temperature": 0.9 + }, + { + "advantages": -2.002716064453125e-05, + "completion_length": 131.0, + "delta_ref_entropy_loss": -0.0206298828125, + "delta_ref_ppl": -0.28125, + "entropy_loss": -0.10693359375, + "epoch": 0.7822, + "grad_norm": 2.435502990333794, + "k1_kl": 0.28125, + "k3_kl": 0.23828125, + "kimi_kl": 1.1484375, + "learning_rate": 1.0889999999999999e-07, + "loss": 0.0096, + "ppl": 0.0556640625, + "reward": 0.9291778802871704, + "reward_std": 0.0020282245241105556, + "rewards/perpo_ocr_edit_distance_reward": 0.9291778802871704, + "step": 3911, + "temperature": 0.9 + }, + { + "advantages": -1.7200197817146545e-06, + "completion_length": 731.0, + "delta_ref_entropy_loss": -0.030029296875, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.2138671875, + "epoch": 0.7824, + "grad_norm": 1.1074545624078496, + "k1_kl": 0.06689453125, + "k3_kl": 0.0546875, + "kimi_kl": 0.1708984375, + "learning_rate": 1.088e-07, + "loss": 0.0022, + "ppl": 0.0888671875, + "reward": 0.7883764505386353, + "reward_std": 0.024487629532814026, + "rewards/perpo_ocr_edit_distance_reward": 0.78837651014328, + "step": 3912, + "temperature": 0.9 + }, + { + "advantages": -9.114879503613338e-05, + "completion_length": 428.0, + "delta_ref_entropy_loss": 0.01904296875, + "delta_ref_ppl": -0.05517578125, + "entropy_loss": -0.036376953125, + "epoch": 0.7826, + "grad_norm": 0.4315355528327022, + "k1_kl": 0.054931640625, + "k3_kl": 0.04052734375, + "kimi_kl": 0.1318359375, + "learning_rate": 1.087e-07, + "loss": 0.0017, + "ppl": 0.01287841796875, + "reward": 0.9964028000831604, + "reward_std": 0.00036711941356770694, + "rewards/perpo_ocr_edit_distance_reward": 0.99640291929245, + "step": 3913, + "temperature": 0.9 + }, + { + "advantages": 4.427773774295929e-07, + "completion_length": 433.0, + "delta_ref_entropy_loss": -0.1552734375, + "delta_ref_ppl": -0.078125, + "entropy_loss": -0.67578125, + "epoch": 0.7828, + "grad_norm": 3.904231565453246, + "k1_kl": 0.07763671875, + "k3_kl": 0.08349609375, + "kimi_kl": 0.1376953125, + "learning_rate": 1.0859999999999999e-07, + "loss": 0.0033, + "ppl": 0.376953125, + "reward": 0.4859432578086853, + "reward_std": 0.028465943410992622, + "rewards/perpo_ocr_edit_distance_reward": 0.4859432578086853, + "step": 3914, + "temperature": 0.9 + }, + { + "advantages": -2.869538002414629e-05, + "completion_length": 677.0, + "delta_ref_entropy_loss": 0.03466796875, + "delta_ref_ppl": -0.057373046875, + "entropy_loss": -0.03369140625, + "epoch": 0.783, + "grad_norm": 0.5644262485912943, + "k1_kl": 0.0576171875, + "k3_kl": 0.036376953125, + "kimi_kl": 0.11376953125, + "learning_rate": 1.085e-07, + "loss": 0.0015, + "ppl": 0.0118408203125, + "reward": 0.9868078231811523, + "reward_std": 0.0013840490719303489, + "rewards/perpo_ocr_edit_distance_reward": 0.9868078827857971, + "step": 3915, + "temperature": 0.9 + }, + { + "advantages": 4.751341748487903e-06, + "completion_length": 296.0, + "delta_ref_entropy_loss": 0.03271484375, + "delta_ref_ppl": -0.10888671875, + "entropy_loss": -0.060546875, + "epoch": 0.7832, + "grad_norm": 1.0030653119717907, + "k1_kl": 0.10888671875, + "k3_kl": 0.08251953125, + "kimi_kl": 0.373046875, + "learning_rate": 1.0839999999999999e-07, + "loss": 0.0033, + "ppl": 0.0238037109375, + "reward": 0.9918975830078125, + "reward_std": 0.001692386926151812, + "rewards/perpo_ocr_edit_distance_reward": 0.9918976426124573, + "step": 3916, + "temperature": 0.9 + }, + { + "advantages": -1.965250339708291e-05, + "completion_length": 468.0, + "delta_ref_entropy_loss": 0.048583984375, + "delta_ref_ppl": -0.11328125, + "entropy_loss": -0.095703125, + "epoch": 0.7834, + "grad_norm": 1.3157720832643294, + "k1_kl": 0.11279296875, + "k3_kl": 0.06640625, + "kimi_kl": 0.14453125, + "learning_rate": 1.0829999999999998e-07, + "loss": 0.0027, + "ppl": 0.0380859375, + "reward": 0.9787326455116272, + "reward_std": 0.002500450937077403, + "rewards/perpo_ocr_edit_distance_reward": 0.9787327647209167, + "step": 3917, + "temperature": 0.9 + }, + { + "advantages": -1.1733600331353955e-05, + "completion_length": 388.0, + "delta_ref_entropy_loss": 0.07763671875, + "delta_ref_ppl": -0.08837890625, + "entropy_loss": -0.083984375, + "epoch": 0.7836, + "grad_norm": 0.9993258944790115, + "k1_kl": 0.08837890625, + "k3_kl": 0.051513671875, + "kimi_kl": 0.115234375, + "learning_rate": 1.082e-07, + "loss": 0.0021, + "ppl": 0.035888671875, + "reward": 0.9867302775382996, + "reward_std": 0.0028016327414661646, + "rewards/perpo_ocr_edit_distance_reward": 0.9867302775382996, + "step": 3918, + "temperature": 0.9 + }, + { + "advantages": -2.8082304197596386e-05, + "completion_length": 327.0, + "delta_ref_entropy_loss": 0.0296630859375, + "delta_ref_ppl": -0.0859375, + "entropy_loss": -0.07421875, + "epoch": 0.7838, + "grad_norm": 1.0346983334015192, + "k1_kl": 0.0859375, + "k3_kl": 0.061279296875, + "kimi_kl": 0.166015625, + "learning_rate": 1.081e-07, + "loss": 0.0025, + "ppl": 0.035888671875, + "reward": 0.9815943241119385, + "reward_std": 0.0020225667394697666, + "rewards/perpo_ocr_edit_distance_reward": 0.9815943837165833, + "step": 3919, + "temperature": 0.9 + }, + { + "advantages": -1.1920928955078125e-07, + "completion_length": 281.0, + "delta_ref_entropy_loss": -0.076171875, + "delta_ref_ppl": -0.08544921875, + "entropy_loss": -0.26171875, + "epoch": 0.784, + "grad_norm": 2.257620759296328, + "k1_kl": 0.0849609375, + "k3_kl": 0.07666015625, + "kimi_kl": 0.1923828125, + "learning_rate": 1.0799999999999999e-07, + "loss": 0.0031, + "ppl": 0.109375, + "reward": 0.8518788814544678, + "reward_std": 0.13678713142871857, + "rewards/perpo_ocr_edit_distance_reward": 0.8518788814544678, + "step": 3920, + "temperature": 0.9 + }, + { + "advantages": -2.309680166945327e-05, + "completion_length": 90.0, + "delta_ref_entropy_loss": 0.0751953125, + "delta_ref_ppl": -0.353515625, + "entropy_loss": -0.08349609375, + "epoch": 0.7842, + "grad_norm": 2.3410136492635387, + "k1_kl": 0.353515625, + "k3_kl": 0.279296875, + "kimi_kl": 1.1953125, + "learning_rate": 1.079e-07, + "loss": 0.0112, + "ppl": 0.033935546875, + "reward": 0.9937458038330078, + "reward_std": 0.001741796382702887, + "rewards/perpo_ocr_edit_distance_reward": 0.9937459230422974, + "step": 3921, + "temperature": 0.9 + }, + { + "advantages": 1.7327922250842676e-05, + "completion_length": 598.0, + "delta_ref_entropy_loss": 0.02099609375, + "delta_ref_ppl": -0.0537109375, + "entropy_loss": -0.0294189453125, + "epoch": 0.7844, + "grad_norm": 0.2898785353510074, + "k1_kl": 0.0537109375, + "k3_kl": 0.034912109375, + "kimi_kl": 0.11669921875, + "learning_rate": 1.078e-07, + "loss": 0.0014, + "ppl": 0.00872802734375, + "reward": 0.9994713068008423, + "reward_std": 0.0003911074891220778, + "rewards/perpo_ocr_edit_distance_reward": 0.9994713068008423, + "step": 3922, + "temperature": 0.9 + }, + { + "advantages": 7.62939453125e-06, + "completion_length": 445.0, + "delta_ref_entropy_loss": 0.0322265625, + "delta_ref_ppl": -0.07763671875, + "entropy_loss": -0.054443359375, + "epoch": 0.7846, + "grad_norm": 0.628176570611901, + "k1_kl": 0.07763671875, + "k3_kl": 0.0615234375, + "kimi_kl": 0.203125, + "learning_rate": 1.077e-07, + "loss": 0.0025, + "ppl": 0.0205078125, + "reward": 0.9962059855461121, + "reward_std": 0.0010153373004868627, + "rewards/perpo_ocr_edit_distance_reward": 0.9962060451507568, + "step": 3923, + "temperature": 0.9 + }, + { + "advantages": -0.00010039125481853262, + "completion_length": 775.0, + "delta_ref_entropy_loss": 0.01409912109375, + "delta_ref_ppl": -0.042236328125, + "entropy_loss": -0.03466796875, + "epoch": 0.7848, + "grad_norm": 0.39990411699341155, + "k1_kl": 0.042236328125, + "k3_kl": 0.02880859375, + "kimi_kl": 0.09326171875, + "learning_rate": 1.076e-07, + "loss": 0.0013, + "ppl": 0.0126953125, + "reward": 0.9939226508140564, + "reward_std": 0.0009178118198178709, + "rewards/perpo_ocr_edit_distance_reward": 0.9939227104187012, + "step": 3924, + "temperature": 0.9 + }, + { + "advantages": -8.344650836988876e-07, + "completion_length": 540.0, + "delta_ref_entropy_loss": 0.05517578125, + "delta_ref_ppl": -0.126953125, + "entropy_loss": -0.314453125, + "epoch": 0.785, + "grad_norm": 3.2639554039975796, + "k1_kl": 0.126953125, + "k3_kl": 0.08984375, + "kimi_kl": 0.2265625, + "learning_rate": 1.0749999999999999e-07, + "loss": 0.0036, + "ppl": 0.1494140625, + "reward": 0.6881469488143921, + "reward_std": 0.07248537242412567, + "rewards/perpo_ocr_edit_distance_reward": 0.6881470084190369, + "step": 3925, + "temperature": 0.9 + }, + { + "advantages": -5.713531209039502e-05, + "completion_length": 596.0, + "delta_ref_entropy_loss": 0.072265625, + "delta_ref_ppl": -0.0673828125, + "entropy_loss": -0.1015625, + "epoch": 0.7852, + "grad_norm": 1.0047686106009601, + "k1_kl": 0.0673828125, + "k3_kl": 0.038818359375, + "kimi_kl": 0.09326171875, + "learning_rate": 1.074e-07, + "loss": 0.0016, + "ppl": 0.042236328125, + "reward": 0.9190916419029236, + "reward_std": 0.0007942520314827561, + "rewards/perpo_ocr_edit_distance_reward": 0.9190917611122131, + "step": 3926, + "temperature": 0.9 + }, + { + "advantages": -1.9584383892379265e-07, + "completion_length": 653.0, + "delta_ref_entropy_loss": -0.0517578125, + "delta_ref_ppl": -0.09716796875, + "entropy_loss": -0.361328125, + "epoch": 0.7854, + "grad_norm": 2.566468491744604, + "k1_kl": 0.09716796875, + "k3_kl": 0.08447265625, + "kimi_kl": 0.1640625, + "learning_rate": 1.073e-07, + "loss": 0.0034, + "ppl": 0.1689453125, + "reward": 0.3794829547405243, + "reward_std": 0.07181022316217422, + "rewards/perpo_ocr_edit_distance_reward": 0.3794829547405243, + "step": 3927, + "temperature": 0.9 + }, + { + "advantages": -3.196512261638418e-05, + "completion_length": 463.0, + "delta_ref_entropy_loss": 0.02978515625, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.049560546875, + "epoch": 0.7856, + "grad_norm": 0.6027705930037637, + "k1_kl": 0.06640625, + "k3_kl": 0.04345703125, + "kimi_kl": 0.1396484375, + "learning_rate": 1.072e-07, + "loss": 0.0018, + "ppl": 0.0201416015625, + "reward": 0.9942699670791626, + "reward_std": 0.0006991294212639332, + "rewards/perpo_ocr_edit_distance_reward": 0.9942700266838074, + "step": 3928, + "temperature": 0.9 + }, + { + "advantages": -2.8099333349018707e-07, + "completion_length": 467.0, + "delta_ref_entropy_loss": -0.5, + "delta_ref_ppl": -0.10986328125, + "entropy_loss": -1.046875, + "epoch": 0.7858, + "grad_norm": 5.016746344151067, + "k1_kl": 0.109375, + "k3_kl": 0.171875, + "kimi_kl": 0.36328125, + "learning_rate": 1.071e-07, + "loss": 0.0069, + "ppl": 0.578125, + "reward": 0.1936897337436676, + "reward_std": 0.06684233248233795, + "rewards/perpo_ocr_edit_distance_reward": 0.19368976354599, + "step": 3929, + "temperature": 0.9 + }, + { + "advantages": -7.067408205330139e-06, + "completion_length": 41.0, + "delta_ref_entropy_loss": 0.048828125, + "delta_ref_ppl": -0.77734375, + "entropy_loss": -0.1748046875, + "epoch": 0.786, + "grad_norm": 2.9484179217561004, + "k1_kl": 0.77734375, + "k3_kl": 0.671875, + "kimi_kl": 3.171875, + "learning_rate": 1.0699999999999999e-07, + "loss": 0.0269, + "ppl": 0.0712890625, + "reward": 0.8036998510360718, + "reward_std": 0.003510428359732032, + "rewards/perpo_ocr_edit_distance_reward": 0.8036999106407166, + "step": 3930, + "temperature": 0.9 + }, + { + "advantages": -8.353165867447387e-06, + "completion_length": 474.0, + "delta_ref_entropy_loss": 0.1201171875, + "delta_ref_ppl": -0.1845703125, + "entropy_loss": -0.462890625, + "epoch": 0.7862, + "grad_norm": 2.4653049827884326, + "k1_kl": 0.1845703125, + "k3_kl": 0.107421875, + "kimi_kl": 0.22265625, + "learning_rate": 1.0689999999999998e-07, + "loss": 0.0043, + "ppl": 0.2353515625, + "reward": 0.7330136895179749, + "reward_std": 0.004997077863663435, + "rewards/perpo_ocr_edit_distance_reward": 0.7330137491226196, + "step": 3931, + "temperature": 0.9 + }, + { + "advantages": -3.167561317241052e-06, + "completion_length": 55.0, + "delta_ref_entropy_loss": -0.036865234375, + "delta_ref_ppl": -0.6328125, + "entropy_loss": -0.42578125, + "epoch": 0.7864, + "grad_norm": 11.019906920815423, + "k1_kl": 0.62890625, + "k3_kl": 0.55078125, + "kimi_kl": 3.046875, + "learning_rate": 1.068e-07, + "loss": 0.0221, + "ppl": 0.138671875, + "reward": 0.3689248859882355, + "reward_std": 0.006558857858181, + "rewards/perpo_ocr_edit_distance_reward": 0.36892491579055786, + "step": 3932, + "temperature": 0.9 + }, + { + "advantages": -1.5216214706015307e-05, + "completion_length": 306.0, + "delta_ref_entropy_loss": 0.0693359375, + "delta_ref_ppl": -0.1474609375, + "entropy_loss": -0.1376953125, + "epoch": 0.7866, + "grad_norm": 1.5011750227789495, + "k1_kl": 0.1474609375, + "k3_kl": 0.1064453125, + "kimi_kl": 0.412109375, + "learning_rate": 1.067e-07, + "loss": 0.0043, + "ppl": 0.0634765625, + "reward": 0.6097269654273987, + "reward_std": 0.0015810326440259814, + "rewards/perpo_ocr_edit_distance_reward": 0.6097270250320435, + "step": 3933, + "temperature": 0.9 + }, + { + "advantages": -2.4778502847766504e-05, + "completion_length": 383.0, + "delta_ref_entropy_loss": 0.052490234375, + "delta_ref_ppl": -0.08740234375, + "entropy_loss": -0.04931640625, + "epoch": 0.7868, + "grad_norm": 0.50478074912104, + "k1_kl": 0.08740234375, + "k3_kl": 0.061279296875, + "kimi_kl": 0.1689453125, + "learning_rate": 1.066e-07, + "loss": 0.0025, + "ppl": 0.017578125, + "reward": 0.9888084530830383, + "reward_std": 0.0009309992892667651, + "rewards/perpo_ocr_edit_distance_reward": 0.9888084530830383, + "step": 3934, + "temperature": 0.9 + }, + { + "advantages": -1.1588846064114477e-05, + "completion_length": 875.0, + "delta_ref_entropy_loss": 0.042236328125, + "delta_ref_ppl": -0.083984375, + "entropy_loss": -0.427734375, + "epoch": 0.787, + "grad_norm": 2.833969080016272, + "k1_kl": 0.08447265625, + "k3_kl": 0.07177734375, + "kimi_kl": 0.125, + "learning_rate": 1.065e-07, + "loss": 0.0029, + "ppl": 0.228515625, + "reward": 0.6216245889663696, + "reward_std": 0.004301116336137056, + "rewards/perpo_ocr_edit_distance_reward": 0.6216246485710144, + "step": 3935, + "temperature": 0.9 + }, + { + "advantages": -3.2697407732484862e-06, + "completion_length": 780.0, + "delta_ref_entropy_loss": 0.0291748046875, + "delta_ref_ppl": -0.05908203125, + "entropy_loss": -0.055908203125, + "epoch": 0.7872, + "grad_norm": 0.9886888742269552, + "k1_kl": 0.05908203125, + "k3_kl": 0.039306640625, + "kimi_kl": 0.11181640625, + "learning_rate": 1.0639999999999999e-07, + "loss": 0.0016, + "ppl": 0.0179443359375, + "reward": 0.9899382591247559, + "reward_std": 0.015478739514946938, + "rewards/perpo_ocr_edit_distance_reward": 0.9899383783340454, + "step": 3936, + "temperature": 0.9 + }, + { + "advantages": -7.808208465576172e-05, + "completion_length": 824.0, + "delta_ref_entropy_loss": 0.038330078125, + "delta_ref_ppl": -0.0498046875, + "entropy_loss": -0.0693359375, + "epoch": 0.7874, + "grad_norm": 0.8978644654002952, + "k1_kl": 0.050048828125, + "k3_kl": 0.0303955078125, + "kimi_kl": 0.06982421875, + "learning_rate": 1.063e-07, + "loss": 0.0013, + "ppl": 0.0299072265625, + "reward": 0.9742259383201599, + "reward_std": 0.0013177108485251665, + "rewards/perpo_ocr_edit_distance_reward": 0.9742260575294495, + "step": 3937, + "temperature": 0.9 + }, + { + "advantages": -8.305481605930254e-05, + "completion_length": 697.0, + "delta_ref_entropy_loss": 0.019775390625, + "delta_ref_ppl": -0.03515625, + "entropy_loss": -0.044677734375, + "epoch": 0.7876, + "grad_norm": 0.941280908567231, + "k1_kl": 0.035400390625, + "k3_kl": 0.026611328125, + "kimi_kl": 0.06591796875, + "learning_rate": 1.062e-07, + "loss": 0.0011, + "ppl": 0.0159912109375, + "reward": 0.9962606430053711, + "reward_std": 0.0009252509917132556, + "rewards/perpo_ocr_edit_distance_reward": 0.9962607622146606, + "step": 3938, + "temperature": 0.9 + }, + { + "advantages": -1.9754684217332397e-06, + "completion_length": 673.0, + "delta_ref_entropy_loss": 0.0135498046875, + "delta_ref_ppl": -0.0693359375, + "entropy_loss": -0.12158203125, + "epoch": 0.7878, + "grad_norm": 0.7293256182661306, + "k1_kl": 0.0693359375, + "k3_kl": 0.045654296875, + "kimi_kl": 0.111328125, + "learning_rate": 1.0609999999999999e-07, + "loss": 0.0018, + "ppl": 0.05078125, + "reward": 0.7531187534332275, + "reward_std": 0.004150677938014269, + "rewards/perpo_ocr_edit_distance_reward": 0.7531188130378723, + "step": 3939, + "temperature": 0.9 + }, + { + "advantages": -0.00013010842667426914, + "completion_length": 831.0, + "delta_ref_entropy_loss": 0.025146484375, + "delta_ref_ppl": -0.052001953125, + "entropy_loss": -0.043212890625, + "epoch": 0.788, + "grad_norm": 0.3767330391301369, + "k1_kl": 0.0517578125, + "k3_kl": 0.033447265625, + "kimi_kl": 0.10400390625, + "learning_rate": 1.06e-07, + "loss": 0.0015, + "ppl": 0.0140380859375, + "reward": 0.9986814856529236, + "reward_std": 0.0006199208437465131, + "rewards/perpo_ocr_edit_distance_reward": 0.9986815452575684, + "step": 3940, + "temperature": 0.9 + }, + { + "advantages": -3.7465778746081924e-07, + "completion_length": 1226.0, + "delta_ref_entropy_loss": 0.033447265625, + "delta_ref_ppl": -0.07275390625, + "entropy_loss": -0.380859375, + "epoch": 0.7882, + "grad_norm": 2.2226764192937387, + "k1_kl": 0.0732421875, + "k3_kl": 0.052734375, + "kimi_kl": 0.0810546875, + "learning_rate": 1.0589999999999999e-07, + "loss": 0.0021, + "ppl": 0.1865234375, + "reward": 0.7416242361068726, + "reward_std": 0.021998224779963493, + "rewards/perpo_ocr_edit_distance_reward": 0.7416242361068726, + "step": 3941, + "temperature": 0.9 + }, + { + "advantages": 8.514949634275126e-09, + "completion_length": 464.0, + "delta_ref_entropy_loss": 0.05078125, + "delta_ref_ppl": -0.0986328125, + "entropy_loss": -0.09521484375, + "epoch": 0.7884, + "grad_norm": 1.0683670565437713, + "k1_kl": 0.0986328125, + "k3_kl": 0.06494140625, + "kimi_kl": 0.1806640625, + "learning_rate": 1.058e-07, + "loss": 0.0026, + "ppl": 0.04345703125, + "reward": 0.9662081599235535, + "reward_std": 0.0007743349997326732, + "rewards/perpo_ocr_edit_distance_reward": 0.9662082195281982, + "step": 3942, + "temperature": 0.9 + }, + { + "advantages": -1.4679772903036792e-05, + "completion_length": 821.0, + "delta_ref_entropy_loss": 0.061279296875, + "delta_ref_ppl": -0.1064453125, + "entropy_loss": -0.265625, + "epoch": 0.7886, + "grad_norm": 1.6882544717831232, + "k1_kl": 0.1064453125, + "k3_kl": 0.0625, + "kimi_kl": 0.150390625, + "learning_rate": 1.057e-07, + "loss": 0.0025, + "ppl": 0.1396484375, + "reward": 0.9491842985153198, + "reward_std": 0.004540703725069761, + "rewards/perpo_ocr_edit_distance_reward": 0.9491844177246094, + "step": 3943, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 456.0, + "delta_ref_entropy_loss": 0.0267333984375, + "delta_ref_ppl": -0.09130859375, + "entropy_loss": -0.087890625, + "epoch": 0.7888, + "grad_norm": 0.5391509194877457, + "k1_kl": 0.09130859375, + "k3_kl": 0.06201171875, + "kimi_kl": 0.244140625, + "learning_rate": 1.0559999999999999e-07, + "loss": 0.0025, + "ppl": 0.0380859375, + "reward": 0.9797859191894531, + "reward_std": 0.001572981709614396, + "rewards/perpo_ocr_edit_distance_reward": 0.9797859787940979, + "step": 3944, + "temperature": 0.9 + }, + { + "advantages": 4.257474817137563e-09, + "completion_length": 353.0, + "delta_ref_entropy_loss": 0.0181884765625, + "delta_ref_ppl": -0.07958984375, + "entropy_loss": -0.045166015625, + "epoch": 0.789, + "grad_norm": 0.2515808682949422, + "k1_kl": 0.080078125, + "k3_kl": 0.05615234375, + "kimi_kl": 0.1962890625, + "learning_rate": 1.0549999999999999e-07, + "loss": 0.0022, + "ppl": 0.01434326171875, + "reward": 0.994339108467102, + "reward_std": 0.00026854066527448595, + "rewards/perpo_ocr_edit_distance_reward": 0.9943391680717468, + "step": 3945, + "temperature": 0.9 + }, + { + "advantages": -2.1713121896027587e-06, + "completion_length": 502.0, + "delta_ref_entropy_loss": 0.0213623046875, + "delta_ref_ppl": -0.09619140625, + "entropy_loss": -0.328125, + "epoch": 0.7892, + "grad_norm": 1.6876634180668926, + "k1_kl": 0.09619140625, + "k3_kl": 0.07568359375, + "kimi_kl": 0.16796875, + "learning_rate": 1.0539999999999999e-07, + "loss": 0.003, + "ppl": 0.1591796875, + "reward": 0.6539961099624634, + "reward_std": 0.0038412315770983696, + "rewards/perpo_ocr_edit_distance_reward": 0.6539961695671082, + "step": 3946, + "temperature": 0.9 + }, + { + "advantages": -1.0064670277643017e-05, + "completion_length": 242.0, + "delta_ref_entropy_loss": 0.09423828125, + "delta_ref_ppl": -0.1484375, + "entropy_loss": -0.08837890625, + "epoch": 0.7894, + "grad_norm": 1.59262520411069, + "k1_kl": 0.1484375, + "k3_kl": 0.09765625, + "kimi_kl": 0.287109375, + "learning_rate": 1.053e-07, + "loss": 0.0039, + "ppl": 0.03955078125, + "reward": 0.9869316220283508, + "reward_std": 0.0015945027116686106, + "rewards/perpo_ocr_edit_distance_reward": 0.9869316220283508, + "step": 3947, + "temperature": 0.9 + }, + { + "advantages": -8.05450399639085e-05, + "completion_length": 351.0, + "delta_ref_entropy_loss": 0.043701171875, + "delta_ref_ppl": -0.083984375, + "entropy_loss": -0.045166015625, + "epoch": 0.7896, + "grad_norm": 0.5717553425464355, + "k1_kl": 0.08447265625, + "k3_kl": 0.06005859375, + "kimi_kl": 0.23828125, + "learning_rate": 1.052e-07, + "loss": 0.0025, + "ppl": 0.0159912109375, + "reward": 0.9846477508544922, + "reward_std": 0.0009571976843290031, + "rewards/perpo_ocr_edit_distance_reward": 0.9846479296684265, + "step": 3948, + "temperature": 0.9 + }, + { + "advantages": -2.043587983280304e-06, + "completion_length": 617.0, + "delta_ref_entropy_loss": 0.078125, + "delta_ref_ppl": -0.1044921875, + "entropy_loss": -0.1728515625, + "epoch": 0.7898, + "grad_norm": 1.1775300296451798, + "k1_kl": 0.1044921875, + "k3_kl": 0.0654296875, + "kimi_kl": 0.1611328125, + "learning_rate": 1.051e-07, + "loss": 0.0026, + "ppl": 0.080078125, + "reward": 0.9433035254478455, + "reward_std": 0.0041158776730299, + "rewards/perpo_ocr_edit_distance_reward": 0.9433035850524902, + "step": 3949, + "temperature": 0.9 + }, + { + "advantages": -6.23294317847467e-06, + "completion_length": 513.0, + "delta_ref_entropy_loss": 0.0223388671875, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.043701171875, + "epoch": 0.79, + "grad_norm": 0.8309351951017252, + "k1_kl": 0.06689453125, + "k3_kl": 0.0458984375, + "kimi_kl": 0.138671875, + "learning_rate": 1.0499999999999999e-07, + "loss": 0.0018, + "ppl": 0.0206298828125, + "reward": 0.9854146242141724, + "reward_std": 0.001268817693926394, + "rewards/perpo_ocr_edit_distance_reward": 0.9854146838188171, + "step": 3950, + "temperature": 0.9 + }, + { + "advantages": -2.7077539925812744e-05, + "completion_length": 422.0, + "delta_ref_entropy_loss": 0.060546875, + "delta_ref_ppl": -0.10693359375, + "entropy_loss": -0.083984375, + "epoch": 0.7902, + "grad_norm": 0.47481070071718406, + "k1_kl": 0.10693359375, + "k3_kl": 0.06591796875, + "kimi_kl": 0.2490234375, + "learning_rate": 1.0489999999999999e-07, + "loss": 0.0027, + "ppl": 0.0289306640625, + "reward": 0.9822194576263428, + "reward_std": 0.0008441998506896198, + "rewards/perpo_ocr_edit_distance_reward": 0.9822195172309875, + "step": 3951, + "temperature": 0.9 + }, + { + "advantages": -0.00011697837908286601, + "completion_length": 830.0, + "delta_ref_entropy_loss": 0.0166015625, + "delta_ref_ppl": -0.044677734375, + "entropy_loss": -0.0302734375, + "epoch": 0.7904, + "grad_norm": 0.2137677399691578, + "k1_kl": 0.044677734375, + "k3_kl": 0.030029296875, + "kimi_kl": 0.0966796875, + "learning_rate": 1.048e-07, + "loss": 0.0013, + "ppl": 0.0072021484375, + "reward": 0.996196985244751, + "reward_std": 0.0005550351343117654, + "rewards/perpo_ocr_edit_distance_reward": 0.9961970448493958, + "step": 3952, + "temperature": 0.9 + }, + { + "advantages": 2.469335413479712e-06, + "completion_length": 664.0, + "delta_ref_entropy_loss": 0.01141357421875, + "delta_ref_ppl": -0.09423828125, + "entropy_loss": -0.396484375, + "epoch": 0.7906, + "grad_norm": 2.301720521662361, + "k1_kl": 0.09375, + "k3_kl": 0.076171875, + "kimi_kl": 0.1552734375, + "learning_rate": 1.0469999999999999e-07, + "loss": 0.003, + "ppl": 0.2021484375, + "reward": 0.8291004300117493, + "reward_std": 0.013623703271150589, + "rewards/perpo_ocr_edit_distance_reward": 0.829100489616394, + "step": 3953, + "temperature": 0.9 + }, + { + "advantages": -0.00012241091462783515, + "completion_length": 633.0, + "delta_ref_entropy_loss": 0.0361328125, + "delta_ref_ppl": -0.05078125, + "entropy_loss": -0.023193359375, + "epoch": 0.7908, + "grad_norm": 0.37009050396042753, + "k1_kl": 0.050537109375, + "k3_kl": 0.0279541015625, + "kimi_kl": 0.07666015625, + "learning_rate": 1.046e-07, + "loss": 0.0012, + "ppl": 0.0054931640625, + "reward": 0.9997454285621643, + "reward_std": 0.00031741964630782604, + "rewards/perpo_ocr_edit_distance_reward": 0.9997455477714539, + "step": 3954, + "temperature": 0.9 + }, + { + "advantages": -3.88281705454574e-06, + "completion_length": 742.0, + "delta_ref_entropy_loss": 0.111328125, + "delta_ref_ppl": -0.107421875, + "entropy_loss": -0.185546875, + "epoch": 0.791, + "grad_norm": 4.473712913905957, + "k1_kl": 0.10693359375, + "k3_kl": 0.08984375, + "kimi_kl": 0.162109375, + "learning_rate": 1.0449999999999999e-07, + "loss": 0.0036, + "ppl": 0.08984375, + "reward": 0.15155091881752014, + "reward_std": 0.0015467037446796894, + "rewards/perpo_ocr_edit_distance_reward": 0.15155093371868134, + "step": 3955, + "temperature": 0.9 + }, + { + "advantages": -0.0001850383705459535, + "completion_length": 571.0, + "delta_ref_entropy_loss": 0.035888671875, + "delta_ref_ppl": -0.04638671875, + "entropy_loss": -0.04150390625, + "epoch": 0.7912, + "grad_norm": 0.26719813672813764, + "k1_kl": 0.04638671875, + "k3_kl": 0.026123046875, + "kimi_kl": 0.0849609375, + "learning_rate": 1.0440000000000001e-07, + "loss": 0.0012, + "ppl": 0.01055908203125, + "reward": 0.9875054359436035, + "reward_std": 0.000360194593667984, + "rewards/perpo_ocr_edit_distance_reward": 0.9875055551528931, + "step": 3956, + "temperature": 0.9 + }, + { + "advantages": -2.8542111977003515e-05, + "completion_length": 1018.0, + "delta_ref_entropy_loss": 0.0130615234375, + "delta_ref_ppl": -0.053955078125, + "entropy_loss": -0.10009765625, + "epoch": 0.7914, + "grad_norm": 0.681511968864486, + "k1_kl": 0.053955078125, + "k3_kl": 0.03955078125, + "kimi_kl": 0.10791015625, + "learning_rate": 1.043e-07, + "loss": 0.0016, + "ppl": 0.04345703125, + "reward": 0.9210025072097778, + "reward_std": 0.0022867866791784763, + "rewards/perpo_ocr_edit_distance_reward": 0.9210025668144226, + "step": 3957, + "temperature": 0.9 + }, + { + "advantages": -2.4693355271665496e-07, + "completion_length": 50.0, + "delta_ref_entropy_loss": -0.8046875, + "delta_ref_ppl": -0.93359375, + "entropy_loss": -1.4765625, + "epoch": 0.7916, + "grad_norm": 11.006360210205434, + "k1_kl": 0.93359375, + "k3_kl": 0.8203125, + "kimi_kl": 3.53125, + "learning_rate": 1.0419999999999999e-07, + "loss": 0.0329, + "ppl": 0.609375, + "reward": 0.31122449040412903, + "reward_std": 0.12525787949562073, + "rewards/perpo_ocr_edit_distance_reward": 0.3112245202064514, + "step": 3958, + "temperature": 0.9 + }, + { + "advantages": 1.2431826689862646e-05, + "completion_length": 536.0, + "delta_ref_entropy_loss": 0.061279296875, + "delta_ref_ppl": -0.10546875, + "entropy_loss": -0.06884765625, + "epoch": 0.7918, + "grad_norm": 0.8840154377678432, + "k1_kl": 0.10498046875, + "k3_kl": 0.06787109375, + "kimi_kl": 0.1748046875, + "learning_rate": 1.041e-07, + "loss": 0.0027, + "ppl": 0.031982421875, + "reward": 0.9913195967674255, + "reward_std": 0.0012713732430711389, + "rewards/perpo_ocr_edit_distance_reward": 0.9913195371627808, + "step": 3959, + "temperature": 0.9 + }, + { + "advantages": -2.889973984565586e-05, + "completion_length": 1101.0, + "delta_ref_entropy_loss": 0.023193359375, + "delta_ref_ppl": -0.038330078125, + "entropy_loss": -0.0673828125, + "epoch": 0.792, + "grad_norm": 0.6324819886545641, + "k1_kl": 0.038330078125, + "k3_kl": 0.025146484375, + "kimi_kl": 0.06982421875, + "learning_rate": 1.0399999999999999e-07, + "loss": 0.001, + "ppl": 0.0260009765625, + "reward": 0.9932078123092651, + "reward_std": 0.0010787559440359473, + "rewards/perpo_ocr_edit_distance_reward": 0.9932078123092651, + "step": 3960, + "temperature": 0.9 + }, + { + "advantages": -9.70704263636435e-07, + "completion_length": 1242.0, + "delta_ref_entropy_loss": 0.015869140625, + "delta_ref_ppl": -0.08203125, + "entropy_loss": -0.296875, + "epoch": 0.7922, + "grad_norm": 4.196017571672002, + "k1_kl": 0.08203125, + "k3_kl": 0.058837890625, + "kimi_kl": 0.146484375, + "learning_rate": 1.039e-07, + "loss": 0.0024, + "ppl": 0.1357421875, + "reward": 0.8397009372711182, + "reward_std": 0.053505487740039825, + "rewards/perpo_ocr_edit_distance_reward": 0.8397009968757629, + "step": 3961, + "temperature": 0.9 + }, + { + "advantages": 4.066740075359121e-05, + "completion_length": 641.0, + "delta_ref_entropy_loss": 0.05029296875, + "delta_ref_ppl": -0.08251953125, + "entropy_loss": -0.07470703125, + "epoch": 0.7924, + "grad_norm": 0.6739761951056613, + "k1_kl": 0.08251953125, + "k3_kl": 0.047607421875, + "kimi_kl": 0.1337890625, + "learning_rate": 1.038e-07, + "loss": 0.0019, + "ppl": 0.0322265625, + "reward": 0.9858148097991943, + "reward_std": 0.0005280193290673196, + "rewards/perpo_ocr_edit_distance_reward": 0.9858148097991943, + "step": 3962, + "temperature": 0.9 + }, + { + "advantages": -3.1113624572753906e-05, + "completion_length": 604.0, + "delta_ref_entropy_loss": 0.03857421875, + "delta_ref_ppl": -0.1123046875, + "entropy_loss": -0.2734375, + "epoch": 0.7926, + "grad_norm": 1.707769549728629, + "k1_kl": 0.1123046875, + "k3_kl": 0.0810546875, + "kimi_kl": 0.216796875, + "learning_rate": 1.037e-07, + "loss": 0.0033, + "ppl": 0.12451171875, + "reward": 0.8973232507705688, + "reward_std": 0.0023622543085366488, + "rewards/perpo_ocr_edit_distance_reward": 0.8973233699798584, + "step": 3963, + "temperature": 0.9 + }, + { + "advantages": 1.6433853033959167e-06, + "completion_length": 191.0, + "delta_ref_entropy_loss": 0.0078125, + "delta_ref_ppl": -0.2119140625, + "entropy_loss": -0.1796875, + "epoch": 0.7928, + "grad_norm": 1.8492993314617396, + "k1_kl": 0.2119140625, + "k3_kl": 0.1494140625, + "kimi_kl": 0.4609375, + "learning_rate": 1.0359999999999999e-07, + "loss": 0.006, + "ppl": 0.07080078125, + "reward": 0.979305624961853, + "reward_std": 0.010372960940003395, + "rewards/perpo_ocr_edit_distance_reward": 0.979305624961853, + "step": 3964, + "temperature": 0.9 + }, + { + "advantages": -6.0796741308877245e-05, + "completion_length": 555.0, + "delta_ref_entropy_loss": 0.01611328125, + "delta_ref_ppl": -0.034423828125, + "entropy_loss": -0.029296875, + "epoch": 0.793, + "grad_norm": 0.21697994493595665, + "k1_kl": 0.034423828125, + "k3_kl": 0.0224609375, + "kimi_kl": 0.06591796875, + "learning_rate": 1.035e-07, + "loss": 0.001, + "ppl": 0.00811767578125, + "reward": 0.9978387355804443, + "reward_std": 0.0003201905929017812, + "rewards/perpo_ocr_edit_distance_reward": 0.9978388547897339, + "step": 3965, + "temperature": 0.9 + }, + { + "advantages": -1.498631149843277e-06, + "completion_length": 495.0, + "delta_ref_entropy_loss": 0.048828125, + "delta_ref_ppl": -0.119140625, + "entropy_loss": -0.1845703125, + "epoch": 0.7932, + "grad_norm": 2.214752442182418, + "k1_kl": 0.1201171875, + "k3_kl": 0.080078125, + "kimi_kl": 0.216796875, + "learning_rate": 1.034e-07, + "loss": 0.0032, + "ppl": 0.0810546875, + "reward": 0.9222850799560547, + "reward_std": 0.03438277542591095, + "rewards/perpo_ocr_edit_distance_reward": 0.9222851395606995, + "step": 3966, + "temperature": 0.9 + }, + { + "advantages": -1.9890921976184472e-05, + "completion_length": 1649.0, + "delta_ref_entropy_loss": 0.006256103515625, + "delta_ref_ppl": -0.0306396484375, + "entropy_loss": -0.05908203125, + "epoch": 0.7934, + "grad_norm": 0.6857357902645663, + "k1_kl": 0.030517578125, + "k3_kl": 0.0247802734375, + "kimi_kl": 0.046875, + "learning_rate": 1.0329999999999999e-07, + "loss": 0.001, + "ppl": 0.0269775390625, + "reward": 0.9918632507324219, + "reward_std": 0.0016120593063533306, + "rewards/perpo_ocr_edit_distance_reward": 0.9918633103370667, + "step": 3967, + "temperature": 0.9 + }, + { + "advantages": -0.00021275453036651015, + "completion_length": 1111.0, + "delta_ref_entropy_loss": 0.039794921875, + "delta_ref_ppl": -0.052001953125, + "entropy_loss": -0.054931640625, + "epoch": 0.7936, + "grad_norm": 0.4905618645313888, + "k1_kl": 0.052001953125, + "k3_kl": 0.0272216796875, + "kimi_kl": 0.055419921875, + "learning_rate": 1.032e-07, + "loss": 0.0013, + "ppl": 0.018798828125, + "reward": 0.9872540831565857, + "reward_std": 0.00046029823715798557, + "rewards/perpo_ocr_edit_distance_reward": 0.9872542023658752, + "step": 3968, + "temperature": 0.9 + }, + { + "advantages": -4.863739377469756e-05, + "completion_length": 451.0, + "delta_ref_entropy_loss": 0.036376953125, + "delta_ref_ppl": -0.0654296875, + "entropy_loss": -0.029052734375, + "epoch": 0.7938, + "grad_norm": 0.2743816273779855, + "k1_kl": 0.0654296875, + "k3_kl": 0.041748046875, + "kimi_kl": 0.1396484375, + "learning_rate": 1.0309999999999999e-07, + "loss": 0.0017, + "ppl": 0.00665283203125, + "reward": 0.9988635182380676, + "reward_std": 0.00025010117678903043, + "rewards/perpo_ocr_edit_distance_reward": 0.9988635778427124, + "step": 3969, + "temperature": 0.9 + }, + { + "advantages": -5.892345143365674e-06, + "completion_length": 683.0, + "delta_ref_entropy_loss": -0.04296875, + "delta_ref_ppl": -0.0289306640625, + "entropy_loss": -0.119140625, + "epoch": 0.794, + "grad_norm": 1.2445322737321505, + "k1_kl": 0.029052734375, + "k3_kl": 0.0279541015625, + "kimi_kl": 0.0869140625, + "learning_rate": 1.03e-07, + "loss": 0.0011, + "ppl": 0.04052734375, + "reward": 0.8604704737663269, + "reward_std": 0.01434369944036007, + "rewards/perpo_ocr_edit_distance_reward": 0.8604705333709717, + "step": 3970, + "temperature": 0.9 + }, + { + "advantages": -5.790165573671402e-07, + "completion_length": 688.0, + "delta_ref_entropy_loss": -0.0302734375, + "delta_ref_ppl": -0.04736328125, + "entropy_loss": -0.166015625, + "epoch": 0.7942, + "grad_norm": 1.244037757104238, + "k1_kl": 0.047119140625, + "k3_kl": 0.0439453125, + "kimi_kl": 0.10888671875, + "learning_rate": 1.029e-07, + "loss": 0.0018, + "ppl": 0.06640625, + "reward": 0.9046944975852966, + "reward_std": 0.06070505827665329, + "rewards/perpo_ocr_edit_distance_reward": 0.9046946167945862, + "step": 3971, + "temperature": 0.9 + }, + { + "advantages": -7.872071364545263e-06, + "completion_length": 709.0, + "delta_ref_entropy_loss": -0.0169677734375, + "delta_ref_ppl": -0.0771484375, + "entropy_loss": -0.46875, + "epoch": 0.7944, + "grad_norm": 2.0031775566682843, + "k1_kl": 0.0771484375, + "k3_kl": 0.061767578125, + "kimi_kl": 0.11181640625, + "learning_rate": 1.028e-07, + "loss": 0.0025, + "ppl": 0.2470703125, + "reward": 0.8758599162101746, + "reward_std": 0.008545921184122562, + "rewards/perpo_ocr_edit_distance_reward": 0.8758599758148193, + "step": 3972, + "temperature": 0.9 + }, + { + "advantages": -3.656319313449785e-05, + "completion_length": 904.0, + "delta_ref_entropy_loss": 0.026611328125, + "delta_ref_ppl": -0.038818359375, + "entropy_loss": -0.041015625, + "epoch": 0.7946, + "grad_norm": 0.3365744254181941, + "k1_kl": 0.0390625, + "k3_kl": 0.0245361328125, + "kimi_kl": 0.07666015625, + "learning_rate": 1.027e-07, + "loss": 0.001, + "ppl": 0.013671875, + "reward": 0.997675359249115, + "reward_std": 0.0005984512972645462, + "rewards/perpo_ocr_edit_distance_reward": 0.9976754784584045, + "step": 3973, + "temperature": 0.9 + }, + { + "advantages": 4.32559427281376e-05, + "completion_length": 207.0, + "delta_ref_entropy_loss": 0.043212890625, + "delta_ref_ppl": -0.1357421875, + "entropy_loss": -0.049560546875, + "epoch": 0.7948, + "grad_norm": 0.6756551680923282, + "k1_kl": 0.1357421875, + "k3_kl": 0.09765625, + "kimi_kl": 0.333984375, + "learning_rate": 1.0259999999999999e-07, + "loss": 0.0039, + "ppl": 0.0157470703125, + "reward": 0.9922469854354858, + "reward_std": 0.0004903814406134188, + "rewards/perpo_ocr_edit_distance_reward": 0.9922469258308411, + "step": 3974, + "temperature": 0.9 + }, + { + "advantages": -7.985319825820625e-05, + "completion_length": 410.0, + "delta_ref_entropy_loss": 0.02685546875, + "delta_ref_ppl": -0.056396484375, + "entropy_loss": -0.037841796875, + "epoch": 0.795, + "grad_norm": 0.5468000104734411, + "k1_kl": 0.056396484375, + "k3_kl": 0.037353515625, + "kimi_kl": 0.11376953125, + "learning_rate": 1.0249999999999998e-07, + "loss": 0.0016, + "ppl": 0.0152587890625, + "reward": 0.9970001578330994, + "reward_std": 0.0005398291978053749, + "rewards/perpo_ocr_edit_distance_reward": 0.9970002174377441, + "step": 3975, + "temperature": 0.9 + }, + { + "advantages": -2.741813887041644e-06, + "completion_length": 276.0, + "delta_ref_entropy_loss": 0.060302734375, + "delta_ref_ppl": -0.14453125, + "entropy_loss": -0.1845703125, + "epoch": 0.7952, + "grad_norm": 2.6194207283977615, + "k1_kl": 0.1455078125, + "k3_kl": 0.103515625, + "kimi_kl": 0.2578125, + "learning_rate": 1.024e-07, + "loss": 0.0041, + "ppl": 0.07763671875, + "reward": 0.9661117792129517, + "reward_std": 0.02166796661913395, + "rewards/perpo_ocr_edit_distance_reward": 0.9661118388175964, + "step": 3976, + "temperature": 0.9 + }, + { + "advantages": -1.4475413934178505e-07, + "completion_length": 37.0, + "delta_ref_entropy_loss": -0.1474609375, + "delta_ref_ppl": -0.99609375, + "entropy_loss": -0.55078125, + "epoch": 0.7954, + "grad_norm": 15.576102655272392, + "k1_kl": 0.99609375, + "k3_kl": 0.84375, + "kimi_kl": 3.40625, + "learning_rate": 1.023e-07, + "loss": 0.0338, + "ppl": 0.2138671875, + "reward": 0.5561394691467285, + "reward_std": 0.08526161313056946, + "rewards/perpo_ocr_edit_distance_reward": 0.5561395287513733, + "step": 3977, + "temperature": 0.9 + }, + { + "advantages": -5.2758627134608105e-05, + "completion_length": 710.0, + "delta_ref_entropy_loss": 0.042724609375, + "delta_ref_ppl": -0.07275390625, + "entropy_loss": -0.04443359375, + "epoch": 0.7956, + "grad_norm": 0.37089087450832825, + "k1_kl": 0.07275390625, + "k3_kl": 0.047119140625, + "kimi_kl": 0.14453125, + "learning_rate": 1.0219999999999999e-07, + "loss": 0.0019, + "ppl": 0.01513671875, + "reward": 0.9960684776306152, + "reward_std": 0.0005453870981000364, + "rewards/perpo_ocr_edit_distance_reward": 0.9960685968399048, + "step": 3978, + "temperature": 0.9 + }, + { + "advantages": -9.5367431640625e-07, + "completion_length": 725.0, + "delta_ref_entropy_loss": -0.036376953125, + "delta_ref_ppl": -0.056884765625, + "entropy_loss": -0.1630859375, + "epoch": 0.7958, + "grad_norm": 1.6863841092946912, + "k1_kl": 0.056884765625, + "k3_kl": 0.0498046875, + "kimi_kl": 0.1279296875, + "learning_rate": 1.021e-07, + "loss": 0.002, + "ppl": 0.06005859375, + "reward": 0.6444198489189148, + "reward_std": 0.008733335882425308, + "rewards/perpo_ocr_edit_distance_reward": 0.6444198489189148, + "step": 3979, + "temperature": 0.9 + }, + { + "advantages": -3.593308792915195e-05, + "completion_length": 556.0, + "delta_ref_entropy_loss": 0.05859375, + "delta_ref_ppl": -0.0625, + "entropy_loss": -0.035888671875, + "epoch": 0.796, + "grad_norm": 0.4259101889558569, + "k1_kl": 0.0625, + "k3_kl": 0.036376953125, + "kimi_kl": 0.1064453125, + "learning_rate": 1.0199999999999999e-07, + "loss": 0.0015, + "ppl": 0.0076904296875, + "reward": 0.9992520809173584, + "reward_std": 0.0003739893436431885, + "rewards/perpo_ocr_edit_distance_reward": 0.9992520809173584, + "step": 3980, + "temperature": 0.9 + }, + { + "advantages": -1.1920928955078125e-07, + "completion_length": 671.0, + "delta_ref_entropy_loss": -0.0703125, + "delta_ref_ppl": -0.09765625, + "entropy_loss": -0.6015625, + "epoch": 0.7962, + "grad_norm": 2.1893180325234125, + "k1_kl": 0.09716796875, + "k3_kl": 0.08544921875, + "kimi_kl": 0.1640625, + "learning_rate": 1.019e-07, + "loss": 0.0034, + "ppl": 0.306640625, + "reward": 0.38021120429039, + "reward_std": 0.039099305868148804, + "rewards/perpo_ocr_edit_distance_reward": 0.3802112340927124, + "step": 3981, + "temperature": 0.9 + }, + { + "advantages": 3.295285569038242e-05, + "completion_length": 624.0, + "delta_ref_entropy_loss": 0.048828125, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.060302734375, + "epoch": 0.7964, + "grad_norm": 0.35762891049655854, + "k1_kl": 0.06591796875, + "k3_kl": 0.03759765625, + "kimi_kl": 0.08837890625, + "learning_rate": 1.018e-07, + "loss": 0.0015, + "ppl": 0.01953125, + "reward": 0.9780701398849487, + "reward_std": 0.0004169236053712666, + "rewards/perpo_ocr_edit_distance_reward": 0.9780701398849487, + "step": 3982, + "temperature": 0.9 + }, + { + "advantages": -4.13315647165291e-05, + "completion_length": 478.0, + "delta_ref_entropy_loss": 0.03955078125, + "delta_ref_ppl": -0.04931640625, + "entropy_loss": -0.0439453125, + "epoch": 0.7966, + "grad_norm": 0.49099110679501107, + "k1_kl": 0.04931640625, + "k3_kl": 0.0296630859375, + "kimi_kl": 0.0908203125, + "learning_rate": 1.0169999999999999e-07, + "loss": 0.0012, + "ppl": 0.0174560546875, + "reward": 0.9934030175209045, + "reward_std": 0.0005179900326766074, + "rewards/perpo_ocr_edit_distance_reward": 0.9934031367301941, + "step": 3983, + "temperature": 0.9 + }, + { + "advantages": -4.1578499804018065e-05, + "completion_length": 185.0, + "delta_ref_entropy_loss": 0.09619140625, + "delta_ref_ppl": -0.28125, + "entropy_loss": -0.11181640625, + "epoch": 0.7968, + "grad_norm": 1.123139765970074, + "k1_kl": 0.279296875, + "k3_kl": 0.2138671875, + "kimi_kl": 0.859375, + "learning_rate": 1.016e-07, + "loss": 0.0086, + "ppl": 0.04541015625, + "reward": 0.9957537055015564, + "reward_std": 0.00174236751627177, + "rewards/perpo_ocr_edit_distance_reward": 0.9957537651062012, + "step": 3984, + "temperature": 0.9 + }, + { + "advantages": -3.474099457889679e-06, + "completion_length": 918.0, + "delta_ref_entropy_loss": 0.01177978515625, + "delta_ref_ppl": -0.050048828125, + "entropy_loss": -0.09619140625, + "epoch": 0.797, + "grad_norm": 0.5741022824685295, + "k1_kl": 0.050048828125, + "k3_kl": 0.033203125, + "kimi_kl": 0.06689453125, + "learning_rate": 1.015e-07, + "loss": 0.0013, + "ppl": 0.032470703125, + "reward": 0.9848808646202087, + "reward_std": 0.0023486881982535124, + "rewards/perpo_ocr_edit_distance_reward": 0.9848809242248535, + "step": 3985, + "temperature": 0.9 + }, + { + "advantages": -2.8337752155493945e-05, + "completion_length": 278.0, + "delta_ref_entropy_loss": 0.0233154296875, + "delta_ref_ppl": -0.103515625, + "entropy_loss": -0.046875, + "epoch": 0.7972, + "grad_norm": 0.7563183148829766, + "k1_kl": 0.103515625, + "k3_kl": 0.07763671875, + "kimi_kl": 0.310546875, + "learning_rate": 1.014e-07, + "loss": 0.0031, + "ppl": 0.017578125, + "reward": 0.9967980980873108, + "reward_std": 0.0008009034208953381, + "rewards/perpo_ocr_edit_distance_reward": 0.9967982172966003, + "step": 3986, + "temperature": 0.9 + }, + { + "advantages": -2.0061221221112646e-05, + "completion_length": 196.0, + "delta_ref_entropy_loss": 0.0220947265625, + "delta_ref_ppl": -0.12255859375, + "entropy_loss": -0.0771484375, + "epoch": 0.7974, + "grad_norm": 1.7350051272710496, + "k1_kl": 0.12158203125, + "k3_kl": 0.1162109375, + "kimi_kl": 0.359375, + "learning_rate": 1.013e-07, + "loss": 0.0047, + "ppl": 0.0294189453125, + "reward": 0.9754712581634521, + "reward_std": 0.002022920409217477, + "rewards/perpo_ocr_edit_distance_reward": 0.9754713177680969, + "step": 3987, + "temperature": 0.9 + }, + { + "advantages": 2.111707544827368e-06, + "completion_length": 326.0, + "delta_ref_entropy_loss": 0.07275390625, + "delta_ref_ppl": -0.16796875, + "entropy_loss": -0.373046875, + "epoch": 0.7976, + "grad_norm": 3.0155114255658817, + "k1_kl": 0.16796875, + "k3_kl": 0.126953125, + "kimi_kl": 0.37109375, + "learning_rate": 1.0119999999999999e-07, + "loss": 0.0051, + "ppl": 0.15625, + "reward": 0.6311144232749939, + "reward_std": 0.007945111952722073, + "rewards/perpo_ocr_edit_distance_reward": 0.6311144232749939, + "step": 3988, + "temperature": 0.9 + }, + { + "advantages": -3.797667523031123e-05, + "completion_length": 399.0, + "delta_ref_entropy_loss": 0.08544921875, + "delta_ref_ppl": -0.150390625, + "entropy_loss": -0.236328125, + "epoch": 0.7978, + "grad_norm": 1.9773049138532495, + "k1_kl": 0.150390625, + "k3_kl": 0.09375, + "kimi_kl": 0.2392578125, + "learning_rate": 1.0109999999999999e-07, + "loss": 0.0038, + "ppl": 0.10693359375, + "reward": 0.7548071146011353, + "reward_std": 0.0016923160292208195, + "rewards/perpo_ocr_edit_distance_reward": 0.7548072338104248, + "step": 3989, + "temperature": 0.9 + }, + { + "advantages": -6.846019459771924e-06, + "completion_length": 581.0, + "delta_ref_entropy_loss": 0.0184326171875, + "delta_ref_ppl": -0.04833984375, + "entropy_loss": -0.041015625, + "epoch": 0.798, + "grad_norm": 0.6333343462986183, + "k1_kl": 0.04833984375, + "k3_kl": 0.03515625, + "kimi_kl": 0.11279296875, + "learning_rate": 1.01e-07, + "loss": 0.0014, + "ppl": 0.01611328125, + "reward": 0.9972527623176575, + "reward_std": 0.0011432729661464691, + "rewards/perpo_ocr_edit_distance_reward": 0.9972527623176575, + "step": 3990, + "temperature": 0.9 + }, + { + "advantages": -9.81720495474292e-06, + "completion_length": 324.0, + "delta_ref_entropy_loss": 0.0281982421875, + "delta_ref_ppl": -0.099609375, + "entropy_loss": -0.0712890625, + "epoch": 0.7982, + "grad_norm": 0.9586340020228062, + "k1_kl": 0.09912109375, + "k3_kl": 0.07861328125, + "kimi_kl": 0.330078125, + "learning_rate": 1.009e-07, + "loss": 0.0032, + "ppl": 0.028076171875, + "reward": 0.9946879148483276, + "reward_std": 0.0007672683568671346, + "rewards/perpo_ocr_edit_distance_reward": 0.9946879148483276, + "step": 3991, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 269.0, + "delta_ref_entropy_loss": 0.0361328125, + "delta_ref_ppl": -0.1513671875, + "entropy_loss": -0.053466796875, + "epoch": 0.7984, + "grad_norm": 0.7944570284531388, + "k1_kl": 0.1513671875, + "k3_kl": 0.12109375, + "kimi_kl": 0.5625, + "learning_rate": 1.008e-07, + "loss": 0.0048, + "ppl": 0.01708984375, + "reward": 0.9888817071914673, + "reward_std": 0.0010069733252748847, + "rewards/perpo_ocr_edit_distance_reward": 0.9888817071914673, + "step": 3992, + "temperature": 0.9 + }, + { + "advantages": -7.152557941481064e-07, + "completion_length": 1075.0, + "delta_ref_entropy_loss": 0.0439453125, + "delta_ref_ppl": -0.062255859375, + "entropy_loss": -0.061767578125, + "epoch": 0.7986, + "grad_norm": 1.13580233425601, + "k1_kl": 0.0625, + "k3_kl": 0.033935546875, + "kimi_kl": 0.08203125, + "learning_rate": 1.007e-07, + "loss": 0.0014, + "ppl": 0.0255126953125, + "reward": 0.7936541438102722, + "reward_std": 0.12224607169628143, + "rewards/perpo_ocr_edit_distance_reward": 0.793654203414917, + "step": 3993, + "temperature": 0.9 + }, + { + "advantages": -4.192761116428301e-05, + "completion_length": 737.0, + "delta_ref_entropy_loss": 0.0224609375, + "delta_ref_ppl": -0.047119140625, + "entropy_loss": -0.06396484375, + "epoch": 0.7988, + "grad_norm": 0.48509478405230305, + "k1_kl": 0.047119140625, + "k3_kl": 0.0308837890625, + "kimi_kl": 0.080078125, + "learning_rate": 1.0059999999999999e-07, + "loss": 0.0013, + "ppl": 0.02880859375, + "reward": 0.990677535533905, + "reward_std": 0.0005093907820992172, + "rewards/perpo_ocr_edit_distance_reward": 0.9906775951385498, + "step": 3994, + "temperature": 0.9 + }, + { + "advantages": -7.322856845348724e-07, + "completion_length": 738.0, + "delta_ref_entropy_loss": 0.025390625, + "delta_ref_ppl": -0.07470703125, + "entropy_loss": -0.1142578125, + "epoch": 0.799, + "grad_norm": 1.8815273802145318, + "k1_kl": 0.07470703125, + "k3_kl": 0.056396484375, + "kimi_kl": 0.1533203125, + "learning_rate": 1.005e-07, + "loss": 0.0023, + "ppl": 0.051025390625, + "reward": 0.9065887928009033, + "reward_std": 0.09204664826393127, + "rewards/perpo_ocr_edit_distance_reward": 0.9065888524055481, + "step": 3995, + "temperature": 0.9 + }, + { + "advantages": -2.358640995225869e-05, + "completion_length": 400.0, + "delta_ref_entropy_loss": 0.0159912109375, + "delta_ref_ppl": -0.038330078125, + "entropy_loss": -0.037353515625, + "epoch": 0.7992, + "grad_norm": 0.2796977256398389, + "k1_kl": 0.0380859375, + "k3_kl": 0.032958984375, + "kimi_kl": 0.07861328125, + "learning_rate": 1.004e-07, + "loss": 0.0013, + "ppl": 0.00958251953125, + "reward": 0.9924004673957825, + "reward_std": 0.0006214614841155708, + "rewards/perpo_ocr_edit_distance_reward": 0.9924004673957825, + "step": 3996, + "temperature": 0.9 + }, + { + "advantages": -4.0190563595388085e-05, + "completion_length": 460.0, + "delta_ref_entropy_loss": 0.01806640625, + "delta_ref_ppl": -0.053466796875, + "entropy_loss": -0.03662109375, + "epoch": 0.7994, + "grad_norm": 0.44045377934726276, + "k1_kl": 0.053466796875, + "k3_kl": 0.05078125, + "kimi_kl": 0.1689453125, + "learning_rate": 1.0029999999999999e-07, + "loss": 0.0021, + "ppl": 0.01348876953125, + "reward": 0.9975665211677551, + "reward_std": 0.0005356063484214246, + "rewards/perpo_ocr_edit_distance_reward": 0.9975665807723999, + "step": 3997, + "temperature": 0.9 + }, + { + "advantages": -1.004764044409967e-06, + "completion_length": 903.0, + "delta_ref_entropy_loss": 0.056640625, + "delta_ref_ppl": -0.07373046875, + "entropy_loss": -0.11083984375, + "epoch": 0.7996, + "grad_norm": 0.9920816780840559, + "k1_kl": 0.07373046875, + "k3_kl": 0.042724609375, + "kimi_kl": 0.1103515625, + "learning_rate": 1.002e-07, + "loss": 0.0017, + "ppl": 0.052978515625, + "reward": 0.9584819674491882, + "reward_std": 0.017090268433094025, + "rewards/perpo_ocr_edit_distance_reward": 0.9584819674491882, + "step": 3998, + "temperature": 0.9 + }, + { + "advantages": -3.7670135498046875e-05, + "completion_length": 485.0, + "delta_ref_entropy_loss": 0.0791015625, + "delta_ref_ppl": -0.11376953125, + "entropy_loss": -0.1171875, + "epoch": 0.7998, + "grad_norm": 4.11152803804361, + "k1_kl": 0.11376953125, + "k3_kl": 0.07568359375, + "kimi_kl": 0.185546875, + "learning_rate": 1.0009999999999999e-07, + "loss": 0.0031, + "ppl": 0.05078125, + "reward": 0.8516852259635925, + "reward_std": 0.0014817784540355206, + "rewards/perpo_ocr_edit_distance_reward": 0.8516852855682373, + "step": 3999, + "temperature": 0.9 + }, + { + "advantages": -4.996572533855215e-05, + "completion_length": 318.0, + "delta_ref_entropy_loss": 0.051513671875, + "delta_ref_ppl": -0.109375, + "entropy_loss": -0.09619140625, + "epoch": 0.8, + "grad_norm": 0.7902040812548545, + "k1_kl": 0.109375, + "k3_kl": 0.07470703125, + "kimi_kl": 0.26171875, + "learning_rate": 1e-07, + "loss": 0.003, + "ppl": 0.040283203125, + "reward": 0.9749824404716492, + "reward_std": 0.0010930384742096066, + "rewards/perpo_ocr_edit_distance_reward": 0.9749824404716492, + "step": 4000, + "temperature": 0.9 + }, + { + "advantages": -3.2901763916015625e-05, + "completion_length": 436.0, + "delta_ref_entropy_loss": 0.026123046875, + "delta_ref_ppl": -0.09326171875, + "entropy_loss": -0.033447265625, + "epoch": 0.8002, + "grad_norm": 0.2047541805133316, + "k1_kl": 0.09326171875, + "k3_kl": 0.07373046875, + "kimi_kl": 0.337890625, + "learning_rate": 9.99e-08, + "loss": 0.003, + "ppl": 0.01019287109375, + "reward": 0.9951854348182678, + "reward_std": 0.00041768289520405233, + "rewards/perpo_ocr_edit_distance_reward": 0.9951854944229126, + "step": 4001, + "temperature": 0.9 + }, + { + "advantages": -7.130418816814199e-05, + "completion_length": 574.0, + "delta_ref_entropy_loss": 0.032958984375, + "delta_ref_ppl": -0.053955078125, + "entropy_loss": -0.033447265625, + "epoch": 0.8004, + "grad_norm": 0.35456029045791226, + "k1_kl": 0.0537109375, + "k3_kl": 0.0341796875, + "kimi_kl": 0.107421875, + "learning_rate": 9.98e-08, + "loss": 0.0014, + "ppl": 0.00823974609375, + "reward": 0.9971428513526917, + "reward_std": 0.000497207511216402, + "rewards/perpo_ocr_edit_distance_reward": 0.9971429109573364, + "step": 4002, + "temperature": 0.9 + }, + { + "advantages": -5.1259998144814745e-05, + "completion_length": 444.0, + "delta_ref_entropy_loss": 0.01953125, + "delta_ref_ppl": -0.08349609375, + "entropy_loss": -0.0703125, + "epoch": 0.8006, + "grad_norm": 1.3000325484386073, + "k1_kl": 0.0830078125, + "k3_kl": 0.06298828125, + "kimi_kl": 0.2490234375, + "learning_rate": 9.969999999999999e-08, + "loss": 0.0026, + "ppl": 0.032470703125, + "reward": 0.9951003789901733, + "reward_std": 0.0020588126499205828, + "rewards/perpo_ocr_edit_distance_reward": 0.9951004981994629, + "step": 4003, + "temperature": 0.9 + }, + { + "advantages": -8.605208131484687e-05, + "completion_length": 463.0, + "delta_ref_entropy_loss": 0.061279296875, + "delta_ref_ppl": -0.07470703125, + "entropy_loss": -0.126953125, + "epoch": 0.8008, + "grad_norm": 0.8940581523312892, + "k1_kl": 0.07421875, + "k3_kl": 0.04736328125, + "kimi_kl": 0.1201171875, + "learning_rate": 9.959999999999999e-08, + "loss": 0.002, + "ppl": 0.051025390625, + "reward": 0.9661654829978943, + "reward_std": 0.001186311594210565, + "rewards/perpo_ocr_edit_distance_reward": 0.9661655426025391, + "step": 4004, + "temperature": 0.9 + }, + { + "advantages": -3.226740227546543e-05, + "completion_length": 666.0, + "delta_ref_entropy_loss": 0.042236328125, + "delta_ref_ppl": -0.08544921875, + "entropy_loss": -0.064453125, + "epoch": 0.801, + "grad_norm": 0.996108892207486, + "k1_kl": 0.08544921875, + "k3_kl": 0.05810546875, + "kimi_kl": 0.1689453125, + "learning_rate": 9.95e-08, + "loss": 0.0024, + "ppl": 0.0272216796875, + "reward": 0.9938552975654602, + "reward_std": 0.0009553614072501659, + "rewards/perpo_ocr_edit_distance_reward": 0.993855357170105, + "step": 4005, + "temperature": 0.9 + }, + { + "advantages": -7.144042683648877e-06, + "completion_length": 281.0, + "delta_ref_entropy_loss": 0.09423828125, + "delta_ref_ppl": -0.185546875, + "entropy_loss": -0.142578125, + "epoch": 0.8012, + "grad_norm": 1.4580680311358616, + "k1_kl": 0.185546875, + "k3_kl": 0.1337890625, + "kimi_kl": 0.4921875, + "learning_rate": 9.94e-08, + "loss": 0.0054, + "ppl": 0.06201171875, + "reward": 0.9707186818122864, + "reward_std": 0.0022910451516509056, + "rewards/perpo_ocr_edit_distance_reward": 0.9707187414169312, + "step": 4006, + "temperature": 0.9 + }, + { + "advantages": -6.80046432535164e-05, + "completion_length": 189.0, + "delta_ref_entropy_loss": 0.052001953125, + "delta_ref_ppl": -0.220703125, + "entropy_loss": -0.08935546875, + "epoch": 0.8014, + "grad_norm": 0.6665023599783305, + "k1_kl": 0.220703125, + "k3_kl": 0.1748046875, + "kimi_kl": 0.88671875, + "learning_rate": 9.93e-08, + "loss": 0.007, + "ppl": 0.029541015625, + "reward": 0.9398330450057983, + "reward_std": 0.0014028827426955104, + "rewards/perpo_ocr_edit_distance_reward": 0.9398331642150879, + "step": 4007, + "temperature": 0.9 + }, + { + "advantages": 1.7029899268550253e-08, + "completion_length": 400.0, + "delta_ref_entropy_loss": 0.043212890625, + "delta_ref_ppl": -0.09228515625, + "entropy_loss": -0.0732421875, + "epoch": 0.8016, + "grad_norm": 0.6750467020538672, + "k1_kl": 0.09228515625, + "k3_kl": 0.060546875, + "kimi_kl": 0.1552734375, + "learning_rate": 9.919999999999999e-08, + "loss": 0.0024, + "ppl": 0.0245361328125, + "reward": 0.9943932294845581, + "reward_std": 0.0014374286402016878, + "rewards/perpo_ocr_edit_distance_reward": 0.9943932294845581, + "step": 4008, + "temperature": 0.9 + }, + { + "advantages": -2.0895686247968115e-05, + "completion_length": 796.0, + "delta_ref_entropy_loss": -0.01202392578125, + "delta_ref_ppl": -0.0927734375, + "entropy_loss": -0.2060546875, + "epoch": 0.8018, + "grad_norm": 0.9352956997092614, + "k1_kl": 0.0927734375, + "k3_kl": 0.06396484375, + "kimi_kl": 0.25390625, + "learning_rate": 9.91e-08, + "loss": 0.0026, + "ppl": 0.0849609375, + "reward": 0.9296799898147583, + "reward_std": 0.003973884042352438, + "rewards/perpo_ocr_edit_distance_reward": 0.9296801686286926, + "step": 4009, + "temperature": 0.9 + }, + { + "advantages": 1.406669707648689e-05, + "completion_length": 821.0, + "delta_ref_entropy_loss": 0.01470947265625, + "delta_ref_ppl": -0.041259765625, + "entropy_loss": -0.06298828125, + "epoch": 0.802, + "grad_norm": 0.4837630657724355, + "k1_kl": 0.041259765625, + "k3_kl": 0.027587890625, + "kimi_kl": 0.068359375, + "learning_rate": 9.9e-08, + "loss": 0.0011, + "ppl": 0.0228271484375, + "reward": 0.7447149157524109, + "reward_std": 0.00110915070399642, + "rewards/perpo_ocr_edit_distance_reward": 0.7447149157524109, + "step": 4010, + "temperature": 0.9 + }, + { + "advantages": -2.8661321266554296e-05, + "completion_length": 406.0, + "delta_ref_entropy_loss": 0.037109375, + "delta_ref_ppl": -0.08154296875, + "entropy_loss": -0.054443359375, + "epoch": 0.8022, + "grad_norm": 0.8581411530572403, + "k1_kl": 0.08154296875, + "k3_kl": 0.056396484375, + "kimi_kl": 0.19921875, + "learning_rate": 9.889999999999999e-08, + "loss": 0.0023, + "ppl": 0.013427734375, + "reward": 0.9977567791938782, + "reward_std": 0.001088690827600658, + "rewards/perpo_ocr_edit_distance_reward": 0.9977567791938782, + "step": 4011, + "temperature": 0.9 + }, + { + "advantages": -2.060617771348916e-05, + "completion_length": 374.0, + "delta_ref_entropy_loss": 0.036865234375, + "delta_ref_ppl": -0.0966796875, + "entropy_loss": -0.0849609375, + "epoch": 0.8024, + "grad_norm": 1.202002795722016, + "k1_kl": 0.0966796875, + "k3_kl": 0.0693359375, + "kimi_kl": 0.2158203125, + "learning_rate": 9.88e-08, + "loss": 0.0028, + "ppl": 0.03076171875, + "reward": 0.9854614734649658, + "reward_std": 0.0011386320693418384, + "rewards/perpo_ocr_edit_distance_reward": 0.9854614734649658, + "step": 4012, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 754.0, + "delta_ref_entropy_loss": 0.0230712890625, + "delta_ref_ppl": -0.0654296875, + "entropy_loss": -0.06982421875, + "epoch": 0.8026, + "grad_norm": 0.4148994426737791, + "k1_kl": 0.0654296875, + "k3_kl": 0.044189453125, + "kimi_kl": 0.138671875, + "learning_rate": 9.869999999999999e-08, + "loss": 0.0018, + "ppl": 0.0260009765625, + "reward": 0.9920275807380676, + "reward_std": 0.0006713049951940775, + "rewards/perpo_ocr_edit_distance_reward": 0.9920275807380676, + "step": 4013, + "temperature": 0.9 + }, + { + "advantages": 1.4850072147964966e-05, + "completion_length": 567.0, + "delta_ref_entropy_loss": 0.04931640625, + "delta_ref_ppl": -0.12060546875, + "entropy_loss": -0.220703125, + "epoch": 0.8028, + "grad_norm": 1.4451700838231174, + "k1_kl": 0.12060546875, + "k3_kl": 0.0791015625, + "kimi_kl": 0.181640625, + "learning_rate": 9.859999999999998e-08, + "loss": 0.0031, + "ppl": 0.10693359375, + "reward": 0.9110758304595947, + "reward_std": 0.0016197735676541924, + "rewards/perpo_ocr_edit_distance_reward": 0.9110758900642395, + "step": 4014, + "temperature": 0.9 + }, + { + "advantages": -1.120567412726814e-05, + "completion_length": 643.0, + "delta_ref_entropy_loss": 0.015869140625, + "delta_ref_ppl": -0.05859375, + "entropy_loss": -0.0732421875, + "epoch": 0.803, + "grad_norm": 0.8217396991470391, + "k1_kl": 0.05859375, + "k3_kl": 0.040771484375, + "kimi_kl": 0.10498046875, + "learning_rate": 9.85e-08, + "loss": 0.0016, + "ppl": 0.03271484375, + "reward": 0.9710327982902527, + "reward_std": 0.0014201884623616934, + "rewards/perpo_ocr_edit_distance_reward": 0.9710328578948975, + "step": 4015, + "temperature": 0.9 + }, + { + "advantages": -0.00014372383884619921, + "completion_length": 958.0, + "delta_ref_entropy_loss": 0.0269775390625, + "delta_ref_ppl": -0.033935546875, + "entropy_loss": -0.0322265625, + "epoch": 0.8032, + "grad_norm": 0.38723523294126316, + "k1_kl": 0.033935546875, + "k3_kl": 0.0185546875, + "kimi_kl": 0.048583984375, + "learning_rate": 9.84e-08, + "loss": 0.0009, + "ppl": 0.0078125, + "reward": 0.9971334934234619, + "reward_std": 0.0001962247770279646, + "rewards/perpo_ocr_edit_distance_reward": 0.9971335530281067, + "step": 4016, + "temperature": 0.9 + }, + { + "advantages": -3.4144948131142883e-06, + "completion_length": 477.0, + "delta_ref_entropy_loss": -0.07958984375, + "delta_ref_ppl": -0.09912109375, + "entropy_loss": -0.298828125, + "epoch": 0.8034, + "grad_norm": 1.9135931101571984, + "k1_kl": 0.09912109375, + "k3_kl": 0.09375, + "kimi_kl": 0.2578125, + "learning_rate": 9.83e-08, + "loss": 0.0038, + "ppl": 0.1123046875, + "reward": 0.847455620765686, + "reward_std": 0.004885393660515547, + "rewards/perpo_ocr_edit_distance_reward": 0.847455620765686, + "step": 4017, + "temperature": 0.9 + }, + { + "advantages": 2.2138868871479644e-07, + "completion_length": 27.0, + "delta_ref_entropy_loss": -0.380859375, + "delta_ref_ppl": -1.234375, + "entropy_loss": -1.421875, + "epoch": 0.8036, + "grad_norm": 9.712021700943179, + "k1_kl": 1.2265625, + "k3_kl": 1.1015625, + "kimi_kl": 4.5625, + "learning_rate": 9.819999999999999e-08, + "loss": 0.044, + "ppl": 0.5859375, + "reward": 0.4010152220726013, + "reward_std": 0.03910057619214058, + "rewards/perpo_ocr_edit_distance_reward": 0.4010152220726013, + "step": 4018, + "temperature": 0.9 + }, + { + "advantages": -1.532690930616809e-06, + "completion_length": 467.0, + "delta_ref_entropy_loss": -0.003631591796875, + "delta_ref_ppl": -0.10205078125, + "entropy_loss": -0.26953125, + "epoch": 0.8038, + "grad_norm": 2.2212864818307207, + "k1_kl": 0.10205078125, + "k3_kl": 0.08349609375, + "kimi_kl": 0.251953125, + "learning_rate": 9.81e-08, + "loss": 0.0033, + "ppl": 0.1318359375, + "reward": 0.8964753150939941, + "reward_std": 0.038952797651290894, + "rewards/perpo_ocr_edit_distance_reward": 0.8964753746986389, + "step": 4019, + "temperature": 0.9 + }, + { + "advantages": -7.356916285061743e-06, + "completion_length": 960.0, + "delta_ref_entropy_loss": 0.10302734375, + "delta_ref_ppl": -0.109375, + "entropy_loss": -0.20703125, + "epoch": 0.804, + "grad_norm": 1.806449785277149, + "k1_kl": 0.109375, + "k3_kl": 0.072265625, + "kimi_kl": 0.166015625, + "learning_rate": 9.8e-08, + "loss": 0.0029, + "ppl": 0.10205078125, + "reward": 0.8260135054588318, + "reward_std": 0.011481484398245811, + "rewards/perpo_ocr_edit_distance_reward": 0.8260136246681213, + "step": 4020, + "temperature": 0.9 + }, + { + "advantages": -6.437302090489538e-06, + "completion_length": 1097.0, + "delta_ref_entropy_loss": 0.03955078125, + "delta_ref_ppl": -0.080078125, + "entropy_loss": -0.1474609375, + "epoch": 0.8042, + "grad_norm": 1.452256634644336, + "k1_kl": 0.080078125, + "k3_kl": 0.05419921875, + "kimi_kl": 0.142578125, + "learning_rate": 9.79e-08, + "loss": 0.0022, + "ppl": 0.06494140625, + "reward": 0.9625033140182495, + "reward_std": 0.002551059937104583, + "rewards/perpo_ocr_edit_distance_reward": 0.9625033140182495, + "step": 4021, + "temperature": 0.9 + }, + { + "advantages": -4.5384680561255664e-05, + "completion_length": 708.0, + "delta_ref_entropy_loss": 0.0294189453125, + "delta_ref_ppl": -0.0537109375, + "entropy_loss": -0.1015625, + "epoch": 0.8044, + "grad_norm": 0.8719823255400148, + "k1_kl": 0.0537109375, + "k3_kl": 0.036376953125, + "kimi_kl": 0.09228515625, + "learning_rate": 9.779999999999999e-08, + "loss": 0.0015, + "ppl": 0.041748046875, + "reward": 0.9942145347595215, + "reward_std": 0.001213253359310329, + "rewards/perpo_ocr_edit_distance_reward": 0.9942147135734558, + "step": 4022, + "temperature": 0.9 + }, + { + "advantages": -6.508827209472656e-05, + "completion_length": 694.0, + "delta_ref_entropy_loss": 0.033203125, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.0595703125, + "epoch": 0.8046, + "grad_norm": 0.28021482562159505, + "k1_kl": 0.06689453125, + "k3_kl": 0.04638671875, + "kimi_kl": 0.181640625, + "learning_rate": 9.77e-08, + "loss": 0.0019, + "ppl": 0.017822265625, + "reward": 0.7271080017089844, + "reward_std": 0.00029259081929922104, + "rewards/perpo_ocr_edit_distance_reward": 0.7271080613136292, + "step": 4023, + "temperature": 0.9 + }, + { + "advantages": -6.641660547757056e-06, + "completion_length": 502.0, + "delta_ref_entropy_loss": 0.07568359375, + "delta_ref_ppl": -0.1318359375, + "entropy_loss": -0.0908203125, + "epoch": 0.8048, + "grad_norm": 0.846771009182403, + "k1_kl": 0.1318359375, + "k3_kl": 0.09130859375, + "kimi_kl": 0.431640625, + "learning_rate": 9.76e-08, + "loss": 0.0037, + "ppl": 0.034912109375, + "reward": 0.8601553440093994, + "reward_std": 0.002461014548316598, + "rewards/perpo_ocr_edit_distance_reward": 0.8601554036140442, + "step": 4024, + "temperature": 0.9 + }, + { + "advantages": -6.141833000583574e-05, + "completion_length": 428.0, + "delta_ref_entropy_loss": 0.054443359375, + "delta_ref_ppl": -0.09130859375, + "entropy_loss": -0.107421875, + "epoch": 0.805, + "grad_norm": 0.8037234569723155, + "k1_kl": 0.09130859375, + "k3_kl": 0.06640625, + "kimi_kl": 0.185546875, + "learning_rate": 9.749999999999999e-08, + "loss": 0.0027, + "ppl": 0.0390625, + "reward": 0.9694543480873108, + "reward_std": 0.000731860229279846, + "rewards/perpo_ocr_edit_distance_reward": 0.9694544076919556, + "step": 4025, + "temperature": 0.9 + }, + { + "advantages": -1.3321639016794506e-05, + "completion_length": 448.0, + "delta_ref_entropy_loss": 0.025390625, + "delta_ref_ppl": -0.06201171875, + "entropy_loss": -0.06005859375, + "epoch": 0.8052, + "grad_norm": 0.8588985530707914, + "k1_kl": 0.06201171875, + "k3_kl": 0.045654296875, + "kimi_kl": 0.158203125, + "learning_rate": 9.74e-08, + "loss": 0.0018, + "ppl": 0.0238037109375, + "reward": 0.9907398223876953, + "reward_std": 0.0024567092768847942, + "rewards/perpo_ocr_edit_distance_reward": 0.9907398819923401, + "step": 4026, + "temperature": 0.9 + }, + { + "advantages": -6.920951273059472e-05, + "completion_length": 452.0, + "delta_ref_entropy_loss": 0.0255126953125, + "delta_ref_ppl": -0.07421875, + "entropy_loss": -0.04052734375, + "epoch": 0.8054, + "grad_norm": 0.7359834945760166, + "k1_kl": 0.07421875, + "k3_kl": 0.0537109375, + "kimi_kl": 0.205078125, + "learning_rate": 9.729999999999999e-08, + "loss": 0.0022, + "ppl": 0.01287841796875, + "reward": 0.9994807839393616, + "reward_std": 0.0007615455542691052, + "rewards/perpo_ocr_edit_distance_reward": 0.9994808435440063, + "step": 4027, + "temperature": 0.9 + }, + { + "advantages": -6.982258469179214e-07, + "completion_length": 139.0, + "delta_ref_entropy_loss": -0.58984375, + "delta_ref_ppl": -0.5625, + "entropy_loss": -1.453125, + "epoch": 0.8056, + "grad_norm": 6.178212483898189, + "k1_kl": 0.56640625, + "k3_kl": 0.5859375, + "kimi_kl": 1.9375, + "learning_rate": 9.72e-08, + "loss": 0.0235, + "ppl": 0.609375, + "reward": 0.3372752070426941, + "reward_std": 0.024564143270254135, + "rewards/perpo_ocr_edit_distance_reward": 0.3372752368450165, + "step": 4028, + "temperature": 0.9 + }, + { + "advantages": -1.7029898913278885e-07, + "completion_length": 1709.0, + "delta_ref_entropy_loss": -0.11279296875, + "delta_ref_ppl": -0.02001953125, + "entropy_loss": -0.40625, + "epoch": 0.8058, + "grad_norm": 3.482355399229724, + "k1_kl": 0.02001953125, + "k3_kl": 0.037353515625, + "kimi_kl": 0.0810546875, + "learning_rate": 9.71e-08, + "loss": 0.0015, + "ppl": 0.24609375, + "reward": 0.8662850260734558, + "reward_std": 0.14219887554645538, + "rewards/perpo_ocr_edit_distance_reward": 0.8662850856781006, + "step": 4029, + "temperature": 0.9 + }, + { + "advantages": -1.4447740795731079e-05, + "completion_length": 710.0, + "delta_ref_entropy_loss": 0.01202392578125, + "delta_ref_ppl": -0.042724609375, + "entropy_loss": -0.058349609375, + "epoch": 0.806, + "grad_norm": 0.4574216195841783, + "k1_kl": 0.042724609375, + "k3_kl": 0.0267333984375, + "kimi_kl": 0.080078125, + "learning_rate": 9.7e-08, + "loss": 0.0011, + "ppl": 0.0201416015625, + "reward": 0.9967703819274902, + "reward_std": 0.0010784976184368134, + "rewards/perpo_ocr_edit_distance_reward": 0.9967703819274902, + "step": 4030, + "temperature": 0.9 + }, + { + "advantages": -5.1379207434365526e-05, + "completion_length": 451.0, + "delta_ref_entropy_loss": 0.0693359375, + "delta_ref_ppl": -0.10546875, + "entropy_loss": -0.06982421875, + "epoch": 0.8062, + "grad_norm": 0.6386844401242122, + "k1_kl": 0.10546875, + "k3_kl": 0.07177734375, + "kimi_kl": 0.2578125, + "learning_rate": 9.69e-08, + "loss": 0.0029, + "ppl": 0.021240234375, + "reward": 0.9932834506034851, + "reward_std": 0.0007290957728400826, + "rewards/perpo_ocr_edit_distance_reward": 0.9932835698127747, + "step": 4031, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 194.0, + "delta_ref_entropy_loss": -0.0225830078125, + "delta_ref_ppl": -0.19140625, + "entropy_loss": -0.26953125, + "epoch": 0.8064, + "grad_norm": 2.7894720750303654, + "k1_kl": 0.19140625, + "k3_kl": 0.1748046875, + "kimi_kl": 0.61328125, + "learning_rate": 9.679999999999999e-08, + "loss": 0.007, + "ppl": 0.1103515625, + "reward": 0.7516301274299622, + "reward_std": 0.037331197410821915, + "rewards/perpo_ocr_edit_distance_reward": 0.7516301274299622, + "step": 4032, + "temperature": 0.9 + }, + { + "advantages": -9.472030069446191e-05, + "completion_length": 356.0, + "delta_ref_entropy_loss": 0.03076171875, + "delta_ref_ppl": -0.10791015625, + "entropy_loss": -0.05615234375, + "epoch": 0.8066, + "grad_norm": 0.8986573263923401, + "k1_kl": 0.10791015625, + "k3_kl": 0.08154296875, + "kimi_kl": 0.3203125, + "learning_rate": 9.669999999999999e-08, + "loss": 0.0034, + "ppl": 0.021240234375, + "reward": 0.9883502125740051, + "reward_std": 0.001158386585302651, + "rewards/perpo_ocr_edit_distance_reward": 0.9883503913879395, + "step": 4033, + "temperature": 0.9 + }, + { + "advantages": -0.00018721394008025527, + "completion_length": 838.0, + "delta_ref_entropy_loss": 0.027099609375, + "delta_ref_ppl": -0.04296875, + "entropy_loss": -0.030029296875, + "epoch": 0.8068, + "grad_norm": 0.19087710280159578, + "k1_kl": 0.04296875, + "k3_kl": 0.0255126953125, + "kimi_kl": 0.0830078125, + "learning_rate": 9.66e-08, + "loss": 0.0012, + "ppl": 0.00823974609375, + "reward": 0.9991419315338135, + "reward_std": 0.000127422230434604, + "rewards/perpo_ocr_edit_distance_reward": 0.9991419315338135, + "step": 4034, + "temperature": 0.9 + }, + { + "advantages": -0.00019572462770156562, + "completion_length": 941.0, + "delta_ref_entropy_loss": 0.031494140625, + "delta_ref_ppl": -0.059814453125, + "entropy_loss": -0.06005859375, + "epoch": 0.807, + "grad_norm": 16.050195630962236, + "k1_kl": 0.059814453125, + "k3_kl": 0.04150390625, + "kimi_kl": 0.130859375, + "learning_rate": 9.65e-08, + "loss": 0.0019, + "ppl": 0.021484375, + "reward": 0.9966830611228943, + "reward_std": 0.0005524810985662043, + "rewards/perpo_ocr_edit_distance_reward": 0.9966831803321838, + "step": 4035, + "temperature": 0.9 + }, + { + "advantages": -0.00010425704385852441, + "completion_length": 260.0, + "delta_ref_entropy_loss": 0.024169921875, + "delta_ref_ppl": -0.1689453125, + "entropy_loss": -0.060302734375, + "epoch": 0.8072, + "grad_norm": 0.5251072864001611, + "k1_kl": 0.16796875, + "k3_kl": 0.1318359375, + "kimi_kl": 0.62890625, + "learning_rate": 9.639999999999999e-08, + "loss": 0.0054, + "ppl": 0.0196533203125, + "reward": 0.9949127435684204, + "reward_std": 0.0007167871808633208, + "rewards/perpo_ocr_edit_distance_reward": 0.99491286277771, + "step": 4036, + "temperature": 0.9 + }, + { + "advantages": -3.303800622234121e-05, + "completion_length": 1138.0, + "delta_ref_entropy_loss": 0.028076171875, + "delta_ref_ppl": -0.03125, + "entropy_loss": -0.048828125, + "epoch": 0.8074, + "grad_norm": 3.429415438313883, + "k1_kl": 0.0311279296875, + "k3_kl": 0.0155029296875, + "kimi_kl": 0.04052734375, + "learning_rate": 9.63e-08, + "loss": 0.0007, + "ppl": 0.01806640625, + "reward": 0.9981696009635925, + "reward_std": 0.0006730589084327221, + "rewards/perpo_ocr_edit_distance_reward": 0.9981696605682373, + "step": 4037, + "temperature": 0.9 + }, + { + "advantages": -6.74384000376449e-06, + "completion_length": 914.0, + "delta_ref_entropy_loss": 0.035888671875, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.21875, + "epoch": 0.8076, + "grad_norm": 1.1579539742628846, + "k1_kl": 0.06591796875, + "k3_kl": 0.04150390625, + "kimi_kl": 0.08203125, + "learning_rate": 9.619999999999999e-08, + "loss": 0.0017, + "ppl": 0.0888671875, + "reward": 0.8652894496917725, + "reward_std": 0.011272921226918697, + "rewards/perpo_ocr_edit_distance_reward": 0.865289568901062, + "step": 4038, + "temperature": 0.9 + }, + { + "advantages": -2.336502257094253e-05, + "completion_length": 709.0, + "delta_ref_entropy_loss": 0.01092529296875, + "delta_ref_ppl": -0.03271484375, + "entropy_loss": -0.033447265625, + "epoch": 0.8078, + "grad_norm": 0.33769075246014213, + "k1_kl": 0.03271484375, + "k3_kl": 0.0223388671875, + "kimi_kl": 0.059814453125, + "learning_rate": 9.610000000000001e-08, + "loss": 0.0009, + "ppl": 0.00921630859375, + "reward": 0.9944563508033752, + "reward_std": 0.0024524142500013113, + "rewards/perpo_ocr_edit_distance_reward": 0.99445641040802, + "step": 4039, + "temperature": 0.9 + }, + { + "advantages": -2.9376576549111633e-06, + "completion_length": 144.0, + "delta_ref_entropy_loss": 0.0147705078125, + "delta_ref_ppl": -0.2109375, + "entropy_loss": -0.263671875, + "epoch": 0.808, + "grad_norm": 2.590035046420406, + "k1_kl": 0.2109375, + "k3_kl": 0.16796875, + "kimi_kl": 0.4921875, + "learning_rate": 9.6e-08, + "loss": 0.0067, + "ppl": 0.09912109375, + "reward": 0.8729878664016724, + "reward_std": 0.002770420163869858, + "rewards/perpo_ocr_edit_distance_reward": 0.8729879260063171, + "step": 4040, + "temperature": 0.9 + }, + { + "advantages": -9.104183845920488e-05, + "completion_length": 795.0, + "delta_ref_entropy_loss": 0.04833984375, + "delta_ref_ppl": -0.058349609375, + "entropy_loss": -0.07373046875, + "epoch": 0.8082, + "grad_norm": 0.7914006782884256, + "k1_kl": 0.05810546875, + "k3_kl": 0.0341796875, + "kimi_kl": 0.087890625, + "learning_rate": 9.589999999999999e-08, + "loss": 0.0015, + "ppl": 0.0264892578125, + "reward": 0.9880419373512268, + "reward_std": 0.0008354154997505248, + "rewards/perpo_ocr_edit_distance_reward": 0.9880419969558716, + "step": 4041, + "temperature": 0.9 + }, + { + "advantages": -0.00013002328341826797, + "completion_length": 1044.0, + "delta_ref_entropy_loss": 0.01708984375, + "delta_ref_ppl": -0.036865234375, + "entropy_loss": -0.06640625, + "epoch": 0.8084, + "grad_norm": 0.9250421075722975, + "k1_kl": 0.036865234375, + "k3_kl": 0.0252685546875, + "kimi_kl": 0.06689453125, + "learning_rate": 9.58e-08, + "loss": 0.0011, + "ppl": 0.029541015625, + "reward": 0.9936295747756958, + "reward_std": 0.0006204365636222064, + "rewards/perpo_ocr_edit_distance_reward": 0.9936297535896301, + "step": 4042, + "temperature": 0.9 + }, + { + "advantages": -3.7125180369912414e-06, + "completion_length": 395.0, + "delta_ref_entropy_loss": 0.06640625, + "delta_ref_ppl": -0.0859375, + "entropy_loss": -0.07666015625, + "epoch": 0.8086, + "grad_norm": 1.2121228000798026, + "k1_kl": 0.08642578125, + "k3_kl": 0.050537109375, + "kimi_kl": 0.1376953125, + "learning_rate": 9.569999999999999e-08, + "loss": 0.002, + "ppl": 0.033203125, + "reward": 0.9955499172210693, + "reward_std": 0.0022012086119502783, + "rewards/perpo_ocr_edit_distance_reward": 0.9955499172210693, + "step": 4043, + "temperature": 0.9 + }, + { + "advantages": -1.44754142183956e-06, + "completion_length": 292.0, + "delta_ref_entropy_loss": 0.050048828125, + "delta_ref_ppl": -0.1376953125, + "entropy_loss": -0.10986328125, + "epoch": 0.8088, + "grad_norm": 1.4303837295497748, + "k1_kl": 0.1376953125, + "k3_kl": 0.10791015625, + "kimi_kl": 0.3515625, + "learning_rate": 9.56e-08, + "loss": 0.0043, + "ppl": 0.048828125, + "reward": 0.9808865785598755, + "reward_std": 0.02339101769030094, + "rewards/perpo_ocr_edit_distance_reward": 0.9808865785598755, + "step": 4044, + "temperature": 0.9 + }, + { + "advantages": -2.118519478244707e-05, + "completion_length": 1345.0, + "delta_ref_entropy_loss": 0.0089111328125, + "delta_ref_ppl": -0.0272216796875, + "entropy_loss": -0.0390625, + "epoch": 0.809, + "grad_norm": 0.37183891286380194, + "k1_kl": 0.0272216796875, + "k3_kl": 0.0186767578125, + "kimi_kl": 0.0537109375, + "learning_rate": 9.55e-08, + "loss": 0.0008, + "ppl": 0.0146484375, + "reward": 0.9965686798095703, + "reward_std": 0.0023095901124179363, + "rewards/perpo_ocr_edit_distance_reward": 0.9965687394142151, + "step": 4045, + "temperature": 0.9 + }, + { + "advantages": -2.997262299686554e-06, + "completion_length": 46.0, + "delta_ref_entropy_loss": -0.107421875, + "delta_ref_ppl": -0.55859375, + "entropy_loss": -0.494140625, + "epoch": 0.8092, + "grad_norm": 8.581100651932127, + "k1_kl": 0.55859375, + "k3_kl": 0.45703125, + "kimi_kl": 1.6171875, + "learning_rate": 9.54e-08, + "loss": 0.0182, + "ppl": 0.19921875, + "reward": 0.3129230737686157, + "reward_std": 0.015450882725417614, + "rewards/perpo_ocr_edit_distance_reward": 0.3129231035709381, + "step": 4046, + "temperature": 0.9 + }, + { + "advantages": 2.8635775379370898e-05, + "completion_length": 791.0, + "delta_ref_entropy_loss": 0.035400390625, + "delta_ref_ppl": -0.06982421875, + "entropy_loss": -0.060302734375, + "epoch": 0.8094, + "grad_norm": 0.48571487290197823, + "k1_kl": 0.06982421875, + "k3_kl": 0.04296875, + "kimi_kl": 0.09228515625, + "learning_rate": 9.529999999999999e-08, + "loss": 0.0017, + "ppl": 0.0257568359375, + "reward": 0.9763978123664856, + "reward_std": 0.0007920424686744809, + "rewards/perpo_ocr_edit_distance_reward": 0.9763978719711304, + "step": 4047, + "temperature": 0.9 + }, + { + "advantages": -8.174351933121216e-06, + "completion_length": 947.0, + "delta_ref_entropy_loss": 0.02294921875, + "delta_ref_ppl": -0.0390625, + "entropy_loss": -0.046142578125, + "epoch": 0.8096, + "grad_norm": 1.1018679088683332, + "k1_kl": 0.0390625, + "k3_kl": 0.02294921875, + "kimi_kl": 0.06787109375, + "learning_rate": 9.52e-08, + "loss": 0.0009, + "ppl": 0.0157470703125, + "reward": 0.995535671710968, + "reward_std": 0.0009396495879627764, + "rewards/perpo_ocr_edit_distance_reward": 0.9955357313156128, + "step": 4048, + "temperature": 0.9 + }, + { + "advantages": -9.672982560005039e-05, + "completion_length": 597.0, + "delta_ref_entropy_loss": 0.023193359375, + "delta_ref_ppl": -0.051025390625, + "entropy_loss": -0.04736328125, + "epoch": 0.8098, + "grad_norm": 0.8573860303535619, + "k1_kl": 0.05078125, + "k3_kl": 0.031494140625, + "kimi_kl": 0.09912109375, + "learning_rate": 9.51e-08, + "loss": 0.0014, + "ppl": 0.01324462890625, + "reward": 0.9965811371803284, + "reward_std": 0.0006043565226718783, + "rewards/perpo_ocr_edit_distance_reward": 0.9965812563896179, + "step": 4049, + "temperature": 0.9 + }, + { + "advantages": -6.269557343330234e-05, + "completion_length": 1128.0, + "delta_ref_entropy_loss": 0.0576171875, + "delta_ref_ppl": -0.06982421875, + "entropy_loss": -0.130859375, + "epoch": 0.81, + "grad_norm": 0.8889694333566466, + "k1_kl": 0.06982421875, + "k3_kl": 0.04052734375, + "kimi_kl": 0.1025390625, + "learning_rate": 9.499999999999999e-08, + "loss": 0.0017, + "ppl": 0.061279296875, + "reward": 0.9684486985206604, + "reward_std": 0.001258153235539794, + "rewards/perpo_ocr_edit_distance_reward": 0.96844881772995, + "step": 4050, + "temperature": 0.9 + }, + { + "advantages": 5.514281292562373e-05, + "completion_length": 540.0, + "delta_ref_entropy_loss": 0.034423828125, + "delta_ref_ppl": -0.09423828125, + "entropy_loss": -0.11767578125, + "epoch": 0.8102, + "grad_norm": 0.7843184341744331, + "k1_kl": 0.09423828125, + "k3_kl": 0.06689453125, + "kimi_kl": 0.1904296875, + "learning_rate": 9.49e-08, + "loss": 0.0026, + "ppl": 0.045166015625, + "reward": 0.9687079191207886, + "reward_std": 0.0005177434650249779, + "rewards/perpo_ocr_edit_distance_reward": 0.9687079191207886, + "step": 4051, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 377.0, + "delta_ref_entropy_loss": 0.03466796875, + "delta_ref_ppl": -0.09326171875, + "entropy_loss": -0.05419921875, + "epoch": 0.8104, + "grad_norm": 0.45639733530298265, + "k1_kl": 0.09326171875, + "k3_kl": 0.0625, + "kimi_kl": 0.1796875, + "learning_rate": 9.479999999999999e-08, + "loss": 0.0025, + "ppl": 0.0205078125, + "reward": 0.8545830845832825, + "reward_std": 0.0009674904868006706, + "rewards/perpo_ocr_edit_distance_reward": 0.8545831441879272, + "step": 4052, + "temperature": 0.9 + }, + { + "advantages": -4.3583768274402246e-05, + "completion_length": 657.0, + "delta_ref_entropy_loss": 0.042236328125, + "delta_ref_ppl": -0.07763671875, + "entropy_loss": -0.10546875, + "epoch": 0.8106, + "grad_norm": 0.9878597558094162, + "k1_kl": 0.0771484375, + "k3_kl": 0.0498046875, + "kimi_kl": 0.13671875, + "learning_rate": 9.470000000000001e-08, + "loss": 0.002, + "ppl": 0.044921875, + "reward": 0.9465994834899902, + "reward_std": 0.0012675031321123242, + "rewards/perpo_ocr_edit_distance_reward": 0.9465996026992798, + "step": 4053, + "temperature": 0.9 + }, + { + "advantages": -8.440869714831933e-05, + "completion_length": 1012.0, + "delta_ref_entropy_loss": 0.044677734375, + "delta_ref_ppl": -0.07080078125, + "entropy_loss": -0.20703125, + "epoch": 0.8108, + "grad_norm": 1.2996732212614424, + "k1_kl": 0.07080078125, + "k3_kl": 0.052001953125, + "kimi_kl": 0.10546875, + "learning_rate": 9.46e-08, + "loss": 0.0022, + "ppl": 0.10546875, + "reward": 0.9443238377571106, + "reward_std": 0.0012114325072616339, + "rewards/perpo_ocr_edit_distance_reward": 0.9443240165710449, + "step": 4054, + "temperature": 0.9 + }, + { + "advantages": -5.960464932286413e-06, + "completion_length": 1717.0, + "delta_ref_entropy_loss": 0.0089111328125, + "delta_ref_ppl": -0.06103515625, + "entropy_loss": -0.146484375, + "epoch": 0.811, + "grad_norm": 1.7697415850976788, + "k1_kl": 0.060791015625, + "k3_kl": 0.052978515625, + "kimi_kl": 0.1279296875, + "learning_rate": 9.449999999999999e-08, + "loss": 0.0021, + "ppl": 0.072265625, + "reward": 0.9244260191917419, + "reward_std": 0.015595879405736923, + "rewards/perpo_ocr_edit_distance_reward": 0.9244261384010315, + "step": 4055, + "temperature": 0.9 + }, + { + "advantages": -4.5827458961866796e-05, + "completion_length": 235.0, + "delta_ref_entropy_loss": 0.06201171875, + "delta_ref_ppl": -0.2275390625, + "entropy_loss": -0.11962890625, + "epoch": 0.8112, + "grad_norm": 3.006026710078703, + "k1_kl": 0.2275390625, + "k3_kl": 0.17578125, + "kimi_kl": 0.80859375, + "learning_rate": 9.44e-08, + "loss": 0.0071, + "ppl": 0.055908203125, + "reward": 0.9926522374153137, + "reward_std": 0.002686368301510811, + "rewards/perpo_ocr_edit_distance_reward": 0.992652416229248, + "step": 4056, + "temperature": 0.9 + }, + { + "advantages": -1.1920928955078125e-07, + "completion_length": 591.0, + "delta_ref_entropy_loss": -0.0634765625, + "delta_ref_ppl": -0.08837890625, + "entropy_loss": -0.318359375, + "epoch": 0.8114, + "grad_norm": 6.767640765468478, + "k1_kl": 0.08837890625, + "k3_kl": 0.07177734375, + "kimi_kl": 0.18359375, + "learning_rate": 9.429999999999999e-08, + "loss": 0.0029, + "ppl": 0.126953125, + "reward": 0.899451732635498, + "reward_std": 0.2521563768386841, + "rewards/perpo_ocr_edit_distance_reward": 0.899451732635498, + "step": 4057, + "temperature": 0.9 + }, + { + "advantages": -1.0700098755478393e-05, + "completion_length": 771.0, + "delta_ref_entropy_loss": 0.0299072265625, + "delta_ref_ppl": -0.07275390625, + "entropy_loss": -0.123046875, + "epoch": 0.8116, + "grad_norm": 1.6517077584429944, + "k1_kl": 0.07275390625, + "k3_kl": 0.052001953125, + "kimi_kl": 0.1201171875, + "learning_rate": 9.42e-08, + "loss": 0.0021, + "ppl": 0.039794921875, + "reward": 0.9824715256690979, + "reward_std": 0.002288415329530835, + "rewards/perpo_ocr_edit_distance_reward": 0.9824716448783875, + "step": 4058, + "temperature": 0.9 + }, + { + "advantages": 2.55448497910038e-08, + "completion_length": 92.0, + "delta_ref_entropy_loss": -0.220703125, + "delta_ref_ppl": -0.5078125, + "entropy_loss": -0.60546875, + "epoch": 0.8118, + "grad_norm": 6.684777278855024, + "k1_kl": 0.5078125, + "k3_kl": 0.443359375, + "kimi_kl": 1.7578125, + "learning_rate": 9.41e-08, + "loss": 0.0177, + "ppl": 0.2275390625, + "reward": 0.4666374623775482, + "reward_std": 0.020136792212724686, + "rewards/perpo_ocr_edit_distance_reward": 0.4666374921798706, + "step": 4059, + "temperature": 0.9 + }, + { + "advantages": -4.172325134277344e-06, + "completion_length": 511.0, + "delta_ref_entropy_loss": 0.0400390625, + "delta_ref_ppl": -0.09326171875, + "entropy_loss": -0.08984375, + "epoch": 0.812, + "grad_norm": 1.4128699942527083, + "k1_kl": 0.0927734375, + "k3_kl": 0.0751953125, + "kimi_kl": 0.1923828125, + "learning_rate": 9.4e-08, + "loss": 0.003, + "ppl": 0.04150390625, + "reward": 0.990435779094696, + "reward_std": 0.0019462041091173887, + "rewards/perpo_ocr_edit_distance_reward": 0.990435779094696, + "step": 4060, + "temperature": 0.9 + }, + { + "advantages": -3.661428422674362e-07, + "completion_length": 887.0, + "delta_ref_entropy_loss": -0.259765625, + "delta_ref_ppl": -0.0281982421875, + "entropy_loss": -0.69921875, + "epoch": 0.8122, + "grad_norm": 17.214483869290568, + "k1_kl": 0.0286865234375, + "k3_kl": 0.06884765625, + "kimi_kl": 0.12060546875, + "learning_rate": 9.389999999999999e-08, + "loss": 0.0028, + "ppl": 0.35546875, + "reward": 0.6939288973808289, + "reward_std": 0.12339851260185242, + "rewards/perpo_ocr_edit_distance_reward": 0.6939289569854736, + "step": 4061, + "temperature": 0.9 + }, + { + "advantages": -0.0001564366539241746, + "completion_length": 447.0, + "delta_ref_entropy_loss": 0.025390625, + "delta_ref_ppl": -0.09814453125, + "entropy_loss": -0.053955078125, + "epoch": 0.8124, + "grad_norm": 0.37475786504038316, + "k1_kl": 0.09765625, + "k3_kl": 0.0732421875, + "kimi_kl": 0.29296875, + "learning_rate": 9.379999999999999e-08, + "loss": 0.0031, + "ppl": 0.0177001953125, + "reward": 0.9931244850158691, + "reward_std": 0.00033545782207511365, + "rewards/perpo_ocr_edit_distance_reward": 0.9931246042251587, + "step": 4062, + "temperature": 0.9 + }, + { + "advantages": -1.8221992377220886e-06, + "completion_length": 2048.0, + "delta_ref_entropy_loss": -0.1533203125, + "delta_ref_ppl": -0.032958984375, + "entropy_loss": -1.2421875, + "epoch": 0.8126, + "grad_norm": 25.49836818337563, + "k1_kl": 0.0322265625, + "k3_kl": 0.08154296875, + "kimi_kl": 0.1201171875, + "learning_rate": 9.37e-08, + "loss": 0.0033, + "ppl": 0.71484375, + "reward": 0.2738557457923889, + "reward_std": 0.011507526971399784, + "rewards/perpo_ocr_edit_distance_reward": 0.2738557755947113, + "step": 4063, + "temperature": 0.9 + }, + { + "advantages": -1.6519002201675903e-06, + "completion_length": 396.0, + "delta_ref_entropy_loss": -0.00104522705078125, + "delta_ref_ppl": -0.05859375, + "entropy_loss": -0.0849609375, + "epoch": 0.8128, + "grad_norm": 1.4781598100966116, + "k1_kl": 0.05859375, + "k3_kl": 0.046630859375, + "kimi_kl": 0.1494140625, + "learning_rate": 9.36e-08, + "loss": 0.0019, + "ppl": 0.0252685546875, + "reward": 0.670850932598114, + "reward_std": 0.020206693559885025, + "rewards/perpo_ocr_edit_distance_reward": 0.6708510518074036, + "step": 4064, + "temperature": 0.9 + }, + { + "advantages": -1.4645713690697448e-06, + "completion_length": 1184.0, + "delta_ref_entropy_loss": 0.00909423828125, + "delta_ref_ppl": -0.0546875, + "entropy_loss": -0.1669921875, + "epoch": 0.813, + "grad_norm": 101.30548163035567, + "k1_kl": 0.05517578125, + "k3_kl": 0.0771484375, + "kimi_kl": 0.10888671875, + "learning_rate": 9.35e-08, + "loss": 0.0031, + "ppl": 0.08837890625, + "reward": 0.9396711587905884, + "reward_std": 0.01161852665245533, + "rewards/perpo_ocr_edit_distance_reward": 0.9396711587905884, + "step": 4065, + "temperature": 0.9 + }, + { + "advantages": -6.692750503134448e-06, + "completion_length": 358.0, + "delta_ref_entropy_loss": 0.1298828125, + "delta_ref_ppl": -0.21875, + "entropy_loss": -0.306640625, + "epoch": 0.8132, + "grad_norm": 1.7636177941057825, + "k1_kl": 0.21875, + "k3_kl": 0.150390625, + "kimi_kl": 0.439453125, + "learning_rate": 9.339999999999999e-08, + "loss": 0.006, + "ppl": 0.1484375, + "reward": 0.9217194318771362, + "reward_std": 0.002451602602377534, + "rewards/perpo_ocr_edit_distance_reward": 0.9217194318771362, + "step": 4066, + "temperature": 0.9 + }, + { + "advantages": -3.320830364827998e-05, + "completion_length": 484.0, + "delta_ref_entropy_loss": 0.034912109375, + "delta_ref_ppl": -0.0947265625, + "entropy_loss": -0.0751953125, + "epoch": 0.8134, + "grad_norm": 0.5410898544885153, + "k1_kl": 0.0947265625, + "k3_kl": 0.06494140625, + "kimi_kl": 0.208984375, + "learning_rate": 9.33e-08, + "loss": 0.0026, + "ppl": 0.02978515625, + "reward": 0.9924662113189697, + "reward_std": 0.0006692521856166422, + "rewards/perpo_ocr_edit_distance_reward": 0.9924662709236145, + "step": 4067, + "temperature": 0.9 + }, + { + "advantages": 1.7029899268550253e-08, + "completion_length": 25.0, + "delta_ref_entropy_loss": -0.3671875, + "delta_ref_ppl": -1.71875, + "entropy_loss": -0.7421875, + "epoch": 0.8136, + "grad_norm": 9.429810215580787, + "k1_kl": 1.71875, + "k3_kl": 1.578125, + "kimi_kl": 8.625, + "learning_rate": 9.32e-08, + "loss": 0.0631, + "ppl": 0.25, + "reward": 0.28798907995224, + "reward_std": 0.00515398196876049, + "rewards/perpo_ocr_edit_distance_reward": 0.28798907995224, + "step": 4068, + "temperature": 0.9 + }, + { + "advantages": -2.699239121284336e-05, + "completion_length": 958.0, + "delta_ref_entropy_loss": 0.016357421875, + "delta_ref_ppl": -0.041748046875, + "entropy_loss": -0.05615234375, + "epoch": 0.8138, + "grad_norm": 1.4293198413509642, + "k1_kl": 0.041748046875, + "k3_kl": 0.0263671875, + "kimi_kl": 0.06494140625, + "learning_rate": 9.309999999999999e-08, + "loss": 0.0011, + "ppl": 0.02197265625, + "reward": 0.9933528900146484, + "reward_std": 0.002105687279254198, + "rewards/perpo_ocr_edit_distance_reward": 0.9933529496192932, + "step": 4069, + "temperature": 0.9 + }, + { + "advantages": -1.706395960354712e-05, + "completion_length": 500.0, + "delta_ref_entropy_loss": 0.0208740234375, + "delta_ref_ppl": -0.0888671875, + "entropy_loss": -0.201171875, + "epoch": 0.814, + "grad_norm": 1.7148759102038031, + "k1_kl": 0.0888671875, + "k3_kl": 0.0625, + "kimi_kl": 0.1474609375, + "learning_rate": 9.3e-08, + "loss": 0.0025, + "ppl": 0.07568359375, + "reward": 0.9564926624298096, + "reward_std": 0.001893166801892221, + "rewards/perpo_ocr_edit_distance_reward": 0.9564926624298096, + "step": 4070, + "temperature": 0.9 + }, + { + "advantages": -0.0001170635296148248, + "completion_length": 857.0, + "delta_ref_entropy_loss": 0.01708984375, + "delta_ref_ppl": -0.0390625, + "entropy_loss": -0.045654296875, + "epoch": 0.8142, + "grad_norm": 0.30132957340583577, + "k1_kl": 0.0390625, + "k3_kl": 0.024169921875, + "kimi_kl": 0.06982421875, + "learning_rate": 9.289999999999999e-08, + "loss": 0.0011, + "ppl": 0.015869140625, + "reward": 0.9978327751159668, + "reward_std": 0.0004091897571925074, + "rewards/perpo_ocr_edit_distance_reward": 0.9978328347206116, + "step": 4071, + "temperature": 0.9 + }, + { + "advantages": -2.1048956114100292e-05, + "completion_length": 790.0, + "delta_ref_entropy_loss": 0.005523681640625, + "delta_ref_ppl": -0.05615234375, + "entropy_loss": -0.08154296875, + "epoch": 0.8144, + "grad_norm": 3.494417444157592, + "k1_kl": 0.05615234375, + "k3_kl": 0.03759765625, + "kimi_kl": 0.09765625, + "learning_rate": 9.279999999999998e-08, + "loss": 0.0015, + "ppl": 0.0301513671875, + "reward": 0.9867997169494629, + "reward_std": 0.0035369605757296085, + "rewards/perpo_ocr_edit_distance_reward": 0.9867998361587524, + "step": 4072, + "temperature": 0.9 + }, + { + "advantages": -2.946172571682837e-06, + "completion_length": 45.0, + "delta_ref_entropy_loss": 0.029052734375, + "delta_ref_ppl": -0.69921875, + "entropy_loss": -0.234375, + "epoch": 0.8146, + "grad_norm": 4.494560041752093, + "k1_kl": 0.69921875, + "k3_kl": 0.57421875, + "kimi_kl": 2.703125, + "learning_rate": 9.27e-08, + "loss": 0.0231, + "ppl": 0.11376953125, + "reward": 0.8999274969100952, + "reward_std": 0.0056481799110770226, + "rewards/perpo_ocr_edit_distance_reward": 0.8999274969100952, + "step": 4073, + "temperature": 0.9 + }, + { + "advantages": -6.437302090489538e-06, + "completion_length": 76.0, + "delta_ref_entropy_loss": 0.0220947265625, + "delta_ref_ppl": -0.5234375, + "entropy_loss": -0.12890625, + "epoch": 0.8148, + "grad_norm": 4.981902867176784, + "k1_kl": 0.5234375, + "k3_kl": 0.451171875, + "kimi_kl": 2.3125, + "learning_rate": 9.26e-08, + "loss": 0.0181, + "ppl": 0.05126953125, + "reward": 0.9844412803649902, + "reward_std": 0.0065215956419706345, + "rewards/perpo_ocr_edit_distance_reward": 0.984441339969635, + "step": 4074, + "temperature": 0.9 + }, + { + "advantages": -7.578304916933121e-07, + "completion_length": 98.0, + "delta_ref_entropy_loss": -0.2216796875, + "delta_ref_ppl": -0.37109375, + "entropy_loss": -0.73828125, + "epoch": 0.815, + "grad_norm": 6.33444421044507, + "k1_kl": 0.37109375, + "k3_kl": 0.34375, + "kimi_kl": 1.1875, + "learning_rate": 9.25e-08, + "loss": 0.0138, + "ppl": 0.3203125, + "reward": 0.6551638841629028, + "reward_std": 0.066578209400177, + "rewards/perpo_ocr_edit_distance_reward": 0.6551639437675476, + "step": 4075, + "temperature": 0.9 + }, + { + "advantages": -6.156308518256992e-05, + "completion_length": 544.0, + "delta_ref_entropy_loss": 0.04833984375, + "delta_ref_ppl": -0.0791015625, + "entropy_loss": -0.056396484375, + "epoch": 0.8152, + "grad_norm": 0.4577195621307509, + "k1_kl": 0.0791015625, + "k3_kl": 0.052001953125, + "kimi_kl": 0.1787109375, + "learning_rate": 9.24e-08, + "loss": 0.0021, + "ppl": 0.016357421875, + "reward": 0.9977242350578308, + "reward_std": 0.0008684965432621539, + "rewards/perpo_ocr_edit_distance_reward": 0.9977242946624756, + "step": 4076, + "temperature": 0.9 + }, + { + "advantages": 3.0764513212488964e-05, + "completion_length": 857.0, + "delta_ref_entropy_loss": 0.01953125, + "delta_ref_ppl": -0.045166015625, + "entropy_loss": -0.0380859375, + "epoch": 0.8154, + "grad_norm": 0.1809764150579749, + "k1_kl": 0.045166015625, + "k3_kl": 0.0308837890625, + "kimi_kl": 0.08837890625, + "learning_rate": 9.229999999999999e-08, + "loss": 0.0012, + "ppl": 0.01300048828125, + "reward": 0.9944832921028137, + "reward_std": 0.00045349213178269565, + "rewards/perpo_ocr_edit_distance_reward": 0.9944832921028137, + "step": 4077, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 521.0, + "delta_ref_entropy_loss": 0.0260009765625, + "delta_ref_ppl": -0.07568359375, + "entropy_loss": -0.11083984375, + "epoch": 0.8156, + "grad_norm": 2.1291903626458244, + "k1_kl": 0.076171875, + "k3_kl": 0.049560546875, + "kimi_kl": 0.1328125, + "learning_rate": 9.22e-08, + "loss": 0.002, + "ppl": 0.048095703125, + "reward": 0.9139374494552612, + "reward_std": 0.05014580860733986, + "rewards/perpo_ocr_edit_distance_reward": 0.9139374494552612, + "step": 4078, + "temperature": 0.9 + }, + { + "advantages": -5.337170296115801e-05, + "completion_length": 737.0, + "delta_ref_entropy_loss": 0.03125, + "delta_ref_ppl": -0.049072265625, + "entropy_loss": -0.0380859375, + "epoch": 0.8158, + "grad_norm": 0.5432796456063401, + "k1_kl": 0.049072265625, + "k3_kl": 0.0302734375, + "kimi_kl": 0.09619140625, + "learning_rate": 9.21e-08, + "loss": 0.0013, + "ppl": 0.0108642578125, + "reward": 0.9968416690826416, + "reward_std": 0.0011762368958443403, + "rewards/perpo_ocr_edit_distance_reward": 0.9968417286872864, + "step": 4079, + "temperature": 0.9 + }, + { + "advantages": -1.7268317606067285e-05, + "completion_length": 35.0, + "delta_ref_entropy_loss": 0.038330078125, + "delta_ref_ppl": -0.9765625, + "entropy_loss": -0.255859375, + "epoch": 0.816, + "grad_norm": 4.030390230689025, + "k1_kl": 0.97265625, + "k3_kl": 0.859375, + "kimi_kl": 4.3125, + "learning_rate": 9.199999999999999e-08, + "loss": 0.0343, + "ppl": 0.08544921875, + "reward": 0.9666959047317505, + "reward_std": 0.004826955962926149, + "rewards/perpo_ocr_edit_distance_reward": 0.9666959643363953, + "step": 4080, + "temperature": 0.9 + }, + { + "advantages": 1.7029899268550253e-08, + "completion_length": 1577.0, + "delta_ref_entropy_loss": 0.0301513671875, + "delta_ref_ppl": -0.0908203125, + "entropy_loss": -0.53125, + "epoch": 0.8162, + "grad_norm": 8.851612166707815, + "k1_kl": 0.0908203125, + "k3_kl": 0.1435546875, + "kimi_kl": 0.154296875, + "learning_rate": 9.19e-08, + "loss": 0.0057, + "ppl": 0.294921875, + "reward": 0.7290725708007812, + "reward_std": 0.0037525410298258066, + "rewards/perpo_ocr_edit_distance_reward": 0.729072630405426, + "step": 4081, + "temperature": 0.9 + }, + { + "advantages": -1.1920928955078125e-07, + "completion_length": 1134.0, + "delta_ref_entropy_loss": 0.056884765625, + "delta_ref_ppl": -0.10302734375, + "entropy_loss": -0.171875, + "epoch": 0.8164, + "grad_norm": 12.585339935465102, + "k1_kl": 0.103515625, + "k3_kl": 0.08349609375, + "kimi_kl": 0.1943359375, + "learning_rate": 9.18e-08, + "loss": 0.0033, + "ppl": 0.10498046875, + "reward": 0.8887673616409302, + "reward_std": 0.14463688433170319, + "rewards/perpo_ocr_edit_distance_reward": 0.888767421245575, + "step": 4082, + "temperature": 0.9 + }, + { + "advantages": -2.5527819161652587e-05, + "completion_length": 928.0, + "delta_ref_entropy_loss": 0.048583984375, + "delta_ref_ppl": -0.06884765625, + "entropy_loss": -0.111328125, + "epoch": 0.8166, + "grad_norm": 1.826059712887342, + "k1_kl": 0.068359375, + "k3_kl": 0.047607421875, + "kimi_kl": 0.091796875, + "learning_rate": 9.17e-08, + "loss": 0.0019, + "ppl": 0.04443359375, + "reward": 0.9921427369117737, + "reward_std": 0.002567518036812544, + "rewards/perpo_ocr_edit_distance_reward": 0.9921427965164185, + "step": 4083, + "temperature": 0.9 + }, + { + "advantages": -3.525189185893396e-06, + "completion_length": 145.0, + "delta_ref_entropy_loss": 0.018310546875, + "delta_ref_ppl": -0.173828125, + "entropy_loss": -0.083984375, + "epoch": 0.8168, + "grad_norm": 3.069251406982827, + "k1_kl": 0.173828125, + "k3_kl": 0.1337890625, + "kimi_kl": 0.5546875, + "learning_rate": 9.16e-08, + "loss": 0.0053, + "ppl": 0.041259765625, + "reward": 0.9469677209854126, + "reward_std": 0.004747078288346529, + "rewards/perpo_ocr_edit_distance_reward": 0.9469677805900574, + "step": 4084, + "temperature": 0.9 + }, + { + "advantages": -3.3378603347955504e-06, + "completion_length": 271.0, + "delta_ref_entropy_loss": 0.0771484375, + "delta_ref_ppl": -0.15625, + "entropy_loss": -0.158203125, + "epoch": 0.817, + "grad_norm": 1.328628753371339, + "k1_kl": 0.15625, + "k3_kl": 0.11083984375, + "kimi_kl": 0.349609375, + "learning_rate": 9.149999999999999e-08, + "loss": 0.0044, + "ppl": 0.07763671875, + "reward": 0.9810941815376282, + "reward_std": 0.0024549788795411587, + "rewards/perpo_ocr_edit_distance_reward": 0.9810941815376282, + "step": 4085, + "temperature": 0.9 + }, + { + "advantages": -2.474870052537881e-05, + "completion_length": 555.0, + "delta_ref_entropy_loss": 0.03076171875, + "delta_ref_ppl": -0.05810546875, + "entropy_loss": -0.041015625, + "epoch": 0.8172, + "grad_norm": 0.7698532385761372, + "k1_kl": 0.05810546875, + "k3_kl": 0.0400390625, + "kimi_kl": 0.1220703125, + "learning_rate": 9.139999999999998e-08, + "loss": 0.0016, + "ppl": 0.017822265625, + "reward": 0.9934362173080444, + "reward_std": 0.0016211335314437747, + "rewards/perpo_ocr_edit_distance_reward": 0.993436336517334, + "step": 4086, + "temperature": 0.9 + }, + { + "advantages": -1.7881394569485565e-06, + "completion_length": 629.0, + "delta_ref_entropy_loss": -0.28515625, + "delta_ref_ppl": -0.08251953125, + "entropy_loss": -1.0546875, + "epoch": 0.8174, + "grad_norm": 4.987901517470478, + "k1_kl": 0.08203125, + "k3_kl": 0.15234375, + "kimi_kl": 0.2412109375, + "learning_rate": 9.13e-08, + "loss": 0.0061, + "ppl": 0.52734375, + "reward": 0.7344284057617188, + "reward_std": 0.02834576927125454, + "rewards/perpo_ocr_edit_distance_reward": 0.7344285249710083, + "step": 4087, + "temperature": 0.9 + }, + { + "advantages": -4.9642156227491796e-05, + "completion_length": 368.0, + "delta_ref_entropy_loss": 0.017822265625, + "delta_ref_ppl": -0.08544921875, + "entropy_loss": -0.072265625, + "epoch": 0.8176, + "grad_norm": 0.6862112809737179, + "k1_kl": 0.08544921875, + "k3_kl": 0.0634765625, + "kimi_kl": 0.2109375, + "learning_rate": 9.12e-08, + "loss": 0.0026, + "ppl": 0.0262451171875, + "reward": 0.3572291433811188, + "reward_std": 0.00032883574021980166, + "rewards/perpo_ocr_edit_distance_reward": 0.35722917318344116, + "step": 4088, + "temperature": 0.9 + }, + { + "advantages": -4.598072700900957e-05, + "completion_length": 605.0, + "delta_ref_entropy_loss": 0.038818359375, + "delta_ref_ppl": -0.048828125, + "entropy_loss": -0.0380859375, + "epoch": 0.8178, + "grad_norm": 0.6163053990731231, + "k1_kl": 0.048828125, + "k3_kl": 0.025390625, + "kimi_kl": 0.055908203125, + "learning_rate": 9.11e-08, + "loss": 0.0011, + "ppl": 0.0103759765625, + "reward": 0.9979128837585449, + "reward_std": 0.0008262880728580058, + "rewards/perpo_ocr_edit_distance_reward": 0.9979129433631897, + "step": 4089, + "temperature": 0.9 + }, + { + "advantages": -0.00014867953723296523, + "completion_length": 593.0, + "delta_ref_entropy_loss": 0.0233154296875, + "delta_ref_ppl": -0.048828125, + "entropy_loss": -0.08935546875, + "epoch": 0.818, + "grad_norm": 1.3132762452894238, + "k1_kl": 0.048828125, + "k3_kl": 0.033447265625, + "kimi_kl": 0.099609375, + "learning_rate": 9.1e-08, + "loss": 0.0015, + "ppl": 0.049072265625, + "reward": 0.9885419607162476, + "reward_std": 0.0007590750465169549, + "rewards/perpo_ocr_edit_distance_reward": 0.9885420799255371, + "step": 4090, + "temperature": 0.9 + }, + { + "advantages": 1.0962997976093902e-07, + "completion_length": 28.0, + "delta_ref_entropy_loss": -1.3359375, + "delta_ref_ppl": -1.125, + "entropy_loss": -2.875, + "epoch": 0.8182, + "grad_norm": 22.534425371425364, + "k1_kl": 1.125, + "k3_kl": 1.125, + "kimi_kl": 4.09375, + "learning_rate": 9.089999999999999e-08, + "loss": 0.045, + "ppl": 1.25, + "reward": 0.2787695825099945, + "reward_std": 0.06134076416492462, + "rewards/perpo_ocr_edit_distance_reward": 0.2787695825099945, + "step": 4091, + "temperature": 0.9 + }, + { + "advantages": -5.517687441169983e-06, + "completion_length": 1241.0, + "delta_ref_entropy_loss": -0.016845703125, + "delta_ref_ppl": -0.03759765625, + "entropy_loss": -0.16796875, + "epoch": 0.8184, + "grad_norm": 2.060019589140552, + "k1_kl": 0.03759765625, + "k3_kl": 0.03076171875, + "kimi_kl": 0.076171875, + "learning_rate": 9.08e-08, + "loss": 0.0012, + "ppl": 0.08056640625, + "reward": 0.8950361609458923, + "reward_std": 0.016874000430107117, + "rewards/perpo_ocr_edit_distance_reward": 0.8950362801551819, + "step": 4092, + "temperature": 0.9 + }, + { + "advantages": -3.4059798537100505e-08, + "completion_length": 267.0, + "delta_ref_entropy_loss": 0.0205078125, + "delta_ref_ppl": -0.1298828125, + "entropy_loss": -0.064453125, + "epoch": 0.8186, + "grad_norm": 0.5432087830670733, + "k1_kl": 0.130859375, + "k3_kl": 0.09765625, + "kimi_kl": 0.376953125, + "learning_rate": 9.07e-08, + "loss": 0.0039, + "ppl": 0.0230712890625, + "reward": 0.9964619874954224, + "reward_std": 0.000634138414170593, + "rewards/perpo_ocr_edit_distance_reward": 0.9964620471000671, + "step": 4093, + "temperature": 0.9 + }, + { + "advantages": -1.8051692904919037e-06, + "completion_length": 482.0, + "delta_ref_entropy_loss": -0.0030670166015625, + "delta_ref_ppl": -0.107421875, + "entropy_loss": -0.296875, + "epoch": 0.8188, + "grad_norm": 1.7553577125811877, + "k1_kl": 0.107421875, + "k3_kl": 0.08251953125, + "kimi_kl": 0.1953125, + "learning_rate": 9.059999999999999e-08, + "loss": 0.0033, + "ppl": 0.11962890625, + "reward": 0.630358874797821, + "reward_std": 0.009237068705260754, + "rewards/perpo_ocr_edit_distance_reward": 0.630358874797821, + "step": 4094, + "temperature": 0.9 + }, + { + "advantages": -8.945805893745273e-05, + "completion_length": 547.0, + "delta_ref_entropy_loss": 0.041748046875, + "delta_ref_ppl": -0.08349609375, + "entropy_loss": -0.0771484375, + "epoch": 0.819, + "grad_norm": 0.40547990868776285, + "k1_kl": 0.08349609375, + "k3_kl": 0.054931640625, + "kimi_kl": 0.2080078125, + "learning_rate": 9.05e-08, + "loss": 0.0023, + "ppl": 0.0225830078125, + "reward": 0.9397258758544922, + "reward_std": 0.00047099904622882605, + "rewards/perpo_ocr_edit_distance_reward": 0.939725935459137, + "step": 4095, + "temperature": 0.9 + }, + { + "advantages": -5.834443436469883e-05, + "completion_length": 446.0, + "delta_ref_entropy_loss": 0.054443359375, + "delta_ref_ppl": -0.064453125, + "entropy_loss": -0.06640625, + "epoch": 0.8192, + "grad_norm": 0.786699332711531, + "k1_kl": 0.064453125, + "k3_kl": 0.038330078125, + "kimi_kl": 0.1259765625, + "learning_rate": 9.039999999999999e-08, + "loss": 0.0016, + "ppl": 0.03125, + "reward": 0.9847953915596008, + "reward_std": 0.0007756700506433845, + "rewards/perpo_ocr_edit_distance_reward": 0.9847954511642456, + "step": 4096, + "temperature": 0.9 + }, + { + "advantages": -7.27623701095581e-05, + "completion_length": 963.0, + "delta_ref_entropy_loss": 0.039306640625, + "delta_ref_ppl": -0.04248046875, + "entropy_loss": -0.051513671875, + "epoch": 0.8194, + "grad_norm": 0.3188830780680387, + "k1_kl": 0.04248046875, + "k3_kl": 0.0230712890625, + "kimi_kl": 0.046630859375, + "learning_rate": 9.03e-08, + "loss": 0.001, + "ppl": 0.0194091796875, + "reward": 0.9389720559120178, + "reward_std": 0.00048514720401726663, + "rewards/perpo_ocr_edit_distance_reward": 0.9389721155166626, + "step": 4097, + "temperature": 0.9 + }, + { + "advantages": -2.7247838829680404e-07, + "completion_length": 534.0, + "delta_ref_entropy_loss": 0.032958984375, + "delta_ref_ppl": -0.0673828125, + "entropy_loss": -0.07958984375, + "epoch": 0.8196, + "grad_norm": 1.1471884854624916, + "k1_kl": 0.0673828125, + "k3_kl": 0.047607421875, + "kimi_kl": 0.1943359375, + "learning_rate": 9.02e-08, + "loss": 0.0019, + "ppl": 0.0281982421875, + "reward": 0.9243044257164001, + "reward_std": 0.1941831260919571, + "rewards/perpo_ocr_edit_distance_reward": 0.9243044853210449, + "step": 4098, + "temperature": 0.9 + }, + { + "advantages": -2.8865679269074462e-06, + "completion_length": 100.0, + "delta_ref_entropy_loss": 0.11181640625, + "delta_ref_ppl": -0.447265625, + "entropy_loss": -0.234375, + "epoch": 0.8198, + "grad_norm": 3.0918674333122396, + "k1_kl": 0.44921875, + "k3_kl": 0.337890625, + "kimi_kl": 1.28125, + "learning_rate": 9.009999999999999e-08, + "loss": 0.0135, + "ppl": 0.11669921875, + "reward": 0.7774603366851807, + "reward_std": 0.0057989065535366535, + "rewards/perpo_ocr_edit_distance_reward": 0.7774603366851807, + "step": 4099, + "temperature": 0.9 + }, + { + "advantages": -6.503718759631738e-05, + "completion_length": 572.0, + "delta_ref_entropy_loss": 0.033935546875, + "delta_ref_ppl": -0.08203125, + "entropy_loss": -0.06884765625, + "epoch": 0.82, + "grad_norm": 0.26682863393388156, + "k1_kl": 0.08203125, + "k3_kl": 0.05419921875, + "kimi_kl": 0.1748046875, + "learning_rate": 9e-08, + "loss": 0.0022, + "ppl": 0.0224609375, + "reward": 0.9963007569313049, + "reward_std": 0.0004236142267473042, + "rewards/perpo_ocr_edit_distance_reward": 0.9963008165359497, + "step": 4100, + "temperature": 0.9 + }, + { + "advantages": -2.7247838829680404e-07, + "completion_length": 729.0, + "delta_ref_entropy_loss": -0.0228271484375, + "delta_ref_ppl": -0.1064453125, + "entropy_loss": -0.734375, + "epoch": 0.8202, + "grad_norm": 3.769248048200014, + "k1_kl": 0.107421875, + "k3_kl": 0.0908203125, + "kimi_kl": 0.1513671875, + "learning_rate": 8.989999999999999e-08, + "loss": 0.0036, + "ppl": 0.3828125, + "reward": 0.41020405292510986, + "reward_std": 0.016668235883116722, + "rewards/perpo_ocr_edit_distance_reward": 0.41020408272743225, + "step": 4101, + "temperature": 0.9 + }, + { + "advantages": -2.0529543689917773e-05, + "completion_length": 1052.0, + "delta_ref_entropy_loss": 0.046142578125, + "delta_ref_ppl": -0.06884765625, + "entropy_loss": -0.11865234375, + "epoch": 0.8204, + "grad_norm": 0.9095896113923742, + "k1_kl": 0.06884765625, + "k3_kl": 0.03466796875, + "kimi_kl": 0.0673828125, + "learning_rate": 8.98e-08, + "loss": 0.0014, + "ppl": 0.050537109375, + "reward": 0.8477544784545898, + "reward_std": 0.001974266953766346, + "rewards/perpo_ocr_edit_distance_reward": 0.8477545380592346, + "step": 4102, + "temperature": 0.9 + }, + { + "advantages": -0.0001290355430683121, + "completion_length": 706.0, + "delta_ref_entropy_loss": 0.01708984375, + "delta_ref_ppl": -0.031494140625, + "entropy_loss": -0.0390625, + "epoch": 0.8206, + "grad_norm": 0.4880780307994425, + "k1_kl": 0.031494140625, + "k3_kl": 0.02099609375, + "kimi_kl": 0.059814453125, + "learning_rate": 8.97e-08, + "loss": 0.001, + "ppl": 0.01220703125, + "reward": 0.9993433952331543, + "reward_std": 0.0008238849695771933, + "rewards/perpo_ocr_edit_distance_reward": 0.9993435144424438, + "step": 4103, + "temperature": 0.9 + }, + { + "advantages": 1.7396041585016064e-05, + "completion_length": 489.0, + "delta_ref_entropy_loss": 0.05078125, + "delta_ref_ppl": -0.08935546875, + "entropy_loss": -0.08056640625, + "epoch": 0.8208, + "grad_norm": 0.9692474996194661, + "k1_kl": 0.08935546875, + "k3_kl": 0.06494140625, + "kimi_kl": 0.255859375, + "learning_rate": 8.96e-08, + "loss": 0.0026, + "ppl": 0.03125, + "reward": 0.9853819608688354, + "reward_std": 0.0013675958616659045, + "rewards/perpo_ocr_edit_distance_reward": 0.9853819608688354, + "step": 4104, + "temperature": 0.9 + }, + { + "advantages": -4.495893335842993e-06, + "completion_length": 1212.0, + "delta_ref_entropy_loss": 0.03955078125, + "delta_ref_ppl": -0.0615234375, + "entropy_loss": -0.078125, + "epoch": 0.821, + "grad_norm": 101.60488890989235, + "k1_kl": 0.0615234375, + "k3_kl": 0.1884765625, + "kimi_kl": 0.103515625, + "learning_rate": 8.949999999999999e-08, + "loss": 0.0075, + "ppl": 0.0458984375, + "reward": 0.9336275458335876, + "reward_std": 0.016938403248786926, + "rewards/perpo_ocr_edit_distance_reward": 0.9336276054382324, + "step": 4105, + "temperature": 0.9 + }, + { + "advantages": -8.913448982639238e-05, + "completion_length": 363.0, + "delta_ref_entropy_loss": 0.0322265625, + "delta_ref_ppl": -0.06396484375, + "entropy_loss": -0.03955078125, + "epoch": 0.8212, + "grad_norm": 0.29091496068572054, + "k1_kl": 0.0634765625, + "k3_kl": 0.03955078125, + "kimi_kl": 0.1103515625, + "learning_rate": 8.939999999999999e-08, + "loss": 0.0017, + "ppl": 0.01202392578125, + "reward": 0.998190701007843, + "reward_std": 0.00028207668219693005, + "rewards/perpo_ocr_edit_distance_reward": 0.9981907606124878, + "step": 4106, + "temperature": 0.9 + }, + { + "advantages": -7.603850008308655e-06, + "completion_length": 386.0, + "delta_ref_entropy_loss": 0.00017452239990234375, + "delta_ref_ppl": -0.076171875, + "entropy_loss": -0.08447265625, + "epoch": 0.8214, + "grad_norm": 1.0533720778136275, + "k1_kl": 0.07568359375, + "k3_kl": 0.05810546875, + "kimi_kl": 0.2353515625, + "learning_rate": 8.93e-08, + "loss": 0.0023, + "ppl": 0.0390625, + "reward": 0.9806575179100037, + "reward_std": 0.009971720166504383, + "rewards/perpo_ocr_edit_distance_reward": 0.9806576371192932, + "step": 4107, + "temperature": 0.9 + }, + { + "advantages": -9.86031136562815e-06, + "completion_length": 729.0, + "delta_ref_entropy_loss": 0.05322265625, + "delta_ref_ppl": -0.10546875, + "entropy_loss": -0.1669921875, + "epoch": 0.8216, + "grad_norm": 1.2200822674981358, + "k1_kl": 0.10546875, + "k3_kl": 0.0673828125, + "kimi_kl": 0.1611328125, + "learning_rate": 8.919999999999999e-08, + "loss": 0.0027, + "ppl": 0.064453125, + "reward": 0.9450228810310364, + "reward_std": 0.0033561130985617638, + "rewards/perpo_ocr_edit_distance_reward": 0.9450229406356812, + "step": 4108, + "temperature": 0.9 + }, + { + "advantages": 1.4305115882962127e-06, + "completion_length": 2048.0, + "delta_ref_entropy_loss": -0.0145263671875, + "delta_ref_ppl": -0.0213623046875, + "entropy_loss": -0.0625, + "epoch": 0.8218, + "grad_norm": 2.0589820067706177, + "k1_kl": 0.0213623046875, + "k3_kl": 0.02099609375, + "kimi_kl": 0.059326171875, + "learning_rate": 8.91e-08, + "loss": 0.0008, + "ppl": 0.0303955078125, + "reward": 0.777963399887085, + "reward_std": 0.01745322160422802, + "rewards/perpo_ocr_edit_distance_reward": 0.777963399887085, + "step": 4109, + "temperature": 0.9 + }, + { + "advantages": -4.683222414314514e-07, + "completion_length": 397.0, + "delta_ref_entropy_loss": 0.00811767578125, + "delta_ref_ppl": -0.09814453125, + "entropy_loss": -0.1298828125, + "epoch": 0.822, + "grad_norm": 2.141165536937504, + "k1_kl": 0.09814453125, + "k3_kl": 0.07958984375, + "kimi_kl": 0.2314453125, + "learning_rate": 8.899999999999999e-08, + "loss": 0.0032, + "ppl": 0.04736328125, + "reward": 0.8396959900856018, + "reward_std": 0.17732654511928558, + "rewards/perpo_ocr_edit_distance_reward": 0.8396961092948914, + "step": 4110, + "temperature": 0.9 + }, + { + "advantages": 1.5505724149988964e-05, + "completion_length": 921.0, + "delta_ref_entropy_loss": -0.0004177093505859375, + "delta_ref_ppl": -0.0517578125, + "entropy_loss": -0.12890625, + "epoch": 0.8222, + "grad_norm": 0.8914008526078668, + "k1_kl": 0.0517578125, + "k3_kl": 0.036865234375, + "kimi_kl": 0.119140625, + "learning_rate": 8.890000000000001e-08, + "loss": 0.0015, + "ppl": 0.041015625, + "reward": 0.9618452787399292, + "reward_std": 0.003193498821929097, + "rewards/perpo_ocr_edit_distance_reward": 0.9618452787399292, + "step": 4111, + "temperature": 0.9 + }, + { + "advantages": -8.744853403186426e-06, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.0181884765625, + "delta_ref_ppl": -0.05615234375, + "entropy_loss": -0.134765625, + "epoch": 0.8224, + "grad_norm": 3.4365894997376674, + "k1_kl": 0.05615234375, + "k3_kl": 0.043212890625, + "kimi_kl": 0.08984375, + "learning_rate": 8.88e-08, + "loss": 0.0017, + "ppl": 0.0673828125, + "reward": 0.9648442268371582, + "reward_std": 0.003796208882704377, + "rewards/perpo_ocr_edit_distance_reward": 0.9648442268371582, + "step": 4112, + "temperature": 0.9 + }, + { + "advantages": -1.3794218830298632e-05, + "completion_length": 532.0, + "delta_ref_entropy_loss": 0.0133056640625, + "delta_ref_ppl": -0.035400390625, + "entropy_loss": -0.036376953125, + "epoch": 0.8226, + "grad_norm": 0.1519104379813421, + "k1_kl": 0.03564453125, + "k3_kl": 0.0224609375, + "kimi_kl": 0.07568359375, + "learning_rate": 8.87e-08, + "loss": 0.0009, + "ppl": 0.010498046875, + "reward": 0.9956093430519104, + "reward_std": 0.0005172864766791463, + "rewards/perpo_ocr_edit_distance_reward": 0.9956093430519104, + "step": 4113, + "temperature": 0.9 + }, + { + "advantages": -2.8993403248023242e-05, + "completion_length": 405.0, + "delta_ref_entropy_loss": 0.036376953125, + "delta_ref_ppl": -0.07568359375, + "entropy_loss": -0.06298828125, + "epoch": 0.8228, + "grad_norm": 1.0279801062580034, + "k1_kl": 0.07568359375, + "k3_kl": 0.05029296875, + "kimi_kl": 0.126953125, + "learning_rate": 8.86e-08, + "loss": 0.002, + "ppl": 0.0208740234375, + "reward": 0.9974808692932129, + "reward_std": 0.0019571285229176283, + "rewards/perpo_ocr_edit_distance_reward": 0.9974809288978577, + "step": 4114, + "temperature": 0.9 + }, + { + "advantages": -8.962836000137031e-05, + "completion_length": 561.0, + "delta_ref_entropy_loss": 0.04296875, + "delta_ref_ppl": -0.08349609375, + "entropy_loss": -0.08349609375, + "epoch": 0.823, + "grad_norm": 0.5820673044339195, + "k1_kl": 0.0830078125, + "k3_kl": 0.05029296875, + "kimi_kl": 0.1357421875, + "learning_rate": 8.849999999999999e-08, + "loss": 0.0021, + "ppl": 0.033935546875, + "reward": 0.9878873825073242, + "reward_std": 0.0008500411058776081, + "rewards/perpo_ocr_edit_distance_reward": 0.9878875017166138, + "step": 4115, + "temperature": 0.9 + }, + { + "advantages": -4.188503589830361e-05, + "completion_length": 1090.0, + "delta_ref_entropy_loss": 0.026123046875, + "delta_ref_ppl": -0.032470703125, + "entropy_loss": -0.06494140625, + "epoch": 0.8232, + "grad_norm": 1.2328858821208848, + "k1_kl": 0.032470703125, + "k3_kl": 0.01904296875, + "kimi_kl": 0.0380859375, + "learning_rate": 8.84e-08, + "loss": 0.0008, + "ppl": 0.029541015625, + "reward": 0.9956741333007812, + "reward_std": 0.0009164040675386786, + "rewards/perpo_ocr_edit_distance_reward": 0.995674192905426, + "step": 4116, + "temperature": 0.9 + }, + { + "advantages": -2.8763499358319677e-05, + "completion_length": 914.0, + "delta_ref_entropy_loss": 0.0189208984375, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.126953125, + "epoch": 0.8234, + "grad_norm": 1.2728677713563148, + "k1_kl": 0.0703125, + "k3_kl": 0.050048828125, + "kimi_kl": 0.126953125, + "learning_rate": 8.83e-08, + "loss": 0.002, + "ppl": 0.06005859375, + "reward": 0.977515697479248, + "reward_std": 0.001675666542723775, + "rewards/perpo_ocr_edit_distance_reward": 0.977515697479248, + "step": 4117, + "temperature": 0.9 + }, + { + "advantages": -2.067004061245825e-05, + "completion_length": 700.0, + "delta_ref_entropy_loss": 0.031005859375, + "delta_ref_ppl": -0.052490234375, + "entropy_loss": -0.06494140625, + "epoch": 0.8236, + "grad_norm": 0.42715738990163715, + "k1_kl": 0.052490234375, + "k3_kl": 0.0291748046875, + "kimi_kl": 0.07666015625, + "learning_rate": 8.82e-08, + "loss": 0.0012, + "ppl": 0.025634765625, + "reward": 0.997074544429779, + "reward_std": 0.0007240746053867042, + "rewards/perpo_ocr_edit_distance_reward": 0.9970746040344238, + "step": 4118, + "temperature": 0.9 + }, + { + "advantages": -8.02108297648374e-06, + "completion_length": 213.0, + "delta_ref_entropy_loss": 0.025634765625, + "delta_ref_ppl": -0.1865234375, + "entropy_loss": -0.0732421875, + "epoch": 0.8238, + "grad_norm": 1.7846595687740883, + "k1_kl": 0.1865234375, + "k3_kl": 0.150390625, + "kimi_kl": 1.046875, + "learning_rate": 8.809999999999999e-08, + "loss": 0.006, + "ppl": 0.02978515625, + "reward": 0.9905337691307068, + "reward_std": 0.005218037404119968, + "rewards/perpo_ocr_edit_distance_reward": 0.9905338883399963, + "step": 4119, + "temperature": 0.9 + }, + { + "advantages": -9.785380098037422e-05, + "completion_length": 728.0, + "delta_ref_entropy_loss": 0.0283203125, + "delta_ref_ppl": -0.052734375, + "entropy_loss": -0.07861328125, + "epoch": 0.824, + "grad_norm": 0.6395850990035958, + "k1_kl": 0.052734375, + "k3_kl": 0.034912109375, + "kimi_kl": 0.10400390625, + "learning_rate": 8.8e-08, + "loss": 0.0015, + "ppl": 0.030517578125, + "reward": 0.9953145980834961, + "reward_std": 0.0010312277590855956, + "rewards/perpo_ocr_edit_distance_reward": 0.9953147768974304, + "step": 4120, + "temperature": 0.9 + }, + { + "advantages": -8.600950968684629e-05, + "completion_length": 397.0, + "delta_ref_entropy_loss": 0.033447265625, + "delta_ref_ppl": -0.068359375, + "entropy_loss": -0.035888671875, + "epoch": 0.8242, + "grad_norm": 0.22457071117786068, + "k1_kl": 0.068359375, + "k3_kl": 0.0576171875, + "kimi_kl": 0.1640625, + "learning_rate": 8.79e-08, + "loss": 0.0024, + "ppl": 0.011962890625, + "reward": 0.9993149042129517, + "reward_std": 0.0005930201732553542, + "rewards/perpo_ocr_edit_distance_reward": 0.9993149638175964, + "step": 4121, + "temperature": 0.9 + }, + { + "advantages": -6.139278866612585e-06, + "completion_length": 2048.0, + "delta_ref_entropy_loss": -0.0123291015625, + "delta_ref_ppl": -0.0145263671875, + "entropy_loss": -0.083984375, + "epoch": 0.8244, + "grad_norm": 6.574011520659767, + "k1_kl": 0.01446533203125, + "k3_kl": 0.045166015625, + "kimi_kl": 0.052734375, + "learning_rate": 8.78e-08, + "loss": 0.0018, + "ppl": 0.046142578125, + "reward": 0.8087238073348999, + "reward_std": 0.006838505156338215, + "rewards/perpo_ocr_edit_distance_reward": 0.8087238669395447, + "step": 4122, + "temperature": 0.9 + }, + { + "advantages": -2.9614995582960546e-05, + "completion_length": 951.0, + "delta_ref_entropy_loss": 0.002197265625, + "delta_ref_ppl": -0.0296630859375, + "entropy_loss": -0.0458984375, + "epoch": 0.8246, + "grad_norm": 0.374577672145653, + "k1_kl": 0.0296630859375, + "k3_kl": 0.0189208984375, + "kimi_kl": 0.042236328125, + "learning_rate": 8.77e-08, + "loss": 0.0008, + "ppl": 0.0185546875, + "reward": 0.9951223134994507, + "reward_std": 0.004211835563182831, + "rewards/perpo_ocr_edit_distance_reward": 0.9951224327087402, + "step": 4123, + "temperature": 0.9 + }, + { + "advantages": -2.55448497910038e-08, + "completion_length": 1132.0, + "delta_ref_entropy_loss": 0.0322265625, + "delta_ref_ppl": -0.058349609375, + "entropy_loss": -0.08349609375, + "epoch": 0.8248, + "grad_norm": 0.8791156257530826, + "k1_kl": 0.058349609375, + "k3_kl": 0.041015625, + "kimi_kl": 0.09716796875, + "learning_rate": 8.759999999999999e-08, + "loss": 0.0016, + "ppl": 0.038818359375, + "reward": 0.9630600810050964, + "reward_std": 0.0019241084810346365, + "rewards/perpo_ocr_edit_distance_reward": 0.9630601406097412, + "step": 4124, + "temperature": 0.9 + }, + { + "advantages": -6.479876446974231e-06, + "completion_length": 611.0, + "delta_ref_entropy_loss": -0.01422119140625, + "delta_ref_ppl": -0.0751953125, + "entropy_loss": -0.1005859375, + "epoch": 0.825, + "grad_norm": 0.9797631655008217, + "k1_kl": 0.0751953125, + "k3_kl": 0.056884765625, + "kimi_kl": 0.197265625, + "learning_rate": 8.75e-08, + "loss": 0.0023, + "ppl": 0.036865234375, + "reward": 0.9468570351600647, + "reward_std": 0.009079139679670334, + "rewards/perpo_ocr_edit_distance_reward": 0.9468570947647095, + "step": 4125, + "temperature": 0.9 + }, + { + "advantages": 4.002026230409683e-07, + "completion_length": 754.0, + "delta_ref_entropy_loss": -0.058349609375, + "delta_ref_ppl": -0.0673828125, + "entropy_loss": -0.228515625, + "epoch": 0.8252, + "grad_norm": 2.7663187817604347, + "k1_kl": 0.06787109375, + "k3_kl": 0.058349609375, + "kimi_kl": 0.146484375, + "learning_rate": 8.74e-08, + "loss": 0.0023, + "ppl": 0.08251953125, + "reward": 0.5944317579269409, + "reward_std": 0.02158292569220066, + "rewards/perpo_ocr_edit_distance_reward": 0.5944317579269409, + "step": 4126, + "temperature": 0.9 + }, + { + "advantages": -1.1018344594049267e-05, + "completion_length": 616.0, + "delta_ref_entropy_loss": 0.06689453125, + "delta_ref_ppl": -0.09423828125, + "entropy_loss": -0.1982421875, + "epoch": 0.8254, + "grad_norm": 0.9382107211085698, + "k1_kl": 0.09375, + "k3_kl": 0.06201171875, + "kimi_kl": 0.171875, + "learning_rate": 8.73e-08, + "loss": 0.0025, + "ppl": 0.09228515625, + "reward": 0.9629480838775635, + "reward_std": 0.0014459602534770966, + "rewards/perpo_ocr_edit_distance_reward": 0.9629480838775635, + "step": 4127, + "temperature": 0.9 + }, + { + "advantages": 7.0652795329806395e-06, + "completion_length": 230.0, + "delta_ref_entropy_loss": 0.007049560546875, + "delta_ref_ppl": -0.1943359375, + "entropy_loss": -0.1865234375, + "epoch": 0.8256, + "grad_norm": 2.755440748230174, + "k1_kl": 0.193359375, + "k3_kl": 0.1435546875, + "kimi_kl": 0.6484375, + "learning_rate": 8.72e-08, + "loss": 0.0057, + "ppl": 0.060302734375, + "reward": 0.7834410071372986, + "reward_std": 0.0023117931559681892, + "rewards/perpo_ocr_edit_distance_reward": 0.7834410071372986, + "step": 4128, + "temperature": 0.9 + }, + { + "advantages": 2.7707646950148046e-05, + "completion_length": 1067.0, + "delta_ref_entropy_loss": 0.01458740234375, + "delta_ref_ppl": -0.033447265625, + "entropy_loss": -0.04296875, + "epoch": 0.8258, + "grad_norm": 0.4148769633809789, + "k1_kl": 0.033203125, + "k3_kl": 0.021728515625, + "kimi_kl": 0.0771484375, + "learning_rate": 8.709999999999999e-08, + "loss": 0.0008, + "ppl": 0.017822265625, + "reward": 0.9961303472518921, + "reward_std": 0.0005144643364474177, + "rewards/perpo_ocr_edit_distance_reward": 0.9961303472518921, + "step": 4129, + "temperature": 0.9 + }, + { + "advantages": -3.116471589237335e-06, + "completion_length": 843.0, + "delta_ref_entropy_loss": 0.04931640625, + "delta_ref_ppl": -0.11474609375, + "entropy_loss": -0.46484375, + "epoch": 0.826, + "grad_norm": 2.327467671041977, + "k1_kl": 0.1142578125, + "k3_kl": 0.07763671875, + "kimi_kl": 0.142578125, + "learning_rate": 8.699999999999998e-08, + "loss": 0.0031, + "ppl": 0.236328125, + "reward": 0.630219042301178, + "reward_std": 0.027316315099596977, + "rewards/perpo_ocr_edit_distance_reward": 0.6302191019058228, + "step": 4130, + "temperature": 0.9 + }, + { + "advantages": -1.2482915735745337e-05, + "completion_length": 1020.0, + "delta_ref_entropy_loss": 0.0301513671875, + "delta_ref_ppl": -0.053466796875, + "entropy_loss": -0.059814453125, + "epoch": 0.8262, + "grad_norm": 1.174699252943334, + "k1_kl": 0.05322265625, + "k3_kl": 0.031005859375, + "kimi_kl": 0.0654296875, + "learning_rate": 8.69e-08, + "loss": 0.0012, + "ppl": 0.0233154296875, + "reward": 0.9778429865837097, + "reward_std": 0.0005830155569128692, + "rewards/perpo_ocr_edit_distance_reward": 0.9778430461883545, + "step": 4131, + "temperature": 0.9 + }, + { + "advantages": -0.00014265946811065078, + "completion_length": 573.0, + "delta_ref_entropy_loss": 0.0284423828125, + "delta_ref_ppl": -0.041015625, + "entropy_loss": -0.037109375, + "epoch": 0.8264, + "grad_norm": 0.33408388452439025, + "k1_kl": 0.041015625, + "k3_kl": 0.0252685546875, + "kimi_kl": 0.08935546875, + "learning_rate": 8.68e-08, + "loss": 0.0012, + "ppl": 0.0118408203125, + "reward": 0.998866617679596, + "reward_std": 0.00037754265940748155, + "rewards/perpo_ocr_edit_distance_reward": 0.9988666772842407, + "step": 4132, + "temperature": 0.9 + }, + { + "advantages": -6.079674221837195e-06, + "completion_length": 603.0, + "delta_ref_entropy_loss": -0.0703125, + "delta_ref_ppl": -0.053466796875, + "entropy_loss": -0.25390625, + "epoch": 0.8266, + "grad_norm": 3.744772130240478, + "k1_kl": 0.0537109375, + "k3_kl": 0.046142578125, + "kimi_kl": 0.11181640625, + "learning_rate": 8.669999999999999e-08, + "loss": 0.0019, + "ppl": 0.09912109375, + "reward": 0.7638230323791504, + "reward_std": 0.015308043919503689, + "rewards/perpo_ocr_edit_distance_reward": 0.7638231515884399, + "step": 4133, + "temperature": 0.9 + }, + { + "advantages": -2.8099334485887084e-06, + "completion_length": 870.0, + "delta_ref_entropy_loss": -0.07080078125, + "delta_ref_ppl": -0.059326171875, + "entropy_loss": -0.484375, + "epoch": 0.8268, + "grad_norm": 3.353720490155109, + "k1_kl": 0.05908203125, + "k3_kl": 0.05615234375, + "kimi_kl": 0.09912109375, + "learning_rate": 8.66e-08, + "loss": 0.0022, + "ppl": 0.2216796875, + "reward": 0.9151486754417419, + "reward_std": 0.03916814178228378, + "rewards/perpo_ocr_edit_distance_reward": 0.9151487946510315, + "step": 4134, + "temperature": 0.9 + }, + { + "advantages": -1.621246337890625e-05, + "completion_length": 586.0, + "delta_ref_entropy_loss": 0.05810546875, + "delta_ref_ppl": -0.09228515625, + "entropy_loss": -0.138671875, + "epoch": 0.827, + "grad_norm": 1.4930881478249591, + "k1_kl": 0.09228515625, + "k3_kl": 0.056884765625, + "kimi_kl": 0.1376953125, + "learning_rate": 8.649999999999999e-08, + "loss": 0.0023, + "ppl": 0.060302734375, + "reward": 0.97564697265625, + "reward_std": 0.0030515077523887157, + "rewards/perpo_ocr_edit_distance_reward": 0.9756470322608948, + "step": 4135, + "temperature": 0.9 + }, + { + "advantages": -1.5871866708039306e-05, + "completion_length": 192.0, + "delta_ref_entropy_loss": 0.0712890625, + "delta_ref_ppl": -0.2041015625, + "entropy_loss": -0.09716796875, + "epoch": 0.8272, + "grad_norm": 1.7532768306857514, + "k1_kl": 0.2041015625, + "k3_kl": 0.142578125, + "kimi_kl": 0.5078125, + "learning_rate": 8.64e-08, + "loss": 0.0057, + "ppl": 0.040283203125, + "reward": 0.9725688695907593, + "reward_std": 0.0015073016984388232, + "rewards/perpo_ocr_edit_distance_reward": 0.9725689888000488, + "step": 4136, + "temperature": 0.9 + }, + { + "advantages": -6.798336107749492e-05, + "completion_length": 608.0, + "delta_ref_entropy_loss": 0.03759765625, + "delta_ref_ppl": -0.0771484375, + "entropy_loss": -0.11376953125, + "epoch": 0.8274, + "grad_norm": 0.5680606635121825, + "k1_kl": 0.0771484375, + "k3_kl": 0.05224609375, + "kimi_kl": 0.1845703125, + "learning_rate": 8.63e-08, + "loss": 0.0022, + "ppl": 0.04248046875, + "reward": 0.8553267121315002, + "reward_std": 0.0009021682199090719, + "rewards/perpo_ocr_edit_distance_reward": 0.8553268313407898, + "step": 4137, + "temperature": 0.9 + }, + { + "advantages": -4.938671054333099e-07, + "completion_length": 148.0, + "delta_ref_entropy_loss": -0.5078125, + "delta_ref_ppl": -0.208984375, + "entropy_loss": -0.953125, + "epoch": 0.8276, + "grad_norm": 9.44935477131127, + "k1_kl": 0.208984375, + "k3_kl": 0.234375, + "kimi_kl": 0.73828125, + "learning_rate": 8.619999999999999e-08, + "loss": 0.0094, + "ppl": 0.359375, + "reward": 0.342355877161026, + "reward_std": 0.034844595938920975, + "rewards/perpo_ocr_edit_distance_reward": 0.3423559069633484, + "step": 4138, + "temperature": 0.9 + }, + { + "advantages": -1.614434404473286e-05, + "completion_length": 573.0, + "delta_ref_entropy_loss": 0.03515625, + "delta_ref_ppl": -0.076171875, + "entropy_loss": -0.11865234375, + "epoch": 0.8278, + "grad_norm": 1.03919354978337, + "k1_kl": 0.076171875, + "k3_kl": 0.05419921875, + "kimi_kl": 0.140625, + "learning_rate": 8.61e-08, + "loss": 0.0022, + "ppl": 0.053466796875, + "reward": 0.9474021792411804, + "reward_std": 0.004116086754947901, + "rewards/perpo_ocr_edit_distance_reward": 0.94740229845047, + "step": 4139, + "temperature": 0.9 + }, + { + "advantages": -3.9322036172961816e-05, + "completion_length": 1283.0, + "delta_ref_entropy_loss": 0.0303955078125, + "delta_ref_ppl": -0.041748046875, + "entropy_loss": -0.04345703125, + "epoch": 0.828, + "grad_norm": 0.6180626448090634, + "k1_kl": 0.041748046875, + "k3_kl": 0.023193359375, + "kimi_kl": 0.051025390625, + "learning_rate": 8.599999999999999e-08, + "loss": 0.001, + "ppl": 0.01422119140625, + "reward": 0.9961515665054321, + "reward_std": 0.0007664475124329329, + "rewards/perpo_ocr_edit_distance_reward": 0.9961516261100769, + "step": 4140, + "temperature": 0.9 + }, + { + "advantages": -5.5968765082070604e-05, + "completion_length": 676.0, + "delta_ref_entropy_loss": 0.02099609375, + "delta_ref_ppl": -0.054931640625, + "entropy_loss": -0.045654296875, + "epoch": 0.8282, + "grad_norm": 0.4767897375256936, + "k1_kl": 0.05517578125, + "k3_kl": 0.034912109375, + "kimi_kl": 0.10400390625, + "learning_rate": 8.59e-08, + "loss": 0.0015, + "ppl": 0.0155029296875, + "reward": 0.9788485169410706, + "reward_std": 0.0009649644489400089, + "rewards/perpo_ocr_edit_distance_reward": 0.9788485765457153, + "step": 4141, + "temperature": 0.9 + }, + { + "advantages": -1.036695084621897e-05, + "completion_length": 962.0, + "delta_ref_entropy_loss": -0.034912109375, + "delta_ref_ppl": -0.08251953125, + "entropy_loss": -0.5546875, + "epoch": 0.8284, + "grad_norm": 3.432280315740897, + "k1_kl": 0.0830078125, + "k3_kl": 0.06982421875, + "kimi_kl": 0.1474609375, + "learning_rate": 8.58e-08, + "loss": 0.0028, + "ppl": 0.302734375, + "reward": 0.6582775115966797, + "reward_std": 0.006469514220952988, + "rewards/perpo_ocr_edit_distance_reward": 0.658277690410614, + "step": 4142, + "temperature": 0.9 + }, + { + "advantages": -3.6171506508253515e-05, + "completion_length": 485.0, + "delta_ref_entropy_loss": 0.00811767578125, + "delta_ref_ppl": -0.05908203125, + "entropy_loss": -0.058349609375, + "epoch": 0.8286, + "grad_norm": 1.287355271705872, + "k1_kl": 0.05908203125, + "k3_kl": 0.04931640625, + "kimi_kl": 0.15234375, + "learning_rate": 8.569999999999999e-08, + "loss": 0.002, + "ppl": 0.025146484375, + "reward": 0.9903897643089294, + "reward_std": 0.0022562986705452204, + "rewards/perpo_ocr_edit_distance_reward": 0.9903898239135742, + "step": 4143, + "temperature": 0.9 + }, + { + "advantages": -5.373784733819775e-05, + "completion_length": 547.0, + "delta_ref_entropy_loss": 0.04345703125, + "delta_ref_ppl": -0.107421875, + "entropy_loss": -0.07470703125, + "epoch": 0.8288, + "grad_norm": 0.7995492489141897, + "k1_kl": 0.107421875, + "k3_kl": 0.072265625, + "kimi_kl": 0.29296875, + "learning_rate": 8.559999999999999e-08, + "loss": 0.0029, + "ppl": 0.0260009765625, + "reward": 0.991583526134491, + "reward_std": 0.0011678229784592986, + "rewards/perpo_ocr_edit_distance_reward": 0.9915835857391357, + "step": 4144, + "temperature": 0.9 + }, + { + "advantages": -0.00011486667062854394, + "completion_length": 778.0, + "delta_ref_entropy_loss": 0.035400390625, + "delta_ref_ppl": -0.04931640625, + "entropy_loss": -0.042724609375, + "epoch": 0.829, + "grad_norm": 0.34057750418996996, + "k1_kl": 0.04931640625, + "k3_kl": 0.0296630859375, + "kimi_kl": 0.07958984375, + "learning_rate": 8.55e-08, + "loss": 0.0013, + "ppl": 0.0146484375, + "reward": 0.994667649269104, + "reward_std": 0.000641251215711236, + "rewards/perpo_ocr_edit_distance_reward": 0.9946677684783936, + "step": 4145, + "temperature": 0.9 + }, + { + "advantages": -4.427773774295929e-07, + "completion_length": 1114.0, + "delta_ref_entropy_loss": -0.041015625, + "delta_ref_ppl": -0.0390625, + "entropy_loss": -0.12158203125, + "epoch": 0.8292, + "grad_norm": 0.8225075154886818, + "k1_kl": 0.039306640625, + "k3_kl": 0.035400390625, + "kimi_kl": 0.10546875, + "learning_rate": 8.54e-08, + "loss": 0.0014, + "ppl": 0.04052734375, + "reward": 0.9737232327461243, + "reward_std": 0.019783388823270798, + "rewards/perpo_ocr_edit_distance_reward": 0.973723292350769, + "step": 4146, + "temperature": 0.9 + }, + { + "advantages": -2.946172571682837e-06, + "completion_length": 701.0, + "delta_ref_entropy_loss": -0.03759765625, + "delta_ref_ppl": -0.0625, + "entropy_loss": -0.224609375, + "epoch": 0.8294, + "grad_norm": 2.205682876874466, + "k1_kl": 0.0625, + "k3_kl": 0.062255859375, + "kimi_kl": 0.1298828125, + "learning_rate": 8.53e-08, + "loss": 0.0025, + "ppl": 0.09375, + "reward": 0.9594004154205322, + "reward_std": 0.020176608115434647, + "rewards/perpo_ocr_edit_distance_reward": 0.959400475025177, + "step": 4147, + "temperature": 0.9 + }, + { + "advantages": -6.840910646133125e-05, + "completion_length": 680.0, + "delta_ref_entropy_loss": 0.00494384765625, + "delta_ref_ppl": -0.032470703125, + "entropy_loss": -0.027099609375, + "epoch": 0.8296, + "grad_norm": 0.2317256828217129, + "k1_kl": 0.03271484375, + "k3_kl": 0.0238037109375, + "kimi_kl": 0.0751953125, + "learning_rate": 8.52e-08, + "loss": 0.001, + "ppl": 0.006866455078125, + "reward": 0.9937434792518616, + "reward_std": 0.000771413033362478, + "rewards/perpo_ocr_edit_distance_reward": 0.9937435388565063, + "step": 4148, + "temperature": 0.9 + }, + { + "advantages": -4.07184888899792e-05, + "completion_length": 646.0, + "delta_ref_entropy_loss": 0.0279541015625, + "delta_ref_ppl": -0.07861328125, + "entropy_loss": -0.042724609375, + "epoch": 0.8298, + "grad_norm": 0.5041640741683702, + "k1_kl": 0.07861328125, + "k3_kl": 0.050048828125, + "kimi_kl": 0.138671875, + "learning_rate": 8.509999999999999e-08, + "loss": 0.002, + "ppl": 0.01519775390625, + "reward": 0.9935016632080078, + "reward_std": 0.0017814598977565765, + "rewards/perpo_ocr_edit_distance_reward": 0.9935017824172974, + "step": 4149, + "temperature": 0.9 + }, + { + "advantages": 6.2499730120180175e-06, + "completion_length": 794.0, + "delta_ref_entropy_loss": 0.032470703125, + "delta_ref_ppl": -0.07470703125, + "entropy_loss": -0.10205078125, + "epoch": 0.83, + "grad_norm": 0.7839262870589264, + "k1_kl": 0.07470703125, + "k3_kl": 0.043701171875, + "kimi_kl": 0.1064453125, + "learning_rate": 8.500000000000001e-08, + "loss": 0.0017, + "ppl": 0.038818359375, + "reward": 0.9825257062911987, + "reward_std": 0.005352749489247799, + "rewards/perpo_ocr_edit_distance_reward": 0.9825257062911987, + "step": 4150, + "temperature": 0.9 + }, + { + "advantages": -8.617129424237646e-06, + "completion_length": 1342.0, + "delta_ref_entropy_loss": 0.0196533203125, + "delta_ref_ppl": -0.043212890625, + "entropy_loss": -0.08154296875, + "epoch": 0.8302, + "grad_norm": 5.100928368823059, + "k1_kl": 0.043212890625, + "k3_kl": 0.15625, + "kimi_kl": 0.080078125, + "learning_rate": 8.49e-08, + "loss": 0.0063, + "ppl": 0.04541015625, + "reward": 0.8012266755104065, + "reward_std": 0.003851447021588683, + "rewards/perpo_ocr_edit_distance_reward": 0.801226794719696, + "step": 4151, + "temperature": 0.9 + }, + { + "advantages": -7.1099830165621825e-06, + "completion_length": 718.0, + "delta_ref_entropy_loss": 0.038330078125, + "delta_ref_ppl": -0.0869140625, + "entropy_loss": -0.0673828125, + "epoch": 0.8304, + "grad_norm": 0.5174457222375161, + "k1_kl": 0.08642578125, + "k3_kl": 0.056884765625, + "kimi_kl": 0.1796875, + "learning_rate": 8.479999999999999e-08, + "loss": 0.0023, + "ppl": 0.026611328125, + "reward": 0.975875198841095, + "reward_std": 0.0010980811202898622, + "rewards/perpo_ocr_edit_distance_reward": 0.975875198841095, + "step": 4152, + "temperature": 0.9 + }, + { + "advantages": 7.918903065728955e-06, + "completion_length": 764.0, + "delta_ref_entropy_loss": 0.03857421875, + "delta_ref_ppl": -0.059814453125, + "entropy_loss": -0.050048828125, + "epoch": 0.8306, + "grad_norm": 1.094623174375487, + "k1_kl": 0.059814453125, + "k3_kl": 0.0380859375, + "kimi_kl": 0.09228515625, + "learning_rate": 8.47e-08, + "loss": 0.0015, + "ppl": 0.0206298828125, + "reward": 0.8371427655220032, + "reward_std": 0.0009753912454470992, + "rewards/perpo_ocr_edit_distance_reward": 0.837142825126648, + "step": 4153, + "temperature": 0.9 + }, + { + "advantages": -5.670956397807458e-06, + "completion_length": 280.0, + "delta_ref_entropy_loss": 0.056396484375, + "delta_ref_ppl": -0.1279296875, + "entropy_loss": -0.138671875, + "epoch": 0.8308, + "grad_norm": 1.21706620787313, + "k1_kl": 0.1279296875, + "k3_kl": 0.0888671875, + "kimi_kl": 0.267578125, + "learning_rate": 8.459999999999999e-08, + "loss": 0.0036, + "ppl": 0.044921875, + "reward": 0.9676166772842407, + "reward_std": 0.0043987976387143135, + "rewards/perpo_ocr_edit_distance_reward": 0.9676167368888855, + "step": 4154, + "temperature": 0.9 + }, + { + "advantages": -3.8164005673024803e-05, + "completion_length": 1254.0, + "delta_ref_entropy_loss": 0.0169677734375, + "delta_ref_ppl": -0.03125, + "entropy_loss": -0.04638671875, + "epoch": 0.831, + "grad_norm": 0.6739784051868408, + "k1_kl": 0.031494140625, + "k3_kl": 0.0201416015625, + "kimi_kl": 0.042724609375, + "learning_rate": 8.45e-08, + "loss": 0.0008, + "ppl": 0.0203857421875, + "reward": 0.9963639378547668, + "reward_std": 0.003023367142304778, + "rewards/perpo_ocr_edit_distance_reward": 0.9963641166687012, + "step": 4155, + "temperature": 0.9 + }, + { + "advantages": -5.7033132179640234e-05, + "completion_length": 843.0, + "delta_ref_entropy_loss": 0.0147705078125, + "delta_ref_ppl": -0.044677734375, + "entropy_loss": -0.0498046875, + "epoch": 0.8312, + "grad_norm": 0.4402133490951838, + "k1_kl": 0.044677734375, + "k3_kl": 0.031005859375, + "kimi_kl": 0.0869140625, + "learning_rate": 8.44e-08, + "loss": 0.0013, + "ppl": 0.0216064453125, + "reward": 0.9942857027053833, + "reward_std": 0.0004973242757841945, + "rewards/perpo_ocr_edit_distance_reward": 0.9942857623100281, + "step": 4156, + "temperature": 0.9 + }, + { + "advantages": -3.4008709917543456e-05, + "completion_length": 209.0, + "delta_ref_entropy_loss": 0.0220947265625, + "delta_ref_ppl": -0.130859375, + "entropy_loss": -0.09326171875, + "epoch": 0.8314, + "grad_norm": 1.6077980825875426, + "k1_kl": 0.130859375, + "k3_kl": 0.09814453125, + "kimi_kl": 0.37890625, + "learning_rate": 8.43e-08, + "loss": 0.004, + "ppl": 0.041748046875, + "reward": 0.9936956167221069, + "reward_std": 0.0019043528009206057, + "rewards/perpo_ocr_edit_distance_reward": 0.9936957359313965, + "step": 4157, + "temperature": 0.9 + }, + { + "advantages": 5.687986231350806e-06, + "completion_length": 506.0, + "delta_ref_entropy_loss": 0.056640625, + "delta_ref_ppl": -0.11181640625, + "entropy_loss": -0.26171875, + "epoch": 0.8316, + "grad_norm": 1.2567895443115975, + "k1_kl": 0.11181640625, + "k3_kl": 0.07763671875, + "kimi_kl": 0.173828125, + "learning_rate": 8.42e-08, + "loss": 0.0031, + "ppl": 0.10546875, + "reward": 0.9210118651390076, + "reward_std": 0.004394198767840862, + "rewards/perpo_ocr_edit_distance_reward": 0.9210118055343628, + "step": 4158, + "temperature": 0.9 + }, + { + "advantages": -1.507146043877583e-05, + "completion_length": 486.0, + "delta_ref_entropy_loss": 0.0186767578125, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.036376953125, + "epoch": 0.8318, + "grad_norm": 0.5748326459878598, + "k1_kl": 0.0703125, + "k3_kl": 0.054443359375, + "kimi_kl": 0.236328125, + "learning_rate": 8.409999999999999e-08, + "loss": 0.0022, + "ppl": 0.01104736328125, + "reward": 0.9818340539932251, + "reward_std": 0.0021605887450277805, + "rewards/perpo_ocr_edit_distance_reward": 0.9818340539932251, + "step": 4159, + "temperature": 0.9 + }, + { + "advantages": 8.278660061478149e-06, + "completion_length": 719.0, + "delta_ref_entropy_loss": 0.016357421875, + "delta_ref_ppl": -0.06396484375, + "entropy_loss": -0.061767578125, + "epoch": 0.832, + "grad_norm": 0.7077954943988509, + "k1_kl": 0.06396484375, + "k3_kl": 0.046875, + "kimi_kl": 0.1435546875, + "learning_rate": 8.4e-08, + "loss": 0.0019, + "ppl": 0.0255126953125, + "reward": 0.9732383489608765, + "reward_std": 0.0009299929370172322, + "rewards/perpo_ocr_edit_distance_reward": 0.9732383489608765, + "step": 4160, + "temperature": 0.9 + }, + { + "advantages": -5.406993182077713e-07, + "completion_length": 237.0, + "delta_ref_entropy_loss": -0.046875, + "delta_ref_ppl": -0.158203125, + "entropy_loss": -0.2470703125, + "epoch": 0.8322, + "grad_norm": 2.035893686439828, + "k1_kl": 0.1591796875, + "k3_kl": 0.1318359375, + "kimi_kl": 0.5, + "learning_rate": 8.39e-08, + "loss": 0.0053, + "ppl": 0.08935546875, + "reward": 0.836029052734375, + "reward_std": 0.015996892005205154, + "rewards/perpo_ocr_edit_distance_reward": 0.8360289931297302, + "step": 4161, + "temperature": 0.9 + }, + { + "advantages": -6.130764086265117e-05, + "completion_length": 350.0, + "delta_ref_entropy_loss": 0.06787109375, + "delta_ref_ppl": -0.11083984375, + "entropy_loss": -0.068359375, + "epoch": 0.8324, + "grad_norm": 1.048948871589295, + "k1_kl": 0.11083984375, + "k3_kl": 0.06640625, + "kimi_kl": 0.1796875, + "learning_rate": 8.38e-08, + "loss": 0.0027, + "ppl": 0.0262451171875, + "reward": 0.9496061205863953, + "reward_std": 0.0008720847545191646, + "rewards/perpo_ocr_edit_distance_reward": 0.9496061205863953, + "step": 4162, + "temperature": 0.9 + }, + { + "advantages": -5.2213672461220995e-05, + "completion_length": 724.0, + "delta_ref_entropy_loss": 0.03466796875, + "delta_ref_ppl": -0.076171875, + "entropy_loss": -0.06298828125, + "epoch": 0.8326, + "grad_norm": 0.6505228323349663, + "k1_kl": 0.076171875, + "k3_kl": 0.04931640625, + "kimi_kl": 0.1640625, + "learning_rate": 8.369999999999999e-08, + "loss": 0.002, + "ppl": 0.0194091796875, + "reward": 0.9978694915771484, + "reward_std": 0.0016938388580456376, + "rewards/perpo_ocr_edit_distance_reward": 0.997869610786438, + "step": 4163, + "temperature": 0.9 + }, + { + "advantages": -3.916876721632434e-06, + "completion_length": 459.0, + "delta_ref_entropy_loss": -0.0137939453125, + "delta_ref_ppl": -0.10009765625, + "entropy_loss": -0.19140625, + "epoch": 0.8328, + "grad_norm": 1.2106548612283325, + "k1_kl": 0.10009765625, + "k3_kl": 0.0732421875, + "kimi_kl": 0.234375, + "learning_rate": 8.36e-08, + "loss": 0.0029, + "ppl": 0.07470703125, + "reward": 0.9683701992034912, + "reward_std": 0.012963572517037392, + "rewards/perpo_ocr_edit_distance_reward": 0.968370258808136, + "step": 4164, + "temperature": 0.9 + }, + { + "advantages": -3.303800622234121e-05, + "completion_length": 608.0, + "delta_ref_entropy_loss": 0.0279541015625, + "delta_ref_ppl": -0.07958984375, + "entropy_loss": -0.06884765625, + "epoch": 0.833, + "grad_norm": 1.451656137996901, + "k1_kl": 0.07958984375, + "k3_kl": 0.053955078125, + "kimi_kl": 0.1640625, + "learning_rate": 8.35e-08, + "loss": 0.0022, + "ppl": 0.0284423828125, + "reward": 0.9879629611968994, + "reward_std": 0.0009313817718066275, + "rewards/perpo_ocr_edit_distance_reward": 0.9879630208015442, + "step": 4165, + "temperature": 0.9 + }, + { + "advantages": -9.417533874511719e-06, + "completion_length": 271.0, + "delta_ref_entropy_loss": 0.0267333984375, + "delta_ref_ppl": -0.1767578125, + "entropy_loss": -0.0830078125, + "epoch": 0.8332, + "grad_norm": 1.3040317396409857, + "k1_kl": 0.1767578125, + "k3_kl": 0.142578125, + "kimi_kl": 0.7109375, + "learning_rate": 8.339999999999999e-08, + "loss": 0.0057, + "ppl": 0.034912109375, + "reward": 0.9929656982421875, + "reward_std": 0.0026138806715607643, + "rewards/perpo_ocr_edit_distance_reward": 0.9929657578468323, + "step": 4166, + "temperature": 0.9 + }, + { + "advantages": -0.00017761332856025547, + "completion_length": 1042.0, + "delta_ref_entropy_loss": 0.029296875, + "delta_ref_ppl": -0.03515625, + "entropy_loss": -0.038330078125, + "epoch": 0.8334, + "grad_norm": 1.048902723175519, + "k1_kl": 0.035400390625, + "k3_kl": 0.01904296875, + "kimi_kl": 0.042724609375, + "learning_rate": 8.33e-08, + "loss": 0.0009, + "ppl": 0.01434326171875, + "reward": 0.9943189024925232, + "reward_std": 0.00042734405724331737, + "rewards/perpo_ocr_edit_distance_reward": 0.994318962097168, + "step": 4167, + "temperature": 0.9 + }, + { + "advantages": -7.477829058188945e-05, + "completion_length": 382.0, + "delta_ref_entropy_loss": 0.035888671875, + "delta_ref_ppl": -0.1357421875, + "entropy_loss": -0.0771484375, + "epoch": 0.8336, + "grad_norm": 0.4277313289576563, + "k1_kl": 0.134765625, + "k3_kl": 0.1005859375, + "kimi_kl": 0.435546875, + "learning_rate": 8.319999999999999e-08, + "loss": 0.0041, + "ppl": 0.0267333984375, + "reward": 0.9938746094703674, + "reward_std": 0.0009247128036804497, + "rewards/perpo_ocr_edit_distance_reward": 0.9938746690750122, + "step": 4168, + "temperature": 0.9 + }, + { + "advantages": -2.6174955564783886e-05, + "completion_length": 264.0, + "delta_ref_entropy_loss": 0.0361328125, + "delta_ref_ppl": -0.162109375, + "entropy_loss": -0.078125, + "epoch": 0.8338, + "grad_norm": 1.2321526576390627, + "k1_kl": 0.162109375, + "k3_kl": 0.130859375, + "kimi_kl": 0.58984375, + "learning_rate": 8.309999999999998e-08, + "loss": 0.0053, + "ppl": 0.03125, + "reward": 0.987678050994873, + "reward_std": 0.0031538030598312616, + "rewards/perpo_ocr_edit_distance_reward": 0.9876781702041626, + "step": 4169, + "temperature": 0.9 + }, + { + "advantages": -0.00010247741738567129, + "completion_length": 551.0, + "delta_ref_entropy_loss": 0.0228271484375, + "delta_ref_ppl": -0.044189453125, + "entropy_loss": -0.0341796875, + "epoch": 0.834, + "grad_norm": 0.38726426386429236, + "k1_kl": 0.044189453125, + "k3_kl": 0.0255126953125, + "kimi_kl": 0.06884765625, + "learning_rate": 8.3e-08, + "loss": 0.0011, + "ppl": 0.01031494140625, + "reward": 0.9985809326171875, + "reward_std": 0.0008142077713273466, + "rewards/perpo_ocr_edit_distance_reward": 0.9985809922218323, + "step": 4170, + "temperature": 0.9 + }, + { + "advantages": -0.00014230184024199843, + "completion_length": 479.0, + "delta_ref_entropy_loss": 0.040771484375, + "delta_ref_ppl": -0.10791015625, + "entropy_loss": -0.0751953125, + "epoch": 0.8342, + "grad_norm": 0.6627632165832453, + "k1_kl": 0.10791015625, + "k3_kl": 0.08251953125, + "kimi_kl": 0.271484375, + "learning_rate": 8.29e-08, + "loss": 0.0035, + "ppl": 0.026611328125, + "reward": 0.9856657385826111, + "reward_std": 0.00025908570387400687, + "rewards/perpo_ocr_edit_distance_reward": 0.9856658577919006, + "step": 4171, + "temperature": 0.9 + }, + { + "advantages": -1.5633448128937744e-05, + "completion_length": 67.0, + "delta_ref_entropy_loss": 0.056884765625, + "delta_ref_ppl": -0.625, + "entropy_loss": -0.208984375, + "epoch": 0.8344, + "grad_norm": 4.620757181296455, + "k1_kl": 0.625, + "k3_kl": 0.51171875, + "kimi_kl": 2.296875, + "learning_rate": 8.28e-08, + "loss": 0.0205, + "ppl": 0.07470703125, + "reward": 0.9789520502090454, + "reward_std": 0.00534273823723197, + "rewards/perpo_ocr_edit_distance_reward": 0.978952169418335, + "step": 4172, + "temperature": 0.9 + }, + { + "advantages": -7.680484486627392e-06, + "completion_length": 1376.0, + "delta_ref_entropy_loss": -0.02392578125, + "delta_ref_ppl": -0.01422119140625, + "entropy_loss": -0.07470703125, + "epoch": 0.8346, + "grad_norm": 0.5838628292614222, + "k1_kl": 0.01416015625, + "k3_kl": 0.01507568359375, + "kimi_kl": 0.046875, + "learning_rate": 8.269999999999999e-08, + "loss": 0.0006, + "ppl": 0.0277099609375, + "reward": 0.984851598739624, + "reward_std": 0.002116392832249403, + "rewards/perpo_ocr_edit_distance_reward": 0.984851598739624, + "step": 4173, + "temperature": 0.9 + }, + { + "advantages": 4.3426243792055175e-06, + "completion_length": 1950.0, + "delta_ref_entropy_loss": 0.0145263671875, + "delta_ref_ppl": -0.042724609375, + "entropy_loss": -0.1396484375, + "epoch": 0.8348, + "grad_norm": 1.820671700391108, + "k1_kl": 0.042724609375, + "k3_kl": 0.03369140625, + "kimi_kl": 0.04638671875, + "learning_rate": 8.26e-08, + "loss": 0.0013, + "ppl": 0.07666015625, + "reward": 0.9836868047714233, + "reward_std": 0.0018624747171998024, + "rewards/perpo_ocr_edit_distance_reward": 0.9836867451667786, + "step": 4174, + "temperature": 0.9 + }, + { + "advantages": -7.893358088040259e-06, + "completion_length": 596.0, + "delta_ref_entropy_loss": 0.06982421875, + "delta_ref_ppl": -0.162109375, + "entropy_loss": -0.423828125, + "epoch": 0.835, + "grad_norm": 2.4739309530443623, + "k1_kl": 0.162109375, + "k3_kl": 0.1201171875, + "kimi_kl": 0.3828125, + "learning_rate": 8.25e-08, + "loss": 0.0048, + "ppl": 0.2099609375, + "reward": 0.8317646980285645, + "reward_std": 0.007440655492246151, + "rewards/perpo_ocr_edit_distance_reward": 0.8317647576332092, + "step": 4175, + "temperature": 0.9 + }, + { + "advantages": -3.135204315185547e-05, + "completion_length": 1108.0, + "delta_ref_entropy_loss": 0.0140380859375, + "delta_ref_ppl": -0.0322265625, + "entropy_loss": -0.060302734375, + "epoch": 0.8352, + "grad_norm": 0.9648378846400458, + "k1_kl": 0.0322265625, + "k3_kl": 0.021484375, + "kimi_kl": 0.051025390625, + "learning_rate": 8.24e-08, + "loss": 0.0009, + "ppl": 0.0274658203125, + "reward": 0.992194652557373, + "reward_std": 0.0012574821012094617, + "rewards/perpo_ocr_edit_distance_reward": 0.9921947717666626, + "step": 4176, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 1331.0, + "delta_ref_entropy_loss": 0.0303955078125, + "delta_ref_ppl": -0.053955078125, + "entropy_loss": -0.10986328125, + "epoch": 0.8354, + "grad_norm": 0.8631424853760431, + "k1_kl": 0.0537109375, + "k3_kl": 0.031982421875, + "kimi_kl": 0.06787109375, + "learning_rate": 8.229999999999999e-08, + "loss": 0.0013, + "ppl": 0.044677734375, + "reward": 0.9708064198493958, + "reward_std": 0.0020941970869898796, + "rewards/perpo_ocr_edit_distance_reward": 0.9708064198493958, + "step": 4177, + "temperature": 0.9 + }, + { + "advantages": -3.734657002496533e-05, + "completion_length": 1324.0, + "delta_ref_entropy_loss": 0.004241943359375, + "delta_ref_ppl": -0.0279541015625, + "entropy_loss": -0.061767578125, + "epoch": 0.8356, + "grad_norm": 1.2167276141469778, + "k1_kl": 0.0279541015625, + "k3_kl": 0.023681640625, + "kimi_kl": 0.0498046875, + "learning_rate": 8.22e-08, + "loss": 0.001, + "ppl": 0.0255126953125, + "reward": 0.9955075979232788, + "reward_std": 0.0019509217236191034, + "rewards/perpo_ocr_edit_distance_reward": 0.9955077171325684, + "step": 4178, + "temperature": 0.9 + }, + { + "advantages": -5.766323738498613e-05, + "completion_length": 393.0, + "delta_ref_entropy_loss": 0.0361328125, + "delta_ref_ppl": -0.061767578125, + "entropy_loss": -0.054443359375, + "epoch": 0.8358, + "grad_norm": 0.498755157146821, + "k1_kl": 0.0615234375, + "k3_kl": 0.037109375, + "kimi_kl": 0.095703125, + "learning_rate": 8.21e-08, + "loss": 0.0015, + "ppl": 0.0225830078125, + "reward": 0.9966428279876709, + "reward_std": 0.0012294694315642118, + "rewards/perpo_ocr_edit_distance_reward": 0.9966428875923157, + "step": 4179, + "temperature": 0.9 + }, + { + "advantages": -2.213886909885332e-05, + "completion_length": 367.0, + "delta_ref_entropy_loss": 0.00567626953125, + "delta_ref_ppl": -0.055419921875, + "entropy_loss": -0.05224609375, + "epoch": 0.836, + "grad_norm": 0.6399911174788879, + "k1_kl": 0.055419921875, + "k3_kl": 0.041015625, + "kimi_kl": 0.126953125, + "learning_rate": 8.2e-08, + "loss": 0.0017, + "ppl": 0.0223388671875, + "reward": 0.9827180504798889, + "reward_std": 0.0010542767122387886, + "rewards/perpo_ocr_edit_distance_reward": 0.9827181696891785, + "step": 4180, + "temperature": 0.9 + }, + { + "advantages": -2.2241049009608105e-05, + "completion_length": 891.0, + "delta_ref_entropy_loss": 0.0189208984375, + "delta_ref_ppl": -0.034912109375, + "entropy_loss": -0.05078125, + "epoch": 0.8362, + "grad_norm": 7.220025229654492, + "k1_kl": 0.034912109375, + "k3_kl": 0.0198974609375, + "kimi_kl": 0.052490234375, + "learning_rate": 8.19e-08, + "loss": 0.0008, + "ppl": 0.0191650390625, + "reward": 0.994746744632721, + "reward_std": 0.00028279441175982356, + "rewards/perpo_ocr_edit_distance_reward": 0.9947468042373657, + "step": 4181, + "temperature": 0.9 + }, + { + "advantages": 3.650358848972246e-05, + "completion_length": 830.0, + "delta_ref_entropy_loss": 0.017333984375, + "delta_ref_ppl": -0.054443359375, + "entropy_loss": -0.03466796875, + "epoch": 0.8364, + "grad_norm": 1.0214729645586358, + "k1_kl": 0.0546875, + "k3_kl": 0.037109375, + "kimi_kl": 0.125, + "learning_rate": 8.179999999999999e-08, + "loss": 0.0015, + "ppl": 0.0107421875, + "reward": 0.9966146945953369, + "reward_std": 0.00013330056390259415, + "rewards/perpo_ocr_edit_distance_reward": 0.9966147541999817, + "step": 4182, + "temperature": 0.9 + }, + { + "advantages": -1.1478152373456396e-05, + "completion_length": 67.0, + "delta_ref_entropy_loss": 0.07275390625, + "delta_ref_ppl": -0.68359375, + "entropy_loss": -0.2490234375, + "epoch": 0.8366, + "grad_norm": 4.366591007559009, + "k1_kl": 0.68359375, + "k3_kl": 0.5546875, + "kimi_kl": 2.5, + "learning_rate": 8.17e-08, + "loss": 0.0222, + "ppl": 0.1171875, + "reward": 0.9737142324447632, + "reward_std": 0.005089683923870325, + "rewards/perpo_ocr_edit_distance_reward": 0.9737143516540527, + "step": 4183, + "temperature": 0.9 + }, + { + "advantages": 1.27724248955019e-08, + "completion_length": 465.0, + "delta_ref_entropy_loss": 0.048583984375, + "delta_ref_ppl": -0.1064453125, + "entropy_loss": -0.07763671875, + "epoch": 0.8368, + "grad_norm": 0.8281525310497811, + "k1_kl": 0.10595703125, + "k3_kl": 0.0751953125, + "kimi_kl": 0.25, + "learning_rate": 8.16e-08, + "loss": 0.003, + "ppl": 0.033935546875, + "reward": 0.9943512678146362, + "reward_std": 0.0022552183363586664, + "rewards/perpo_ocr_edit_distance_reward": 0.9943512678146362, + "step": 4184, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 735.0, + "delta_ref_entropy_loss": -0.126953125, + "delta_ref_ppl": -0.045654296875, + "entropy_loss": -0.2890625, + "epoch": 0.837, + "grad_norm": 2.8063091364508876, + "k1_kl": 0.0458984375, + "k3_kl": 0.060791015625, + "kimi_kl": 0.10986328125, + "learning_rate": 8.15e-08, + "loss": 0.0024, + "ppl": 0.09619140625, + "reward": 0.8778401613235474, + "reward_std": 0.026657041162252426, + "rewards/perpo_ocr_edit_distance_reward": 0.8778402209281921, + "step": 4185, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 164.0, + "delta_ref_entropy_loss": 0.00167083740234375, + "delta_ref_ppl": -0.240234375, + "entropy_loss": -0.271484375, + "epoch": 0.8372, + "grad_norm": 2.63314505248372, + "k1_kl": 0.2412109375, + "k3_kl": 0.1767578125, + "kimi_kl": 0.67578125, + "learning_rate": 8.14e-08, + "loss": 0.0071, + "ppl": 0.0712890625, + "reward": 0.8907593488693237, + "reward_std": 0.077363520860672, + "rewards/perpo_ocr_edit_distance_reward": 0.8907593488693237, + "step": 4186, + "temperature": 0.9 + }, + { + "advantages": 4.257474817137563e-09, + "completion_length": 716.0, + "delta_ref_entropy_loss": 0.02294921875, + "delta_ref_ppl": -0.06640625, + "entropy_loss": -0.05029296875, + "epoch": 0.8374, + "grad_norm": 0.7203065033326234, + "k1_kl": 0.06640625, + "k3_kl": 0.043212890625, + "kimi_kl": 0.12451171875, + "learning_rate": 8.13e-08, + "loss": 0.0017, + "ppl": 0.0186767578125, + "reward": 0.9955708980560303, + "reward_std": 0.000976329087279737, + "rewards/perpo_ocr_edit_distance_reward": 0.995570957660675, + "step": 4187, + "temperature": 0.9 + }, + { + "advantages": -1.062665705831023e-05, + "completion_length": 1047.0, + "delta_ref_entropy_loss": 0.01470947265625, + "delta_ref_ppl": -0.04150390625, + "entropy_loss": -0.043701171875, + "epoch": 0.8376, + "grad_norm": 0.7670282470729699, + "k1_kl": 0.041748046875, + "k3_kl": 0.030029296875, + "kimi_kl": 0.08984375, + "learning_rate": 8.119999999999999e-08, + "loss": 0.0012, + "ppl": 0.017578125, + "reward": 0.8479406833648682, + "reward_std": 0.003105937736108899, + "rewards/perpo_ocr_edit_distance_reward": 0.8479407429695129, + "step": 4188, + "temperature": 0.9 + }, + { + "advantages": 5.568776941800024e-06, + "completion_length": 663.0, + "delta_ref_entropy_loss": 0.0162353515625, + "delta_ref_ppl": -0.07470703125, + "entropy_loss": -0.10986328125, + "epoch": 0.8378, + "grad_norm": 1.03604880919869, + "k1_kl": 0.07470703125, + "k3_kl": 0.052978515625, + "kimi_kl": 0.16015625, + "learning_rate": 8.11e-08, + "loss": 0.0021, + "ppl": 0.050048828125, + "reward": 0.9521348476409912, + "reward_std": 0.0014304211363196373, + "rewards/perpo_ocr_edit_distance_reward": 0.952134907245636, + "step": 4189, + "temperature": 0.9 + }, + { + "advantages": -8.71930842549773e-06, + "completion_length": 502.0, + "delta_ref_entropy_loss": 0.046630859375, + "delta_ref_ppl": -0.146484375, + "entropy_loss": -0.142578125, + "epoch": 0.838, + "grad_norm": 1.209612159906289, + "k1_kl": 0.146484375, + "k3_kl": 0.10205078125, + "kimi_kl": 0.31640625, + "learning_rate": 8.1e-08, + "loss": 0.0041, + "ppl": 0.0634765625, + "reward": 0.9126095771789551, + "reward_std": 0.0018541873432695866, + "rewards/perpo_ocr_edit_distance_reward": 0.9126095771789551, + "step": 4190, + "temperature": 0.9 + }, + { + "advantages": -2.5459698917984497e-06, + "completion_length": 816.0, + "delta_ref_entropy_loss": 0.015625, + "delta_ref_ppl": -0.080078125, + "entropy_loss": -0.31640625, + "epoch": 0.8382, + "grad_norm": 1.569478832144294, + "k1_kl": 0.07958984375, + "k3_kl": 0.06103515625, + "kimi_kl": 0.162109375, + "learning_rate": 8.089999999999999e-08, + "loss": 0.0024, + "ppl": 0.1435546875, + "reward": 0.3656793534755707, + "reward_std": 0.003248014487326145, + "rewards/perpo_ocr_edit_distance_reward": 0.3656793534755707, + "step": 4191, + "temperature": 0.9 + }, + { + "advantages": -1.5497207641601562e-05, + "completion_length": 411.0, + "delta_ref_entropy_loss": 0.00921630859375, + "delta_ref_ppl": -0.08203125, + "entropy_loss": -0.08056640625, + "epoch": 0.8384, + "grad_norm": 1.159175660914894, + "k1_kl": 0.08203125, + "k3_kl": 0.060546875, + "kimi_kl": 0.185546875, + "learning_rate": 8.08e-08, + "loss": 0.0024, + "ppl": 0.03857421875, + "reward": 0.9787657260894775, + "reward_std": 0.0020957107190042734, + "rewards/perpo_ocr_edit_distance_reward": 0.9787657856941223, + "step": 4192, + "temperature": 0.9 + }, + { + "advantages": -2.367155957472278e-06, + "completion_length": 583.0, + "delta_ref_entropy_loss": 0.06494140625, + "delta_ref_ppl": -0.115234375, + "entropy_loss": -0.181640625, + "epoch": 0.8386, + "grad_norm": 1.747260082989004, + "k1_kl": 0.115234375, + "k3_kl": 0.07373046875, + "kimi_kl": 0.1806640625, + "learning_rate": 8.069999999999999e-08, + "loss": 0.003, + "ppl": 0.0869140625, + "reward": 0.9650195240974426, + "reward_std": 0.007111551705747843, + "rewards/perpo_ocr_edit_distance_reward": 0.9650195837020874, + "step": 4193, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 499.0, + "delta_ref_entropy_loss": -0.06787109375, + "delta_ref_ppl": -0.09228515625, + "entropy_loss": -0.236328125, + "epoch": 0.8388, + "grad_norm": 1.2189117307274415, + "k1_kl": 0.09228515625, + "k3_kl": 0.07177734375, + "kimi_kl": 0.25390625, + "learning_rate": 8.060000000000001e-08, + "loss": 0.0029, + "ppl": 0.08837890625, + "reward": 0.9202335476875305, + "reward_std": 0.00771887693554163, + "rewards/perpo_ocr_edit_distance_reward": 0.9202336072921753, + "step": 4194, + "temperature": 0.9 + }, + { + "advantages": -3.106253643636592e-05, + "completion_length": 1048.0, + "delta_ref_entropy_loss": 0.0267333984375, + "delta_ref_ppl": -0.05712890625, + "entropy_loss": -0.042236328125, + "epoch": 0.839, + "grad_norm": 0.5269356166270136, + "k1_kl": 0.05712890625, + "k3_kl": 0.03857421875, + "kimi_kl": 0.126953125, + "learning_rate": 8.05e-08, + "loss": 0.0016, + "ppl": 0.01416015625, + "reward": 0.9801333546638489, + "reward_std": 0.0004484364471863955, + "rewards/perpo_ocr_edit_distance_reward": 0.9801333546638489, + "step": 4195, + "temperature": 0.9 + }, + { + "advantages": -5.630936357192695e-05, + "completion_length": 623.0, + "delta_ref_entropy_loss": 0.019775390625, + "delta_ref_ppl": -0.050048828125, + "entropy_loss": -0.0966796875, + "epoch": 0.8392, + "grad_norm": 1.0808727520457426, + "k1_kl": 0.05029296875, + "k3_kl": 0.0302734375, + "kimi_kl": 0.06689453125, + "learning_rate": 8.039999999999999e-08, + "loss": 0.0013, + "ppl": 0.036865234375, + "reward": 0.9412087798118591, + "reward_std": 0.0015631651040166616, + "rewards/perpo_ocr_edit_distance_reward": 0.9412088990211487, + "step": 4196, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 392.0, + "delta_ref_entropy_loss": 0.061279296875, + "delta_ref_ppl": -0.1318359375, + "entropy_loss": -0.197265625, + "epoch": 0.8394, + "grad_norm": 1.5916610936319722, + "k1_kl": 0.1318359375, + "k3_kl": 0.08056640625, + "kimi_kl": 0.2080078125, + "learning_rate": 8.03e-08, + "loss": 0.0032, + "ppl": 0.07763671875, + "reward": 0.9070020914077759, + "reward_std": 0.0015005874447524548, + "rewards/perpo_ocr_edit_distance_reward": 0.9070020914077759, + "step": 4197, + "temperature": 0.9 + }, + { + "advantages": 5.934920045547187e-05, + "completion_length": 765.0, + "delta_ref_entropy_loss": 0.03759765625, + "delta_ref_ppl": -0.051513671875, + "entropy_loss": -0.04443359375, + "epoch": 0.8396, + "grad_norm": 0.47606059275412155, + "k1_kl": 0.051513671875, + "k3_kl": 0.027099609375, + "kimi_kl": 0.064453125, + "learning_rate": 8.019999999999999e-08, + "loss": 0.001, + "ppl": 0.01519775390625, + "reward": 0.9982598423957825, + "reward_std": 0.0004735956899821758, + "rewards/perpo_ocr_edit_distance_reward": 0.9982598423957825, + "step": 4198, + "temperature": 0.9 + }, + { + "advantages": -4.564013124763733e-06, + "completion_length": 131.0, + "delta_ref_entropy_loss": -0.0020904541015625, + "delta_ref_ppl": -0.228515625, + "entropy_loss": -0.1826171875, + "epoch": 0.8398, + "grad_norm": 2.3003094680732974, + "k1_kl": 0.228515625, + "k3_kl": 0.1767578125, + "kimi_kl": 0.58984375, + "learning_rate": 8.01e-08, + "loss": 0.0071, + "ppl": 0.08740234375, + "reward": 0.9670329093933105, + "reward_std": 0.012940292246639729, + "rewards/perpo_ocr_edit_distance_reward": 0.9670330286026001, + "step": 4199, + "temperature": 0.9 + }, + { + "advantages": -9.400504495715722e-06, + "completion_length": 769.0, + "delta_ref_entropy_loss": 0.01287841796875, + "delta_ref_ppl": -0.03369140625, + "entropy_loss": -0.046875, + "epoch": 0.84, + "grad_norm": 0.3730610779407943, + "k1_kl": 0.033447265625, + "k3_kl": 0.0194091796875, + "kimi_kl": 0.04833984375, + "learning_rate": 8e-08, + "loss": 0.0008, + "ppl": 0.01806640625, + "reward": 0.9949033856391907, + "reward_std": 0.000805045710876584, + "rewards/perpo_ocr_edit_distance_reward": 0.9949033856391907, + "step": 4200, + "temperature": 0.9 + }, + { + "advantages": -6.079674221837195e-06, + "completion_length": 1364.0, + "delta_ref_entropy_loss": 0.0166015625, + "delta_ref_ppl": -0.060791015625, + "entropy_loss": -0.18359375, + "epoch": 0.8402, + "grad_norm": 1.4580439655519954, + "k1_kl": 0.06103515625, + "k3_kl": 0.038818359375, + "kimi_kl": 0.07421875, + "learning_rate": 7.99e-08, + "loss": 0.0016, + "ppl": 0.08349609375, + "reward": 0.7570744752883911, + "reward_std": 0.0055179353803396225, + "rewards/perpo_ocr_edit_distance_reward": 0.7570745348930359, + "step": 4201, + "temperature": 0.9 + }, + { + "advantages": -6.130763722467236e-06, + "completion_length": 764.0, + "delta_ref_entropy_loss": -0.0147705078125, + "delta_ref_ppl": -0.0947265625, + "entropy_loss": -0.50390625, + "epoch": 0.8404, + "grad_norm": 2.5771639069458394, + "k1_kl": 0.09521484375, + "k3_kl": 0.072265625, + "kimi_kl": 0.138671875, + "learning_rate": 7.979999999999999e-08, + "loss": 0.0029, + "ppl": 0.24609375, + "reward": 0.7711654901504517, + "reward_std": 0.011063891462981701, + "rewards/perpo_ocr_edit_distance_reward": 0.7711656093597412, + "step": 4202, + "temperature": 0.9 + }, + { + "advantages": -8.060890104388818e-05, + "completion_length": 519.0, + "delta_ref_entropy_loss": 0.050048828125, + "delta_ref_ppl": -0.09619140625, + "entropy_loss": -0.061279296875, + "epoch": 0.8406, + "grad_norm": 0.5648516639054764, + "k1_kl": 0.09619140625, + "k3_kl": 0.06298828125, + "kimi_kl": 0.2177734375, + "learning_rate": 7.969999999999999e-08, + "loss": 0.0026, + "ppl": 0.019775390625, + "reward": 0.9945335984230042, + "reward_std": 0.001062026247382164, + "rewards/perpo_ocr_edit_distance_reward": 0.9945337176322937, + "step": 4203, + "temperature": 0.9 + }, + { + "advantages": -2.2547586922883056e-05, + "completion_length": 1650.0, + "delta_ref_entropy_loss": 0.00531005859375, + "delta_ref_ppl": -0.03662109375, + "entropy_loss": -0.09619140625, + "epoch": 0.8408, + "grad_norm": 1.0579518745889627, + "k1_kl": 0.036865234375, + "k3_kl": 0.030029296875, + "kimi_kl": 0.055908203125, + "learning_rate": 7.96e-08, + "loss": 0.0012, + "ppl": 0.04443359375, + "reward": 0.9904053211212158, + "reward_std": 0.0010330594377592206, + "rewards/perpo_ocr_edit_distance_reward": 0.9904053211212158, + "step": 4204, + "temperature": 0.9 + }, + { + "advantages": -1.307044726672757e-06, + "completion_length": 388.0, + "delta_ref_entropy_loss": 0.002166748046875, + "delta_ref_ppl": -0.10498046875, + "entropy_loss": -0.12451171875, + "epoch": 0.841, + "grad_norm": 0.9447823032811452, + "k1_kl": 0.10498046875, + "k3_kl": 0.07470703125, + "kimi_kl": 0.224609375, + "learning_rate": 7.95e-08, + "loss": 0.003, + "ppl": 0.039794921875, + "reward": 0.3126830458641052, + "reward_std": 0.0031101477798074484, + "rewards/perpo_ocr_edit_distance_reward": 0.3126830458641052, + "step": 4205, + "temperature": 0.9 + }, + { + "advantages": -8.994341624202207e-05, + "completion_length": 1257.0, + "delta_ref_entropy_loss": 0.034912109375, + "delta_ref_ppl": -0.0546875, + "entropy_loss": -0.07373046875, + "epoch": 0.8412, + "grad_norm": 0.957151329409986, + "k1_kl": 0.0546875, + "k3_kl": 0.03369140625, + "kimi_kl": 0.08154296875, + "learning_rate": 7.94e-08, + "loss": 0.0014, + "ppl": 0.0341796875, + "reward": 0.9945262670516968, + "reward_std": 0.0006573746213689446, + "rewards/perpo_ocr_edit_distance_reward": 0.9945263862609863, + "step": 4206, + "temperature": 0.9 + }, + { + "advantages": -4.07184888899792e-05, + "completion_length": 1232.0, + "delta_ref_entropy_loss": 0.01092529296875, + "delta_ref_ppl": -0.021240234375, + "entropy_loss": -0.04443359375, + "epoch": 0.8414, + "grad_norm": 0.4350479176311979, + "k1_kl": 0.021240234375, + "k3_kl": 0.0130615234375, + "kimi_kl": 0.03271484375, + "learning_rate": 7.929999999999999e-08, + "loss": 0.0006, + "ppl": 0.019287109375, + "reward": 0.9882795214653015, + "reward_std": 0.0011549158953130245, + "rewards/perpo_ocr_edit_distance_reward": 0.9882796406745911, + "step": 4207, + "temperature": 0.9 + }, + { + "advantages": -1.5148095371841919e-05, + "completion_length": 463.0, + "delta_ref_entropy_loss": 0.0390625, + "delta_ref_ppl": -0.08544921875, + "entropy_loss": -0.08984375, + "epoch": 0.8416, + "grad_norm": 0.7382367076987102, + "k1_kl": 0.08544921875, + "k3_kl": 0.060302734375, + "kimi_kl": 0.1923828125, + "learning_rate": 7.920000000000001e-08, + "loss": 0.0024, + "ppl": 0.0419921875, + "reward": 0.9602652192115784, + "reward_std": 0.002147215884178877, + "rewards/perpo_ocr_edit_distance_reward": 0.9602653384208679, + "step": 4208, + "temperature": 0.9 + }, + { + "advantages": -2.043587983280304e-06, + "completion_length": 573.0, + "delta_ref_entropy_loss": -0.04052734375, + "delta_ref_ppl": -0.08740234375, + "entropy_loss": -0.1787109375, + "epoch": 0.8418, + "grad_norm": 1.4240017281529083, + "k1_kl": 0.08740234375, + "k3_kl": 0.0703125, + "kimi_kl": 0.2255859375, + "learning_rate": 7.91e-08, + "loss": 0.0028, + "ppl": 0.0634765625, + "reward": 0.9175689816474915, + "reward_std": 0.024684900417923927, + "rewards/perpo_ocr_edit_distance_reward": 0.9175690412521362, + "step": 4209, + "temperature": 0.9 + }, + { + "advantages": -4.267054100637324e-05, + "completion_length": 669.0, + "delta_ref_entropy_loss": 0.010498046875, + "delta_ref_ppl": -0.039794921875, + "entropy_loss": -0.05126953125, + "epoch": 0.842, + "grad_norm": 0.35822039587739696, + "k1_kl": 0.039794921875, + "k3_kl": 0.0262451171875, + "kimi_kl": 0.072265625, + "learning_rate": 7.899999999999999e-08, + "loss": 0.0011, + "ppl": 0.01531982421875, + "reward": 0.9835958480834961, + "reward_std": 0.0014959709951654077, + "rewards/perpo_ocr_edit_distance_reward": 0.9835959076881409, + "step": 4210, + "temperature": 0.9 + }, + { + "advantages": -0.00012056317063979805, + "completion_length": 1006.0, + "delta_ref_entropy_loss": 0.01483154296875, + "delta_ref_ppl": -0.0244140625, + "entropy_loss": -0.0308837890625, + "epoch": 0.8422, + "grad_norm": 0.28367074090539074, + "k1_kl": 0.0244140625, + "k3_kl": 0.0167236328125, + "kimi_kl": 0.039794921875, + "learning_rate": 7.89e-08, + "loss": 0.0008, + "ppl": 0.00958251953125, + "reward": 0.998482346534729, + "reward_std": 0.00018250664288643748, + "rewards/perpo_ocr_edit_distance_reward": 0.9984824657440186, + "step": 4211, + "temperature": 0.9 + }, + { + "advantages": 5.546638203668408e-05, + "completion_length": 574.0, + "delta_ref_entropy_loss": 0.0230712890625, + "delta_ref_ppl": -0.0966796875, + "entropy_loss": -0.146484375, + "epoch": 0.8424, + "grad_norm": 0.7071188412381803, + "k1_kl": 0.09619140625, + "k3_kl": 0.06689453125, + "kimi_kl": 0.2314453125, + "learning_rate": 7.879999999999999e-08, + "loss": 0.0026, + "ppl": 0.05029296875, + "reward": 0.9457710981369019, + "reward_std": 0.0005141465808264911, + "rewards/perpo_ocr_edit_distance_reward": 0.9457710981369019, + "step": 4212, + "temperature": 0.9 + }, + { + "advantages": -8.606911433162168e-05, + "completion_length": 545.0, + "delta_ref_entropy_loss": 0.0272216796875, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.039794921875, + "epoch": 0.8426, + "grad_norm": 0.2155005565753966, + "k1_kl": 0.06591796875, + "k3_kl": 0.0458984375, + "kimi_kl": 0.171875, + "learning_rate": 7.87e-08, + "loss": 0.0019, + "ppl": 0.01202392578125, + "reward": 0.9927603006362915, + "reward_std": 0.00039477174868807197, + "rewards/perpo_ocr_edit_distance_reward": 0.992760419845581, + "step": 4213, + "temperature": 0.9 + }, + { + "advantages": -4.442249337444082e-05, + "completion_length": 1170.0, + "delta_ref_entropy_loss": -0.04833984375, + "delta_ref_ppl": -0.017822265625, + "entropy_loss": -0.08642578125, + "epoch": 0.8428, + "grad_norm": 0.2282951671593912, + "k1_kl": 0.017822265625, + "k3_kl": 0.019287109375, + "kimi_kl": 0.06640625, + "learning_rate": 7.86e-08, + "loss": 0.0008, + "ppl": 0.0194091796875, + "reward": 0.926306426525116, + "reward_std": 0.001241820165887475, + "rewards/perpo_ocr_edit_distance_reward": 0.9263064861297607, + "step": 4214, + "temperature": 0.9 + }, + { + "advantages": -9.875638352241367e-05, + "completion_length": 822.0, + "delta_ref_entropy_loss": 0.02392578125, + "delta_ref_ppl": -0.056396484375, + "entropy_loss": -0.0869140625, + "epoch": 0.843, + "grad_norm": 0.8422213468182553, + "k1_kl": 0.056396484375, + "k3_kl": 0.033447265625, + "kimi_kl": 0.07666015625, + "learning_rate": 7.85e-08, + "loss": 0.0014, + "ppl": 0.03955078125, + "reward": 0.9940442442893982, + "reward_std": 0.001021197997033596, + "rewards/perpo_ocr_edit_distance_reward": 0.9940443634986877, + "step": 4215, + "temperature": 0.9 + }, + { + "advantages": -2.2619964511250146e-05, + "completion_length": 395.0, + "delta_ref_entropy_loss": 0.030517578125, + "delta_ref_ppl": -0.07861328125, + "entropy_loss": -0.068359375, + "epoch": 0.8432, + "grad_norm": 0.5653430728687027, + "k1_kl": 0.07861328125, + "k3_kl": 0.047119140625, + "kimi_kl": 0.10693359375, + "learning_rate": 7.839999999999999e-08, + "loss": 0.0019, + "ppl": 0.02978515625, + "reward": 0.9150488376617432, + "reward_std": 0.0010288874618709087, + "rewards/perpo_ocr_edit_distance_reward": 0.9150489568710327, + "step": 4216, + "temperature": 0.9 + }, + { + "advantages": -0.00011876651842612773, + "completion_length": 1280.0, + "delta_ref_entropy_loss": 0.0205078125, + "delta_ref_ppl": -0.05126953125, + "entropy_loss": -0.045654296875, + "epoch": 0.8434, + "grad_norm": 0.4254678379819032, + "k1_kl": 0.05126953125, + "k3_kl": 0.035888671875, + "kimi_kl": 0.107421875, + "learning_rate": 7.829999999999999e-08, + "loss": 0.0016, + "ppl": 0.015869140625, + "reward": 0.9968318343162537, + "reward_std": 0.000760113587602973, + "rewards/perpo_ocr_edit_distance_reward": 0.9968319535255432, + "step": 4217, + "temperature": 0.9 + }, + { + "advantages": -7.551057206001133e-05, + "completion_length": 471.0, + "delta_ref_entropy_loss": 0.0272216796875, + "delta_ref_ppl": -0.07861328125, + "entropy_loss": -0.0537109375, + "epoch": 0.8436, + "grad_norm": 0.23132629302160315, + "k1_kl": 0.07861328125, + "k3_kl": 0.056396484375, + "kimi_kl": 0.2353515625, + "learning_rate": 7.82e-08, + "loss": 0.0023, + "ppl": 0.01513671875, + "reward": 0.9962332248687744, + "reward_std": 0.00023825684911571443, + "rewards/perpo_ocr_edit_distance_reward": 0.9962332844734192, + "step": 4218, + "temperature": 0.9 + }, + { + "advantages": -2.5851386453723535e-05, + "completion_length": 248.0, + "delta_ref_entropy_loss": 0.035888671875, + "delta_ref_ppl": -0.1572265625, + "entropy_loss": -0.10205078125, + "epoch": 0.8438, + "grad_norm": 1.8583538826325627, + "k1_kl": 0.1572265625, + "k3_kl": 0.1220703125, + "kimi_kl": 0.4296875, + "learning_rate": 7.81e-08, + "loss": 0.0049, + "ppl": 0.040771484375, + "reward": 0.9688311219215393, + "reward_std": 0.00286322133615613, + "rewards/perpo_ocr_edit_distance_reward": 0.9688312411308289, + "step": 4219, + "temperature": 0.9 + }, + { + "advantages": -1.4977796126913745e-05, + "completion_length": 311.0, + "delta_ref_entropy_loss": 0.056396484375, + "delta_ref_ppl": -0.15234375, + "entropy_loss": -0.43359375, + "epoch": 0.844, + "grad_norm": 8.205118000228532, + "k1_kl": 0.1533203125, + "k3_kl": 0.1064453125, + "kimi_kl": 0.22265625, + "learning_rate": 7.8e-08, + "loss": 0.0043, + "ppl": 0.2255859375, + "reward": 0.9296743273735046, + "reward_std": 0.0038763885386288166, + "rewards/perpo_ocr_edit_distance_reward": 0.9296744465827942, + "step": 4220, + "temperature": 0.9 + }, + { + "advantages": -0.00022969927522353828, + "completion_length": 401.0, + "delta_ref_entropy_loss": 0.04931640625, + "delta_ref_ppl": -0.0947265625, + "entropy_loss": -0.052001953125, + "epoch": 0.8442, + "grad_norm": 0.22040430943755976, + "k1_kl": 0.0947265625, + "k3_kl": 0.06494140625, + "kimi_kl": 0.291015625, + "learning_rate": 7.789999999999999e-08, + "loss": 0.0028, + "ppl": 0.01263427734375, + "reward": 0.9920353293418884, + "reward_std": 0.00023362036154139787, + "rewards/perpo_ocr_edit_distance_reward": 0.992035448551178, + "step": 4221, + "temperature": 0.9 + }, + { + "advantages": -1.481601202613092e-06, + "completion_length": 39.0, + "delta_ref_entropy_loss": -0.369140625, + "delta_ref_ppl": -1.0390625, + "entropy_loss": -0.67578125, + "epoch": 0.8444, + "grad_norm": 14.152624923603708, + "k1_kl": 1.0390625, + "k3_kl": 1.0, + "kimi_kl": 5.9375, + "learning_rate": 7.78e-08, + "loss": 0.0401, + "ppl": 0.2177734375, + "reward": 0.977240800857544, + "reward_std": 0.023053426295518875, + "rewards/perpo_ocr_edit_distance_reward": 0.9772408604621887, + "step": 4222, + "temperature": 0.9 + }, + { + "advantages": -4.0090202674036846e-05, + "completion_length": 461.0, + "delta_ref_entropy_loss": -0.0279541015625, + "delta_ref_ppl": -0.10107421875, + "entropy_loss": -0.09521484375, + "epoch": 0.8446, + "grad_norm": 0.6433269256343878, + "k1_kl": 0.1005859375, + "k3_kl": 0.0859375, + "kimi_kl": 0.357421875, + "learning_rate": 7.77e-08, + "loss": 0.0035, + "ppl": 0.0260009765625, + "reward": 0.9963099956512451, + "reward_std": 0.0009624578524380922, + "rewards/perpo_ocr_edit_distance_reward": 0.9963099956512451, + "step": 4223, + "temperature": 0.9 + }, + { + "advantages": -3.81725185434334e-05, + "completion_length": 1077.0, + "delta_ref_entropy_loss": -0.00048828125, + "delta_ref_ppl": -0.040283203125, + "entropy_loss": -0.10498046875, + "epoch": 0.8448, + "grad_norm": 2.0067508195055725, + "k1_kl": 0.0400390625, + "k3_kl": 0.0264892578125, + "kimi_kl": 0.0595703125, + "learning_rate": 7.76e-08, + "loss": 0.0011, + "ppl": 0.043701171875, + "reward": 0.9744351506233215, + "reward_std": 0.0016835592687129974, + "rewards/perpo_ocr_edit_distance_reward": 0.9744351506233215, + "step": 4224, + "temperature": 0.9 + }, + { + "advantages": -3.2016209843277466e-06, + "completion_length": 222.0, + "delta_ref_entropy_loss": -0.043701171875, + "delta_ref_ppl": -0.1611328125, + "entropy_loss": -0.2197265625, + "epoch": 0.845, + "grad_norm": 3.4822677646572004, + "k1_kl": 0.16015625, + "k3_kl": 0.1279296875, + "kimi_kl": 0.5625, + "learning_rate": 7.75e-08, + "loss": 0.0051, + "ppl": 0.0947265625, + "reward": 0.6966749429702759, + "reward_std": 0.026342196390032768, + "rewards/perpo_ocr_edit_distance_reward": 0.6966750621795654, + "step": 4225, + "temperature": 0.9 + }, + { + "advantages": -2.7511801818036474e-05, + "completion_length": 598.0, + "delta_ref_entropy_loss": -0.04150390625, + "delta_ref_ppl": -0.072265625, + "entropy_loss": -0.26171875, + "epoch": 0.8452, + "grad_norm": 1.270591372495916, + "k1_kl": 0.072265625, + "k3_kl": 0.058349609375, + "kimi_kl": 0.1416015625, + "learning_rate": 7.739999999999999e-08, + "loss": 0.0024, + "ppl": 0.109375, + "reward": 0.9298385381698608, + "reward_std": 0.002065351465716958, + "rewards/perpo_ocr_edit_distance_reward": 0.9298386573791504, + "step": 4226, + "temperature": 0.9 + }, + { + "advantages": -5.265644722385332e-05, + "completion_length": 869.0, + "delta_ref_entropy_loss": 0.032958984375, + "delta_ref_ppl": -0.05419921875, + "entropy_loss": -0.051025390625, + "epoch": 0.8454, + "grad_norm": 0.2871658787906932, + "k1_kl": 0.054443359375, + "k3_kl": 0.031982421875, + "kimi_kl": 0.09423828125, + "learning_rate": 7.729999999999998e-08, + "loss": 0.0013, + "ppl": 0.016357421875, + "reward": 0.9953829646110535, + "reward_std": 0.0003851601213682443, + "rewards/perpo_ocr_edit_distance_reward": 0.9953830242156982, + "step": 4227, + "temperature": 0.9 + }, + { + "advantages": -2.087865686917212e-05, + "completion_length": 139.0, + "delta_ref_entropy_loss": 0.060546875, + "delta_ref_ppl": -0.22265625, + "entropy_loss": -0.08203125, + "epoch": 0.8456, + "grad_norm": 1.7456256231117304, + "k1_kl": 0.22265625, + "k3_kl": 0.1767578125, + "kimi_kl": 0.734375, + "learning_rate": 7.72e-08, + "loss": 0.0071, + "ppl": 0.031494140625, + "reward": 0.8729211091995239, + "reward_std": 0.002348646055907011, + "rewards/perpo_ocr_edit_distance_reward": 0.8729211688041687, + "step": 4228, + "temperature": 0.9 + }, + { + "advantages": -8.20841160020791e-06, + "completion_length": 857.0, + "delta_ref_entropy_loss": -0.0458984375, + "delta_ref_ppl": -0.044189453125, + "entropy_loss": -0.1064453125, + "epoch": 0.8458, + "grad_norm": 0.6363242377659617, + "k1_kl": 0.0439453125, + "k3_kl": 0.03759765625, + "kimi_kl": 0.1328125, + "learning_rate": 7.71e-08, + "loss": 0.0015, + "ppl": 0.02392578125, + "reward": 0.9522729516029358, + "reward_std": 0.005086421966552734, + "rewards/perpo_ocr_edit_distance_reward": 0.9522730112075806, + "step": 4229, + "temperature": 0.9 + }, + { + "advantages": -1.1673995686578564e-05, + "completion_length": 496.0, + "delta_ref_entropy_loss": -0.040771484375, + "delta_ref_ppl": -0.080078125, + "entropy_loss": -0.318359375, + "epoch": 0.846, + "grad_norm": 2.2427307317968492, + "k1_kl": 0.08056640625, + "k3_kl": 0.1005859375, + "kimi_kl": 0.15234375, + "learning_rate": 7.7e-08, + "loss": 0.004, + "ppl": 0.1396484375, + "reward": 0.6413066387176514, + "reward_std": 0.004995984956622124, + "rewards/perpo_ocr_edit_distance_reward": 0.6413067579269409, + "step": 4230, + "temperature": 0.9 + }, + { + "advantages": -4.815204010810703e-05, + "completion_length": 1568.0, + "delta_ref_entropy_loss": -0.10205078125, + "delta_ref_ppl": -0.011474609375, + "entropy_loss": -0.1484375, + "epoch": 0.8462, + "grad_norm": 0.5433376805089009, + "k1_kl": 0.0115966796875, + "k3_kl": 0.01953125, + "kimi_kl": 0.059814453125, + "learning_rate": 7.69e-08, + "loss": 0.0008, + "ppl": 0.034423828125, + "reward": 0.8835096955299377, + "reward_std": 0.001491308445110917, + "rewards/perpo_ocr_edit_distance_reward": 0.8835098147392273, + "step": 4231, + "temperature": 0.9 + }, + { + "advantages": 1.7029898913278885e-07, + "completion_length": 221.0, + "delta_ref_entropy_loss": -0.486328125, + "delta_ref_ppl": -0.1796875, + "entropy_loss": -1.0390625, + "epoch": 0.8464, + "grad_norm": 4.762717568680645, + "k1_kl": 0.1796875, + "k3_kl": 0.2177734375, + "kimi_kl": 0.58984375, + "learning_rate": 7.679999999999999e-08, + "loss": 0.0087, + "ppl": 0.470703125, + "reward": 0.4796038866043091, + "reward_std": 0.10055968165397644, + "rewards/perpo_ocr_edit_distance_reward": 0.47960391640663147, + "step": 4232, + "temperature": 0.9 + }, + { + "advantages": -4.7292029194068164e-05, + "completion_length": 486.0, + "delta_ref_entropy_loss": 0.0390625, + "delta_ref_ppl": -0.0791015625, + "entropy_loss": -0.10546875, + "epoch": 0.8466, + "grad_norm": 0.9014291967214851, + "k1_kl": 0.0791015625, + "k3_kl": 0.058349609375, + "kimi_kl": 0.1923828125, + "learning_rate": 7.67e-08, + "loss": 0.0024, + "ppl": 0.04296875, + "reward": 0.9944649338722229, + "reward_std": 0.001160806161351502, + "rewards/perpo_ocr_edit_distance_reward": 0.9944649934768677, + "step": 4233, + "temperature": 0.9 + }, + { + "advantages": -1.3078963092993945e-05, + "completion_length": 519.0, + "delta_ref_entropy_loss": 0.061279296875, + "delta_ref_ppl": -0.1083984375, + "entropy_loss": -0.140625, + "epoch": 0.8468, + "grad_norm": 1.0833803849764558, + "k1_kl": 0.1083984375, + "k3_kl": 0.06884765625, + "kimi_kl": 0.2099609375, + "learning_rate": 7.66e-08, + "loss": 0.0028, + "ppl": 0.055419921875, + "reward": 0.9330478310585022, + "reward_std": 0.0018513418035581708, + "rewards/perpo_ocr_edit_distance_reward": 0.933047890663147, + "step": 4234, + "temperature": 0.9 + }, + { + "advantages": -0.00010376317368354648, + "completion_length": 483.0, + "delta_ref_entropy_loss": 0.0198974609375, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.058349609375, + "epoch": 0.847, + "grad_norm": 0.4677200339701467, + "k1_kl": 0.0703125, + "k3_kl": 0.051513671875, + "kimi_kl": 0.1591796875, + "learning_rate": 7.649999999999999e-08, + "loss": 0.0022, + "ppl": 0.021728515625, + "reward": 0.9938276410102844, + "reward_std": 0.000556505226995796, + "rewards/perpo_ocr_edit_distance_reward": 0.9938277006149292, + "step": 4235, + "temperature": 0.9 + }, + { + "advantages": -2.0776476503669983e-06, + "completion_length": 1436.0, + "delta_ref_entropy_loss": -0.1728515625, + "delta_ref_ppl": -0.0166015625, + "entropy_loss": -0.333984375, + "epoch": 0.8472, + "grad_norm": 2.202933285967225, + "k1_kl": 0.01708984375, + "k3_kl": 0.043701171875, + "kimi_kl": 0.0810546875, + "learning_rate": 7.64e-08, + "loss": 0.0017, + "ppl": 0.1396484375, + "reward": 0.9222152829170227, + "reward_std": 0.03279890492558479, + "rewards/perpo_ocr_edit_distance_reward": 0.9222153425216675, + "step": 4236, + "temperature": 0.9 + }, + { + "advantages": -3.467287388048135e-05, + "completion_length": 567.0, + "delta_ref_entropy_loss": 0.04052734375, + "delta_ref_ppl": -0.059814453125, + "entropy_loss": -0.05126953125, + "epoch": 0.8474, + "grad_norm": 0.8610228032022548, + "k1_kl": 0.059814453125, + "k3_kl": 0.03857421875, + "kimi_kl": 0.1064453125, + "learning_rate": 7.63e-08, + "loss": 0.0016, + "ppl": 0.017822265625, + "reward": 0.9955511093139648, + "reward_std": 0.00186396692879498, + "rewards/perpo_ocr_edit_distance_reward": 0.9955512285232544, + "step": 4237, + "temperature": 0.9 + }, + { + "advantages": -9.735993080539629e-05, + "completion_length": 733.0, + "delta_ref_entropy_loss": 0.0145263671875, + "delta_ref_ppl": -0.053466796875, + "entropy_loss": -0.041748046875, + "epoch": 0.8476, + "grad_norm": 0.3118765908359714, + "k1_kl": 0.053466796875, + "k3_kl": 0.033935546875, + "kimi_kl": 0.12353515625, + "learning_rate": 7.62e-08, + "loss": 0.0015, + "ppl": 0.01129150390625, + "reward": 0.997494101524353, + "reward_std": 0.0005996341933496296, + "rewards/perpo_ocr_edit_distance_reward": 0.9974942207336426, + "step": 4238, + "temperature": 0.9 + }, + { + "advantages": -2.6771002012537792e-05, + "completion_length": 1316.0, + "delta_ref_entropy_loss": 0.033935546875, + "delta_ref_ppl": -0.056396484375, + "entropy_loss": -0.08154296875, + "epoch": 0.8478, + "grad_norm": 0.7832097062725415, + "k1_kl": 0.056396484375, + "k3_kl": 0.03369140625, + "kimi_kl": 0.0810546875, + "learning_rate": 7.61e-08, + "loss": 0.0014, + "ppl": 0.033935546875, + "reward": 0.9871737360954285, + "reward_std": 0.0024451876524835825, + "rewards/perpo_ocr_edit_distance_reward": 0.9871737957000732, + "step": 4239, + "temperature": 0.9 + }, + { + "advantages": -3.576278913897113e-06, + "completion_length": 91.0, + "delta_ref_entropy_loss": 0.056884765625, + "delta_ref_ppl": -0.3203125, + "entropy_loss": -0.2021484375, + "epoch": 0.848, + "grad_norm": 2.682595538790947, + "k1_kl": 0.3203125, + "k3_kl": 0.240234375, + "kimi_kl": 0.75, + "learning_rate": 7.599999999999999e-08, + "loss": 0.0096, + "ppl": 0.076171875, + "reward": 0.9722549915313721, + "reward_std": 0.009457403793931007, + "rewards/perpo_ocr_edit_distance_reward": 0.9722550511360168, + "step": 4240, + "temperature": 0.9 + }, + { + "advantages": -1.4722348169016186e-05, + "completion_length": 561.0, + "delta_ref_entropy_loss": -0.058349609375, + "delta_ref_ppl": -0.04248046875, + "entropy_loss": -0.138671875, + "epoch": 0.8482, + "grad_norm": 0.6887684377905426, + "k1_kl": 0.04248046875, + "k3_kl": 0.03857421875, + "kimi_kl": 0.1337890625, + "learning_rate": 7.59e-08, + "loss": 0.0016, + "ppl": 0.031494140625, + "reward": 0.804876983165741, + "reward_std": 0.004532895516604185, + "rewards/perpo_ocr_edit_distance_reward": 0.8048770427703857, + "step": 4241, + "temperature": 0.9 + }, + { + "advantages": -3.060272865695879e-05, + "completion_length": 525.0, + "delta_ref_entropy_loss": 0.0216064453125, + "delta_ref_ppl": -0.02587890625, + "entropy_loss": -0.032958984375, + "epoch": 0.8484, + "grad_norm": 0.23332396328721894, + "k1_kl": 0.0257568359375, + "k3_kl": 0.01416015625, + "kimi_kl": 0.041748046875, + "learning_rate": 7.58e-08, + "loss": 0.0006, + "ppl": 0.0086669921875, + "reward": 0.991391658782959, + "reward_std": 0.0010128953726962209, + "rewards/perpo_ocr_edit_distance_reward": 0.9913918375968933, + "step": 4242, + "temperature": 0.9 + }, + { + "advantages": -1.3453620795189636e-06, + "completion_length": 851.0, + "delta_ref_entropy_loss": 0.0927734375, + "delta_ref_ppl": -0.126953125, + "entropy_loss": -0.326171875, + "epoch": 0.8486, + "grad_norm": 2.2114595371744357, + "k1_kl": 0.126953125, + "k3_kl": 0.0771484375, + "kimi_kl": 0.2060546875, + "learning_rate": 7.57e-08, + "loss": 0.0031, + "ppl": 0.169921875, + "reward": 0.733260989189148, + "reward_std": 0.006398678291589022, + "rewards/perpo_ocr_edit_distance_reward": 0.7332610487937927, + "step": 4243, + "temperature": 0.9 + }, + { + "advantages": 1.3777188542007934e-05, + "completion_length": 849.0, + "delta_ref_entropy_loss": 0.02734375, + "delta_ref_ppl": -0.06884765625, + "entropy_loss": -0.06005859375, + "epoch": 0.8488, + "grad_norm": 0.3794794412505144, + "k1_kl": 0.06884765625, + "k3_kl": 0.048828125, + "kimi_kl": 0.197265625, + "learning_rate": 7.56e-08, + "loss": 0.0019, + "ppl": 0.0189208984375, + "reward": 0.9951999187469482, + "reward_std": 0.0005181871238164604, + "rewards/perpo_ocr_edit_distance_reward": 0.9951999187469482, + "step": 4244, + "temperature": 0.9 + }, + { + "advantages": -0.00019412381516303867, + "completion_length": 802.0, + "delta_ref_entropy_loss": 0.033203125, + "delta_ref_ppl": -0.058837890625, + "entropy_loss": -0.056396484375, + "epoch": 0.849, + "grad_norm": 0.28139380747793646, + "k1_kl": 0.05859375, + "k3_kl": 0.038330078125, + "kimi_kl": 0.11767578125, + "learning_rate": 7.55e-08, + "loss": 0.0017, + "ppl": 0.0185546875, + "reward": 0.9970769286155701, + "reward_std": 0.0005140745779499412, + "rewards/perpo_ocr_edit_distance_reward": 0.9970770478248596, + "step": 4245, + "temperature": 0.9 + }, + { + "advantages": -7.748603820800781e-06, + "completion_length": 429.0, + "delta_ref_entropy_loss": -0.0108642578125, + "delta_ref_ppl": -0.08349609375, + "entropy_loss": -0.0751953125, + "epoch": 0.8492, + "grad_norm": 1.2395381836690367, + "k1_kl": 0.08349609375, + "k3_kl": 0.0654296875, + "kimi_kl": 0.265625, + "learning_rate": 7.539999999999999e-08, + "loss": 0.0026, + "ppl": 0.026611328125, + "reward": 0.9771395325660706, + "reward_std": 0.00650597782805562, + "rewards/perpo_ocr_edit_distance_reward": 0.9771395921707153, + "step": 4246, + "temperature": 0.9 + }, + { + "advantages": -9.546961518935859e-05, + "completion_length": 379.0, + "delta_ref_entropy_loss": 0.00909423828125, + "delta_ref_ppl": -0.09521484375, + "entropy_loss": -0.05517578125, + "epoch": 0.8494, + "grad_norm": 0.6977266626115529, + "k1_kl": 0.09521484375, + "k3_kl": 0.07568359375, + "kimi_kl": 0.337890625, + "learning_rate": 7.53e-08, + "loss": 0.0031, + "ppl": 0.0277099609375, + "reward": 0.9942495822906494, + "reward_std": 0.0008811687584966421, + "rewards/perpo_ocr_edit_distance_reward": 0.9942496418952942, + "step": 4247, + "temperature": 0.9 + }, + { + "advantages": -8.514949740856537e-07, + "completion_length": 1205.0, + "delta_ref_entropy_loss": -0.0172119140625, + "delta_ref_ppl": -0.029296875, + "entropy_loss": -0.06640625, + "epoch": 0.8496, + "grad_norm": 1.3127341639540822, + "k1_kl": 0.029296875, + "k3_kl": 0.028564453125, + "kimi_kl": 0.08447265625, + "learning_rate": 7.52e-08, + "loss": 0.0011, + "ppl": 0.027587890625, + "reward": 0.872905969619751, + "reward_std": 0.062177903950214386, + "rewards/perpo_ocr_edit_distance_reward": 0.872905969619751, + "step": 4248, + "temperature": 0.9 + }, + { + "advantages": -2.258164568047505e-05, + "completion_length": 393.0, + "delta_ref_entropy_loss": 0.031494140625, + "delta_ref_ppl": -0.11083984375, + "entropy_loss": -0.1318359375, + "epoch": 0.8498, + "grad_norm": 1.7877866252776484, + "k1_kl": 0.11083984375, + "k3_kl": 0.076171875, + "kimi_kl": 0.22265625, + "learning_rate": 7.509999999999999e-08, + "loss": 0.0031, + "ppl": 0.06005859375, + "reward": 0.9392510056495667, + "reward_std": 0.0036695690359920263, + "rewards/perpo_ocr_edit_distance_reward": 0.9392511248588562, + "step": 4249, + "temperature": 0.9 + }, + { + "advantages": -3.62736864190083e-06, + "completion_length": 1041.0, + "delta_ref_entropy_loss": 0.044189453125, + "delta_ref_ppl": -0.05615234375, + "entropy_loss": -0.201171875, + "epoch": 0.85, + "grad_norm": 69.38786306098825, + "k1_kl": 0.05615234375, + "k3_kl": 1.203125, + "kimi_kl": 0.0927734375, + "learning_rate": 7.5e-08, + "loss": 0.0483, + "ppl": 0.10400390625, + "reward": 0.9248486757278442, + "reward_std": 0.009275414049625397, + "rewards/perpo_ocr_edit_distance_reward": 0.924848735332489, + "step": 4250, + "temperature": 0.9 + }, + { + "advantages": -4.495893335842993e-06, + "completion_length": 571.0, + "delta_ref_entropy_loss": -0.09619140625, + "delta_ref_ppl": -0.1318359375, + "entropy_loss": -0.51171875, + "epoch": 0.8502, + "grad_norm": 3.682662090909223, + "k1_kl": 0.1318359375, + "k3_kl": 0.11572265625, + "kimi_kl": 0.267578125, + "learning_rate": 7.489999999999999e-08, + "loss": 0.0046, + "ppl": 0.2353515625, + "reward": 0.8136569857597351, + "reward_std": 0.00936655793339014, + "rewards/perpo_ocr_edit_distance_reward": 0.8136571049690247, + "step": 4251, + "temperature": 0.9 + }, + { + "advantages": -5.262239028525073e-06, + "completion_length": 374.0, + "delta_ref_entropy_loss": 0.0186767578125, + "delta_ref_ppl": -0.1708984375, + "entropy_loss": -0.171875, + "epoch": 0.8504, + "grad_norm": 1.5016555301849481, + "k1_kl": 0.1708984375, + "k3_kl": 0.1220703125, + "kimi_kl": 0.470703125, + "learning_rate": 7.480000000000001e-08, + "loss": 0.0049, + "ppl": 0.06689453125, + "reward": 0.3612566590309143, + "reward_std": 0.007170329801738262, + "rewards/perpo_ocr_edit_distance_reward": 0.3612566888332367, + "step": 4252, + "temperature": 0.9 + }, + { + "advantages": -3.217799530830234e-05, + "completion_length": 357.0, + "delta_ref_entropy_loss": 0.044677734375, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.04833984375, + "epoch": 0.8506, + "grad_norm": 0.4170832649358013, + "k1_kl": 0.06689453125, + "k3_kl": 0.03759765625, + "kimi_kl": 0.1025390625, + "learning_rate": 7.47e-08, + "loss": 0.0015, + "ppl": 0.01385498046875, + "reward": 0.9990260601043701, + "reward_std": 0.0004292112134862691, + "rewards/perpo_ocr_edit_distance_reward": 0.9990261197090149, + "step": 4253, + "temperature": 0.9 + }, + { + "advantages": 7.850783731555566e-06, + "completion_length": 894.0, + "delta_ref_entropy_loss": 0.023193359375, + "delta_ref_ppl": -0.0576171875, + "entropy_loss": -0.06591796875, + "epoch": 0.8508, + "grad_norm": 0.5154533068437337, + "k1_kl": 0.0576171875, + "k3_kl": 0.03759765625, + "kimi_kl": 0.1220703125, + "learning_rate": 7.459999999999999e-08, + "loss": 0.0015, + "ppl": 0.0228271484375, + "reward": 0.992534875869751, + "reward_std": 0.0009813032811507583, + "rewards/perpo_ocr_edit_distance_reward": 0.992534875869751, + "step": 4254, + "temperature": 0.9 + }, + { + "advantages": -3.823212318820879e-05, + "completion_length": 284.0, + "delta_ref_entropy_loss": 0.045166015625, + "delta_ref_ppl": -0.1298828125, + "entropy_loss": -0.076171875, + "epoch": 0.851, + "grad_norm": 1.3129680337919951, + "k1_kl": 0.1298828125, + "k3_kl": 0.091796875, + "kimi_kl": 0.34765625, + "learning_rate": 7.45e-08, + "loss": 0.0037, + "ppl": 0.0311279296875, + "reward": 0.9941753149032593, + "reward_std": 0.0021273395977914333, + "rewards/perpo_ocr_edit_distance_reward": 0.9941754341125488, + "step": 4255, + "temperature": 0.9 + }, + { + "advantages": -3.669943225759198e-06, + "completion_length": 370.0, + "delta_ref_entropy_loss": 0.0245361328125, + "delta_ref_ppl": -0.1708984375, + "entropy_loss": -0.392578125, + "epoch": 0.8512, + "grad_norm": 2.6100250899038375, + "k1_kl": 0.171875, + "k3_kl": 0.1337890625, + "kimi_kl": 0.423828125, + "learning_rate": 7.439999999999999e-08, + "loss": 0.0054, + "ppl": 0.1875, + "reward": 0.733905553817749, + "reward_std": 0.009271464310586452, + "rewards/perpo_ocr_edit_distance_reward": 0.7339056134223938, + "step": 4256, + "temperature": 0.9 + }, + { + "advantages": -5.338873597793281e-05, + "completion_length": 740.0, + "delta_ref_entropy_loss": 0.04052734375, + "delta_ref_ppl": -0.087890625, + "entropy_loss": -0.10986328125, + "epoch": 0.8514, + "grad_norm": 3.0396865837681513, + "k1_kl": 0.087890625, + "k3_kl": 0.054443359375, + "kimi_kl": 0.11767578125, + "learning_rate": 7.43e-08, + "loss": 0.0022, + "ppl": 0.048095703125, + "reward": 0.9891270995140076, + "reward_std": 0.0014948652824386954, + "rewards/perpo_ocr_edit_distance_reward": 0.9891272187232971, + "step": 4257, + "temperature": 0.9 + }, + { + "advantages": -9.707042408990674e-06, + "completion_length": 534.0, + "delta_ref_entropy_loss": 0.06494140625, + "delta_ref_ppl": -0.0986328125, + "entropy_loss": -0.16796875, + "epoch": 0.8516, + "grad_norm": 1.1782186727404882, + "k1_kl": 0.09814453125, + "k3_kl": 0.060302734375, + "kimi_kl": 0.1591796875, + "learning_rate": 7.42e-08, + "loss": 0.0024, + "ppl": 0.07421875, + "reward": 0.8033115267753601, + "reward_std": 0.001653090352192521, + "rewards/perpo_ocr_edit_distance_reward": 0.8033116459846497, + "step": 4258, + "temperature": 0.9 + }, + { + "advantages": 1.021793991640152e-07, + "completion_length": 2048.0, + "delta_ref_entropy_loss": -1.7578125, + "delta_ref_ppl": 0.029541015625, + "entropy_loss": -3.28125, + "epoch": 0.8518, + "grad_norm": 9.004382761645823, + "k1_kl": -0.0341796875, + "k3_kl": 0.390625, + "kimi_kl": 0.5078125, + "learning_rate": 7.41e-08, + "loss": 0.0156, + "ppl": 1.7109375, + "reward": 0.41889259219169617, + "reward_std": 0.1569739133119583, + "rewards/perpo_ocr_edit_distance_reward": 0.41889259219169617, + "step": 4259, + "temperature": 0.9 + }, + { + "advantages": -3.397464979570941e-06, + "completion_length": 937.0, + "delta_ref_entropy_loss": 0.0030670166015625, + "delta_ref_ppl": -0.056640625, + "entropy_loss": -0.061767578125, + "epoch": 0.852, + "grad_norm": 0.7596049389374202, + "k1_kl": 0.056640625, + "k3_kl": 0.046630859375, + "kimi_kl": 0.1025390625, + "learning_rate": 7.399999999999999e-08, + "loss": 0.0019, + "ppl": 0.025634765625, + "reward": 0.9712762236595154, + "reward_std": 0.0024038006085902452, + "rewards/perpo_ocr_edit_distance_reward": 0.9712763428688049, + "step": 4260, + "temperature": 0.9 + }, + { + "advantages": -2.0129340555286035e-05, + "completion_length": 550.0, + "delta_ref_entropy_loss": 0.028564453125, + "delta_ref_ppl": -0.1240234375, + "entropy_loss": -0.2373046875, + "epoch": 0.8522, + "grad_norm": 1.5242687241877768, + "k1_kl": 0.12353515625, + "k3_kl": 0.08154296875, + "kimi_kl": 0.1826171875, + "learning_rate": 7.389999999999999e-08, + "loss": 0.0033, + "ppl": 0.10107421875, + "reward": 0.9384016394615173, + "reward_std": 0.0028561283834278584, + "rewards/perpo_ocr_edit_distance_reward": 0.9384017586708069, + "step": 4261, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 426.0, + "delta_ref_entropy_loss": 0.0203857421875, + "delta_ref_ppl": -0.0400390625, + "entropy_loss": -0.0400390625, + "epoch": 0.8524, + "grad_norm": 1.1175071634042886, + "k1_kl": 0.0400390625, + "k3_kl": 0.0272216796875, + "kimi_kl": 0.06689453125, + "learning_rate": 7.38e-08, + "loss": 0.0011, + "ppl": 0.016845703125, + "reward": 0.9979317784309387, + "reward_std": 0.0011758985929191113, + "rewards/perpo_ocr_edit_distance_reward": 0.9979318380355835, + "step": 4262, + "temperature": 0.9 + }, + { + "advantages": -9.191888239001855e-05, + "completion_length": 555.0, + "delta_ref_entropy_loss": 0.03271484375, + "delta_ref_ppl": -0.042724609375, + "entropy_loss": -0.038818359375, + "epoch": 0.8526, + "grad_norm": 0.3919741579810499, + "k1_kl": 0.042724609375, + "k3_kl": 0.0240478515625, + "kimi_kl": 0.072265625, + "learning_rate": 7.37e-08, + "loss": 0.0011, + "ppl": 0.01202392578125, + "reward": 0.9968761205673218, + "reward_std": 0.0006411552894860506, + "rewards/perpo_ocr_edit_distance_reward": 0.9968762397766113, + "step": 4263, + "temperature": 0.9 + }, + { + "advantages": -6.297657091636211e-05, + "completion_length": 1171.0, + "delta_ref_entropy_loss": 0.029052734375, + "delta_ref_ppl": -0.045654296875, + "entropy_loss": -0.07470703125, + "epoch": 0.8528, + "grad_norm": 0.598680911628301, + "k1_kl": 0.0458984375, + "k3_kl": 0.0272216796875, + "kimi_kl": 0.059814453125, + "learning_rate": 7.36e-08, + "loss": 0.0012, + "ppl": 0.03125, + "reward": 0.9920130968093872, + "reward_std": 0.0005762620130553842, + "rewards/perpo_ocr_edit_distance_reward": 0.992013156414032, + "step": 4264, + "temperature": 0.9 + }, + { + "advantages": -6.10777351539582e-05, + "completion_length": 1705.0, + "delta_ref_entropy_loss": 0.00537109375, + "delta_ref_ppl": -0.03662109375, + "entropy_loss": -0.055419921875, + "epoch": 0.853, + "grad_norm": 1.4294697789439945, + "k1_kl": 0.03662109375, + "k3_kl": 0.0286865234375, + "kimi_kl": 0.0703125, + "learning_rate": 7.349999999999999e-08, + "loss": 0.0012, + "ppl": 0.026611328125, + "reward": 0.9719500541687012, + "reward_std": 0.0011544760782271624, + "rewards/perpo_ocr_edit_distance_reward": 0.9719501733779907, + "step": 4265, + "temperature": 0.9 + }, + { + "advantages": -8.97049976629205e-05, + "completion_length": 668.0, + "delta_ref_entropy_loss": 0.018310546875, + "delta_ref_ppl": -0.052978515625, + "entropy_loss": -0.0322265625, + "epoch": 0.8532, + "grad_norm": 0.35787669691058877, + "k1_kl": 0.052978515625, + "k3_kl": 0.034423828125, + "kimi_kl": 0.13671875, + "learning_rate": 7.340000000000001e-08, + "loss": 0.0015, + "ppl": 0.007598876953125, + "reward": 0.9974377751350403, + "reward_std": 0.00027969246730208397, + "rewards/perpo_ocr_edit_distance_reward": 0.9974377751350403, + "step": 4266, + "temperature": 0.9 + }, + { + "advantages": -4.427773774295929e-07, + "completion_length": 824.0, + "delta_ref_entropy_loss": -0.1171875, + "delta_ref_ppl": -0.03955078125, + "entropy_loss": -0.208984375, + "epoch": 0.8534, + "grad_norm": 1.4330085663193801, + "k1_kl": 0.03955078125, + "k3_kl": 0.04443359375, + "kimi_kl": 0.16015625, + "learning_rate": 7.33e-08, + "loss": 0.0018, + "ppl": 0.06494140625, + "reward": 0.4276619553565979, + "reward_std": 0.0555371455848217, + "rewards/perpo_ocr_edit_distance_reward": 0.4276620149612427, + "step": 4267, + "temperature": 0.9 + }, + { + "advantages": -8.227144280681387e-05, + "completion_length": 538.0, + "delta_ref_entropy_loss": 0.047607421875, + "delta_ref_ppl": -0.054443359375, + "entropy_loss": -0.05615234375, + "epoch": 0.8536, + "grad_norm": 0.4049700252687473, + "k1_kl": 0.05419921875, + "k3_kl": 0.031982421875, + "kimi_kl": 0.09619140625, + "learning_rate": 7.32e-08, + "loss": 0.0014, + "ppl": 0.017333984375, + "reward": 0.997900128364563, + "reward_std": 0.0005208424990996718, + "rewards/perpo_ocr_edit_distance_reward": 0.9979001879692078, + "step": 4268, + "temperature": 0.9 + }, + { + "advantages": -2.900191793742124e-05, + "completion_length": 713.0, + "delta_ref_entropy_loss": 0.0125732421875, + "delta_ref_ppl": -0.03515625, + "entropy_loss": -0.044189453125, + "epoch": 0.8538, + "grad_norm": 0.4424559991640023, + "k1_kl": 0.03515625, + "k3_kl": 0.02587890625, + "kimi_kl": 0.05615234375, + "learning_rate": 7.31e-08, + "loss": 0.0011, + "ppl": 0.0177001953125, + "reward": 0.9817267060279846, + "reward_std": 0.0016609705053269863, + "rewards/perpo_ocr_edit_distance_reward": 0.9817268252372742, + "step": 4269, + "temperature": 0.9 + }, + { + "advantages": -5.8165620430372655e-05, + "completion_length": 1088.0, + "delta_ref_entropy_loss": 0.032470703125, + "delta_ref_ppl": -0.0634765625, + "entropy_loss": -0.10791015625, + "epoch": 0.854, + "grad_norm": 1.7911746380407156, + "k1_kl": 0.0634765625, + "k3_kl": 0.03955078125, + "kimi_kl": 0.11279296875, + "learning_rate": 7.299999999999999e-08, + "loss": 0.0016, + "ppl": 0.05126953125, + "reward": 0.9908103942871094, + "reward_std": 0.0015102324541658163, + "rewards/perpo_ocr_edit_distance_reward": 0.9908104538917542, + "step": 4270, + "temperature": 0.9 + }, + { + "advantages": -1.9558839994715527e-05, + "completion_length": 727.0, + "delta_ref_entropy_loss": 0.0211181640625, + "delta_ref_ppl": -0.06494140625, + "entropy_loss": -0.203125, + "epoch": 0.8542, + "grad_norm": 1.0008834241993276, + "k1_kl": 0.06494140625, + "k3_kl": 0.046142578125, + "kimi_kl": 0.09912109375, + "learning_rate": 7.29e-08, + "loss": 0.0019, + "ppl": 0.103515625, + "reward": 0.9611400365829468, + "reward_std": 0.0007713837549090385, + "rewards/perpo_ocr_edit_distance_reward": 0.9611400365829468, + "step": 4271, + "temperature": 0.9 + }, + { + "advantages": -1.8290111256646924e-05, + "completion_length": 540.0, + "delta_ref_entropy_loss": 0.046875, + "delta_ref_ppl": -0.115234375, + "entropy_loss": -0.107421875, + "epoch": 0.8544, + "grad_norm": 1.301856927731355, + "k1_kl": 0.115234375, + "k3_kl": 0.08642578125, + "kimi_kl": 0.310546875, + "learning_rate": 7.28e-08, + "loss": 0.0035, + "ppl": 0.047119140625, + "reward": 0.9765883088111877, + "reward_std": 0.0012969754170626402, + "rewards/perpo_ocr_edit_distance_reward": 0.9765884280204773, + "step": 4272, + "temperature": 0.9 + }, + { + "advantages": -5.609648724202998e-05, + "completion_length": 390.0, + "delta_ref_entropy_loss": 0.03271484375, + "delta_ref_ppl": -0.08056640625, + "entropy_loss": -0.03857421875, + "epoch": 0.8546, + "grad_norm": 0.18331520368844878, + "k1_kl": 0.080078125, + "k3_kl": 0.059326171875, + "kimi_kl": 0.2236328125, + "learning_rate": 7.27e-08, + "loss": 0.0024, + "ppl": 0.007659912109375, + "reward": 0.9988454580307007, + "reward_std": 0.00020363017392810434, + "rewards/perpo_ocr_edit_distance_reward": 0.9988454580307007, + "step": 4273, + "temperature": 0.9 + }, + { + "advantages": -3.510713577270508e-05, + "completion_length": 1317.0, + "delta_ref_entropy_loss": 0.00738525390625, + "delta_ref_ppl": -0.044189453125, + "entropy_loss": -0.078125, + "epoch": 0.8548, + "grad_norm": 0.9758994595887738, + "k1_kl": 0.044189453125, + "k3_kl": 0.033935546875, + "kimi_kl": 0.0966796875, + "learning_rate": 7.259999999999999e-08, + "loss": 0.0014, + "ppl": 0.03369140625, + "reward": 0.9941741824150085, + "reward_std": 0.001598660135641694, + "rewards/perpo_ocr_edit_distance_reward": 0.9941742420196533, + "step": 4274, + "temperature": 0.9 + }, + { + "advantages": -4.973581962985918e-05, + "completion_length": 1124.0, + "delta_ref_entropy_loss": 0.0223388671875, + "delta_ref_ppl": -0.054443359375, + "entropy_loss": -0.11865234375, + "epoch": 0.855, + "grad_norm": 0.6389725962060652, + "k1_kl": 0.054443359375, + "k3_kl": 0.033203125, + "kimi_kl": 0.06982421875, + "learning_rate": 7.25e-08, + "loss": 0.0014, + "ppl": 0.0478515625, + "reward": 0.9129104614257812, + "reward_std": 0.0009272008901461959, + "rewards/perpo_ocr_edit_distance_reward": 0.912910521030426, + "step": 4275, + "temperature": 0.9 + }, + { + "advantages": -4.938671054333099e-07, + "completion_length": 1078.0, + "delta_ref_entropy_loss": -0.2421875, + "delta_ref_ppl": -0.0654296875, + "entropy_loss": -0.435546875, + "epoch": 0.8552, + "grad_norm": 2.0477735822302474, + "k1_kl": 0.06591796875, + "k3_kl": 0.0908203125, + "kimi_kl": 0.208984375, + "learning_rate": 7.24e-08, + "loss": 0.0036, + "ppl": 0.16796875, + "reward": 0.3022731840610504, + "reward_std": 0.048581812530756, + "rewards/perpo_ocr_edit_distance_reward": 0.3022732138633728, + "step": 4276, + "temperature": 0.9 + }, + { + "advantages": -5.705016064894153e-06, + "completion_length": 297.0, + "delta_ref_entropy_loss": 0.060302734375, + "delta_ref_ppl": -0.166015625, + "entropy_loss": -0.09716796875, + "epoch": 0.8554, + "grad_norm": 7.856809877035411, + "k1_kl": 0.166015625, + "k3_kl": 0.1357421875, + "kimi_kl": 0.439453125, + "learning_rate": 7.23e-08, + "loss": 0.0055, + "ppl": 0.037109375, + "reward": 0.9262458682060242, + "reward_std": 0.0013916424941271544, + "rewards/perpo_ocr_edit_distance_reward": 0.9262458682060242, + "step": 4277, + "temperature": 0.9 + }, + { + "advantages": -1.106943500417401e-06, + "completion_length": 361.0, + "delta_ref_entropy_loss": -0.018310546875, + "delta_ref_ppl": -0.10009765625, + "entropy_loss": -0.3671875, + "epoch": 0.8556, + "grad_norm": 2.57928090950595, + "k1_kl": 0.1005859375, + "k3_kl": 0.07861328125, + "kimi_kl": 0.2109375, + "learning_rate": 7.22e-08, + "loss": 0.0031, + "ppl": 0.16796875, + "reward": 0.7622166872024536, + "reward_std": 0.030414599925279617, + "rewards/perpo_ocr_edit_distance_reward": 0.7622167468070984, + "step": 4278, + "temperature": 0.9 + }, + { + "advantages": -2.0316669179010205e-05, + "completion_length": 1129.0, + "delta_ref_entropy_loss": 0.0078125, + "delta_ref_ppl": -0.08203125, + "entropy_loss": -0.1845703125, + "epoch": 0.8558, + "grad_norm": 4.4360610817026105, + "k1_kl": 0.08154296875, + "k3_kl": 0.09619140625, + "kimi_kl": 0.27734375, + "learning_rate": 7.209999999999999e-08, + "loss": 0.0039, + "ppl": 0.09130859375, + "reward": 0.9269213080406189, + "reward_std": 0.0028340527787804604, + "rewards/perpo_ocr_edit_distance_reward": 0.9269213676452637, + "step": 4279, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 751.0, + "delta_ref_entropy_loss": 0.006500244140625, + "delta_ref_ppl": -0.04931640625, + "entropy_loss": -0.10205078125, + "epoch": 0.856, + "grad_norm": 1.1305126250321569, + "k1_kl": 0.049560546875, + "k3_kl": 0.037109375, + "kimi_kl": 0.08203125, + "learning_rate": 7.2e-08, + "loss": 0.0015, + "ppl": 0.05126953125, + "reward": 0.9795433878898621, + "reward_std": 0.001039880677126348, + "rewards/perpo_ocr_edit_distance_reward": 0.9795434474945068, + "step": 4280, + "temperature": 0.9 + }, + { + "advantages": -6.386212135112146e-06, + "completion_length": 769.0, + "delta_ref_entropy_loss": -0.00921630859375, + "delta_ref_ppl": -0.09033203125, + "entropy_loss": -0.51171875, + "epoch": 0.8562, + "grad_norm": 2.156336579642896, + "k1_kl": 0.08935546875, + "k3_kl": 0.06591796875, + "kimi_kl": 0.1240234375, + "learning_rate": 7.19e-08, + "loss": 0.0026, + "ppl": 0.271484375, + "reward": 0.8885780572891235, + "reward_std": 0.013218911364674568, + "rewards/perpo_ocr_edit_distance_reward": 0.8885781764984131, + "step": 4281, + "temperature": 0.9 + }, + { + "advantages": -8.446830179309472e-06, + "completion_length": 530.0, + "delta_ref_entropy_loss": 0.0, + "delta_ref_ppl": -0.1240234375, + "entropy_loss": -0.357421875, + "epoch": 0.8564, + "grad_norm": 2.3369851129307793, + "k1_kl": 0.1240234375, + "k3_kl": 0.0869140625, + "kimi_kl": 0.201171875, + "learning_rate": 7.18e-08, + "loss": 0.0035, + "ppl": 0.158203125, + "reward": 0.8848907947540283, + "reward_std": 0.010995358228683472, + "rewards/perpo_ocr_edit_distance_reward": 0.8848908543586731, + "step": 4282, + "temperature": 0.9 + }, + { + "advantages": -7.339886451518396e-06, + "completion_length": 511.0, + "delta_ref_entropy_loss": 0.006134033203125, + "delta_ref_ppl": -0.10546875, + "entropy_loss": -0.1708984375, + "epoch": 0.8566, + "grad_norm": 3.386839638007473, + "k1_kl": 0.10546875, + "k3_kl": 0.07861328125, + "kimi_kl": 0.2021484375, + "learning_rate": 7.17e-08, + "loss": 0.0032, + "ppl": 0.076171875, + "reward": 0.9636873602867126, + "reward_std": 0.004541839472949505, + "rewards/perpo_ocr_edit_distance_reward": 0.9636874198913574, + "step": 4283, + "temperature": 0.9 + }, + { + "advantages": -3.576278913897113e-06, + "completion_length": 389.0, + "delta_ref_entropy_loss": 0.009765625, + "delta_ref_ppl": -0.1865234375, + "entropy_loss": -0.52734375, + "epoch": 0.8568, + "grad_norm": 2.691487658903844, + "k1_kl": 0.1865234375, + "k3_kl": 0.1474609375, + "kimi_kl": 0.3671875, + "learning_rate": 7.159999999999999e-08, + "loss": 0.0059, + "ppl": 0.2421875, + "reward": 0.9149540066719055, + "reward_std": 0.011836160905659199, + "rewards/perpo_ocr_edit_distance_reward": 0.9149540662765503, + "step": 4284, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 170.0, + "delta_ref_entropy_loss": 0.0233154296875, + "delta_ref_ppl": -0.26953125, + "entropy_loss": -0.1337890625, + "epoch": 0.857, + "grad_norm": 1.5479342079988745, + "k1_kl": 0.26953125, + "k3_kl": 0.2216796875, + "kimi_kl": 1.015625, + "learning_rate": 7.149999999999999e-08, + "loss": 0.0089, + "ppl": 0.0517578125, + "reward": 0.9909617900848389, + "reward_std": 0.0032661003060638905, + "rewards/perpo_ocr_edit_distance_reward": 0.9909618496894836, + "step": 4285, + "temperature": 0.9 + }, + { + "advantages": -1.4083726455282886e-05, + "completion_length": 560.0, + "delta_ref_entropy_loss": -0.005157470703125, + "delta_ref_ppl": -0.08544921875, + "entropy_loss": -0.0986328125, + "epoch": 0.8572, + "grad_norm": 0.3362472151097639, + "k1_kl": 0.08544921875, + "k3_kl": 0.0654296875, + "kimi_kl": 0.23046875, + "learning_rate": 7.14e-08, + "loss": 0.0026, + "ppl": 0.0286865234375, + "reward": 0.9919155240058899, + "reward_std": 0.0017134507652372122, + "rewards/perpo_ocr_edit_distance_reward": 0.9919155836105347, + "step": 4286, + "temperature": 0.9 + }, + { + "advantages": -1.1529241419339087e-05, + "completion_length": 196.0, + "delta_ref_entropy_loss": 0.07568359375, + "delta_ref_ppl": -0.19921875, + "entropy_loss": -0.08642578125, + "epoch": 0.8574, + "grad_norm": 1.0377496688048258, + "k1_kl": 0.2001953125, + "k3_kl": 0.1376953125, + "kimi_kl": 0.419921875, + "learning_rate": 7.13e-08, + "loss": 0.0055, + "ppl": 0.038818359375, + "reward": 0.9855042099952698, + "reward_std": 0.0035933121107518673, + "rewards/perpo_ocr_edit_distance_reward": 0.9855042695999146, + "step": 4287, + "temperature": 0.9 + }, + { + "advantages": -7.493155749216385e-07, + "completion_length": 484.0, + "delta_ref_entropy_loss": -0.055419921875, + "delta_ref_ppl": -0.12451171875, + "entropy_loss": -0.5625, + "epoch": 0.8576, + "grad_norm": 3.9544406558626326, + "k1_kl": 0.125, + "k3_kl": 0.14453125, + "kimi_kl": 0.234375, + "learning_rate": 7.12e-08, + "loss": 0.0058, + "ppl": 0.296875, + "reward": 0.8642407655715942, + "reward_std": 0.044291552156209946, + "rewards/perpo_ocr_edit_distance_reward": 0.864240825176239, + "step": 4288, + "temperature": 0.9 + }, + { + "advantages": -8.472375156998169e-06, + "completion_length": 327.0, + "delta_ref_entropy_loss": -0.048828125, + "delta_ref_ppl": -0.1435546875, + "entropy_loss": -0.419921875, + "epoch": 0.8578, + "grad_norm": 2.4125180355530342, + "k1_kl": 0.142578125, + "k3_kl": 0.11083984375, + "kimi_kl": 0.31640625, + "learning_rate": 7.11e-08, + "loss": 0.0044, + "ppl": 0.1953125, + "reward": 0.7598448395729065, + "reward_std": 0.006950002629309893, + "rewards/perpo_ocr_edit_distance_reward": 0.759844958782196, + "step": 4289, + "temperature": 0.9 + }, + { + "advantages": -6.130763949840912e-07, + "completion_length": 163.0, + "delta_ref_entropy_loss": -0.00390625, + "delta_ref_ppl": -0.2119140625, + "entropy_loss": -0.2236328125, + "epoch": 0.858, + "grad_norm": 1.699500564503276, + "k1_kl": 0.2119140625, + "k3_kl": 0.2216796875, + "kimi_kl": 0.6953125, + "learning_rate": 7.099999999999999e-08, + "loss": 0.0089, + "ppl": 0.09326171875, + "reward": 0.8476189970970154, + "reward_std": 0.1263500154018402, + "rewards/perpo_ocr_edit_distance_reward": 0.8476191163063049, + "step": 4290, + "temperature": 0.9 + }, + { + "advantages": -5.0851278501795605e-05, + "completion_length": 1245.0, + "delta_ref_entropy_loss": -0.005279541015625, + "delta_ref_ppl": -0.0272216796875, + "entropy_loss": -0.0751953125, + "epoch": 0.8582, + "grad_norm": 0.7465203280778672, + "k1_kl": 0.02734375, + "k3_kl": 0.0201416015625, + "kimi_kl": 0.043212890625, + "learning_rate": 7.09e-08, + "loss": 0.0009, + "ppl": 0.0267333984375, + "reward": 0.9948688745498657, + "reward_std": 0.0009050323278643191, + "rewards/perpo_ocr_edit_distance_reward": 0.9948689937591553, + "step": 4291, + "temperature": 0.9 + }, + { + "advantages": -2.1508762074518017e-05, + "completion_length": 49.0, + "delta_ref_entropy_loss": -0.0257568359375, + "delta_ref_ppl": -0.76953125, + "entropy_loss": -0.2265625, + "epoch": 0.8584, + "grad_norm": 3.918465814550045, + "k1_kl": 0.7734375, + "k3_kl": 0.65625, + "kimi_kl": 2.921875, + "learning_rate": 7.08e-08, + "loss": 0.0262, + "ppl": 0.080078125, + "reward": 0.6240330934524536, + "reward_std": 0.0038593593053519726, + "rewards/perpo_ocr_edit_distance_reward": 0.6240330934524536, + "step": 4292, + "temperature": 0.9 + }, + { + "advantages": -5.693095226888545e-05, + "completion_length": 889.0, + "delta_ref_entropy_loss": 0.020263671875, + "delta_ref_ppl": -0.050537109375, + "entropy_loss": -0.09033203125, + "epoch": 0.8586, + "grad_norm": 0.7443558031101598, + "k1_kl": 0.050537109375, + "k3_kl": 0.033447265625, + "kimi_kl": 0.060546875, + "learning_rate": 7.069999999999999e-08, + "loss": 0.0014, + "ppl": 0.0322265625, + "reward": 0.992502748966217, + "reward_std": 0.0012460744474083185, + "rewards/perpo_ocr_edit_distance_reward": 0.9925028085708618, + "step": 4293, + "temperature": 0.9 + }, + { + "advantages": -2.8848649890278466e-05, + "completion_length": 1034.0, + "delta_ref_entropy_loss": 0.04150390625, + "delta_ref_ppl": -0.0546875, + "entropy_loss": -0.050537109375, + "epoch": 0.8588, + "grad_norm": 50.40993782802993, + "k1_kl": 0.0546875, + "k3_kl": 0.044677734375, + "kimi_kl": 0.0869140625, + "learning_rate": 7.06e-08, + "loss": 0.0018, + "ppl": 0.0206298828125, + "reward": 0.9878187775611877, + "reward_std": 0.0010810962412506342, + "rewards/perpo_ocr_edit_distance_reward": 0.9878188967704773, + "step": 4294, + "temperature": 0.9 + }, + { + "advantages": -0.00026786327362060547, + "completion_length": 769.0, + "delta_ref_entropy_loss": 0.0179443359375, + "delta_ref_ppl": -0.046142578125, + "entropy_loss": -0.060791015625, + "epoch": 0.859, + "grad_norm": 0.25900948719784533, + "k1_kl": 0.046142578125, + "k3_kl": 0.0272216796875, + "kimi_kl": 0.0712890625, + "learning_rate": 7.049999999999999e-08, + "loss": 0.0014, + "ppl": 0.0177001953125, + "reward": 0.9756549596786499, + "reward_std": 0.0003132500860374421, + "rewards/perpo_ocr_edit_distance_reward": 0.9756550788879395, + "step": 4295, + "temperature": 0.9 + }, + { + "advantages": -9.059906005859375e-06, + "completion_length": 76.0, + "delta_ref_entropy_loss": 0.06201171875, + "delta_ref_ppl": -0.51953125, + "entropy_loss": -0.275390625, + "epoch": 0.8592, + "grad_norm": 3.691843261124478, + "k1_kl": 0.51953125, + "k3_kl": 0.435546875, + "kimi_kl": 1.875, + "learning_rate": 7.04e-08, + "loss": 0.0174, + "ppl": 0.08154296875, + "reward": 0.9591836333274841, + "reward_std": 0.007438309025019407, + "rewards/perpo_ocr_edit_distance_reward": 0.9591837525367737, + "step": 4296, + "temperature": 0.9 + }, + { + "advantages": -6.585461960639805e-05, + "completion_length": 525.0, + "delta_ref_entropy_loss": 0.05517578125, + "delta_ref_ppl": -0.0625, + "entropy_loss": -0.12255859375, + "epoch": 0.8594, + "grad_norm": 0.8188398999169111, + "k1_kl": 0.0625, + "k3_kl": 0.038330078125, + "kimi_kl": 0.0703125, + "learning_rate": 7.03e-08, + "loss": 0.0016, + "ppl": 0.0380859375, + "reward": 0.9885312914848328, + "reward_std": 0.0008050529286265373, + "rewards/perpo_ocr_edit_distance_reward": 0.9885313510894775, + "step": 4297, + "temperature": 0.9 + }, + { + "advantages": -8.514949740856537e-07, + "completion_length": 38.0, + "delta_ref_entropy_loss": -0.11669921875, + "delta_ref_ppl": -1.0, + "entropy_loss": -0.306640625, + "epoch": 0.8596, + "grad_norm": 4.916048800921919, + "k1_kl": 1.0078125, + "k3_kl": 0.86328125, + "kimi_kl": 3.8125, + "learning_rate": 7.019999999999999e-08, + "loss": 0.0345, + "ppl": 0.126953125, + "reward": 0.9511277079582214, + "reward_std": 0.009946423582732677, + "rewards/perpo_ocr_edit_distance_reward": 0.951127827167511, + "step": 4298, + "temperature": 0.9 + }, + { + "advantages": -5.628381586575415e-06, + "completion_length": 494.0, + "delta_ref_entropy_loss": 0.003631591796875, + "delta_ref_ppl": -0.11181640625, + "entropy_loss": -0.5078125, + "epoch": 0.8598, + "grad_norm": 2.732462557612807, + "k1_kl": 0.11181640625, + "k3_kl": 0.07080078125, + "kimi_kl": 0.140625, + "learning_rate": 7.01e-08, + "loss": 0.0028, + "ppl": 0.263671875, + "reward": 0.8605316877365112, + "reward_std": 0.004429900553077459, + "rewards/perpo_ocr_edit_distance_reward": 0.8605316877365112, + "step": 4299, + "temperature": 0.9 + }, + { + "advantages": -3.488575021037832e-05, + "completion_length": 145.0, + "delta_ref_entropy_loss": 0.046142578125, + "delta_ref_ppl": -0.296875, + "entropy_loss": -0.2294921875, + "epoch": 0.86, + "grad_norm": 2.088729363175334, + "k1_kl": 0.296875, + "k3_kl": 0.2216796875, + "kimi_kl": 0.94140625, + "learning_rate": 7e-08, + "loss": 0.0089, + "ppl": 0.08154296875, + "reward": 0.7463508248329163, + "reward_std": 0.0023405125830322504, + "rewards/perpo_ocr_edit_distance_reward": 0.746350884437561, + "step": 4300, + "temperature": 0.9 + }, + { + "advantages": -6.969911919441074e-05, + "completion_length": 669.0, + "delta_ref_entropy_loss": 0.004241943359375, + "delta_ref_ppl": -0.059814453125, + "entropy_loss": -0.1005859375, + "epoch": 0.8602, + "grad_norm": 0.6279462476766493, + "k1_kl": 0.059814453125, + "k3_kl": 0.036376953125, + "kimi_kl": 0.0927734375, + "learning_rate": 6.99e-08, + "loss": 0.0015, + "ppl": 0.033203125, + "reward": 0.9926036596298218, + "reward_std": 0.0009993590647354722, + "rewards/perpo_ocr_edit_distance_reward": 0.9926037788391113, + "step": 4301, + "temperature": 0.9 + }, + { + "advantages": -1.4901162103342358e-05, + "completion_length": 606.0, + "delta_ref_entropy_loss": 0.0223388671875, + "delta_ref_ppl": -0.107421875, + "entropy_loss": -0.26953125, + "epoch": 0.8604, + "grad_norm": 1.3932718775147293, + "k1_kl": 0.10693359375, + "k3_kl": 0.07421875, + "kimi_kl": 0.177734375, + "learning_rate": 6.98e-08, + "loss": 0.003, + "ppl": 0.1298828125, + "reward": 0.8346736431121826, + "reward_std": 0.002756737871095538, + "rewards/perpo_ocr_edit_distance_reward": 0.8346736431121826, + "step": 4302, + "temperature": 0.9 + }, + { + "advantages": -2.205371856689453e-06, + "completion_length": 53.0, + "delta_ref_entropy_loss": 0.1328125, + "delta_ref_ppl": -0.369140625, + "entropy_loss": -0.27734375, + "epoch": 0.8606, + "grad_norm": 9.605251517916722, + "k1_kl": 0.369140625, + "k3_kl": 0.259765625, + "kimi_kl": 0.67578125, + "learning_rate": 6.97e-08, + "loss": 0.0103, + "ppl": 0.11083984375, + "reward": 0.955439031124115, + "reward_std": 0.015377621166408062, + "rewards/perpo_ocr_edit_distance_reward": 0.9554390907287598, + "step": 4303, + "temperature": 0.9 + }, + { + "advantages": 2.7588437205849914e-06, + "completion_length": 385.0, + "delta_ref_entropy_loss": 0.0361328125, + "delta_ref_ppl": -0.146484375, + "entropy_loss": -0.1845703125, + "epoch": 0.8608, + "grad_norm": 1.420023568295897, + "k1_kl": 0.146484375, + "k3_kl": 0.1279296875, + "kimi_kl": 0.375, + "learning_rate": 6.959999999999999e-08, + "loss": 0.0051, + "ppl": 0.08349609375, + "reward": 0.9375723600387573, + "reward_std": 0.0029998549725860357, + "rewards/perpo_ocr_edit_distance_reward": 0.9375723600387573, + "step": 4304, + "temperature": 0.9 + }, + { + "advantages": 3.4059798537100505e-08, + "completion_length": 701.0, + "delta_ref_entropy_loss": -0.0208740234375, + "delta_ref_ppl": -0.08056640625, + "entropy_loss": -0.162109375, + "epoch": 0.861, + "grad_norm": 1.3962154315148585, + "k1_kl": 0.0810546875, + "k3_kl": 0.06005859375, + "kimi_kl": 0.193359375, + "learning_rate": 6.950000000000001e-08, + "loss": 0.0024, + "ppl": 0.06884765625, + "reward": 0.8296781778335571, + "reward_std": 0.05318918079137802, + "rewards/perpo_ocr_edit_distance_reward": 0.8296781778335571, + "step": 4305, + "temperature": 0.9 + }, + { + "advantages": -1.1920928955078125e-07, + "completion_length": 371.0, + "delta_ref_entropy_loss": -0.53125, + "delta_ref_ppl": -0.38671875, + "entropy_loss": -1.1640625, + "epoch": 0.8612, + "grad_norm": 14.423529093554196, + "k1_kl": 0.384765625, + "k3_kl": 0.447265625, + "kimi_kl": 1.4296875, + "learning_rate": 6.94e-08, + "loss": 0.0179, + "ppl": 0.52734375, + "reward": 0.4030507504940033, + "reward_std": 0.15870997309684753, + "rewards/perpo_ocr_edit_distance_reward": 0.4030507802963257, + "step": 4306, + "temperature": 0.9 + }, + { + "advantages": -6.0626439335464966e-06, + "completion_length": 601.0, + "delta_ref_entropy_loss": -0.0189208984375, + "delta_ref_ppl": -0.08740234375, + "entropy_loss": -0.158203125, + "epoch": 0.8614, + "grad_norm": 0.7885027312300573, + "k1_kl": 0.08740234375, + "k3_kl": 0.064453125, + "kimi_kl": 0.1923828125, + "learning_rate": 6.929999999999999e-08, + "loss": 0.0026, + "ppl": 0.0390625, + "reward": 0.9728119373321533, + "reward_std": 0.0013094334863126278, + "rewards/perpo_ocr_edit_distance_reward": 0.9728119969367981, + "step": 4307, + "temperature": 0.9 + }, + { + "advantages": -5.534717502087005e-07, + "completion_length": 1173.0, + "delta_ref_entropy_loss": -0.01171875, + "delta_ref_ppl": -0.076171875, + "entropy_loss": -0.203125, + "epoch": 0.8616, + "grad_norm": 2.339011243033488, + "k1_kl": 0.076171875, + "k3_kl": 0.05712890625, + "kimi_kl": 0.140625, + "learning_rate": 6.92e-08, + "loss": 0.0023, + "ppl": 0.09033203125, + "reward": 0.8433078527450562, + "reward_std": 0.04606698080897331, + "rewards/perpo_ocr_edit_distance_reward": 0.8433079123497009, + "step": 4308, + "temperature": 0.9 + }, + { + "advantages": 3.4059798537100505e-08, + "completion_length": 354.0, + "delta_ref_entropy_loss": 0.03759765625, + "delta_ref_ppl": -0.08251953125, + "entropy_loss": -0.06005859375, + "epoch": 0.8618, + "grad_norm": 0.3930780841709069, + "k1_kl": 0.08203125, + "k3_kl": 0.059814453125, + "kimi_kl": 0.197265625, + "learning_rate": 6.909999999999999e-08, + "loss": 0.0024, + "ppl": 0.0206298828125, + "reward": 0.9979007244110107, + "reward_std": 0.0006610663840547204, + "rewards/perpo_ocr_edit_distance_reward": 0.9979007244110107, + "step": 4309, + "temperature": 0.9 + }, + { + "advantages": -0.00013066190876998007, + "completion_length": 238.0, + "delta_ref_entropy_loss": 0.037109375, + "delta_ref_ppl": -0.078125, + "entropy_loss": -0.03759765625, + "epoch": 0.862, + "grad_norm": 1.9827761175444105, + "k1_kl": 0.07763671875, + "k3_kl": 0.053955078125, + "kimi_kl": 0.1806640625, + "learning_rate": 6.900000000000001e-08, + "loss": 0.0023, + "ppl": 0.01531982421875, + "reward": 0.9940904974937439, + "reward_std": 0.0008775847963988781, + "rewards/perpo_ocr_edit_distance_reward": 0.9940905570983887, + "step": 4310, + "temperature": 0.9 + }, + { + "advantages": -7.935933354019653e-06, + "completion_length": 637.0, + "delta_ref_entropy_loss": 0.043212890625, + "delta_ref_ppl": -0.061279296875, + "entropy_loss": -0.076171875, + "epoch": 0.8622, + "grad_norm": 0.6393644398299653, + "k1_kl": 0.061279296875, + "k3_kl": 0.0341796875, + "kimi_kl": 0.091796875, + "learning_rate": 6.89e-08, + "loss": 0.0014, + "ppl": 0.02734375, + "reward": 0.9933849573135376, + "reward_std": 0.0009764157002791762, + "rewards/perpo_ocr_edit_distance_reward": 0.9933850765228271, + "step": 4311, + "temperature": 0.9 + }, + { + "advantages": 1.4952251149225049e-05, + "completion_length": 238.0, + "delta_ref_entropy_loss": 0.022705078125, + "delta_ref_ppl": -0.177734375, + "entropy_loss": -0.1181640625, + "epoch": 0.8624, + "grad_norm": 0.9914970592668556, + "k1_kl": 0.177734375, + "k3_kl": 0.1357421875, + "kimi_kl": 0.58203125, + "learning_rate": 6.88e-08, + "loss": 0.0054, + "ppl": 0.052490234375, + "reward": 0.9889000654220581, + "reward_std": 0.0021798850502818823, + "rewards/perpo_ocr_edit_distance_reward": 0.9889000654220581, + "step": 4312, + "temperature": 0.9 + }, + { + "advantages": -6.811959707420101e-08, + "completion_length": 1449.0, + "delta_ref_entropy_loss": -0.1572265625, + "delta_ref_ppl": -0.032470703125, + "entropy_loss": -0.4609375, + "epoch": 0.8626, + "grad_norm": 8.460520701273923, + "k1_kl": 0.0322265625, + "k3_kl": 0.059814453125, + "kimi_kl": 0.08984375, + "learning_rate": 6.87e-08, + "loss": 0.0024, + "ppl": 0.208984375, + "reward": 0.274993896484375, + "reward_std": 0.08935834467411041, + "rewards/perpo_ocr_edit_distance_reward": 0.274993896484375, + "step": 4313, + "temperature": 0.9 + }, + { + "advantages": -1.9822802642011084e-05, + "completion_length": 398.0, + "delta_ref_entropy_loss": 0.0216064453125, + "delta_ref_ppl": -0.0615234375, + "entropy_loss": -0.060546875, + "epoch": 0.8628, + "grad_norm": 1.2300396477070676, + "k1_kl": 0.0615234375, + "k3_kl": 0.0546875, + "kimi_kl": 0.1396484375, + "learning_rate": 6.859999999999999e-08, + "loss": 0.0022, + "ppl": 0.0247802734375, + "reward": 0.9945741295814514, + "reward_std": 0.002048957161605358, + "rewards/perpo_ocr_edit_distance_reward": 0.9945741891860962, + "step": 4314, + "temperature": 0.9 + }, + { + "advantages": 9.323869562649634e-06, + "completion_length": 404.0, + "delta_ref_entropy_loss": 0.0174560546875, + "delta_ref_ppl": -0.109375, + "entropy_loss": -0.1279296875, + "epoch": 0.863, + "grad_norm": 0.8430328182297071, + "k1_kl": 0.109375, + "k3_kl": 0.09326171875, + "kimi_kl": 0.349609375, + "learning_rate": 6.85e-08, + "loss": 0.0037, + "ppl": 0.052001953125, + "reward": 0.96882563829422, + "reward_std": 0.0026376349851489067, + "rewards/perpo_ocr_edit_distance_reward": 0.96882563829422, + "step": 4315, + "temperature": 0.9 + }, + { + "advantages": -3.288473453721963e-05, + "completion_length": 816.0, + "delta_ref_entropy_loss": 0.0185546875, + "delta_ref_ppl": -0.04736328125, + "entropy_loss": -0.061767578125, + "epoch": 0.8632, + "grad_norm": 1.0466500189189907, + "k1_kl": 0.04736328125, + "k3_kl": 0.03173828125, + "kimi_kl": 0.0810546875, + "learning_rate": 6.84e-08, + "loss": 0.0013, + "ppl": 0.0216064453125, + "reward": 0.852804958820343, + "reward_std": 0.0014549196930602193, + "rewards/perpo_ocr_edit_distance_reward": 0.8528050184249878, + "step": 4316, + "temperature": 0.9 + }, + { + "advantages": 8.514949634275126e-09, + "completion_length": 786.0, + "delta_ref_entropy_loss": 0.0277099609375, + "delta_ref_ppl": -0.05029296875, + "entropy_loss": -0.0458984375, + "epoch": 0.8634, + "grad_norm": 0.43893034067872366, + "k1_kl": 0.050048828125, + "k3_kl": 0.02734375, + "kimi_kl": 0.068359375, + "learning_rate": 6.83e-08, + "loss": 0.0011, + "ppl": 0.01324462890625, + "reward": 0.9837418794631958, + "reward_std": 0.0009324895800091326, + "rewards/perpo_ocr_edit_distance_reward": 0.9837419390678406, + "step": 4317, + "temperature": 0.9 + }, + { + "advantages": -4.8245703510474414e-05, + "completion_length": 1001.0, + "delta_ref_entropy_loss": 0.0654296875, + "delta_ref_ppl": -0.09716796875, + "entropy_loss": -0.0966796875, + "epoch": 0.8636, + "grad_norm": 0.7694908530567816, + "k1_kl": 0.09765625, + "k3_kl": 0.05712890625, + "kimi_kl": 0.130859375, + "learning_rate": 6.819999999999999e-08, + "loss": 0.0023, + "ppl": 0.044921875, + "reward": 0.5245658755302429, + "reward_std": 0.0011351192370057106, + "rewards/perpo_ocr_edit_distance_reward": 0.5245659351348877, + "step": 4318, + "temperature": 0.9 + }, + { + "advantages": -2.690724159037927e-06, + "completion_length": 755.0, + "delta_ref_entropy_loss": 0.0181884765625, + "delta_ref_ppl": -0.07275390625, + "entropy_loss": -0.10888671875, + "epoch": 0.8638, + "grad_norm": 0.5136863316487448, + "k1_kl": 0.07275390625, + "k3_kl": 0.04541015625, + "kimi_kl": 0.1171875, + "learning_rate": 6.81e-08, + "loss": 0.0018, + "ppl": 0.041015625, + "reward": 0.9514861106872559, + "reward_std": 0.022223452106118202, + "rewards/perpo_ocr_edit_distance_reward": 0.9514862298965454, + "step": 4319, + "temperature": 0.9 + }, + { + "advantages": -1.825605249905493e-05, + "completion_length": 421.0, + "delta_ref_entropy_loss": 0.0245361328125, + "delta_ref_ppl": -0.0751953125, + "entropy_loss": -0.04248046875, + "epoch": 0.864, + "grad_norm": 0.42159001122295536, + "k1_kl": 0.0751953125, + "k3_kl": 0.053466796875, + "kimi_kl": 0.1865234375, + "learning_rate": 6.8e-08, + "loss": 0.0022, + "ppl": 0.0167236328125, + "reward": 0.9975932836532593, + "reward_std": 0.0008329861448146403, + "rewards/perpo_ocr_edit_distance_reward": 0.9975934028625488, + "step": 4320, + "temperature": 0.9 + }, + { + "advantages": -0.00019563097157515585, + "completion_length": 1207.0, + "delta_ref_entropy_loss": 0.04150390625, + "delta_ref_ppl": -0.044677734375, + "entropy_loss": -0.050537109375, + "epoch": 0.8642, + "grad_norm": 4.5037895557135315, + "k1_kl": 0.044677734375, + "k3_kl": 0.0238037109375, + "kimi_kl": 0.052001953125, + "learning_rate": 6.789999999999999e-08, + "loss": 0.0011, + "ppl": 0.0172119140625, + "reward": 0.9896078109741211, + "reward_std": 0.00033524216269142926, + "rewards/perpo_ocr_edit_distance_reward": 0.9896078705787659, + "step": 4321, + "temperature": 0.9 + }, + { + "advantages": -1.3964516938358429e-06, + "completion_length": 637.0, + "delta_ref_entropy_loss": 0.0020904541015625, + "delta_ref_ppl": -0.09130859375, + "entropy_loss": -0.21484375, + "epoch": 0.8644, + "grad_norm": 1.324895126157083, + "k1_kl": 0.0908203125, + "k3_kl": 0.06787109375, + "kimi_kl": 0.177734375, + "learning_rate": 6.78e-08, + "loss": 0.0027, + "ppl": 0.08349609375, + "reward": 0.7903836965560913, + "reward_std": 0.0059824069030582905, + "rewards/perpo_ocr_edit_distance_reward": 0.7903836965560913, + "step": 4322, + "temperature": 0.9 + }, + { + "advantages": 1.4901162330716033e-06, + "completion_length": 521.0, + "delta_ref_entropy_loss": -0.037841796875, + "delta_ref_ppl": -0.07421875, + "entropy_loss": -0.1845703125, + "epoch": 0.8646, + "grad_norm": 1.056237569590496, + "k1_kl": 0.07373046875, + "k3_kl": 0.060546875, + "kimi_kl": 0.1630859375, + "learning_rate": 6.769999999999999e-08, + "loss": 0.0024, + "ppl": 0.052734375, + "reward": 0.9609464406967163, + "reward_std": 0.005594833754003048, + "rewards/perpo_ocr_edit_distance_reward": 0.9609464406967163, + "step": 4323, + "temperature": 0.9 + }, + { + "advantages": -1.4296600056695752e-05, + "completion_length": 305.0, + "delta_ref_entropy_loss": 0.029296875, + "delta_ref_ppl": -0.11572265625, + "entropy_loss": -0.0615234375, + "epoch": 0.8648, + "grad_norm": 0.525857359307796, + "k1_kl": 0.11572265625, + "k3_kl": 0.08984375, + "kimi_kl": 0.353515625, + "learning_rate": 6.76e-08, + "loss": 0.0036, + "ppl": 0.01611328125, + "reward": 0.9958248138427734, + "reward_std": 0.001090141711756587, + "rewards/perpo_ocr_edit_distance_reward": 0.9958248734474182, + "step": 4324, + "temperature": 0.9 + }, + { + "advantages": 4.647672540158965e-05, + "completion_length": 573.0, + "delta_ref_entropy_loss": 0.00445556640625, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.07568359375, + "epoch": 0.865, + "grad_norm": 0.5950180440203096, + "k1_kl": 0.07080078125, + "k3_kl": 0.052734375, + "kimi_kl": 0.19921875, + "learning_rate": 6.75e-08, + "loss": 0.0021, + "ppl": 0.025390625, + "reward": 0.8595093488693237, + "reward_std": 0.0004495792672969401, + "rewards/perpo_ocr_edit_distance_reward": 0.8595093488693237, + "step": 4325, + "temperature": 0.9 + }, + { + "advantages": -1.78303052962292e-05, + "completion_length": 540.0, + "delta_ref_entropy_loss": -0.032958984375, + "delta_ref_ppl": -0.0908203125, + "entropy_loss": -0.416015625, + "epoch": 0.8652, + "grad_norm": 2.865365393828909, + "k1_kl": 0.0908203125, + "k3_kl": 0.07080078125, + "kimi_kl": 0.1552734375, + "learning_rate": 6.74e-08, + "loss": 0.0028, + "ppl": 0.2080078125, + "reward": 0.8787819743156433, + "reward_std": 0.001808652887120843, + "rewards/perpo_ocr_edit_distance_reward": 0.8787820339202881, + "step": 4326, + "temperature": 0.9 + }, + { + "advantages": -4.3102674680994824e-05, + "completion_length": 533.0, + "delta_ref_entropy_loss": 0.033203125, + "delta_ref_ppl": -0.06494140625, + "entropy_loss": -0.059326171875, + "epoch": 0.8654, + "grad_norm": 0.5593216460974283, + "k1_kl": 0.06494140625, + "k3_kl": 0.041748046875, + "kimi_kl": 0.12890625, + "learning_rate": 6.73e-08, + "loss": 0.0017, + "ppl": 0.0198974609375, + "reward": 0.9974699020385742, + "reward_std": 0.0004929096321575344, + "rewards/perpo_ocr_edit_distance_reward": 0.997469961643219, + "step": 4327, + "temperature": 0.9 + }, + { + "advantages": -3.4059798053931445e-05, + "completion_length": 444.0, + "delta_ref_entropy_loss": 0.0093994140625, + "delta_ref_ppl": -0.0634765625, + "entropy_loss": -0.06298828125, + "epoch": 0.8656, + "grad_norm": 1.0012993135103805, + "k1_kl": 0.0634765625, + "k3_kl": 0.043701171875, + "kimi_kl": 0.1455078125, + "learning_rate": 6.719999999999999e-08, + "loss": 0.0018, + "ppl": 0.023681640625, + "reward": 0.992315411567688, + "reward_std": 0.002399927005171776, + "rewards/perpo_ocr_edit_distance_reward": 0.9923154711723328, + "step": 4328, + "temperature": 0.9 + }, + { + "advantages": -7.767377246636897e-05, + "completion_length": 690.0, + "delta_ref_entropy_loss": 0.02978515625, + "delta_ref_ppl": -0.06494140625, + "entropy_loss": -0.0546875, + "epoch": 0.8658, + "grad_norm": 0.48503242514795236, + "k1_kl": 0.06494140625, + "k3_kl": 0.042724609375, + "kimi_kl": 0.1416015625, + "learning_rate": 6.71e-08, + "loss": 0.0018, + "ppl": 0.0166015625, + "reward": 0.9985530376434326, + "reward_std": 0.0006673668976873159, + "rewards/perpo_ocr_edit_distance_reward": 0.9985531568527222, + "step": 4329, + "temperature": 0.9 + }, + { + "advantages": -9.877342108666198e-07, + "completion_length": 17.0, + "delta_ref_entropy_loss": -0.47265625, + "delta_ref_ppl": -2.125, + "entropy_loss": -1.0703125, + "epoch": 0.866, + "grad_norm": 12.676547156970525, + "k1_kl": 2.140625, + "k3_kl": 1.8515625, + "kimi_kl": 8.5, + "learning_rate": 6.7e-08, + "loss": 0.0741, + "ppl": 0.298828125, + "reward": 0.2857142686843872, + "reward_std": 0.008569274097681046, + "rewards/perpo_ocr_edit_distance_reward": 0.2857142984867096, + "step": 4330, + "temperature": 0.9 + }, + { + "advantages": -9.076936294150073e-06, + "completion_length": 294.0, + "delta_ref_entropy_loss": 0.0035552978515625, + "delta_ref_ppl": -0.1611328125, + "entropy_loss": -0.115234375, + "epoch": 0.8662, + "grad_norm": 2.077100824306949, + "k1_kl": 0.1611328125, + "k3_kl": 0.1328125, + "kimi_kl": 0.58984375, + "learning_rate": 6.69e-08, + "loss": 0.0053, + "ppl": 0.04345703125, + "reward": 0.9946228861808777, + "reward_std": 0.005522353574633598, + "rewards/perpo_ocr_edit_distance_reward": 0.9946229457855225, + "step": 4331, + "temperature": 0.9 + }, + { + "advantages": -2.367155957472278e-06, + "completion_length": 482.0, + "delta_ref_entropy_loss": 0.0034942626953125, + "delta_ref_ppl": -0.0791015625, + "entropy_loss": -0.11962890625, + "epoch": 0.8664, + "grad_norm": 1.2455129722359535, + "k1_kl": 0.0791015625, + "k3_kl": 0.068359375, + "kimi_kl": 0.224609375, + "learning_rate": 6.679999999999999e-08, + "loss": 0.0027, + "ppl": 0.045654296875, + "reward": 0.9664892554283142, + "reward_std": 0.02160031348466873, + "rewards/perpo_ocr_edit_distance_reward": 0.9664893746376038, + "step": 4332, + "temperature": 0.9 + }, + { + "advantages": -1.1784690286731347e-05, + "completion_length": 1005.0, + "delta_ref_entropy_loss": 0.01007080078125, + "delta_ref_ppl": -0.0286865234375, + "entropy_loss": -0.05859375, + "epoch": 0.8666, + "grad_norm": 0.5374482430780867, + "k1_kl": 0.028564453125, + "k3_kl": 0.020263671875, + "kimi_kl": 0.048583984375, + "learning_rate": 6.67e-08, + "loss": 0.0008, + "ppl": 0.019775390625, + "reward": 0.995948076248169, + "reward_std": 0.0006214366876520216, + "rewards/perpo_ocr_edit_distance_reward": 0.995948076248169, + "step": 4333, + "temperature": 0.9 + }, + { + "advantages": -0.00011611836816882715, + "completion_length": 497.0, + "delta_ref_entropy_loss": 0.052978515625, + "delta_ref_ppl": -0.0986328125, + "entropy_loss": -0.0546875, + "epoch": 0.8668, + "grad_norm": 0.5202962943997804, + "k1_kl": 0.0986328125, + "k3_kl": 0.06298828125, + "kimi_kl": 0.1865234375, + "learning_rate": 6.66e-08, + "loss": 0.0026, + "ppl": 0.017822265625, + "reward": 0.9931873083114624, + "reward_std": 0.0004866338276769966, + "rewards/perpo_ocr_edit_distance_reward": 0.993187427520752, + "step": 4334, + "temperature": 0.9 + }, + { + "advantages": -1.5744142729090527e-05, + "completion_length": 151.0, + "delta_ref_entropy_loss": -0.037841796875, + "delta_ref_ppl": -0.287109375, + "entropy_loss": -0.232421875, + "epoch": 0.867, + "grad_norm": 2.352325736735366, + "k1_kl": 0.287109375, + "k3_kl": 0.2265625, + "kimi_kl": 0.8125, + "learning_rate": 6.65e-08, + "loss": 0.0091, + "ppl": 0.06640625, + "reward": 0.5422708988189697, + "reward_std": 0.00260291644372046, + "rewards/perpo_ocr_edit_distance_reward": 0.5422709584236145, + "step": 4335, + "temperature": 0.9 + }, + { + "advantages": -3.0457975299214013e-05, + "completion_length": 1011.0, + "delta_ref_entropy_loss": 0.00421142578125, + "delta_ref_ppl": -0.044189453125, + "entropy_loss": -0.052001953125, + "epoch": 0.8672, + "grad_norm": 0.4259733058978632, + "k1_kl": 0.044189453125, + "k3_kl": 0.033203125, + "kimi_kl": 0.09326171875, + "learning_rate": 6.64e-08, + "loss": 0.0014, + "ppl": 0.020751953125, + "reward": 0.97332364320755, + "reward_std": 0.001018298091366887, + "rewards/perpo_ocr_edit_distance_reward": 0.9733237028121948, + "step": 4336, + "temperature": 0.9 + }, + { + "advantages": -5.534717502087005e-07, + "completion_length": 193.0, + "delta_ref_entropy_loss": -0.890625, + "delta_ref_ppl": -0.296875, + "entropy_loss": -1.625, + "epoch": 0.8674, + "grad_norm": 7.287206059957136, + "k1_kl": 0.296875, + "k3_kl": 0.373046875, + "kimi_kl": 1.65625, + "learning_rate": 6.629999999999999e-08, + "loss": 0.015, + "ppl": 0.66015625, + "reward": 0.21469233930110931, + "reward_std": 0.03454893082380295, + "rewards/perpo_ocr_edit_distance_reward": 0.2146923691034317, + "step": 4337, + "temperature": 0.9 + }, + { + "advantages": -2.6566642645775573e-06, + "completion_length": 1025.0, + "delta_ref_entropy_loss": 0.0089111328125, + "delta_ref_ppl": -0.024169921875, + "entropy_loss": -0.03515625, + "epoch": 0.8676, + "grad_norm": 0.3923308712799176, + "k1_kl": 0.024169921875, + "k3_kl": 0.0157470703125, + "kimi_kl": 0.037109375, + "learning_rate": 6.62e-08, + "loss": 0.0006, + "ppl": 0.01055908203125, + "reward": 0.9942586421966553, + "reward_std": 0.0030908524058759212, + "rewards/perpo_ocr_edit_distance_reward": 0.9942586421966553, + "step": 4338, + "temperature": 0.9 + }, + { + "advantages": -2.4471964934491552e-05, + "completion_length": 649.0, + "delta_ref_entropy_loss": 0.0322265625, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.0673828125, + "epoch": 0.8678, + "grad_norm": 0.4994572291968325, + "k1_kl": 0.06689453125, + "k3_kl": 0.044189453125, + "kimi_kl": 0.08837890625, + "learning_rate": 6.61e-08, + "loss": 0.0018, + "ppl": 0.0211181640625, + "reward": 0.9902656674385071, + "reward_std": 0.0005961096030659974, + "rewards/perpo_ocr_edit_distance_reward": 0.9902657270431519, + "step": 4339, + "temperature": 0.9 + }, + { + "advantages": -6.194625893840566e-05, + "completion_length": 468.0, + "delta_ref_entropy_loss": 0.0185546875, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.0703125, + "epoch": 0.868, + "grad_norm": 0.5026462574029538, + "k1_kl": 0.06591796875, + "k3_kl": 0.04296875, + "kimi_kl": 0.1845703125, + "learning_rate": 6.6e-08, + "loss": 0.0018, + "ppl": 0.0218505859375, + "reward": 0.7693133354187012, + "reward_std": 0.000999881885945797, + "rewards/perpo_ocr_edit_distance_reward": 0.769313395023346, + "step": 4340, + "temperature": 0.9 + }, + { + "advantages": -0.00010065521928481758, + "completion_length": 580.0, + "delta_ref_entropy_loss": 0.040283203125, + "delta_ref_ppl": -0.05517578125, + "entropy_loss": -0.040771484375, + "epoch": 0.8682, + "grad_norm": 0.4397995478508127, + "k1_kl": 0.055419921875, + "k3_kl": 0.03271484375, + "kimi_kl": 0.087890625, + "learning_rate": 6.59e-08, + "loss": 0.0014, + "ppl": 0.00836181640625, + "reward": 0.9962668418884277, + "reward_std": 0.0006612829747609794, + "rewards/perpo_ocr_edit_distance_reward": 0.9962669610977173, + "step": 4341, + "temperature": 0.9 + }, + { + "advantages": -4.300049567973474e-06, + "completion_length": 85.0, + "delta_ref_entropy_loss": -0.020263671875, + "delta_ref_ppl": -0.408203125, + "entropy_loss": -0.1220703125, + "epoch": 0.8684, + "grad_norm": 4.045959200981725, + "k1_kl": 0.408203125, + "k3_kl": 0.359375, + "kimi_kl": 2.421875, + "learning_rate": 6.58e-08, + "loss": 0.0143, + "ppl": 0.04345703125, + "reward": 0.956009030342102, + "reward_std": 0.0038570903707295656, + "rewards/perpo_ocr_edit_distance_reward": 0.9560090899467468, + "step": 4342, + "temperature": 0.9 + }, + { + "advantages": -4.6610835852334276e-05, + "completion_length": 217.0, + "delta_ref_entropy_loss": 0.0286865234375, + "delta_ref_ppl": -0.078125, + "entropy_loss": -0.078125, + "epoch": 0.8686, + "grad_norm": 0.4644461939219648, + "k1_kl": 0.07763671875, + "k3_kl": 0.052978515625, + "kimi_kl": 0.181640625, + "learning_rate": 6.569999999999999e-08, + "loss": 0.0022, + "ppl": 0.02392578125, + "reward": 0.9890785813331604, + "reward_std": 0.0009963946649804711, + "rewards/perpo_ocr_edit_distance_reward": 0.9890786409378052, + "step": 4343, + "temperature": 0.9 + }, + { + "advantages": -1.8221991922473535e-05, + "completion_length": 564.0, + "delta_ref_entropy_loss": 0.000278472900390625, + "delta_ref_ppl": -0.1376953125, + "entropy_loss": -0.396484375, + "epoch": 0.8688, + "grad_norm": 2.337397431975542, + "k1_kl": 0.1376953125, + "k3_kl": 0.10009765625, + "kimi_kl": 0.265625, + "learning_rate": 6.56e-08, + "loss": 0.004, + "ppl": 0.189453125, + "reward": 0.7956273555755615, + "reward_std": 0.0022362309973686934, + "rewards/perpo_ocr_edit_distance_reward": 0.7956273555755615, + "step": 4344, + "temperature": 0.9 + }, + { + "advantages": -3.3889500627992675e-06, + "completion_length": 984.0, + "delta_ref_entropy_loss": 0.04052734375, + "delta_ref_ppl": -0.059814453125, + "entropy_loss": -0.08740234375, + "epoch": 0.869, + "grad_norm": 0.9198688067728789, + "k1_kl": 0.059814453125, + "k3_kl": 0.0380859375, + "kimi_kl": 0.09619140625, + "learning_rate": 6.55e-08, + "loss": 0.0015, + "ppl": 0.04052734375, + "reward": 0.9476767182350159, + "reward_std": 0.02003384567797184, + "rewards/perpo_ocr_edit_distance_reward": 0.9476768374443054, + "step": 4345, + "temperature": 0.9 + }, + { + "advantages": -0.00011375972826499492, + "completion_length": 408.0, + "delta_ref_entropy_loss": 0.047607421875, + "delta_ref_ppl": -0.1025390625, + "entropy_loss": -0.05908203125, + "epoch": 0.8692, + "grad_norm": 0.4073815379650872, + "k1_kl": 0.10205078125, + "k3_kl": 0.08154296875, + "kimi_kl": 0.283203125, + "learning_rate": 6.54e-08, + "loss": 0.0034, + "ppl": 0.0230712890625, + "reward": 0.9925931096076965, + "reward_std": 0.0009477492421865463, + "rewards/perpo_ocr_edit_distance_reward": 0.9925932884216309, + "step": 4346, + "temperature": 0.9 + }, + { + "advantages": -1.4969281437515747e-05, + "completion_length": 408.0, + "delta_ref_entropy_loss": 0.0203857421875, + "delta_ref_ppl": -0.11328125, + "entropy_loss": -0.35546875, + "epoch": 0.8694, + "grad_norm": 2.16000798029697, + "k1_kl": 0.11376953125, + "k3_kl": 0.0771484375, + "kimi_kl": 0.197265625, + "learning_rate": 6.53e-08, + "loss": 0.0031, + "ppl": 0.166015625, + "reward": 0.9392658472061157, + "reward_std": 0.004455785267055035, + "rewards/perpo_ocr_edit_distance_reward": 0.9392659664154053, + "step": 4347, + "temperature": 0.9 + }, + { + "advantages": -7.625988655490801e-05, + "completion_length": 555.0, + "delta_ref_entropy_loss": 0.0296630859375, + "delta_ref_ppl": -0.042236328125, + "entropy_loss": -0.033935546875, + "epoch": 0.8696, + "grad_norm": 0.26785857009940095, + "k1_kl": 0.04248046875, + "k3_kl": 0.0252685546875, + "kimi_kl": 0.072265625, + "learning_rate": 6.519999999999999e-08, + "loss": 0.0011, + "ppl": 0.00848388671875, + "reward": 0.9996005892753601, + "reward_std": 0.0006818290567025542, + "rewards/perpo_ocr_edit_distance_reward": 0.9996006488800049, + "step": 4348, + "temperature": 0.9 + }, + { + "advantages": -1.532690987460228e-07, + "completion_length": 902.0, + "delta_ref_entropy_loss": -0.3671875, + "delta_ref_ppl": -0.0281982421875, + "entropy_loss": -0.828125, + "epoch": 0.8698, + "grad_norm": 17.708544928533573, + "k1_kl": 0.0284423828125, + "k3_kl": 0.0654296875, + "kimi_kl": 0.0888671875, + "learning_rate": 6.510000000000001e-08, + "loss": 0.0026, + "ppl": 0.357421875, + "reward": 0.5791628360748291, + "reward_std": 0.07716904580593109, + "rewards/perpo_ocr_edit_distance_reward": 0.5791628956794739, + "step": 4349, + "temperature": 0.9 + }, + { + "advantages": -3.644398475444177e-06, + "completion_length": 452.0, + "delta_ref_entropy_loss": -0.1455078125, + "delta_ref_ppl": -0.10546875, + "entropy_loss": -0.482421875, + "epoch": 0.87, + "grad_norm": 2.393310870109613, + "k1_kl": 0.10595703125, + "k3_kl": 0.09814453125, + "kimi_kl": 0.1953125, + "learning_rate": 6.5e-08, + "loss": 0.0039, + "ppl": 0.2109375, + "reward": 0.9118666052818298, + "reward_std": 0.01151726208627224, + "rewards/perpo_ocr_edit_distance_reward": 0.9118666648864746, + "step": 4350, + "temperature": 0.9 + }, + { + "advantages": -1.8741404346656054e-05, + "completion_length": 581.0, + "delta_ref_entropy_loss": 0.0272216796875, + "delta_ref_ppl": -0.07861328125, + "entropy_loss": -0.05615234375, + "epoch": 0.8702, + "grad_norm": 0.6613821239814762, + "k1_kl": 0.07861328125, + "k3_kl": 0.0546875, + "kimi_kl": 0.2109375, + "learning_rate": 6.489999999999999e-08, + "loss": 0.0022, + "ppl": 0.01904296875, + "reward": 0.9900171756744385, + "reward_std": 0.0012619885383173823, + "rewards/perpo_ocr_edit_distance_reward": 0.9900172352790833, + "step": 4351, + "temperature": 0.9 + }, + { + "advantages": 1.5224730304908007e-05, + "completion_length": 767.0, + "delta_ref_entropy_loss": 0.03955078125, + "delta_ref_ppl": -0.06884765625, + "entropy_loss": -0.064453125, + "epoch": 0.8704, + "grad_norm": 0.8015801980449683, + "k1_kl": 0.06884765625, + "k3_kl": 0.04248046875, + "kimi_kl": 0.10986328125, + "learning_rate": 6.48e-08, + "loss": 0.0017, + "ppl": 0.022705078125, + "reward": 0.9946134090423584, + "reward_std": 0.0010185383725911379, + "rewards/perpo_ocr_edit_distance_reward": 0.9946134090423584, + "step": 4352, + "temperature": 0.9 + }, + { + "advantages": -0.0001521366066299379, + "completion_length": 454.0, + "delta_ref_entropy_loss": 0.0142822265625, + "delta_ref_ppl": -0.0517578125, + "entropy_loss": -0.050537109375, + "epoch": 0.8706, + "grad_norm": 0.32106835350742813, + "k1_kl": 0.0517578125, + "k3_kl": 0.038330078125, + "kimi_kl": 0.1328125, + "learning_rate": 6.469999999999999e-08, + "loss": 0.0017, + "ppl": 0.01214599609375, + "reward": 0.9977845549583435, + "reward_std": 0.0005716980085708201, + "rewards/perpo_ocr_edit_distance_reward": 0.9977846741676331, + "step": 4353, + "temperature": 0.9 + }, + { + "advantages": 4.06929430027958e-05, + "completion_length": 453.0, + "delta_ref_entropy_loss": 0.027587890625, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.041015625, + "epoch": 0.8708, + "grad_norm": 0.642275411209551, + "k1_kl": 0.0673828125, + "k3_kl": 0.04833984375, + "kimi_kl": 0.1611328125, + "learning_rate": 6.46e-08, + "loss": 0.0019, + "ppl": 0.01226806640625, + "reward": 0.993601381778717, + "reward_std": 0.0005277368472889066, + "rewards/perpo_ocr_edit_distance_reward": 0.9936014413833618, + "step": 4354, + "temperature": 0.9 + }, + { + "advantages": -2.0401819710968994e-05, + "completion_length": 521.0, + "delta_ref_entropy_loss": 0.035888671875, + "delta_ref_ppl": -0.0771484375, + "entropy_loss": -0.06494140625, + "epoch": 0.871, + "grad_norm": 0.3953490494591468, + "k1_kl": 0.07666015625, + "k3_kl": 0.049560546875, + "kimi_kl": 0.1640625, + "learning_rate": 6.45e-08, + "loss": 0.002, + "ppl": 0.022216796875, + "reward": 0.9951968789100647, + "reward_std": 0.0007336832350119948, + "rewards/perpo_ocr_edit_distance_reward": 0.9951969385147095, + "step": 4355, + "temperature": 0.9 + }, + { + "advantages": -0.00020701545872725546, + "completion_length": 715.0, + "delta_ref_entropy_loss": 0.029541015625, + "delta_ref_ppl": -0.052490234375, + "entropy_loss": -0.0751953125, + "epoch": 0.8712, + "grad_norm": 0.35547654244920746, + "k1_kl": 0.052734375, + "k3_kl": 0.02880859375, + "kimi_kl": 0.06396484375, + "learning_rate": 6.44e-08, + "loss": 0.0014, + "ppl": 0.0289306640625, + "reward": 0.9901906251907349, + "reward_std": 0.00018793967319652438, + "rewards/perpo_ocr_edit_distance_reward": 0.9901906847953796, + "step": 4356, + "temperature": 0.9 + }, + { + "advantages": -5.27926886206842e-07, + "completion_length": 919.0, + "delta_ref_entropy_loss": 0.01171875, + "delta_ref_ppl": -0.10595703125, + "entropy_loss": -0.2490234375, + "epoch": 0.8714, + "grad_norm": 3.157783348706738, + "k1_kl": 0.1064453125, + "k3_kl": 0.10107421875, + "kimi_kl": 0.248046875, + "learning_rate": 6.429999999999999e-08, + "loss": 0.0041, + "ppl": 0.11767578125, + "reward": 0.6705251336097717, + "reward_std": 0.1321154534816742, + "rewards/perpo_ocr_edit_distance_reward": 0.6705251932144165, + "step": 4357, + "temperature": 0.9 + }, + { + "advantages": -2.8950827982043847e-05, + "completion_length": 353.0, + "delta_ref_entropy_loss": 0.06640625, + "delta_ref_ppl": -0.158203125, + "entropy_loss": -0.1240234375, + "epoch": 0.8716, + "grad_norm": 1.0952622252611914, + "k1_kl": 0.158203125, + "k3_kl": 0.10400390625, + "kimi_kl": 0.35546875, + "learning_rate": 6.419999999999999e-08, + "loss": 0.0042, + "ppl": 0.046630859375, + "reward": 0.9958738684654236, + "reward_std": 0.001076901680789888, + "rewards/perpo_ocr_edit_distance_reward": 0.9958739876747131, + "step": 4358, + "temperature": 0.9 + }, + { + "advantages": -0.00021539416047744453, + "completion_length": 679.0, + "delta_ref_entropy_loss": 0.031005859375, + "delta_ref_ppl": -0.04931640625, + "entropy_loss": -0.038330078125, + "epoch": 0.8718, + "grad_norm": 0.1936029002299332, + "k1_kl": 0.049560546875, + "k3_kl": 0.033203125, + "kimi_kl": 0.1123046875, + "learning_rate": 6.41e-08, + "loss": 0.0015, + "ppl": 0.00933837890625, + "reward": 0.994045615196228, + "reward_std": 0.0002952979994006455, + "rewards/perpo_ocr_edit_distance_reward": 0.9940457344055176, + "step": 4359, + "temperature": 0.9 + }, + { + "advantages": -2.6362284188508056e-05, + "completion_length": 107.0, + "delta_ref_entropy_loss": -0.0693359375, + "delta_ref_ppl": -0.36328125, + "entropy_loss": -0.189453125, + "epoch": 0.872, + "grad_norm": 1.2141518180637263, + "k1_kl": 0.36328125, + "k3_kl": 0.30859375, + "kimi_kl": 1.484375, + "learning_rate": 6.4e-08, + "loss": 0.0123, + "ppl": 0.041015625, + "reward": 0.6613339781761169, + "reward_std": 0.001192309777252376, + "rewards/perpo_ocr_edit_distance_reward": 0.6613339781761169, + "step": 4360, + "temperature": 0.9 + }, + { + "advantages": -2.741813841566909e-05, + "completion_length": 608.0, + "delta_ref_entropy_loss": 0.01611328125, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.06103515625, + "epoch": 0.8722, + "grad_norm": 0.6669553339486574, + "k1_kl": 0.0703125, + "k3_kl": 0.0458984375, + "kimi_kl": 0.134765625, + "learning_rate": 6.39e-08, + "loss": 0.0019, + "ppl": 0.01806640625, + "reward": 0.9930846095085144, + "reward_std": 0.001142418128438294, + "rewards/perpo_ocr_edit_distance_reward": 0.9930846691131592, + "step": 4361, + "temperature": 0.9 + }, + { + "advantages": -7.322856845348724e-07, + "completion_length": 681.0, + "delta_ref_entropy_loss": -0.08935546875, + "delta_ref_ppl": -0.10546875, + "entropy_loss": -0.5, + "epoch": 0.8724, + "grad_norm": 3.202549044304203, + "k1_kl": 0.10595703125, + "k3_kl": 0.08935546875, + "kimi_kl": 0.197265625, + "learning_rate": 6.379999999999999e-08, + "loss": 0.0036, + "ppl": 0.2236328125, + "reward": 0.7353140115737915, + "reward_std": 0.09044705331325531, + "rewards/perpo_ocr_edit_distance_reward": 0.735314130783081, + "step": 4362, + "temperature": 0.9 + }, + { + "advantages": -2.091271562676411e-05, + "completion_length": 1071.0, + "delta_ref_entropy_loss": -0.03955078125, + "delta_ref_ppl": -0.0615234375, + "entropy_loss": -0.373046875, + "epoch": 0.8726, + "grad_norm": 2.4456229493911206, + "k1_kl": 0.061279296875, + "k3_kl": 0.054443359375, + "kimi_kl": 0.1064453125, + "learning_rate": 6.370000000000001e-08, + "loss": 0.0022, + "ppl": 0.1796875, + "reward": 0.9096507430076599, + "reward_std": 0.0027480872813612223, + "rewards/perpo_ocr_edit_distance_reward": 0.9096508026123047, + "step": 4363, + "temperature": 0.9 + }, + { + "advantages": -1.9380026060389355e-05, + "completion_length": 387.0, + "delta_ref_entropy_loss": 0.0208740234375, + "delta_ref_ppl": -0.06298828125, + "entropy_loss": -0.041015625, + "epoch": 0.8728, + "grad_norm": 0.7488244877575551, + "k1_kl": 0.0634765625, + "k3_kl": 0.044921875, + "kimi_kl": 0.1650390625, + "learning_rate": 6.36e-08, + "loss": 0.0018, + "ppl": 0.012451171875, + "reward": 0.9940697550773621, + "reward_std": 0.0007790584932081401, + "rewards/perpo_ocr_edit_distance_reward": 0.9940698146820068, + "step": 4364, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 895.0, + "delta_ref_entropy_loss": 0.00341796875, + "delta_ref_ppl": -0.0294189453125, + "entropy_loss": -0.037353515625, + "epoch": 0.873, + "grad_norm": 0.30210903794742305, + "k1_kl": 0.0294189453125, + "k3_kl": 0.02001953125, + "kimi_kl": 0.053466796875, + "learning_rate": 6.349999999999999e-08, + "loss": 0.0008, + "ppl": 0.01025390625, + "reward": 0.994531512260437, + "reward_std": 0.001394690596498549, + "rewards/perpo_ocr_edit_distance_reward": 0.994531512260437, + "step": 4365, + "temperature": 0.9 + }, + { + "advantages": -2.111707544827368e-05, + "completion_length": 648.0, + "delta_ref_entropy_loss": 0.07470703125, + "delta_ref_ppl": -0.08544921875, + "entropy_loss": -0.10986328125, + "epoch": 0.8732, + "grad_norm": 0.7268294331488295, + "k1_kl": 0.0849609375, + "k3_kl": 0.056640625, + "kimi_kl": 0.13671875, + "learning_rate": 6.34e-08, + "loss": 0.0023, + "ppl": 0.041748046875, + "reward": 0.9933421015739441, + "reward_std": 0.0011096277739852667, + "rewards/perpo_ocr_edit_distance_reward": 0.9933421015739441, + "step": 4366, + "temperature": 0.9 + }, + { + "advantages": -1.9499235349940136e-05, + "completion_length": 180.0, + "delta_ref_entropy_loss": 0.0291748046875, + "delta_ref_ppl": -0.26171875, + "entropy_loss": -0.1513671875, + "epoch": 0.8734, + "grad_norm": 1.304170803723372, + "k1_kl": 0.26171875, + "k3_kl": 0.2080078125, + "kimi_kl": 0.984375, + "learning_rate": 6.329999999999999e-08, + "loss": 0.0084, + "ppl": 0.04248046875, + "reward": 0.7835285067558289, + "reward_std": 0.0016464958898723125, + "rewards/perpo_ocr_edit_distance_reward": 0.7835286259651184, + "step": 4367, + "temperature": 0.9 + }, + { + "advantages": -3.8500344089698046e-05, + "completion_length": 615.0, + "delta_ref_entropy_loss": 0.0311279296875, + "delta_ref_ppl": -0.10498046875, + "entropy_loss": -0.2001953125, + "epoch": 0.8736, + "grad_norm": 1.0939034111411219, + "k1_kl": 0.10498046875, + "k3_kl": 0.0751953125, + "kimi_kl": 0.2353515625, + "learning_rate": 6.32e-08, + "loss": 0.003, + "ppl": 0.07763671875, + "reward": 0.9276390075683594, + "reward_std": 0.0018904795870184898, + "rewards/perpo_ocr_edit_distance_reward": 0.9276390671730042, + "step": 4368, + "temperature": 0.9 + }, + { + "advantages": -1.1903899576282129e-05, + "completion_length": 211.0, + "delta_ref_entropy_loss": 0.0203857421875, + "delta_ref_ppl": -0.2138671875, + "entropy_loss": -0.142578125, + "epoch": 0.8738, + "grad_norm": 2.941878859026945, + "k1_kl": 0.2138671875, + "k3_kl": 0.1689453125, + "kimi_kl": 0.6328125, + "learning_rate": 6.31e-08, + "loss": 0.0068, + "ppl": 0.046142578125, + "reward": 0.9556650519371033, + "reward_std": 0.0077599575743079185, + "rewards/perpo_ocr_edit_distance_reward": 0.955665111541748, + "step": 4369, + "temperature": 0.9 + }, + { + "advantages": -1.7361982827424072e-05, + "completion_length": 252.0, + "delta_ref_entropy_loss": -0.0089111328125, + "delta_ref_ppl": -0.2001953125, + "entropy_loss": -0.2294921875, + "epoch": 0.874, + "grad_norm": 2.1483563887059987, + "k1_kl": 0.201171875, + "k3_kl": 0.154296875, + "kimi_kl": 0.5546875, + "learning_rate": 6.3e-08, + "loss": 0.0062, + "ppl": 0.10009765625, + "reward": 0.89044189453125, + "reward_std": 0.004803014453500509, + "rewards/perpo_ocr_edit_distance_reward": 0.8904420137405396, + "step": 4370, + "temperature": 0.9 + }, + { + "advantages": -7.671969797229394e-06, + "completion_length": 469.0, + "delta_ref_entropy_loss": 0.02734375, + "delta_ref_ppl": -0.11669921875, + "entropy_loss": -0.181640625, + "epoch": 0.8742, + "grad_norm": 1.566275956470356, + "k1_kl": 0.11669921875, + "k3_kl": 0.07958984375, + "kimi_kl": 0.26171875, + "learning_rate": 6.29e-08, + "loss": 0.0032, + "ppl": 0.07568359375, + "reward": 0.9453043937683105, + "reward_std": 0.006557518150657415, + "rewards/perpo_ocr_edit_distance_reward": 0.9453043937683105, + "step": 4371, + "temperature": 0.9 + }, + { + "advantages": -5.670956397807458e-06, + "completion_length": 67.0, + "delta_ref_entropy_loss": 0.0233154296875, + "delta_ref_ppl": -0.59375, + "entropy_loss": -0.212890625, + "epoch": 0.8744, + "grad_norm": 4.869916332461957, + "k1_kl": 0.59375, + "k3_kl": 0.50390625, + "kimi_kl": 2.421875, + "learning_rate": 6.279999999999999e-08, + "loss": 0.0202, + "ppl": 0.087890625, + "reward": 0.9584798812866211, + "reward_std": 0.008929337374866009, + "rewards/perpo_ocr_edit_distance_reward": 0.9584800004959106, + "step": 4372, + "temperature": 0.9 + }, + { + "advantages": -1.9141607481287792e-05, + "completion_length": 863.0, + "delta_ref_entropy_loss": 0.01324462890625, + "delta_ref_ppl": -0.04052734375, + "entropy_loss": -0.04736328125, + "epoch": 0.8746, + "grad_norm": 0.23472668679497077, + "k1_kl": 0.04052734375, + "k3_kl": 0.02197265625, + "kimi_kl": 0.05517578125, + "learning_rate": 6.27e-08, + "loss": 0.0009, + "ppl": 0.01263427734375, + "reward": 0.9976217746734619, + "reward_std": 0.0003450404619798064, + "rewards/perpo_ocr_edit_distance_reward": 0.9976218938827515, + "step": 4373, + "temperature": 0.9 + }, + { + "advantages": -8.9023797045229e-06, + "completion_length": 2048.0, + "delta_ref_entropy_loss": -0.007537841796875, + "delta_ref_ppl": -0.0400390625, + "entropy_loss": -0.11474609375, + "epoch": 0.8748, + "grad_norm": 5.550186791640271, + "k1_kl": 0.039794921875, + "k3_kl": 0.10986328125, + "kimi_kl": 0.1025390625, + "learning_rate": 6.26e-08, + "loss": 0.0044, + "ppl": 0.06103515625, + "reward": 0.7245061993598938, + "reward_std": 0.006577820051461458, + "rewards/perpo_ocr_edit_distance_reward": 0.7245062589645386, + "step": 4374, + "temperature": 0.9 + }, + { + "advantages": -6.982258469179214e-07, + "completion_length": 178.0, + "delta_ref_entropy_loss": -0.8203125, + "delta_ref_ppl": -0.37890625, + "entropy_loss": -1.8203125, + "epoch": 0.875, + "grad_norm": 6.195314535788188, + "k1_kl": 0.380859375, + "k3_kl": 0.48046875, + "kimi_kl": 1.6875, + "learning_rate": 6.25e-08, + "loss": 0.0192, + "ppl": 0.84375, + "reward": 0.19686764478683472, + "reward_std": 0.021421849727630615, + "rewards/perpo_ocr_edit_distance_reward": 0.1968676745891571, + "step": 4375, + "temperature": 0.9 + }, + { + "advantages": -3.4996442082046997e-06, + "completion_length": 1124.0, + "delta_ref_entropy_loss": -0.00139617919921875, + "delta_ref_ppl": -0.07080078125, + "entropy_loss": -0.1865234375, + "epoch": 0.8752, + "grad_norm": 1.3107355621618526, + "k1_kl": 0.07080078125, + "k3_kl": 0.0517578125, + "kimi_kl": 0.1318359375, + "learning_rate": 6.239999999999999e-08, + "loss": 0.0021, + "ppl": 0.08056640625, + "reward": 0.9318056702613831, + "reward_std": 0.026677686721086502, + "rewards/perpo_ocr_edit_distance_reward": 0.9318057894706726, + "step": 4376, + "temperature": 0.9 + }, + { + "advantages": -3.576278913897113e-06, + "completion_length": 945.0, + "delta_ref_entropy_loss": -0.057373046875, + "delta_ref_ppl": -0.049560546875, + "entropy_loss": -0.154296875, + "epoch": 0.8754, + "grad_norm": 0.8324126956650707, + "k1_kl": 0.049560546875, + "k3_kl": 0.039306640625, + "kimi_kl": 0.103515625, + "learning_rate": 6.23e-08, + "loss": 0.0016, + "ppl": 0.03955078125, + "reward": 0.9790914058685303, + "reward_std": 0.009403346106410027, + "rewards/perpo_ocr_edit_distance_reward": 0.9790914058685303, + "step": 4377, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 966.0, + "delta_ref_entropy_loss": 0.01324462890625, + "delta_ref_ppl": -0.0400390625, + "entropy_loss": -0.0458984375, + "epoch": 0.8756, + "grad_norm": 0.4717610403524503, + "k1_kl": 0.0400390625, + "k3_kl": 0.025146484375, + "kimi_kl": 0.06298828125, + "learning_rate": 6.22e-08, + "loss": 0.001, + "ppl": 0.01556396484375, + "reward": 0.9959942102432251, + "reward_std": 0.003945921082049608, + "rewards/perpo_ocr_edit_distance_reward": 0.9959942102432251, + "step": 4378, + "temperature": 0.9 + }, + { + "advantages": -8.514949740856537e-07, + "completion_length": 976.0, + "delta_ref_entropy_loss": -0.09228515625, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.27734375, + "epoch": 0.8758, + "grad_norm": 1.0728128165583728, + "k1_kl": 0.06591796875, + "k3_kl": 0.06982421875, + "kimi_kl": 0.166015625, + "learning_rate": 6.21e-08, + "loss": 0.0028, + "ppl": 0.10400390625, + "reward": 0.8554998636245728, + "reward_std": 0.09098883718252182, + "rewards/perpo_ocr_edit_distance_reward": 0.8554999828338623, + "step": 4379, + "temperature": 0.9 + }, + { + "advantages": -2.0580633645295165e-05, + "completion_length": 106.0, + "delta_ref_entropy_loss": 0.03369140625, + "delta_ref_ppl": -0.29296875, + "entropy_loss": -0.1533203125, + "epoch": 0.876, + "grad_norm": 3.1436677293392097, + "k1_kl": 0.29296875, + "k3_kl": 0.2373046875, + "kimi_kl": 0.92578125, + "learning_rate": 6.2e-08, + "loss": 0.0095, + "ppl": 0.05810546875, + "reward": 0.9275362491607666, + "reward_std": 0.003623189404606819, + "rewards/perpo_ocr_edit_distance_reward": 0.9275363087654114, + "step": 4380, + "temperature": 0.9 + }, + { + "advantages": -1.5071460666149505e-06, + "completion_length": 287.0, + "delta_ref_entropy_loss": -0.361328125, + "delta_ref_ppl": -0.1279296875, + "entropy_loss": -0.76171875, + "epoch": 0.8762, + "grad_norm": 2.5699038462708645, + "k1_kl": 0.1279296875, + "k3_kl": 0.1748046875, + "kimi_kl": 0.3671875, + "learning_rate": 6.189999999999999e-08, + "loss": 0.007, + "ppl": 0.341796875, + "reward": 0.8462141156196594, + "reward_std": 0.017003802582621574, + "rewards/perpo_ocr_edit_distance_reward": 0.8462141156196594, + "step": 4381, + "temperature": 0.9 + }, + { + "advantages": 1.7029899268550253e-08, + "completion_length": 2048.0, + "delta_ref_entropy_loss": -0.035400390625, + "delta_ref_ppl": -0.0654296875, + "entropy_loss": -0.259765625, + "epoch": 0.8764, + "grad_norm": 5.399966038551917, + "k1_kl": 0.0654296875, + "k3_kl": 0.06298828125, + "kimi_kl": 0.11669921875, + "learning_rate": 6.18e-08, + "loss": 0.0025, + "ppl": 0.1328125, + "reward": 0.9052302837371826, + "reward_std": 0.05139234662055969, + "rewards/perpo_ocr_edit_distance_reward": 0.9052302837371826, + "step": 4382, + "temperature": 0.9 + }, + { + "advantages": -2.251352634630166e-05, + "completion_length": 546.0, + "delta_ref_entropy_loss": 0.0281982421875, + "delta_ref_ppl": -0.08984375, + "entropy_loss": -0.061279296875, + "epoch": 0.8766, + "grad_norm": 0.7798596227390154, + "k1_kl": 0.09033203125, + "k3_kl": 0.06640625, + "kimi_kl": 0.2158203125, + "learning_rate": 6.169999999999999e-08, + "loss": 0.0027, + "ppl": 0.0211181640625, + "reward": 0.9929616451263428, + "reward_std": 0.0010355691192671657, + "rewards/perpo_ocr_edit_distance_reward": 0.9929616451263428, + "step": 4383, + "temperature": 0.9 + }, + { + "advantages": 5.877869625692256e-05, + "completion_length": 808.0, + "delta_ref_entropy_loss": 0.01806640625, + "delta_ref_ppl": -0.040771484375, + "entropy_loss": -0.041748046875, + "epoch": 0.8768, + "grad_norm": 0.24251725557858916, + "k1_kl": 0.041015625, + "k3_kl": 0.027099609375, + "kimi_kl": 0.091796875, + "learning_rate": 6.16e-08, + "loss": 0.001, + "ppl": 0.00927734375, + "reward": 0.9991786479949951, + "reward_std": 0.00033464591251686215, + "rewards/perpo_ocr_edit_distance_reward": 0.9991786479949951, + "step": 4384, + "temperature": 0.9 + }, + { + "advantages": -2.7452197173261084e-05, + "completion_length": 206.0, + "delta_ref_entropy_loss": 0.0, + "delta_ref_ppl": -0.1875, + "entropy_loss": -0.1943359375, + "epoch": 0.877, + "grad_norm": 2.3224549977949778, + "k1_kl": 0.1875, + "k3_kl": 0.1455078125, + "kimi_kl": 0.5, + "learning_rate": 6.15e-08, + "loss": 0.0058, + "ppl": 0.08251953125, + "reward": 0.9739986658096313, + "reward_std": 0.002381509868428111, + "rewards/perpo_ocr_edit_distance_reward": 0.9739987850189209, + "step": 4385, + "temperature": 0.9 + }, + { + "advantages": -5.572182999458164e-05, + "completion_length": 1114.0, + "delta_ref_entropy_loss": 0.0269775390625, + "delta_ref_ppl": -0.06787109375, + "entropy_loss": -0.09033203125, + "epoch": 0.8772, + "grad_norm": 1.714391687648089, + "k1_kl": 0.06787109375, + "k3_kl": 0.046875, + "kimi_kl": 0.140625, + "learning_rate": 6.14e-08, + "loss": 0.0019, + "ppl": 0.037353515625, + "reward": 0.9086573719978333, + "reward_std": 0.001733050332404673, + "rewards/perpo_ocr_edit_distance_reward": 0.9086575508117676, + "step": 4386, + "temperature": 0.9 + }, + { + "advantages": -1.583780613145791e-05, + "completion_length": 1875.0, + "delta_ref_entropy_loss": 0.00994873046875, + "delta_ref_ppl": -0.0250244140625, + "entropy_loss": -0.058349609375, + "epoch": 0.8774, + "grad_norm": 10730677.360980764, + "k1_kl": 0.0250244140625, + "k3_kl": 13120.0, + "kimi_kl": 0.1474609375, + "learning_rate": 6.13e-08, + "loss": 525.1689, + "ppl": 0.038330078125, + "reward": 0.9788201451301575, + "reward_std": 0.0015105897327885032, + "rewards/perpo_ocr_edit_distance_reward": 0.9788202047348022, + "step": 4387, + "temperature": 0.9 + }, + { + "advantages": -6.399836274795234e-05, + "completion_length": 510.0, + "delta_ref_entropy_loss": 0.041015625, + "delta_ref_ppl": -0.09326171875, + "entropy_loss": -0.20703125, + "epoch": 0.8776, + "grad_norm": 1.0591404035587577, + "k1_kl": 0.09326171875, + "k3_kl": 0.06005859375, + "kimi_kl": 0.1630859375, + "learning_rate": 6.119999999999999e-08, + "loss": 0.0025, + "ppl": 0.078125, + "reward": 0.9294726252555847, + "reward_std": 0.0006979702156968415, + "rewards/perpo_ocr_edit_distance_reward": 0.9294726848602295, + "step": 4388, + "temperature": 0.9 + }, + { + "advantages": -3.56265481968876e-05, + "completion_length": 395.0, + "delta_ref_entropy_loss": 0.0294189453125, + "delta_ref_ppl": -0.10302734375, + "entropy_loss": -0.07763671875, + "epoch": 0.8778, + "grad_norm": 0.6490230150191492, + "k1_kl": 0.103515625, + "k3_kl": 0.076171875, + "kimi_kl": 0.279296875, + "learning_rate": 6.11e-08, + "loss": 0.0031, + "ppl": 0.029541015625, + "reward": 0.995955228805542, + "reward_std": 0.0013341328594833612, + "rewards/perpo_ocr_edit_distance_reward": 0.9959552884101868, + "step": 4389, + "temperature": 0.9 + }, + { + "advantages": -2.6055745365738403e-06, + "completion_length": 527.0, + "delta_ref_entropy_loss": 0.0096435546875, + "delta_ref_ppl": -0.10107421875, + "entropy_loss": -0.21875, + "epoch": 0.878, + "grad_norm": 1.4529207332144818, + "k1_kl": 0.1015625, + "k3_kl": 0.0693359375, + "kimi_kl": 0.2001953125, + "learning_rate": 6.099999999999999e-08, + "loss": 0.0028, + "ppl": 0.0908203125, + "reward": 0.8348232507705688, + "reward_std": 0.0064916787669062614, + "rewards/perpo_ocr_edit_distance_reward": 0.8348233699798584, + "step": 4390, + "temperature": 0.9 + }, + { + "advantages": 5.517687441169983e-06, + "completion_length": 362.0, + "delta_ref_entropy_loss": 0.038818359375, + "delta_ref_ppl": -0.109375, + "entropy_loss": -0.12890625, + "epoch": 0.8782, + "grad_norm": 0.8357416315687118, + "k1_kl": 0.10888671875, + "k3_kl": 0.07763671875, + "kimi_kl": 0.24609375, + "learning_rate": 6.09e-08, + "loss": 0.0031, + "ppl": 0.04150390625, + "reward": 0.9866974949836731, + "reward_std": 0.0029881575610488653, + "rewards/perpo_ocr_edit_distance_reward": 0.9866974949836731, + "step": 4391, + "temperature": 0.9 + }, + { + "advantages": 1.27724248955019e-08, + "completion_length": 371.0, + "delta_ref_entropy_loss": 0.0255126953125, + "delta_ref_ppl": -0.08837890625, + "entropy_loss": -0.07470703125, + "epoch": 0.8784, + "grad_norm": 0.9630843365761781, + "k1_kl": 0.08837890625, + "k3_kl": 0.06298828125, + "kimi_kl": 0.287109375, + "learning_rate": 6.08e-08, + "loss": 0.0025, + "ppl": 0.023193359375, + "reward": 0.32800689339637756, + "reward_std": 0.001205273438245058, + "rewards/perpo_ocr_edit_distance_reward": 0.32800689339637756, + "step": 4392, + "temperature": 0.9 + }, + { + "advantages": 4.257474817137563e-09, + "completion_length": 687.0, + "delta_ref_entropy_loss": 0.052001953125, + "delta_ref_ppl": -0.08203125, + "entropy_loss": -0.07958984375, + "epoch": 0.8786, + "grad_norm": 0.7236928332117918, + "k1_kl": 0.08203125, + "k3_kl": 0.052978515625, + "kimi_kl": 0.1826171875, + "learning_rate": 6.07e-08, + "loss": 0.0021, + "ppl": 0.031982421875, + "reward": 0.6540035009384155, + "reward_std": 0.0006109435926191509, + "rewards/perpo_ocr_edit_distance_reward": 0.6540035605430603, + "step": 4393, + "temperature": 0.9 + }, + { + "advantages": -2.588544703030493e-06, + "completion_length": 552.0, + "delta_ref_entropy_loss": -0.06494140625, + "delta_ref_ppl": -0.0869140625, + "entropy_loss": -0.390625, + "epoch": 0.8788, + "grad_norm": 1.7298565963133081, + "k1_kl": 0.08740234375, + "k3_kl": 0.0751953125, + "kimi_kl": 0.1513671875, + "learning_rate": 6.06e-08, + "loss": 0.003, + "ppl": 0.16796875, + "reward": 0.7581709623336792, + "reward_std": 0.00318806990981102, + "rewards/perpo_ocr_edit_distance_reward": 0.7581709623336792, + "step": 4394, + "temperature": 0.9 + }, + { + "advantages": -1.532690930616809e-06, + "completion_length": 1080.0, + "delta_ref_entropy_loss": 0.002410888671875, + "delta_ref_ppl": -0.09765625, + "entropy_loss": -0.1328125, + "epoch": 0.879, + "grad_norm": 1.6065728701904194, + "k1_kl": 0.09765625, + "k3_kl": 0.06982421875, + "kimi_kl": 0.234375, + "learning_rate": 6.049999999999999e-08, + "loss": 0.0028, + "ppl": 0.06396484375, + "reward": 0.5007075071334839, + "reward_std": 0.021981360390782356, + "rewards/perpo_ocr_edit_distance_reward": 0.5007075667381287, + "step": 4395, + "temperature": 0.9 + }, + { + "advantages": -0.0001273836533073336, + "completion_length": 1090.0, + "delta_ref_entropy_loss": 0.0302734375, + "delta_ref_ppl": -0.055419921875, + "entropy_loss": -0.04931640625, + "epoch": 0.8792, + "grad_norm": 0.7051843806108006, + "k1_kl": 0.0556640625, + "k3_kl": 0.037353515625, + "kimi_kl": 0.1298828125, + "learning_rate": 6.04e-08, + "loss": 0.0016, + "ppl": 0.018798828125, + "reward": 0.9948082566261292, + "reward_std": 0.0004348089569248259, + "rewards/perpo_ocr_edit_distance_reward": 0.9948083162307739, + "step": 4396, + "temperature": 0.9 + }, + { + "advantages": -1.4543534234690014e-05, + "completion_length": 516.0, + "delta_ref_entropy_loss": 0.0303955078125, + "delta_ref_ppl": -0.07666015625, + "entropy_loss": -0.04833984375, + "epoch": 0.8794, + "grad_norm": 0.5452094231819733, + "k1_kl": 0.07666015625, + "k3_kl": 0.05078125, + "kimi_kl": 0.193359375, + "learning_rate": 6.029999999999999e-08, + "loss": 0.0021, + "ppl": 0.0125732421875, + "reward": 0.9964333772659302, + "reward_std": 0.0004852825659327209, + "rewards/perpo_ocr_edit_distance_reward": 0.996433436870575, + "step": 4397, + "temperature": 0.9 + }, + { + "advantages": -5.353348751668818e-05, + "completion_length": 444.0, + "delta_ref_entropy_loss": 0.027587890625, + "delta_ref_ppl": -0.05224609375, + "entropy_loss": -0.064453125, + "epoch": 0.8796, + "grad_norm": 0.5270975339131113, + "k1_kl": 0.05224609375, + "k3_kl": 0.0322265625, + "kimi_kl": 0.08056640625, + "learning_rate": 6.02e-08, + "loss": 0.0013, + "ppl": 0.0196533203125, + "reward": 0.9944543242454529, + "reward_std": 0.0003773128264583647, + "rewards/perpo_ocr_edit_distance_reward": 0.9944544434547424, + "step": 4398, + "temperature": 0.9 + }, + { + "advantages": -1.1554786397027783e-05, + "completion_length": 1207.0, + "delta_ref_entropy_loss": 0.0125732421875, + "delta_ref_ppl": -0.03955078125, + "entropy_loss": -0.07080078125, + "epoch": 0.8798, + "grad_norm": 0.7281416532665269, + "k1_kl": 0.039794921875, + "k3_kl": 0.0260009765625, + "kimi_kl": 0.059326171875, + "learning_rate": 6.01e-08, + "loss": 0.0011, + "ppl": 0.032470703125, + "reward": 0.9813462495803833, + "reward_std": 0.006535072810947895, + "rewards/perpo_ocr_edit_distance_reward": 0.9813463091850281, + "step": 4399, + "temperature": 0.9 + }, + { + "advantages": 2.0955290892743506e-05, + "completion_length": 940.0, + "delta_ref_entropy_loss": -0.0230712890625, + "delta_ref_ppl": -0.07275390625, + "entropy_loss": -0.2138671875, + "epoch": 0.88, + "grad_norm": 1.2979250794722306, + "k1_kl": 0.0732421875, + "k3_kl": 0.059814453125, + "kimi_kl": 0.166015625, + "learning_rate": 6e-08, + "loss": 0.0024, + "ppl": 0.08935546875, + "reward": 0.9015657901763916, + "reward_std": 0.0015246826224029064, + "rewards/perpo_ocr_edit_distance_reward": 0.9015657305717468, + "step": 4400, + "temperature": 0.9 + }, + { + "advantages": -3.7636077649949584e-06, + "completion_length": 699.0, + "delta_ref_entropy_loss": -0.0211181640625, + "delta_ref_ppl": -0.061279296875, + "entropy_loss": -0.1689453125, + "epoch": 0.8802, + "grad_norm": 1.892205753494591, + "k1_kl": 0.06103515625, + "k3_kl": 0.052001953125, + "kimi_kl": 0.140625, + "learning_rate": 5.99e-08, + "loss": 0.0021, + "ppl": 0.061279296875, + "reward": 0.9613128900527954, + "reward_std": 0.008976247161626816, + "rewards/perpo_ocr_edit_distance_reward": 0.9613128900527954, + "step": 4401, + "temperature": 0.9 + }, + { + "advantages": -6.786415178794414e-05, + "completion_length": 497.0, + "delta_ref_entropy_loss": 0.031982421875, + "delta_ref_ppl": -0.0771484375, + "entropy_loss": -0.046875, + "epoch": 0.8804, + "grad_norm": 0.4192211596063253, + "k1_kl": 0.0771484375, + "k3_kl": 0.052734375, + "kimi_kl": 0.1611328125, + "learning_rate": 5.979999999999999e-08, + "loss": 0.0022, + "ppl": 0.01507568359375, + "reward": 0.9968870878219604, + "reward_std": 0.0007781982421875, + "rewards/perpo_ocr_edit_distance_reward": 0.99688720703125, + "step": 4402, + "temperature": 0.9 + }, + { + "advantages": -4.107611675863154e-05, + "completion_length": 294.0, + "delta_ref_entropy_loss": 0.035400390625, + "delta_ref_ppl": -0.09033203125, + "entropy_loss": -0.05712890625, + "epoch": 0.8806, + "grad_norm": 0.47025511709386725, + "k1_kl": 0.09033203125, + "k3_kl": 0.06298828125, + "kimi_kl": 0.201171875, + "learning_rate": 5.97e-08, + "loss": 0.0026, + "ppl": 0.018798828125, + "reward": 0.9829174280166626, + "reward_std": 0.001144092297181487, + "rewards/perpo_ocr_edit_distance_reward": 0.9829174876213074, + "step": 4403, + "temperature": 0.9 + }, + { + "advantages": -1.0660716725396924e-05, + "completion_length": 312.0, + "delta_ref_entropy_loss": 0.05224609375, + "delta_ref_ppl": -0.150390625, + "entropy_loss": -0.1708984375, + "epoch": 0.8808, + "grad_norm": 1.6902227354534443, + "k1_kl": 0.150390625, + "k3_kl": 0.1103515625, + "kimi_kl": 0.36328125, + "learning_rate": 5.96e-08, + "loss": 0.0044, + "ppl": 0.0615234375, + "reward": 0.9865049123764038, + "reward_std": 0.004690872970968485, + "rewards/perpo_ocr_edit_distance_reward": 0.9865050315856934, + "step": 4404, + "temperature": 0.9 + }, + { + "advantages": -5.934919954597717e-06, + "completion_length": 567.0, + "delta_ref_entropy_loss": 0.01953125, + "delta_ref_ppl": -0.1318359375, + "entropy_loss": -0.35546875, + "epoch": 0.881, + "grad_norm": 1.963090410472854, + "k1_kl": 0.1318359375, + "k3_kl": 0.0966796875, + "kimi_kl": 0.265625, + "learning_rate": 5.95e-08, + "loss": 0.0039, + "ppl": 0.1640625, + "reward": 0.8495914340019226, + "reward_std": 0.0027743864338845015, + "rewards/perpo_ocr_edit_distance_reward": 0.8495915532112122, + "step": 4405, + "temperature": 0.9 + }, + { + "advantages": -4.9812453653430566e-05, + "completion_length": 900.0, + "delta_ref_entropy_loss": 0.0164794921875, + "delta_ref_ppl": -0.0517578125, + "entropy_loss": -0.08251953125, + "epoch": 0.8812, + "grad_norm": 0.6385854230846549, + "k1_kl": 0.0517578125, + "k3_kl": 0.03466796875, + "kimi_kl": 0.10986328125, + "learning_rate": 5.9399999999999996e-08, + "loss": 0.0014, + "ppl": 0.0264892578125, + "reward": 0.992117702960968, + "reward_std": 0.0005838014767505229, + "rewards/perpo_ocr_edit_distance_reward": 0.9921177625656128, + "step": 4406, + "temperature": 0.9 + }, + { + "advantages": -2.907003727159463e-05, + "completion_length": 283.0, + "delta_ref_entropy_loss": 0.039306640625, + "delta_ref_ppl": -0.1494140625, + "entropy_loss": -0.10009765625, + "epoch": 0.8814, + "grad_norm": 0.5479351468862182, + "k1_kl": 0.1494140625, + "k3_kl": 0.10791015625, + "kimi_kl": 0.3125, + "learning_rate": 5.9299999999999995e-08, + "loss": 0.0043, + "ppl": 0.03759765625, + "reward": 0.997506856918335, + "reward_std": 0.001364539610221982, + "rewards/perpo_ocr_edit_distance_reward": 0.9975069165229797, + "step": 4407, + "temperature": 0.9 + }, + { + "advantages": -4.485675526666455e-05, + "completion_length": 631.0, + "delta_ref_entropy_loss": 0.048095703125, + "delta_ref_ppl": -0.08984375, + "entropy_loss": -0.12255859375, + "epoch": 0.8816, + "grad_norm": 1.6593628412755932, + "k1_kl": 0.09033203125, + "k3_kl": 0.051513671875, + "kimi_kl": 0.126953125, + "learning_rate": 5.92e-08, + "loss": 0.0021, + "ppl": 0.045166015625, + "reward": 0.9746369123458862, + "reward_std": 0.0010383931221440434, + "rewards/perpo_ocr_edit_distance_reward": 0.9746370315551758, + "step": 4408, + "temperature": 0.9 + }, + { + "advantages": 4.5980726781635894e-07, + "completion_length": 999.0, + "delta_ref_entropy_loss": 0.0196533203125, + "delta_ref_ppl": -0.0712890625, + "entropy_loss": -0.1201171875, + "epoch": 0.8818, + "grad_norm": 1.1471676109766387, + "k1_kl": 0.0712890625, + "k3_kl": 0.048583984375, + "kimi_kl": 0.1181640625, + "learning_rate": 5.91e-08, + "loss": 0.0019, + "ppl": 0.055419921875, + "reward": 0.9699247479438782, + "reward_std": 0.01791215129196644, + "rewards/perpo_ocr_edit_distance_reward": 0.969924807548523, + "step": 4409, + "temperature": 0.9 + }, + { + "advantages": -7.574899063911289e-05, + "completion_length": 571.0, + "delta_ref_entropy_loss": -0.005645751953125, + "delta_ref_ppl": -0.062255859375, + "entropy_loss": -0.12451171875, + "epoch": 0.882, + "grad_norm": 0.7504511513401265, + "k1_kl": 0.0625, + "k3_kl": 0.044189453125, + "kimi_kl": 0.1171875, + "learning_rate": 5.899999999999999e-08, + "loss": 0.0018, + "ppl": 0.04052734375, + "reward": 0.9874356985092163, + "reward_std": 0.001249029766768217, + "rewards/perpo_ocr_edit_distance_reward": 0.9874358177185059, + "step": 4410, + "temperature": 0.9 + }, + { + "advantages": -2.6736941435956396e-05, + "completion_length": 161.0, + "delta_ref_entropy_loss": 0.0034942626953125, + "delta_ref_ppl": -0.263671875, + "entropy_loss": -0.13671875, + "epoch": 0.8822, + "grad_norm": 0.945147548146115, + "k1_kl": 0.263671875, + "k3_kl": 0.203125, + "kimi_kl": 0.78515625, + "learning_rate": 5.89e-08, + "loss": 0.0082, + "ppl": 0.03857421875, + "reward": 0.9890282154083252, + "reward_std": 0.0018098700093105435, + "rewards/perpo_ocr_edit_distance_reward": 0.98902827501297, + "step": 4411, + "temperature": 0.9 + }, + { + "advantages": -2.487216806912329e-05, + "completion_length": 865.0, + "delta_ref_entropy_loss": -0.003448486328125, + "delta_ref_ppl": -0.035400390625, + "entropy_loss": -0.0625, + "epoch": 0.8824, + "grad_norm": 0.5319286410667275, + "k1_kl": 0.035400390625, + "k3_kl": 0.0234375, + "kimi_kl": 0.05908203125, + "learning_rate": 5.88e-08, + "loss": 0.001, + "ppl": 0.0198974609375, + "reward": 0.9960205554962158, + "reward_std": 0.0029797463212162256, + "rewards/perpo_ocr_edit_distance_reward": 0.9960206151008606, + "step": 4412, + "temperature": 0.9 + }, + { + "advantages": 1.7029899268550253e-08, + "completion_length": 663.0, + "delta_ref_entropy_loss": 0.0311279296875, + "delta_ref_ppl": -0.078125, + "entropy_loss": -0.07666015625, + "epoch": 0.8826, + "grad_norm": 0.5219493522846845, + "k1_kl": 0.078125, + "k3_kl": 0.052978515625, + "kimi_kl": 0.1650390625, + "learning_rate": 5.8699999999999996e-08, + "loss": 0.0021, + "ppl": 0.025146484375, + "reward": 0.9828181266784668, + "reward_std": 0.001202055369503796, + "rewards/perpo_ocr_edit_distance_reward": 0.9828181266784668, + "step": 4413, + "temperature": 0.9 + }, + { + "advantages": -6.931169082236011e-06, + "completion_length": 218.0, + "delta_ref_entropy_loss": 0.035888671875, + "delta_ref_ppl": -0.2109375, + "entropy_loss": -0.08544921875, + "epoch": 0.8828, + "grad_norm": 2.0063546595731605, + "k1_kl": 0.2109375, + "k3_kl": 0.16015625, + "kimi_kl": 0.79296875, + "learning_rate": 5.8599999999999995e-08, + "loss": 0.0064, + "ppl": 0.0220947265625, + "reward": 0.9793012738227844, + "reward_std": 0.002351630013436079, + "rewards/perpo_ocr_edit_distance_reward": 0.9793013334274292, + "step": 4414, + "temperature": 0.9 + }, + { + "advantages": -4.764965706272051e-05, + "completion_length": 596.0, + "delta_ref_entropy_loss": 0.0240478515625, + "delta_ref_ppl": -0.06298828125, + "entropy_loss": -0.0400390625, + "epoch": 0.883, + "grad_norm": 1.1586995199959746, + "k1_kl": 0.06298828125, + "k3_kl": 0.044189453125, + "kimi_kl": 0.154296875, + "learning_rate": 5.85e-08, + "loss": 0.0018, + "ppl": 0.0128173828125, + "reward": 0.9915159344673157, + "reward_std": 0.0009723695693537593, + "rewards/perpo_ocr_edit_distance_reward": 0.9915159940719604, + "step": 4415, + "temperature": 0.9 + }, + { + "advantages": -7.3058267844317015e-06, + "completion_length": 1067.0, + "delta_ref_entropy_loss": 0.0037689208984375, + "delta_ref_ppl": -0.02294921875, + "entropy_loss": -0.042724609375, + "epoch": 0.8832, + "grad_norm": 0.5283111338645737, + "k1_kl": 0.02294921875, + "k3_kl": 0.0167236328125, + "kimi_kl": 0.03857421875, + "learning_rate": 5.84e-08, + "loss": 0.0007, + "ppl": 0.017333984375, + "reward": 0.9892364740371704, + "reward_std": 0.0010636651422828436, + "rewards/perpo_ocr_edit_distance_reward": 0.9892364740371704, + "step": 4416, + "temperature": 0.9 + }, + { + "advantages": -2.384185791015625e-07, + "completion_length": 1651.0, + "delta_ref_entropy_loss": -0.18359375, + "delta_ref_ppl": -0.024658203125, + "entropy_loss": -0.43359375, + "epoch": 0.8834, + "grad_norm": 5.301661855694881, + "k1_kl": 0.0245361328125, + "k3_kl": 0.050537109375, + "kimi_kl": 0.08349609375, + "learning_rate": 5.829999999999999e-08, + "loss": 0.002, + "ppl": 0.205078125, + "reward": 0.7643991708755493, + "reward_std": 0.1939648538827896, + "rewards/perpo_ocr_edit_distance_reward": 0.7643992900848389, + "step": 4417, + "temperature": 0.9 + }, + { + "advantages": -1.2321132089709863e-05, + "completion_length": 605.0, + "delta_ref_entropy_loss": -0.154296875, + "delta_ref_ppl": -0.08740234375, + "entropy_loss": -0.90234375, + "epoch": 0.8836, + "grad_norm": 3.1297178371193803, + "k1_kl": 0.08740234375, + "k3_kl": 0.10107421875, + "kimi_kl": 0.171875, + "learning_rate": 5.82e-08, + "loss": 0.0041, + "ppl": 0.484375, + "reward": 0.7823617458343506, + "reward_std": 0.0040505193173885345, + "rewards/perpo_ocr_edit_distance_reward": 0.7823618054389954, + "step": 4418, + "temperature": 0.9 + }, + { + "advantages": 2.09978661587229e-05, + "completion_length": 1116.0, + "delta_ref_entropy_loss": 0.042236328125, + "delta_ref_ppl": -0.049560546875, + "entropy_loss": -0.0791015625, + "epoch": 0.8838, + "grad_norm": 5.518433505119178, + "k1_kl": 0.049560546875, + "k3_kl": 0.0277099609375, + "kimi_kl": 0.0625, + "learning_rate": 5.81e-08, + "loss": 0.0011, + "ppl": 0.032958984375, + "reward": 0.993793785572052, + "reward_std": 0.0011163734598085284, + "rewards/perpo_ocr_edit_distance_reward": 0.993793785572052, + "step": 4419, + "temperature": 0.9 + }, + { + "advantages": -7.373946573352441e-05, + "completion_length": 389.0, + "delta_ref_entropy_loss": 0.059814453125, + "delta_ref_ppl": -0.1220703125, + "entropy_loss": -0.107421875, + "epoch": 0.884, + "grad_norm": 0.6201356215850132, + "k1_kl": 0.1220703125, + "k3_kl": 0.07958984375, + "kimi_kl": 0.23828125, + "learning_rate": 5.8e-08, + "loss": 0.0033, + "ppl": 0.03515625, + "reward": 0.8552818894386292, + "reward_std": 0.0010551608866080642, + "rewards/perpo_ocr_edit_distance_reward": 0.8552820086479187, + "step": 4420, + "temperature": 0.9 + }, + { + "advantages": -4.809243546333164e-05, + "completion_length": 802.0, + "delta_ref_entropy_loss": 0.027587890625, + "delta_ref_ppl": -0.068359375, + "entropy_loss": -0.054443359375, + "epoch": 0.8842, + "grad_norm": 1.487220296076199, + "k1_kl": 0.068359375, + "k3_kl": 0.052490234375, + "kimi_kl": 0.1689453125, + "learning_rate": 5.7899999999999996e-08, + "loss": 0.0022, + "ppl": 0.0157470703125, + "reward": 0.9861956238746643, + "reward_std": 0.0007852926501072943, + "rewards/perpo_ocr_edit_distance_reward": 0.9861956834793091, + "step": 4421, + "temperature": 0.9 + }, + { + "advantages": -2.469335413479712e-05, + "completion_length": 792.0, + "delta_ref_entropy_loss": 0.0157470703125, + "delta_ref_ppl": -0.05419921875, + "entropy_loss": -0.058349609375, + "epoch": 0.8844, + "grad_norm": 0.3481028070384352, + "k1_kl": 0.05419921875, + "k3_kl": 0.03564453125, + "kimi_kl": 0.10302734375, + "learning_rate": 5.7799999999999995e-08, + "loss": 0.0014, + "ppl": 0.021240234375, + "reward": 0.9945612549781799, + "reward_std": 0.0016242042183876038, + "rewards/perpo_ocr_edit_distance_reward": 0.9945613145828247, + "step": 4422, + "temperature": 0.9 + }, + { + "advantages": -8.514949456639442e-08, + "completion_length": 153.0, + "delta_ref_entropy_loss": -1.046875, + "delta_ref_ppl": -0.279296875, + "entropy_loss": -2.09375, + "epoch": 0.8846, + "grad_norm": 10.58709575995909, + "k1_kl": 0.279296875, + "k3_kl": 0.384765625, + "kimi_kl": 0.98046875, + "learning_rate": 5.77e-08, + "loss": 0.0154, + "ppl": 0.98828125, + "reward": 0.3439115285873413, + "reward_std": 0.11894232034683228, + "rewards/perpo_ocr_edit_distance_reward": 0.3439115583896637, + "step": 4423, + "temperature": 0.9 + }, + { + "advantages": -2.9597964385175146e-05, + "completion_length": 953.0, + "delta_ref_entropy_loss": 0.00188446044921875, + "delta_ref_ppl": -0.050048828125, + "entropy_loss": -0.07763671875, + "epoch": 0.8848, + "grad_norm": 0.552258895796227, + "k1_kl": 0.050048828125, + "k3_kl": 0.03466796875, + "kimi_kl": 0.0849609375, + "learning_rate": 5.759999999999999e-08, + "loss": 0.0014, + "ppl": 0.028076171875, + "reward": 0.9833618402481079, + "reward_std": 0.0016275187954306602, + "rewards/perpo_ocr_edit_distance_reward": 0.9833618402481079, + "step": 4424, + "temperature": 0.9 + }, + { + "advantages": -1.021793991640152e-07, + "completion_length": 643.0, + "delta_ref_entropy_loss": 0.039794921875, + "delta_ref_ppl": -0.09521484375, + "entropy_loss": -0.12109375, + "epoch": 0.885, + "grad_norm": 0.7662699241753065, + "k1_kl": 0.0947265625, + "k3_kl": 0.0634765625, + "kimi_kl": 0.1669921875, + "learning_rate": 5.75e-08, + "loss": 0.0025, + "ppl": 0.047607421875, + "reward": 0.9481964111328125, + "reward_std": 0.09359703958034515, + "rewards/perpo_ocr_edit_distance_reward": 0.9481964707374573, + "step": 4425, + "temperature": 0.9 + }, + { + "advantages": -1.3095992471789941e-05, + "completion_length": 732.0, + "delta_ref_entropy_loss": 0.017333984375, + "delta_ref_ppl": -0.08447265625, + "entropy_loss": -0.201171875, + "epoch": 0.8852, + "grad_norm": 1.3639157237156627, + "k1_kl": 0.08447265625, + "k3_kl": 0.06396484375, + "kimi_kl": 0.1767578125, + "learning_rate": 5.74e-08, + "loss": 0.0026, + "ppl": 0.08203125, + "reward": 0.9109592437744141, + "reward_std": 0.005749812815338373, + "rewards/perpo_ocr_edit_distance_reward": 0.9109593629837036, + "step": 4426, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 276.0, + "delta_ref_entropy_loss": -0.0654296875, + "delta_ref_ppl": -0.1259765625, + "entropy_loss": -0.232421875, + "epoch": 0.8854, + "grad_norm": 1.6868657740764172, + "k1_kl": 0.1259765625, + "k3_kl": 0.11181640625, + "kimi_kl": 0.43359375, + "learning_rate": 5.73e-08, + "loss": 0.0045, + "ppl": 0.08251953125, + "reward": 0.8304069638252258, + "reward_std": 0.004821031354367733, + "rewards/perpo_ocr_edit_distance_reward": 0.8304070234298706, + "step": 4427, + "temperature": 0.9 + }, + { + "advantages": -8.81297296473349e-07, + "completion_length": 1276.0, + "delta_ref_entropy_loss": -0.09326171875, + "delta_ref_ppl": -0.11572265625, + "entropy_loss": -0.91015625, + "epoch": 0.8856, + "grad_norm": 3.6862846211386726, + "k1_kl": 0.115234375, + "k3_kl": 0.10595703125, + "kimi_kl": 0.1923828125, + "learning_rate": 5.7199999999999996e-08, + "loss": 0.0042, + "ppl": 0.5078125, + "reward": 0.3805842101573944, + "reward_std": 0.018814822658896446, + "rewards/perpo_ocr_edit_distance_reward": 0.3805842101573944, + "step": 4428, + "temperature": 0.9 + }, + { + "advantages": -3.814697265625e-05, + "completion_length": 944.0, + "delta_ref_entropy_loss": 0.031494140625, + "delta_ref_ppl": -0.05615234375, + "entropy_loss": -0.060546875, + "epoch": 0.8858, + "grad_norm": 0.4552591804276642, + "k1_kl": 0.05615234375, + "k3_kl": 0.0361328125, + "kimi_kl": 0.10302734375, + "learning_rate": 5.7099999999999995e-08, + "loss": 0.0015, + "ppl": 0.0255126953125, + "reward": 0.9955666065216064, + "reward_std": 0.0005693654529750347, + "rewards/perpo_ocr_edit_distance_reward": 0.9955666661262512, + "step": 4429, + "temperature": 0.9 + }, + { + "advantages": -3.739765816135332e-05, + "completion_length": 658.0, + "delta_ref_entropy_loss": 0.021484375, + "delta_ref_ppl": -0.05517578125, + "entropy_loss": -0.046142578125, + "epoch": 0.886, + "grad_norm": 0.28197386759566334, + "k1_kl": 0.05517578125, + "k3_kl": 0.03564453125, + "kimi_kl": 0.1005859375, + "learning_rate": 5.7e-08, + "loss": 0.0015, + "ppl": 0.0133056640625, + "reward": 0.9980918765068054, + "reward_std": 0.0005835562478750944, + "rewards/perpo_ocr_edit_distance_reward": 0.9980918765068054, + "step": 4430, + "temperature": 0.9 + }, + { + "advantages": -1.7029899268550253e-08, + "completion_length": 361.0, + "delta_ref_entropy_loss": -0.056396484375, + "delta_ref_ppl": -0.07421875, + "entropy_loss": -0.43359375, + "epoch": 0.8862, + "grad_norm": 3.022903187318404, + "k1_kl": 0.07421875, + "k3_kl": 0.0654296875, + "kimi_kl": 0.138671875, + "learning_rate": 5.6899999999999993e-08, + "loss": 0.0026, + "ppl": 0.22265625, + "reward": 0.27908456325531006, + "reward_std": 0.2703181207180023, + "rewards/perpo_ocr_edit_distance_reward": 0.27908459305763245, + "step": 4431, + "temperature": 0.9 + }, + { + "advantages": -2.5498016839264892e-05, + "completion_length": 654.0, + "delta_ref_entropy_loss": 0.01287841796875, + "delta_ref_ppl": -0.06201171875, + "entropy_loss": -0.057373046875, + "epoch": 0.8864, + "grad_norm": 0.6300733559362753, + "k1_kl": 0.06201171875, + "k3_kl": 0.04150390625, + "kimi_kl": 0.1640625, + "learning_rate": 5.68e-08, + "loss": 0.0017, + "ppl": 0.0203857421875, + "reward": 0.9951217770576477, + "reward_std": 0.0009017796255648136, + "rewards/perpo_ocr_edit_distance_reward": 0.9951217770576477, + "step": 4432, + "temperature": 0.9 + }, + { + "advantages": -1.7029899268550253e-08, + "completion_length": 546.0, + "delta_ref_entropy_loss": -0.1484375, + "delta_ref_ppl": -0.1201171875, + "entropy_loss": -0.62890625, + "epoch": 0.8866, + "grad_norm": 5.331563269516772, + "k1_kl": 0.12060546875, + "k3_kl": 0.12255859375, + "kimi_kl": 0.24609375, + "learning_rate": 5.67e-08, + "loss": 0.0049, + "ppl": 0.3125, + "reward": 0.7035731077194214, + "reward_std": 0.21983756124973297, + "rewards/perpo_ocr_edit_distance_reward": 0.7035731077194214, + "step": 4433, + "temperature": 0.9 + }, + { + "advantages": -1.7029899268550253e-08, + "completion_length": 340.0, + "delta_ref_entropy_loss": 0.021728515625, + "delta_ref_ppl": -0.09765625, + "entropy_loss": -0.09716796875, + "epoch": 0.8868, + "grad_norm": 1.2232496038046523, + "k1_kl": 0.09765625, + "k3_kl": 0.07177734375, + "kimi_kl": 0.2138671875, + "learning_rate": 5.66e-08, + "loss": 0.0029, + "ppl": 0.035400390625, + "reward": 0.9778239727020264, + "reward_std": 0.002117154188454151, + "rewards/perpo_ocr_edit_distance_reward": 0.9778239727020264, + "step": 4434, + "temperature": 0.9 + }, + { + "advantages": -3.101144829997793e-05, + "completion_length": 966.0, + "delta_ref_entropy_loss": 0.032470703125, + "delta_ref_ppl": -0.06103515625, + "entropy_loss": -0.2021484375, + "epoch": 0.887, + "grad_norm": 2.345104969071011, + "k1_kl": 0.061279296875, + "k3_kl": 0.041748046875, + "kimi_kl": 0.09326171875, + "learning_rate": 5.6499999999999996e-08, + "loss": 0.0017, + "ppl": 0.0966796875, + "reward": 0.9521409273147583, + "reward_std": 0.001822207821533084, + "rewards/perpo_ocr_edit_distance_reward": 0.9521411061286926, + "step": 4435, + "temperature": 0.9 + }, + { + "advantages": -2.469335413479712e-06, + "completion_length": 1076.0, + "delta_ref_entropy_loss": -0.13671875, + "delta_ref_ppl": -0.02734375, + "entropy_loss": -0.2578125, + "epoch": 0.8872, + "grad_norm": 2.471613727804196, + "k1_kl": 0.0272216796875, + "k3_kl": 0.041748046875, + "kimi_kl": 0.1005859375, + "learning_rate": 5.6399999999999995e-08, + "loss": 0.0017, + "ppl": 0.08154296875, + "reward": 0.9761620163917542, + "reward_std": 0.030854370445013046, + "rewards/perpo_ocr_edit_distance_reward": 0.9761621356010437, + "step": 4436, + "temperature": 0.9 + }, + { + "advantages": -3.2356808787881164e-07, + "completion_length": 203.0, + "delta_ref_entropy_loss": -0.1552734375, + "delta_ref_ppl": -0.1025390625, + "entropy_loss": -1.25, + "epoch": 0.8874, + "grad_norm": 8.11647028649822, + "k1_kl": 0.10205078125, + "k3_kl": 0.1494140625, + "kimi_kl": 0.25390625, + "learning_rate": 5.63e-08, + "loss": 0.0059, + "ppl": 0.65234375, + "reward": 0.31785687804222107, + "reward_std": 0.1311410516500473, + "rewards/perpo_ocr_edit_distance_reward": 0.31785693764686584, + "step": 4437, + "temperature": 0.9 + }, + { + "advantages": 2.8073789508198388e-05, + "completion_length": 504.0, + "delta_ref_entropy_loss": 0.04296875, + "delta_ref_ppl": -0.103515625, + "entropy_loss": -0.0791015625, + "epoch": 0.8876, + "grad_norm": 0.4559315533785068, + "k1_kl": 0.103515625, + "k3_kl": 0.068359375, + "kimi_kl": 0.2421875, + "learning_rate": 5.62e-08, + "loss": 0.0027, + "ppl": 0.0267333984375, + "reward": 0.9949096441268921, + "reward_std": 0.0008098709513433278, + "rewards/perpo_ocr_edit_distance_reward": 0.9949095845222473, + "step": 4438, + "temperature": 0.9 + }, + { + "advantages": -3.620556526584551e-05, + "completion_length": 789.0, + "delta_ref_entropy_loss": 0.04541015625, + "delta_ref_ppl": -0.060302734375, + "entropy_loss": -0.0458984375, + "epoch": 0.8878, + "grad_norm": 0.43555481187488865, + "k1_kl": 0.060302734375, + "k3_kl": 0.0303955078125, + "kimi_kl": 0.08447265625, + "learning_rate": 5.609999999999999e-08, + "loss": 0.0012, + "ppl": 0.0152587890625, + "reward": 0.9973325729370117, + "reward_std": 0.000606051879003644, + "rewards/perpo_ocr_edit_distance_reward": 0.9973326325416565, + "step": 4439, + "temperature": 0.9 + }, + { + "advantages": -5.79016568735824e-06, + "completion_length": 510.0, + "delta_ref_entropy_loss": -0.0159912109375, + "delta_ref_ppl": -0.0615234375, + "entropy_loss": -0.1416015625, + "epoch": 0.888, + "grad_norm": 1.471997660135895, + "k1_kl": 0.0615234375, + "k3_kl": 0.049560546875, + "kimi_kl": 0.16015625, + "learning_rate": 5.6e-08, + "loss": 0.002, + "ppl": 0.0595703125, + "reward": 0.859651505947113, + "reward_std": 0.008752056397497654, + "rewards/perpo_ocr_edit_distance_reward": 0.8596516251564026, + "step": 4440, + "temperature": 0.9 + }, + { + "advantages": -1.94140852727287e-06, + "completion_length": 973.0, + "delta_ref_entropy_loss": 0.068359375, + "delta_ref_ppl": -0.09912109375, + "entropy_loss": -0.1455078125, + "epoch": 0.8882, + "grad_norm": 2.6281235609942155, + "k1_kl": 0.09912109375, + "k3_kl": 0.06640625, + "kimi_kl": 0.197265625, + "learning_rate": 5.59e-08, + "loss": 0.0027, + "ppl": 0.0693359375, + "reward": 0.9610452055931091, + "reward_std": 0.03471953421831131, + "rewards/perpo_ocr_edit_distance_reward": 0.9610453248023987, + "step": 4441, + "temperature": 0.9 + }, + { + "advantages": -6.719998054904863e-05, + "completion_length": 1033.0, + "delta_ref_entropy_loss": 0.017578125, + "delta_ref_ppl": -0.039306640625, + "entropy_loss": -0.04931640625, + "epoch": 0.8884, + "grad_norm": 2.5058757475203652, + "k1_kl": 0.039306640625, + "k3_kl": 0.0262451171875, + "kimi_kl": 0.07763671875, + "learning_rate": 5.58e-08, + "loss": 0.0011, + "ppl": 0.0196533203125, + "reward": 0.9957494139671326, + "reward_std": 0.0007870003464631736, + "rewards/perpo_ocr_edit_distance_reward": 0.9957495331764221, + "step": 4442, + "temperature": 0.9 + }, + { + "advantages": -0.0003056951973121613, + "completion_length": 545.0, + "delta_ref_entropy_loss": 0.056884765625, + "delta_ref_ppl": -0.0966796875, + "entropy_loss": -0.056640625, + "epoch": 0.8886, + "grad_norm": 0.5733551715076299, + "k1_kl": 0.0966796875, + "k3_kl": 0.068359375, + "kimi_kl": 0.251953125, + "learning_rate": 5.5699999999999996e-08, + "loss": 0.003, + "ppl": 0.015625, + "reward": 0.9916567206382751, + "reward_std": 0.00028995273169130087, + "rewards/perpo_ocr_edit_distance_reward": 0.9916567802429199, + "step": 4443, + "temperature": 0.9 + }, + { + "advantages": -0.0005960464477539062, + "completion_length": 559.0, + "delta_ref_entropy_loss": 0.031982421875, + "delta_ref_ppl": -0.05712890625, + "entropy_loss": -0.037841796875, + "epoch": 0.8888, + "grad_norm": 0.026077150214018317, + "k1_kl": 0.05712890625, + "k3_kl": 0.03857421875, + "kimi_kl": 0.140625, + "learning_rate": 5.5599999999999995e-08, + "loss": 0.0021, + "ppl": 0.0084228515625, + "reward": 0.9929577112197876, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9929578304290771, + "step": 4444, + "temperature": 0.9 + }, + { + "advantages": -8.549009180569556e-06, + "completion_length": 1321.0, + "delta_ref_entropy_loss": 0.01336669921875, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.1796875, + "epoch": 0.889, + "grad_norm": 1.401633396820645, + "k1_kl": 0.06591796875, + "k3_kl": 0.06103515625, + "kimi_kl": 0.1552734375, + "learning_rate": 5.55e-08, + "loss": 0.0024, + "ppl": 0.0947265625, + "reward": 0.9799047708511353, + "reward_std": 0.0018918163841590285, + "rewards/perpo_ocr_edit_distance_reward": 0.9799048900604248, + "step": 4445, + "temperature": 0.9 + }, + { + "advantages": -1.7591886717127636e-05, + "completion_length": 592.0, + "delta_ref_entropy_loss": 0.0238037109375, + "delta_ref_ppl": -0.050048828125, + "entropy_loss": -0.0634765625, + "epoch": 0.8892, + "grad_norm": 0.7692435817990494, + "k1_kl": 0.050048828125, + "k3_kl": 0.0322265625, + "kimi_kl": 0.09521484375, + "learning_rate": 5.539999999999999e-08, + "loss": 0.0013, + "ppl": 0.0201416015625, + "reward": 0.9766009449958801, + "reward_std": 0.00038399911136366427, + "rewards/perpo_ocr_edit_distance_reward": 0.9766010046005249, + "step": 4446, + "temperature": 0.9 + }, + { + "advantages": -2.976826363010332e-05, + "completion_length": 648.0, + "delta_ref_entropy_loss": 0.0322265625, + "delta_ref_ppl": -0.07861328125, + "entropy_loss": -0.0947265625, + "epoch": 0.8894, + "grad_norm": 0.6691895118593489, + "k1_kl": 0.07861328125, + "k3_kl": 0.044677734375, + "kimi_kl": 0.126953125, + "learning_rate": 5.53e-08, + "loss": 0.0018, + "ppl": 0.026611328125, + "reward": 0.7742879986763, + "reward_std": 0.0004726764454971999, + "rewards/perpo_ocr_edit_distance_reward": 0.7742879986763, + "step": 4447, + "temperature": 0.9 + }, + { + "advantages": -8.940271072788164e-05, + "completion_length": 657.0, + "delta_ref_entropy_loss": 0.01495361328125, + "delta_ref_ppl": -0.043212890625, + "entropy_loss": -0.04833984375, + "epoch": 0.8896, + "grad_norm": 0.3649177199835437, + "k1_kl": 0.04296875, + "k3_kl": 0.0262451171875, + "kimi_kl": 0.0810546875, + "learning_rate": 5.52e-08, + "loss": 0.0011, + "ppl": 0.0205078125, + "reward": 0.993798553943634, + "reward_std": 0.00028099631890654564, + "rewards/perpo_ocr_edit_distance_reward": 0.993798553943634, + "step": 4448, + "temperature": 0.9 + }, + { + "advantages": -8.855547548591858e-07, + "completion_length": 42.0, + "delta_ref_entropy_loss": -0.177734375, + "delta_ref_ppl": -0.90625, + "entropy_loss": -0.53125, + "epoch": 0.8898, + "grad_norm": 5.34689384638945, + "k1_kl": 0.90625, + "k3_kl": 0.78515625, + "kimi_kl": 3.609375, + "learning_rate": 5.5100000000000004e-08, + "loss": 0.0314, + "ppl": 0.162109375, + "reward": 0.7651098966598511, + "reward_std": 0.00938364677131176, + "rewards/perpo_ocr_edit_distance_reward": 0.7651098966598511, + "step": 4449, + "temperature": 0.9 + }, + { + "advantages": -2.920627775893081e-05, + "completion_length": 637.0, + "delta_ref_entropy_loss": 0.03857421875, + "delta_ref_ppl": -0.068359375, + "entropy_loss": -0.1220703125, + "epoch": 0.89, + "grad_norm": 2.0686416092845072, + "k1_kl": 0.068359375, + "k3_kl": 0.05224609375, + "kimi_kl": 0.1005859375, + "learning_rate": 5.4999999999999996e-08, + "loss": 0.0021, + "ppl": 0.060546875, + "reward": 0.9881590008735657, + "reward_std": 0.0031041994225233793, + "rewards/perpo_ocr_edit_distance_reward": 0.9881591796875, + "step": 4450, + "temperature": 0.9 + }, + { + "advantages": -9.076936294150073e-06, + "completion_length": 575.0, + "delta_ref_entropy_loss": 0.01458740234375, + "delta_ref_ppl": -0.05517578125, + "entropy_loss": -0.09130859375, + "epoch": 0.8902, + "grad_norm": 1.4243272410291425, + "k1_kl": 0.055419921875, + "k3_kl": 0.03271484375, + "kimi_kl": 0.07763671875, + "learning_rate": 5.4899999999999995e-08, + "loss": 0.0013, + "ppl": 0.029541015625, + "reward": 0.9797629117965698, + "reward_std": 0.0017734139692038298, + "rewards/perpo_ocr_edit_distance_reward": 0.9797629117965698, + "step": 4451, + "temperature": 0.9 + }, + { + "advantages": -2.4080278308247216e-05, + "completion_length": 511.0, + "delta_ref_entropy_loss": 0.0311279296875, + "delta_ref_ppl": -0.09423828125, + "entropy_loss": -0.0576171875, + "epoch": 0.8904, + "grad_norm": 0.4321332270914236, + "k1_kl": 0.09423828125, + "k3_kl": 0.06298828125, + "kimi_kl": 0.171875, + "learning_rate": 5.48e-08, + "loss": 0.0025, + "ppl": 0.019775390625, + "reward": 0.9497025012969971, + "reward_std": 0.0009609273402020335, + "rewards/perpo_ocr_edit_distance_reward": 0.9497025609016418, + "step": 4452, + "temperature": 0.9 + }, + { + "advantages": -7.612365152454004e-06, + "completion_length": 789.0, + "delta_ref_entropy_loss": -0.046142578125, + "delta_ref_ppl": -0.039794921875, + "entropy_loss": -0.126953125, + "epoch": 0.8906, + "grad_norm": 1.3960537073109596, + "k1_kl": 0.039794921875, + "k3_kl": 0.045166015625, + "kimi_kl": 0.10888671875, + "learning_rate": 5.4699999999999994e-08, + "loss": 0.0018, + "ppl": 0.044921875, + "reward": 0.9639490842819214, + "reward_std": 0.008851690217852592, + "rewards/perpo_ocr_edit_distance_reward": 0.9639492034912109, + "step": 4453, + "temperature": 0.9 + }, + { + "advantages": -3.1862942705629393e-05, + "completion_length": 498.0, + "delta_ref_entropy_loss": 0.058837890625, + "delta_ref_ppl": -0.11376953125, + "entropy_loss": -0.08056640625, + "epoch": 0.8908, + "grad_norm": 0.6197001457228272, + "k1_kl": 0.11376953125, + "k3_kl": 0.076171875, + "kimi_kl": 0.240234375, + "learning_rate": 5.46e-08, + "loss": 0.0031, + "ppl": 0.03173828125, + "reward": 0.9929752349853516, + "reward_std": 0.0012363820569589734, + "rewards/perpo_ocr_edit_distance_reward": 0.9929753541946411, + "step": 4454, + "temperature": 0.9 + }, + { + "advantages": 2.55448497910038e-08, + "completion_length": 389.0, + "delta_ref_entropy_loss": 0.0269775390625, + "delta_ref_ppl": -0.042724609375, + "entropy_loss": -0.05517578125, + "epoch": 0.891, + "grad_norm": 0.36527581490359445, + "k1_kl": 0.042724609375, + "k3_kl": 0.0274658203125, + "kimi_kl": 0.0703125, + "learning_rate": 5.45e-08, + "loss": 0.0011, + "ppl": 0.0162353515625, + "reward": 0.9983408451080322, + "reward_std": 0.0005814008763991296, + "rewards/perpo_ocr_edit_distance_reward": 0.998340904712677, + "step": 4455, + "temperature": 0.9 + }, + { + "advantages": -6.897109301462478e-07, + "completion_length": 407.0, + "delta_ref_entropy_loss": -0.66015625, + "delta_ref_ppl": -0.10107421875, + "entropy_loss": -1.2578125, + "epoch": 0.8912, + "grad_norm": 3.392639487148246, + "k1_kl": 0.09912109375, + "k3_kl": 0.208984375, + "kimi_kl": 0.451171875, + "learning_rate": 5.44e-08, + "loss": 0.0084, + "ppl": 0.5546875, + "reward": 0.7233291864395142, + "reward_std": 0.09612296521663666, + "rewards/perpo_ocr_edit_distance_reward": 0.7233292460441589, + "step": 4456, + "temperature": 0.9 + }, + { + "advantages": -3.695488203447894e-06, + "completion_length": 436.0, + "delta_ref_entropy_loss": -0.0108642578125, + "delta_ref_ppl": -0.0966796875, + "entropy_loss": -0.251953125, + "epoch": 0.8914, + "grad_norm": 2.5237525563164174, + "k1_kl": 0.09619140625, + "k3_kl": 0.0791015625, + "kimi_kl": 0.2001953125, + "learning_rate": 5.4299999999999997e-08, + "loss": 0.0032, + "ppl": 0.10302734375, + "reward": 0.9760776162147522, + "reward_std": 0.009104886092245579, + "rewards/perpo_ocr_edit_distance_reward": 0.976077675819397, + "step": 4457, + "temperature": 0.9 + }, + { + "advantages": -3.225462933187373e-05, + "completion_length": 718.0, + "delta_ref_entropy_loss": 0.034423828125, + "delta_ref_ppl": -0.04541015625, + "entropy_loss": -0.0458984375, + "epoch": 0.8916, + "grad_norm": 0.47332672902111245, + "k1_kl": 0.04541015625, + "k3_kl": 0.0262451171875, + "kimi_kl": 0.0771484375, + "learning_rate": 5.4199999999999996e-08, + "loss": 0.0011, + "ppl": 0.01336669921875, + "reward": 0.9965279698371887, + "reward_std": 0.0004279719141777605, + "rewards/perpo_ocr_edit_distance_reward": 0.9965279698371887, + "step": 4458, + "temperature": 0.9 + }, + { + "advantages": -2.488068275852129e-05, + "completion_length": 244.0, + "delta_ref_entropy_loss": 0.057861328125, + "delta_ref_ppl": -0.1630859375, + "entropy_loss": -0.10791015625, + "epoch": 0.8918, + "grad_norm": 0.9754966044491139, + "k1_kl": 0.1630859375, + "k3_kl": 0.12353515625, + "kimi_kl": 0.484375, + "learning_rate": 5.41e-08, + "loss": 0.005, + "ppl": 0.04638671875, + "reward": 0.9934128522872925, + "reward_std": 0.0019552395679056644, + "rewards/perpo_ocr_edit_distance_reward": 0.993412971496582, + "step": 4459, + "temperature": 0.9 + }, + { + "advantages": -0.00028652805485762656, + "completion_length": 806.0, + "delta_ref_entropy_loss": 0.0296630859375, + "delta_ref_ppl": -0.045654296875, + "entropy_loss": -0.044677734375, + "epoch": 0.892, + "grad_norm": 0.22218806167214258, + "k1_kl": 0.045654296875, + "k3_kl": 0.023681640625, + "kimi_kl": 0.06298828125, + "learning_rate": 5.3999999999999994e-08, + "loss": 0.0012, + "ppl": 0.013427734375, + "reward": 0.9992371201515198, + "reward_std": 0.00016746777691878378, + "rewards/perpo_ocr_edit_distance_reward": 0.9992372393608093, + "step": 4460, + "temperature": 0.9 + }, + { + "advantages": -5.177089406060986e-06, + "completion_length": 409.0, + "delta_ref_entropy_loss": 0.0101318359375, + "delta_ref_ppl": -0.083984375, + "entropy_loss": -0.08349609375, + "epoch": 0.8922, + "grad_norm": 0.8174599047462162, + "k1_kl": 0.083984375, + "k3_kl": 0.059326171875, + "kimi_kl": 0.1884765625, + "learning_rate": 5.39e-08, + "loss": 0.0024, + "ppl": 0.0299072265625, + "reward": 0.9737238883972168, + "reward_std": 0.011416662484407425, + "rewards/perpo_ocr_edit_distance_reward": 0.9737240076065063, + "step": 4461, + "temperature": 0.9 + }, + { + "advantages": -3.1803338060854e-05, + "completion_length": 276.0, + "delta_ref_entropy_loss": 0.0079345703125, + "delta_ref_ppl": -0.1748046875, + "entropy_loss": -0.19140625, + "epoch": 0.8924, + "grad_norm": 2.1036953499908826, + "k1_kl": 0.1748046875, + "k3_kl": 0.11865234375, + "kimi_kl": 0.4375, + "learning_rate": 5.38e-08, + "loss": 0.0048, + "ppl": 0.0712890625, + "reward": 0.7080785632133484, + "reward_std": 0.0017732215346768498, + "rewards/perpo_ocr_edit_distance_reward": 0.7080786228179932, + "step": 4462, + "temperature": 0.9 + }, + { + "advantages": -5.2911898819729686e-05, + "completion_length": 1310.0, + "delta_ref_entropy_loss": 0.04931640625, + "delta_ref_ppl": -0.08251953125, + "entropy_loss": -0.126953125, + "epoch": 0.8926, + "grad_norm": 1.2196691817379228, + "k1_kl": 0.08251953125, + "k3_kl": 0.051025390625, + "kimi_kl": 0.1337890625, + "learning_rate": 5.37e-08, + "loss": 0.0021, + "ppl": 0.061279296875, + "reward": 0.86763995885849, + "reward_std": 0.0015100085875019431, + "rewards/perpo_ocr_edit_distance_reward": 0.8676400184631348, + "step": 4463, + "temperature": 0.9 + }, + { + "advantages": -8.131776849040762e-05, + "completion_length": 358.0, + "delta_ref_entropy_loss": 0.026611328125, + "delta_ref_ppl": -0.0908203125, + "entropy_loss": -0.07421875, + "epoch": 0.8928, + "grad_norm": 0.8548735957582215, + "k1_kl": 0.0908203125, + "k3_kl": 0.05615234375, + "kimi_kl": 0.14453125, + "learning_rate": 5.36e-08, + "loss": 0.0023, + "ppl": 0.026123046875, + "reward": 0.986953616142273, + "reward_std": 0.0013662473065778613, + "rewards/perpo_ocr_edit_distance_reward": 0.9869537949562073, + "step": 4464, + "temperature": 0.9 + }, + { + "advantages": -3.62864593626e-05, + "completion_length": 665.0, + "delta_ref_entropy_loss": 0.0068359375, + "delta_ref_ppl": -0.04443359375, + "entropy_loss": -0.049560546875, + "epoch": 0.893, + "grad_norm": 0.4194460424083239, + "k1_kl": 0.04443359375, + "k3_kl": 0.0322265625, + "kimi_kl": 0.09814453125, + "learning_rate": 5.3499999999999996e-08, + "loss": 0.0013, + "ppl": 0.01611328125, + "reward": 0.8897103667259216, + "reward_std": 0.002012323122471571, + "rewards/perpo_ocr_edit_distance_reward": 0.8897104859352112, + "step": 4465, + "temperature": 0.9 + }, + { + "advantages": 1.4513731912302319e-05, + "completion_length": 827.0, + "delta_ref_entropy_loss": 0.00732421875, + "delta_ref_ppl": -0.046142578125, + "entropy_loss": -0.08544921875, + "epoch": 0.8932, + "grad_norm": 0.4368105680402265, + "k1_kl": 0.046142578125, + "k3_kl": 0.03759765625, + "kimi_kl": 0.08203125, + "learning_rate": 5.34e-08, + "loss": 0.0015, + "ppl": 0.033935546875, + "reward": 0.9963329434394836, + "reward_std": 0.0004862876667175442, + "rewards/perpo_ocr_edit_distance_reward": 0.9963330030441284, + "step": 4466, + "temperature": 0.9 + }, + { + "advantages": -6.400687561836094e-05, + "completion_length": 775.0, + "delta_ref_entropy_loss": 0.004119873046875, + "delta_ref_ppl": -0.048583984375, + "entropy_loss": -0.08837890625, + "epoch": 0.8934, + "grad_norm": 0.5155827160211266, + "k1_kl": 0.04833984375, + "k3_kl": 0.03173828125, + "kimi_kl": 0.07763671875, + "learning_rate": 5.33e-08, + "loss": 0.0013, + "ppl": 0.028564453125, + "reward": 0.995128870010376, + "reward_std": 0.0012302970280870795, + "rewards/perpo_ocr_edit_distance_reward": 0.9951289296150208, + "step": 4467, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 469.0, + "delta_ref_entropy_loss": -0.205078125, + "delta_ref_ppl": -0.07080078125, + "entropy_loss": -0.40234375, + "epoch": 0.8936, + "grad_norm": 4.3053361552542855, + "k1_kl": 0.07080078125, + "k3_kl": 0.08349609375, + "kimi_kl": 0.1748046875, + "learning_rate": 5.319999999999999e-08, + "loss": 0.0033, + "ppl": 0.1455078125, + "reward": 0.8932573795318604, + "reward_std": 0.1407555788755417, + "rewards/perpo_ocr_edit_distance_reward": 0.8932573795318604, + "step": 4468, + "temperature": 0.9 + }, + { + "advantages": -1.1580331147342804e-06, + "completion_length": 618.0, + "delta_ref_entropy_loss": -0.392578125, + "delta_ref_ppl": -0.056884765625, + "entropy_loss": -1.0234375, + "epoch": 0.8938, + "grad_norm": 6.5157970589910335, + "k1_kl": 0.057373046875, + "k3_kl": 0.107421875, + "kimi_kl": 0.20703125, + "learning_rate": 5.31e-08, + "loss": 0.0043, + "ppl": 0.462890625, + "reward": 0.5340381264686584, + "reward_std": 0.021337518468499184, + "rewards/perpo_ocr_edit_distance_reward": 0.5340381860733032, + "step": 4469, + "temperature": 0.9 + }, + { + "advantages": -1.4654227925348096e-05, + "completion_length": 193.0, + "delta_ref_entropy_loss": -0.01177978515625, + "delta_ref_ppl": -0.1484375, + "entropy_loss": -0.11328125, + "epoch": 0.894, + "grad_norm": 1.9179179291549915, + "k1_kl": 0.1484375, + "k3_kl": 0.12109375, + "kimi_kl": 0.54296875, + "learning_rate": 5.3e-08, + "loss": 0.0049, + "ppl": 0.03662109375, + "reward": 0.9679156541824341, + "reward_std": 0.005135248880833387, + "rewards/perpo_ocr_edit_distance_reward": 0.9679157137870789, + "step": 4470, + "temperature": 0.9 + }, + { + "advantages": -3.9177284634206444e-05, + "completion_length": 589.0, + "delta_ref_entropy_loss": 0.0206298828125, + "delta_ref_ppl": -0.053466796875, + "entropy_loss": -0.047119140625, + "epoch": 0.8942, + "grad_norm": 0.3871146792277919, + "k1_kl": 0.053466796875, + "k3_kl": 0.03515625, + "kimi_kl": 0.11376953125, + "learning_rate": 5.29e-08, + "loss": 0.0014, + "ppl": 0.01531982421875, + "reward": 0.996676504611969, + "reward_std": 0.0005519294645637274, + "rewards/perpo_ocr_edit_distance_reward": 0.996676504611969, + "step": 4471, + "temperature": 0.9 + }, + { + "advantages": -5.449567652249243e-06, + "completion_length": 430.0, + "delta_ref_entropy_loss": 0.039794921875, + "delta_ref_ppl": -0.10205078125, + "entropy_loss": -0.0732421875, + "epoch": 0.8944, + "grad_norm": 1.2275137130518161, + "k1_kl": 0.1015625, + "k3_kl": 0.07275390625, + "kimi_kl": 0.236328125, + "learning_rate": 5.2799999999999996e-08, + "loss": 0.0029, + "ppl": 0.025146484375, + "reward": 0.9963687062263489, + "reward_std": 0.0014550299383699894, + "rewards/perpo_ocr_edit_distance_reward": 0.9963687062263489, + "step": 4472, + "temperature": 0.9 + }, + { + "advantages": -1.1920928955078125e-07, + "completion_length": 1146.0, + "delta_ref_entropy_loss": 0.00433349609375, + "delta_ref_ppl": -0.0859375, + "entropy_loss": -0.333984375, + "epoch": 0.8946, + "grad_norm": 3.2787896309654716, + "k1_kl": 0.0859375, + "k3_kl": 0.07958984375, + "kimi_kl": 0.138671875, + "learning_rate": 5.2699999999999995e-08, + "loss": 0.0032, + "ppl": 0.1708984375, + "reward": 0.8694948554039001, + "reward_std": 0.08097923547029495, + "rewards/perpo_ocr_edit_distance_reward": 0.8694947957992554, + "step": 4473, + "temperature": 0.9 + }, + { + "advantages": 3.784894943237305e-05, + "completion_length": 681.0, + "delta_ref_entropy_loss": 0.02001953125, + "delta_ref_ppl": -0.05810546875, + "entropy_loss": -0.037841796875, + "epoch": 0.8948, + "grad_norm": 0.3769215880719223, + "k1_kl": 0.05810546875, + "k3_kl": 0.036865234375, + "kimi_kl": 0.12353515625, + "learning_rate": 5.26e-08, + "loss": 0.0014, + "ppl": 0.009765625, + "reward": 0.9958418011665344, + "reward_std": 0.0005748618277721107, + "rewards/perpo_ocr_edit_distance_reward": 0.9958418607711792, + "step": 4474, + "temperature": 0.9 + }, + { + "advantages": -6.370885239448398e-05, + "completion_length": 412.0, + "delta_ref_entropy_loss": 0.031494140625, + "delta_ref_ppl": -0.10009765625, + "entropy_loss": -0.0791015625, + "epoch": 0.895, + "grad_norm": 1.1418777401619917, + "k1_kl": 0.099609375, + "k3_kl": 0.0654296875, + "kimi_kl": 0.2197265625, + "learning_rate": 5.2499999999999994e-08, + "loss": 0.0027, + "ppl": 0.032470703125, + "reward": 0.972987711429596, + "reward_std": 0.0015040928265079856, + "rewards/perpo_ocr_edit_distance_reward": 0.9729877710342407, + "step": 4475, + "temperature": 0.9 + }, + { + "advantages": -4.934413300361484e-05, + "completion_length": 789.0, + "delta_ref_entropy_loss": 0.0211181640625, + "delta_ref_ppl": -0.03515625, + "entropy_loss": -0.0634765625, + "epoch": 0.8952, + "grad_norm": 0.6747479493124721, + "k1_kl": 0.03515625, + "k3_kl": 0.01953125, + "kimi_kl": 0.04150390625, + "learning_rate": 5.24e-08, + "loss": 0.0008, + "ppl": 0.026611328125, + "reward": 0.9943267703056335, + "reward_std": 0.00110762775875628, + "rewards/perpo_ocr_edit_distance_reward": 0.9943268299102783, + "step": 4476, + "temperature": 0.9 + }, + { + "advantages": -1.5386514860438183e-05, + "completion_length": 680.0, + "delta_ref_entropy_loss": 0.00390625, + "delta_ref_ppl": -0.09326171875, + "entropy_loss": -0.1220703125, + "epoch": 0.8954, + "grad_norm": 1.0319545468576656, + "k1_kl": 0.0927734375, + "k3_kl": 0.068359375, + "kimi_kl": 0.19140625, + "learning_rate": 5.23e-08, + "loss": 0.0027, + "ppl": 0.0400390625, + "reward": 0.9800553321838379, + "reward_std": 0.003779004095122218, + "rewards/perpo_ocr_edit_distance_reward": 0.9800553917884827, + "step": 4477, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 721.0, + "delta_ref_entropy_loss": -0.0118408203125, + "delta_ref_ppl": -0.0439453125, + "entropy_loss": -0.05712890625, + "epoch": 0.8956, + "grad_norm": 0.44237631941398914, + "k1_kl": 0.0439453125, + "k3_kl": 0.03662109375, + "kimi_kl": 0.11474609375, + "learning_rate": 5.2200000000000004e-08, + "loss": 0.0015, + "ppl": 0.0205078125, + "reward": 0.9910075664520264, + "reward_std": 0.001360766589641571, + "rewards/perpo_ocr_edit_distance_reward": 0.9910075664520264, + "step": 4478, + "temperature": 0.9 + }, + { + "advantages": -2.7469228371046484e-05, + "completion_length": 744.0, + "delta_ref_entropy_loss": 0.025634765625, + "delta_ref_ppl": -0.05517578125, + "entropy_loss": -0.07763671875, + "epoch": 0.8958, + "grad_norm": 0.5708877703978538, + "k1_kl": 0.05517578125, + "k3_kl": 0.031982421875, + "kimi_kl": 0.0947265625, + "learning_rate": 5.2099999999999997e-08, + "loss": 0.0013, + "ppl": 0.0240478515625, + "reward": 0.9951017498970032, + "reward_std": 0.0008294155122712255, + "rewards/perpo_ocr_edit_distance_reward": 0.995101809501648, + "step": 4479, + "temperature": 0.9 + }, + { + "advantages": -1.5786716176080517e-05, + "completion_length": 932.0, + "delta_ref_entropy_loss": 0.023193359375, + "delta_ref_ppl": -0.061279296875, + "entropy_loss": -0.115234375, + "epoch": 0.896, + "grad_norm": 0.8344827825360234, + "k1_kl": 0.061279296875, + "k3_kl": 0.03759765625, + "kimi_kl": 0.1015625, + "learning_rate": 5.1999999999999996e-08, + "loss": 0.0015, + "ppl": 0.051025390625, + "reward": 0.9810153245925903, + "reward_std": 0.0015175571897998452, + "rewards/perpo_ocr_edit_distance_reward": 0.9810153841972351, + "step": 4480, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 299.0, + "delta_ref_entropy_loss": 0.033447265625, + "delta_ref_ppl": -0.08349609375, + "entropy_loss": -0.072265625, + "epoch": 0.8962, + "grad_norm": 1.1125273115095273, + "k1_kl": 0.083984375, + "k3_kl": 0.052734375, + "kimi_kl": 0.16015625, + "learning_rate": 5.19e-08, + "loss": 0.0021, + "ppl": 0.028564453125, + "reward": 0.9955661296844482, + "reward_std": 0.0012688480783253908, + "rewards/perpo_ocr_edit_distance_reward": 0.9955661296844482, + "step": 4481, + "temperature": 0.9 + }, + { + "advantages": -1.7506736185168847e-05, + "completion_length": 445.0, + "delta_ref_entropy_loss": -0.03369140625, + "delta_ref_ppl": -0.12158203125, + "entropy_loss": -0.333984375, + "epoch": 0.8964, + "grad_norm": 1.6117665017813434, + "k1_kl": 0.1220703125, + "k3_kl": 0.1005859375, + "kimi_kl": 0.29296875, + "learning_rate": 5.1799999999999994e-08, + "loss": 0.004, + "ppl": 0.12255859375, + "reward": 0.7370076775550842, + "reward_std": 0.005262049846351147, + "rewards/perpo_ocr_edit_distance_reward": 0.7370077967643738, + "step": 4482, + "temperature": 0.9 + }, + { + "advantages": -1.84476375579834e-05, + "completion_length": 305.0, + "delta_ref_entropy_loss": 0.03564453125, + "delta_ref_ppl": -0.1064453125, + "entropy_loss": -0.0634765625, + "epoch": 0.8966, + "grad_norm": 0.5468407742690193, + "k1_kl": 0.1064453125, + "k3_kl": 0.080078125, + "kimi_kl": 0.263671875, + "learning_rate": 5.17e-08, + "loss": 0.0032, + "ppl": 0.0166015625, + "reward": 0.9986893534660339, + "reward_std": 0.0017453085165470839, + "rewards/perpo_ocr_edit_distance_reward": 0.9986894130706787, + "step": 4483, + "temperature": 0.9 + }, + { + "advantages": -3.9901053241919726e-05, + "completion_length": 261.0, + "delta_ref_entropy_loss": -0.0018157958984375, + "delta_ref_ppl": -0.11474609375, + "entropy_loss": -0.09326171875, + "epoch": 0.8968, + "grad_norm": 0.7956645402376478, + "k1_kl": 0.11572265625, + "k3_kl": 0.11962890625, + "kimi_kl": 0.326171875, + "learning_rate": 5.16e-08, + "loss": 0.0048, + "ppl": 0.036376953125, + "reward": 0.9305669069290161, + "reward_std": 0.0013937484472990036, + "rewards/perpo_ocr_edit_distance_reward": 0.9305669069290161, + "step": 4484, + "temperature": 0.9 + }, + { + "advantages": -2.111707544827368e-05, + "completion_length": 760.0, + "delta_ref_entropy_loss": 0.00634765625, + "delta_ref_ppl": -0.056396484375, + "entropy_loss": -0.06787109375, + "epoch": 0.897, + "grad_norm": 0.5693849206880532, + "k1_kl": 0.056396484375, + "k3_kl": 0.038818359375, + "kimi_kl": 0.169921875, + "learning_rate": 5.15e-08, + "loss": 0.0016, + "ppl": 0.0230712890625, + "reward": 0.9977506399154663, + "reward_std": 0.0007065991521812975, + "rewards/perpo_ocr_edit_distance_reward": 0.9977506399154663, + "step": 4485, + "temperature": 0.9 + }, + { + "advantages": -2.748625774984248e-05, + "completion_length": 656.0, + "delta_ref_entropy_loss": 0.0235595703125, + "delta_ref_ppl": -0.053955078125, + "entropy_loss": -0.047119140625, + "epoch": 0.8972, + "grad_norm": 0.3204560220177749, + "k1_kl": 0.053955078125, + "k3_kl": 0.042236328125, + "kimi_kl": 0.095703125, + "learning_rate": 5.14e-08, + "loss": 0.0017, + "ppl": 0.017578125, + "reward": 0.994705319404602, + "reward_std": 0.0005188389332033694, + "rewards/perpo_ocr_edit_distance_reward": 0.9947053790092468, + "step": 4486, + "temperature": 0.9 + }, + { + "advantages": -7.390976406895788e-06, + "completion_length": 151.0, + "delta_ref_entropy_loss": 0.026611328125, + "delta_ref_ppl": -0.275390625, + "entropy_loss": -0.12451171875, + "epoch": 0.8974, + "grad_norm": 2.1979661283591394, + "k1_kl": 0.2734375, + "k3_kl": 0.23046875, + "kimi_kl": 1.1328125, + "learning_rate": 5.1299999999999996e-08, + "loss": 0.0092, + "ppl": 0.034912109375, + "reward": 0.9875949025154114, + "reward_std": 0.0021968029905110598, + "rewards/perpo_ocr_edit_distance_reward": 0.9875949621200562, + "step": 4487, + "temperature": 0.9 + }, + { + "advantages": -1.7642974853515625e-05, + "completion_length": 365.0, + "delta_ref_entropy_loss": 0.01019287109375, + "delta_ref_ppl": -0.059326171875, + "entropy_loss": -0.051025390625, + "epoch": 0.8976, + "grad_norm": 1.9645064881475687, + "k1_kl": 0.059326171875, + "k3_kl": 0.04541015625, + "kimi_kl": 0.1484375, + "learning_rate": 5.12e-08, + "loss": 0.0018, + "ppl": 0.020751953125, + "reward": 0.9697718024253845, + "reward_std": 0.0018305920530110598, + "rewards/perpo_ocr_edit_distance_reward": 0.9697718620300293, + "step": 4488, + "temperature": 0.9 + }, + { + "advantages": -2.665179272298701e-05, + "completion_length": 495.0, + "delta_ref_entropy_loss": 0.035400390625, + "delta_ref_ppl": -0.0810546875, + "entropy_loss": -0.0634765625, + "epoch": 0.8978, + "grad_norm": 0.6314123225929773, + "k1_kl": 0.0810546875, + "k3_kl": 0.0517578125, + "kimi_kl": 0.1826171875, + "learning_rate": 5.1099999999999994e-08, + "loss": 0.0021, + "ppl": 0.0177001953125, + "reward": 0.994551956653595, + "reward_std": 0.0005395286134444177, + "rewards/perpo_ocr_edit_distance_reward": 0.9945520162582397, + "step": 4489, + "temperature": 0.9 + }, + { + "advantages": -2.627713547553867e-05, + "completion_length": 1446.0, + "delta_ref_entropy_loss": 0.03759765625, + "delta_ref_ppl": -0.08740234375, + "entropy_loss": -0.1357421875, + "epoch": 0.898, + "grad_norm": 1.7382287940905057, + "k1_kl": 0.0869140625, + "k3_kl": 0.06787109375, + "kimi_kl": 0.173828125, + "learning_rate": 5.0999999999999993e-08, + "loss": 0.0027, + "ppl": 0.0703125, + "reward": 0.9409727454185486, + "reward_std": 0.003141697496175766, + "rewards/perpo_ocr_edit_distance_reward": 0.9409728646278381, + "step": 4490, + "temperature": 0.9 + }, + { + "advantages": -6.365776062011719e-05, + "completion_length": 859.0, + "delta_ref_entropy_loss": 0.04736328125, + "delta_ref_ppl": -0.0859375, + "entropy_loss": -0.09814453125, + "epoch": 0.8982, + "grad_norm": 0.6295122146758587, + "k1_kl": 0.0859375, + "k3_kl": 0.05029296875, + "kimi_kl": 0.1240234375, + "learning_rate": 5.09e-08, + "loss": 0.0021, + "ppl": 0.034423828125, + "reward": 0.9265180826187134, + "reward_std": 0.0008365968824364245, + "rewards/perpo_ocr_edit_distance_reward": 0.9265182018280029, + "step": 4491, + "temperature": 0.9 + }, + { + "advantages": -1.2942723515152466e-06, + "completion_length": 238.0, + "delta_ref_entropy_loss": -0.224609375, + "delta_ref_ppl": -0.14453125, + "entropy_loss": -0.486328125, + "epoch": 0.8984, + "grad_norm": 5.214473795494708, + "k1_kl": 0.14453125, + "k3_kl": 0.1787109375, + "kimi_kl": 0.5078125, + "learning_rate": 5.08e-08, + "loss": 0.0072, + "ppl": 0.19140625, + "reward": 0.734600305557251, + "reward_std": 0.03925085812807083, + "rewards/perpo_ocr_edit_distance_reward": 0.734600305557251, + "step": 4492, + "temperature": 0.9 + }, + { + "advantages": -8.689506103110034e-06, + "completion_length": 46.0, + "delta_ref_entropy_loss": 0.006561279296875, + "delta_ref_ppl": -0.75, + "entropy_loss": -0.162109375, + "epoch": 0.8986, + "grad_norm": 3.959775864306321, + "k1_kl": 0.75, + "k3_kl": 0.7265625, + "kimi_kl": 2.953125, + "learning_rate": 5.07e-08, + "loss": 0.0292, + "ppl": 0.0869140625, + "reward": 0.9648809432983398, + "reward_std": 0.006743133999407291, + "rewards/perpo_ocr_edit_distance_reward": 0.9648810029029846, + "step": 4493, + "temperature": 0.9 + }, + { + "advantages": -3.0504807000397705e-06, + "completion_length": 414.0, + "delta_ref_entropy_loss": 0.004180908203125, + "delta_ref_ppl": -0.1865234375, + "entropy_loss": -0.52734375, + "epoch": 0.8988, + "grad_norm": 2.4995743185312023, + "k1_kl": 0.185546875, + "k3_kl": 0.134765625, + "kimi_kl": 0.302734375, + "learning_rate": 5.0599999999999996e-08, + "loss": 0.0054, + "ppl": 0.2578125, + "reward": 0.5561262369155884, + "reward_std": 0.011033200658857822, + "rewards/perpo_ocr_edit_distance_reward": 0.5561262965202332, + "step": 4494, + "temperature": 0.9 + }, + { + "advantages": -1.5326910215662792e-05, + "completion_length": 198.0, + "delta_ref_entropy_loss": -0.0126953125, + "delta_ref_ppl": -0.212890625, + "entropy_loss": -0.1513671875, + "epoch": 0.899, + "grad_norm": 1.3310541969886354, + "k1_kl": 0.2138671875, + "k3_kl": 0.1611328125, + "kimi_kl": 0.6328125, + "learning_rate": 5.05e-08, + "loss": 0.0065, + "ppl": 0.05029296875, + "reward": 0.9928892254829407, + "reward_std": 0.0032374055590480566, + "rewards/perpo_ocr_edit_distance_reward": 0.9928892850875854, + "step": 4495, + "temperature": 0.9 + }, + { + "advantages": -7.094656029948965e-05, + "completion_length": 448.0, + "delta_ref_entropy_loss": 0.034912109375, + "delta_ref_ppl": -0.06884765625, + "entropy_loss": -0.06201171875, + "epoch": 0.8992, + "grad_norm": 0.4368535318427955, + "k1_kl": 0.06884765625, + "k3_kl": 0.043212890625, + "kimi_kl": 0.1357421875, + "learning_rate": 5.04e-08, + "loss": 0.0018, + "ppl": 0.017822265625, + "reward": 0.9870611429214478, + "reward_std": 0.0007400868344120681, + "rewards/perpo_ocr_edit_distance_reward": 0.9870612025260925, + "step": 4496, + "temperature": 0.9 + }, + { + "advantages": -0.00013772930833511055, + "completion_length": 1338.0, + "delta_ref_entropy_loss": 0.03173828125, + "delta_ref_ppl": -0.046630859375, + "entropy_loss": -0.0751953125, + "epoch": 0.8994, + "grad_norm": 3.9347019577350077, + "k1_kl": 0.046630859375, + "k3_kl": 0.0269775390625, + "kimi_kl": 0.06201171875, + "learning_rate": 5.0299999999999994e-08, + "loss": 0.0012, + "ppl": 0.0302734375, + "reward": 0.9837080240249634, + "reward_std": 0.0004564186092466116, + "rewards/perpo_ocr_edit_distance_reward": 0.9837081432342529, + "step": 4497, + "temperature": 0.9 + }, + { + "advantages": -3.0313219667732483e-06, + "completion_length": 524.0, + "delta_ref_entropy_loss": -0.043701171875, + "delta_ref_ppl": -0.1455078125, + "entropy_loss": -0.65625, + "epoch": 0.8996, + "grad_norm": 3.1163464181541913, + "k1_kl": 0.1455078125, + "k3_kl": 0.11181640625, + "kimi_kl": 0.298828125, + "learning_rate": 5.02e-08, + "loss": 0.0045, + "ppl": 0.333984375, + "reward": 0.6807172298431396, + "reward_std": 0.01102121639996767, + "rewards/perpo_ocr_edit_distance_reward": 0.6807172894477844, + "step": 4498, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 599.0, + "delta_ref_entropy_loss": -0.08251953125, + "delta_ref_ppl": -0.115234375, + "entropy_loss": -0.65625, + "epoch": 0.8998, + "grad_norm": 2.3140718803057867, + "k1_kl": 0.115234375, + "k3_kl": 0.0947265625, + "kimi_kl": 0.1962890625, + "learning_rate": 5.01e-08, + "loss": 0.0038, + "ppl": 0.310546875, + "reward": 0.55186927318573, + "reward_std": 0.008947907015681267, + "rewards/perpo_ocr_edit_distance_reward": 0.55186927318573, + "step": 4499, + "temperature": 0.9 + }, + { + "advantages": -1.1358943083905615e-05, + "completion_length": 1934.0, + "delta_ref_entropy_loss": 0.0033416748046875, + "delta_ref_ppl": -0.03857421875, + "entropy_loss": -0.10205078125, + "epoch": 0.9, + "grad_norm": 2.049671347804802, + "k1_kl": 0.03857421875, + "k3_kl": 0.031494140625, + "kimi_kl": 0.0771484375, + "learning_rate": 5e-08, + "loss": 0.0013, + "ppl": 0.048095703125, + "reward": 0.9833487272262573, + "reward_std": 0.0021485593169927597, + "rewards/perpo_ocr_edit_distance_reward": 0.9833487272262573, + "step": 4500, + "temperature": 0.9 + }, + { + "advantages": -2.2734915546607226e-05, + "completion_length": 365.0, + "delta_ref_entropy_loss": 0.0269775390625, + "delta_ref_ppl": -0.1796875, + "entropy_loss": -0.09130859375, + "epoch": 0.9002, + "grad_norm": 1.127122609017986, + "k1_kl": 0.1796875, + "k3_kl": 0.1396484375, + "kimi_kl": 0.63671875, + "learning_rate": 4.99e-08, + "loss": 0.0056, + "ppl": 0.034912109375, + "reward": 0.9939165711402893, + "reward_std": 0.001399157103151083, + "rewards/perpo_ocr_edit_distance_reward": 0.9939165711402893, + "step": 4501, + "temperature": 0.9 + }, + { + "advantages": -5.1089696171402466e-06, + "completion_length": 753.0, + "delta_ref_entropy_loss": 0.0556640625, + "delta_ref_ppl": -0.06396484375, + "entropy_loss": -0.0654296875, + "epoch": 0.9004, + "grad_norm": 1.0246338197362037, + "k1_kl": 0.06396484375, + "k3_kl": 0.03369140625, + "kimi_kl": 0.07763671875, + "learning_rate": 4.9799999999999996e-08, + "loss": 0.0014, + "ppl": 0.0230712890625, + "reward": 0.9781692624092102, + "reward_std": 0.008223851211369038, + "rewards/perpo_ocr_edit_distance_reward": 0.978169322013855, + "step": 4502, + "temperature": 0.9 + }, + { + "advantages": -2.35268053074833e-05, + "completion_length": 1041.0, + "delta_ref_entropy_loss": 0.036865234375, + "delta_ref_ppl": -0.064453125, + "entropy_loss": -0.0830078125, + "epoch": 0.9006, + "grad_norm": 0.9323427800915989, + "k1_kl": 0.064453125, + "k3_kl": 0.04296875, + "kimi_kl": 0.09228515625, + "learning_rate": 4.97e-08, + "loss": 0.0017, + "ppl": 0.041259765625, + "reward": 0.9918048977851868, + "reward_std": 0.0009855765383690596, + "rewards/perpo_ocr_edit_distance_reward": 0.9918048977851868, + "step": 4503, + "temperature": 0.9 + }, + { + "advantages": -4.555497980618384e-06, + "completion_length": 312.0, + "delta_ref_entropy_loss": 0.005157470703125, + "delta_ref_ppl": -0.11328125, + "entropy_loss": -0.09912109375, + "epoch": 0.9008, + "grad_norm": 0.8333582948337768, + "k1_kl": 0.11328125, + "k3_kl": 0.08740234375, + "kimi_kl": 0.408203125, + "learning_rate": 4.9599999999999994e-08, + "loss": 0.0035, + "ppl": 0.033203125, + "reward": 0.9930641055107117, + "reward_std": 0.0017758695175871253, + "rewards/perpo_ocr_edit_distance_reward": 0.9930641055107117, + "step": 4504, + "temperature": 0.9 + }, + { + "advantages": -1.8111297322320752e-05, + "completion_length": 1146.0, + "delta_ref_entropy_loss": 0.03173828125, + "delta_ref_ppl": -0.0654296875, + "entropy_loss": -0.053466796875, + "epoch": 0.901, + "grad_norm": 0.6079015955491696, + "k1_kl": 0.06591796875, + "k3_kl": 0.042236328125, + "kimi_kl": 0.12890625, + "learning_rate": 4.95e-08, + "loss": 0.0017, + "ppl": 0.020751953125, + "reward": 0.9960753321647644, + "reward_std": 0.0013110354775562882, + "rewards/perpo_ocr_edit_distance_reward": 0.996075451374054, + "step": 4505, + "temperature": 0.9 + }, + { + "advantages": -7.06485370756127e-05, + "completion_length": 491.0, + "delta_ref_entropy_loss": 0.037109375, + "delta_ref_ppl": -0.0947265625, + "entropy_loss": -0.052490234375, + "epoch": 0.9012, + "grad_norm": 0.3982422592412315, + "k1_kl": 0.0947265625, + "k3_kl": 0.06982421875, + "kimi_kl": 0.28515625, + "learning_rate": 4.94e-08, + "loss": 0.0029, + "ppl": 0.01507568359375, + "reward": 0.9971042275428772, + "reward_std": 0.000623085128609091, + "rewards/perpo_ocr_edit_distance_reward": 0.997104287147522, + "step": 4506, + "temperature": 0.9 + }, + { + "advantages": -7.05889306118479e-06, + "completion_length": 765.0, + "delta_ref_entropy_loss": -0.08837890625, + "delta_ref_ppl": -0.0311279296875, + "entropy_loss": -0.177734375, + "epoch": 0.9014, + "grad_norm": 1.0819012573052467, + "k1_kl": 0.03125, + "k3_kl": 0.031005859375, + "kimi_kl": 0.09521484375, + "learning_rate": 4.929999999999999e-08, + "loss": 0.0012, + "ppl": 0.055419921875, + "reward": 0.9516358375549316, + "reward_std": 0.014401472173631191, + "rewards/perpo_ocr_edit_distance_reward": 0.9516359567642212, + "step": 4507, + "temperature": 0.9 + }, + { + "advantages": -1.5088490727066528e-05, + "completion_length": 556.0, + "delta_ref_entropy_loss": 0.043701171875, + "delta_ref_ppl": -0.0751953125, + "entropy_loss": -0.0947265625, + "epoch": 0.9016, + "grad_norm": 0.7167223653097975, + "k1_kl": 0.0751953125, + "k3_kl": 0.0498046875, + "kimi_kl": 0.140625, + "learning_rate": 4.92e-08, + "loss": 0.002, + "ppl": 0.033203125, + "reward": 0.9775440096855164, + "reward_std": 0.001028829487040639, + "rewards/perpo_ocr_edit_distance_reward": 0.9775440096855164, + "step": 4508, + "temperature": 0.9 + }, + { + "advantages": 7.969993021106347e-06, + "completion_length": 1124.0, + "delta_ref_entropy_loss": 0.01361083984375, + "delta_ref_ppl": -0.0498046875, + "entropy_loss": -0.123046875, + "epoch": 0.9018, + "grad_norm": 1.436244055541197, + "k1_kl": 0.0498046875, + "k3_kl": 0.0380859375, + "kimi_kl": 0.09228515625, + "learning_rate": 4.9099999999999996e-08, + "loss": 0.0015, + "ppl": 0.051513671875, + "reward": 0.9828193783760071, + "reward_std": 0.0020341689232736826, + "rewards/perpo_ocr_edit_distance_reward": 0.9828193783760071, + "step": 4509, + "temperature": 0.9 + }, + { + "advantages": 1.3879367770641693e-06, + "completion_length": 713.0, + "delta_ref_entropy_loss": -0.208984375, + "delta_ref_ppl": -0.056396484375, + "entropy_loss": -0.5, + "epoch": 0.902, + "grad_norm": 1.9428926064722458, + "k1_kl": 0.056396484375, + "k3_kl": 0.1044921875, + "kimi_kl": 0.1787109375, + "learning_rate": 4.9e-08, + "loss": 0.0042, + "ppl": 0.2255859375, + "reward": 0.6566658616065979, + "reward_std": 0.04261403530836105, + "rewards/perpo_ocr_edit_distance_reward": 0.6566658020019531, + "step": 4510, + "temperature": 0.9 + }, + { + "advantages": -6.811959707420101e-08, + "completion_length": 1833.0, + "delta_ref_entropy_loss": -0.2236328125, + "delta_ref_ppl": -0.01519775390625, + "entropy_loss": -0.73046875, + "epoch": 0.9022, + "grad_norm": 9.195640439448312, + "k1_kl": 0.01519775390625, + "k3_kl": 0.08056640625, + "kimi_kl": 0.08642578125, + "learning_rate": 4.8899999999999995e-08, + "loss": 0.0032, + "ppl": 0.369140625, + "reward": 0.7316396236419678, + "reward_std": 0.21911853551864624, + "rewards/perpo_ocr_edit_distance_reward": 0.7316396832466125, + "step": 4511, + "temperature": 0.9 + }, + { + "advantages": -5.960464477539062e-07, + "completion_length": 421.0, + "delta_ref_entropy_loss": -0.162109375, + "delta_ref_ppl": -0.0947265625, + "entropy_loss": -0.4375, + "epoch": 0.9024, + "grad_norm": 3.8763782506903146, + "k1_kl": 0.0947265625, + "k3_kl": 0.09814453125, + "kimi_kl": 0.24609375, + "learning_rate": 4.88e-08, + "loss": 0.0039, + "ppl": 0.1923828125, + "reward": 0.8933548331260681, + "reward_std": 0.09754683077335358, + "rewards/perpo_ocr_edit_distance_reward": 0.8933548927307129, + "step": 4512, + "temperature": 0.9 + }, + { + "advantages": -5.28097189089749e-05, + "completion_length": 398.0, + "delta_ref_entropy_loss": 0.041259765625, + "delta_ref_ppl": -0.10546875, + "entropy_loss": -0.1162109375, + "epoch": 0.9026, + "grad_norm": 0.7961212570968434, + "k1_kl": 0.10546875, + "k3_kl": 0.07763671875, + "kimi_kl": 0.333984375, + "learning_rate": 4.87e-08, + "loss": 0.0032, + "ppl": 0.044677734375, + "reward": 0.9844343066215515, + "reward_std": 0.0008677222649566829, + "rewards/perpo_ocr_edit_distance_reward": 0.9844344258308411, + "step": 4513, + "temperature": 0.9 + }, + { + "advantages": -2.2560359866474755e-05, + "completion_length": 1004.0, + "delta_ref_entropy_loss": 0.0390625, + "delta_ref_ppl": -0.064453125, + "entropy_loss": -0.06640625, + "epoch": 0.9028, + "grad_norm": 0.7929948551968625, + "k1_kl": 0.064453125, + "k3_kl": 0.044921875, + "kimi_kl": 0.12109375, + "learning_rate": 4.86e-08, + "loss": 0.0018, + "ppl": 0.0291748046875, + "reward": 0.9735051989555359, + "reward_std": 0.0025438284501433372, + "rewards/perpo_ocr_edit_distance_reward": 0.9735053181648254, + "step": 4514, + "temperature": 0.9 + }, + { + "advantages": -1.6689301673977752e-06, + "completion_length": 1737.0, + "delta_ref_entropy_loss": 0.0018157958984375, + "delta_ref_ppl": -0.052001953125, + "entropy_loss": -0.1552734375, + "epoch": 0.903, + "grad_norm": 3.081880267816427, + "k1_kl": 0.052001953125, + "k3_kl": 0.03955078125, + "kimi_kl": 0.083984375, + "learning_rate": 4.85e-08, + "loss": 0.0016, + "ppl": 0.0771484375, + "reward": 0.9286593198776245, + "reward_std": 0.005029846448451281, + "rewards/perpo_ocr_edit_distance_reward": 0.9286593198776245, + "step": 4515, + "temperature": 0.9 + }, + { + "advantages": -3.5911800750909606e-06, + "completion_length": 254.0, + "delta_ref_entropy_loss": 0.03125, + "delta_ref_ppl": -0.10009765625, + "entropy_loss": -0.064453125, + "epoch": 0.9032, + "grad_norm": 1.7559863267134659, + "k1_kl": 0.10009765625, + "k3_kl": 0.07373046875, + "kimi_kl": 0.2451171875, + "learning_rate": 4.8399999999999997e-08, + "loss": 0.003, + "ppl": 0.0203857421875, + "reward": 0.9928476810455322, + "reward_std": 0.002263830043375492, + "rewards/perpo_ocr_edit_distance_reward": 0.992847740650177, + "step": 4516, + "temperature": 0.9 + }, + { + "advantages": 6.982258469179214e-07, + "completion_length": 716.0, + "delta_ref_entropy_loss": -0.1240234375, + "delta_ref_ppl": -0.09228515625, + "entropy_loss": -0.87890625, + "epoch": 0.9034, + "grad_norm": 7.5524260297020644, + "k1_kl": 0.09130859375, + "k3_kl": 0.0888671875, + "kimi_kl": 0.1455078125, + "learning_rate": 4.83e-08, + "loss": 0.0035, + "ppl": 0.451171875, + "reward": 0.6946918368339539, + "reward_std": 0.01205655001103878, + "rewards/perpo_ocr_edit_distance_reward": 0.6946918368339539, + "step": 4517, + "temperature": 0.9 + }, + { + "advantages": -3.461326923570596e-05, + "completion_length": 910.0, + "delta_ref_entropy_loss": 0.00701904296875, + "delta_ref_ppl": -0.03125, + "entropy_loss": -0.039306640625, + "epoch": 0.9036, + "grad_norm": 0.4899179480250013, + "k1_kl": 0.03125, + "k3_kl": 0.021728515625, + "kimi_kl": 0.062255859375, + "learning_rate": 4.8199999999999995e-08, + "loss": 0.0009, + "ppl": 0.01177978515625, + "reward": 0.9974998831748962, + "reward_std": 0.0006380134145729244, + "rewards/perpo_ocr_edit_distance_reward": 0.9974998235702515, + "step": 4518, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 747.0, + "delta_ref_entropy_loss": 0.047607421875, + "delta_ref_ppl": -0.0771484375, + "entropy_loss": -0.111328125, + "epoch": 0.9038, + "grad_norm": 0.8170974922164256, + "k1_kl": 0.07763671875, + "k3_kl": 0.0458984375, + "kimi_kl": 0.11669921875, + "learning_rate": 4.8099999999999994e-08, + "loss": 0.0018, + "ppl": 0.046630859375, + "reward": 0.950575053691864, + "reward_std": 0.0009061790769919753, + "rewards/perpo_ocr_edit_distance_reward": 0.950575053691864, + "step": 4519, + "temperature": 0.9 + }, + { + "advantages": 5.178792343940586e-05, + "completion_length": 1219.0, + "delta_ref_entropy_loss": -0.0177001953125, + "delta_ref_ppl": -0.038818359375, + "entropy_loss": -0.0751953125, + "epoch": 0.904, + "grad_norm": 0.9059708876112641, + "k1_kl": 0.038818359375, + "k3_kl": 0.0262451171875, + "kimi_kl": 0.0693359375, + "learning_rate": 4.8e-08, + "loss": 0.001, + "ppl": 0.02001953125, + "reward": 0.9934558272361755, + "reward_std": 0.0008865031995810568, + "rewards/perpo_ocr_edit_distance_reward": 0.9934557676315308, + "step": 4520, + "temperature": 0.9 + }, + { + "advantages": -5.994524599373108e-06, + "completion_length": 392.0, + "delta_ref_entropy_loss": 0.039794921875, + "delta_ref_ppl": -0.087890625, + "entropy_loss": -0.08203125, + "epoch": 0.9042, + "grad_norm": 0.8062747706070136, + "k1_kl": 0.087890625, + "k3_kl": 0.0673828125, + "kimi_kl": 0.2080078125, + "learning_rate": 4.79e-08, + "loss": 0.0027, + "ppl": 0.031982421875, + "reward": 0.985675573348999, + "reward_std": 0.0013223602436482906, + "rewards/perpo_ocr_edit_distance_reward": 0.985675573348999, + "step": 4521, + "temperature": 0.9 + }, + { + "advantages": -8.685248644724197e-07, + "completion_length": 958.0, + "delta_ref_entropy_loss": -0.138671875, + "delta_ref_ppl": -0.06005859375, + "entropy_loss": -0.66015625, + "epoch": 0.9044, + "grad_norm": 2.651272326653858, + "k1_kl": 0.059814453125, + "k3_kl": 0.06298828125, + "kimi_kl": 0.11474609375, + "learning_rate": 4.78e-08, + "loss": 0.0025, + "ppl": 0.318359375, + "reward": 0.3255840241909027, + "reward_std": 0.014113983139395714, + "rewards/perpo_ocr_edit_distance_reward": 0.3255840241909027, + "step": 4522, + "temperature": 0.9 + }, + { + "advantages": -0.00010507447586860508, + "completion_length": 525.0, + "delta_ref_entropy_loss": 0.039794921875, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.059326171875, + "epoch": 0.9046, + "grad_norm": 0.6879722582425508, + "k1_kl": 0.06689453125, + "k3_kl": 0.041748046875, + "kimi_kl": 0.1474609375, + "learning_rate": 4.77e-08, + "loss": 0.0018, + "ppl": 0.0201416015625, + "reward": 0.9939194321632385, + "reward_std": 0.00054822867969051, + "rewards/perpo_ocr_edit_distance_reward": 0.9939194917678833, + "step": 4523, + "temperature": 0.9 + }, + { + "advantages": -0.00012382440036162734, + "completion_length": 916.0, + "delta_ref_entropy_loss": 0.030029296875, + "delta_ref_ppl": -0.057373046875, + "entropy_loss": -0.07275390625, + "epoch": 0.9048, + "grad_norm": 1.4516892380107476, + "k1_kl": 0.0576171875, + "k3_kl": 0.03369140625, + "kimi_kl": 0.087890625, + "learning_rate": 4.76e-08, + "loss": 0.0015, + "ppl": 0.0279541015625, + "reward": 0.9953320622444153, + "reward_std": 0.0005877374205738306, + "rewards/perpo_ocr_edit_distance_reward": 0.9953322410583496, + "step": 4524, + "temperature": 0.9 + }, + { + "advantages": -1.0030610610556323e-05, + "completion_length": 343.0, + "delta_ref_entropy_loss": 0.0361328125, + "delta_ref_ppl": -0.048095703125, + "entropy_loss": -0.0654296875, + "epoch": 0.905, + "grad_norm": 0.579493989000969, + "k1_kl": 0.048095703125, + "k3_kl": 0.023193359375, + "kimi_kl": 0.05322265625, + "learning_rate": 4.7499999999999995e-08, + "loss": 0.0009, + "ppl": 0.0184326171875, + "reward": 0.9950900673866272, + "reward_std": 0.0007494321907870471, + "rewards/perpo_ocr_edit_distance_reward": 0.9950901865959167, + "step": 4525, + "temperature": 0.9 + }, + { + "advantages": -3.048352027690271e-06, + "completion_length": 372.0, + "delta_ref_entropy_loss": -0.042724609375, + "delta_ref_ppl": -0.09130859375, + "entropy_loss": -0.1708984375, + "epoch": 0.9052, + "grad_norm": 1.034776351736508, + "k1_kl": 0.0908203125, + "k3_kl": 0.0927734375, + "kimi_kl": 0.240234375, + "learning_rate": 4.7399999999999994e-08, + "loss": 0.0037, + "ppl": 0.06640625, + "reward": 0.872043251991272, + "reward_std": 0.005455943290144205, + "rewards/perpo_ocr_edit_distance_reward": 0.8720433712005615, + "step": 4526, + "temperature": 0.9 + }, + { + "advantages": -0.00017393488087691367, + "completion_length": 891.0, + "delta_ref_entropy_loss": -0.003387451171875, + "delta_ref_ppl": -0.0419921875, + "entropy_loss": -0.0576171875, + "epoch": 0.9054, + "grad_norm": 0.10350241745081563, + "k1_kl": 0.0419921875, + "k3_kl": 0.0302734375, + "kimi_kl": 0.09326171875, + "learning_rate": 4.73e-08, + "loss": 0.0014, + "ppl": 0.01434326171875, + "reward": 0.9964149594306946, + "reward_std": 9.581720223650336e-05, + "rewards/perpo_ocr_edit_distance_reward": 0.9964150190353394, + "step": 4527, + "temperature": 0.9 + }, + { + "advantages": 8.174351933121216e-06, + "completion_length": 809.0, + "delta_ref_entropy_loss": 0.0223388671875, + "delta_ref_ppl": -0.12451171875, + "entropy_loss": -0.5, + "epoch": 0.9056, + "grad_norm": 2.3731414153789836, + "k1_kl": 0.125, + "k3_kl": 0.10009765625, + "kimi_kl": 0.255859375, + "learning_rate": 4.72e-08, + "loss": 0.004, + "ppl": 0.259765625, + "reward": 0.7528002858161926, + "reward_std": 0.004081597086042166, + "rewards/perpo_ocr_edit_distance_reward": 0.7528002858161926, + "step": 4528, + "temperature": 0.9 + }, + { + "advantages": -1.0771411325549707e-05, + "completion_length": 411.0, + "delta_ref_entropy_loss": 0.06787109375, + "delta_ref_ppl": -0.0712890625, + "entropy_loss": -0.16796875, + "epoch": 0.9058, + "grad_norm": 2.2544026209143713, + "k1_kl": 0.0712890625, + "k3_kl": 0.0458984375, + "kimi_kl": 0.0849609375, + "learning_rate": 4.71e-08, + "loss": 0.0019, + "ppl": 0.07666015625, + "reward": 0.9648178219795227, + "reward_std": 0.004649409558624029, + "rewards/perpo_ocr_edit_distance_reward": 0.9648178815841675, + "step": 4529, + "temperature": 0.9 + }, + { + "advantages": -2.6668822101783007e-05, + "completion_length": 194.0, + "delta_ref_entropy_loss": 0.049072265625, + "delta_ref_ppl": -0.1943359375, + "entropy_loss": -0.0888671875, + "epoch": 0.906, + "grad_norm": 1.4467033598707295, + "k1_kl": 0.193359375, + "k3_kl": 0.154296875, + "kimi_kl": 0.61328125, + "learning_rate": 4.7e-08, + "loss": 0.0062, + "ppl": 0.032958984375, + "reward": 0.9936007857322693, + "reward_std": 0.002770803403109312, + "rewards/perpo_ocr_edit_distance_reward": 0.9936009049415588, + "step": 4530, + "temperature": 0.9 + }, + { + "advantages": -1.7421587472199462e-05, + "completion_length": 522.0, + "delta_ref_entropy_loss": 0.0235595703125, + "delta_ref_ppl": -0.052978515625, + "entropy_loss": -0.064453125, + "epoch": 0.9062, + "grad_norm": 0.8148742316310766, + "k1_kl": 0.052978515625, + "k3_kl": 0.034423828125, + "kimi_kl": 0.10693359375, + "learning_rate": 4.6899999999999996e-08, + "loss": 0.0014, + "ppl": 0.019287109375, + "reward": 0.9966902732849121, + "reward_std": 0.0013649017782881856, + "rewards/perpo_ocr_edit_distance_reward": 0.9966903328895569, + "step": 4531, + "temperature": 0.9 + }, + { + "advantages": -1.3462135029840283e-05, + "completion_length": 668.0, + "delta_ref_entropy_loss": 0.01470947265625, + "delta_ref_ppl": -0.04345703125, + "entropy_loss": -0.07421875, + "epoch": 0.9064, + "grad_norm": 0.5065505246019714, + "k1_kl": 0.043701171875, + "k3_kl": 0.0284423828125, + "kimi_kl": 0.0771484375, + "learning_rate": 4.68e-08, + "loss": 0.0012, + "ppl": 0.02880859375, + "reward": 0.9980586767196655, + "reward_std": 0.0011644219048321247, + "rewards/perpo_ocr_edit_distance_reward": 0.9980587959289551, + "step": 4532, + "temperature": 0.9 + }, + { + "advantages": -4.2957919504260644e-05, + "completion_length": 378.0, + "delta_ref_entropy_loss": 0.062255859375, + "delta_ref_ppl": -0.109375, + "entropy_loss": -0.1455078125, + "epoch": 0.9066, + "grad_norm": 1.6479568172520271, + "k1_kl": 0.109375, + "k3_kl": 0.0751953125, + "kimi_kl": 0.228515625, + "learning_rate": 4.6699999999999995e-08, + "loss": 0.003, + "ppl": 0.0537109375, + "reward": 0.9441208243370056, + "reward_std": 0.0012884973548352718, + "rewards/perpo_ocr_edit_distance_reward": 0.9441208839416504, + "step": 4533, + "temperature": 0.9 + }, + { + "advantages": -4.0411949157714844e-05, + "completion_length": 722.0, + "delta_ref_entropy_loss": 0.0250244140625, + "delta_ref_ppl": -0.0791015625, + "entropy_loss": -0.1484375, + "epoch": 0.9068, + "grad_norm": 0.8841009106739931, + "k1_kl": 0.0791015625, + "k3_kl": 0.051025390625, + "kimi_kl": 0.1416015625, + "learning_rate": 4.66e-08, + "loss": 0.0021, + "ppl": 0.064453125, + "reward": 0.9583906531333923, + "reward_std": 0.0013755273539572954, + "rewards/perpo_ocr_edit_distance_reward": 0.9583907127380371, + "step": 4534, + "temperature": 0.9 + }, + { + "advantages": -2.2377287677954882e-05, + "completion_length": 800.0, + "delta_ref_entropy_loss": 0.028564453125, + "delta_ref_ppl": -0.04736328125, + "entropy_loss": -0.04150390625, + "epoch": 0.907, + "grad_norm": 8.76949240667038, + "k1_kl": 0.04736328125, + "k3_kl": 0.1005859375, + "kimi_kl": 0.1396484375, + "learning_rate": 4.65e-08, + "loss": 0.0041, + "ppl": 0.01287841796875, + "reward": 0.9965711832046509, + "reward_std": 0.004463616758584976, + "rewards/perpo_ocr_edit_distance_reward": 0.9965713024139404, + "step": 4535, + "temperature": 0.9 + }, + { + "advantages": -4.632132549886592e-05, + "completion_length": 922.0, + "delta_ref_entropy_loss": 0.04052734375, + "delta_ref_ppl": -0.05859375, + "entropy_loss": -0.1767578125, + "epoch": 0.9072, + "grad_norm": 2.36621254311321, + "k1_kl": 0.058837890625, + "k3_kl": 0.0654296875, + "kimi_kl": 0.0810546875, + "learning_rate": 4.639999999999999e-08, + "loss": 0.0027, + "ppl": 0.087890625, + "reward": 0.9726394414901733, + "reward_std": 0.001738387974910438, + "rewards/perpo_ocr_edit_distance_reward": 0.9726395606994629, + "step": 4536, + "temperature": 0.9 + }, + { + "advantages": -2.321175270481035e-05, + "completion_length": 485.0, + "delta_ref_entropy_loss": 0.0322265625, + "delta_ref_ppl": -0.0947265625, + "entropy_loss": -0.197265625, + "epoch": 0.9074, + "grad_norm": 1.3881550067640207, + "k1_kl": 0.0947265625, + "k3_kl": 0.06640625, + "kimi_kl": 0.2021484375, + "learning_rate": 4.63e-08, + "loss": 0.0027, + "ppl": 0.08642578125, + "reward": 0.9400920867919922, + "reward_std": 0.0021030432544648647, + "rewards/perpo_ocr_edit_distance_reward": 0.9400922060012817, + "step": 4537, + "temperature": 0.9 + }, + { + "advantages": -1.0290316822647583e-05, + "completion_length": 77.0, + "delta_ref_entropy_loss": -0.09912109375, + "delta_ref_ppl": -0.609375, + "entropy_loss": -0.318359375, + "epoch": 0.9076, + "grad_norm": 3.288230612903275, + "k1_kl": 0.609375, + "k3_kl": 0.53515625, + "kimi_kl": 2.828125, + "learning_rate": 4.62e-08, + "loss": 0.0214, + "ppl": 0.111328125, + "reward": 0.9635944366455078, + "reward_std": 0.004043783526867628, + "rewards/perpo_ocr_edit_distance_reward": 0.9635944962501526, + "step": 4538, + "temperature": 0.9 + }, + { + "advantages": -9.86031136562815e-06, + "completion_length": 392.0, + "delta_ref_entropy_loss": 0.01043701171875, + "delta_ref_ppl": -0.0947265625, + "entropy_loss": -0.09423828125, + "epoch": 0.9078, + "grad_norm": 1.114393695077041, + "k1_kl": 0.09521484375, + "k3_kl": 0.0654296875, + "kimi_kl": 0.216796875, + "learning_rate": 4.61e-08, + "loss": 0.0026, + "ppl": 0.033203125, + "reward": 0.9893051385879517, + "reward_std": 0.0024921721778810024, + "rewards/perpo_ocr_edit_distance_reward": 0.9893051981925964, + "step": 4539, + "temperature": 0.9 + }, + { + "advantages": -7.833753556951706e-07, + "completion_length": 474.0, + "delta_ref_entropy_loss": -0.1015625, + "delta_ref_ppl": -0.1318359375, + "entropy_loss": -0.60546875, + "epoch": 0.908, + "grad_norm": 4.698292725725919, + "k1_kl": 0.1318359375, + "k3_kl": 0.11279296875, + "kimi_kl": 0.283203125, + "learning_rate": 4.5999999999999995e-08, + "loss": 0.0045, + "ppl": 0.298828125, + "reward": 0.466971218585968, + "reward_std": 0.027638429775834084, + "rewards/perpo_ocr_edit_distance_reward": 0.4669712483882904, + "step": 4540, + "temperature": 0.9 + }, + { + "advantages": -3.6103385809838073e-06, + "completion_length": 539.0, + "delta_ref_entropy_loss": 0.04443359375, + "delta_ref_ppl": -0.10888671875, + "entropy_loss": -0.1279296875, + "epoch": 0.9082, + "grad_norm": 1.9357944599776835, + "k1_kl": 0.10888671875, + "k3_kl": 0.0771484375, + "kimi_kl": 0.236328125, + "learning_rate": 4.59e-08, + "loss": 0.0031, + "ppl": 0.054443359375, + "reward": 0.7212774157524109, + "reward_std": 0.0022544122766703367, + "rewards/perpo_ocr_edit_distance_reward": 0.7212774157524109, + "step": 4541, + "temperature": 0.9 + }, + { + "advantages": -4.938670826959424e-06, + "completion_length": 659.0, + "delta_ref_entropy_loss": -0.04150390625, + "delta_ref_ppl": -0.060546875, + "entropy_loss": -0.169921875, + "epoch": 0.9084, + "grad_norm": 1.6475238683472089, + "k1_kl": 0.060791015625, + "k3_kl": 0.052490234375, + "kimi_kl": 0.1396484375, + "learning_rate": 4.58e-08, + "loss": 0.0021, + "ppl": 0.061279296875, + "reward": 0.9657818078994751, + "reward_std": 0.006794819142669439, + "rewards/perpo_ocr_edit_distance_reward": 0.9657818675041199, + "step": 4542, + "temperature": 0.9 + }, + { + "advantages": -2.1031926280556945e-06, + "completion_length": 58.0, + "delta_ref_entropy_loss": -0.01348876953125, + "delta_ref_ppl": -0.6328125, + "entropy_loss": -0.1728515625, + "epoch": 0.9086, + "grad_norm": 4.791555631030036, + "k1_kl": 0.6328125, + "k3_kl": 0.5625, + "kimi_kl": 3.75, + "learning_rate": 4.569999999999999e-08, + "loss": 0.0225, + "ppl": 0.076171875, + "reward": 0.9829424619674683, + "reward_std": 0.008051489479839802, + "rewards/perpo_ocr_edit_distance_reward": 0.9829424619674683, + "step": 4543, + "temperature": 0.9 + }, + { + "advantages": -1.4322145034384448e-05, + "completion_length": 673.0, + "delta_ref_entropy_loss": 0.0233154296875, + "delta_ref_ppl": -0.06298828125, + "entropy_loss": -0.0703125, + "epoch": 0.9088, + "grad_norm": 0.5917056871005812, + "k1_kl": 0.06298828125, + "k3_kl": 0.04248046875, + "kimi_kl": 0.130859375, + "learning_rate": 4.56e-08, + "loss": 0.0017, + "ppl": 0.022216796875, + "reward": 0.9916278719902039, + "reward_std": 0.0010889553232118487, + "rewards/perpo_ocr_edit_distance_reward": 0.9916279315948486, + "step": 4544, + "temperature": 0.9 + }, + { + "advantages": -1.59229566634167e-05, + "completion_length": 452.0, + "delta_ref_entropy_loss": 0.04296875, + "delta_ref_ppl": -0.078125, + "entropy_loss": -0.11572265625, + "epoch": 0.909, + "grad_norm": 1.3392928766550263, + "k1_kl": 0.078125, + "k3_kl": 0.04931640625, + "kimi_kl": 0.126953125, + "learning_rate": 4.55e-08, + "loss": 0.002, + "ppl": 0.05078125, + "reward": 0.9897580742835999, + "reward_std": 0.0031078518368303776, + "rewards/perpo_ocr_edit_distance_reward": 0.9897581338882446, + "step": 4545, + "temperature": 0.9 + }, + { + "advantages": 3.2697407732484862e-06, + "completion_length": 535.0, + "delta_ref_entropy_loss": -0.375, + "delta_ref_ppl": -0.046630859375, + "entropy_loss": -0.6796875, + "epoch": 0.9092, + "grad_norm": 2.9949707168562276, + "k1_kl": 0.046630859375, + "k3_kl": 0.08447265625, + "kimi_kl": 0.2021484375, + "learning_rate": 4.54e-08, + "loss": 0.0034, + "ppl": 0.27734375, + "reward": 0.9079169034957886, + "reward_std": 0.01291099563241005, + "rewards/perpo_ocr_edit_distance_reward": 0.9079169034957886, + "step": 4546, + "temperature": 0.9 + }, + { + "advantages": -0.00011795759928645566, + "completion_length": 870.0, + "delta_ref_entropy_loss": 0.005889892578125, + "delta_ref_ppl": -0.03173828125, + "entropy_loss": -0.068359375, + "epoch": 0.9094, + "grad_norm": 0.7183243621291107, + "k1_kl": 0.03173828125, + "k3_kl": 0.0247802734375, + "kimi_kl": 0.0576171875, + "learning_rate": 4.5299999999999995e-08, + "loss": 0.0011, + "ppl": 0.0206298828125, + "reward": 0.9985626935958862, + "reward_std": 0.0007662829011678696, + "rewards/perpo_ocr_edit_distance_reward": 0.9985628128051758, + "step": 4547, + "temperature": 0.9 + }, + { + "advantages": -8.96709316293709e-05, + "completion_length": 361.0, + "delta_ref_entropy_loss": 0.021240234375, + "delta_ref_ppl": -0.05029296875, + "entropy_loss": -0.03955078125, + "epoch": 0.9096, + "grad_norm": 0.41634276449562607, + "k1_kl": 0.050048828125, + "k3_kl": 0.048583984375, + "kimi_kl": 0.13671875, + "learning_rate": 4.5199999999999994e-08, + "loss": 0.002, + "ppl": 0.01495361328125, + "reward": 0.9956183433532715, + "reward_std": 0.0005647385842166841, + "rewards/perpo_ocr_edit_distance_reward": 0.995618462562561, + "step": 4548, + "temperature": 0.9 + }, + { + "advantages": -3.131372795905918e-05, + "completion_length": 1213.0, + "delta_ref_entropy_loss": 0.0272216796875, + "delta_ref_ppl": -0.034912109375, + "entropy_loss": -0.05078125, + "epoch": 0.9098, + "grad_norm": 1.1286247890174008, + "k1_kl": 0.034912109375, + "k3_kl": 0.02490234375, + "kimi_kl": 0.056396484375, + "learning_rate": 4.51e-08, + "loss": 0.001, + "ppl": 0.0218505859375, + "reward": 0.998028576374054, + "reward_std": 0.0009879485005512834, + "rewards/perpo_ocr_edit_distance_reward": 0.998028576374054, + "step": 4549, + "temperature": 0.9 + }, + { + "advantages": -2.8269632821320556e-05, + "completion_length": 1723.0, + "delta_ref_entropy_loss": 0.0034942626953125, + "delta_ref_ppl": -0.05615234375, + "entropy_loss": -0.15234375, + "epoch": 0.91, + "grad_norm": 4.063467190432122, + "k1_kl": 0.05615234375, + "k3_kl": 0.044677734375, + "kimi_kl": 0.10205078125, + "learning_rate": 4.5e-08, + "loss": 0.0018, + "ppl": 0.07763671875, + "reward": 0.8848557472229004, + "reward_std": 0.002913415664806962, + "rewards/perpo_ocr_edit_distance_reward": 0.8848558664321899, + "step": 4550, + "temperature": 0.9 + }, + { + "advantages": 1.4773437214898877e-05, + "completion_length": 435.0, + "delta_ref_entropy_loss": 0.045654296875, + "delta_ref_ppl": -0.11328125, + "entropy_loss": -0.08154296875, + "epoch": 0.9102, + "grad_norm": 0.8643331187520631, + "k1_kl": 0.11279296875, + "k3_kl": 0.07763671875, + "kimi_kl": 0.2216796875, + "learning_rate": 4.49e-08, + "loss": 0.0031, + "ppl": 0.033203125, + "reward": 0.9775408506393433, + "reward_std": 0.001052584033459425, + "rewards/perpo_ocr_edit_distance_reward": 0.9775408506393433, + "step": 4551, + "temperature": 0.9 + }, + { + "advantages": -4.7470843128394336e-05, + "completion_length": 255.0, + "delta_ref_entropy_loss": 0.0108642578125, + "delta_ref_ppl": -0.111328125, + "entropy_loss": -0.068359375, + "epoch": 0.9104, + "grad_norm": 1.3739177913594445, + "k1_kl": 0.111328125, + "k3_kl": 0.08935546875, + "kimi_kl": 0.31640625, + "learning_rate": 4.48e-08, + "loss": 0.0036, + "ppl": 0.022705078125, + "reward": 0.9865007996559143, + "reward_std": 0.0015144250355660915, + "rewards/perpo_ocr_edit_distance_reward": 0.9865009188652039, + "step": 4552, + "temperature": 0.9 + }, + { + "advantages": -6.607600880670361e-06, + "completion_length": 124.0, + "delta_ref_entropy_loss": 0.11572265625, + "delta_ref_ppl": -0.212890625, + "entropy_loss": -0.1787109375, + "epoch": 0.9106, + "grad_norm": 2.689723860477688, + "k1_kl": 0.2119140625, + "k3_kl": 0.1416015625, + "kimi_kl": 0.5859375, + "learning_rate": 4.4699999999999997e-08, + "loss": 0.0057, + "ppl": 0.095703125, + "reward": 0.9251508712768555, + "reward_std": 0.007643966935575008, + "rewards/perpo_ocr_edit_distance_reward": 0.9251509308815002, + "step": 4553, + "temperature": 0.9 + }, + { + "advantages": -2.5417124561499804e-05, + "completion_length": 1142.0, + "delta_ref_entropy_loss": 0.037109375, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.1015625, + "epoch": 0.9108, + "grad_norm": 1.1922831184019078, + "k1_kl": 0.06591796875, + "k3_kl": 0.04443359375, + "kimi_kl": 0.115234375, + "learning_rate": 4.4599999999999996e-08, + "loss": 0.0018, + "ppl": 0.048583984375, + "reward": 0.9536560773849487, + "reward_std": 0.002246222225949168, + "rewards/perpo_ocr_edit_distance_reward": 0.9536561965942383, + "step": 4554, + "temperature": 0.9 + }, + { + "advantages": -5.861691170139238e-05, + "completion_length": 367.0, + "delta_ref_entropy_loss": 0.016845703125, + "delta_ref_ppl": -0.13671875, + "entropy_loss": -0.09130859375, + "epoch": 0.911, + "grad_norm": 0.5152611471587474, + "k1_kl": 0.13671875, + "k3_kl": 0.1025390625, + "kimi_kl": 0.423828125, + "learning_rate": 4.4499999999999995e-08, + "loss": 0.0042, + "ppl": 0.027099609375, + "reward": 0.8839762210845947, + "reward_std": 0.0009168887045234442, + "rewards/perpo_ocr_edit_distance_reward": 0.8839762806892395, + "step": 4555, + "temperature": 0.9 + }, + { + "advantages": -2.8014184863422997e-05, + "completion_length": 262.0, + "delta_ref_entropy_loss": 0.04638671875, + "delta_ref_ppl": -0.1669921875, + "entropy_loss": -0.12890625, + "epoch": 0.9112, + "grad_norm": 1.9588764135344372, + "k1_kl": 0.1669921875, + "k3_kl": 0.123046875, + "kimi_kl": 0.5, + "learning_rate": 4.44e-08, + "loss": 0.005, + "ppl": 0.054443359375, + "reward": 0.9809412956237793, + "reward_std": 0.003851751098409295, + "rewards/perpo_ocr_edit_distance_reward": 0.9809414148330688, + "step": 4556, + "temperature": 0.9 + }, + { + "advantages": -1.6757421690272167e-05, + "completion_length": 833.0, + "delta_ref_entropy_loss": -0.042236328125, + "delta_ref_ppl": -0.061279296875, + "entropy_loss": -0.265625, + "epoch": 0.9114, + "grad_norm": 2.0220621700775165, + "k1_kl": 0.061279296875, + "k3_kl": 0.058837890625, + "kimi_kl": 0.0966796875, + "learning_rate": 4.43e-08, + "loss": 0.0024, + "ppl": 0.12353515625, + "reward": 0.9478211402893066, + "reward_std": 0.003963182680308819, + "rewards/perpo_ocr_edit_distance_reward": 0.9478211998939514, + "step": 4557, + "temperature": 0.9 + }, + { + "advantages": 2.219847374362871e-05, + "completion_length": 839.0, + "delta_ref_entropy_loss": 0.017578125, + "delta_ref_ppl": -0.056396484375, + "entropy_loss": -0.08056640625, + "epoch": 0.9116, + "grad_norm": 0.882358133673368, + "k1_kl": 0.056396484375, + "k3_kl": 0.0390625, + "kimi_kl": 0.09765625, + "learning_rate": 4.42e-08, + "loss": 0.0015, + "ppl": 0.030029296875, + "reward": 0.9895785450935364, + "reward_std": 0.0006678516510874033, + "rewards/perpo_ocr_edit_distance_reward": 0.9895785450935364, + "step": 4558, + "temperature": 0.9 + }, + { + "advantages": 3.065381974920456e-07, + "completion_length": 724.0, + "delta_ref_entropy_loss": -0.09521484375, + "delta_ref_ppl": -0.053955078125, + "entropy_loss": -0.25, + "epoch": 0.9118, + "grad_norm": 1.7232871071438038, + "k1_kl": 0.053955078125, + "k3_kl": 0.05615234375, + "kimi_kl": 0.1416015625, + "learning_rate": 4.41e-08, + "loss": 0.0022, + "ppl": 0.08447265625, + "reward": 0.8451241850852966, + "reward_std": 0.1102127879858017, + "rewards/perpo_ocr_edit_distance_reward": 0.8451241850852966, + "step": 4559, + "temperature": 0.9 + }, + { + "advantages": -7.442066271323711e-05, + "completion_length": 624.0, + "delta_ref_entropy_loss": 0.0303955078125, + "delta_ref_ppl": -0.04931640625, + "entropy_loss": -0.0498046875, + "epoch": 0.912, + "grad_norm": 0.5112064391692726, + "k1_kl": 0.049560546875, + "k3_kl": 0.0289306640625, + "kimi_kl": 0.083984375, + "learning_rate": 4.4e-08, + "loss": 0.0012, + "ppl": 0.016845703125, + "reward": 0.9907259941101074, + "reward_std": 0.0007010095869190991, + "rewards/perpo_ocr_edit_distance_reward": 0.9907260537147522, + "step": 4560, + "temperature": 0.9 + }, + { + "advantages": -3.37021701852791e-05, + "completion_length": 1132.0, + "delta_ref_entropy_loss": 0.050537109375, + "delta_ref_ppl": -0.078125, + "entropy_loss": -0.072265625, + "epoch": 0.9122, + "grad_norm": 0.672710625640225, + "k1_kl": 0.07763671875, + "k3_kl": 0.0498046875, + "kimi_kl": 0.12060546875, + "learning_rate": 4.39e-08, + "loss": 0.002, + "ppl": 0.02880859375, + "reward": 0.9821892380714417, + "reward_std": 0.0014158185804262757, + "rewards/perpo_ocr_edit_distance_reward": 0.9821892976760864, + "step": 4561, + "temperature": 0.9 + }, + { + "advantages": 2.733298742896295e-06, + "completion_length": 361.0, + "delta_ref_entropy_loss": 0.036865234375, + "delta_ref_ppl": -0.09814453125, + "entropy_loss": -0.083984375, + "epoch": 0.9124, + "grad_norm": 3.637167999330279, + "k1_kl": 0.09765625, + "k3_kl": 0.0712890625, + "kimi_kl": 0.23828125, + "learning_rate": 4.3799999999999995e-08, + "loss": 0.0028, + "ppl": 0.035888671875, + "reward": 0.9903694987297058, + "reward_std": 0.0030098818242549896, + "rewards/perpo_ocr_edit_distance_reward": 0.9903695583343506, + "step": 4562, + "temperature": 0.9 + }, + { + "advantages": -5.517687441169983e-06, + "completion_length": 618.0, + "delta_ref_entropy_loss": -0.140625, + "delta_ref_ppl": -0.091796875, + "entropy_loss": -0.328125, + "epoch": 0.9126, + "grad_norm": 1.9791644390892902, + "k1_kl": 0.091796875, + "k3_kl": 0.09912109375, + "kimi_kl": 0.3046875, + "learning_rate": 4.37e-08, + "loss": 0.004, + "ppl": 0.11669921875, + "reward": 0.9794935584068298, + "reward_std": 0.00907848309725523, + "rewards/perpo_ocr_edit_distance_reward": 0.9794936180114746, + "step": 4563, + "temperature": 0.9 + }, + { + "advantages": -6.028584266459802e-06, + "completion_length": 299.0, + "delta_ref_entropy_loss": 0.057861328125, + "delta_ref_ppl": -0.11865234375, + "entropy_loss": -0.09912109375, + "epoch": 0.9128, + "grad_norm": 1.158216624035926, + "k1_kl": 0.11865234375, + "k3_kl": 0.08056640625, + "kimi_kl": 0.2314453125, + "learning_rate": 4.36e-08, + "loss": 0.0032, + "ppl": 0.04443359375, + "reward": 0.9794386029243469, + "reward_std": 0.0027295921463519335, + "rewards/perpo_ocr_edit_distance_reward": 0.9794386625289917, + "step": 4564, + "temperature": 0.9 + }, + { + "advantages": -3.930500679416582e-05, + "completion_length": 114.0, + "delta_ref_entropy_loss": -0.0167236328125, + "delta_ref_ppl": -0.3671875, + "entropy_loss": -0.1025390625, + "epoch": 0.913, + "grad_norm": 3.086493482507557, + "k1_kl": 0.369140625, + "k3_kl": 0.31640625, + "kimi_kl": 1.984375, + "learning_rate": 4.349999999999999e-08, + "loss": 0.0127, + "ppl": 0.03466796875, + "reward": 0.9946932196617126, + "reward_std": 0.0016326417680829763, + "rewards/perpo_ocr_edit_distance_reward": 0.9946932792663574, + "step": 4565, + "temperature": 0.9 + }, + { + "advantages": -5.551747108256677e-06, + "completion_length": 436.0, + "delta_ref_entropy_loss": -0.142578125, + "delta_ref_ppl": -0.10302734375, + "entropy_loss": -0.5546875, + "epoch": 0.9132, + "grad_norm": 2.0328160171578804, + "k1_kl": 0.10302734375, + "k3_kl": 0.12060546875, + "kimi_kl": 0.3046875, + "learning_rate": 4.34e-08, + "loss": 0.0048, + "ppl": 0.25390625, + "reward": 0.9157822132110596, + "reward_std": 0.007590082008391619, + "rewards/perpo_ocr_edit_distance_reward": 0.9157822728157043, + "step": 4566, + "temperature": 0.9 + }, + { + "advantages": -7.935933354019653e-06, + "completion_length": 69.0, + "delta_ref_entropy_loss": 0.0166015625, + "delta_ref_ppl": -0.5703125, + "entropy_loss": -0.2138671875, + "epoch": 0.9134, + "grad_norm": 4.329327341066808, + "k1_kl": 0.5703125, + "k3_kl": 0.5, + "kimi_kl": 2.6875, + "learning_rate": 4.33e-08, + "loss": 0.0201, + "ppl": 0.0791015625, + "reward": 0.9320842623710632, + "reward_std": 0.0031186651904135942, + "rewards/perpo_ocr_edit_distance_reward": 0.932084321975708, + "step": 4567, + "temperature": 0.9 + }, + { + "advantages": -6.93116890033707e-05, + "completion_length": 357.0, + "delta_ref_entropy_loss": 0.0267333984375, + "delta_ref_ppl": -0.11962890625, + "entropy_loss": -0.09912109375, + "epoch": 0.9136, + "grad_norm": 1.0972679729157546, + "k1_kl": 0.11962890625, + "k3_kl": 0.08642578125, + "kimi_kl": 0.318359375, + "learning_rate": 4.32e-08, + "loss": 0.0035, + "ppl": 0.04345703125, + "reward": 0.9767807722091675, + "reward_std": 0.0013737446861341596, + "rewards/perpo_ocr_edit_distance_reward": 0.976780891418457, + "step": 4568, + "temperature": 0.9 + }, + { + "advantages": -2.1287373641598606e-08, + "completion_length": 392.0, + "delta_ref_entropy_loss": 0.0220947265625, + "delta_ref_ppl": -0.11376953125, + "entropy_loss": -0.1455078125, + "epoch": 0.9138, + "grad_norm": 1.0820183790768045, + "k1_kl": 0.11376953125, + "k3_kl": 0.07763671875, + "kimi_kl": 0.234375, + "learning_rate": 4.3099999999999996e-08, + "loss": 0.0031, + "ppl": 0.04931640625, + "reward": 0.9699494242668152, + "reward_std": 0.00287247309461236, + "rewards/perpo_ocr_edit_distance_reward": 0.96994948387146, + "step": 4569, + "temperature": 0.9 + }, + { + "advantages": -2.3637499907636084e-05, + "completion_length": 825.0, + "delta_ref_entropy_loss": 0.0306396484375, + "delta_ref_ppl": -0.0439453125, + "entropy_loss": -0.0673828125, + "epoch": 0.914, + "grad_norm": 0.7760545352708096, + "k1_kl": 0.043701171875, + "k3_kl": 0.0244140625, + "kimi_kl": 0.05712890625, + "learning_rate": 4.2999999999999995e-08, + "loss": 0.001, + "ppl": 0.02685546875, + "reward": 0.9961450099945068, + "reward_std": 0.0006200295174494386, + "rewards/perpo_ocr_edit_distance_reward": 0.9961450695991516, + "step": 4570, + "temperature": 0.9 + }, + { + "advantages": -3.5337041026650695e-06, + "completion_length": 646.0, + "delta_ref_entropy_loss": 0.01043701171875, + "delta_ref_ppl": -0.040771484375, + "entropy_loss": -0.064453125, + "epoch": 0.9142, + "grad_norm": 0.7083828024648934, + "k1_kl": 0.040771484375, + "k3_kl": 0.0303955078125, + "kimi_kl": 0.08056640625, + "learning_rate": 4.29e-08, + "loss": 0.0012, + "ppl": 0.019775390625, + "reward": 0.9932104349136353, + "reward_std": 0.0071100411005318165, + "rewards/perpo_ocr_edit_distance_reward": 0.99321049451828, + "step": 4571, + "temperature": 0.9 + }, + { + "advantages": -2.715417394938413e-05, + "completion_length": 607.0, + "delta_ref_entropy_loss": 0.01202392578125, + "delta_ref_ppl": -0.05029296875, + "entropy_loss": -0.048583984375, + "epoch": 0.9144, + "grad_norm": 0.5062268353339496, + "k1_kl": 0.050048828125, + "k3_kl": 0.036376953125, + "kimi_kl": 0.173828125, + "learning_rate": 4.279999999999999e-08, + "loss": 0.0015, + "ppl": 0.01287841796875, + "reward": 0.9956099390983582, + "reward_std": 0.0008411198505200446, + "rewards/perpo_ocr_edit_distance_reward": 0.9956100583076477, + "step": 4572, + "temperature": 0.9 + }, + { + "advantages": -1.322371645073872e-05, + "completion_length": 679.0, + "delta_ref_entropy_loss": 0.0128173828125, + "delta_ref_ppl": -0.042236328125, + "entropy_loss": -0.041015625, + "epoch": 0.9146, + "grad_norm": 0.5941955350882281, + "k1_kl": 0.0419921875, + "k3_kl": 0.0322265625, + "kimi_kl": 0.10791015625, + "learning_rate": 4.27e-08, + "loss": 0.0013, + "ppl": 0.017578125, + "reward": 0.9942355751991272, + "reward_std": 0.001828892738558352, + "rewards/perpo_ocr_edit_distance_reward": 0.994235634803772, + "step": 4573, + "temperature": 0.9 + }, + { + "advantages": -1.1972018910455517e-05, + "completion_length": 528.0, + "delta_ref_entropy_loss": 0.0145263671875, + "delta_ref_ppl": -0.06396484375, + "entropy_loss": -0.044921875, + "epoch": 0.9148, + "grad_norm": 0.7563168912435665, + "k1_kl": 0.06396484375, + "k3_kl": 0.047119140625, + "kimi_kl": 0.1748046875, + "learning_rate": 4.26e-08, + "loss": 0.0019, + "ppl": 0.0152587890625, + "reward": 0.996590256690979, + "reward_std": 0.0006097055156715214, + "rewards/perpo_ocr_edit_distance_reward": 0.9965903162956238, + "step": 4574, + "temperature": 0.9 + }, + { + "advantages": -2.3688589863013476e-05, + "completion_length": 1994.0, + "delta_ref_entropy_loss": 0.006195068359375, + "delta_ref_ppl": -0.03076171875, + "entropy_loss": -0.06884765625, + "epoch": 0.915, + "grad_norm": 55.397207204413675, + "k1_kl": 0.0308837890625, + "k3_kl": 0.10791015625, + "kimi_kl": 0.07373046875, + "learning_rate": 4.2500000000000003e-08, + "loss": 0.0043, + "ppl": 0.030029296875, + "reward": 0.9878543019294739, + "reward_std": 0.0045721461065113544, + "rewards/perpo_ocr_edit_distance_reward": 0.9878544807434082, + "step": 4575, + "temperature": 0.9 + }, + { + "advantages": -3.4962384233949706e-05, + "completion_length": 640.0, + "delta_ref_entropy_loss": -0.002227783203125, + "delta_ref_ppl": -0.0361328125, + "entropy_loss": -0.06787109375, + "epoch": 0.9152, + "grad_norm": 0.874720348909507, + "k1_kl": 0.036376953125, + "k3_kl": 0.025634765625, + "kimi_kl": 0.06005859375, + "learning_rate": 4.2399999999999996e-08, + "loss": 0.0011, + "ppl": 0.02392578125, + "reward": 0.9948244690895081, + "reward_std": 0.0013609598390758038, + "rewards/perpo_ocr_edit_distance_reward": 0.9948245286941528, + "step": 4576, + "temperature": 0.9 + }, + { + "advantages": -7.045269012451172e-05, + "completion_length": 543.0, + "delta_ref_entropy_loss": 0.0208740234375, + "delta_ref_ppl": -0.0654296875, + "entropy_loss": -0.0419921875, + "epoch": 0.9154, + "grad_norm": 0.3750705037101076, + "k1_kl": 0.0654296875, + "k3_kl": 0.046630859375, + "kimi_kl": 0.158203125, + "learning_rate": 4.2299999999999995e-08, + "loss": 0.0019, + "ppl": 0.01312255859375, + "reward": 0.9768131375312805, + "reward_std": 0.0005043478449806571, + "rewards/perpo_ocr_edit_distance_reward": 0.9768131971359253, + "step": 4577, + "temperature": 0.9 + }, + { + "advantages": -9.964194032363594e-05, + "completion_length": 864.0, + "delta_ref_entropy_loss": 0.04150390625, + "delta_ref_ppl": -0.045166015625, + "entropy_loss": -0.054931640625, + "epoch": 0.9156, + "grad_norm": 0.5092951017176045, + "k1_kl": 0.045166015625, + "k3_kl": 0.0262451171875, + "kimi_kl": 0.061767578125, + "learning_rate": 4.22e-08, + "loss": 0.0012, + "ppl": 0.01806640625, + "reward": 0.9977248907089233, + "reward_std": 0.0006693562027066946, + "rewards/perpo_ocr_edit_distance_reward": 0.9977250099182129, + "step": 4578, + "temperature": 0.9 + }, + { + "advantages": -1.9294875528430566e-05, + "completion_length": 547.0, + "delta_ref_entropy_loss": 0.00921630859375, + "delta_ref_ppl": -0.11962890625, + "entropy_loss": -0.1572265625, + "epoch": 0.9158, + "grad_norm": 0.8899615220762487, + "k1_kl": 0.11962890625, + "k3_kl": 0.0888671875, + "kimi_kl": 0.244140625, + "learning_rate": 4.21e-08, + "loss": 0.0036, + "ppl": 0.060791015625, + "reward": 0.43705129623413086, + "reward_std": 0.001885301899164915, + "rewards/perpo_ocr_edit_distance_reward": 0.43705132603645325, + "step": 4579, + "temperature": 0.9 + }, + { + "advantages": -6.0626439335464966e-06, + "completion_length": 88.0, + "delta_ref_entropy_loss": 0.01251220703125, + "delta_ref_ppl": -0.451171875, + "entropy_loss": -0.1357421875, + "epoch": 0.916, + "grad_norm": 2.5008175814956393, + "k1_kl": 0.453125, + "k3_kl": 0.390625, + "kimi_kl": 1.96875, + "learning_rate": 4.2e-08, + "loss": 0.0156, + "ppl": 0.05908203125, + "reward": 0.9902713298797607, + "reward_std": 0.0027094362303614616, + "rewards/perpo_ocr_edit_distance_reward": 0.9902713894844055, + "step": 4580, + "temperature": 0.9 + }, + { + "advantages": -4.676410389947705e-05, + "completion_length": 308.0, + "delta_ref_entropy_loss": 0.10986328125, + "delta_ref_ppl": -0.1806640625, + "entropy_loss": -0.09033203125, + "epoch": 0.9162, + "grad_norm": 2.0623952859008052, + "k1_kl": 0.1806640625, + "k3_kl": 0.12060546875, + "kimi_kl": 0.416015625, + "learning_rate": 4.19e-08, + "loss": 0.0049, + "ppl": 0.0341796875, + "reward": 0.9938148856163025, + "reward_std": 0.0015386156737804413, + "rewards/perpo_ocr_edit_distance_reward": 0.9938149452209473, + "step": 4581, + "temperature": 0.9 + }, + { + "advantages": -2.384185791015625e-07, + "completion_length": 354.0, + "delta_ref_entropy_loss": -0.019287109375, + "delta_ref_ppl": -0.087890625, + "entropy_loss": -0.197265625, + "epoch": 0.9164, + "grad_norm": 2.4904846291916773, + "k1_kl": 0.087890625, + "k3_kl": 0.0673828125, + "kimi_kl": 0.1796875, + "learning_rate": 4.18e-08, + "loss": 0.0027, + "ppl": 0.08056640625, + "reward": 0.9027225375175476, + "reward_std": 0.034132566303014755, + "rewards/perpo_ocr_edit_distance_reward": 0.9027225375175476, + "step": 4582, + "temperature": 0.9 + }, + { + "advantages": -6.3606676121708e-05, + "completion_length": 446.0, + "delta_ref_entropy_loss": 0.043212890625, + "delta_ref_ppl": -0.0732421875, + "entropy_loss": -0.052490234375, + "epoch": 0.9166, + "grad_norm": 0.6596661044118113, + "k1_kl": 0.0732421875, + "k3_kl": 0.047607421875, + "kimi_kl": 0.1513671875, + "learning_rate": 4.1699999999999996e-08, + "loss": 0.002, + "ppl": 0.020263671875, + "reward": 0.9962629675865173, + "reward_std": 0.0009709825972095132, + "rewards/perpo_ocr_edit_distance_reward": 0.9962630867958069, + "step": 4583, + "temperature": 0.9 + }, + { + "advantages": -2.060617771348916e-05, + "completion_length": 1268.0, + "delta_ref_entropy_loss": 0.030029296875, + "delta_ref_ppl": -0.055908203125, + "entropy_loss": -0.06787109375, + "epoch": 0.9168, + "grad_norm": 1.352224765414704, + "k1_kl": 0.05615234375, + "k3_kl": 0.034423828125, + "kimi_kl": 0.0966796875, + "learning_rate": 4.1599999999999995e-08, + "loss": 0.0014, + "ppl": 0.0235595703125, + "reward": 0.996406614780426, + "reward_std": 0.0007270630449056625, + "rewards/perpo_ocr_edit_distance_reward": 0.996406614780426, + "step": 4584, + "temperature": 0.9 + }, + { + "advantages": 3.405979782655777e-07, + "completion_length": 182.0, + "delta_ref_entropy_loss": -0.466796875, + "delta_ref_ppl": -0.193359375, + "entropy_loss": -0.76171875, + "epoch": 0.917, + "grad_norm": 8.965388776610823, + "k1_kl": 0.1943359375, + "k3_kl": 0.2236328125, + "kimi_kl": 1.0625, + "learning_rate": 4.15e-08, + "loss": 0.009, + "ppl": 0.2294921875, + "reward": 0.6176269054412842, + "reward_std": 0.07459068298339844, + "rewards/perpo_ocr_edit_distance_reward": 0.6176269054412842, + "step": 4585, + "temperature": 0.9 + }, + { + "advantages": 7.280281806743005e-06, + "completion_length": 618.0, + "delta_ref_entropy_loss": 0.034423828125, + "delta_ref_ppl": -0.0830078125, + "entropy_loss": -0.1298828125, + "epoch": 0.9172, + "grad_norm": 0.7673975844132858, + "k1_kl": 0.0830078125, + "k3_kl": 0.047607421875, + "kimi_kl": 0.12451171875, + "learning_rate": 4.14e-08, + "loss": 0.0019, + "ppl": 0.038330078125, + "reward": 0.989357590675354, + "reward_std": 0.0010708172339946032, + "rewards/perpo_ocr_edit_distance_reward": 0.989357590675354, + "step": 4586, + "temperature": 0.9 + }, + { + "advantages": -8.916855585994199e-05, + "completion_length": 763.0, + "delta_ref_entropy_loss": 0.0191650390625, + "delta_ref_ppl": -0.0654296875, + "entropy_loss": -0.054931640625, + "epoch": 0.9174, + "grad_norm": 0.39503668310052387, + "k1_kl": 0.0654296875, + "k3_kl": 0.044677734375, + "kimi_kl": 0.1474609375, + "learning_rate": 4.13e-08, + "loss": 0.0019, + "ppl": 0.0189208984375, + "reward": 0.9943467378616333, + "reward_std": 0.0006640840438194573, + "rewards/perpo_ocr_edit_distance_reward": 0.9943468570709229, + "step": 4587, + "temperature": 0.9 + }, + { + "advantages": -9.2204129032325e-05, + "completion_length": 586.0, + "delta_ref_entropy_loss": 0.02197265625, + "delta_ref_ppl": -0.06396484375, + "entropy_loss": -0.055419921875, + "epoch": 0.9176, + "grad_norm": 0.6061902013119966, + "k1_kl": 0.06396484375, + "k3_kl": 0.04150390625, + "kimi_kl": 0.1279296875, + "learning_rate": 4.12e-08, + "loss": 0.0017, + "ppl": 0.0201416015625, + "reward": 0.9971012473106384, + "reward_std": 0.0006388187175616622, + "rewards/perpo_ocr_edit_distance_reward": 0.9971013069152832, + "step": 4588, + "temperature": 0.9 + }, + { + "advantages": -0.00023206643527373672, + "completion_length": 926.0, + "delta_ref_entropy_loss": 0.015869140625, + "delta_ref_ppl": -0.03564453125, + "entropy_loss": -0.048583984375, + "epoch": 0.9178, + "grad_norm": 0.10232105618369072, + "k1_kl": 0.03564453125, + "k3_kl": 0.0225830078125, + "kimi_kl": 0.05810546875, + "learning_rate": 4.11e-08, + "loss": 0.0011, + "ppl": 0.01165771484375, + "reward": 0.999058187007904, + "reward_std": 0.00015686001279391348, + "rewards/perpo_ocr_edit_distance_reward": 0.9990583062171936, + "step": 4589, + "temperature": 0.9 + }, + { + "advantages": -0.00012211609282530844, + "completion_length": 257.0, + "delta_ref_entropy_loss": 0.04248046875, + "delta_ref_ppl": -0.091796875, + "entropy_loss": -0.055419921875, + "epoch": 0.918, + "grad_norm": 1.0557450906427575, + "k1_kl": 0.09228515625, + "k3_kl": 0.0634765625, + "kimi_kl": 0.2109375, + "learning_rate": 4.1e-08, + "loss": 0.0027, + "ppl": 0.0181884765625, + "reward": 0.9849532842636108, + "reward_std": 0.0008066532900556922, + "rewards/perpo_ocr_edit_distance_reward": 0.9849534034729004, + "step": 4590, + "temperature": 0.9 + }, + { + "advantages": -3.4894263080786914e-05, + "completion_length": 320.0, + "delta_ref_entropy_loss": 0.007659912109375, + "delta_ref_ppl": -0.1455078125, + "entropy_loss": -0.10986328125, + "epoch": 0.9182, + "grad_norm": 0.852964851526565, + "k1_kl": 0.1455078125, + "k3_kl": 0.109375, + "kimi_kl": 0.455078125, + "learning_rate": 4.0899999999999996e-08, + "loss": 0.0044, + "ppl": 0.033203125, + "reward": 0.7988736629486084, + "reward_std": 0.0013642450794577599, + "rewards/perpo_ocr_edit_distance_reward": 0.7988736629486084, + "step": 4591, + "temperature": 0.9 + }, + { + "advantages": -4.910571442451328e-05, + "completion_length": 657.0, + "delta_ref_entropy_loss": 0.01531982421875, + "delta_ref_ppl": -0.068359375, + "entropy_loss": -0.07080078125, + "epoch": 0.9184, + "grad_norm": 0.6628438671482121, + "k1_kl": 0.068359375, + "k3_kl": 0.045166015625, + "kimi_kl": 0.1220703125, + "learning_rate": 4.08e-08, + "loss": 0.0019, + "ppl": 0.0220947265625, + "reward": 0.9973198771476746, + "reward_std": 0.0009407050674781203, + "rewards/perpo_ocr_edit_distance_reward": 0.9973199367523193, + "step": 4592, + "temperature": 0.9 + }, + { + "advantages": -6.881782610435039e-05, + "completion_length": 421.0, + "delta_ref_entropy_loss": 0.037353515625, + "delta_ref_ppl": -0.103515625, + "entropy_loss": -0.06689453125, + "epoch": 0.9186, + "grad_norm": 0.9497114377155047, + "k1_kl": 0.103515625, + "k3_kl": 0.07275390625, + "kimi_kl": 0.26171875, + "learning_rate": 4.07e-08, + "loss": 0.003, + "ppl": 0.02099609375, + "reward": 0.9965015053749084, + "reward_std": 0.0013846452347934246, + "rewards/perpo_ocr_edit_distance_reward": 0.9965015649795532, + "step": 4593, + "temperature": 0.9 + }, + { + "advantages": -3.5711698728846386e-05, + "completion_length": 1005.0, + "delta_ref_entropy_loss": 0.0013580322265625, + "delta_ref_ppl": -0.02392578125, + "entropy_loss": -0.0595703125, + "epoch": 0.9188, + "grad_norm": 0.7012478178198197, + "k1_kl": 0.02392578125, + "k3_kl": 0.0189208984375, + "kimi_kl": 0.04150390625, + "learning_rate": 4.059999999999999e-08, + "loss": 0.0008, + "ppl": 0.0203857421875, + "reward": 0.996429443359375, + "reward_std": 0.0018076017731800675, + "rewards/perpo_ocr_edit_distance_reward": 0.9964295029640198, + "step": 4594, + "temperature": 0.9 + }, + { + "advantages": -5.960464477539063e-08, + "completion_length": 727.0, + "delta_ref_entropy_loss": -0.216796875, + "delta_ref_ppl": -0.036865234375, + "entropy_loss": -0.609375, + "epoch": 0.919, + "grad_norm": 3.9597833590975244, + "k1_kl": 0.036376953125, + "k3_kl": 0.064453125, + "kimi_kl": 0.11279296875, + "learning_rate": 4.05e-08, + "loss": 0.0026, + "ppl": 0.28125, + "reward": 0.7792178988456726, + "reward_std": 0.33387961983680725, + "rewards/perpo_ocr_edit_distance_reward": 0.7792179584503174, + "step": 4595, + "temperature": 0.9 + }, + { + "advantages": -8.59158444654895e-06, + "completion_length": 510.0, + "delta_ref_entropy_loss": -0.00125885009765625, + "delta_ref_ppl": -0.045654296875, + "entropy_loss": -0.08203125, + "epoch": 0.9192, + "grad_norm": 1.072805333532971, + "k1_kl": 0.045654296875, + "k3_kl": 0.0361328125, + "kimi_kl": 0.10205078125, + "learning_rate": 4.04e-08, + "loss": 0.0015, + "ppl": 0.033203125, + "reward": 0.9841284155845642, + "reward_std": 0.0018867854960262775, + "rewards/perpo_ocr_edit_distance_reward": 0.984128475189209, + "step": 4596, + "temperature": 0.9 + }, + { + "advantages": -6.68082939228043e-05, + "completion_length": 583.0, + "delta_ref_entropy_loss": 0.0274658203125, + "delta_ref_ppl": -0.09130859375, + "entropy_loss": -0.06298828125, + "epoch": 0.9194, + "grad_norm": 0.5227178347428603, + "k1_kl": 0.09130859375, + "k3_kl": 0.0693359375, + "kimi_kl": 0.21484375, + "learning_rate": 4.0300000000000004e-08, + "loss": 0.0029, + "ppl": 0.022705078125, + "reward": 0.994567334651947, + "reward_std": 0.0011746063828468323, + "rewards/perpo_ocr_edit_distance_reward": 0.9945673942565918, + "step": 4597, + "temperature": 0.9 + }, + { + "advantages": -9.73003261606209e-05, + "completion_length": 410.0, + "delta_ref_entropy_loss": 0.0262451171875, + "delta_ref_ppl": -0.08447265625, + "entropy_loss": -0.06396484375, + "epoch": 0.9196, + "grad_norm": 0.6469757468834297, + "k1_kl": 0.08447265625, + "k3_kl": 0.055908203125, + "kimi_kl": 0.1552734375, + "learning_rate": 4.0199999999999996e-08, + "loss": 0.0023, + "ppl": 0.022705078125, + "reward": 0.995699942111969, + "reward_std": 0.0009500087471678853, + "rewards/perpo_ocr_edit_distance_reward": 0.9957000613212585, + "step": 4598, + "temperature": 0.9 + }, + { + "advantages": -4.96293832839001e-05, + "completion_length": 257.0, + "delta_ref_entropy_loss": 0.0133056640625, + "delta_ref_ppl": -0.1396484375, + "entropy_loss": -0.06884765625, + "epoch": 0.9198, + "grad_norm": 3.0249359611371225, + "k1_kl": 0.1396484375, + "k3_kl": 0.107421875, + "kimi_kl": 0.640625, + "learning_rate": 4.0099999999999995e-08, + "loss": 0.0043, + "ppl": 0.0262451171875, + "reward": 0.9858784079551697, + "reward_std": 0.0012727968860417604, + "rewards/perpo_ocr_edit_distance_reward": 0.985878586769104, + "step": 4599, + "temperature": 0.9 + }, + { + "advantages": -4.257474817137563e-09, + "completion_length": 2048.0, + "delta_ref_entropy_loss": -0.15625, + "delta_ref_ppl": -0.019775390625, + "entropy_loss": -0.35546875, + "epoch": 0.92, + "grad_norm": 2.364192437926188, + "k1_kl": 0.0198974609375, + "k3_kl": 0.0419921875, + "kimi_kl": 0.0712890625, + "learning_rate": 4e-08, + "loss": 0.0017, + "ppl": 0.16796875, + "reward": 0.5718325972557068, + "reward_std": 0.07229337841272354, + "rewards/perpo_ocr_edit_distance_reward": 0.5718325972557068, + "step": 4600, + "temperature": 0.9 + }, + { + "advantages": -3.6341803934192285e-05, + "completion_length": 898.0, + "delta_ref_entropy_loss": 0.00811767578125, + "delta_ref_ppl": -0.0269775390625, + "entropy_loss": -0.0361328125, + "epoch": 0.9202, + "grad_norm": 0.41179319775577344, + "k1_kl": 0.02685546875, + "k3_kl": 0.0179443359375, + "kimi_kl": 0.0458984375, + "learning_rate": 3.9899999999999993e-08, + "loss": 0.0008, + "ppl": 0.01251220703125, + "reward": 0.9963409900665283, + "reward_std": 0.001071671606041491, + "rewards/perpo_ocr_edit_distance_reward": 0.9963409900665283, + "step": 4601, + "temperature": 0.9 + }, + { + "advantages": -5.095345841255039e-05, + "completion_length": 621.0, + "delta_ref_entropy_loss": -0.037109375, + "delta_ref_ppl": -0.0791015625, + "entropy_loss": -0.1572265625, + "epoch": 0.9204, + "grad_norm": 0.8212351741528531, + "k1_kl": 0.07958984375, + "k3_kl": 0.0654296875, + "kimi_kl": 0.2294921875, + "learning_rate": 3.98e-08, + "loss": 0.0027, + "ppl": 0.043701171875, + "reward": 0.995186448097229, + "reward_std": 0.0019055475713685155, + "rewards/perpo_ocr_edit_distance_reward": 0.9951865673065186, + "step": 4602, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 672.0, + "delta_ref_entropy_loss": 0.0206298828125, + "delta_ref_ppl": -0.042236328125, + "entropy_loss": -0.054931640625, + "epoch": 0.9206, + "grad_norm": 0.31668286251909356, + "k1_kl": 0.0419921875, + "k3_kl": 0.0257568359375, + "kimi_kl": 0.0751953125, + "learning_rate": 3.97e-08, + "loss": 0.001, + "ppl": 0.01806640625, + "reward": 0.9986059665679932, + "reward_std": 0.0005180721054784954, + "rewards/perpo_ocr_edit_distance_reward": 0.9986060261726379, + "step": 4603, + "temperature": 0.9 + }, + { + "advantages": -6.026242772350088e-05, + "completion_length": 716.0, + "delta_ref_entropy_loss": 0.0218505859375, + "delta_ref_ppl": -0.058349609375, + "entropy_loss": -0.07470703125, + "epoch": 0.9208, + "grad_norm": 0.4897254894774864, + "k1_kl": 0.058349609375, + "k3_kl": 0.041748046875, + "kimi_kl": 0.1298828125, + "learning_rate": 3.9600000000000004e-08, + "loss": 0.0017, + "ppl": 0.02734375, + "reward": 0.9974004626274109, + "reward_std": 0.0013132310705259442, + "rewards/perpo_ocr_edit_distance_reward": 0.9974005818367004, + "step": 4604, + "temperature": 0.9 + }, + { + "advantages": -0.0003676925553008914, + "completion_length": 587.0, + "delta_ref_entropy_loss": 0.0081787109375, + "delta_ref_ppl": -0.04052734375, + "entropy_loss": -0.0341796875, + "epoch": 0.921, + "grad_norm": 0.2889007460162422, + "k1_kl": 0.040771484375, + "k3_kl": 0.0281982421875, + "kimi_kl": 0.119140625, + "learning_rate": 3.9499999999999996e-08, + "loss": 0.0015, + "ppl": 0.01025390625, + "reward": 0.9986175298690796, + "reward_std": 0.0002473549684509635, + "rewards/perpo_ocr_edit_distance_reward": 0.9986176490783691, + "step": 4605, + "temperature": 0.9 + }, + { + "advantages": -4.4175558286951855e-05, + "completion_length": 570.0, + "delta_ref_entropy_loss": 0.036376953125, + "delta_ref_ppl": -0.0478515625, + "entropy_loss": -0.0771484375, + "epoch": 0.9212, + "grad_norm": 0.9184040064913089, + "k1_kl": 0.0478515625, + "k3_kl": 0.03271484375, + "kimi_kl": 0.07177734375, + "learning_rate": 3.9399999999999995e-08, + "loss": 0.0014, + "ppl": 0.0291748046875, + "reward": 0.9591836333274841, + "reward_std": 0.001827614731155336, + "rewards/perpo_ocr_edit_distance_reward": 0.9591837525367737, + "step": 4606, + "temperature": 0.9 + }, + { + "advantages": -1.08139856820344e-05, + "completion_length": 527.0, + "delta_ref_entropy_loss": 0.025146484375, + "delta_ref_ppl": -0.049560546875, + "entropy_loss": -0.052978515625, + "epoch": 0.9214, + "grad_norm": 0.6602371834748691, + "k1_kl": 0.049560546875, + "k3_kl": 0.0283203125, + "kimi_kl": 0.0703125, + "learning_rate": 3.93e-08, + "loss": 0.0011, + "ppl": 0.0166015625, + "reward": 0.991712212562561, + "reward_std": 0.0006891122320666909, + "rewards/perpo_ocr_edit_distance_reward": 0.991712212562561, + "step": 4607, + "temperature": 0.9 + }, + { + "advantages": -7.092953183018835e-06, + "completion_length": 737.0, + "delta_ref_entropy_loss": -0.1142578125, + "delta_ref_ppl": -0.0712890625, + "entropy_loss": -0.3984375, + "epoch": 0.9216, + "grad_norm": 1.5664381676418606, + "k1_kl": 0.07177734375, + "k3_kl": 0.07080078125, + "kimi_kl": 0.1767578125, + "learning_rate": 3.9199999999999994e-08, + "loss": 0.0028, + "ppl": 0.1904296875, + "reward": 0.9448981285095215, + "reward_std": 0.007098186295479536, + "rewards/perpo_ocr_edit_distance_reward": 0.944898247718811, + "step": 4608, + "temperature": 0.9 + }, + { + "advantages": -0.0002580881118774414, + "completion_length": 899.0, + "delta_ref_entropy_loss": 0.02197265625, + "delta_ref_ppl": -0.040283203125, + "entropy_loss": -0.03564453125, + "epoch": 0.9218, + "grad_norm": 0.2145713774332865, + "k1_kl": 0.040283203125, + "k3_kl": 0.0240478515625, + "kimi_kl": 0.0751953125, + "learning_rate": 3.91e-08, + "loss": 0.0012, + "ppl": 0.00958251953125, + "reward": 0.9977334141731262, + "reward_std": 0.0003618453338276595, + "rewards/perpo_ocr_edit_distance_reward": 0.9977335929870605, + "step": 4609, + "temperature": 0.9 + }, + { + "advantages": -1.697880907158833e-05, + "completion_length": 545.0, + "delta_ref_entropy_loss": 0.04345703125, + "delta_ref_ppl": -0.08984375, + "entropy_loss": -0.1640625, + "epoch": 0.922, + "grad_norm": 1.6917986759409698, + "k1_kl": 0.08984375, + "k3_kl": 0.05859375, + "kimi_kl": 0.1875, + "learning_rate": 3.9e-08, + "loss": 0.0024, + "ppl": 0.061767578125, + "reward": 0.9571552276611328, + "reward_std": 0.0019036999437958002, + "rewards/perpo_ocr_edit_distance_reward": 0.9571552872657776, + "step": 4610, + "temperature": 0.9 + }, + { + "advantages": -4.777738286065869e-05, + "completion_length": 463.0, + "delta_ref_entropy_loss": 0.039794921875, + "delta_ref_ppl": -0.1201171875, + "entropy_loss": -0.06591796875, + "epoch": 0.9222, + "grad_norm": 0.698566593128702, + "k1_kl": 0.1201171875, + "k3_kl": 0.08447265625, + "kimi_kl": 0.37890625, + "learning_rate": 3.89e-08, + "loss": 0.0034, + "ppl": 0.020263671875, + "reward": 0.9966879487037659, + "reward_std": 0.0013254747027531266, + "rewards/perpo_ocr_edit_distance_reward": 0.9966880679130554, + "step": 4611, + "temperature": 0.9 + }, + { + "advantages": -3.354890213813633e-05, + "completion_length": 688.0, + "delta_ref_entropy_loss": -0.0018463134765625, + "delta_ref_ppl": -0.0478515625, + "entropy_loss": -0.0517578125, + "epoch": 0.9224, + "grad_norm": 0.524187253262137, + "k1_kl": 0.048095703125, + "k3_kl": 0.040771484375, + "kimi_kl": 0.1171875, + "learning_rate": 3.88e-08, + "loss": 0.0017, + "ppl": 0.01953125, + "reward": 0.9841163754463196, + "reward_std": 0.0024369892198592424, + "rewards/perpo_ocr_edit_distance_reward": 0.9841164350509644, + "step": 4612, + "temperature": 0.9 + }, + { + "advantages": 7.220677161967615e-06, + "completion_length": 897.0, + "delta_ref_entropy_loss": 0.02587890625, + "delta_ref_ppl": -0.037353515625, + "entropy_loss": -0.039794921875, + "epoch": 0.9226, + "grad_norm": 0.4936829471048626, + "k1_kl": 0.037109375, + "k3_kl": 0.0196533203125, + "kimi_kl": 0.04833984375, + "learning_rate": 3.8699999999999996e-08, + "loss": 0.0008, + "ppl": 0.01171875, + "reward": 0.9848378896713257, + "reward_std": 0.0010762755991891026, + "rewards/perpo_ocr_edit_distance_reward": 0.9848378300666809, + "step": 4613, + "temperature": 0.9 + }, + { + "advantages": -6.148219108581543e-05, + "completion_length": 540.0, + "delta_ref_entropy_loss": 0.023193359375, + "delta_ref_ppl": -0.060302734375, + "entropy_loss": -0.08349609375, + "epoch": 0.9228, + "grad_norm": 0.6452296522783129, + "k1_kl": 0.060302734375, + "k3_kl": 0.038330078125, + "kimi_kl": 0.09326171875, + "learning_rate": 3.86e-08, + "loss": 0.0016, + "ppl": 0.024169921875, + "reward": 0.9910627007484436, + "reward_std": 0.0007311947410926223, + "rewards/perpo_ocr_edit_distance_reward": 0.9910627603530884, + "step": 4614, + "temperature": 0.9 + }, + { + "advantages": -4.9216409934160765e-06, + "completion_length": 216.0, + "delta_ref_entropy_loss": -0.05126953125, + "delta_ref_ppl": -0.275390625, + "entropy_loss": -0.310546875, + "epoch": 0.923, + "grad_norm": 7.8594601434599225, + "k1_kl": 0.275390625, + "k3_kl": 0.21875, + "kimi_kl": 0.67578125, + "learning_rate": 3.85e-08, + "loss": 0.0088, + "ppl": 0.1181640625, + "reward": 0.5272634625434875, + "reward_std": 0.010277117602527142, + "rewards/perpo_ocr_edit_distance_reward": 0.5272635221481323, + "step": 4615, + "temperature": 0.9 + }, + { + "advantages": -8.514949634275126e-09, + "completion_length": 1539.0, + "delta_ref_entropy_loss": -0.208984375, + "delta_ref_ppl": -0.025390625, + "entropy_loss": -0.3671875, + "epoch": 0.9232, + "grad_norm": 3.2815724306165936, + "k1_kl": 0.0252685546875, + "k3_kl": 0.06298828125, + "kimi_kl": 0.12255859375, + "learning_rate": 3.839999999999999e-08, + "loss": 0.0025, + "ppl": 0.1416015625, + "reward": 0.6264941692352295, + "reward_std": 0.036959268152713776, + "rewards/perpo_ocr_edit_distance_reward": 0.6264942288398743, + "step": 4616, + "temperature": 0.9 + }, + { + "advantages": -2.6362284188508056e-05, + "completion_length": 962.0, + "delta_ref_entropy_loss": 0.0026092529296875, + "delta_ref_ppl": -0.031982421875, + "entropy_loss": -0.053466796875, + "epoch": 0.9234, + "grad_norm": 0.42955088237320443, + "k1_kl": 0.031982421875, + "k3_kl": 0.0185546875, + "kimi_kl": 0.0546875, + "learning_rate": 3.83e-08, + "loss": 0.0008, + "ppl": 0.015869140625, + "reward": 0.9985893368721008, + "reward_std": 0.0005456013022921979, + "rewards/perpo_ocr_edit_distance_reward": 0.9985893368721008, + "step": 4617, + "temperature": 0.9 + }, + { + "advantages": -0.0002295119484188035, + "completion_length": 563.0, + "delta_ref_entropy_loss": 0.026611328125, + "delta_ref_ppl": -0.09033203125, + "entropy_loss": -0.07958984375, + "epoch": 0.9236, + "grad_norm": 0.5467943307249682, + "k1_kl": 0.08984375, + "k3_kl": 0.064453125, + "kimi_kl": 0.25390625, + "learning_rate": 3.82e-08, + "loss": 0.0028, + "ppl": 0.0224609375, + "reward": 0.9949259757995605, + "reward_std": 0.00034519899054430425, + "rewards/perpo_ocr_edit_distance_reward": 0.9949260950088501, + "step": 4618, + "temperature": 0.9 + }, + { + "advantages": -6.454331742133945e-05, + "completion_length": 536.0, + "delta_ref_entropy_loss": 0.032958984375, + "delta_ref_ppl": -0.11181640625, + "entropy_loss": -0.08837890625, + "epoch": 0.9238, + "grad_norm": 0.7175629577340606, + "k1_kl": 0.11181640625, + "k3_kl": 0.07470703125, + "kimi_kl": 0.2578125, + "learning_rate": 3.81e-08, + "loss": 0.0031, + "ppl": 0.0244140625, + "reward": 0.9585462808609009, + "reward_std": 0.0008231361862272024, + "rewards/perpo_ocr_edit_distance_reward": 0.9585464000701904, + "step": 4619, + "temperature": 0.9 + }, + { + "advantages": -5.187307397136465e-05, + "completion_length": 1001.0, + "delta_ref_entropy_loss": 0.0194091796875, + "delta_ref_ppl": -0.05908203125, + "entropy_loss": -0.07177734375, + "epoch": 0.924, + "grad_norm": 0.6521120196475532, + "k1_kl": 0.05908203125, + "k3_kl": 0.03515625, + "kimi_kl": 0.09130859375, + "learning_rate": 3.7999999999999996e-08, + "loss": 0.0015, + "ppl": 0.0223388671875, + "reward": 0.9905058741569519, + "reward_std": 0.0003924958291463554, + "rewards/perpo_ocr_edit_distance_reward": 0.9905058741569519, + "step": 4620, + "temperature": 0.9 + }, + { + "advantages": -6.004444367135875e-05, + "completion_length": 444.0, + "delta_ref_entropy_loss": 0.01507568359375, + "delta_ref_ppl": -0.05908203125, + "entropy_loss": -0.04736328125, + "epoch": 0.9242, + "grad_norm": 0.6001532401827533, + "k1_kl": 0.05908203125, + "k3_kl": 0.04248046875, + "kimi_kl": 0.140625, + "learning_rate": 3.79e-08, + "loss": 0.0018, + "ppl": 0.015625, + "reward": 0.7042041420936584, + "reward_std": 0.0007507652044296265, + "rewards/perpo_ocr_edit_distance_reward": 0.704204261302948, + "step": 4621, + "temperature": 0.9 + }, + { + "advantages": -0.00013245856098365039, + "completion_length": 1419.0, + "delta_ref_entropy_loss": 0.02783203125, + "delta_ref_ppl": -0.058349609375, + "entropy_loss": -0.08203125, + "epoch": 0.9244, + "grad_norm": 1.4020393095141803, + "k1_kl": 0.058837890625, + "k3_kl": 0.04248046875, + "kimi_kl": 0.107421875, + "learning_rate": 3.78e-08, + "loss": 0.0018, + "ppl": 0.038330078125, + "reward": 0.9708070158958435, + "reward_std": 0.0007998943910934031, + "rewards/perpo_ocr_edit_distance_reward": 0.9708071351051331, + "step": 4622, + "temperature": 0.9 + }, + { + "advantages": 1.3623919414840202e-07, + "completion_length": 102.0, + "delta_ref_entropy_loss": -0.859375, + "delta_ref_ppl": -0.318359375, + "entropy_loss": -1.8515625, + "epoch": 0.9246, + "grad_norm": 9.845871595112053, + "k1_kl": 0.31640625, + "k3_kl": 0.37890625, + "kimi_kl": 1.0625, + "learning_rate": 3.7699999999999993e-08, + "loss": 0.0152, + "ppl": 0.77734375, + "reward": 0.29334092140197754, + "reward_std": 0.04396527633070946, + "rewards/perpo_ocr_edit_distance_reward": 0.29334089159965515, + "step": 4623, + "temperature": 0.9 + }, + { + "advantages": -7.322856845348724e-07, + "completion_length": 596.0, + "delta_ref_entropy_loss": -0.62890625, + "delta_ref_ppl": -0.051513671875, + "entropy_loss": -1.15625, + "epoch": 0.9248, + "grad_norm": 5.769358403586279, + "k1_kl": 0.0517578125, + "k3_kl": 0.17578125, + "kimi_kl": 0.3359375, + "learning_rate": 3.76e-08, + "loss": 0.0071, + "ppl": 0.52734375, + "reward": 0.8390147089958191, + "reward_std": 0.04616616666316986, + "rewards/perpo_ocr_edit_distance_reward": 0.8390147686004639, + "step": 4624, + "temperature": 0.9 + }, + { + "advantages": -8.153064118232578e-05, + "completion_length": 678.0, + "delta_ref_entropy_loss": 0.0037384033203125, + "delta_ref_ppl": -0.030517578125, + "entropy_loss": -0.039306640625, + "epoch": 0.925, + "grad_norm": 0.3487135309109632, + "k1_kl": 0.0306396484375, + "k3_kl": 0.0283203125, + "kimi_kl": 0.076171875, + "learning_rate": 3.75e-08, + "loss": 0.0012, + "ppl": 0.0106201171875, + "reward": 0.9955602884292603, + "reward_std": 0.0008401861414313316, + "rewards/perpo_ocr_edit_distance_reward": 0.9955604076385498, + "step": 4625, + "temperature": 0.9 + }, + { + "advantages": -0.00015255383914336562, + "completion_length": 756.0, + "delta_ref_entropy_loss": 0.037109375, + "delta_ref_ppl": -0.055908203125, + "entropy_loss": -0.053466796875, + "epoch": 0.9252, + "grad_norm": 0.3842973395224713, + "k1_kl": 0.055908203125, + "k3_kl": 0.031005859375, + "kimi_kl": 0.07373046875, + "learning_rate": 3.7400000000000004e-08, + "loss": 0.0014, + "ppl": 0.0174560546875, + "reward": 0.9961127042770386, + "reward_std": 0.000458112801425159, + "rewards/perpo_ocr_edit_distance_reward": 0.9961128234863281, + "step": 4626, + "temperature": 0.9 + }, + { + "advantages": -2.302442408108618e-05, + "completion_length": 588.0, + "delta_ref_entropy_loss": 0.04150390625, + "delta_ref_ppl": -0.08642578125, + "entropy_loss": -0.057373046875, + "epoch": 0.9254, + "grad_norm": 0.2691153260422745, + "k1_kl": 0.08642578125, + "k3_kl": 0.05859375, + "kimi_kl": 0.2294921875, + "learning_rate": 3.7299999999999997e-08, + "loss": 0.0024, + "ppl": 0.0152587890625, + "reward": 0.8033879399299622, + "reward_std": 0.0006396571407094598, + "rewards/perpo_ocr_edit_distance_reward": 0.8033879399299622, + "step": 4627, + "temperature": 0.9 + }, + { + "advantages": -2.539158049330581e-05, + "completion_length": 153.0, + "delta_ref_entropy_loss": 0.026611328125, + "delta_ref_ppl": -0.1845703125, + "entropy_loss": -0.0849609375, + "epoch": 0.9256, + "grad_norm": 1.6375893862016484, + "k1_kl": 0.1845703125, + "k3_kl": 0.1474609375, + "kimi_kl": 0.60546875, + "learning_rate": 3.7199999999999996e-08, + "loss": 0.0059, + "ppl": 0.0341796875, + "reward": 0.98045814037323, + "reward_std": 0.0032550368923693895, + "rewards/perpo_ocr_edit_distance_reward": 0.9804583191871643, + "step": 4628, + "temperature": 0.9 + }, + { + "advantages": 1.3998576832818799e-05, + "completion_length": 627.0, + "delta_ref_entropy_loss": 0.0186767578125, + "delta_ref_ppl": -0.0625, + "entropy_loss": -0.0615234375, + "epoch": 0.9258, + "grad_norm": 0.35685204843217977, + "k1_kl": 0.06298828125, + "k3_kl": 0.036865234375, + "kimi_kl": 0.09375, + "learning_rate": 3.71e-08, + "loss": 0.0015, + "ppl": 0.0184326171875, + "reward": 0.9836679100990295, + "reward_std": 0.0005083620199002326, + "rewards/perpo_ocr_edit_distance_reward": 0.9836679697036743, + "step": 4629, + "temperature": 0.9 + }, + { + "advantages": -4.700252247857861e-05, + "completion_length": 661.0, + "delta_ref_entropy_loss": 0.0303955078125, + "delta_ref_ppl": -0.059814453125, + "entropy_loss": -0.052978515625, + "epoch": 0.926, + "grad_norm": 0.5001887613185126, + "k1_kl": 0.0595703125, + "k3_kl": 0.04052734375, + "kimi_kl": 0.1435546875, + "learning_rate": 3.6999999999999994e-08, + "loss": 0.0017, + "ppl": 0.017333984375, + "reward": 0.9942616820335388, + "reward_std": 0.0015302321407943964, + "rewards/perpo_ocr_edit_distance_reward": 0.9942617416381836, + "step": 4630, + "temperature": 0.9 + }, + { + "advantages": -8.280788460979238e-05, + "completion_length": 778.0, + "delta_ref_entropy_loss": 0.033447265625, + "delta_ref_ppl": -0.07177734375, + "entropy_loss": -0.08447265625, + "epoch": 0.9262, + "grad_norm": 1.5822945366686123, + "k1_kl": 0.07177734375, + "k3_kl": 0.041259765625, + "kimi_kl": 0.107421875, + "learning_rate": 3.69e-08, + "loss": 0.0017, + "ppl": 0.029296875, + "reward": 0.9973330497741699, + "reward_std": 0.0009281007805839181, + "rewards/perpo_ocr_edit_distance_reward": 0.9973331689834595, + "step": 4631, + "temperature": 0.9 + }, + { + "advantages": 1.405818147759419e-05, + "completion_length": 829.0, + "delta_ref_entropy_loss": 0.01458740234375, + "delta_ref_ppl": -0.038330078125, + "entropy_loss": -0.040283203125, + "epoch": 0.9264, + "grad_norm": 0.4401326224618513, + "k1_kl": 0.038330078125, + "k3_kl": 0.028076171875, + "kimi_kl": 0.0849609375, + "learning_rate": 3.68e-08, + "loss": 0.0011, + "ppl": 0.0157470703125, + "reward": 0.9941304326057434, + "reward_std": 0.0011107673635706306, + "rewards/perpo_ocr_edit_distance_reward": 0.9941304326057434, + "step": 4632, + "temperature": 0.9 + }, + { + "advantages": -3.2561169064138085e-05, + "completion_length": 470.0, + "delta_ref_entropy_loss": -0.0147705078125, + "delta_ref_ppl": -0.09619140625, + "entropy_loss": -0.1923828125, + "epoch": 0.9266, + "grad_norm": 1.5448931305073592, + "k1_kl": 0.095703125, + "k3_kl": 0.07763671875, + "kimi_kl": 0.2099609375, + "learning_rate": 3.6700000000000004e-08, + "loss": 0.0031, + "ppl": 0.08544921875, + "reward": 0.9061132073402405, + "reward_std": 0.0025137995835393667, + "rewards/perpo_ocr_edit_distance_reward": 0.9061132669448853, + "step": 4633, + "temperature": 0.9 + }, + { + "advantages": -3.3395634090993553e-05, + "completion_length": 170.0, + "delta_ref_entropy_loss": 0.06982421875, + "delta_ref_ppl": -0.345703125, + "entropy_loss": -0.34765625, + "epoch": 0.9268, + "grad_norm": 3.0729802997764217, + "k1_kl": 0.34765625, + "k3_kl": 0.265625, + "kimi_kl": 0.96484375, + "learning_rate": 3.66e-08, + "loss": 0.0107, + "ppl": 0.1337890625, + "reward": 0.34874895215034485, + "reward_std": 0.0005373903550207615, + "rewards/perpo_ocr_edit_distance_reward": 0.3487490117549896, + "step": 4634, + "temperature": 0.9 + }, + { + "advantages": -4.96251268486958e-05, + "completion_length": 1415.0, + "delta_ref_entropy_loss": 0.039306640625, + "delta_ref_ppl": -0.04248046875, + "entropy_loss": -0.05810546875, + "epoch": 0.927, + "grad_norm": 0.4693776070615649, + "k1_kl": 0.042236328125, + "k3_kl": 0.0240478515625, + "kimi_kl": 0.0634765625, + "learning_rate": 3.6499999999999996e-08, + "loss": 0.001, + "ppl": 0.019775390625, + "reward": 0.9919905066490173, + "reward_std": 0.0009294866467826068, + "rewards/perpo_ocr_edit_distance_reward": 0.9919906258583069, + "step": 4635, + "temperature": 0.9 + }, + { + "advantages": 6.811959707420101e-08, + "completion_length": 1712.0, + "delta_ref_entropy_loss": -0.1044921875, + "delta_ref_ppl": -0.08056640625, + "entropy_loss": -0.6796875, + "epoch": 0.9272, + "grad_norm": 3.4068825865766397, + "k1_kl": 0.08154296875, + "k3_kl": 0.09130859375, + "kimi_kl": 0.1513671875, + "learning_rate": 3.64e-08, + "loss": 0.0036, + "ppl": 0.3515625, + "reward": 0.6394166946411133, + "reward_std": 0.06310126930475235, + "rewards/perpo_ocr_edit_distance_reward": 0.6394166946411133, + "step": 4636, + "temperature": 0.9 + }, + { + "advantages": -6.948198915779358e-06, + "completion_length": 397.0, + "delta_ref_entropy_loss": 0.050537109375, + "delta_ref_ppl": -0.0908203125, + "entropy_loss": -0.09716796875, + "epoch": 0.9274, + "grad_norm": 1.0744218409603563, + "k1_kl": 0.0908203125, + "k3_kl": 0.056396484375, + "kimi_kl": 0.1474609375, + "learning_rate": 3.6299999999999994e-08, + "loss": 0.0023, + "ppl": 0.036376953125, + "reward": 0.9941691160202026, + "reward_std": 0.002343097236007452, + "rewards/perpo_ocr_edit_distance_reward": 0.9941691160202026, + "step": 4637, + "temperature": 0.9 + }, + { + "advantages": -1.2516975402832031e-06, + "completion_length": 2048.0, + "delta_ref_entropy_loss": -0.02099609375, + "delta_ref_ppl": -0.0301513671875, + "entropy_loss": -0.09130859375, + "epoch": 0.9276, + "grad_norm": 0.902390758691859, + "k1_kl": 0.0302734375, + "k3_kl": 0.0264892578125, + "kimi_kl": 0.08203125, + "learning_rate": 3.62e-08, + "loss": 0.0011, + "ppl": 0.03955078125, + "reward": 0.8919990658760071, + "reward_std": 0.013483414426445961, + "rewards/perpo_ocr_edit_distance_reward": 0.8919991254806519, + "step": 4638, + "temperature": 0.9 + }, + { + "advantages": 8.480889846396167e-06, + "completion_length": 435.0, + "delta_ref_entropy_loss": 0.015380859375, + "delta_ref_ppl": -0.07666015625, + "entropy_loss": -0.06103515625, + "epoch": 0.9278, + "grad_norm": 0.9315172722641306, + "k1_kl": 0.07666015625, + "k3_kl": 0.05517578125, + "kimi_kl": 0.169921875, + "learning_rate": 3.61e-08, + "loss": 0.0022, + "ppl": 0.0184326171875, + "reward": 0.9958732724189758, + "reward_std": 0.0019049590919166803, + "rewards/perpo_ocr_edit_distance_reward": 0.995873212814331, + "step": 4639, + "temperature": 0.9 + }, + { + "advantages": -5.269050961942412e-05, + "completion_length": 818.0, + "delta_ref_entropy_loss": -0.0244140625, + "delta_ref_ppl": -0.068359375, + "entropy_loss": -0.21484375, + "epoch": 0.928, + "grad_norm": 0.7804992689749252, + "k1_kl": 0.068359375, + "k3_kl": 0.05224609375, + "kimi_kl": 0.1494140625, + "learning_rate": 3.6e-08, + "loss": 0.0021, + "ppl": 0.07958984375, + "reward": 0.8758515119552612, + "reward_std": 0.0015158019959926605, + "rewards/perpo_ocr_edit_distance_reward": 0.8758516311645508, + "step": 4640, + "temperature": 0.9 + }, + { + "advantages": -8.509840699844062e-05, + "completion_length": 597.0, + "delta_ref_entropy_loss": 0.0240478515625, + "delta_ref_ppl": -0.06298828125, + "entropy_loss": -0.07080078125, + "epoch": 0.9282, + "grad_norm": 0.4066972109765967, + "k1_kl": 0.0625, + "k3_kl": 0.037353515625, + "kimi_kl": 0.0908203125, + "learning_rate": 3.59e-08, + "loss": 0.0016, + "ppl": 0.0255126953125, + "reward": 0.9980807900428772, + "reward_std": 0.0012009998317807913, + "rewards/perpo_ocr_edit_distance_reward": 0.998080849647522, + "step": 4641, + "temperature": 0.9 + }, + { + "advantages": -5.0536225899122655e-05, + "completion_length": 574.0, + "delta_ref_entropy_loss": 0.0264892578125, + "delta_ref_ppl": -0.048828125, + "entropy_loss": -0.060546875, + "epoch": 0.9284, + "grad_norm": 0.5924473489434364, + "k1_kl": 0.048828125, + "k3_kl": 0.0301513671875, + "kimi_kl": 0.07666015625, + "learning_rate": 3.5799999999999996e-08, + "loss": 0.0013, + "ppl": 0.022705078125, + "reward": 0.9952484965324402, + "reward_std": 0.0012479161377996206, + "rewards/perpo_ocr_edit_distance_reward": 0.995248556137085, + "step": 4642, + "temperature": 0.9 + }, + { + "advantages": -0.00010137898789253086, + "completion_length": 608.0, + "delta_ref_entropy_loss": 0.021240234375, + "delta_ref_ppl": -0.057861328125, + "entropy_loss": -0.06298828125, + "epoch": 0.9286, + "grad_norm": 0.4118419772559943, + "k1_kl": 0.057861328125, + "k3_kl": 0.043212890625, + "kimi_kl": 0.111328125, + "learning_rate": 3.57e-08, + "loss": 0.0018, + "ppl": 0.0196533203125, + "reward": 0.9963282346725464, + "reward_std": 0.0004879333428107202, + "rewards/perpo_ocr_edit_distance_reward": 0.9963282346725464, + "step": 4643, + "temperature": 0.9 + }, + { + "advantages": -5.480221443576738e-05, + "completion_length": 315.0, + "delta_ref_entropy_loss": 0.00142669677734375, + "delta_ref_ppl": -0.1025390625, + "entropy_loss": -0.05615234375, + "epoch": 0.9288, + "grad_norm": 0.6749490189310063, + "k1_kl": 0.1025390625, + "k3_kl": 0.08154296875, + "kimi_kl": 0.435546875, + "learning_rate": 3.56e-08, + "loss": 0.0033, + "ppl": 0.0184326171875, + "reward": 0.996370255947113, + "reward_std": 0.0009878851706162095, + "rewards/perpo_ocr_edit_distance_reward": 0.9963703155517578, + "step": 4644, + "temperature": 0.9 + }, + { + "advantages": -3.232275048503652e-05, + "completion_length": 411.0, + "delta_ref_entropy_loss": -0.019287109375, + "delta_ref_ppl": -0.10400390625, + "entropy_loss": -0.103515625, + "epoch": 0.929, + "grad_norm": 0.870452305871403, + "k1_kl": 0.10400390625, + "k3_kl": 0.076171875, + "kimi_kl": 0.2265625, + "learning_rate": 3.5499999999999994e-08, + "loss": 0.0031, + "ppl": 0.03857421875, + "reward": 0.9874429106712341, + "reward_std": 0.0017437435453757644, + "rewards/perpo_ocr_edit_distance_reward": 0.9874429702758789, + "step": 4645, + "temperature": 0.9 + }, + { + "advantages": -1.3913428119849414e-05, + "completion_length": 1077.0, + "delta_ref_entropy_loss": 0.0145263671875, + "delta_ref_ppl": -0.0419921875, + "entropy_loss": -0.047607421875, + "epoch": 0.9292, + "grad_norm": 0.44719684535187715, + "k1_kl": 0.0419921875, + "k3_kl": 0.029541015625, + "kimi_kl": 0.07861328125, + "learning_rate": 3.54e-08, + "loss": 0.0012, + "ppl": 0.0169677734375, + "reward": 0.9954545497894287, + "reward_std": 0.0005119852721691132, + "rewards/perpo_ocr_edit_distance_reward": 0.9954545497894287, + "step": 4646, + "temperature": 0.9 + }, + { + "advantages": -1.7200197817146545e-06, + "completion_length": 925.0, + "delta_ref_entropy_loss": -0.119140625, + "delta_ref_ppl": -0.035400390625, + "entropy_loss": -0.1982421875, + "epoch": 0.9294, + "grad_norm": 1.0411050783323126, + "k1_kl": 0.03564453125, + "k3_kl": 0.03857421875, + "kimi_kl": 0.109375, + "learning_rate": 3.53e-08, + "loss": 0.0015, + "ppl": 0.041748046875, + "reward": 0.9797145128250122, + "reward_std": 0.039265841245651245, + "rewards/perpo_ocr_edit_distance_reward": 0.979714572429657, + "step": 4647, + "temperature": 0.9 + }, + { + "advantages": -6.580352783203125e-05, + "completion_length": 464.0, + "delta_ref_entropy_loss": 0.0184326171875, + "delta_ref_ppl": -0.052734375, + "entropy_loss": -0.10302734375, + "epoch": 0.9296, + "grad_norm": 1.1697371234395597, + "k1_kl": 0.052978515625, + "k3_kl": 0.0419921875, + "kimi_kl": 0.09375, + "learning_rate": 3.52e-08, + "loss": 0.0017, + "ppl": 0.04345703125, + "reward": 0.9403767585754395, + "reward_std": 0.0006761380354873836, + "rewards/perpo_ocr_edit_distance_reward": 0.940376877784729, + "step": 4648, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 807.0, + "delta_ref_entropy_loss": -0.07177734375, + "delta_ref_ppl": -0.07177734375, + "entropy_loss": -0.54296875, + "epoch": 0.9298, + "grad_norm": 3.2303990406392513, + "k1_kl": 0.07177734375, + "k3_kl": 0.06689453125, + "kimi_kl": 0.1201171875, + "learning_rate": 3.5099999999999997e-08, + "loss": 0.0027, + "ppl": 0.2451171875, + "reward": 0.5240151882171631, + "reward_std": 0.007710022386163473, + "rewards/perpo_ocr_edit_distance_reward": 0.5240151882171631, + "step": 4649, + "temperature": 0.9 + }, + { + "advantages": -1.9499235349940136e-05, + "completion_length": 327.0, + "delta_ref_entropy_loss": 0.045654296875, + "delta_ref_ppl": -0.1650390625, + "entropy_loss": -0.1015625, + "epoch": 0.93, + "grad_norm": 0.610224636350531, + "k1_kl": 0.1650390625, + "k3_kl": 0.1259765625, + "kimi_kl": 0.455078125, + "learning_rate": 3.5e-08, + "loss": 0.0051, + "ppl": 0.0361328125, + "reward": 0.9898222088813782, + "reward_std": 0.000772336614318192, + "rewards/perpo_ocr_edit_distance_reward": 0.9898222088813782, + "step": 4650, + "temperature": 0.9 + }, + { + "advantages": -3.295285523563507e-06, + "completion_length": 72.0, + "delta_ref_entropy_loss": -0.01336669921875, + "delta_ref_ppl": -0.59375, + "entropy_loss": -0.2314453125, + "epoch": 0.9302, + "grad_norm": 3.75190553353274, + "k1_kl": 0.59375, + "k3_kl": 0.5, + "kimi_kl": 2.328125, + "learning_rate": 3.49e-08, + "loss": 0.0201, + "ppl": 0.076171875, + "reward": 0.9851469397544861, + "reward_std": 0.005069863982498646, + "rewards/perpo_ocr_edit_distance_reward": 0.9851470589637756, + "step": 4651, + "temperature": 0.9 + }, + { + "advantages": -1.1648450708889868e-05, + "completion_length": 88.0, + "delta_ref_entropy_loss": 0.038818359375, + "delta_ref_ppl": -0.380859375, + "entropy_loss": -0.1494140625, + "epoch": 0.9304, + "grad_norm": 2.846609033351967, + "k1_kl": 0.37890625, + "k3_kl": 0.306640625, + "kimi_kl": 1.3125, + "learning_rate": 3.4799999999999994e-08, + "loss": 0.0122, + "ppl": 0.0634765625, + "reward": 0.9792370796203613, + "reward_std": 0.005737167317420244, + "rewards/perpo_ocr_edit_distance_reward": 0.9792371392250061, + "step": 4652, + "temperature": 0.9 + }, + { + "advantages": -1.8409320546197705e-05, + "completion_length": 292.0, + "delta_ref_entropy_loss": 0.033935546875, + "delta_ref_ppl": -0.09375, + "entropy_loss": -0.05029296875, + "epoch": 0.9306, + "grad_norm": 0.730789559698056, + "k1_kl": 0.09375, + "k3_kl": 0.0703125, + "kimi_kl": 0.2451171875, + "learning_rate": 3.47e-08, + "loss": 0.0028, + "ppl": 0.0167236328125, + "reward": 0.9928475022315979, + "reward_std": 0.001287125633098185, + "rewards/perpo_ocr_edit_distance_reward": 0.9928475022315979, + "step": 4653, + "temperature": 0.9 + }, + { + "advantages": -0.00011164801981067285, + "completion_length": 441.0, + "delta_ref_entropy_loss": 0.0184326171875, + "delta_ref_ppl": -0.09375, + "entropy_loss": -0.07763671875, + "epoch": 0.9308, + "grad_norm": 0.26870797566389587, + "k1_kl": 0.09375, + "k3_kl": 0.06982421875, + "kimi_kl": 0.27734375, + "learning_rate": 3.46e-08, + "loss": 0.0029, + "ppl": 0.021728515625, + "reward": 0.9973440766334534, + "reward_std": 0.0006627426482737064, + "rewards/perpo_ocr_edit_distance_reward": 0.9973441958427429, + "step": 4654, + "temperature": 0.9 + }, + { + "advantages": -1.3283321095514111e-05, + "completion_length": 433.0, + "delta_ref_entropy_loss": 0.0272216796875, + "delta_ref_ppl": -0.0908203125, + "entropy_loss": -0.1103515625, + "epoch": 0.931, + "grad_norm": 1.00607907131717, + "k1_kl": 0.09033203125, + "k3_kl": 0.06396484375, + "kimi_kl": 0.1923828125, + "learning_rate": 3.4500000000000005e-08, + "loss": 0.0026, + "ppl": 0.042724609375, + "reward": 0.9835289120674133, + "reward_std": 0.0037535051815211773, + "rewards/perpo_ocr_edit_distance_reward": 0.9835289716720581, + "step": 4655, + "temperature": 0.9 + }, + { + "advantages": -3.065381974920456e-07, + "completion_length": 49.0, + "delta_ref_entropy_loss": -0.2265625, + "delta_ref_ppl": -0.7109375, + "entropy_loss": -0.9609375, + "epoch": 0.9312, + "grad_norm": 7.490574723055749, + "k1_kl": 0.71484375, + "k3_kl": 0.63671875, + "kimi_kl": 2.359375, + "learning_rate": 3.44e-08, + "loss": 0.0254, + "ppl": 0.400390625, + "reward": 0.8415456414222717, + "reward_std": 0.10529033094644547, + "rewards/perpo_ocr_edit_distance_reward": 0.8415457010269165, + "step": 4656, + "temperature": 0.9 + }, + { + "advantages": 1.7029899268550253e-08, + "completion_length": 1443.0, + "delta_ref_entropy_loss": 0.022216796875, + "delta_ref_ppl": -0.054443359375, + "entropy_loss": -0.09765625, + "epoch": 0.9314, + "grad_norm": 9.590800611280477, + "k1_kl": 0.05419921875, + "k3_kl": 0.046875, + "kimi_kl": 0.099609375, + "learning_rate": 3.4299999999999996e-08, + "loss": 0.0019, + "ppl": 0.051025390625, + "reward": 0.9837172031402588, + "reward_std": 0.0016227064188569784, + "rewards/perpo_ocr_edit_distance_reward": 0.9837172627449036, + "step": 4657, + "temperature": 0.9 + }, + { + "advantages": -1.2091228427379974e-06, + "completion_length": 101.0, + "delta_ref_entropy_loss": 0.01495361328125, + "delta_ref_ppl": -0.390625, + "entropy_loss": -0.10400390625, + "epoch": 0.9316, + "grad_norm": 3.2331024838779743, + "k1_kl": 0.390625, + "k3_kl": 0.3203125, + "kimi_kl": 1.5390625, + "learning_rate": 3.42e-08, + "loss": 0.0129, + "ppl": 0.0286865234375, + "reward": 0.9406392574310303, + "reward_std": 0.006974987685680389, + "rewards/perpo_ocr_edit_distance_reward": 0.9406392574310303, + "step": 4658, + "temperature": 0.9 + }, + { + "advantages": -5.231585237197578e-05, + "completion_length": 992.0, + "delta_ref_entropy_loss": 0.036865234375, + "delta_ref_ppl": -0.064453125, + "entropy_loss": -0.058837890625, + "epoch": 0.9318, + "grad_norm": 1.2505408146172763, + "k1_kl": 0.064453125, + "k3_kl": 0.04345703125, + "kimi_kl": 0.1357421875, + "learning_rate": 3.4099999999999994e-08, + "loss": 0.0018, + "ppl": 0.02392578125, + "reward": 0.9844993948936462, + "reward_std": 0.001202327897772193, + "rewards/perpo_ocr_edit_distance_reward": 0.984499454498291, + "step": 4659, + "temperature": 0.9 + }, + { + "advantages": -8.113256626529619e-05, + "completion_length": 816.0, + "delta_ref_entropy_loss": 0.03857421875, + "delta_ref_ppl": -0.0615234375, + "entropy_loss": -0.0751953125, + "epoch": 0.932, + "grad_norm": 0.7974375383377511, + "k1_kl": 0.061767578125, + "k3_kl": 0.039794921875, + "kimi_kl": 0.08056640625, + "learning_rate": 3.4e-08, + "loss": 0.0017, + "ppl": 0.0279541015625, + "reward": 0.9920063018798828, + "reward_std": 0.000529570912476629, + "rewards/perpo_ocr_edit_distance_reward": 0.9920064210891724, + "step": 4660, + "temperature": 0.9 + }, + { + "advantages": -3.55839729309082e-05, + "completion_length": 858.0, + "delta_ref_entropy_loss": 0.06201171875, + "delta_ref_ppl": -0.0986328125, + "entropy_loss": -0.055908203125, + "epoch": 0.9322, + "grad_norm": 1.0578559885651722, + "k1_kl": 0.0986328125, + "k3_kl": 0.0634765625, + "kimi_kl": 0.189453125, + "learning_rate": 3.39e-08, + "loss": 0.0026, + "ppl": 0.0234375, + "reward": 0.9951573610305786, + "reward_std": 0.0006181058124639094, + "rewards/perpo_ocr_edit_distance_reward": 0.9951574206352234, + "step": 4661, + "temperature": 0.9 + }, + { + "advantages": 1.2942723515152466e-06, + "completion_length": 67.0, + "delta_ref_entropy_loss": 0.047607421875, + "delta_ref_ppl": -0.404296875, + "entropy_loss": -0.255859375, + "epoch": 0.9324, + "grad_norm": 7.342884097914984, + "k1_kl": 0.404296875, + "k3_kl": 0.31640625, + "kimi_kl": 1.2265625, + "learning_rate": 3.38e-08, + "loss": 0.0126, + "ppl": 0.10107421875, + "reward": 0.3778376281261444, + "reward_std": 0.009977380745112896, + "rewards/perpo_ocr_edit_distance_reward": 0.3778376281261444, + "step": 4662, + "temperature": 0.9 + }, + { + "advantages": -2.3228782083606347e-05, + "completion_length": 1007.0, + "delta_ref_entropy_loss": 0.03662109375, + "delta_ref_ppl": -0.05859375, + "entropy_loss": -0.044189453125, + "epoch": 0.9326, + "grad_norm": 0.1772045991821968, + "k1_kl": 0.058349609375, + "k3_kl": 0.03515625, + "kimi_kl": 0.10498046875, + "learning_rate": 3.37e-08, + "loss": 0.0014, + "ppl": 0.01226806640625, + "reward": 0.9834639430046082, + "reward_std": 0.0002664092753548175, + "rewards/perpo_ocr_edit_distance_reward": 0.9834640026092529, + "step": 4663, + "temperature": 0.9 + }, + { + "advantages": -4.5299530029296875e-06, + "completion_length": 904.0, + "delta_ref_entropy_loss": 0.02783203125, + "delta_ref_ppl": -0.08642578125, + "entropy_loss": -0.125, + "epoch": 0.9328, + "grad_norm": 1.120921110367033, + "k1_kl": 0.08642578125, + "k3_kl": 0.06201171875, + "kimi_kl": 0.134765625, + "learning_rate": 3.3599999999999996e-08, + "loss": 0.0025, + "ppl": 0.049072265625, + "reward": 0.8872180581092834, + "reward_std": 0.014890993945300579, + "rewards/perpo_ocr_edit_distance_reward": 0.8872181177139282, + "step": 4664, + "temperature": 0.9 + }, + { + "advantages": 1.8732889657258056e-06, + "completion_length": 1095.0, + "delta_ref_entropy_loss": -0.01092529296875, + "delta_ref_ppl": -0.037353515625, + "entropy_loss": -0.1005859375, + "epoch": 0.933, + "grad_norm": 0.7437787892893, + "k1_kl": 0.03759765625, + "k3_kl": 0.0306396484375, + "kimi_kl": 0.1005859375, + "learning_rate": 3.35e-08, + "loss": 0.0012, + "ppl": 0.034912109375, + "reward": 0.9838807582855225, + "reward_std": 0.008950313553214073, + "rewards/perpo_ocr_edit_distance_reward": 0.983880877494812, + "step": 4665, + "temperature": 0.9 + }, + { + "advantages": -2.384185791015625e-06, + "completion_length": 513.0, + "delta_ref_entropy_loss": -0.00823974609375, + "delta_ref_ppl": -0.0986328125, + "entropy_loss": -0.2431640625, + "epoch": 0.9332, + "grad_norm": 1.5343293485062799, + "k1_kl": 0.0986328125, + "k3_kl": 0.07763671875, + "kimi_kl": 0.1640625, + "learning_rate": 3.3399999999999995e-08, + "loss": 0.0031, + "ppl": 0.1015625, + "reward": 0.8783286809921265, + "reward_std": 0.0034378061536699533, + "rewards/perpo_ocr_edit_distance_reward": 0.8783286809921265, + "step": 4666, + "temperature": 0.9 + }, + { + "advantages": -6.639957427978516e-05, + "completion_length": 569.0, + "delta_ref_entropy_loss": 0.028564453125, + "delta_ref_ppl": -0.05615234375, + "entropy_loss": -0.047607421875, + "epoch": 0.9334, + "grad_norm": 0.27026135357675707, + "k1_kl": 0.05615234375, + "k3_kl": 0.033447265625, + "kimi_kl": 0.0947265625, + "learning_rate": 3.33e-08, + "loss": 0.0014, + "ppl": 0.0120849609375, + "reward": 0.9969505667686462, + "reward_std": 0.0002846640709321946, + "rewards/perpo_ocr_edit_distance_reward": 0.996950626373291, + "step": 4667, + "temperature": 0.9 + }, + { + "advantages": 1.685959978203755e-05, + "completion_length": 672.0, + "delta_ref_entropy_loss": 0.033447265625, + "delta_ref_ppl": -0.06298828125, + "entropy_loss": -0.055908203125, + "epoch": 0.9336, + "grad_norm": 0.2765872782234275, + "k1_kl": 0.06298828125, + "k3_kl": 0.042724609375, + "kimi_kl": 0.12060546875, + "learning_rate": 3.32e-08, + "loss": 0.0017, + "ppl": 0.017822265625, + "reward": 0.9956303834915161, + "reward_std": 0.00040444190381094813, + "rewards/perpo_ocr_edit_distance_reward": 0.9956304430961609, + "step": 4668, + "temperature": 0.9 + }, + { + "advantages": -3.405128518352285e-05, + "completion_length": 452.0, + "delta_ref_entropy_loss": 0.0281982421875, + "delta_ref_ppl": -0.0576171875, + "entropy_loss": -0.051513671875, + "epoch": 0.9338, + "grad_norm": 0.8267014424416957, + "k1_kl": 0.057373046875, + "k3_kl": 0.0361328125, + "kimi_kl": 0.1162109375, + "learning_rate": 3.31e-08, + "loss": 0.0015, + "ppl": 0.01544189453125, + "reward": 0.9936985373497009, + "reward_std": 0.0011501130647957325, + "rewards/perpo_ocr_edit_distance_reward": 0.9936986565589905, + "step": 4669, + "temperature": 0.9 + }, + { + "advantages": -2.4216516976593994e-05, + "completion_length": 357.0, + "delta_ref_entropy_loss": 0.01190185546875, + "delta_ref_ppl": -0.130859375, + "entropy_loss": -0.09423828125, + "epoch": 0.934, + "grad_norm": 0.8287536049961207, + "k1_kl": 0.130859375, + "k3_kl": 0.10107421875, + "kimi_kl": 0.41796875, + "learning_rate": 3.3e-08, + "loss": 0.0041, + "ppl": 0.032958984375, + "reward": 0.9935166239738464, + "reward_std": 0.0013060903875157237, + "rewards/perpo_ocr_edit_distance_reward": 0.9935166835784912, + "step": 4670, + "temperature": 0.9 + }, + { + "advantages": -0.00018119813466910273, + "completion_length": 422.0, + "delta_ref_entropy_loss": 0.0181884765625, + "delta_ref_ppl": -0.064453125, + "entropy_loss": -0.044677734375, + "epoch": 0.9342, + "grad_norm": 0.4029814395972284, + "k1_kl": 0.064453125, + "k3_kl": 0.043212890625, + "kimi_kl": 0.130859375, + "learning_rate": 3.29e-08, + "loss": 0.0019, + "ppl": 0.0125732421875, + "reward": 0.8613767027854919, + "reward_std": 0.00027597311418503523, + "rewards/perpo_ocr_edit_distance_reward": 0.8613767623901367, + "step": 4671, + "temperature": 0.9 + }, + { + "advantages": -1.411778612236958e-05, + "completion_length": 298.0, + "delta_ref_entropy_loss": 0.050537109375, + "delta_ref_ppl": -0.1220703125, + "entropy_loss": -0.08447265625, + "epoch": 0.9344, + "grad_norm": 1.2336988841847236, + "k1_kl": 0.12255859375, + "k3_kl": 0.08642578125, + "kimi_kl": 0.337890625, + "learning_rate": 3.28e-08, + "loss": 0.0035, + "ppl": 0.029541015625, + "reward": 0.9870145916938782, + "reward_std": 0.0011052575428038836, + "rewards/perpo_ocr_edit_distance_reward": 0.987014651298523, + "step": 4672, + "temperature": 0.9 + }, + { + "advantages": -3.473247852525674e-05, + "completion_length": 412.0, + "delta_ref_entropy_loss": 0.060302734375, + "delta_ref_ppl": -0.0908203125, + "entropy_loss": -0.052978515625, + "epoch": 0.9346, + "grad_norm": 0.43532366650186755, + "k1_kl": 0.0908203125, + "k3_kl": 0.056884765625, + "kimi_kl": 0.193359375, + "learning_rate": 3.27e-08, + "loss": 0.0023, + "ppl": 0.0147705078125, + "reward": 0.9901505708694458, + "reward_std": 0.0006356100784614682, + "rewards/perpo_ocr_edit_distance_reward": 0.9901505708694458, + "step": 4673, + "temperature": 0.9 + }, + { + "advantages": -5.023820222049835e-07, + "completion_length": 764.0, + "delta_ref_entropy_loss": -0.030517578125, + "delta_ref_ppl": -0.10498046875, + "entropy_loss": -0.2412109375, + "epoch": 0.9348, + "grad_norm": 1.161802686253262, + "k1_kl": 0.10546875, + "k3_kl": 0.08056640625, + "kimi_kl": 0.251953125, + "learning_rate": 3.2599999999999994e-08, + "loss": 0.0032, + "ppl": 0.123046875, + "reward": 0.5946389436721802, + "reward_std": 0.15854497253894806, + "rewards/perpo_ocr_edit_distance_reward": 0.594639003276825, + "step": 4674, + "temperature": 0.9 + }, + { + "advantages": -3.423009911784902e-05, + "completion_length": 706.0, + "delta_ref_entropy_loss": 0.0233154296875, + "delta_ref_ppl": -0.037109375, + "entropy_loss": -0.03662109375, + "epoch": 0.935, + "grad_norm": 0.5991506445932426, + "k1_kl": 0.037353515625, + "k3_kl": 0.0208740234375, + "kimi_kl": 0.052490234375, + "learning_rate": 3.25e-08, + "loss": 0.0009, + "ppl": 0.01129150390625, + "reward": 0.9986228942871094, + "reward_std": 0.0006466222112067044, + "rewards/perpo_ocr_edit_distance_reward": 0.9986229538917542, + "step": 4675, + "temperature": 0.9 + }, + { + "advantages": -2.3995127776288427e-05, + "completion_length": 802.0, + "delta_ref_entropy_loss": 0.020263671875, + "delta_ref_ppl": -0.06103515625, + "entropy_loss": -0.203125, + "epoch": 0.9352, + "grad_norm": 1.6195495591478268, + "k1_kl": 0.06103515625, + "k3_kl": 0.04345703125, + "kimi_kl": 0.09130859375, + "learning_rate": 3.24e-08, + "loss": 0.0018, + "ppl": 0.09033203125, + "reward": 0.9120103120803833, + "reward_std": 0.0020293535199016333, + "rewards/perpo_ocr_edit_distance_reward": 0.9120103716850281, + "step": 4676, + "temperature": 0.9 + }, + { + "advantages": -1.0916165592789184e-05, + "completion_length": 419.0, + "delta_ref_entropy_loss": 0.033935546875, + "delta_ref_ppl": -0.10205078125, + "entropy_loss": -0.0771484375, + "epoch": 0.9354, + "grad_norm": 0.8524135088924214, + "k1_kl": 0.10205078125, + "k3_kl": 0.0751953125, + "kimi_kl": 0.271484375, + "learning_rate": 3.23e-08, + "loss": 0.003, + "ppl": 0.032958984375, + "reward": 0.9891225099563599, + "reward_std": 0.0014611484948545694, + "rewards/perpo_ocr_edit_distance_reward": 0.9891225695610046, + "step": 4677, + "temperature": 0.9 + }, + { + "advantages": 1.7029899268550253e-08, + "completion_length": 597.0, + "delta_ref_entropy_loss": 0.01336669921875, + "delta_ref_ppl": -0.052978515625, + "entropy_loss": -0.0576171875, + "epoch": 0.9356, + "grad_norm": 0.4321041863317303, + "k1_kl": 0.052978515625, + "k3_kl": 0.033447265625, + "kimi_kl": 0.09423828125, + "learning_rate": 3.22e-08, + "loss": 0.0013, + "ppl": 0.0185546875, + "reward": 0.9972479343414307, + "reward_std": 0.000336811994202435, + "rewards/perpo_ocr_edit_distance_reward": 0.9972480535507202, + "step": 4678, + "temperature": 0.9 + }, + { + "advantages": 1.8937247659778222e-05, + "completion_length": 371.0, + "delta_ref_entropy_loss": 0.029296875, + "delta_ref_ppl": -0.0634765625, + "entropy_loss": -0.08642578125, + "epoch": 0.9358, + "grad_norm": 1.1153464126329262, + "k1_kl": 0.0634765625, + "k3_kl": 0.048583984375, + "kimi_kl": 0.1044921875, + "learning_rate": 3.2099999999999996e-08, + "loss": 0.0019, + "ppl": 0.032958984375, + "reward": 0.9805783629417419, + "reward_std": 0.0007973121828399599, + "rewards/perpo_ocr_edit_distance_reward": 0.9805783033370972, + "step": 4679, + "temperature": 0.9 + }, + { + "advantages": -5.500657607626636e-06, + "completion_length": 383.0, + "delta_ref_entropy_loss": -0.0025177001953125, + "delta_ref_ppl": -0.130859375, + "entropy_loss": -0.146484375, + "epoch": 0.936, + "grad_norm": 1.5132208455334013, + "k1_kl": 0.130859375, + "k3_kl": 0.09814453125, + "kimi_kl": 0.328125, + "learning_rate": 3.2e-08, + "loss": 0.0039, + "ppl": 0.056640625, + "reward": 0.8122013211250305, + "reward_std": 0.0014503266429528594, + "rewards/perpo_ocr_edit_distance_reward": 0.8122013807296753, + "step": 4680, + "temperature": 0.9 + }, + { + "advantages": -6.096704055380542e-06, + "completion_length": 77.0, + "delta_ref_entropy_loss": 0.04833984375, + "delta_ref_ppl": -0.55078125, + "entropy_loss": -0.2060546875, + "epoch": 0.9362, + "grad_norm": 3.437410945160306, + "k1_kl": 0.55078125, + "k3_kl": 0.458984375, + "kimi_kl": 2.046875, + "learning_rate": 3.1899999999999994e-08, + "loss": 0.0184, + "ppl": 0.08154296875, + "reward": 0.9710884690284729, + "reward_std": 0.005476696416735649, + "rewards/perpo_ocr_edit_distance_reward": 0.9710884690284729, + "step": 4681, + "temperature": 0.9 + }, + { + "advantages": -3.950936843466479e-06, + "completion_length": 122.0, + "delta_ref_entropy_loss": 0.01519775390625, + "delta_ref_ppl": -0.279296875, + "entropy_loss": -0.134765625, + "epoch": 0.9364, + "grad_norm": 2.1156696775174426, + "k1_kl": 0.279296875, + "k3_kl": 0.23046875, + "kimi_kl": 1.015625, + "learning_rate": 3.18e-08, + "loss": 0.0092, + "ppl": 0.060791015625, + "reward": 0.920421838760376, + "reward_std": 0.006341690197587013, + "rewards/perpo_ocr_edit_distance_reward": 0.9204218983650208, + "step": 4682, + "temperature": 0.9 + }, + { + "advantages": 2.588544703030493e-06, + "completion_length": 199.0, + "delta_ref_entropy_loss": -0.025390625, + "delta_ref_ppl": -0.1904296875, + "entropy_loss": -0.35546875, + "epoch": 0.9366, + "grad_norm": 5.32784617776314, + "k1_kl": 0.189453125, + "k3_kl": 0.1474609375, + "kimi_kl": 0.4375, + "learning_rate": 3.17e-08, + "loss": 0.0059, + "ppl": 0.1572265625, + "reward": 0.875421404838562, + "reward_std": 0.0031806412152945995, + "rewards/perpo_ocr_edit_distance_reward": 0.875421404838562, + "step": 4683, + "temperature": 0.9 + }, + { + "advantages": -5.9928213886450976e-05, + "completion_length": 533.0, + "delta_ref_entropy_loss": 0.0159912109375, + "delta_ref_ppl": -0.09326171875, + "entropy_loss": -0.0859375, + "epoch": 0.9368, + "grad_norm": 0.5608014525475279, + "k1_kl": 0.09326171875, + "k3_kl": 0.0693359375, + "kimi_kl": 0.2470703125, + "learning_rate": 3.16e-08, + "loss": 0.0028, + "ppl": 0.0303955078125, + "reward": 0.9730848670005798, + "reward_std": 0.0008945147274062037, + "rewards/perpo_ocr_edit_distance_reward": 0.9730849266052246, + "step": 4684, + "temperature": 0.9 + }, + { + "advantages": -3.0824116947769653e-06, + "completion_length": 55.0, + "delta_ref_entropy_loss": 0.034912109375, + "delta_ref_ppl": -0.79296875, + "entropy_loss": -0.388671875, + "epoch": 0.937, + "grad_norm": 3.5768182505153185, + "k1_kl": 0.79296875, + "k3_kl": 0.6484375, + "kimi_kl": 3.0625, + "learning_rate": 3.15e-08, + "loss": 0.0259, + "ppl": 0.11962890625, + "reward": 0.8145518898963928, + "reward_std": 0.011011268943548203, + "rewards/perpo_ocr_edit_distance_reward": 0.8145519495010376, + "step": 4685, + "temperature": 0.9 + }, + { + "advantages": -6.765127182006836e-05, + "completion_length": 491.0, + "delta_ref_entropy_loss": 0.0120849609375, + "delta_ref_ppl": -0.07568359375, + "entropy_loss": -0.06689453125, + "epoch": 0.9372, + "grad_norm": 0.5915001542270897, + "k1_kl": 0.07568359375, + "k3_kl": 0.06298828125, + "kimi_kl": 0.1826171875, + "learning_rate": 3.1399999999999997e-08, + "loss": 0.0026, + "ppl": 0.0203857421875, + "reward": 0.9988695979118347, + "reward_std": 0.0010325367329642177, + "rewards/perpo_ocr_edit_distance_reward": 0.9988696575164795, + "step": 4686, + "temperature": 0.9 + }, + { + "advantages": -1.3283321322887787e-06, + "completion_length": 1192.0, + "delta_ref_entropy_loss": -0.11669921875, + "delta_ref_ppl": -0.0166015625, + "entropy_loss": -0.1728515625, + "epoch": 0.9374, + "grad_norm": 0.7299427396594582, + "k1_kl": 0.0167236328125, + "k3_kl": 0.0302734375, + "kimi_kl": 0.09033203125, + "learning_rate": 3.13e-08, + "loss": 0.0012, + "ppl": 0.046630859375, + "reward": 0.868451714515686, + "reward_std": 0.06379999965429306, + "rewards/perpo_ocr_edit_distance_reward": 0.8684517741203308, + "step": 4687, + "temperature": 0.9 + }, + { + "advantages": -4.082492523593828e-05, + "completion_length": 508.0, + "delta_ref_entropy_loss": 0.0269775390625, + "delta_ref_ppl": -0.06884765625, + "entropy_loss": -0.064453125, + "epoch": 0.9376, + "grad_norm": 0.5452275408786749, + "k1_kl": 0.06884765625, + "k3_kl": 0.05029296875, + "kimi_kl": 0.173828125, + "learning_rate": 3.1199999999999995e-08, + "loss": 0.0021, + "ppl": 0.0167236328125, + "reward": 0.993436336517334, + "reward_std": 0.0007340622832998633, + "rewards/perpo_ocr_edit_distance_reward": 0.9934364557266235, + "step": 4688, + "temperature": 0.9 + }, + { + "advantages": -7.322856845348724e-07, + "completion_length": 532.0, + "delta_ref_entropy_loss": -0.28125, + "delta_ref_ppl": -0.1064453125, + "entropy_loss": -0.78125, + "epoch": 0.9378, + "grad_norm": 13.582622287941309, + "k1_kl": 0.10693359375, + "k3_kl": 0.138671875, + "kimi_kl": 0.2578125, + "learning_rate": 3.11e-08, + "loss": 0.0056, + "ppl": 0.30078125, + "reward": 0.682675302028656, + "reward_std": 0.1381618082523346, + "rewards/perpo_ocr_edit_distance_reward": 0.6826754212379456, + "step": 4689, + "temperature": 0.9 + }, + { + "advantages": -6.725107232341543e-05, + "completion_length": 516.0, + "delta_ref_entropy_loss": 0.043212890625, + "delta_ref_ppl": -0.0634765625, + "entropy_loss": -0.048583984375, + "epoch": 0.938, + "grad_norm": 0.38881229149681806, + "k1_kl": 0.0634765625, + "k3_kl": 0.042236328125, + "kimi_kl": 0.154296875, + "learning_rate": 3.1e-08, + "loss": 0.0018, + "ppl": 0.01214599609375, + "reward": 0.9919496178627014, + "reward_std": 0.0006597475148737431, + "rewards/perpo_ocr_edit_distance_reward": 0.991949737071991, + "step": 4690, + "temperature": 0.9 + }, + { + "advantages": -1.0728836059570312e-06, + "completion_length": 789.0, + "delta_ref_entropy_loss": -0.1650390625, + "delta_ref_ppl": -0.06640625, + "entropy_loss": -0.498046875, + "epoch": 0.9382, + "grad_norm": 2.9357024791303616, + "k1_kl": 0.06689453125, + "k3_kl": 0.07275390625, + "kimi_kl": 0.234375, + "learning_rate": 3.09e-08, + "loss": 0.0029, + "ppl": 0.189453125, + "reward": 0.273196280002594, + "reward_std": 0.024010591208934784, + "rewards/perpo_ocr_edit_distance_reward": 0.27319633960723877, + "step": 4691, + "temperature": 0.9 + }, + { + "advantages": -1.27724248955019e-07, + "completion_length": 522.0, + "delta_ref_entropy_loss": 0.01556396484375, + "delta_ref_ppl": -0.076171875, + "entropy_loss": -0.1201171875, + "epoch": 0.9384, + "grad_norm": 1.3730809785619158, + "k1_kl": 0.076171875, + "k3_kl": 0.057373046875, + "kimi_kl": 0.16015625, + "learning_rate": 3.08e-08, + "loss": 0.0023, + "ppl": 0.044677734375, + "reward": 0.7474368214607239, + "reward_std": 0.20716553926467896, + "rewards/perpo_ocr_edit_distance_reward": 0.7474368810653687, + "step": 4692, + "temperature": 0.9 + }, + { + "advantages": -7.765633927192539e-05, + "completion_length": 403.0, + "delta_ref_entropy_loss": 0.023681640625, + "delta_ref_ppl": -0.072265625, + "entropy_loss": -0.076171875, + "epoch": 0.9386, + "grad_norm": 0.5091010804235353, + "k1_kl": 0.072265625, + "k3_kl": 0.0478515625, + "kimi_kl": 0.10986328125, + "learning_rate": 3.07e-08, + "loss": 0.002, + "ppl": 0.0322265625, + "reward": 0.995364248752594, + "reward_std": 0.0005577729898504913, + "rewards/perpo_ocr_edit_distance_reward": 0.9953643679618835, + "step": 4693, + "temperature": 0.9 + }, + { + "advantages": -4.993166294298135e-05, + "completion_length": 475.0, + "delta_ref_entropy_loss": 0.032470703125, + "delta_ref_ppl": -0.0732421875, + "entropy_loss": -0.083984375, + "epoch": 0.9388, + "grad_norm": 0.7618871348386052, + "k1_kl": 0.07373046875, + "k3_kl": 0.049560546875, + "kimi_kl": 0.16796875, + "learning_rate": 3.0599999999999996e-08, + "loss": 0.002, + "ppl": 0.03271484375, + "reward": 0.9857325553894043, + "reward_std": 0.001945871626958251, + "rewards/perpo_ocr_edit_distance_reward": 0.9857326745986938, + "step": 4694, + "temperature": 0.9 + }, + { + "advantages": -8.932182026910596e-06, + "completion_length": 311.0, + "delta_ref_entropy_loss": 0.02685546875, + "delta_ref_ppl": -0.123046875, + "entropy_loss": -0.099609375, + "epoch": 0.939, + "grad_norm": 1.4495774838765934, + "k1_kl": 0.123046875, + "k3_kl": 0.0908203125, + "kimi_kl": 0.357421875, + "learning_rate": 3.0499999999999995e-08, + "loss": 0.0037, + "ppl": 0.03515625, + "reward": 0.9338570237159729, + "reward_std": 0.0037154536694288254, + "rewards/perpo_ocr_edit_distance_reward": 0.9338570833206177, + "step": 4695, + "temperature": 0.9 + }, + { + "advantages": -0.00014714684220962226, + "completion_length": 475.0, + "delta_ref_entropy_loss": 0.0279541015625, + "delta_ref_ppl": -0.083984375, + "entropy_loss": -0.0693359375, + "epoch": 0.9392, + "grad_norm": 0.8181572026389702, + "k1_kl": 0.083984375, + "k3_kl": 0.06494140625, + "kimi_kl": 0.2578125, + "learning_rate": 3.04e-08, + "loss": 0.0027, + "ppl": 0.024658203125, + "reward": 0.9951897263526917, + "reward_std": 0.00036290037678554654, + "rewards/perpo_ocr_edit_distance_reward": 0.9951898455619812, + "step": 4696, + "temperature": 0.9 + }, + { + "advantages": 3.760201798286289e-05, + "completion_length": 1028.0, + "delta_ref_entropy_loss": 0.0400390625, + "delta_ref_ppl": -0.05712890625, + "entropy_loss": -0.0927734375, + "epoch": 0.9394, + "grad_norm": 1.2248944250459004, + "k1_kl": 0.05712890625, + "k3_kl": 0.032470703125, + "kimi_kl": 0.0654296875, + "learning_rate": 3.03e-08, + "loss": 0.0013, + "ppl": 0.03515625, + "reward": 0.988130509853363, + "reward_std": 0.0005792827578261495, + "rewards/perpo_ocr_edit_distance_reward": 0.9881304502487183, + "step": 4697, + "temperature": 0.9 + }, + { + "advantages": -3.187997208442539e-05, + "completion_length": 844.0, + "delta_ref_entropy_loss": 0.0186767578125, + "delta_ref_ppl": -0.04736328125, + "entropy_loss": -0.08251953125, + "epoch": 0.9396, + "grad_norm": 0.9839288077878028, + "k1_kl": 0.047607421875, + "k3_kl": 0.028076171875, + "kimi_kl": 0.064453125, + "learning_rate": 3.02e-08, + "loss": 0.0012, + "ppl": 0.0286865234375, + "reward": 0.9822821617126465, + "reward_std": 0.0009691471932455897, + "rewards/perpo_ocr_edit_distance_reward": 0.982282280921936, + "step": 4698, + "temperature": 0.9 + }, + { + "advantages": -0.00017965692677535117, + "completion_length": 706.0, + "delta_ref_entropy_loss": 0.031982421875, + "delta_ref_ppl": -0.05224609375, + "entropy_loss": -0.0546875, + "epoch": 0.9398, + "grad_norm": 2.385563865219055, + "k1_kl": 0.052001953125, + "k3_kl": 0.03369140625, + "kimi_kl": 0.06640625, + "learning_rate": 3.01e-08, + "loss": 0.0015, + "ppl": 0.0186767578125, + "reward": 0.9627465605735779, + "reward_std": 0.0002791631850413978, + "rewards/perpo_ocr_edit_distance_reward": 0.9627466797828674, + "step": 4699, + "temperature": 0.9 + }, + { + "advantages": -9.877342108666198e-07, + "completion_length": 24.0, + "delta_ref_entropy_loss": -0.2421875, + "delta_ref_ppl": -0.921875, + "entropy_loss": -0.7109375, + "epoch": 0.94, + "grad_norm": 11.428276506383877, + "k1_kl": 0.91796875, + "k3_kl": 0.80078125, + "kimi_kl": 2.96875, + "learning_rate": 3e-08, + "loss": 0.032, + "ppl": 0.251953125, + "reward": 0.922448992729187, + "reward_std": 0.008568919263780117, + "rewards/perpo_ocr_edit_distance_reward": 0.922448992729187, + "step": 4700, + "temperature": 0.9 + }, + { + "advantages": -5.832740498590283e-06, + "completion_length": 213.0, + "delta_ref_entropy_loss": 0.043701171875, + "delta_ref_ppl": -0.1826171875, + "entropy_loss": -0.103515625, + "epoch": 0.9402, + "grad_norm": 1.420847952793261, + "k1_kl": 0.1826171875, + "k3_kl": 0.134765625, + "kimi_kl": 0.51953125, + "learning_rate": 2.9899999999999996e-08, + "loss": 0.0054, + "ppl": 0.03515625, + "reward": 0.988802969455719, + "reward_std": 0.0013620511163026094, + "rewards/perpo_ocr_edit_distance_reward": 0.9888030886650085, + "step": 4701, + "temperature": 0.9 + }, + { + "advantages": -2.9802324661432067e-06, + "completion_length": 429.0, + "delta_ref_entropy_loss": 0.01220703125, + "delta_ref_ppl": -0.0751953125, + "entropy_loss": -0.0703125, + "epoch": 0.9404, + "grad_norm": 1.1791369212828433, + "k1_kl": 0.0751953125, + "k3_kl": 0.056884765625, + "kimi_kl": 0.185546875, + "learning_rate": 2.98e-08, + "loss": 0.0023, + "ppl": 0.028564453125, + "reward": 0.9833507537841797, + "reward_std": 0.002761791693046689, + "rewards/perpo_ocr_edit_distance_reward": 0.9833508133888245, + "step": 4702, + "temperature": 0.9 + }, + { + "advantages": -7.12701285010553e-06, + "completion_length": 323.0, + "delta_ref_entropy_loss": 0.00830078125, + "delta_ref_ppl": -0.0908203125, + "entropy_loss": -0.056396484375, + "epoch": 0.9406, + "grad_norm": 0.9965619064639825, + "k1_kl": 0.09130859375, + "k3_kl": 0.064453125, + "kimi_kl": 0.306640625, + "learning_rate": 2.9699999999999998e-08, + "loss": 0.0026, + "ppl": 0.0157470703125, + "reward": 0.9978450536727905, + "reward_std": 0.0010973609751090407, + "rewards/perpo_ocr_edit_distance_reward": 0.9978450536727905, + "step": 4703, + "temperature": 0.9 + }, + { + "advantages": -1.2508460713434033e-05, + "completion_length": 983.0, + "delta_ref_entropy_loss": 0.0162353515625, + "delta_ref_ppl": -0.03857421875, + "entropy_loss": -0.053955078125, + "epoch": 0.9408, + "grad_norm": 0.40602191070571986, + "k1_kl": 0.03857421875, + "k3_kl": 0.023193359375, + "kimi_kl": 0.059326171875, + "learning_rate": 2.96e-08, + "loss": 0.0009, + "ppl": 0.0166015625, + "reward": 0.9958089590072632, + "reward_std": 0.0005814627511426806, + "rewards/perpo_ocr_edit_distance_reward": 0.9958089590072632, + "step": 4704, + "temperature": 0.9 + }, + { + "advantages": -8.753368092584424e-06, + "completion_length": 189.0, + "delta_ref_entropy_loss": 0.0201416015625, + "delta_ref_ppl": -0.140625, + "entropy_loss": -0.109375, + "epoch": 0.941, + "grad_norm": 2.0329519336277206, + "k1_kl": 0.140625, + "k3_kl": 0.10595703125, + "kimi_kl": 0.345703125, + "learning_rate": 2.9499999999999996e-08, + "loss": 0.0043, + "ppl": 0.046875, + "reward": 0.9860316514968872, + "reward_std": 0.0037874141708016396, + "rewards/perpo_ocr_edit_distance_reward": 0.9860317707061768, + "step": 4705, + "temperature": 0.9 + }, + { + "advantages": -3.474099457889679e-06, + "completion_length": 83.0, + "delta_ref_entropy_loss": 0.035888671875, + "delta_ref_ppl": -0.48046875, + "entropy_loss": -0.177734375, + "epoch": 0.9412, + "grad_norm": 3.349087153907657, + "k1_kl": 0.478515625, + "k3_kl": 0.412109375, + "kimi_kl": 2.515625, + "learning_rate": 2.94e-08, + "loss": 0.0165, + "ppl": 0.0673828125, + "reward": 0.9808542728424072, + "reward_std": 0.004801650065928698, + "rewards/perpo_ocr_edit_distance_reward": 0.9808542132377625, + "step": 4706, + "temperature": 0.9 + }, + { + "advantages": -4.3485848436830565e-05, + "completion_length": 525.0, + "delta_ref_entropy_loss": -0.0038299560546875, + "delta_ref_ppl": -0.057861328125, + "entropy_loss": -0.125, + "epoch": 0.9414, + "grad_norm": 0.9710531096945132, + "k1_kl": 0.057861328125, + "k3_kl": 0.052978515625, + "kimi_kl": 0.1748046875, + "learning_rate": 2.9299999999999998e-08, + "loss": 0.0022, + "ppl": 0.04052734375, + "reward": 0.9911264777183533, + "reward_std": 0.0014669176889583468, + "rewards/perpo_ocr_edit_distance_reward": 0.991126537322998, + "step": 4707, + "temperature": 0.9 + }, + { + "advantages": -6.709780336677795e-06, + "completion_length": 142.0, + "delta_ref_entropy_loss": -0.00139617919921875, + "delta_ref_ppl": -0.23828125, + "entropy_loss": -0.11767578125, + "epoch": 0.9416, + "grad_norm": 1.830763275421736, + "k1_kl": 0.23828125, + "k3_kl": 0.1923828125, + "kimi_kl": 0.828125, + "learning_rate": 2.92e-08, + "loss": 0.0077, + "ppl": 0.042236328125, + "reward": 0.9940211772918701, + "reward_std": 0.002440772019326687, + "rewards/perpo_ocr_edit_distance_reward": 0.9940212368965149, + "step": 4708, + "temperature": 0.9 + }, + { + "advantages": -6.3521524680254515e-06, + "completion_length": 963.0, + "delta_ref_entropy_loss": 0.002716064453125, + "delta_ref_ppl": -0.07666015625, + "entropy_loss": -0.1640625, + "epoch": 0.9418, + "grad_norm": 3.116484582766764, + "k1_kl": 0.07666015625, + "k3_kl": 0.05810546875, + "kimi_kl": 0.1474609375, + "learning_rate": 2.91e-08, + "loss": 0.0023, + "ppl": 0.0751953125, + "reward": 0.9836838245391846, + "reward_std": 0.007907578721642494, + "rewards/perpo_ocr_edit_distance_reward": 0.9836838841438293, + "step": 4709, + "temperature": 0.9 + }, + { + "advantages": -6.92861431161873e-05, + "completion_length": 587.0, + "delta_ref_entropy_loss": 0.0230712890625, + "delta_ref_ppl": -0.087890625, + "entropy_loss": -0.1025390625, + "epoch": 0.942, + "grad_norm": 0.39547856463470304, + "k1_kl": 0.087890625, + "k3_kl": 0.060791015625, + "kimi_kl": 0.2177734375, + "learning_rate": 2.9e-08, + "loss": 0.0025, + "ppl": 0.033447265625, + "reward": 0.9836257696151733, + "reward_std": 0.0011286744847893715, + "rewards/perpo_ocr_edit_distance_reward": 0.9836258888244629, + "step": 4710, + "temperature": 0.9 + }, + { + "advantages": -5.2758627134608105e-05, + "completion_length": 913.0, + "delta_ref_entropy_loss": 0.0140380859375, + "delta_ref_ppl": -0.0546875, + "entropy_loss": -0.11328125, + "epoch": 0.9422, + "grad_norm": 0.4080765271249335, + "k1_kl": 0.054443359375, + "k3_kl": 0.037109375, + "kimi_kl": 0.08837890625, + "learning_rate": 2.8899999999999997e-08, + "loss": 0.0015, + "ppl": 0.04150390625, + "reward": 0.9866896271705627, + "reward_std": 0.0011906304862350225, + "rewards/perpo_ocr_edit_distance_reward": 0.9866897463798523, + "step": 4711, + "temperature": 0.9 + }, + { + "advantages": -5.790165573671402e-07, + "completion_length": 53.0, + "delta_ref_entropy_loss": 0.061279296875, + "delta_ref_ppl": -0.7421875, + "entropy_loss": -0.23046875, + "epoch": 0.9424, + "grad_norm": 6.669198773408996, + "k1_kl": 0.7421875, + "k3_kl": 0.68359375, + "kimi_kl": 3.046875, + "learning_rate": 2.8799999999999996e-08, + "loss": 0.0274, + "ppl": 0.11279296875, + "reward": 0.2113175392150879, + "reward_std": 0.01488950289785862, + "rewards/perpo_ocr_edit_distance_reward": 0.21131755411624908, + "step": 4712, + "temperature": 0.9 + }, + { + "advantages": -5.568776941800024e-06, + "completion_length": 615.0, + "delta_ref_entropy_loss": 0.0306396484375, + "delta_ref_ppl": -0.07568359375, + "entropy_loss": -0.07080078125, + "epoch": 0.9426, + "grad_norm": 0.579750854100548, + "k1_kl": 0.07568359375, + "k3_kl": 0.05078125, + "kimi_kl": 0.1640625, + "learning_rate": 2.87e-08, + "loss": 0.002, + "ppl": 0.02978515625, + "reward": 0.9621656537055969, + "reward_std": 0.0014255833812057972, + "rewards/perpo_ocr_edit_distance_reward": 0.9621657133102417, + "step": 4713, + "temperature": 0.9 + }, + { + "advantages": -9.185927774524316e-05, + "completion_length": 505.0, + "delta_ref_entropy_loss": 0.049560546875, + "delta_ref_ppl": -0.0849609375, + "entropy_loss": -0.068359375, + "epoch": 0.9428, + "grad_norm": 0.5176287111418283, + "k1_kl": 0.0849609375, + "k3_kl": 0.057861328125, + "kimi_kl": 0.189453125, + "learning_rate": 2.8599999999999998e-08, + "loss": 0.0024, + "ppl": 0.0225830078125, + "reward": 0.992554783821106, + "reward_std": 0.0008267775410786271, + "rewards/perpo_ocr_edit_distance_reward": 0.9925548434257507, + "step": 4714, + "temperature": 0.9 + }, + { + "advantages": -1.9890921976184472e-05, + "completion_length": 436.0, + "delta_ref_entropy_loss": 0.033447265625, + "delta_ref_ppl": -0.07080078125, + "entropy_loss": -0.050048828125, + "epoch": 0.943, + "grad_norm": 0.6895856194243807, + "k1_kl": 0.07080078125, + "k3_kl": 0.04736328125, + "kimi_kl": 0.138671875, + "learning_rate": 2.85e-08, + "loss": 0.0019, + "ppl": 0.018310546875, + "reward": 0.9973024725914001, + "reward_std": 0.0007558349170722067, + "rewards/perpo_ocr_edit_distance_reward": 0.9973025321960449, + "step": 4715, + "temperature": 0.9 + }, + { + "advantages": -3.2067298889160156e-05, + "completion_length": 253.0, + "delta_ref_entropy_loss": 0.01019287109375, + "delta_ref_ppl": -0.1328125, + "entropy_loss": -0.04833984375, + "epoch": 0.9432, + "grad_norm": 1.007961377570751, + "k1_kl": 0.1337890625, + "k3_kl": 0.11181640625, + "kimi_kl": 0.52734375, + "learning_rate": 2.84e-08, + "loss": 0.0045, + "ppl": 0.01483154296875, + "reward": 0.9923624992370605, + "reward_std": 0.0014937258092686534, + "rewards/perpo_ocr_edit_distance_reward": 0.9923626184463501, + "step": 4716, + "temperature": 0.9 + }, + { + "advantages": -2.6247333153150976e-05, + "completion_length": 758.0, + "delta_ref_entropy_loss": 0.024169921875, + "delta_ref_ppl": -0.0693359375, + "entropy_loss": -0.09130859375, + "epoch": 0.9434, + "grad_norm": 1.1455943263052717, + "k1_kl": 0.0693359375, + "k3_kl": 0.046875, + "kimi_kl": 0.1376953125, + "learning_rate": 2.83e-08, + "loss": 0.0019, + "ppl": 0.0341796875, + "reward": 0.9744821190834045, + "reward_std": 0.003146552247926593, + "rewards/perpo_ocr_edit_distance_reward": 0.9744822978973389, + "step": 4717, + "temperature": 0.9 + }, + { + "advantages": -1.4305115882962127e-06, + "completion_length": 472.0, + "delta_ref_entropy_loss": 0.047119140625, + "delta_ref_ppl": -0.1552734375, + "entropy_loss": -0.265625, + "epoch": 0.9436, + "grad_norm": 1.3612614531168583, + "k1_kl": 0.1552734375, + "k3_kl": 0.10107421875, + "kimi_kl": 0.291015625, + "learning_rate": 2.8199999999999998e-08, + "loss": 0.004, + "ppl": 0.130859375, + "reward": 0.9467282295227051, + "reward_std": 0.005791353993117809, + "rewards/perpo_ocr_edit_distance_reward": 0.9467282295227051, + "step": 4718, + "temperature": 0.9 + }, + { + "advantages": -1.8852098946808837e-05, + "completion_length": 485.0, + "delta_ref_entropy_loss": 0.06591796875, + "delta_ref_ppl": -0.1298828125, + "entropy_loss": -0.1298828125, + "epoch": 0.9438, + "grad_norm": 0.8331075346661739, + "k1_kl": 0.130859375, + "k3_kl": 0.07763671875, + "kimi_kl": 0.255859375, + "learning_rate": 2.81e-08, + "loss": 0.0031, + "ppl": 0.046875, + "reward": 0.8892422914505005, + "reward_std": 0.0012543044285848737, + "rewards/perpo_ocr_edit_distance_reward": 0.8892423510551453, + "step": 4719, + "temperature": 0.9 + }, + { + "advantages": -3.1403134926222265e-05, + "completion_length": 1101.0, + "delta_ref_entropy_loss": 0.04052734375, + "delta_ref_ppl": -0.0595703125, + "entropy_loss": -0.10400390625, + "epoch": 0.944, + "grad_norm": 8.36520668155377, + "k1_kl": 0.059814453125, + "k3_kl": 0.055908203125, + "kimi_kl": 0.107421875, + "learning_rate": 2.8e-08, + "loss": 0.0023, + "ppl": 0.057373046875, + "reward": 0.9851805567741394, + "reward_std": 0.0009843686129897833, + "rewards/perpo_ocr_edit_distance_reward": 0.985180675983429, + "step": 4720, + "temperature": 0.9 + }, + { + "advantages": 3.4059798537100505e-08, + "completion_length": 129.0, + "delta_ref_entropy_loss": -1.0703125, + "delta_ref_ppl": -0.279296875, + "entropy_loss": -2.28125, + "epoch": 0.9442, + "grad_norm": 7.640833360082641, + "k1_kl": 0.279296875, + "k3_kl": 0.419921875, + "kimi_kl": 1.234375, + "learning_rate": 2.79e-08, + "loss": 0.0168, + "ppl": 0.96875, + "reward": 0.2855146527290344, + "reward_std": 0.06762505322694778, + "rewards/perpo_ocr_edit_distance_reward": 0.2855146527290344, + "step": 4721, + "temperature": 0.9 + }, + { + "advantages": -1.8392290712654358e-06, + "completion_length": 457.0, + "delta_ref_entropy_loss": -0.03125, + "delta_ref_ppl": -0.126953125, + "entropy_loss": -0.5625, + "epoch": 0.9444, + "grad_norm": 3.362630052445662, + "k1_kl": 0.1279296875, + "k3_kl": 0.103515625, + "kimi_kl": 0.240234375, + "learning_rate": 2.7799999999999997e-08, + "loss": 0.0042, + "ppl": 0.283203125, + "reward": 0.8787745237350464, + "reward_std": 0.004575615283101797, + "rewards/perpo_ocr_edit_distance_reward": 0.8787745237350464, + "step": 4722, + "temperature": 0.9 + }, + { + "advantages": -6.422826845664531e-05, + "completion_length": 509.0, + "delta_ref_entropy_loss": 0.06591796875, + "delta_ref_ppl": -0.10595703125, + "entropy_loss": -0.072265625, + "epoch": 0.9446, + "grad_norm": 0.602782694913927, + "k1_kl": 0.10595703125, + "k3_kl": 0.06640625, + "kimi_kl": 0.181640625, + "learning_rate": 2.7699999999999997e-08, + "loss": 0.0027, + "ppl": 0.0213623046875, + "reward": 0.9634334444999695, + "reward_std": 0.0006956880679354072, + "rewards/perpo_ocr_edit_distance_reward": 0.963433563709259, + "step": 4723, + "temperature": 0.9 + }, + { + "advantages": -2.5289400582551025e-06, + "completion_length": 59.0, + "delta_ref_entropy_loss": -0.03759765625, + "delta_ref_ppl": -0.7734375, + "entropy_loss": -0.3671875, + "epoch": 0.9448, + "grad_norm": 7.772862471913345, + "k1_kl": 0.7734375, + "k3_kl": 0.69921875, + "kimi_kl": 3.703125, + "learning_rate": 2.76e-08, + "loss": 0.0279, + "ppl": 0.12451171875, + "reward": 0.6185984015464783, + "reward_std": 0.00997973047196865, + "rewards/perpo_ocr_edit_distance_reward": 0.6185984015464783, + "step": 4724, + "temperature": 0.9 + }, + { + "advantages": -8.307184907607734e-05, + "completion_length": 866.0, + "delta_ref_entropy_loss": 0.0174560546875, + "delta_ref_ppl": -0.049560546875, + "entropy_loss": -0.076171875, + "epoch": 0.945, + "grad_norm": 0.4638149540164683, + "k1_kl": 0.04931640625, + "k3_kl": 0.0267333984375, + "kimi_kl": 0.049072265625, + "learning_rate": 2.7499999999999998e-08, + "loss": 0.0012, + "ppl": 0.02392578125, + "reward": 0.9973254799842834, + "reward_std": 0.0006176402093842626, + "rewards/perpo_ocr_edit_distance_reward": 0.9973255395889282, + "step": 4725, + "temperature": 0.9 + }, + { + "advantages": -4.0531158447265625e-06, + "completion_length": 452.0, + "delta_ref_entropy_loss": -0.125, + "delta_ref_ppl": -0.06494140625, + "entropy_loss": -0.2490234375, + "epoch": 0.9452, + "grad_norm": 2.9893017997491858, + "k1_kl": 0.0654296875, + "k3_kl": 0.0732421875, + "kimi_kl": 0.201171875, + "learning_rate": 2.74e-08, + "loss": 0.0029, + "ppl": 0.08349609375, + "reward": 0.9836047291755676, + "reward_std": 0.006161515600979328, + "rewards/perpo_ocr_edit_distance_reward": 0.9836047291755676, + "step": 4726, + "temperature": 0.9 + }, + { + "advantages": -6.667205889243633e-05, + "completion_length": 557.0, + "delta_ref_entropy_loss": 0.039306640625, + "delta_ref_ppl": -0.058837890625, + "entropy_loss": -0.05908203125, + "epoch": 0.9454, + "grad_norm": 0.36932261515890397, + "k1_kl": 0.05908203125, + "k3_kl": 0.03466796875, + "kimi_kl": 0.09423828125, + "learning_rate": 2.73e-08, + "loss": 0.0014, + "ppl": 0.0145263671875, + "reward": 0.9976940155029297, + "reward_std": 0.000921861210372299, + "rewards/perpo_ocr_edit_distance_reward": 0.9976940751075745, + "step": 4727, + "temperature": 0.9 + }, + { + "advantages": -2.1904708773945458e-05, + "completion_length": 425.0, + "delta_ref_entropy_loss": 0.052490234375, + "delta_ref_ppl": -0.09912109375, + "entropy_loss": -0.0341796875, + "epoch": 0.9456, + "grad_norm": 0.48176467960998987, + "k1_kl": 0.09912109375, + "k3_kl": 0.072265625, + "kimi_kl": 0.279296875, + "learning_rate": 2.72e-08, + "loss": 0.0029, + "ppl": 0.00885009765625, + "reward": 0.9987524747848511, + "reward_std": 0.0006775876972824335, + "rewards/perpo_ocr_edit_distance_reward": 0.9987524747848511, + "step": 4728, + "temperature": 0.9 + }, + { + "advantages": -4.802431703865295e-06, + "completion_length": 859.0, + "delta_ref_entropy_loss": -0.00531005859375, + "delta_ref_ppl": -0.0546875, + "entropy_loss": -0.146484375, + "epoch": 0.9458, + "grad_norm": 2.2514363776159025, + "k1_kl": 0.054931640625, + "k3_kl": 0.035400390625, + "kimi_kl": 0.08203125, + "learning_rate": 2.7099999999999998e-08, + "loss": 0.0014, + "ppl": 0.05224609375, + "reward": 0.9087664484977722, + "reward_std": 0.005230884067714214, + "rewards/perpo_ocr_edit_distance_reward": 0.9087664484977722, + "step": 4729, + "temperature": 0.9 + }, + { + "advantages": -2.7392592528485693e-05, + "completion_length": 258.0, + "delta_ref_entropy_loss": 0.052978515625, + "delta_ref_ppl": -0.2060546875, + "entropy_loss": -0.07763671875, + "epoch": 0.946, + "grad_norm": 0.9073488260862312, + "k1_kl": 0.2060546875, + "k3_kl": 0.1572265625, + "kimi_kl": 0.59765625, + "learning_rate": 2.6999999999999997e-08, + "loss": 0.0063, + "ppl": 0.0264892578125, + "reward": 0.995441198348999, + "reward_std": 0.001142841880209744, + "rewards/perpo_ocr_edit_distance_reward": 0.9954412579536438, + "step": 4730, + "temperature": 0.9 + }, + { + "advantages": -2.6702882678364404e-05, + "completion_length": 1276.0, + "delta_ref_entropy_loss": 0.033935546875, + "delta_ref_ppl": -0.055419921875, + "entropy_loss": -0.064453125, + "epoch": 0.9462, + "grad_norm": 0.6268442029229142, + "k1_kl": 0.055419921875, + "k3_kl": 0.031982421875, + "kimi_kl": 0.06787109375, + "learning_rate": 2.69e-08, + "loss": 0.0013, + "ppl": 0.025390625, + "reward": 0.9913251996040344, + "reward_std": 0.0008569079800508916, + "rewards/perpo_ocr_edit_distance_reward": 0.9913252592086792, + "step": 4731, + "temperature": 0.9 + }, + { + "advantages": -3.222057057428174e-05, + "completion_length": 912.0, + "delta_ref_entropy_loss": 0.0230712890625, + "delta_ref_ppl": -0.061767578125, + "entropy_loss": -0.07373046875, + "epoch": 0.9464, + "grad_norm": 0.689934698445788, + "k1_kl": 0.0615234375, + "k3_kl": 0.0419921875, + "kimi_kl": 0.1298828125, + "learning_rate": 2.68e-08, + "loss": 0.0017, + "ppl": 0.0279541015625, + "reward": 0.9823285937309265, + "reward_std": 0.0006924619665369391, + "rewards/perpo_ocr_edit_distance_reward": 0.9823287129402161, + "step": 4732, + "temperature": 0.9 + }, + { + "advantages": -9.707042408990674e-06, + "completion_length": 1250.0, + "delta_ref_entropy_loss": 0.0037689208984375, + "delta_ref_ppl": -0.08984375, + "entropy_loss": -0.298828125, + "epoch": 0.9466, + "grad_norm": 2.0672325989381193, + "k1_kl": 0.08935546875, + "k3_kl": 0.06591796875, + "kimi_kl": 0.1640625, + "learning_rate": 2.67e-08, + "loss": 0.0026, + "ppl": 0.1435546875, + "reward": 0.9252014756202698, + "reward_std": 0.009576462209224701, + "rewards/perpo_ocr_edit_distance_reward": 0.9252015948295593, + "step": 4733, + "temperature": 0.9 + }, + { + "advantages": -1.4219966033124365e-05, + "completion_length": 103.0, + "delta_ref_entropy_loss": -0.00811767578125, + "delta_ref_ppl": -0.2138671875, + "entropy_loss": -0.15625, + "epoch": 0.9468, + "grad_norm": 2.848972277075472, + "k1_kl": 0.2138671875, + "k3_kl": 0.16796875, + "kimi_kl": 0.5703125, + "learning_rate": 2.6599999999999997e-08, + "loss": 0.0067, + "ppl": 0.06298828125, + "reward": 0.9033322334289551, + "reward_std": 0.0040942467749118805, + "rewards/perpo_ocr_edit_distance_reward": 0.9033322930335999, + "step": 4734, + "temperature": 0.9 + }, + { + "advantages": -2.1193709471845068e-05, + "completion_length": 401.0, + "delta_ref_entropy_loss": 0.01171875, + "delta_ref_ppl": -0.07666015625, + "entropy_loss": -0.0546875, + "epoch": 0.947, + "grad_norm": 6.901703202905246, + "k1_kl": 0.07666015625, + "k3_kl": 0.0595703125, + "kimi_kl": 0.208984375, + "learning_rate": 2.65e-08, + "loss": 0.0024, + "ppl": 0.0184326171875, + "reward": 0.9955304265022278, + "reward_std": 0.003912594635039568, + "rewards/perpo_ocr_edit_distance_reward": 0.9955304265022278, + "step": 4735, + "temperature": 0.9 + }, + { + "advantages": -1.2951238204550464e-05, + "completion_length": 398.0, + "delta_ref_entropy_loss": 0.047119140625, + "delta_ref_ppl": -0.1123046875, + "entropy_loss": -0.099609375, + "epoch": 0.9472, + "grad_norm": 0.97143268526078, + "k1_kl": 0.1123046875, + "k3_kl": 0.08203125, + "kimi_kl": 0.3046875, + "learning_rate": 2.6399999999999998e-08, + "loss": 0.0033, + "ppl": 0.038330078125, + "reward": 0.9904888272285461, + "reward_std": 0.0018718823557719588, + "rewards/perpo_ocr_edit_distance_reward": 0.9904888868331909, + "step": 4736, + "temperature": 0.9 + }, + { + "advantages": -1.0609627679514233e-05, + "completion_length": 662.0, + "delta_ref_entropy_loss": 0.00927734375, + "delta_ref_ppl": -0.040283203125, + "entropy_loss": -0.08056640625, + "epoch": 0.9474, + "grad_norm": 0.44186283124618575, + "k1_kl": 0.040283203125, + "k3_kl": 0.02783203125, + "kimi_kl": 0.07861328125, + "learning_rate": 2.63e-08, + "loss": 0.0011, + "ppl": 0.0263671875, + "reward": 0.9827920198440552, + "reward_std": 0.0007023806101642549, + "rewards/perpo_ocr_edit_distance_reward": 0.9827920794487, + "step": 4737, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 258.0, + "delta_ref_entropy_loss": 0.03662109375, + "delta_ref_ppl": -0.1357421875, + "entropy_loss": -0.080078125, + "epoch": 0.9476, + "grad_norm": 0.8412010116048438, + "k1_kl": 0.1357421875, + "k3_kl": 0.09912109375, + "kimi_kl": 0.3671875, + "learning_rate": 2.62e-08, + "loss": 0.004, + "ppl": 0.026123046875, + "reward": 0.9946996569633484, + "reward_std": 0.0011625709012150764, + "rewards/perpo_ocr_edit_distance_reward": 0.9946996569633484, + "step": 4738, + "temperature": 0.9 + }, + { + "advantages": -2.2241049009608105e-05, + "completion_length": 427.0, + "delta_ref_entropy_loss": 0.035400390625, + "delta_ref_ppl": -0.11328125, + "entropy_loss": -0.07177734375, + "epoch": 0.9478, + "grad_norm": 0.7539210882212153, + "k1_kl": 0.11328125, + "k3_kl": 0.0810546875, + "kimi_kl": 0.318359375, + "learning_rate": 2.6100000000000002e-08, + "loss": 0.0033, + "ppl": 0.019775390625, + "reward": 0.993143618106842, + "reward_std": 0.0021976495627313852, + "rewards/perpo_ocr_edit_distance_reward": 0.9931437969207764, + "step": 4739, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 330.0, + "delta_ref_entropy_loss": 0.018310546875, + "delta_ref_ppl": -0.1220703125, + "entropy_loss": -0.0634765625, + "epoch": 0.948, + "grad_norm": 0.7251385781251896, + "k1_kl": 0.1220703125, + "k3_kl": 0.09375, + "kimi_kl": 0.431640625, + "learning_rate": 2.5999999999999998e-08, + "loss": 0.0037, + "ppl": 0.0257568359375, + "reward": 0.772357702255249, + "reward_std": 0.0009541708859615028, + "rewards/perpo_ocr_edit_distance_reward": 0.772357702255249, + "step": 4740, + "temperature": 0.9 + }, + { + "advantages": -1.5114035704755224e-05, + "completion_length": 893.0, + "delta_ref_entropy_loss": 0.0712890625, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.09716796875, + "epoch": 0.9482, + "grad_norm": 0.8968638276988118, + "k1_kl": 0.06591796875, + "k3_kl": 0.033935546875, + "kimi_kl": 0.09375, + "learning_rate": 2.5899999999999997e-08, + "loss": 0.0014, + "ppl": 0.041748046875, + "reward": 0.9685959815979004, + "reward_std": 0.002715672366321087, + "rewards/perpo_ocr_edit_distance_reward": 0.9685961008071899, + "step": 4741, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 603.0, + "delta_ref_entropy_loss": 0.05224609375, + "delta_ref_ppl": -0.0986328125, + "entropy_loss": -0.1416015625, + "epoch": 0.9484, + "grad_norm": 1.3765028943173916, + "k1_kl": 0.0986328125, + "k3_kl": 0.06298828125, + "kimi_kl": 0.1806640625, + "learning_rate": 2.58e-08, + "loss": 0.0025, + "ppl": 0.061767578125, + "reward": 0.7689827084541321, + "reward_std": 0.002209313912317157, + "rewards/perpo_ocr_edit_distance_reward": 0.7689827680587769, + "step": 4742, + "temperature": 0.9 + }, + { + "advantages": 1.215934844367439e-05, + "completion_length": 404.0, + "delta_ref_entropy_loss": 0.068359375, + "delta_ref_ppl": -0.12890625, + "entropy_loss": -0.1298828125, + "epoch": 0.9486, + "grad_norm": 2.1619878418961025, + "k1_kl": 0.12890625, + "k3_kl": 0.08349609375, + "kimi_kl": 0.2333984375, + "learning_rate": 2.57e-08, + "loss": 0.0033, + "ppl": 0.05615234375, + "reward": 0.9256060719490051, + "reward_std": 0.001995780970901251, + "rewards/perpo_ocr_edit_distance_reward": 0.9256060719490051, + "step": 4743, + "temperature": 0.9 + }, + { + "advantages": -0.0001246929168701172, + "completion_length": 1213.0, + "delta_ref_entropy_loss": 0.0234375, + "delta_ref_ppl": -0.032470703125, + "entropy_loss": -0.03857421875, + "epoch": 0.9488, + "grad_norm": 1.0799736277015393, + "k1_kl": 0.032470703125, + "k3_kl": 0.021240234375, + "kimi_kl": 0.052490234375, + "learning_rate": 2.56e-08, + "loss": 0.001, + "ppl": 0.01336669921875, + "reward": 0.9962326288223267, + "reward_std": 0.00044634146615862846, + "rewards/perpo_ocr_edit_distance_reward": 0.9962327480316162, + "step": 4744, + "temperature": 0.9 + }, + { + "advantages": -1.52587890625e-05, + "completion_length": 944.0, + "delta_ref_entropy_loss": 0.0225830078125, + "delta_ref_ppl": -0.051025390625, + "entropy_loss": -0.03857421875, + "epoch": 0.949, + "grad_norm": 0.7646569476455908, + "k1_kl": 0.051025390625, + "k3_kl": 0.04248046875, + "kimi_kl": 0.12109375, + "learning_rate": 2.5499999999999997e-08, + "loss": 0.0017, + "ppl": 0.01611328125, + "reward": 0.9933158755302429, + "reward_std": 0.0021351873874664307, + "rewards/perpo_ocr_edit_distance_reward": 0.9933158755302429, + "step": 4745, + "temperature": 0.9 + }, + { + "advantages": -5.909374976909021e-06, + "completion_length": 566.0, + "delta_ref_entropy_loss": 0.005584716796875, + "delta_ref_ppl": -0.07177734375, + "entropy_loss": -0.10205078125, + "epoch": 0.9492, + "grad_norm": 0.8563875839196774, + "k1_kl": 0.072265625, + "k3_kl": 0.0556640625, + "kimi_kl": 0.1689453125, + "learning_rate": 2.54e-08, + "loss": 0.0022, + "ppl": 0.0341796875, + "reward": 0.9833748936653137, + "reward_std": 0.0027821501716971397, + "rewards/perpo_ocr_edit_distance_reward": 0.9833749532699585, + "step": 4746, + "temperature": 0.9 + }, + { + "advantages": -0.00021843399736098945, + "completion_length": 755.0, + "delta_ref_entropy_loss": 0.032958984375, + "delta_ref_ppl": -0.06201171875, + "entropy_loss": -0.050048828125, + "epoch": 0.9494, + "grad_norm": 0.7946310726349646, + "k1_kl": 0.062255859375, + "k3_kl": 0.037353515625, + "kimi_kl": 0.10400390625, + "learning_rate": 2.5299999999999998e-08, + "loss": 0.0017, + "ppl": 0.0177001953125, + "reward": 0.9985079169273376, + "reward_std": 0.00032882639789022505, + "rewards/perpo_ocr_edit_distance_reward": 0.9985079765319824, + "step": 4747, + "temperature": 0.9 + }, + { + "advantages": -1.6927719116210938e-05, + "completion_length": 985.0, + "delta_ref_entropy_loss": 0.056640625, + "delta_ref_ppl": -0.078125, + "entropy_loss": -0.169921875, + "epoch": 0.9496, + "grad_norm": 1.2617746638391079, + "k1_kl": 0.078125, + "k3_kl": 0.055419921875, + "kimi_kl": 0.1416015625, + "learning_rate": 2.52e-08, + "loss": 0.0022, + "ppl": 0.07958984375, + "reward": 0.9565145373344421, + "reward_std": 0.004426633007824421, + "rewards/perpo_ocr_edit_distance_reward": 0.9565146565437317, + "step": 4748, + "temperature": 0.9 + }, + { + "advantages": -1.0592597391223535e-05, + "completion_length": 581.0, + "delta_ref_entropy_loss": -0.032470703125, + "delta_ref_ppl": -0.1162109375, + "entropy_loss": -0.470703125, + "epoch": 0.9498, + "grad_norm": 2.1064143699230233, + "k1_kl": 0.1162109375, + "k3_kl": 0.09423828125, + "kimi_kl": 0.275390625, + "learning_rate": 2.51e-08, + "loss": 0.0038, + "ppl": 0.2294921875, + "reward": 0.5421428680419922, + "reward_std": 0.0063312966376543045, + "rewards/perpo_ocr_edit_distance_reward": 0.542142927646637, + "step": 4749, + "temperature": 0.9 + }, + { + "advantages": -0.00012200219498481601, + "completion_length": 436.0, + "delta_ref_entropy_loss": 0.037841796875, + "delta_ref_ppl": -0.08154296875, + "entropy_loss": -0.08447265625, + "epoch": 0.95, + "grad_norm": 0.6910127988712912, + "k1_kl": 0.08203125, + "k3_kl": 0.052734375, + "kimi_kl": 0.16015625, + "learning_rate": 2.5e-08, + "loss": 0.0022, + "ppl": 0.03271484375, + "reward": 0.9467082023620605, + "reward_std": 0.0005280900513753295, + "rewards/perpo_ocr_edit_distance_reward": 0.9467083215713501, + "step": 4750, + "temperature": 0.9 + }, + { + "advantages": 1.0899135531872162e-06, + "completion_length": 412.0, + "delta_ref_entropy_loss": 0.04052734375, + "delta_ref_ppl": -0.109375, + "entropy_loss": -0.2080078125, + "epoch": 0.9502, + "grad_norm": 1.7109818354553652, + "k1_kl": 0.109375, + "k3_kl": 0.06884765625, + "kimi_kl": 0.162109375, + "learning_rate": 2.4899999999999998e-08, + "loss": 0.0028, + "ppl": 0.0830078125, + "reward": 0.9071693420410156, + "reward_std": 0.0076358080841600895, + "rewards/perpo_ocr_edit_distance_reward": 0.9071694016456604, + "step": 4751, + "temperature": 0.9 + }, + { + "advantages": -3.768716851482168e-05, + "completion_length": 693.0, + "delta_ref_entropy_loss": 0.01495361328125, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.0966796875, + "epoch": 0.9504, + "grad_norm": 0.7836812947675132, + "k1_kl": 0.0703125, + "k3_kl": 0.04736328125, + "kimi_kl": 0.11865234375, + "learning_rate": 2.4799999999999997e-08, + "loss": 0.0019, + "ppl": 0.041748046875, + "reward": 0.9904098510742188, + "reward_std": 0.0008036610670387745, + "rewards/perpo_ocr_edit_distance_reward": 0.9904099106788635, + "step": 4752, + "temperature": 0.9 + }, + { + "advantages": -8.514949634275126e-09, + "completion_length": 2048.0, + "delta_ref_entropy_loss": -0.62890625, + "delta_ref_ppl": 0.0111083984375, + "entropy_loss": -1.5390625, + "epoch": 0.9506, + "grad_norm": 3.7599003653823106, + "k1_kl": -0.011962890625, + "k3_kl": 0.11572265625, + "kimi_kl": 0.1640625, + "learning_rate": 2.47e-08, + "loss": 0.0046, + "ppl": 0.765625, + "reward": 0.484816312789917, + "reward_std": 0.24125227332115173, + "rewards/perpo_ocr_edit_distance_reward": 0.484816312789917, + "step": 4753, + "temperature": 0.9 + }, + { + "advantages": 9.02584633877268e-06, + "completion_length": 508.0, + "delta_ref_entropy_loss": 0.048583984375, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.06005859375, + "epoch": 0.9508, + "grad_norm": 0.5663727855754174, + "k1_kl": 0.06591796875, + "k3_kl": 0.0390625, + "kimi_kl": 0.11279296875, + "learning_rate": 2.46e-08, + "loss": 0.0016, + "ppl": 0.0189208984375, + "reward": 0.9941366910934448, + "reward_std": 0.000844828668050468, + "rewards/perpo_ocr_edit_distance_reward": 0.9941366910934448, + "step": 4754, + "temperature": 0.9 + }, + { + "advantages": -1.958438360816217e-06, + "completion_length": 1346.0, + "delta_ref_entropy_loss": 0.00970458984375, + "delta_ref_ppl": -0.02783203125, + "entropy_loss": -0.052978515625, + "epoch": 0.951, + "grad_norm": 0.5797450245492953, + "k1_kl": 0.02783203125, + "k3_kl": 0.0184326171875, + "kimi_kl": 0.050537109375, + "learning_rate": 2.45e-08, + "loss": 0.0007, + "ppl": 0.0224609375, + "reward": 0.9887650012969971, + "reward_std": 0.004226278513669968, + "rewards/perpo_ocr_edit_distance_reward": 0.9887650609016418, + "step": 4755, + "temperature": 0.9 + }, + { + "advantages": 1.9601413441705517e-05, + "completion_length": 340.0, + "delta_ref_entropy_loss": 0.046142578125, + "delta_ref_ppl": -0.126953125, + "entropy_loss": -0.06591796875, + "epoch": 0.9512, + "grad_norm": 0.634813216826165, + "k1_kl": 0.126953125, + "k3_kl": 0.0947265625, + "kimi_kl": 0.4296875, + "learning_rate": 2.44e-08, + "loss": 0.0038, + "ppl": 0.0216064453125, + "reward": 0.9965437054634094, + "reward_std": 0.0012042276794090867, + "rewards/perpo_ocr_edit_distance_reward": 0.9965436458587646, + "step": 4756, + "temperature": 0.9 + }, + { + "advantages": -9.25340864341706e-05, + "completion_length": 703.0, + "delta_ref_entropy_loss": 0.000766754150390625, + "delta_ref_ppl": -0.033447265625, + "entropy_loss": -0.057373046875, + "epoch": 0.9514, + "grad_norm": 0.6154719257562573, + "k1_kl": 0.033447265625, + "k3_kl": 0.0230712890625, + "kimi_kl": 0.0732421875, + "learning_rate": 2.43e-08, + "loss": 0.001, + "ppl": 0.0177001953125, + "reward": 0.8279577493667603, + "reward_std": 0.0008201709715649486, + "rewards/perpo_ocr_edit_distance_reward": 0.827957808971405, + "step": 4757, + "temperature": 0.9 + }, + { + "advantages": -3.7465778746081924e-07, + "completion_length": 677.0, + "delta_ref_entropy_loss": -0.2294921875, + "delta_ref_ppl": -0.1064453125, + "entropy_loss": -0.84765625, + "epoch": 0.9516, + "grad_norm": 7.096992424931627, + "k1_kl": 0.10595703125, + "k3_kl": 0.142578125, + "kimi_kl": 0.294921875, + "learning_rate": 2.4199999999999998e-08, + "loss": 0.0057, + "ppl": 0.435546875, + "reward": 0.8494135141372681, + "reward_std": 0.1559659093618393, + "rewards/perpo_ocr_edit_distance_reward": 0.8494135737419128, + "step": 4758, + "temperature": 0.9 + }, + { + "advantages": -7.006100349826738e-05, + "completion_length": 858.0, + "delta_ref_entropy_loss": 0.0235595703125, + "delta_ref_ppl": -0.05859375, + "entropy_loss": -0.062255859375, + "epoch": 0.9518, + "grad_norm": 0.4291126782985665, + "k1_kl": 0.058349609375, + "k3_kl": 0.0380859375, + "kimi_kl": 0.13671875, + "learning_rate": 2.4099999999999997e-08, + "loss": 0.0016, + "ppl": 0.0201416015625, + "reward": 0.9958268404006958, + "reward_std": 0.0007509032730013132, + "rewards/perpo_ocr_edit_distance_reward": 0.9958269000053406, + "step": 4759, + "temperature": 0.9 + }, + { + "advantages": -0.0001584546989761293, + "completion_length": 438.0, + "delta_ref_entropy_loss": 0.0284423828125, + "delta_ref_ppl": -0.060791015625, + "entropy_loss": -0.05029296875, + "epoch": 0.952, + "grad_norm": 0.6094557858466284, + "k1_kl": 0.060791015625, + "k3_kl": 0.037353515625, + "kimi_kl": 0.10986328125, + "learning_rate": 2.4e-08, + "loss": 0.0017, + "ppl": 0.0185546875, + "reward": 0.9971736669540405, + "reward_std": 0.0007060801144689322, + "rewards/perpo_ocr_edit_distance_reward": 0.9971737861633301, + "step": 4760, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 456.0, + "delta_ref_entropy_loss": 0.0296630859375, + "delta_ref_ppl": -0.0791015625, + "entropy_loss": -0.06689453125, + "epoch": 0.9522, + "grad_norm": 0.3768970115975904, + "k1_kl": 0.0791015625, + "k3_kl": 0.056396484375, + "kimi_kl": 0.1923828125, + "learning_rate": 2.39e-08, + "loss": 0.0023, + "ppl": 0.01904296875, + "reward": 0.9813753962516785, + "reward_std": 0.0005514328950084746, + "rewards/perpo_ocr_edit_distance_reward": 0.9813753366470337, + "step": 4761, + "temperature": 0.9 + }, + { + "advantages": 2.091271562676411e-05, + "completion_length": 559.0, + "delta_ref_entropy_loss": 0.005859375, + "delta_ref_ppl": -0.064453125, + "entropy_loss": -0.1455078125, + "epoch": 0.9524, + "grad_norm": 2.246824555935704, + "k1_kl": 0.064453125, + "k3_kl": 0.046630859375, + "kimi_kl": 0.12255859375, + "learning_rate": 2.38e-08, + "loss": 0.0018, + "ppl": 0.0625, + "reward": 0.9869269132614136, + "reward_std": 0.0015281167579814792, + "rewards/perpo_ocr_edit_distance_reward": 0.9869269132614136, + "step": 4762, + "temperature": 0.9 + }, + { + "advantages": -2.9291427381394897e-06, + "completion_length": 378.0, + "delta_ref_entropy_loss": 0.0281982421875, + "delta_ref_ppl": -0.1328125, + "entropy_loss": -0.1982421875, + "epoch": 0.9526, + "grad_norm": 1.5362764534712037, + "k1_kl": 0.1328125, + "k3_kl": 0.09130859375, + "kimi_kl": 0.28515625, + "learning_rate": 2.3699999999999997e-08, + "loss": 0.0036, + "ppl": 0.07470703125, + "reward": 0.9387496709823608, + "reward_std": 0.005708200391381979, + "rewards/perpo_ocr_edit_distance_reward": 0.9387496709823608, + "step": 4763, + "temperature": 0.9 + }, + { + "advantages": -3.651210499810986e-05, + "completion_length": 1311.0, + "delta_ref_entropy_loss": 0.040771484375, + "delta_ref_ppl": -0.054931640625, + "entropy_loss": -0.072265625, + "epoch": 0.9528, + "grad_norm": 4.450347405820239, + "k1_kl": 0.054931640625, + "k3_kl": 0.033447265625, + "kimi_kl": 0.06982421875, + "learning_rate": 2.36e-08, + "loss": 0.0014, + "ppl": 0.040771484375, + "reward": 0.9921526312828064, + "reward_std": 0.0022314612288028, + "rewards/perpo_ocr_edit_distance_reward": 0.992152750492096, + "step": 4764, + "temperature": 0.9 + }, + { + "advantages": -8.770398380875122e-06, + "completion_length": 671.0, + "delta_ref_entropy_loss": -0.07666015625, + "delta_ref_ppl": -0.08203125, + "entropy_loss": -0.384765625, + "epoch": 0.953, + "grad_norm": 1.8371114629964744, + "k1_kl": 0.08203125, + "k3_kl": 0.07861328125, + "kimi_kl": 0.16015625, + "learning_rate": 2.35e-08, + "loss": 0.0032, + "ppl": 0.1630859375, + "reward": 0.8320566415786743, + "reward_std": 0.009633338078856468, + "rewards/perpo_ocr_edit_distance_reward": 0.8320567607879639, + "step": 4765, + "temperature": 0.9 + }, + { + "advantages": 5.3865573136135936e-05, + "completion_length": 512.0, + "delta_ref_entropy_loss": -0.0048828125, + "delta_ref_ppl": -0.060546875, + "entropy_loss": -0.060546875, + "epoch": 0.9532, + "grad_norm": 0.5382122750460195, + "k1_kl": 0.060546875, + "k3_kl": 0.045654296875, + "kimi_kl": 0.2119140625, + "learning_rate": 2.34e-08, + "loss": 0.0018, + "ppl": 0.01904296875, + "reward": 0.9981545209884644, + "reward_std": 0.00053242570720613, + "rewards/perpo_ocr_edit_distance_reward": 0.9981545209884644, + "step": 4766, + "temperature": 0.9 + }, + { + "advantages": -8.031725883483887e-06, + "completion_length": 119.0, + "delta_ref_entropy_loss": 0.0654296875, + "delta_ref_ppl": -0.3046875, + "entropy_loss": -0.2890625, + "epoch": 0.9534, + "grad_norm": 2.6619475661483714, + "k1_kl": 0.3046875, + "k3_kl": 0.251953125, + "kimi_kl": 0.91015625, + "learning_rate": 2.33e-08, + "loss": 0.0101, + "ppl": 0.08935546875, + "reward": 0.6759582161903381, + "reward_std": 0.009432224556803703, + "rewards/perpo_ocr_edit_distance_reward": 0.6759582757949829, + "step": 4767, + "temperature": 0.9 + }, + { + "advantages": -1.5803747373865917e-05, + "completion_length": 675.0, + "delta_ref_entropy_loss": 0.0225830078125, + "delta_ref_ppl": -0.0810546875, + "entropy_loss": -0.10791015625, + "epoch": 0.9536, + "grad_norm": 4.776673070013587, + "k1_kl": 0.0810546875, + "k3_kl": 0.055908203125, + "kimi_kl": 0.1572265625, + "learning_rate": 2.3199999999999996e-08, + "loss": 0.0023, + "ppl": 0.04541015625, + "reward": 0.980445384979248, + "reward_std": 0.001517993863672018, + "rewards/perpo_ocr_edit_distance_reward": 0.9804454445838928, + "step": 4768, + "temperature": 0.9 + }, + { + "advantages": 6.471361757576233e-07, + "completion_length": 918.0, + "delta_ref_entropy_loss": 0.054931640625, + "delta_ref_ppl": -0.09619140625, + "entropy_loss": -0.119140625, + "epoch": 0.9538, + "grad_norm": 1.0412583157980264, + "k1_kl": 0.095703125, + "k3_kl": 0.0615234375, + "kimi_kl": 0.1572265625, + "learning_rate": 2.31e-08, + "loss": 0.0025, + "ppl": 0.05078125, + "reward": 0.9492694735527039, + "reward_std": 0.01282366644591093, + "rewards/perpo_ocr_edit_distance_reward": 0.9492694735527039, + "step": 4769, + "temperature": 0.9 + }, + { + "advantages": -6.795355875510722e-05, + "completion_length": 485.0, + "delta_ref_entropy_loss": -0.00118255615234375, + "delta_ref_ppl": -0.0859375, + "entropy_loss": -0.13671875, + "epoch": 0.954, + "grad_norm": 0.6471104042326666, + "k1_kl": 0.0859375, + "k3_kl": 0.060791015625, + "kimi_kl": 0.177734375, + "learning_rate": 2.2999999999999998e-08, + "loss": 0.0025, + "ppl": 0.045166015625, + "reward": 0.9613695740699768, + "reward_std": 0.0007773083634674549, + "rewards/perpo_ocr_edit_distance_reward": 0.9613696336746216, + "step": 4770, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 472.0, + "delta_ref_entropy_loss": 0.01312255859375, + "delta_ref_ppl": -0.072265625, + "entropy_loss": -0.040771484375, + "epoch": 0.9542, + "grad_norm": 0.018544875328690216, + "k1_kl": 0.072265625, + "k3_kl": 0.050048828125, + "kimi_kl": 0.158203125, + "learning_rate": 2.29e-08, + "loss": 0.002, + "ppl": 0.0087890625, + "reward": 0.9972664713859558, + "reward_std": 0.0, + "rewards/perpo_ocr_edit_distance_reward": 0.9972665309906006, + "step": 4771, + "temperature": 0.9 + }, + { + "advantages": -1.2559550668811426e-05, + "completion_length": 167.0, + "delta_ref_entropy_loss": 0.026123046875, + "delta_ref_ppl": -0.1357421875, + "entropy_loss": -0.056640625, + "epoch": 0.9544, + "grad_norm": 1.27038643476892, + "k1_kl": 0.1357421875, + "k3_kl": 0.09912109375, + "kimi_kl": 0.388671875, + "learning_rate": 2.28e-08, + "loss": 0.004, + "ppl": 0.0155029296875, + "reward": 0.9883720874786377, + "reward_std": 0.0032888553105294704, + "rewards/perpo_ocr_edit_distance_reward": 0.9883721470832825, + "step": 4772, + "temperature": 0.9 + }, + { + "advantages": -4.58104295830708e-05, + "completion_length": 301.0, + "delta_ref_entropy_loss": 0.054443359375, + "delta_ref_ppl": -0.1376953125, + "entropy_loss": -0.0869140625, + "epoch": 0.9546, + "grad_norm": 0.6971474546989871, + "k1_kl": 0.1376953125, + "k3_kl": 0.10302734375, + "kimi_kl": 0.32421875, + "learning_rate": 2.27e-08, + "loss": 0.0042, + "ppl": 0.02978515625, + "reward": 0.9898520112037659, + "reward_std": 0.0010148844448849559, + "rewards/perpo_ocr_edit_distance_reward": 0.9898521304130554, + "step": 4773, + "temperature": 0.9 + }, + { + "advantages": -1.1069434435739822e-07, + "completion_length": 125.0, + "delta_ref_entropy_loss": -0.466796875, + "delta_ref_ppl": -0.55078125, + "entropy_loss": -1.109375, + "epoch": 0.9548, + "grad_norm": 11.879627264804641, + "k1_kl": 0.55078125, + "k3_kl": 0.5390625, + "kimi_kl": 1.6640625, + "learning_rate": 2.2599999999999997e-08, + "loss": 0.0215, + "ppl": 0.419921875, + "reward": 0.5542588829994202, + "reward_std": 0.2043205201625824, + "rewards/perpo_ocr_edit_distance_reward": 0.5542589426040649, + "step": 4774, + "temperature": 0.9 + }, + { + "advantages": -6.134169962024316e-05, + "completion_length": 774.0, + "delta_ref_entropy_loss": 0.0177001953125, + "delta_ref_ppl": -0.053955078125, + "entropy_loss": -0.04931640625, + "epoch": 0.955, + "grad_norm": 0.3592574573171506, + "k1_kl": 0.053955078125, + "k3_kl": 0.035400390625, + "kimi_kl": 0.10595703125, + "learning_rate": 2.25e-08, + "loss": 0.0015, + "ppl": 0.0142822265625, + "reward": 0.9977888464927673, + "reward_std": 0.000594015175011009, + "rewards/perpo_ocr_edit_distance_reward": 0.9977889657020569, + "step": 4775, + "temperature": 0.9 + }, + { + "advantages": -1.2942723515152466e-06, + "completion_length": 405.0, + "delta_ref_entropy_loss": -0.060302734375, + "delta_ref_ppl": -0.0791015625, + "entropy_loss": -0.373046875, + "epoch": 0.9552, + "grad_norm": 5.389781240521691, + "k1_kl": 0.07958984375, + "k3_kl": 0.06787109375, + "kimi_kl": 0.171875, + "learning_rate": 2.24e-08, + "loss": 0.0027, + "ppl": 0.1640625, + "reward": 0.950654149055481, + "reward_std": 0.04585205763578415, + "rewards/perpo_ocr_edit_distance_reward": 0.9506542086601257, + "step": 4776, + "temperature": 0.9 + }, + { + "advantages": -0.00012445449829101562, + "completion_length": 1187.0, + "delta_ref_entropy_loss": 0.039794921875, + "delta_ref_ppl": -0.050537109375, + "entropy_loss": -0.05224609375, + "epoch": 0.9554, + "grad_norm": 0.6216259035282176, + "k1_kl": 0.050537109375, + "k3_kl": 0.0306396484375, + "kimi_kl": 0.1064453125, + "learning_rate": 2.2299999999999998e-08, + "loss": 0.0014, + "ppl": 0.020751953125, + "reward": 0.9976494312286377, + "reward_std": 0.0005156827974133193, + "rewards/perpo_ocr_edit_distance_reward": 0.9976494908332825, + "step": 4777, + "temperature": 0.9 + }, + { + "advantages": -5.6777684221742675e-05, + "completion_length": 499.0, + "delta_ref_entropy_loss": 0.0223388671875, + "delta_ref_ppl": -0.08642578125, + "entropy_loss": -0.05517578125, + "epoch": 0.9556, + "grad_norm": 0.6501200009770105, + "k1_kl": 0.08642578125, + "k3_kl": 0.062255859375, + "kimi_kl": 0.2578125, + "learning_rate": 2.22e-08, + "loss": 0.0025, + "ppl": 0.018798828125, + "reward": 0.9746943116188049, + "reward_std": 0.0007997025968506932, + "rewards/perpo_ocr_edit_distance_reward": 0.9746943116188049, + "step": 4778, + "temperature": 0.9 + }, + { + "advantages": -1.1920928955078125e-06, + "completion_length": 2048.0, + "delta_ref_entropy_loss": -0.1650390625, + "delta_ref_ppl": -0.0260009765625, + "entropy_loss": -0.703125, + "epoch": 0.9558, + "grad_norm": 26.451177496451006, + "k1_kl": 0.02587890625, + "k3_kl": 0.10595703125, + "kimi_kl": 0.1162109375, + "learning_rate": 2.21e-08, + "loss": 0.0042, + "ppl": 0.396484375, + "reward": 0.5026431679725647, + "reward_std": 0.04723047465085983, + "rewards/perpo_ocr_edit_distance_reward": 0.5026432275772095, + "step": 4779, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 550.0, + "delta_ref_entropy_loss": 0.006134033203125, + "delta_ref_ppl": -0.07666015625, + "entropy_loss": -0.1923828125, + "epoch": 0.956, + "grad_norm": 1.329562074652886, + "k1_kl": 0.076171875, + "k3_kl": 0.051513671875, + "kimi_kl": 0.162109375, + "learning_rate": 2.2e-08, + "loss": 0.0021, + "ppl": 0.06787109375, + "reward": 0.9528200626373291, + "reward_std": 0.0016007726080715656, + "rewards/perpo_ocr_edit_distance_reward": 0.9528201222419739, + "step": 4780, + "temperature": 0.9 + }, + { + "advantages": 8.514949634275126e-09, + "completion_length": 780.0, + "delta_ref_entropy_loss": 0.037109375, + "delta_ref_ppl": -0.056640625, + "entropy_loss": -0.0791015625, + "epoch": 0.9562, + "grad_norm": 1.1176137086995355, + "k1_kl": 0.056640625, + "k3_kl": 0.035888671875, + "kimi_kl": 0.09326171875, + "learning_rate": 2.1899999999999998e-08, + "loss": 0.0014, + "ppl": 0.032470703125, + "reward": 0.9835322499275208, + "reward_std": 0.001663823495618999, + "rewards/perpo_ocr_edit_distance_reward": 0.9835323095321655, + "step": 4781, + "temperature": 0.9 + }, + { + "advantages": -0.00011266981164226308, + "completion_length": 662.0, + "delta_ref_entropy_loss": 0.0216064453125, + "delta_ref_ppl": -0.05029296875, + "entropy_loss": -0.044677734375, + "epoch": 0.9564, + "grad_norm": 0.3034039766903229, + "k1_kl": 0.050537109375, + "k3_kl": 0.03369140625, + "kimi_kl": 0.12890625, + "learning_rate": 2.18e-08, + "loss": 0.0015, + "ppl": 0.01361083984375, + "reward": 0.9923182725906372, + "reward_std": 0.0004289965145289898, + "rewards/perpo_ocr_edit_distance_reward": 0.992318332195282, + "step": 4782, + "temperature": 0.9 + }, + { + "advantages": -1.729599171085283e-05, + "completion_length": 454.0, + "delta_ref_entropy_loss": 0.0201416015625, + "delta_ref_ppl": -0.1044921875, + "entropy_loss": -0.318359375, + "epoch": 0.9566, + "grad_norm": 1.8375881974263912, + "k1_kl": 0.10498046875, + "k3_kl": 0.07958984375, + "kimi_kl": 0.232421875, + "learning_rate": 2.17e-08, + "loss": 0.0032, + "ppl": 0.140625, + "reward": 0.9117719531059265, + "reward_std": 0.002853228710591793, + "rewards/perpo_ocr_edit_distance_reward": 0.9117720127105713, + "step": 4783, + "temperature": 0.9 + }, + { + "advantages": -1.498631149843277e-06, + "completion_length": 377.0, + "delta_ref_entropy_loss": -0.0712890625, + "delta_ref_ppl": -0.1416015625, + "entropy_loss": -0.51953125, + "epoch": 0.9568, + "grad_norm": 4.4823482083059965, + "k1_kl": 0.140625, + "k3_kl": 0.12451171875, + "kimi_kl": 0.265625, + "learning_rate": 2.16e-08, + "loss": 0.005, + "ppl": 0.275390625, + "reward": 0.9073354005813599, + "reward_std": 0.04047024995088577, + "rewards/perpo_ocr_edit_distance_reward": 0.9073354601860046, + "step": 4784, + "temperature": 0.9 + }, + { + "advantages": -1.2431826235115295e-06, + "completion_length": 546.0, + "delta_ref_entropy_loss": -0.2392578125, + "delta_ref_ppl": -0.09521484375, + "entropy_loss": -0.443359375, + "epoch": 0.957, + "grad_norm": 4.2739863159519675, + "k1_kl": 0.09521484375, + "k3_kl": 0.0986328125, + "kimi_kl": 0.3203125, + "learning_rate": 2.1499999999999997e-08, + "loss": 0.0039, + "ppl": 0.1298828125, + "reward": 0.9062510132789612, + "reward_std": 0.0491037517786026, + "rewards/perpo_ocr_edit_distance_reward": 0.906251072883606, + "step": 4785, + "temperature": 0.9 + }, + { + "advantages": -7.152557827794226e-06, + "completion_length": 497.0, + "delta_ref_entropy_loss": 0.005218505859375, + "delta_ref_ppl": -0.1064453125, + "entropy_loss": -0.053955078125, + "epoch": 0.9572, + "grad_norm": 0.7356028160886436, + "k1_kl": 0.10595703125, + "k3_kl": 0.0849609375, + "kimi_kl": 0.455078125, + "learning_rate": 2.1399999999999996e-08, + "loss": 0.0034, + "ppl": 0.0147705078125, + "reward": 0.9908130764961243, + "reward_std": 0.0034781238064169884, + "rewards/perpo_ocr_edit_distance_reward": 0.9908130764961243, + "step": 4786, + "temperature": 0.9 + }, + { + "advantages": 2.1287374085687816e-09, + "completion_length": 131.0, + "delta_ref_entropy_loss": 0.01483154296875, + "delta_ref_ppl": -0.298828125, + "entropy_loss": -0.1103515625, + "epoch": 0.9574, + "grad_norm": 1.3936838349069145, + "k1_kl": 0.298828125, + "k3_kl": 0.2490234375, + "kimi_kl": 1.2109375, + "learning_rate": 2.13e-08, + "loss": 0.01, + "ppl": 0.033935546875, + "reward": 0.9887954592704773, + "reward_std": 0.0018571674590930343, + "rewards/perpo_ocr_edit_distance_reward": 0.9887954592704773, + "step": 4787, + "temperature": 0.9 + }, + { + "advantages": 1.6944750313996337e-05, + "completion_length": 1645.0, + "delta_ref_entropy_loss": 0.0023651123046875, + "delta_ref_ppl": -0.0203857421875, + "entropy_loss": -0.0439453125, + "epoch": 0.9576, + "grad_norm": 1.2419949694824157, + "k1_kl": 0.0205078125, + "k3_kl": 0.01611328125, + "kimi_kl": 0.035888671875, + "learning_rate": 2.1199999999999998e-08, + "loss": 0.0006, + "ppl": 0.0216064453125, + "reward": 0.987960159778595, + "reward_std": 0.0009058463620021939, + "rewards/perpo_ocr_edit_distance_reward": 0.987960159778595, + "step": 4788, + "temperature": 0.9 + }, + { + "advantages": -4.96251268486958e-05, + "completion_length": 634.0, + "delta_ref_entropy_loss": 0.0234375, + "delta_ref_ppl": -0.0546875, + "entropy_loss": -0.0693359375, + "epoch": 0.9578, + "grad_norm": 0.8215722802978804, + "k1_kl": 0.0546875, + "k3_kl": 0.0390625, + "kimi_kl": 0.1337890625, + "learning_rate": 2.11e-08, + "loss": 0.0016, + "ppl": 0.021728515625, + "reward": 0.9818504452705383, + "reward_std": 0.0009295429335907102, + "rewards/perpo_ocr_edit_distance_reward": 0.9818505048751831, + "step": 4789, + "temperature": 0.9 + }, + { + "advantages": -0.00013753346865996718, + "completion_length": 430.0, + "delta_ref_entropy_loss": 0.01080322265625, + "delta_ref_ppl": -0.05859375, + "entropy_loss": -0.0341796875, + "epoch": 0.958, + "grad_norm": 0.2259322549978882, + "k1_kl": 0.05859375, + "k3_kl": 0.042236328125, + "kimi_kl": 0.1318359375, + "learning_rate": 2.1e-08, + "loss": 0.0018, + "ppl": 0.0078125, + "reward": 0.9923940300941467, + "reward_std": 0.00020952176419086754, + "rewards/perpo_ocr_edit_distance_reward": 0.9923940896987915, + "step": 4790, + "temperature": 0.9 + }, + { + "advantages": -1.025199981086189e-05, + "completion_length": 577.0, + "delta_ref_entropy_loss": -0.068359375, + "delta_ref_ppl": -0.07177734375, + "entropy_loss": -0.185546875, + "epoch": 0.9582, + "grad_norm": 1.2786916623533928, + "k1_kl": 0.072265625, + "k3_kl": 0.05859375, + "kimi_kl": 0.181640625, + "learning_rate": 2.09e-08, + "loss": 0.0024, + "ppl": 0.056640625, + "reward": 0.9760991334915161, + "reward_std": 0.008211309090256691, + "rewards/perpo_ocr_edit_distance_reward": 0.9760991930961609, + "step": 4791, + "temperature": 0.9 + }, + { + "advantages": -2.1542822651099414e-05, + "completion_length": 1202.0, + "delta_ref_entropy_loss": 0.01263427734375, + "delta_ref_ppl": -0.034912109375, + "entropy_loss": -0.04931640625, + "epoch": 0.9584, + "grad_norm": 0.598263138351807, + "k1_kl": 0.034912109375, + "k3_kl": 0.023681640625, + "kimi_kl": 0.064453125, + "learning_rate": 2.0799999999999998e-08, + "loss": 0.001, + "ppl": 0.020751953125, + "reward": 0.9939501881599426, + "reward_std": 0.0010868312092497945, + "rewards/perpo_ocr_edit_distance_reward": 0.9939502477645874, + "step": 4792, + "temperature": 0.9 + }, + { + "advantages": -4.410743713378906e-06, + "completion_length": 1548.0, + "delta_ref_entropy_loss": -0.043212890625, + "delta_ref_ppl": -0.02490234375, + "entropy_loss": -0.1279296875, + "epoch": 0.9586, + "grad_norm": 0.7013369585211433, + "k1_kl": 0.02490234375, + "k3_kl": 0.0244140625, + "kimi_kl": 0.07373046875, + "learning_rate": 2.07e-08, + "loss": 0.001, + "ppl": 0.0419921875, + "reward": 0.7951046824455261, + "reward_std": 0.009569299407303333, + "rewards/perpo_ocr_edit_distance_reward": 0.7951047420501709, + "step": 4793, + "temperature": 0.9 + }, + { + "advantages": -3.2016209843277466e-06, + "completion_length": 201.0, + "delta_ref_entropy_loss": -0.0107421875, + "delta_ref_ppl": -0.1689453125, + "entropy_loss": -0.10107421875, + "epoch": 0.9588, + "grad_norm": 1.322927926913, + "k1_kl": 0.1689453125, + "k3_kl": 0.1376953125, + "kimi_kl": 0.70703125, + "learning_rate": 2.06e-08, + "loss": 0.0055, + "ppl": 0.0291748046875, + "reward": 0.8901917934417725, + "reward_std": 0.0025605312548577785, + "rewards/perpo_ocr_edit_distance_reward": 0.890191912651062, + "step": 4794, + "temperature": 0.9 + }, + { + "advantages": -7.382461262750439e-06, + "completion_length": 287.0, + "delta_ref_entropy_loss": 0.06689453125, + "delta_ref_ppl": -0.1494140625, + "entropy_loss": -0.1376953125, + "epoch": 0.959, + "grad_norm": 1.3611458709152169, + "k1_kl": 0.1494140625, + "k3_kl": 0.10595703125, + "kimi_kl": 0.375, + "learning_rate": 2.05e-08, + "loss": 0.0043, + "ppl": 0.052001953125, + "reward": 0.9283539056777954, + "reward_std": 0.003359848400577903, + "rewards/perpo_ocr_edit_distance_reward": 0.9283539056777954, + "step": 4795, + "temperature": 0.9 + }, + { + "advantages": -0.00017358576587866992, + "completion_length": 827.0, + "delta_ref_entropy_loss": 0.0277099609375, + "delta_ref_ppl": -0.04833984375, + "entropy_loss": -0.051025390625, + "epoch": 0.9592, + "grad_norm": 0.25455555463862506, + "k1_kl": 0.04833984375, + "k3_kl": 0.0269775390625, + "kimi_kl": 0.0751953125, + "learning_rate": 2.04e-08, + "loss": 0.0013, + "ppl": 0.0137939453125, + "reward": 0.985268771648407, + "reward_std": 0.00029240554431453347, + "rewards/perpo_ocr_edit_distance_reward": 0.9852688908576965, + "step": 4796, + "temperature": 0.9 + }, + { + "advantages": 7.373946573352441e-06, + "completion_length": 867.0, + "delta_ref_entropy_loss": 0.0380859375, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.1884765625, + "epoch": 0.9594, + "grad_norm": 1.335074667771336, + "k1_kl": 0.0703125, + "k3_kl": 0.050048828125, + "kimi_kl": 0.11669921875, + "learning_rate": 2.0299999999999996e-08, + "loss": 0.002, + "ppl": 0.0927734375, + "reward": 0.9656164646148682, + "reward_std": 0.0010559620568528771, + "rewards/perpo_ocr_edit_distance_reward": 0.9656165242195129, + "step": 4797, + "temperature": 0.9 + }, + { + "advantages": -0.00010362693865317851, + "completion_length": 535.0, + "delta_ref_entropy_loss": 0.04345703125, + "delta_ref_ppl": -0.052490234375, + "entropy_loss": -0.04248046875, + "epoch": 0.9596, + "grad_norm": 0.6865766548712092, + "k1_kl": 0.052490234375, + "k3_kl": 0.02587890625, + "kimi_kl": 0.0634765625, + "learning_rate": 2.02e-08, + "loss": 0.0011, + "ppl": 0.01202392578125, + "reward": 0.9862654209136963, + "reward_std": 0.0005573916132561862, + "rewards/perpo_ocr_edit_distance_reward": 0.9862654805183411, + "step": 4798, + "temperature": 0.9 + }, + { + "advantages": -1.3623919414840202e-07, + "completion_length": 1471.0, + "delta_ref_entropy_loss": -0.00634765625, + "delta_ref_ppl": -0.033447265625, + "entropy_loss": -0.11083984375, + "epoch": 0.9598, + "grad_norm": 1.3981949356883303, + "k1_kl": 0.03369140625, + "k3_kl": 0.02392578125, + "kimi_kl": 0.04931640625, + "learning_rate": 2.0099999999999998e-08, + "loss": 0.001, + "ppl": 0.04931640625, + "reward": 0.5780718326568604, + "reward_std": 0.4098440110683441, + "rewards/perpo_ocr_edit_distance_reward": 0.5780718922615051, + "step": 4799, + "temperature": 0.9 + }, + { + "advantages": -1.7029899268550253e-08, + "completion_length": 718.0, + "delta_ref_entropy_loss": 0.04736328125, + "delta_ref_ppl": -0.062255859375, + "entropy_loss": -0.064453125, + "epoch": 0.96, + "grad_norm": 0.4740011344769999, + "k1_kl": 0.0625, + "k3_kl": 0.0361328125, + "kimi_kl": 0.0966796875, + "learning_rate": 2e-08, + "loss": 0.0014, + "ppl": 0.0213623046875, + "reward": 0.9976214170455933, + "reward_std": 0.000355994125129655, + "rewards/perpo_ocr_edit_distance_reward": 0.9976214170455933, + "step": 4800, + "temperature": 0.9 + }, + { + "advantages": -4.757727947435342e-05, + "completion_length": 1109.0, + "delta_ref_entropy_loss": -0.005035400390625, + "delta_ref_ppl": -0.03271484375, + "entropy_loss": -0.05078125, + "epoch": 0.9602, + "grad_norm": 0.3644293163592538, + "k1_kl": 0.032470703125, + "k3_kl": 0.0242919921875, + "kimi_kl": 0.08203125, + "learning_rate": 1.99e-08, + "loss": 0.001, + "ppl": 0.016357421875, + "reward": 0.9742154479026794, + "reward_std": 0.0009740939131006598, + "rewards/perpo_ocr_edit_distance_reward": 0.9742155075073242, + "step": 4801, + "temperature": 0.9 + }, + { + "advantages": -3.647804260253906e-05, + "completion_length": 211.0, + "delta_ref_entropy_loss": 0.12158203125, + "delta_ref_ppl": -0.240234375, + "entropy_loss": -0.11328125, + "epoch": 0.9604, + "grad_norm": 1.3667715099067341, + "k1_kl": 0.2412109375, + "k3_kl": 0.1669921875, + "kimi_kl": 0.7109375, + "learning_rate": 1.9800000000000002e-08, + "loss": 0.0067, + "ppl": 0.034423828125, + "reward": 0.9322752952575684, + "reward_std": 0.0015335994539782405, + "rewards/perpo_ocr_edit_distance_reward": 0.9322754144668579, + "step": 4802, + "temperature": 0.9 + }, + { + "advantages": -8.58306884765625e-06, + "completion_length": 2048.0, + "delta_ref_entropy_loss": 0.00341796875, + "delta_ref_ppl": -0.028564453125, + "entropy_loss": -0.078125, + "epoch": 0.9606, + "grad_norm": 23144.750182298532, + "k1_kl": 0.028564453125, + "k3_kl": 6.53125, + "kimi_kl": 0.0751953125, + "learning_rate": 1.9699999999999998e-08, + "loss": 0.2608, + "ppl": 0.04345703125, + "reward": 0.8821669220924377, + "reward_std": 0.004866158124059439, + "rewards/perpo_ocr_edit_distance_reward": 0.8821670413017273, + "step": 4803, + "temperature": 0.9 + }, + { + "advantages": -5.0996033678529784e-05, + "completion_length": 741.0, + "delta_ref_entropy_loss": 0.0048828125, + "delta_ref_ppl": -0.034423828125, + "entropy_loss": -0.06298828125, + "epoch": 0.9608, + "grad_norm": 0.7026860273646849, + "k1_kl": 0.034423828125, + "k3_kl": 0.0196533203125, + "kimi_kl": 0.052490234375, + "learning_rate": 1.9599999999999997e-08, + "loss": 0.0008, + "ppl": 0.0184326171875, + "reward": 0.9983575344085693, + "reward_std": 0.0005680117756128311, + "rewards/perpo_ocr_edit_distance_reward": 0.9983575940132141, + "step": 4804, + "temperature": 0.9 + }, + { + "advantages": -1.4611653568863403e-05, + "completion_length": 313.0, + "delta_ref_entropy_loss": 0.0216064453125, + "delta_ref_ppl": -0.1474609375, + "entropy_loss": -0.0654296875, + "epoch": 0.961, + "grad_norm": 0.6828241407478399, + "k1_kl": 0.1474609375, + "k3_kl": 0.11279296875, + "kimi_kl": 0.5, + "learning_rate": 1.95e-08, + "loss": 0.0045, + "ppl": 0.018310546875, + "reward": 0.9948576092720032, + "reward_std": 0.001066698576323688, + "rewards/perpo_ocr_edit_distance_reward": 0.994857668876648, + "step": 4805, + "temperature": 0.9 + }, + { + "advantages": -5.0067901611328125e-06, + "completion_length": 476.0, + "delta_ref_entropy_loss": 0.060302734375, + "delta_ref_ppl": -0.1103515625, + "entropy_loss": -0.10546875, + "epoch": 0.9612, + "grad_norm": 0.7760206333089796, + "k1_kl": 0.11083984375, + "k3_kl": 0.076171875, + "kimi_kl": 0.232421875, + "learning_rate": 1.94e-08, + "loss": 0.003, + "ppl": 0.043212890625, + "reward": 0.9347593784332275, + "reward_std": 0.0016024279175326228, + "rewards/perpo_ocr_edit_distance_reward": 0.9347594380378723, + "step": 4806, + "temperature": 0.9 + }, + { + "advantages": -2.4778502847766504e-05, + "completion_length": 505.0, + "delta_ref_entropy_loss": 0.048095703125, + "delta_ref_ppl": -0.1064453125, + "entropy_loss": -0.20703125, + "epoch": 0.9614, + "grad_norm": 1.2427245889218541, + "k1_kl": 0.10595703125, + "k3_kl": 0.076171875, + "kimi_kl": 0.1865234375, + "learning_rate": 1.93e-08, + "loss": 0.0031, + "ppl": 0.09326171875, + "reward": 0.9338842630386353, + "reward_std": 0.0009309070883318782, + "rewards/perpo_ocr_edit_distance_reward": 0.93388432264328, + "step": 4807, + "temperature": 0.9 + }, + { + "advantages": -1.8460410501575097e-05, + "completion_length": 292.0, + "delta_ref_entropy_loss": 0.00958251953125, + "delta_ref_ppl": -0.181640625, + "entropy_loss": -0.1201171875, + "epoch": 0.9616, + "grad_norm": 2.2849783890870787, + "k1_kl": 0.181640625, + "k3_kl": 0.1337890625, + "kimi_kl": 0.53125, + "learning_rate": 1.9199999999999997e-08, + "loss": 0.0054, + "ppl": 0.049072265625, + "reward": 0.9917887449264526, + "reward_std": 0.003590561915189028, + "rewards/perpo_ocr_edit_distance_reward": 0.9917887449264526, + "step": 4808, + "temperature": 0.9 + }, + { + "advantages": -5.134514594828943e-06, + "completion_length": 199.0, + "delta_ref_entropy_loss": -0.00167083740234375, + "delta_ref_ppl": -0.2001953125, + "entropy_loss": -0.07373046875, + "epoch": 0.9618, + "grad_norm": 1.6472101512034454, + "k1_kl": 0.2001953125, + "k3_kl": 0.166015625, + "kimi_kl": 0.8671875, + "learning_rate": 1.91e-08, + "loss": 0.0067, + "ppl": 0.0224609375, + "reward": 0.9850460290908813, + "reward_std": 0.001555990194901824, + "rewards/perpo_ocr_edit_distance_reward": 0.9850461483001709, + "step": 4809, + "temperature": 0.9 + }, + { + "advantages": -9.0309556981083e-05, + "completion_length": 502.0, + "delta_ref_entropy_loss": 0.04638671875, + "delta_ref_ppl": -0.09765625, + "entropy_loss": -0.07177734375, + "epoch": 0.962, + "grad_norm": 0.5742276811995441, + "k1_kl": 0.09765625, + "k3_kl": 0.06787109375, + "kimi_kl": 0.25390625, + "learning_rate": 1.8999999999999998e-08, + "loss": 0.0028, + "ppl": 0.0277099609375, + "reward": 0.9953945279121399, + "reward_std": 0.0006542521878145635, + "rewards/perpo_ocr_edit_distance_reward": 0.9953945875167847, + "step": 4810, + "temperature": 0.9 + }, + { + "advantages": -4.9693244363879785e-05, + "completion_length": 306.0, + "delta_ref_entropy_loss": 0.09521484375, + "delta_ref_ppl": -0.1572265625, + "entropy_loss": -0.11767578125, + "epoch": 0.9622, + "grad_norm": 0.9391447476944379, + "k1_kl": 0.15625, + "k3_kl": 0.1083984375, + "kimi_kl": 0.326171875, + "learning_rate": 1.89e-08, + "loss": 0.0044, + "ppl": 0.0478515625, + "reward": 0.9898911118507385, + "reward_std": 0.001099690911360085, + "rewards/perpo_ocr_edit_distance_reward": 0.9898911714553833, + "step": 4811, + "temperature": 0.9 + }, + { + "advantages": -1.3394015695666894e-05, + "completion_length": 865.0, + "delta_ref_entropy_loss": -0.044189453125, + "delta_ref_ppl": -0.07861328125, + "entropy_loss": -0.494140625, + "epoch": 0.9624, + "grad_norm": 1.635506286231262, + "k1_kl": 0.07861328125, + "k3_kl": 0.0615234375, + "kimi_kl": 0.1162109375, + "learning_rate": 1.88e-08, + "loss": 0.0025, + "ppl": 0.2197265625, + "reward": 0.7104749083518982, + "reward_std": 0.005625118501484394, + "rewards/perpo_ocr_edit_distance_reward": 0.7104750275611877, + "step": 4812, + "temperature": 0.9 + }, + { + "advantages": -2.767358637356665e-06, + "completion_length": 44.0, + "delta_ref_entropy_loss": 0.0849609375, + "delta_ref_ppl": -0.6015625, + "entropy_loss": -0.306640625, + "epoch": 0.9626, + "grad_norm": 9.883360899184074, + "k1_kl": 0.6015625, + "k3_kl": 0.494140625, + "kimi_kl": 2.09375, + "learning_rate": 1.8700000000000002e-08, + "loss": 0.0198, + "ppl": 0.140625, + "reward": 0.9588289856910706, + "reward_std": 0.018214115872979164, + "rewards/perpo_ocr_edit_distance_reward": 0.9588291049003601, + "step": 4813, + "temperature": 0.9 + }, + { + "advantages": -8.71930842549773e-06, + "completion_length": 283.0, + "delta_ref_entropy_loss": 0.037841796875, + "delta_ref_ppl": -0.13671875, + "entropy_loss": -0.06494140625, + "epoch": 0.9628, + "grad_norm": 1.1237242256551705, + "k1_kl": 0.1357421875, + "k3_kl": 0.1025390625, + "kimi_kl": 0.458984375, + "learning_rate": 1.8599999999999998e-08, + "loss": 0.0041, + "ppl": 0.0208740234375, + "reward": 0.9927341341972351, + "reward_std": 0.0018469489878043532, + "rewards/perpo_ocr_edit_distance_reward": 0.9927341938018799, + "step": 4814, + "temperature": 0.9 + }, + { + "advantages": -8.089202196970291e-07, + "completion_length": 679.0, + "delta_ref_entropy_loss": -0.087890625, + "delta_ref_ppl": -0.078125, + "entropy_loss": -0.26953125, + "epoch": 0.963, + "grad_norm": 1.885398555914181, + "k1_kl": 0.078125, + "k3_kl": 0.0634765625, + "kimi_kl": 0.1328125, + "learning_rate": 1.8499999999999997e-08, + "loss": 0.0025, + "ppl": 0.0947265625, + "reward": 0.5681738257408142, + "reward_std": 0.03671647980809212, + "rewards/perpo_ocr_edit_distance_reward": 0.568173885345459, + "step": 4815, + "temperature": 0.9 + }, + { + "advantages": -1.8732889373040962e-07, + "completion_length": 32.0, + "delta_ref_entropy_loss": -0.2451171875, + "delta_ref_ppl": -0.890625, + "entropy_loss": -0.94140625, + "epoch": 0.9632, + "grad_norm": 17.599904322816506, + "k1_kl": 0.890625, + "k3_kl": 0.86328125, + "kimi_kl": 4.65625, + "learning_rate": 1.84e-08, + "loss": 0.0344, + "ppl": 0.404296875, + "reward": 0.8718445301055908, + "reward_std": 0.17433033883571625, + "rewards/perpo_ocr_edit_distance_reward": 0.8718445897102356, + "step": 4816, + "temperature": 0.9 + }, + { + "advantages": -5.1089696171402466e-06, + "completion_length": 379.0, + "delta_ref_entropy_loss": -0.5859375, + "delta_ref_ppl": -0.140625, + "entropy_loss": -0.9921875, + "epoch": 0.9634, + "grad_norm": 3.961603382077373, + "k1_kl": 0.140625, + "k3_kl": 0.2099609375, + "kimi_kl": 0.494140625, + "learning_rate": 1.83e-08, + "loss": 0.0084, + "ppl": 0.4296875, + "reward": 0.9032307863235474, + "reward_std": 0.013321884907782078, + "rewards/perpo_ocr_edit_distance_reward": 0.9032308459281921, + "step": 4817, + "temperature": 0.9 + }, + { + "advantages": 1.27724248955019e-08, + "completion_length": 780.0, + "delta_ref_entropy_loss": -0.0225830078125, + "delta_ref_ppl": -0.10791015625, + "entropy_loss": -0.314453125, + "epoch": 0.9636, + "grad_norm": 1.3708489917173956, + "k1_kl": 0.10791015625, + "k3_kl": 0.080078125, + "kimi_kl": 0.1982421875, + "learning_rate": 1.82e-08, + "loss": 0.0032, + "ppl": 0.12158203125, + "reward": 0.8737528920173645, + "reward_std": 0.0030538232531398535, + "rewards/perpo_ocr_edit_distance_reward": 0.8737528920173645, + "step": 4818, + "temperature": 0.9 + }, + { + "advantages": -3.065381861233618e-06, + "completion_length": 272.0, + "delta_ref_entropy_loss": -0.0169677734375, + "delta_ref_ppl": -0.1376953125, + "entropy_loss": -0.291015625, + "epoch": 0.9638, + "grad_norm": 2.1262942830995013, + "k1_kl": 0.1376953125, + "k3_kl": 0.126953125, + "kimi_kl": 0.30859375, + "learning_rate": 1.81e-08, + "loss": 0.0051, + "ppl": 0.1162109375, + "reward": 0.8746410608291626, + "reward_std": 0.008194733411073685, + "rewards/perpo_ocr_edit_distance_reward": 0.8746411800384521, + "step": 4819, + "temperature": 0.9 + }, + { + "advantages": -1.0899135531872162e-06, + "completion_length": 373.0, + "delta_ref_entropy_loss": -0.0240478515625, + "delta_ref_ppl": -0.134765625, + "entropy_loss": -0.68359375, + "epoch": 0.964, + "grad_norm": 3.397121583280644, + "k1_kl": 0.1337890625, + "k3_kl": 0.1337890625, + "kimi_kl": 0.3125, + "learning_rate": 1.8e-08, + "loss": 0.0054, + "ppl": 0.33984375, + "reward": 0.7661926746368408, + "reward_std": 0.007725853938609362, + "rewards/perpo_ocr_edit_distance_reward": 0.7661927342414856, + "step": 4820, + "temperature": 0.9 + }, + { + "advantages": 5.500657607626636e-06, + "completion_length": 426.0, + "delta_ref_entropy_loss": -0.0025177001953125, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.27734375, + "epoch": 0.9642, + "grad_norm": 1.7534309047725989, + "k1_kl": 0.06591796875, + "k3_kl": 0.045654296875, + "kimi_kl": 0.1044921875, + "learning_rate": 1.7899999999999998e-08, + "loss": 0.0018, + "ppl": 0.11376953125, + "reward": 0.8426135182380676, + "reward_std": 0.003001247067004442, + "rewards/perpo_ocr_edit_distance_reward": 0.8426135182380676, + "step": 4821, + "temperature": 0.9 + }, + { + "advantages": -4.879066182184033e-06, + "completion_length": 811.0, + "delta_ref_entropy_loss": 0.0003147125244140625, + "delta_ref_ppl": -0.03466796875, + "entropy_loss": -0.038330078125, + "epoch": 0.9644, + "grad_norm": 0.694266614877386, + "k1_kl": 0.03466796875, + "k3_kl": 0.0247802734375, + "kimi_kl": 0.076171875, + "learning_rate": 1.78e-08, + "loss": 0.001, + "ppl": 0.0147705078125, + "reward": 0.9821239709854126, + "reward_std": 0.005129027180373669, + "rewards/perpo_ocr_edit_distance_reward": 0.9821240901947021, + "step": 4822, + "temperature": 0.9 + }, + { + "advantages": -0.00018213476869277656, + "completion_length": 733.0, + "delta_ref_entropy_loss": 0.018798828125, + "delta_ref_ppl": -0.048095703125, + "entropy_loss": -0.051513671875, + "epoch": 0.9646, + "grad_norm": 0.5362127535198213, + "k1_kl": 0.048095703125, + "k3_kl": 0.031005859375, + "kimi_kl": 0.09619140625, + "learning_rate": 1.77e-08, + "loss": 0.0014, + "ppl": 0.0184326171875, + "reward": 0.9991034269332886, + "reward_std": 0.00036747282138094306, + "rewards/perpo_ocr_edit_distance_reward": 0.9991035461425781, + "step": 4823, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 1146.0, + "delta_ref_entropy_loss": -0.72265625, + "delta_ref_ppl": -0.5078125, + "entropy_loss": -1.5859375, + "epoch": 0.9648, + "grad_norm": 23.128521820821568, + "k1_kl": 0.5078125, + "k3_kl": 0.8515625, + "kimi_kl": 1.8046875, + "learning_rate": 1.76e-08, + "loss": 0.034, + "ppl": 0.6796875, + "reward": 0.531665027141571, + "reward_std": 0.30596187710762024, + "rewards/perpo_ocr_edit_distance_reward": 0.531665027141571, + "step": 4824, + "temperature": 0.9 + }, + { + "advantages": 1.2883118870377075e-05, + "completion_length": 430.0, + "delta_ref_entropy_loss": 0.01171875, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.03857421875, + "epoch": 0.965, + "grad_norm": 0.4664044268610837, + "k1_kl": 0.06591796875, + "k3_kl": 0.050048828125, + "kimi_kl": 0.1953125, + "learning_rate": 1.75e-08, + "loss": 0.002, + "ppl": 0.01190185546875, + "reward": 0.9977391362190247, + "reward_std": 0.0005607881466858089, + "rewards/perpo_ocr_edit_distance_reward": 0.9977391362190247, + "step": 4825, + "temperature": 0.9 + }, + { + "advantages": -7.169587661337573e-06, + "completion_length": 79.0, + "delta_ref_entropy_loss": -0.004608154296875, + "delta_ref_ppl": -0.396484375, + "entropy_loss": -0.18359375, + "epoch": 0.9652, + "grad_norm": 2.612117927296269, + "k1_kl": 0.39453125, + "k3_kl": 0.341796875, + "kimi_kl": 1.40625, + "learning_rate": 1.7399999999999997e-08, + "loss": 0.0137, + "ppl": 0.07470703125, + "reward": 0.9208972454071045, + "reward_std": 0.0034533573780208826, + "rewards/perpo_ocr_edit_distance_reward": 0.9208973050117493, + "step": 4826, + "temperature": 0.9 + }, + { + "advantages": -2.3441656594513915e-05, + "completion_length": 532.0, + "delta_ref_entropy_loss": -0.038330078125, + "delta_ref_ppl": -0.0625, + "entropy_loss": -0.1572265625, + "epoch": 0.9654, + "grad_norm": 0.7894176061766125, + "k1_kl": 0.0625, + "k3_kl": 0.05078125, + "kimi_kl": 0.1337890625, + "learning_rate": 1.73e-08, + "loss": 0.0021, + "ppl": 0.04248046875, + "reward": 0.9706426858901978, + "reward_std": 0.0031687722075730562, + "rewards/perpo_ocr_edit_distance_reward": 0.9706428050994873, + "step": 4827, + "temperature": 0.9 + }, + { + "advantages": 1.2738364603137597e-05, + "completion_length": 799.0, + "delta_ref_entropy_loss": 0.0194091796875, + "delta_ref_ppl": -0.05712890625, + "entropy_loss": -0.05859375, + "epoch": 0.9656, + "grad_norm": 0.47866687587540585, + "k1_kl": 0.057373046875, + "k3_kl": 0.03564453125, + "kimi_kl": 0.10400390625, + "learning_rate": 1.72e-08, + "loss": 0.0014, + "ppl": 0.0167236328125, + "reward": 0.9963845014572144, + "reward_std": 0.0019051439594477415, + "rewards/perpo_ocr_edit_distance_reward": 0.9963845014572144, + "step": 4828, + "temperature": 0.9 + }, + { + "advantages": -1.629761391086504e-05, + "completion_length": 857.0, + "delta_ref_entropy_loss": 0.00732421875, + "delta_ref_ppl": -0.047607421875, + "entropy_loss": -0.055908203125, + "epoch": 0.9658, + "grad_norm": 0.43910420118114074, + "k1_kl": 0.047607421875, + "k3_kl": 0.030029296875, + "kimi_kl": 0.0712890625, + "learning_rate": 1.71e-08, + "loss": 0.0012, + "ppl": 0.016845703125, + "reward": 0.9942271709442139, + "reward_std": 0.0009465317125432193, + "rewards/perpo_ocr_edit_distance_reward": 0.9942272901535034, + "step": 4829, + "temperature": 0.9 + }, + { + "advantages": 5.960464477539062e-07, + "completion_length": 923.0, + "delta_ref_entropy_loss": -0.0030670166015625, + "delta_ref_ppl": -0.054931640625, + "entropy_loss": -0.08349609375, + "epoch": 0.966, + "grad_norm": 0.5637353317832279, + "k1_kl": 0.0546875, + "k3_kl": 0.036376953125, + "kimi_kl": 0.1015625, + "learning_rate": 1.7e-08, + "loss": 0.0015, + "ppl": 0.02734375, + "reward": 0.9833143949508667, + "reward_std": 0.028032569214701653, + "rewards/perpo_ocr_edit_distance_reward": 0.9833143949508667, + "step": 4830, + "temperature": 0.9 + }, + { + "advantages": -5.960464477539062e-07, + "completion_length": 599.0, + "delta_ref_entropy_loss": -0.10986328125, + "delta_ref_ppl": -0.064453125, + "entropy_loss": -0.2294921875, + "epoch": 0.9662, + "grad_norm": 1.8441494969984706, + "k1_kl": 0.064453125, + "k3_kl": 0.06298828125, + "kimi_kl": 0.185546875, + "learning_rate": 1.69e-08, + "loss": 0.0025, + "ppl": 0.06787109375, + "reward": 0.9880788326263428, + "reward_std": 0.014109667390584946, + "rewards/perpo_ocr_edit_distance_reward": 0.9880788326263428, + "step": 4831, + "temperature": 0.9 + }, + { + "advantages": -1.621246337890625e-05, + "completion_length": 473.0, + "delta_ref_entropy_loss": 0.034423828125, + "delta_ref_ppl": -0.154296875, + "entropy_loss": -0.2333984375, + "epoch": 0.9664, + "grad_norm": 1.4957215123614254, + "k1_kl": 0.154296875, + "k3_kl": 0.11279296875, + "kimi_kl": 0.466796875, + "learning_rate": 1.6799999999999998e-08, + "loss": 0.0045, + "ppl": 0.1044921875, + "reward": 0.8392736911773682, + "reward_std": 0.0025239873211830854, + "rewards/perpo_ocr_edit_distance_reward": 0.8392738103866577, + "step": 4832, + "temperature": 0.9 + }, + { + "advantages": -4.2932377255056053e-05, + "completion_length": 1095.0, + "delta_ref_entropy_loss": 0.00244140625, + "delta_ref_ppl": -0.0260009765625, + "entropy_loss": -0.03662109375, + "epoch": 0.9666, + "grad_norm": 0.23654721111905824, + "k1_kl": 0.0260009765625, + "k3_kl": 0.0181884765625, + "kimi_kl": 0.053466796875, + "learning_rate": 1.6699999999999997e-08, + "loss": 0.0008, + "ppl": 0.00970458984375, + "reward": 0.9885562062263489, + "reward_std": 0.0004952150047756732, + "rewards/perpo_ocr_edit_distance_reward": 0.9885562062263489, + "step": 4833, + "temperature": 0.9 + }, + { + "advantages": -2.895082786835701e-07, + "completion_length": 1874.0, + "delta_ref_entropy_loss": -0.041259765625, + "delta_ref_ppl": -0.0400390625, + "entropy_loss": -0.2890625, + "epoch": 0.9668, + "grad_norm": 1.5491416311824395, + "k1_kl": 0.039794921875, + "k3_kl": 0.047607421875, + "kimi_kl": 0.09716796875, + "learning_rate": 1.66e-08, + "loss": 0.0019, + "ppl": 0.1494140625, + "reward": 0.8640773296356201, + "reward_std": 0.08751475065946579, + "rewards/perpo_ocr_edit_distance_reward": 0.8640774488449097, + "step": 4834, + "temperature": 0.9 + }, + { + "advantages": -0.00010126829874934629, + "completion_length": 647.0, + "delta_ref_entropy_loss": 0.032958984375, + "delta_ref_ppl": -0.07470703125, + "entropy_loss": -0.06103515625, + "epoch": 0.967, + "grad_norm": 0.47096211806738203, + "k1_kl": 0.0751953125, + "k3_kl": 0.0546875, + "kimi_kl": 0.16796875, + "learning_rate": 1.65e-08, + "loss": 0.0023, + "ppl": 0.022705078125, + "reward": 0.9964194297790527, + "reward_std": 0.0004886418464593589, + "rewards/perpo_ocr_edit_distance_reward": 0.9964195489883423, + "step": 4835, + "temperature": 0.9 + }, + { + "advantages": -2.0010131152048416e-07, + "completion_length": 68.0, + "delta_ref_entropy_loss": -1.0, + "delta_ref_ppl": -0.91796875, + "entropy_loss": -2.25, + "epoch": 0.9672, + "grad_norm": 16.868993522850293, + "k1_kl": 0.91796875, + "k3_kl": 0.89453125, + "kimi_kl": 4.53125, + "learning_rate": 1.64e-08, + "loss": 0.0358, + "ppl": 0.98828125, + "reward": 0.2707329988479614, + "reward_std": 0.07544361054897308, + "rewards/perpo_ocr_edit_distance_reward": 0.2707329988479614, + "step": 4836, + "temperature": 0.9 + }, + { + "advantages": -9.730884630698711e-05, + "completion_length": 620.0, + "delta_ref_entropy_loss": 0.0030059814453125, + "delta_ref_ppl": -0.072265625, + "entropy_loss": -0.07568359375, + "epoch": 0.9674, + "grad_norm": 0.577869054549125, + "k1_kl": 0.072265625, + "k3_kl": 0.0556640625, + "kimi_kl": 0.2353515625, + "learning_rate": 1.6299999999999997e-08, + "loss": 0.0023, + "ppl": 0.0223388671875, + "reward": 0.972786545753479, + "reward_std": 0.0005124909221194685, + "rewards/perpo_ocr_edit_distance_reward": 0.9727866649627686, + "step": 4837, + "temperature": 0.9 + }, + { + "advantages": -0.00014076914521865547, + "completion_length": 527.0, + "delta_ref_entropy_loss": 0.038330078125, + "delta_ref_ppl": -0.05224609375, + "entropy_loss": -0.040771484375, + "epoch": 0.9676, + "grad_norm": 0.4886623898267808, + "k1_kl": 0.05224609375, + "k3_kl": 0.03369140625, + "kimi_kl": 0.1171875, + "learning_rate": 1.62e-08, + "loss": 0.0015, + "ppl": 0.0128173828125, + "reward": 0.9939718246459961, + "reward_std": 0.00044436188181862235, + "rewards/perpo_ocr_edit_distance_reward": 0.9939718246459961, + "step": 4838, + "temperature": 0.9 + }, + { + "advantages": 2.384185791015625e-07, + "completion_length": 735.0, + "delta_ref_entropy_loss": 0.035400390625, + "delta_ref_ppl": -0.10546875, + "entropy_loss": -0.1943359375, + "epoch": 0.9678, + "grad_norm": 2.3806168116043516, + "k1_kl": 0.10546875, + "k3_kl": 0.07080078125, + "kimi_kl": 0.17578125, + "learning_rate": 1.61e-08, + "loss": 0.0028, + "ppl": 0.08349609375, + "reward": 0.6063550114631653, + "reward_std": 0.10854823142290115, + "rewards/perpo_ocr_edit_distance_reward": 0.6063550114631653, + "step": 4839, + "temperature": 0.9 + }, + { + "advantages": -6.215966277522966e-05, + "completion_length": 946.0, + "delta_ref_entropy_loss": 0.029541015625, + "delta_ref_ppl": -0.05517578125, + "entropy_loss": -0.055908203125, + "epoch": 0.968, + "grad_norm": 0.38036430899056406, + "k1_kl": 0.054931640625, + "k3_kl": 0.037109375, + "kimi_kl": 0.09716796875, + "learning_rate": 1.6e-08, + "loss": 0.0015, + "ppl": 0.023193359375, + "reward": 0.9974266886711121, + "reward_std": 0.00044797727605327964, + "rewards/perpo_ocr_edit_distance_reward": 0.9974266886711121, + "step": 4840, + "temperature": 0.9 + }, + { + "advantages": -4.104205800103955e-06, + "completion_length": 721.0, + "delta_ref_entropy_loss": 0.031494140625, + "delta_ref_ppl": -0.08740234375, + "entropy_loss": -0.0966796875, + "epoch": 0.9682, + "grad_norm": 0.7717224328712566, + "k1_kl": 0.0869140625, + "k3_kl": 0.058837890625, + "kimi_kl": 0.1552734375, + "learning_rate": 1.59e-08, + "loss": 0.0024, + "ppl": 0.0361328125, + "reward": 0.9723069667816162, + "reward_std": 0.004038581158965826, + "rewards/perpo_ocr_edit_distance_reward": 0.972307026386261, + "step": 4841, + "temperature": 0.9 + }, + { + "advantages": -1.6944750313996337e-06, + "completion_length": 179.0, + "delta_ref_entropy_loss": -0.078125, + "delta_ref_ppl": -0.224609375, + "entropy_loss": -0.31640625, + "epoch": 0.9684, + "grad_norm": 4.928233799093009, + "k1_kl": 0.224609375, + "k3_kl": 0.189453125, + "kimi_kl": 0.703125, + "learning_rate": 1.58e-08, + "loss": 0.0076, + "ppl": 0.11572265625, + "reward": 0.7049298286437988, + "reward_std": 0.03028051182627678, + "rewards/perpo_ocr_edit_distance_reward": 0.7049298882484436, + "step": 4842, + "temperature": 0.9 + }, + { + "advantages": -1.087359032680979e-05, + "completion_length": 480.0, + "delta_ref_entropy_loss": -0.057861328125, + "delta_ref_ppl": -0.07861328125, + "entropy_loss": -0.2255859375, + "epoch": 0.9686, + "grad_norm": 1.0218585765356991, + "k1_kl": 0.07861328125, + "k3_kl": 0.06201171875, + "kimi_kl": 0.2060546875, + "learning_rate": 1.5699999999999998e-08, + "loss": 0.0025, + "ppl": 0.06640625, + "reward": 0.9858790040016174, + "reward_std": 0.006161693949252367, + "rewards/perpo_ocr_edit_distance_reward": 0.9858790636062622, + "step": 4843, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 466.0, + "delta_ref_entropy_loss": 0.023193359375, + "delta_ref_ppl": -0.06982421875, + "entropy_loss": -0.054443359375, + "epoch": 0.9688, + "grad_norm": 0.6948196729672661, + "k1_kl": 0.0693359375, + "k3_kl": 0.04931640625, + "kimi_kl": 0.1728515625, + "learning_rate": 1.5599999999999997e-08, + "loss": 0.002, + "ppl": 0.021240234375, + "reward": 0.9339712858200073, + "reward_std": 0.0011050065513700247, + "rewards/perpo_ocr_edit_distance_reward": 0.9339712858200073, + "step": 4844, + "temperature": 0.9 + }, + { + "advantages": -1.762594592946698e-06, + "completion_length": 633.0, + "delta_ref_entropy_loss": -0.1630859375, + "delta_ref_ppl": -0.09521484375, + "entropy_loss": -0.392578125, + "epoch": 0.969, + "grad_norm": 2.3674637136957766, + "k1_kl": 0.09521484375, + "k3_kl": 0.08837890625, + "kimi_kl": 0.2216796875, + "learning_rate": 1.55e-08, + "loss": 0.0035, + "ppl": 0.15625, + "reward": 0.8412438035011292, + "reward_std": 0.009514009580016136, + "rewards/perpo_ocr_edit_distance_reward": 0.8412438631057739, + "step": 4845, + "temperature": 0.9 + }, + { + "advantages": -4.257474728319721e-08, + "completion_length": 551.0, + "delta_ref_entropy_loss": -0.408203125, + "delta_ref_ppl": -0.01312255859375, + "entropy_loss": -0.7421875, + "epoch": 0.9692, + "grad_norm": 9.073862681966745, + "k1_kl": 0.01409912109375, + "k3_kl": 0.07080078125, + "kimi_kl": 0.1474609375, + "learning_rate": 1.54e-08, + "loss": 0.0028, + "ppl": 0.36328125, + "reward": 0.7637723088264465, + "reward_std": 0.22112619876861572, + "rewards/perpo_ocr_edit_distance_reward": 0.7637723684310913, + "step": 4846, + "temperature": 0.9 + }, + { + "advantages": -6.641660661443893e-07, + "completion_length": 112.0, + "delta_ref_entropy_loss": 0.0303955078125, + "delta_ref_ppl": -0.39453125, + "entropy_loss": -0.466796875, + "epoch": 0.9694, + "grad_norm": 5.091292816227462, + "k1_kl": 0.392578125, + "k3_kl": 0.33203125, + "kimi_kl": 1.109375, + "learning_rate": 1.5299999999999998e-08, + "loss": 0.0133, + "ppl": 0.1904296875, + "reward": 0.9012044072151184, + "reward_std": 0.0513792410492897, + "rewards/perpo_ocr_edit_distance_reward": 0.9012044072151184, + "step": 4847, + "temperature": 0.9 + }, + { + "advantages": -8.514949634275126e-09, + "completion_length": 668.0, + "delta_ref_entropy_loss": 0.01251220703125, + "delta_ref_ppl": -0.045166015625, + "entropy_loss": -0.0458984375, + "epoch": 0.9696, + "grad_norm": 0.40977647377030707, + "k1_kl": 0.045166015625, + "k3_kl": 0.0286865234375, + "kimi_kl": 0.05859375, + "learning_rate": 1.52e-08, + "loss": 0.0012, + "ppl": 0.01434326171875, + "reward": 0.9987079501152039, + "reward_std": 0.0004800396563950926, + "rewards/perpo_ocr_edit_distance_reward": 0.9987080693244934, + "step": 4848, + "temperature": 0.9 + }, + { + "advantages": -8.393612006329931e-06, + "completion_length": 1278.0, + "delta_ref_entropy_loss": 0.00848388671875, + "delta_ref_ppl": -0.04248046875, + "entropy_loss": -0.044677734375, + "epoch": 0.9698, + "grad_norm": 0.7713142889084844, + "k1_kl": 0.04248046875, + "k3_kl": 0.0281982421875, + "kimi_kl": 0.080078125, + "learning_rate": 1.51e-08, + "loss": 0.0011, + "ppl": 0.01495361328125, + "reward": 0.9983149766921997, + "reward_std": 0.0009156165760941803, + "rewards/perpo_ocr_edit_distance_reward": 0.9983150362968445, + "step": 4849, + "temperature": 0.9 + }, + { + "advantages": -7.115092012099922e-05, + "completion_length": 414.0, + "delta_ref_entropy_loss": 0.02685546875, + "delta_ref_ppl": -0.06103515625, + "entropy_loss": -0.0654296875, + "epoch": 0.97, + "grad_norm": 0.9792609793166379, + "k1_kl": 0.06103515625, + "k3_kl": 0.039306640625, + "kimi_kl": 0.10302734375, + "learning_rate": 1.5e-08, + "loss": 0.0016, + "ppl": 0.0223388671875, + "reward": 0.9972831606864929, + "reward_std": 0.0008572475053369999, + "rewards/perpo_ocr_edit_distance_reward": 0.9972831606864929, + "step": 4850, + "temperature": 0.9 + }, + { + "advantages": -5.449567765936081e-07, + "completion_length": 623.0, + "delta_ref_entropy_loss": 0.08544921875, + "delta_ref_ppl": -0.103515625, + "entropy_loss": -0.390625, + "epoch": 0.9702, + "grad_norm": 2.622513471389099, + "k1_kl": 0.103515625, + "k3_kl": 0.0673828125, + "kimi_kl": 0.12890625, + "learning_rate": 1.49e-08, + "loss": 0.0027, + "ppl": 0.1806640625, + "reward": 0.9000054001808167, + "reward_std": 0.015390748158097267, + "rewards/perpo_ocr_edit_distance_reward": 0.9000054001808167, + "step": 4851, + "temperature": 0.9 + }, + { + "advantages": -3.424712849664502e-05, + "completion_length": 1001.0, + "delta_ref_entropy_loss": -0.01611328125, + "delta_ref_ppl": -0.0255126953125, + "entropy_loss": -0.057373046875, + "epoch": 0.9704, + "grad_norm": 0.40877816446861015, + "k1_kl": 0.0255126953125, + "k3_kl": 0.0242919921875, + "kimi_kl": 0.0654296875, + "learning_rate": 1.48e-08, + "loss": 0.001, + "ppl": 0.0166015625, + "reward": 0.9771004915237427, + "reward_std": 0.001142743625678122, + "rewards/perpo_ocr_edit_distance_reward": 0.9771005511283875, + "step": 4852, + "temperature": 0.9 + }, + { + "advantages": -9.896925621433184e-05, + "completion_length": 445.0, + "delta_ref_entropy_loss": 0.0213623046875, + "delta_ref_ppl": -0.03662109375, + "entropy_loss": -0.036376953125, + "epoch": 0.9706, + "grad_norm": 0.3840279667344548, + "k1_kl": 0.036376953125, + "k3_kl": 0.025146484375, + "kimi_kl": 0.0869140625, + "learning_rate": 1.47e-08, + "loss": 0.0011, + "ppl": 0.01385498046875, + "reward": 0.9937853217124939, + "reward_std": 0.00041626827442087233, + "rewards/perpo_ocr_edit_distance_reward": 0.9937854409217834, + "step": 4853, + "temperature": 0.9 + }, + { + "advantages": -3.405979782655777e-07, + "completion_length": 963.0, + "delta_ref_entropy_loss": -0.00139617919921875, + "delta_ref_ppl": -0.08349609375, + "entropy_loss": -0.181640625, + "epoch": 0.9708, + "grad_norm": 1.5537051326744018, + "k1_kl": 0.083984375, + "k3_kl": 0.060302734375, + "kimi_kl": 0.1416015625, + "learning_rate": 1.46e-08, + "loss": 0.0024, + "ppl": 0.08203125, + "reward": 0.7833166122436523, + "reward_std": 0.295852392911911, + "rewards/perpo_ocr_edit_distance_reward": 0.7833166718482971, + "step": 4854, + "temperature": 0.9 + }, + { + "advantages": -5.3456853493116796e-05, + "completion_length": 397.0, + "delta_ref_entropy_loss": 0.0322265625, + "delta_ref_ppl": -0.099609375, + "entropy_loss": -0.06689453125, + "epoch": 0.971, + "grad_norm": 0.7277314322633746, + "k1_kl": 0.099609375, + "k3_kl": 0.0751953125, + "kimi_kl": 0.28125, + "learning_rate": 1.45e-08, + "loss": 0.0031, + "ppl": 0.0234375, + "reward": 0.9977575540542603, + "reward_std": 0.0008555098320357502, + "rewards/perpo_ocr_edit_distance_reward": 0.9977576732635498, + "step": 4855, + "temperature": 0.9 + }, + { + "advantages": -8.685248644724197e-07, + "completion_length": 1503.0, + "delta_ref_entropy_loss": -0.05078125, + "delta_ref_ppl": -0.0152587890625, + "entropy_loss": -0.08837890625, + "epoch": 0.9712, + "grad_norm": 0.40150463437224, + "k1_kl": 0.01531982421875, + "k3_kl": 0.01611328125, + "kimi_kl": 0.04296875, + "learning_rate": 1.4399999999999998e-08, + "loss": 0.0006, + "ppl": 0.01806640625, + "reward": 0.9083457589149475, + "reward_std": 0.08911018818616867, + "rewards/perpo_ocr_edit_distance_reward": 0.9083458185195923, + "step": 4856, + "temperature": 0.9 + }, + { + "advantages": -7.987022399902344e-05, + "completion_length": 871.0, + "delta_ref_entropy_loss": 0.0179443359375, + "delta_ref_ppl": -0.05029296875, + "entropy_loss": -0.0517578125, + "epoch": 0.9714, + "grad_norm": 0.9672836521470767, + "k1_kl": 0.05029296875, + "k3_kl": 0.03662109375, + "kimi_kl": 0.11328125, + "learning_rate": 1.4299999999999999e-08, + "loss": 0.0015, + "ppl": 0.0198974609375, + "reward": 0.9852850437164307, + "reward_std": 0.0005397710483521223, + "rewards/perpo_ocr_edit_distance_reward": 0.9852851629257202, + "step": 4857, + "temperature": 0.9 + }, + { + "advantages": -2.4250575734185986e-05, + "completion_length": 1654.0, + "delta_ref_entropy_loss": -0.01409912109375, + "delta_ref_ppl": -0.043701171875, + "entropy_loss": -0.146484375, + "epoch": 0.9716, + "grad_norm": 3.1333874986255763, + "k1_kl": 0.043701171875, + "k3_kl": 0.03955078125, + "kimi_kl": 0.07470703125, + "learning_rate": 1.42e-08, + "loss": 0.0016, + "ppl": 0.06591796875, + "reward": 0.9543145298957825, + "reward_std": 0.0020077351946383715, + "rewards/perpo_ocr_edit_distance_reward": 0.9543145895004272, + "step": 4858, + "temperature": 0.9 + }, + { + "advantages": -5.619866669803741e-07, + "completion_length": 816.0, + "delta_ref_entropy_loss": -0.047607421875, + "delta_ref_ppl": -0.05859375, + "entropy_loss": -0.30859375, + "epoch": 0.9718, + "grad_norm": 2.0875534896532555, + "k1_kl": 0.058349609375, + "k3_kl": 0.053466796875, + "kimi_kl": 0.0859375, + "learning_rate": 1.4099999999999999e-08, + "loss": 0.0021, + "ppl": 0.1416015625, + "reward": 0.9437242150306702, + "reward_std": 0.047435928136110306, + "rewards/perpo_ocr_edit_distance_reward": 0.9437242746353149, + "step": 4859, + "temperature": 0.9 + }, + { + "advantages": -1.093319497158518e-05, + "completion_length": 402.0, + "delta_ref_entropy_loss": 0.0118408203125, + "delta_ref_ppl": -0.046875, + "entropy_loss": -0.04345703125, + "epoch": 0.972, + "grad_norm": 0.43435205423548634, + "k1_kl": 0.046875, + "k3_kl": 0.032958984375, + "kimi_kl": 0.12109375, + "learning_rate": 1.4e-08, + "loss": 0.0013, + "ppl": 0.01409912109375, + "reward": 0.9954063892364502, + "reward_std": 0.0006795605877414346, + "rewards/perpo_ocr_edit_distance_reward": 0.995406448841095, + "step": 4860, + "temperature": 0.9 + }, + { + "advantages": -1.532690987460228e-07, + "completion_length": 204.0, + "delta_ref_entropy_loss": -0.90234375, + "delta_ref_ppl": -0.2353515625, + "entropy_loss": -1.640625, + "epoch": 0.9722, + "grad_norm": 8.525972395525784, + "k1_kl": 0.2353515625, + "k3_kl": 0.3671875, + "kimi_kl": 1.0, + "learning_rate": 1.3899999999999999e-08, + "loss": 0.0147, + "ppl": 0.68359375, + "reward": 0.39546576142311096, + "reward_std": 0.20491094887256622, + "rewards/perpo_ocr_edit_distance_reward": 0.39546579122543335, + "step": 4861, + "temperature": 0.9 + }, + { + "advantages": -1.9175666238879785e-05, + "completion_length": 563.0, + "delta_ref_entropy_loss": 0.03369140625, + "delta_ref_ppl": -0.12109375, + "entropy_loss": -0.16796875, + "epoch": 0.9724, + "grad_norm": 1.1911367597020435, + "k1_kl": 0.12109375, + "k3_kl": 0.0869140625, + "kimi_kl": 0.283203125, + "learning_rate": 1.38e-08, + "loss": 0.0035, + "ppl": 0.072265625, + "reward": 0.9672594666481018, + "reward_std": 0.003454323159530759, + "rewards/perpo_ocr_edit_distance_reward": 0.9672594666481018, + "step": 4862, + "temperature": 0.9 + }, + { + "advantages": -6.931169082236011e-06, + "completion_length": 719.0, + "delta_ref_entropy_loss": 0.044921875, + "delta_ref_ppl": -0.09423828125, + "entropy_loss": -0.34375, + "epoch": 0.9726, + "grad_norm": 2.313419345661002, + "k1_kl": 0.09375, + "k3_kl": 0.0654296875, + "kimi_kl": 0.1455078125, + "learning_rate": 1.37e-08, + "loss": 0.0026, + "ppl": 0.15234375, + "reward": 0.7345044016838074, + "reward_std": 0.0060460250824689865, + "rewards/perpo_ocr_edit_distance_reward": 0.7345044016838074, + "step": 4863, + "temperature": 0.9 + }, + { + "advantages": -2.316066274943296e-05, + "completion_length": 478.0, + "delta_ref_entropy_loss": 0.04443359375, + "delta_ref_ppl": -0.115234375, + "entropy_loss": -0.296875, + "epoch": 0.9728, + "grad_norm": 3.3527069566518666, + "k1_kl": 0.11474609375, + "k3_kl": 0.06689453125, + "kimi_kl": 0.13671875, + "learning_rate": 1.36e-08, + "loss": 0.0027, + "ppl": 0.1455078125, + "reward": 0.7292475700378418, + "reward_std": 0.0017373156733810902, + "rewards/perpo_ocr_edit_distance_reward": 0.7292476296424866, + "step": 4864, + "temperature": 0.9 + }, + { + "advantages": -3.2101361284730956e-05, + "completion_length": 363.0, + "delta_ref_entropy_loss": 0.00836181640625, + "delta_ref_ppl": -0.1025390625, + "entropy_loss": -0.12158203125, + "epoch": 0.973, + "grad_norm": 0.9489865519518832, + "k1_kl": 0.1025390625, + "k3_kl": 0.076171875, + "kimi_kl": 0.29296875, + "learning_rate": 1.3499999999999998e-08, + "loss": 0.0031, + "ppl": 0.05126953125, + "reward": 0.9718513488769531, + "reward_std": 0.001225545653142035, + "rewards/perpo_ocr_edit_distance_reward": 0.9718514084815979, + "step": 4865, + "temperature": 0.9 + }, + { + "advantages": -5.65903537790291e-05, + "completion_length": 660.0, + "delta_ref_entropy_loss": 0.0242919921875, + "delta_ref_ppl": -0.07861328125, + "entropy_loss": -0.05078125, + "epoch": 0.9732, + "grad_norm": 0.5559980035095972, + "k1_kl": 0.0791015625, + "k3_kl": 0.054443359375, + "kimi_kl": 0.1845703125, + "learning_rate": 1.34e-08, + "loss": 0.0022, + "ppl": 0.0196533203125, + "reward": 0.9971141815185547, + "reward_std": 0.0008028308511711657, + "rewards/perpo_ocr_edit_distance_reward": 0.9971142411231995, + "step": 4866, + "temperature": 0.9 + }, + { + "advantages": -1.3462135029840283e-05, + "completion_length": 830.0, + "delta_ref_entropy_loss": 0.01055908203125, + "delta_ref_ppl": -0.053955078125, + "entropy_loss": -0.07958984375, + "epoch": 0.9734, + "grad_norm": 0.9986204191813959, + "k1_kl": 0.053955078125, + "k3_kl": 0.035400390625, + "kimi_kl": 0.08984375, + "learning_rate": 1.3299999999999998e-08, + "loss": 0.0014, + "ppl": 0.03173828125, + "reward": 0.9890687465667725, + "reward_std": 0.001164697576314211, + "rewards/perpo_ocr_edit_distance_reward": 0.9890688061714172, + "step": 4867, + "temperature": 0.9 + }, + { + "advantages": -4.257474817137563e-09, + "completion_length": 493.0, + "delta_ref_entropy_loss": 0.0230712890625, + "delta_ref_ppl": -0.0693359375, + "entropy_loss": -0.173828125, + "epoch": 0.9736, + "grad_norm": 1.2474187836700203, + "k1_kl": 0.06982421875, + "k3_kl": 0.050048828125, + "kimi_kl": 0.119140625, + "learning_rate": 1.3199999999999999e-08, + "loss": 0.002, + "ppl": 0.07470703125, + "reward": 0.9333646893501282, + "reward_std": 0.0013997943606227636, + "rewards/perpo_ocr_edit_distance_reward": 0.933364748954773, + "step": 4868, + "temperature": 0.9 + }, + { + "advantages": -3.297840157756582e-05, + "completion_length": 597.0, + "delta_ref_entropy_loss": 0.00946044921875, + "delta_ref_ppl": -0.047119140625, + "entropy_loss": -0.041015625, + "epoch": 0.9738, + "grad_norm": 0.9231013202789634, + "k1_kl": 0.04736328125, + "k3_kl": 0.03564453125, + "kimi_kl": 0.11279296875, + "learning_rate": 1.31e-08, + "loss": 0.0015, + "ppl": 0.0164794921875, + "reward": 0.9855350255966187, + "reward_std": 0.0019651069305837154, + "rewards/perpo_ocr_edit_distance_reward": 0.9855351448059082, + "step": 4869, + "temperature": 0.9 + }, + { + "advantages": -2.602168569865171e-05, + "completion_length": 417.0, + "delta_ref_entropy_loss": 0.06298828125, + "delta_ref_ppl": -0.119140625, + "entropy_loss": -0.09521484375, + "epoch": 0.974, + "grad_norm": 0.876153794156723, + "k1_kl": 0.119140625, + "k3_kl": 0.08154296875, + "kimi_kl": 0.25390625, + "learning_rate": 1.2999999999999999e-08, + "loss": 0.0033, + "ppl": 0.031982421875, + "reward": 0.9859287142753601, + "reward_std": 0.0018627246608957648, + "rewards/perpo_ocr_edit_distance_reward": 0.9859288334846497, + "step": 4870, + "temperature": 0.9 + }, + { + "advantages": -3.634606400737539e-05, + "completion_length": 556.0, + "delta_ref_entropy_loss": -0.031982421875, + "delta_ref_ppl": -0.09765625, + "entropy_loss": -0.31640625, + "epoch": 0.9742, + "grad_norm": 2.0981549459514137, + "k1_kl": 0.09814453125, + "k3_kl": 0.08935546875, + "kimi_kl": 0.2255859375, + "learning_rate": 1.29e-08, + "loss": 0.0036, + "ppl": 0.1484375, + "reward": 0.9415393471717834, + "reward_std": 0.0020087412558496, + "rewards/perpo_ocr_edit_distance_reward": 0.941539466381073, + "step": 4871, + "temperature": 0.9 + }, + { + "advantages": -1.1750630619644653e-06, + "completion_length": 897.0, + "delta_ref_entropy_loss": 0.036376953125, + "delta_ref_ppl": -0.060791015625, + "entropy_loss": -0.0791015625, + "epoch": 0.9744, + "grad_norm": 0.9403162700146251, + "k1_kl": 0.06103515625, + "k3_kl": 0.034423828125, + "kimi_kl": 0.078125, + "learning_rate": 1.28e-08, + "loss": 0.0014, + "ppl": 0.0284423828125, + "reward": 0.9694606065750122, + "reward_std": 0.042767543345689774, + "rewards/perpo_ocr_edit_distance_reward": 0.9694606065750122, + "step": 4872, + "temperature": 0.9 + }, + { + "advantages": -5.991118450765498e-05, + "completion_length": 613.0, + "delta_ref_entropy_loss": 0.0030670166015625, + "delta_ref_ppl": -0.04443359375, + "entropy_loss": -0.05419921875, + "epoch": 0.9746, + "grad_norm": 0.2827324053900614, + "k1_kl": 0.04443359375, + "k3_kl": 0.032958984375, + "kimi_kl": 0.130859375, + "learning_rate": 1.27e-08, + "loss": 0.0014, + "ppl": 0.0155029296875, + "reward": 0.9966864585876465, + "reward_std": 0.0004686143947765231, + "rewards/perpo_ocr_edit_distance_reward": 0.996686577796936, + "step": 4873, + "temperature": 0.9 + }, + { + "advantages": -6.482857133960351e-05, + "completion_length": 800.0, + "delta_ref_entropy_loss": -0.01092529296875, + "delta_ref_ppl": -0.0634765625, + "entropy_loss": -0.1220703125, + "epoch": 0.9748, + "grad_norm": 0.523050578174914, + "k1_kl": 0.0634765625, + "k3_kl": 0.051025390625, + "kimi_kl": 0.134765625, + "learning_rate": 1.26e-08, + "loss": 0.0021, + "ppl": 0.0390625, + "reward": 0.9954976439476013, + "reward_std": 0.0010820520110428333, + "rewards/perpo_ocr_edit_distance_reward": 0.9954977035522461, + "step": 4874, + "temperature": 0.9 + }, + { + "advantages": 9.332385161542334e-06, + "completion_length": 574.0, + "delta_ref_entropy_loss": 0.0233154296875, + "delta_ref_ppl": -0.103515625, + "entropy_loss": -0.0986328125, + "epoch": 0.975, + "grad_norm": 0.7976599572832339, + "k1_kl": 0.103515625, + "k3_kl": 0.07275390625, + "kimi_kl": 0.265625, + "learning_rate": 1.25e-08, + "loss": 0.0029, + "ppl": 0.031982421875, + "reward": 0.9957021474838257, + "reward_std": 0.000813954568002373, + "rewards/perpo_ocr_edit_distance_reward": 0.9957020878791809, + "step": 4875, + "temperature": 0.9 + }, + { + "advantages": -5.929810868110508e-05, + "completion_length": 1200.0, + "delta_ref_entropy_loss": 0.03515625, + "delta_ref_ppl": -0.058349609375, + "entropy_loss": -0.078125, + "epoch": 0.9752, + "grad_norm": 1.3482218793970315, + "k1_kl": 0.05810546875, + "k3_kl": 0.037109375, + "kimi_kl": 0.07958984375, + "learning_rate": 1.2399999999999999e-08, + "loss": 0.0015, + "ppl": 0.0341796875, + "reward": 0.9514693021774292, + "reward_std": 0.0011921299155801535, + "rewards/perpo_ocr_edit_distance_reward": 0.9514694213867188, + "step": 4876, + "temperature": 0.9 + }, + { + "advantages": -6.560768815688789e-05, + "completion_length": 669.0, + "delta_ref_entropy_loss": 0.0218505859375, + "delta_ref_ppl": -0.0673828125, + "entropy_loss": -0.06591796875, + "epoch": 0.9754, + "grad_norm": 0.31330060745737537, + "k1_kl": 0.0673828125, + "k3_kl": 0.041748046875, + "kimi_kl": 0.140625, + "learning_rate": 1.23e-08, + "loss": 0.0017, + "ppl": 0.02099609375, + "reward": 0.9910632967948914, + "reward_std": 0.0005491121555678546, + "rewards/perpo_ocr_edit_distance_reward": 0.9910632967948914, + "step": 4877, + "temperature": 0.9 + }, + { + "advantages": -5.0246719183633104e-05, + "completion_length": 541.0, + "delta_ref_entropy_loss": 0.01141357421875, + "delta_ref_ppl": -0.06103515625, + "entropy_loss": -0.08447265625, + "epoch": 0.9756, + "grad_norm": 0.4301491415674305, + "k1_kl": 0.060791015625, + "k3_kl": 0.038330078125, + "kimi_kl": 0.1025390625, + "learning_rate": 1.22e-08, + "loss": 0.0016, + "ppl": 0.02880859375, + "reward": 0.9914296865463257, + "reward_std": 0.0005777159822173417, + "rewards/perpo_ocr_edit_distance_reward": 0.9914296865463257, + "step": 4878, + "temperature": 0.9 + }, + { + "advantages": -3.315934372949414e-05, + "completion_length": 1314.0, + "delta_ref_entropy_loss": 0.0164794921875, + "delta_ref_ppl": -0.050048828125, + "entropy_loss": -0.12353515625, + "epoch": 0.9758, + "grad_norm": 3.8052871858387616, + "k1_kl": 0.050048828125, + "k3_kl": 0.09521484375, + "kimi_kl": 0.0751953125, + "learning_rate": 1.2099999999999999e-08, + "loss": 0.0038, + "ppl": 0.060302734375, + "reward": 0.9749112725257874, + "reward_std": 0.0024687768891453743, + "rewards/perpo_ocr_edit_distance_reward": 0.9749112725257874, + "step": 4879, + "temperature": 0.9 + }, + { + "advantages": -6.777900125598535e-06, + "completion_length": 183.0, + "delta_ref_entropy_loss": -0.05029296875, + "delta_ref_ppl": -0.232421875, + "entropy_loss": -0.33984375, + "epoch": 0.976, + "grad_norm": 2.6275543628861704, + "k1_kl": 0.232421875, + "k3_kl": 0.1865234375, + "kimi_kl": 0.69921875, + "learning_rate": 1.2e-08, + "loss": 0.0075, + "ppl": 0.12255859375, + "reward": 0.9484490752220154, + "reward_std": 0.012413082644343376, + "rewards/perpo_ocr_edit_distance_reward": 0.9484491944313049, + "step": 4880, + "temperature": 0.9 + }, + { + "advantages": -1.7029899268550253e-08, + "completion_length": 558.0, + "delta_ref_entropy_loss": 0.0281982421875, + "delta_ref_ppl": -0.11767578125, + "entropy_loss": -0.294921875, + "epoch": 0.9762, + "grad_norm": 1.6639435851641509, + "k1_kl": 0.1181640625, + "k3_kl": 0.0849609375, + "kimi_kl": 0.2216796875, + "learning_rate": 1.19e-08, + "loss": 0.0034, + "ppl": 0.140625, + "reward": 0.6067884564399719, + "reward_std": 0.005008702632039785, + "rewards/perpo_ocr_edit_distance_reward": 0.6067885160446167, + "step": 4881, + "temperature": 0.9 + }, + { + "advantages": -3.405979782655777e-07, + "completion_length": 330.0, + "delta_ref_entropy_loss": -0.6484375, + "delta_ref_ppl": -0.08056640625, + "entropy_loss": -1.3828125, + "epoch": 0.9764, + "grad_norm": 4.721454485236044, + "k1_kl": 0.08154296875, + "k3_kl": 0.2265625, + "kimi_kl": 0.4453125, + "learning_rate": 1.18e-08, + "loss": 0.0091, + "ppl": 0.6484375, + "reward": 0.7753308415412903, + "reward_std": 0.08733765035867691, + "rewards/perpo_ocr_edit_distance_reward": 0.7753308415412903, + "step": 4882, + "temperature": 0.9 + }, + { + "advantages": 1.021793991640152e-06, + "completion_length": 54.0, + "delta_ref_entropy_loss": -0.04248046875, + "delta_ref_ppl": -0.80078125, + "entropy_loss": -0.2734375, + "epoch": 0.9766, + "grad_norm": 3.765382180568089, + "k1_kl": 0.8046875, + "k3_kl": 0.71875, + "kimi_kl": 3.765625, + "learning_rate": 1.17e-08, + "loss": 0.0287, + "ppl": 0.07666015625, + "reward": 0.9724757671356201, + "reward_std": 0.008251039311289787, + "rewards/perpo_ocr_edit_distance_reward": 0.9724757671356201, + "step": 4883, + "temperature": 0.9 + }, + { + "advantages": -6.159714394016191e-05, + "completion_length": 551.0, + "delta_ref_entropy_loss": 0.04638671875, + "delta_ref_ppl": -0.05615234375, + "entropy_loss": -0.130859375, + "epoch": 0.9768, + "grad_norm": 0.7538587643271163, + "k1_kl": 0.05615234375, + "k3_kl": 0.0269775390625, + "kimi_kl": 0.0654296875, + "learning_rate": 1.1599999999999998e-08, + "loss": 0.0011, + "ppl": 0.047607421875, + "reward": 0.6451517939567566, + "reward_std": 0.0008676494471728802, + "rewards/perpo_ocr_edit_distance_reward": 0.6451518535614014, + "step": 4884, + "temperature": 0.9 + }, + { + "advantages": -4.751341748487903e-06, + "completion_length": 1120.0, + "delta_ref_entropy_loss": -0.0576171875, + "delta_ref_ppl": -0.028076171875, + "entropy_loss": -0.12451171875, + "epoch": 0.977, + "grad_norm": 0.5902986533410063, + "k1_kl": 0.0279541015625, + "k3_kl": 0.026611328125, + "kimi_kl": 0.0712890625, + "learning_rate": 1.1499999999999999e-08, + "loss": 0.0011, + "ppl": 0.02978515625, + "reward": 0.9947855472564697, + "reward_std": 0.0034968131221830845, + "rewards/perpo_ocr_edit_distance_reward": 0.994785487651825, + "step": 4885, + "temperature": 0.9 + }, + { + "advantages": -1.0677747013687622e-05, + "completion_length": 205.0, + "delta_ref_entropy_loss": 0.0311279296875, + "delta_ref_ppl": -0.212890625, + "entropy_loss": -0.2060546875, + "epoch": 0.9772, + "grad_norm": 1.9349911567850022, + "k1_kl": 0.2138671875, + "k3_kl": 0.162109375, + "kimi_kl": 0.5859375, + "learning_rate": 1.14e-08, + "loss": 0.0065, + "ppl": 0.0693359375, + "reward": 0.9792274236679077, + "reward_std": 0.0022952419240027666, + "rewards/perpo_ocr_edit_distance_reward": 0.9792274236679077, + "step": 4886, + "temperature": 0.9 + }, + { + "advantages": -5.46659748579259e-06, + "completion_length": 135.0, + "delta_ref_entropy_loss": 0.07763671875, + "delta_ref_ppl": -0.31640625, + "entropy_loss": -0.2119140625, + "epoch": 0.9774, + "grad_norm": 2.7876033030412417, + "k1_kl": 0.31640625, + "k3_kl": 0.2431640625, + "kimi_kl": 0.86328125, + "learning_rate": 1.1299999999999999e-08, + "loss": 0.0097, + "ppl": 0.087890625, + "reward": 0.9773690700531006, + "reward_std": 0.006141523830592632, + "rewards/perpo_ocr_edit_distance_reward": 0.9773691892623901, + "step": 4887, + "temperature": 0.9 + }, + { + "advantages": 1.411778612236958e-05, + "completion_length": 1433.0, + "delta_ref_entropy_loss": 0.0146484375, + "delta_ref_ppl": -0.0576171875, + "entropy_loss": -0.07080078125, + "epoch": 0.9776, + "grad_norm": 1.1843985928962064, + "k1_kl": 0.057373046875, + "k3_kl": 0.039794921875, + "kimi_kl": 0.09521484375, + "learning_rate": 1.12e-08, + "loss": 0.0016, + "ppl": 0.032958984375, + "reward": 0.9789108037948608, + "reward_std": 0.0017093088245019317, + "rewards/perpo_ocr_edit_distance_reward": 0.9789108037948608, + "step": 4888, + "temperature": 0.9 + }, + { + "advantages": -0.00011658669245662168, + "completion_length": 970.0, + "delta_ref_entropy_loss": 0.0299072265625, + "delta_ref_ppl": -0.044189453125, + "entropy_loss": -0.049072265625, + "epoch": 0.9778, + "grad_norm": 0.4021230026188008, + "k1_kl": 0.044189453125, + "k3_kl": 0.0263671875, + "kimi_kl": 0.0703125, + "learning_rate": 1.11e-08, + "loss": 0.0012, + "ppl": 0.015869140625, + "reward": 0.9823773503303528, + "reward_std": 0.0002651455579325557, + "rewards/perpo_ocr_edit_distance_reward": 0.9823773503303528, + "step": 4889, + "temperature": 0.9 + }, + { + "advantages": 8.514949634275126e-09, + "completion_length": 583.0, + "delta_ref_entropy_loss": 0.0111083984375, + "delta_ref_ppl": -0.0576171875, + "entropy_loss": -0.064453125, + "epoch": 0.978, + "grad_norm": 0.34822764911594783, + "k1_kl": 0.0576171875, + "k3_kl": 0.0361328125, + "kimi_kl": 0.099609375, + "learning_rate": 1.1e-08, + "loss": 0.0014, + "ppl": 0.017578125, + "reward": 0.9848806262016296, + "reward_std": 0.000761641189455986, + "rewards/perpo_ocr_edit_distance_reward": 0.9848806262016296, + "step": 4890, + "temperature": 0.9 + }, + { + "advantages": 0.0, + "completion_length": 1080.0, + "delta_ref_entropy_loss": 0.01483154296875, + "delta_ref_ppl": -0.046875, + "entropy_loss": -0.054931640625, + "epoch": 0.9782, + "grad_norm": 0.47943989727032177, + "k1_kl": 0.046875, + "k3_kl": 0.030029296875, + "kimi_kl": 0.08447265625, + "learning_rate": 1.09e-08, + "loss": 0.0012, + "ppl": 0.01611328125, + "reward": 0.9961222410202026, + "reward_std": 0.003848250722512603, + "rewards/perpo_ocr_edit_distance_reward": 0.9961222410202026, + "step": 4891, + "temperature": 0.9 + }, + { + "advantages": -8.895567589206621e-05, + "completion_length": 490.0, + "delta_ref_entropy_loss": 0.018310546875, + "delta_ref_ppl": -0.0634765625, + "entropy_loss": -0.07421875, + "epoch": 0.9784, + "grad_norm": 0.45078784293833124, + "k1_kl": 0.0634765625, + "k3_kl": 0.0439453125, + "kimi_kl": 0.138671875, + "learning_rate": 1.08e-08, + "loss": 0.0018, + "ppl": 0.0206298828125, + "reward": 0.9808589816093445, + "reward_std": 0.0008570887730456889, + "rewards/perpo_ocr_edit_distance_reward": 0.980859100818634, + "step": 4892, + "temperature": 0.9 + }, + { + "advantages": -1.1608005479502026e-05, + "completion_length": 755.0, + "delta_ref_entropy_loss": 0.01446533203125, + "delta_ref_ppl": -0.0830078125, + "entropy_loss": -0.10400390625, + "epoch": 0.9786, + "grad_norm": 3.6212280097130574, + "k1_kl": 0.0830078125, + "k3_kl": 0.0703125, + "kimi_kl": 0.2099609375, + "learning_rate": 1.0699999999999998e-08, + "loss": 0.0028, + "ppl": 0.048828125, + "reward": 0.7252357602119446, + "reward_std": 0.0020989880431443453, + "rewards/perpo_ocr_edit_distance_reward": 0.7252358198165894, + "step": 4893, + "temperature": 0.9 + }, + { + "advantages": 1.8647739125299267e-05, + "completion_length": 531.0, + "delta_ref_entropy_loss": 0.034912109375, + "delta_ref_ppl": -0.08642578125, + "entropy_loss": -0.0791015625, + "epoch": 0.9788, + "grad_norm": 0.7209661963693439, + "k1_kl": 0.08642578125, + "k3_kl": 0.060791015625, + "kimi_kl": 0.2041015625, + "learning_rate": 1.0599999999999999e-08, + "loss": 0.0024, + "ppl": 0.031494140625, + "reward": 0.9518001675605774, + "reward_std": 0.0008127723122015595, + "rewards/perpo_ocr_edit_distance_reward": 0.9518002271652222, + "step": 4894, + "temperature": 0.9 + }, + { + "advantages": 1.2551035979413427e-05, + "completion_length": 653.0, + "delta_ref_entropy_loss": 0.029541015625, + "delta_ref_ppl": -0.0634765625, + "entropy_loss": -0.056396484375, + "epoch": 0.979, + "grad_norm": 0.6620345343335877, + "k1_kl": 0.0634765625, + "k3_kl": 0.03955078125, + "kimi_kl": 0.1171875, + "learning_rate": 1.05e-08, + "loss": 0.0016, + "ppl": 0.0177001953125, + "reward": 0.9967654347419739, + "reward_std": 0.0005786571418866515, + "rewards/perpo_ocr_edit_distance_reward": 0.9967654943466187, + "step": 4895, + "temperature": 0.9 + }, + { + "advantages": -1.8749919036054052e-05, + "completion_length": 144.0, + "delta_ref_entropy_loss": 0.025390625, + "delta_ref_ppl": -0.267578125, + "entropy_loss": -0.1259765625, + "epoch": 0.9792, + "grad_norm": 2.394564402677921, + "k1_kl": 0.267578125, + "k3_kl": 0.2177734375, + "kimi_kl": 1.0625, + "learning_rate": 1.0399999999999999e-08, + "loss": 0.0087, + "ppl": 0.03955078125, + "reward": 0.9944967031478882, + "reward_std": 0.0012629196280613542, + "rewards/perpo_ocr_edit_distance_reward": 0.9944967031478882, + "step": 4896, + "temperature": 0.9 + }, + { + "advantages": -2.963202405226184e-06, + "completion_length": 37.0, + "delta_ref_entropy_loss": 0.031982421875, + "delta_ref_ppl": -0.94140625, + "entropy_loss": -0.2490234375, + "epoch": 0.9794, + "grad_norm": 7.241722671385088, + "k1_kl": 0.9375, + "k3_kl": 0.8125, + "kimi_kl": 5.625, + "learning_rate": 1.03e-08, + "loss": 0.0326, + "ppl": 0.06982421875, + "reward": 0.7692307233810425, + "reward_std": 0.00277570728212595, + "rewards/perpo_ocr_edit_distance_reward": 0.7692307829856873, + "step": 4897, + "temperature": 0.9 + }, + { + "advantages": -8.004052460819366e-07, + "completion_length": 472.0, + "delta_ref_entropy_loss": 0.002227783203125, + "delta_ref_ppl": -0.1416015625, + "entropy_loss": -0.5, + "epoch": 0.9796, + "grad_norm": 2.2831009905980317, + "k1_kl": 0.1416015625, + "k3_kl": 0.10009765625, + "kimi_kl": 0.22265625, + "learning_rate": 1.02e-08, + "loss": 0.004, + "ppl": 0.234375, + "reward": 0.8772953152656555, + "reward_std": 0.010764090344309807, + "rewards/perpo_ocr_edit_distance_reward": 0.8772953748703003, + "step": 4898, + "temperature": 0.9 + }, + { + "advantages": -3.4059798537100505e-08, + "completion_length": 516.0, + "delta_ref_entropy_loss": 0.005645751953125, + "delta_ref_ppl": -0.04638671875, + "entropy_loss": -0.07373046875, + "epoch": 0.9798, + "grad_norm": 0.8551227365388645, + "k1_kl": 0.04638671875, + "k3_kl": 0.034423828125, + "kimi_kl": 0.08154296875, + "learning_rate": 1.01e-08, + "loss": 0.0014, + "ppl": 0.02783203125, + "reward": 0.970079779624939, + "reward_std": 0.0010857514571398497, + "rewards/perpo_ocr_edit_distance_reward": 0.970079779624939, + "step": 4899, + "temperature": 0.9 + }, + { + "advantages": -9.5367431640625e-06, + "completion_length": 41.0, + "delta_ref_entropy_loss": -0.06982421875, + "delta_ref_ppl": -1.0390625, + "entropy_loss": -0.640625, + "epoch": 0.98, + "grad_norm": 9.076976429765875, + "k1_kl": 1.0390625, + "k3_kl": 0.9140625, + "kimi_kl": 4.4375, + "learning_rate": 1e-08, + "loss": 0.0367, + "ppl": 0.2197265625, + "reward": 0.3119911253452301, + "reward_std": 0.003916718065738678, + "rewards/perpo_ocr_edit_distance_reward": 0.3119911551475525, + "step": 4900, + "temperature": 0.9 + }, + { + "advantages": -1.7029899268550253e-08, + "completion_length": 1183.0, + "delta_ref_entropy_loss": -0.002716064453125, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.1357421875, + "epoch": 0.9802, + "grad_norm": 7.131324729434083, + "k1_kl": 0.06591796875, + "k3_kl": 0.049560546875, + "kimi_kl": 0.1318359375, + "learning_rate": 9.900000000000001e-09, + "loss": 0.002, + "ppl": 0.056884765625, + "reward": 0.6930649280548096, + "reward_std": 0.2314814329147339, + "rewards/perpo_ocr_edit_distance_reward": 0.6930649876594543, + "step": 4901, + "temperature": 0.9 + }, + { + "advantages": -6.817920075263828e-05, + "completion_length": 728.0, + "delta_ref_entropy_loss": 0.0228271484375, + "delta_ref_ppl": -0.0419921875, + "entropy_loss": -0.052978515625, + "epoch": 0.9804, + "grad_norm": 0.2930378515481692, + "k1_kl": 0.0419921875, + "k3_kl": 0.0303955078125, + "kimi_kl": 0.0791015625, + "learning_rate": 9.799999999999998e-09, + "loss": 0.0013, + "ppl": 0.01422119140625, + "reward": 0.9974279403686523, + "reward_std": 0.0005243350751698017, + "rewards/perpo_ocr_edit_distance_reward": 0.9974279999732971, + "step": 4902, + "temperature": 0.9 + }, + { + "advantages": 8.250985956692602e-06, + "completion_length": 606.0, + "delta_ref_entropy_loss": -0.01416015625, + "delta_ref_ppl": -0.048828125, + "entropy_loss": -0.1201171875, + "epoch": 0.9806, + "grad_norm": 0.6115371011589369, + "k1_kl": 0.048828125, + "k3_kl": 0.035888671875, + "kimi_kl": 0.11865234375, + "learning_rate": 9.7e-09, + "loss": 0.0014, + "ppl": 0.031005859375, + "reward": 0.8368047475814819, + "reward_std": 0.0009339440148323774, + "rewards/perpo_ocr_edit_distance_reward": 0.8368048071861267, + "step": 4903, + "temperature": 0.9 + }, + { + "advantages": -2.4182456400012597e-05, + "completion_length": 863.0, + "delta_ref_entropy_loss": 0.00445556640625, + "delta_ref_ppl": -0.06787109375, + "entropy_loss": -0.134765625, + "epoch": 0.9808, + "grad_norm": 1.0359571390480007, + "k1_kl": 0.0673828125, + "k3_kl": 0.056884765625, + "kimi_kl": 0.125, + "learning_rate": 9.599999999999998e-09, + "loss": 0.0023, + "ppl": 0.055908203125, + "reward": 0.9527502059936523, + "reward_std": 0.0023663071915507317, + "rewards/perpo_ocr_edit_distance_reward": 0.9527502655982971, + "step": 4904, + "temperature": 0.9 + }, + { + "advantages": -0.00012046099436702207, + "completion_length": 447.0, + "delta_ref_entropy_loss": 0.0322265625, + "delta_ref_ppl": -0.08203125, + "entropy_loss": -0.05322265625, + "epoch": 0.981, + "grad_norm": 0.344944077989771, + "k1_kl": 0.08203125, + "k3_kl": 0.057373046875, + "kimi_kl": 0.1904296875, + "learning_rate": 9.499999999999999e-09, + "loss": 0.0024, + "ppl": 0.0126953125, + "reward": 0.9935832023620605, + "reward_std": 0.0005361250950954854, + "rewards/perpo_ocr_edit_distance_reward": 0.9935833215713501, + "step": 4905, + "temperature": 0.9 + }, + { + "advantages": -4.13571106037125e-05, + "completion_length": 734.0, + "delta_ref_entropy_loss": 0.0260009765625, + "delta_ref_ppl": -0.07373046875, + "entropy_loss": -0.07470703125, + "epoch": 0.9812, + "grad_norm": 0.4073307664322847, + "k1_kl": 0.07373046875, + "k3_kl": 0.048828125, + "kimi_kl": 0.12890625, + "learning_rate": 9.4e-09, + "loss": 0.002, + "ppl": 0.0274658203125, + "reward": 0.976023256778717, + "reward_std": 0.0007236134260892868, + "rewards/perpo_ocr_edit_distance_reward": 0.9760233759880066, + "step": 4906, + "temperature": 0.9 + }, + { + "advantages": -8.479187090415508e-05, + "completion_length": 1216.0, + "delta_ref_entropy_loss": 0.042236328125, + "delta_ref_ppl": -0.06494140625, + "entropy_loss": -0.099609375, + "epoch": 0.9814, + "grad_norm": 0.9475273852830574, + "k1_kl": 0.06494140625, + "k3_kl": 0.044677734375, + "kimi_kl": 0.1494140625, + "learning_rate": 9.299999999999999e-09, + "loss": 0.0019, + "ppl": 0.0458984375, + "reward": 0.9865426421165466, + "reward_std": 0.001105180592276156, + "rewards/perpo_ocr_edit_distance_reward": 0.9865427613258362, + "step": 4907, + "temperature": 0.9 + }, + { + "advantages": -9.196145583700854e-06, + "completion_length": 1226.0, + "delta_ref_entropy_loss": -0.01165771484375, + "delta_ref_ppl": -0.0260009765625, + "entropy_loss": -0.05126953125, + "epoch": 0.9816, + "grad_norm": 0.5708106094739156, + "k1_kl": 0.0260009765625, + "k3_kl": 0.0218505859375, + "kimi_kl": 0.059326171875, + "learning_rate": 9.2e-09, + "loss": 0.0009, + "ppl": 0.0169677734375, + "reward": 0.9899182915687561, + "reward_std": 0.007295165676623583, + "rewards/perpo_ocr_edit_distance_reward": 0.9899183511734009, + "step": 4908, + "temperature": 0.9 + }, + { + "advantages": -4.1808401874732226e-05, + "completion_length": 519.0, + "delta_ref_entropy_loss": 0.03173828125, + "delta_ref_ppl": -0.07666015625, + "entropy_loss": -0.0654296875, + "epoch": 0.9818, + "grad_norm": 0.6557439831151841, + "k1_kl": 0.07666015625, + "k3_kl": 0.0478515625, + "kimi_kl": 0.1484375, + "learning_rate": 9.1e-09, + "loss": 0.002, + "ppl": 0.0174560546875, + "reward": 0.9988938570022583, + "reward_std": 0.0005108598852530122, + "rewards/perpo_ocr_edit_distance_reward": 0.9988939762115479, + "step": 4909, + "temperature": 0.9 + }, + { + "advantages": -6.839207344455644e-05, + "completion_length": 717.0, + "delta_ref_entropy_loss": 0.003631591796875, + "delta_ref_ppl": -0.038818359375, + "entropy_loss": -0.0732421875, + "epoch": 0.982, + "grad_norm": 0.37211668694123673, + "k1_kl": 0.038818359375, + "k3_kl": 0.0272216796875, + "kimi_kl": 0.06591796875, + "learning_rate": 9e-09, + "loss": 0.0012, + "ppl": 0.0269775390625, + "reward": 0.9935833215713501, + "reward_std": 0.0007716740947216749, + "rewards/perpo_ocr_edit_distance_reward": 0.9935834407806396, + "step": 4910, + "temperature": 0.9 + }, + { + "advantages": -1.2636185601877514e-05, + "completion_length": 753.0, + "delta_ref_entropy_loss": 0.018310546875, + "delta_ref_ppl": -0.0625, + "entropy_loss": -0.057861328125, + "epoch": 0.9822, + "grad_norm": 0.5294193635698023, + "k1_kl": 0.062255859375, + "k3_kl": 0.040283203125, + "kimi_kl": 0.10986328125, + "learning_rate": 8.9e-09, + "loss": 0.0016, + "ppl": 0.021240234375, + "reward": 0.9876648187637329, + "reward_std": 0.000573987839743495, + "rewards/perpo_ocr_edit_distance_reward": 0.9876648187637329, + "step": 4911, + "temperature": 0.9 + }, + { + "advantages": 3.6614283089875244e-06, + "completion_length": 570.0, + "delta_ref_entropy_loss": -0.06591796875, + "delta_ref_ppl": -0.0859375, + "entropy_loss": -0.1982421875, + "epoch": 0.9824, + "grad_norm": 0.928027167223826, + "k1_kl": 0.0859375, + "k3_kl": 0.068359375, + "kimi_kl": 0.216796875, + "learning_rate": 8.8e-09, + "loss": 0.0027, + "ppl": 0.0537109375, + "reward": 0.9945095181465149, + "reward_std": 0.002225543837994337, + "rewards/perpo_ocr_edit_distance_reward": 0.9945095777511597, + "step": 4912, + "temperature": 0.9 + }, + { + "advantages": -9.196145356327179e-07, + "completion_length": 225.0, + "delta_ref_entropy_loss": 0.0018157958984375, + "delta_ref_ppl": -0.08935546875, + "entropy_loss": -0.11865234375, + "epoch": 0.9826, + "grad_norm": 1.3813323665983184, + "k1_kl": 0.08935546875, + "k3_kl": 0.06005859375, + "kimi_kl": 0.1572265625, + "learning_rate": 8.699999999999998e-09, + "loss": 0.0024, + "ppl": 0.047607421875, + "reward": 0.9055588245391846, + "reward_std": 0.018414339050650597, + "rewards/perpo_ocr_edit_distance_reward": 0.9055588841438293, + "step": 4913, + "temperature": 0.9 + }, + { + "advantages": 1.6178404393940582e-07, + "completion_length": 468.0, + "delta_ref_entropy_loss": -0.466796875, + "delta_ref_ppl": -0.146484375, + "entropy_loss": -1.4296875, + "epoch": 0.9828, + "grad_norm": 4.096444447706106, + "k1_kl": 0.1474609375, + "k3_kl": 0.19921875, + "kimi_kl": 0.388671875, + "learning_rate": 8.6e-09, + "loss": 0.008, + "ppl": 0.73828125, + "reward": 0.466715008020401, + "reward_std": 0.027521951124072075, + "rewards/perpo_ocr_edit_distance_reward": 0.466715008020401, + "step": 4914, + "temperature": 0.9 + }, + { + "advantages": 1.1580331147342804e-06, + "completion_length": 52.0, + "delta_ref_entropy_loss": -0.00390625, + "delta_ref_ppl": -0.69921875, + "entropy_loss": -0.5078125, + "epoch": 0.983, + "grad_norm": 5.0977749685947265, + "k1_kl": 0.69921875, + "k3_kl": 0.57421875, + "kimi_kl": 2.046875, + "learning_rate": 8.5e-09, + "loss": 0.023, + "ppl": 0.201171875, + "reward": 0.6932772994041443, + "reward_std": 0.014555057510733604, + "rewards/perpo_ocr_edit_distance_reward": 0.6932772994041443, + "step": 4915, + "temperature": 0.9 + }, + { + "advantages": -2.2990363504504785e-05, + "completion_length": 2018.0, + "delta_ref_entropy_loss": -0.005767822265625, + "delta_ref_ppl": -0.0286865234375, + "entropy_loss": -0.0693359375, + "epoch": 0.9832, + "grad_norm": 1.110174317032103, + "k1_kl": 0.0286865234375, + "k3_kl": 0.025634765625, + "kimi_kl": 0.07275390625, + "learning_rate": 8.399999999999999e-09, + "loss": 0.001, + "ppl": 0.0296630859375, + "reward": 0.9865971803665161, + "reward_std": 0.004344272427260876, + "rewards/perpo_ocr_edit_distance_reward": 0.9865972399711609, + "step": 4916, + "temperature": 0.9 + }, + { + "advantages": -1.0456357813382056e-05, + "completion_length": 712.0, + "delta_ref_entropy_loss": 0.01031494140625, + "delta_ref_ppl": -0.052978515625, + "entropy_loss": -0.06005859375, + "epoch": 0.9834, + "grad_norm": 0.5692616344517719, + "k1_kl": 0.05322265625, + "k3_kl": 0.035888671875, + "kimi_kl": 0.09814453125, + "learning_rate": 8.3e-09, + "loss": 0.0014, + "ppl": 0.019775390625, + "reward": 0.9776270389556885, + "reward_std": 0.0015296211931854486, + "rewards/perpo_ocr_edit_distance_reward": 0.9776270985603333, + "step": 4917, + "temperature": 0.9 + }, + { + "advantages": 4.257474817137563e-09, + "completion_length": 697.0, + "delta_ref_entropy_loss": -0.04345703125, + "delta_ref_ppl": -0.053955078125, + "entropy_loss": -0.158203125, + "epoch": 0.9836, + "grad_norm": 1.4804527149381441, + "k1_kl": 0.053955078125, + "k3_kl": 0.040771484375, + "kimi_kl": 0.11962890625, + "learning_rate": 8.2e-09, + "loss": 0.0016, + "ppl": 0.040283203125, + "reward": 0.9609720706939697, + "reward_std": 0.0010454930597916245, + "rewards/perpo_ocr_edit_distance_reward": 0.960972011089325, + "step": 4918, + "temperature": 0.9 + }, + { + "advantages": -9.005410538520664e-05, + "completion_length": 486.0, + "delta_ref_entropy_loss": 0.024658203125, + "delta_ref_ppl": -0.0517578125, + "entropy_loss": -0.06396484375, + "epoch": 0.9838, + "grad_norm": 0.7937872673501071, + "k1_kl": 0.0517578125, + "k3_kl": 0.033447265625, + "kimi_kl": 0.10498046875, + "learning_rate": 8.1e-09, + "loss": 0.0014, + "ppl": 0.0198974609375, + "reward": 0.9960632920265198, + "reward_std": 0.0005620194715447724, + "rewards/perpo_ocr_edit_distance_reward": 0.9960634112358093, + "step": 4919, + "temperature": 0.9 + }, + { + "advantages": -5.832740725963959e-07, + "completion_length": 698.0, + "delta_ref_entropy_loss": 0.017333984375, + "delta_ref_ppl": -0.08447265625, + "entropy_loss": -0.29296875, + "epoch": 0.984, + "grad_norm": 3.1916911094596556, + "k1_kl": 0.083984375, + "k3_kl": 0.0703125, + "kimi_kl": 0.1474609375, + "learning_rate": 8e-09, + "loss": 0.0028, + "ppl": 0.1513671875, + "reward": 0.4039035737514496, + "reward_std": 0.01421936135739088, + "rewards/perpo_ocr_edit_distance_reward": 0.4039035737514496, + "step": 4920, + "temperature": 0.9 + }, + { + "advantages": -2.6379313567304052e-05, + "completion_length": 714.0, + "delta_ref_entropy_loss": 0.0068359375, + "delta_ref_ppl": -0.044677734375, + "entropy_loss": -0.123046875, + "epoch": 0.9842, + "grad_norm": 1.235730663944336, + "k1_kl": 0.044677734375, + "k3_kl": 0.038818359375, + "kimi_kl": 0.09130859375, + "learning_rate": 7.9e-09, + "loss": 0.0016, + "ppl": 0.05908203125, + "reward": 0.9791267514228821, + "reward_std": 0.0018371818587183952, + "rewards/perpo_ocr_edit_distance_reward": 0.9791268110275269, + "step": 4921, + "temperature": 0.9 + }, + { + "advantages": 5.040850282966858e-06, + "completion_length": 714.0, + "delta_ref_entropy_loss": 0.023193359375, + "delta_ref_ppl": -0.0869140625, + "entropy_loss": -0.185546875, + "epoch": 0.9844, + "grad_norm": 1.613928759325363, + "k1_kl": 0.0869140625, + "k3_kl": 0.06005859375, + "kimi_kl": 0.1689453125, + "learning_rate": 7.799999999999999e-09, + "loss": 0.0024, + "ppl": 0.0732421875, + "reward": 0.9142745733261108, + "reward_std": 0.0015965548809617758, + "rewards/perpo_ocr_edit_distance_reward": 0.9142745733261108, + "step": 4922, + "temperature": 0.9 + }, + { + "advantages": -2.1478959752130322e-05, + "completion_length": 1460.0, + "delta_ref_entropy_loss": 0.01300048828125, + "delta_ref_ppl": -0.0322265625, + "entropy_loss": -0.0654296875, + "epoch": 0.9846, + "grad_norm": 0.8560045623264726, + "k1_kl": 0.0322265625, + "k3_kl": 0.0223388671875, + "kimi_kl": 0.051025390625, + "learning_rate": 7.7e-09, + "loss": 0.0009, + "ppl": 0.026611328125, + "reward": 0.9955543875694275, + "reward_std": 0.0018816059455275536, + "rewards/perpo_ocr_edit_distance_reward": 0.9955544471740723, + "step": 4923, + "temperature": 0.9 + }, + { + "advantages": 1.7029899268550253e-08, + "completion_length": 263.0, + "delta_ref_entropy_loss": -1.1875, + "delta_ref_ppl": -0.244140625, + "entropy_loss": -2.09375, + "epoch": 0.9848, + "grad_norm": 18.817988950581277, + "k1_kl": 0.244140625, + "k3_kl": 0.412109375, + "kimi_kl": 1.5078125, + "learning_rate": 7.6e-09, + "loss": 0.0165, + "ppl": 0.83984375, + "reward": 0.41422590613365173, + "reward_std": 0.19542475044727325, + "rewards/perpo_ocr_edit_distance_reward": 0.4142259359359741, + "step": 4924, + "temperature": 0.9 + }, + { + "advantages": -3.4553664590930566e-05, + "completion_length": 1020.0, + "delta_ref_entropy_loss": 0.000766754150390625, + "delta_ref_ppl": -0.045166015625, + "entropy_loss": -0.0849609375, + "epoch": 0.985, + "grad_norm": 0.669299475038017, + "k1_kl": 0.04541015625, + "k3_kl": 0.032470703125, + "kimi_kl": 0.07763671875, + "learning_rate": 7.5e-09, + "loss": 0.0013, + "ppl": 0.0301513671875, + "reward": 0.9854443669319153, + "reward_std": 0.001132476725615561, + "rewards/perpo_ocr_edit_distance_reward": 0.9854443669319153, + "step": 4925, + "temperature": 0.9 + }, + { + "advantages": -7.924011879367754e-05, + "completion_length": 422.0, + "delta_ref_entropy_loss": 0.0390625, + "delta_ref_ppl": -0.08642578125, + "entropy_loss": -0.06494140625, + "epoch": 0.9852, + "grad_norm": 0.6223147549384987, + "k1_kl": 0.08642578125, + "k3_kl": 0.06005859375, + "kimi_kl": 0.177734375, + "learning_rate": 7.4e-09, + "loss": 0.0025, + "ppl": 0.023193359375, + "reward": 0.997769296169281, + "reward_std": 0.0006522925687022507, + "rewards/perpo_ocr_edit_distance_reward": 0.9977694153785706, + "step": 4926, + "temperature": 0.9 + }, + { + "advantages": -4.440546399564482e-05, + "completion_length": 776.0, + "delta_ref_entropy_loss": 0.02734375, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.08984375, + "epoch": 0.9854, + "grad_norm": 0.6188751085435749, + "k1_kl": 0.06591796875, + "k3_kl": 0.03955078125, + "kimi_kl": 0.1064453125, + "learning_rate": 7.3e-09, + "loss": 0.0016, + "ppl": 0.03271484375, + "reward": 0.9925581216812134, + "reward_std": 0.0008585731266066432, + "rewards/perpo_ocr_edit_distance_reward": 0.9925581812858582, + "step": 4927, + "temperature": 0.9 + }, + { + "advantages": -4.83649137095199e-06, + "completion_length": 33.0, + "delta_ref_entropy_loss": -0.10595703125, + "delta_ref_ppl": -0.7734375, + "entropy_loss": -0.5546875, + "epoch": 0.9856, + "grad_norm": 8.536288132648851, + "k1_kl": 0.7734375, + "k3_kl": 0.64453125, + "kimi_kl": 2.921875, + "learning_rate": 7.199999999999999e-09, + "loss": 0.0257, + "ppl": 0.2177734375, + "reward": 0.9700771570205688, + "reward_std": 0.012247657403349876, + "rewards/perpo_ocr_edit_distance_reward": 0.9700772762298584, + "step": 4928, + "temperature": 0.9 + }, + { + "advantages": -0.0001169954048236832, + "completion_length": 522.0, + "delta_ref_entropy_loss": 0.0269775390625, + "delta_ref_ppl": -0.06591796875, + "entropy_loss": -0.07861328125, + "epoch": 0.9858, + "grad_norm": 0.3942575675545791, + "k1_kl": 0.06591796875, + "k3_kl": 0.03955078125, + "kimi_kl": 0.095703125, + "learning_rate": 7.1e-09, + "loss": 0.0017, + "ppl": 0.02685546875, + "reward": 0.994483470916748, + "reward_std": 0.00040949415415525436, + "rewards/perpo_ocr_edit_distance_reward": 0.9944835305213928, + "step": 4929, + "temperature": 0.9 + }, + { + "advantages": -0.00015090193483047187, + "completion_length": 1220.0, + "delta_ref_entropy_loss": 0.01031494140625, + "delta_ref_ppl": -0.0257568359375, + "entropy_loss": -0.052734375, + "epoch": 0.986, + "grad_norm": 0.45895432042178824, + "k1_kl": 0.02587890625, + "k3_kl": 0.0169677734375, + "kimi_kl": 0.041015625, + "learning_rate": 7e-09, + "loss": 0.0008, + "ppl": 0.0198974609375, + "reward": 0.997983455657959, + "reward_std": 0.0006334629724733531, + "rewards/perpo_ocr_edit_distance_reward": 0.9979835748672485, + "step": 4930, + "temperature": 0.9 + }, + { + "advantages": -0.00010907650721492246, + "completion_length": 682.0, + "delta_ref_entropy_loss": 0.00921630859375, + "delta_ref_ppl": -0.03662109375, + "entropy_loss": -0.052978515625, + "epoch": 0.9862, + "grad_norm": 0.5012849193757697, + "k1_kl": 0.03662109375, + "k3_kl": 0.024169921875, + "kimi_kl": 0.0576171875, + "learning_rate": 6.9e-09, + "loss": 0.0011, + "ppl": 0.0167236328125, + "reward": 0.9911759495735168, + "reward_std": 0.0009149561519734561, + "rewards/perpo_ocr_edit_distance_reward": 0.9911761283874512, + "step": 4931, + "temperature": 0.9 + }, + { + "advantages": -5.449567765936081e-07, + "completion_length": 82.0, + "delta_ref_entropy_loss": -0.1748046875, + "delta_ref_ppl": -0.4921875, + "entropy_loss": -0.455078125, + "epoch": 0.9864, + "grad_norm": 6.030779435483899, + "k1_kl": 0.4921875, + "k3_kl": 0.427734375, + "kimi_kl": 1.8046875, + "learning_rate": 6.8e-09, + "loss": 0.0172, + "ppl": 0.1484375, + "reward": 0.5508167147636414, + "reward_std": 0.031581245362758636, + "rewards/perpo_ocr_edit_distance_reward": 0.5508167743682861, + "step": 4932, + "temperature": 0.9 + }, + { + "advantages": -4.022462235298008e-05, + "completion_length": 1210.0, + "delta_ref_entropy_loss": 0.048583984375, + "delta_ref_ppl": -0.06494140625, + "entropy_loss": -0.060791015625, + "epoch": 0.9866, + "grad_norm": 0.8525585448744532, + "k1_kl": 0.06494140625, + "k3_kl": 0.03564453125, + "kimi_kl": 0.08984375, + "learning_rate": 6.7e-09, + "loss": 0.0015, + "ppl": 0.0230712890625, + "reward": 0.9743289351463318, + "reward_std": 0.002229065168648958, + "rewards/perpo_ocr_edit_distance_reward": 0.9743290543556213, + "step": 4933, + "temperature": 0.9 + }, + { + "advantages": -7.853763963794336e-05, + "completion_length": 969.0, + "delta_ref_entropy_loss": 0.0400390625, + "delta_ref_ppl": -0.06298828125, + "entropy_loss": -0.03564453125, + "epoch": 0.9868, + "grad_norm": 0.7892134912832413, + "k1_kl": 0.06298828125, + "k3_kl": 0.039794921875, + "kimi_kl": 0.154296875, + "learning_rate": 6.5999999999999995e-09, + "loss": 0.0017, + "ppl": 0.010498046875, + "reward": 0.9969345331192017, + "reward_std": 0.0004419985634740442, + "rewards/perpo_ocr_edit_distance_reward": 0.9969345927238464, + "step": 4934, + "temperature": 0.9 + }, + { + "advantages": -2.0248549844836816e-05, + "completion_length": 376.0, + "delta_ref_entropy_loss": 0.0289306640625, + "delta_ref_ppl": -0.1357421875, + "entropy_loss": -0.11572265625, + "epoch": 0.987, + "grad_norm": 0.8540430072858375, + "k1_kl": 0.1357421875, + "k3_kl": 0.10693359375, + "kimi_kl": 0.33203125, + "learning_rate": 6.4999999999999995e-09, + "loss": 0.0043, + "ppl": 0.0439453125, + "reward": 0.3366885185241699, + "reward_std": 0.0009499638108536601, + "rewards/perpo_ocr_edit_distance_reward": 0.3366885483264923, + "step": 4935, + "temperature": 0.9 + }, + { + "advantages": -3.2356809242628515e-05, + "completion_length": 1146.0, + "delta_ref_entropy_loss": 0.000522613525390625, + "delta_ref_ppl": -0.02685546875, + "entropy_loss": -0.06103515625, + "epoch": 0.9872, + "grad_norm": 0.37956184367076606, + "k1_kl": 0.0269775390625, + "k3_kl": 0.0167236328125, + "kimi_kl": 0.0390625, + "learning_rate": 6.4e-09, + "loss": 0.0007, + "ppl": 0.0201416015625, + "reward": 0.9955602884292603, + "reward_std": 0.0006896309205330908, + "rewards/perpo_ocr_edit_distance_reward": 0.995560348033905, + "step": 4936, + "temperature": 0.9 + }, + { + "advantages": -1.1248248483752832e-05, + "completion_length": 417.0, + "delta_ref_entropy_loss": 0.0098876953125, + "delta_ref_ppl": -0.06396484375, + "entropy_loss": -0.052001953125, + "epoch": 0.9874, + "grad_norm": 0.5843592363306559, + "k1_kl": 0.06396484375, + "k3_kl": 0.04833984375, + "kimi_kl": 0.158203125, + "learning_rate": 6.3e-09, + "loss": 0.0019, + "ppl": 0.0189208984375, + "reward": 0.9224538803100586, + "reward_std": 0.0021706537809222937, + "rewards/perpo_ocr_edit_distance_reward": 0.9224539399147034, + "step": 4937, + "temperature": 0.9 + }, + { + "advantages": -3.807033863267861e-05, + "completion_length": 431.0, + "delta_ref_entropy_loss": 0.024169921875, + "delta_ref_ppl": -0.06787109375, + "entropy_loss": -0.048828125, + "epoch": 0.9876, + "grad_norm": 0.8454660715808386, + "k1_kl": 0.06787109375, + "k3_kl": 0.048583984375, + "kimi_kl": 0.177734375, "learning_rate": 6.199999999999999e-09, - "loss": 0.0003, - "ppl": 0.0076141357421875, - "reward": 1.0, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 1.0, - "step": 2469, + "loss": 0.002, + "ppl": 0.0164794921875, + "reward": 0.996435821056366, + "reward_std": 0.0010183370904996991, + "rewards/perpo_ocr_edit_distance_reward": 0.9964358806610107, + "step": 4938, + "temperature": 0.9 + }, + { + "advantages": -4.815204192709643e-06, + "completion_length": 107.0, + "delta_ref_entropy_loss": 0.005584716796875, + "delta_ref_ppl": -0.267578125, + "entropy_loss": -0.392578125, + "epoch": 0.9878, + "grad_norm": 6.514786060148732, + "k1_kl": 0.267578125, + "k3_kl": 0.234375, + "kimi_kl": 0.69140625, + "learning_rate": 6.1e-09, + "loss": 0.0094, + "ppl": 0.1552734375, + "reward": 0.771174967288971, + "reward_std": 0.008744637481868267, + "rewards/perpo_ocr_edit_distance_reward": 0.7711750268936157, + "step": 4939, "temperature": 0.9 }, { - "advantages": -4.487378328121849e-05, - "completion_length": 579.5, - "delta_ref_entropy_loss": 0.08837890625, - "delta_ref_ppl": -0.090118408203125, - "entropy_loss": -0.0714111328125, + "advantages": -5.8523248299025e-05, + "completion_length": 1138.0, + "delta_ref_entropy_loss": 0.0025177001953125, + "delta_ref_ppl": -0.0308837890625, + "entropy_loss": -0.080078125, "epoch": 0.988, - "grad_norm": 1.6301731704451004, - "k1_kl": 0.090057373046875, - "k3_kl": 0.0547637939453125, - "kimi_kl": 0.156036376953125, + "grad_norm": 0.6746195463910113, + "k1_kl": 0.031005859375, + "k3_kl": 0.0220947265625, + "kimi_kl": 0.049072265625, "learning_rate": 6e-09, - "loss": 0.0022, - "ppl": 0.03729248046875, - "reward": 0.8841784000396729, - "reward_std": 0.0018256460025440902, - "rewards/perpo_ocr_edit_distance_reward": 0.8841784596443176, - "step": 2470, + "loss": 0.0009, + "ppl": 0.036376953125, + "reward": 0.9930108189582825, + "reward_std": 0.0016460398910567164, + "rewards/perpo_ocr_edit_distance_reward": 0.9930109977722168, + "step": 4940, "temperature": 0.9 }, { - "advantages": -1.5624932984792395e-05, - "completion_length": 637.5, - "delta_ref_entropy_loss": 0.1015625, - "delta_ref_ppl": -0.0616455078125, - "entropy_loss": -0.137451171875, + "advantages": 1.8051692904919037e-06, + "completion_length": 485.0, + "delta_ref_entropy_loss": -0.0703125, + "delta_ref_ppl": -0.130859375, + "entropy_loss": -0.474609375, + "epoch": 0.9882, + "grad_norm": 1.7632806526617502, + "k1_kl": 0.1298828125, + "k3_kl": 0.1142578125, + "kimi_kl": 0.283203125, + "learning_rate": 5.9e-09, + "loss": 0.0046, + "ppl": 0.22265625, + "reward": 0.8433602452278137, + "reward_std": 0.004635846242308617, + "rewards/perpo_ocr_edit_distance_reward": 0.8433603048324585, + "step": 4941, + "temperature": 0.9 + }, + { + "advantages": -4.938671054333099e-07, + "completion_length": 893.0, + "delta_ref_entropy_loss": -0.181640625, + "delta_ref_ppl": -0.041259765625, + "entropy_loss": -0.5390625, "epoch": 0.9884, - "grad_norm": 1.9377584643490655, - "k1_kl": 0.0616455078125, - "k3_kl": 0.03656005859375, - "kimi_kl": 0.078125, + "grad_norm": 3.497706050157619, + "k1_kl": 0.041259765625, + "k3_kl": 0.0634765625, + "kimi_kl": 0.11083984375, "learning_rate": 5.799999999999999e-09, - "loss": 0.0015, - "ppl": 0.0751953125, - "reward": 0.8998611569404602, - "reward_std": 0.00426923471968621, - "rewards/perpo_ocr_edit_distance_reward": 0.8998612463474274, - "step": 2471, + "loss": 0.0025, + "ppl": 0.2373046875, + "reward": 0.9499680995941162, + "reward_std": 0.05239442363381386, + "rewards/perpo_ocr_edit_distance_reward": 0.9499682188034058, + "step": 4942, "temperature": 0.9 }, { - "advantages": -1.545889153931057e-05, - "completion_length": 539.0, - "delta_ref_entropy_loss": 0.0421142578125, - "delta_ref_ppl": -0.0467529296875, - "entropy_loss": -0.04351806640625, + "advantages": -4.679816265706904e-05, + "completion_length": 559.0, + "delta_ref_entropy_loss": 0.04345703125, + "delta_ref_ppl": -0.083984375, + "entropy_loss": -0.09912109375, + "epoch": 0.9886, + "grad_norm": 0.8187613539509702, + "k1_kl": 0.08447265625, + "k3_kl": 0.056640625, + "kimi_kl": 0.1298828125, + "learning_rate": 5.7e-09, + "loss": 0.0023, + "ppl": 0.0380859375, + "reward": 0.9642033576965332, + "reward_std": 0.0009917563293129206, + "rewards/perpo_ocr_edit_distance_reward": 0.9642033576965332, + "step": 4943, + "temperature": 0.9 + }, + { + "advantages": -4.214048749417998e-05, + "completion_length": 598.0, + "delta_ref_entropy_loss": 0.0135498046875, + "delta_ref_ppl": -0.0908203125, + "entropy_loss": -0.0576171875, "epoch": 0.9888, - "grad_norm": 0.7999184091150054, - "k1_kl": 0.046630859375, - "k3_kl": 0.0301513671875, - "kimi_kl": 0.107177734375, + "grad_norm": 0.37386644590239926, + "k1_kl": 0.0908203125, + "k3_kl": 0.07421875, + "kimi_kl": 0.357421875, "learning_rate": 5.6e-09, - "loss": 0.0012, - "ppl": 0.02252197265625, - "reward": 0.9827545881271362, - "reward_std": 0.0021236231550574303, - "rewards/perpo_ocr_edit_distance_reward": 0.9827546179294586, - "step": 2472, + "loss": 0.003, + "ppl": 0.01556396484375, + "reward": 0.9966190457344055, + "reward_std": 0.0005060008843429387, + "rewards/perpo_ocr_edit_distance_reward": 0.9966190457344055, + "step": 4944, "temperature": 0.9 }, { - "advantages": -2.525959826016333e-05, - "completion_length": 808.5, - "delta_ref_entropy_loss": 0.02996826171875, - "delta_ref_ppl": -0.03118896484375, - "entropy_loss": -0.033721923828125, + "advantages": -8.053439523791894e-05, + "completion_length": 1447.0, + "delta_ref_entropy_loss": 0.0250244140625, + "delta_ref_ppl": -0.0625, + "entropy_loss": -0.10595703125, + "epoch": 0.989, + "grad_norm": 0.8672993508618041, + "k1_kl": 0.06298828125, + "k3_kl": 0.041015625, + "kimi_kl": 0.095703125, + "learning_rate": 5.5e-09, + "loss": 0.0017, + "ppl": 0.052490234375, + "reward": 0.9780036807060242, + "reward_std": 0.0005343359080143273, + "rewards/perpo_ocr_edit_distance_reward": 0.9780036807060242, + "step": 4945, + "temperature": 0.9 + }, + { + "advantages": -3.9790360460756347e-05, + "completion_length": 307.0, + "delta_ref_entropy_loss": 0.0250244140625, + "delta_ref_ppl": -0.1357421875, + "entropy_loss": -0.056640625, "epoch": 0.9892, - "grad_norm": 0.5805083481498587, - "k1_kl": 0.0311279296875, - "k3_kl": 0.02484130859375, - "kimi_kl": 0.05609130859375, + "grad_norm": 0.6050428018248364, + "k1_kl": 0.1357421875, + "k3_kl": 0.1064453125, + "kimi_kl": 0.404296875, "learning_rate": 5.4e-09, - "loss": 0.001, - "ppl": 0.020263671875, - "reward": 0.9955495595932007, - "reward_std": 0.0010285105963703245, - "rewards/perpo_ocr_edit_distance_reward": 0.9955496490001678, - "step": 2473, + "loss": 0.0043, + "ppl": 0.015625, + "reward": 0.9968821406364441, + "reward_std": 0.0011835835175588727, + "rewards/perpo_ocr_edit_distance_reward": 0.9968822002410889, + "step": 4946, + "temperature": 0.9 + }, + { + "advantages": -0.0001596212387084961, + "completion_length": 976.0, + "delta_ref_entropy_loss": 0.0289306640625, + "delta_ref_ppl": -0.057373046875, + "entropy_loss": -0.0556640625, + "epoch": 0.9894, + "grad_norm": 0.2945514384225497, + "k1_kl": 0.057373046875, + "k3_kl": 0.038818359375, + "kimi_kl": 0.1328125, + "learning_rate": 5.2999999999999995e-09, + "loss": 0.0017, + "ppl": 0.022216796875, + "reward": 0.9933868646621704, + "reward_std": 0.0002734249283093959, + "rewards/perpo_ocr_edit_distance_reward": 0.99338698387146, + "step": 4947, + "temperature": 0.9 + }, + { + "advantages": -8.497920134686865e-06, + "completion_length": 1773.0, + "delta_ref_entropy_loss": 0.0003490447998046875, + "delta_ref_ppl": -0.03173828125, + "entropy_loss": -0.087890625, + "epoch": 0.9896, + "grad_norm": 0.8046479966082009, + "k1_kl": 0.03173828125, + "k3_kl": 0.0274658203125, + "kimi_kl": 0.055419921875, + "learning_rate": 5.1999999999999994e-09, + "loss": 0.0011, + "ppl": 0.041259765625, + "reward": 0.9903139472007751, + "reward_std": 0.0029045825358480215, + "rewards/perpo_ocr_edit_distance_reward": 0.9903140068054199, + "step": 4948, "temperature": 0.9 }, { - "advantages": -5.2903382311342284e-05, - "completion_length": 1304.5, - "delta_ref_entropy_loss": 0.0245361328125, - "delta_ref_ppl": -0.02056884765625, - "entropy_loss": -0.0347900390625, - "epoch": 0.9896, - "grad_norm": 0.5076979895232903, - "k1_kl": 0.0206298828125, - "k3_kl": 0.0120849609375, - "kimi_kl": 0.0335693359375, - "learning_rate": 5.1999999999999994e-09, - "loss": 0.0005, - "ppl": 0.017364501953125, - "reward": 0.9950035810470581, - "reward_std": 0.001663552291574888, - "rewards/perpo_ocr_edit_distance_reward": 0.9950036406517029, - "step": 2474, + "advantages": -0.00010452952119521797, + "completion_length": 730.0, + "delta_ref_entropy_loss": 0.0235595703125, + "delta_ref_ppl": -0.03857421875, + "entropy_loss": -0.051025390625, + "epoch": 0.9898, + "grad_norm": 0.18874958622297133, + "k1_kl": 0.03857421875, + "k3_kl": 0.019775390625, + "kimi_kl": 0.043701171875, + "learning_rate": 5.1e-09, + "loss": 0.0009, + "ppl": 0.017822265625, + "reward": 0.9987829923629761, + "reward_std": 0.000633139512501657, + "rewards/perpo_ocr_edit_distance_reward": 0.9987831115722656, + "step": 4949, "temperature": 0.9 }, { - "advantages": -4.9046107960748486e-05, - "completion_length": 577.5, - "delta_ref_entropy_loss": 0.03070068359375, - "delta_ref_ppl": -0.03094482421875, - "entropy_loss": -0.02349853515625, + "advantages": -1.3385501006268896e-05, + "completion_length": 503.0, + "delta_ref_entropy_loss": 0.037353515625, + "delta_ref_ppl": -0.0849609375, + "entropy_loss": -0.06494140625, "epoch": 0.99, - "grad_norm": 0.4073807147840045, - "k1_kl": 0.030914306640625, - "k3_kl": 0.020416259765625, - "kimi_kl": 0.077301025390625, + "grad_norm": 0.7433478240195647, + "k1_kl": 0.0849609375, + "k3_kl": 0.0654296875, + "kimi_kl": 0.255859375, "learning_rate": 5e-09, - "loss": 0.0009, - "ppl": 0.0106201171875, - "reward": 0.9956726431846619, - "reward_std": 0.00031371986551675946, - "rewards/perpo_ocr_edit_distance_reward": 0.9956726729869843, - "step": 2475, + "loss": 0.0026, + "ppl": 0.0186767578125, + "reward": 0.9895251393318176, + "reward_std": 0.0005367687554098666, + "rewards/perpo_ocr_edit_distance_reward": 0.9895251989364624, + "step": 4950, "temperature": 0.9 }, { - "advantages": -6.352152558974922e-05, - "completion_length": 377.0, - "delta_ref_entropy_loss": 0.02044677734375, - "delta_ref_ppl": -0.0347900390625, - "entropy_loss": -0.0252685546875, + "advantages": -5.1694260037038475e-05, + "completion_length": 659.0, + "delta_ref_entropy_loss": 0.025146484375, + "delta_ref_ppl": -0.05712890625, + "entropy_loss": -0.08056640625, + "epoch": 0.9902, + "grad_norm": 2.1160067427649345, + "k1_kl": 0.05712890625, + "k3_kl": 0.03662109375, + "kimi_kl": 0.1484375, + "learning_rate": 4.899999999999999e-09, + "loss": 0.0015, + "ppl": 0.02880859375, + "reward": 0.9955618381500244, + "reward_std": 0.001053617219440639, + "rewards/perpo_ocr_edit_distance_reward": 0.9955618977546692, + "step": 4951, + "temperature": 0.9 + }, + { + "advantages": 4.564013124763733e-06, + "completion_length": 137.0, + "delta_ref_entropy_loss": 0.021240234375, + "delta_ref_ppl": -0.255859375, + "entropy_loss": -0.091796875, "epoch": 0.9904, - "grad_norm": 0.2358613718000749, - "k1_kl": 0.03485107421875, - "k3_kl": 0.02557373046875, - "kimi_kl": 0.09814453125, + "grad_norm": 2.0523844884442037, + "k1_kl": 0.255859375, + "k3_kl": 0.2158203125, + "kimi_kl": 1.328125, "learning_rate": 4.799999999999999e-09, - "loss": 0.0011, - "ppl": 0.00930023193359375, - "reward": 0.9998361766338348, - "reward_std": 0.0002181636227760464, - "rewards/perpo_ocr_edit_distance_reward": 0.9998362064361572, - "step": 2476, + "loss": 0.0086, + "ppl": 0.0286865234375, + "reward": 0.9669789671897888, + "reward_std": 0.001752532203681767, + "rewards/perpo_ocr_edit_distance_reward": 0.966978907585144, + "step": 4952, "temperature": 0.9 }, { - "advantages": -3.15542750968234e-05, - "completion_length": 540.5, - "delta_ref_entropy_loss": 0.03448486328125, - "delta_ref_ppl": -0.0380859375, - "entropy_loss": -0.018798828125, + "advantages": -9.690013030194677e-06, + "completion_length": 631.0, + "delta_ref_entropy_loss": 0.0308837890625, + "delta_ref_ppl": -0.06689453125, + "entropy_loss": -0.07666015625, + "epoch": 0.9906, + "grad_norm": 0.5268775902250145, + "k1_kl": 0.0673828125, + "k3_kl": 0.039306640625, + "kimi_kl": 0.10595703125, + "learning_rate": 4.7e-09, + "loss": 0.0016, + "ppl": 0.023193359375, + "reward": 0.9844992160797119, + "reward_std": 0.001654747175052762, + "rewards/perpo_ocr_edit_distance_reward": 0.9844992160797119, + "step": 4953, + "temperature": 0.9 + }, + { + "advantages": -1.233390412380686e-05, + "completion_length": 529.0, + "delta_ref_entropy_loss": 0.0284423828125, + "delta_ref_ppl": -0.08251953125, + "entropy_loss": -0.039794921875, "epoch": 0.9908, - "grad_norm": 0.6180139945092369, - "k1_kl": 0.0382080078125, - "k3_kl": 0.02581787109375, - "kimi_kl": 0.1041259765625, + "grad_norm": 0.3558563035498535, + "k1_kl": 0.0830078125, + "k3_kl": 0.0654296875, + "kimi_kl": 0.26953125, "learning_rate": 4.6e-09, - "loss": 0.0011, - "ppl": 0.00821685791015625, - "reward": 0.987324982881546, - "reward_std": 0.018714856880251318, - "rewards/perpo_ocr_edit_distance_reward": 0.9873250424861908, - "step": 2477, + "loss": 0.0026, + "ppl": 0.0123291015625, + "reward": 0.9979788661003113, + "reward_std": 0.0005908231833018363, + "rewards/perpo_ocr_edit_distance_reward": 0.997978925704956, + "step": 4954, + "temperature": 0.9 + }, + { + "advantages": 4.470348358154297e-06, + "completion_length": 411.0, + "delta_ref_entropy_loss": 0.043212890625, + "delta_ref_ppl": -0.10546875, + "entropy_loss": -0.244140625, + "epoch": 0.991, + "grad_norm": 1.4597804824125165, + "k1_kl": 0.10546875, + "k3_kl": 0.07421875, + "kimi_kl": 0.1767578125, + "learning_rate": 4.5e-09, + "loss": 0.003, + "ppl": 0.1064453125, + "reward": 0.9067023992538452, + "reward_std": 0.0018050926737487316, + "rewards/perpo_ocr_edit_distance_reward": 0.9067023992538452, + "step": 4955, "temperature": 0.9 }, { - "advantages": -0.00030095981719568954, - "completion_length": 354.5, - "delta_ref_entropy_loss": 0.06256103515625, - "delta_ref_ppl": -0.065673828125, - "entropy_loss": -0.08709716796875, + "advantages": -5.84977024118416e-05, + "completion_length": 346.0, + "delta_ref_entropy_loss": 0.021728515625, + "delta_ref_ppl": -0.09765625, + "entropy_loss": -0.06298828125, "epoch": 0.9912, - "grad_norm": 3.262224199409054, - "k1_kl": 0.065673828125, - "k3_kl": 0.03961181640625, - "kimi_kl": 0.104736328125, + "grad_norm": 1.0246326104848822, + "k1_kl": 0.09765625, + "k3_kl": 0.0732421875, + "kimi_kl": 0.2578125, "learning_rate": 4.4e-09, - "loss": 0.0019, - "ppl": 0.046478271484375, - "reward": 0.9704541563987732, - "reward_std": 0.0014010603772476315, - "rewards/perpo_ocr_edit_distance_reward": 0.970454216003418, - "step": 2478, + "loss": 0.003, + "ppl": 0.0269775390625, + "reward": 0.9777834415435791, + "reward_std": 0.0015011595096439123, + "rewards/perpo_ocr_edit_distance_reward": 0.9777835011482239, + "step": 4956, "temperature": 0.9 }, { - "advantages": -2.5851387363218237e-05, - "completion_length": 690.5, - "delta_ref_entropy_loss": 0.02789306640625, - "delta_ref_ppl": -0.0224609375, - "entropy_loss": -0.01959228515625, + "advantages": -2.359492464165669e-05, + "completion_length": 560.0, + "delta_ref_entropy_loss": 0.006134033203125, + "delta_ref_ppl": -0.0830078125, + "entropy_loss": -0.091796875, + "epoch": 0.9914, + "grad_norm": 0.7847101648688133, + "k1_kl": 0.0830078125, + "k3_kl": 0.05908203125, + "kimi_kl": 0.1943359375, + "learning_rate": 4.3e-09, + "loss": 0.0024, + "ppl": 0.03076171875, + "reward": 0.9875311851501465, + "reward_std": 0.00278811389580369, + "rewards/perpo_ocr_edit_distance_reward": 0.9875312447547913, + "step": 4957, + "temperature": 0.9 + }, + { + "advantages": -4.254068699083291e-05, + "completion_length": 957.0, + "delta_ref_entropy_loss": -0.038330078125, + "delta_ref_ppl": -0.060791015625, + "entropy_loss": -0.1328125, "epoch": 0.9916, - "grad_norm": 0.44109551838768163, - "k1_kl": 0.02252197265625, - "k3_kl": 0.012115478515625, - "kimi_kl": 0.029296875, + "grad_norm": 4.118738269974286, + "k1_kl": 0.060546875, + "k3_kl": 0.04248046875, + "kimi_kl": 0.10888671875, "learning_rate": 4.1999999999999996e-09, - "loss": 0.0005, - "ppl": 0.0074615478515625, - "reward": 0.9962053000926971, - "reward_std": 0.001675555540714413, - "rewards/perpo_ocr_edit_distance_reward": 0.9962053894996643, - "step": 2479, + "loss": 0.0017, + "ppl": 0.027587890625, + "reward": 0.9765396118164062, + "reward_std": 0.0013008067617192864, + "rewards/perpo_ocr_edit_distance_reward": 0.976539671421051, + "step": 4958, "temperature": 0.9 }, { - "advantages": -2.7247838829680404e-07, - "completion_length": 846.5, - "delta_ref_entropy_loss": 0.03082275390625, - "delta_ref_ppl": -0.02978515625, - "entropy_loss": -0.051513671875, + "advantages": -3.3250878914259374e-05, + "completion_length": 655.0, + "delta_ref_entropy_loss": 0.01055908203125, + "delta_ref_ppl": -0.04443359375, + "entropy_loss": -0.083984375, + "epoch": 0.9918, + "grad_norm": 0.642656917400966, + "k1_kl": 0.04443359375, + "k3_kl": 0.0274658203125, + "kimi_kl": 0.0654296875, + "learning_rate": 4.1e-09, + "loss": 0.0011, + "ppl": 0.031005859375, + "reward": 0.9893597960472107, + "reward_std": 0.0019493718864396214, + "rewards/perpo_ocr_edit_distance_reward": 0.9893598556518555, + "step": 4959, + "temperature": 0.9 + }, + { + "advantages": 5.10896995820076e-08, + "completion_length": 913.0, + "delta_ref_entropy_loss": -0.39453125, + "delta_ref_ppl": -0.031982421875, + "entropy_loss": -1.0, "epoch": 0.992, - "grad_norm": 0.9448676707285736, - "k1_kl": 0.02978515625, - "k3_kl": 0.01953125, - "kimi_kl": 0.0526123046875, + "grad_norm": 8.771340551746603, + "k1_kl": 0.0306396484375, + "k3_kl": 0.1015625, + "kimi_kl": 0.1865234375, "learning_rate": 4e-09, - "loss": 0.0008, - "ppl": 0.02581787109375, - "reward": 0.8302285075187683, - "reward_std": 0.1535230204463005, - "rewards/perpo_ocr_edit_distance_reward": 0.8302285671234131, - "step": 2480, + "loss": 0.0041, + "ppl": 0.5078125, + "reward": 0.7238953709602356, + "reward_std": 0.14978303015232086, + "rewards/perpo_ocr_edit_distance_reward": 0.7238953709602356, + "step": 4960, "temperature": 0.9 }, { - "advantages": -1.5497207641601562e-05, - "completion_length": 926.5, - "delta_ref_entropy_loss": 0.0445556640625, - "delta_ref_ppl": -0.03302001953125, - "entropy_loss": -0.046234130859375, + "advantages": -1.71831688930979e-05, + "completion_length": 1248.0, + "delta_ref_entropy_loss": 0.039794921875, + "delta_ref_ppl": -0.07373046875, + "entropy_loss": -0.150390625, + "epoch": 0.9922, + "grad_norm": 2.555019489642028, + "k1_kl": 0.07373046875, + "k3_kl": 0.048828125, + "kimi_kl": 0.11328125, + "learning_rate": 3.899999999999999e-09, + "loss": 0.002, + "ppl": 0.08154296875, + "reward": 0.9704430103302002, + "reward_std": 0.002376255812123418, + "rewards/perpo_ocr_edit_distance_reward": 0.970443069934845, + "step": 4961, + "temperature": 0.9 + }, + { + "advantages": -5.3456853493116796e-05, + "completion_length": 721.0, + "delta_ref_entropy_loss": 0.01361083984375, + "delta_ref_ppl": -0.042724609375, + "entropy_loss": -0.0556640625, "epoch": 0.9924, - "grad_norm": 0.45305890126949055, - "k1_kl": 0.03314208984375, - "k3_kl": 0.019317626953125, - "kimi_kl": 0.05633544921875, + "grad_norm": 5.749678814501269, + "k1_kl": 0.042724609375, + "k3_kl": 0.0303955078125, + "kimi_kl": 0.08056640625, "learning_rate": 3.8e-09, - "loss": 0.0008, - "ppl": 0.0244293212890625, - "reward": 0.9894558489322662, - "reward_std": 0.0006370770861394703, - "rewards/perpo_ocr_edit_distance_reward": 0.9894558787345886, - "step": 2481, + "loss": 0.0013, + "ppl": 0.01507568359375, + "reward": 0.994415283203125, + "reward_std": 0.0005370288272388279, + "rewards/perpo_ocr_edit_distance_reward": 0.9944153428077698, + "step": 4962, + "temperature": 0.9 + }, + { + "advantages": -2.2479466679214966e-06, + "completion_length": 271.0, + "delta_ref_entropy_loss": -0.08447265625, + "delta_ref_ppl": -0.1435546875, + "entropy_loss": -0.2431640625, + "epoch": 0.9926, + "grad_norm": 2.11864436060842, + "k1_kl": 0.1435546875, + "k3_kl": 0.12353515625, + "kimi_kl": 0.44140625, + "learning_rate": 3.7e-09, + "loss": 0.0049, + "ppl": 0.06640625, + "reward": 0.9369558095932007, + "reward_std": 0.044983871281147, + "rewards/perpo_ocr_edit_distance_reward": 0.9369559288024902, + "step": 4963, "temperature": 0.9 }, { - "advantages": -8.699723753125e-05, - "completion_length": 446.5, - "delta_ref_entropy_loss": 0.0477294921875, - "delta_ref_ppl": -0.06024169921875, - "entropy_loss": -0.033538818359375, + "advantages": -2.2820065623818664e-06, + "completion_length": 670.0, + "delta_ref_entropy_loss": -0.1826171875, + "delta_ref_ppl": -0.046142578125, + "entropy_loss": -0.3125, "epoch": 0.9928, - "grad_norm": 1.3638284101681688, - "k1_kl": 0.06005859375, - "k3_kl": 0.0423583984375, - "kimi_kl": 0.1236572265625, + "grad_norm": 2.499704396152239, + "k1_kl": 0.045654296875, + "k3_kl": 0.059814453125, + "kimi_kl": 0.1552734375, "learning_rate": 3.5999999999999996e-09, - "loss": 0.0018, - "ppl": 0.017486572265625, - "reward": 0.9854961931705475, - "reward_std": 0.009238595346687362, - "rewards/perpo_ocr_edit_distance_reward": 0.9854961931705475, - "step": 2482, + "loss": 0.0024, + "ppl": 0.09326171875, + "reward": 0.9674586653709412, + "reward_std": 0.026082908734679222, + "rewards/perpo_ocr_edit_distance_reward": 0.9674587845802307, + "step": 4964, "temperature": 0.9 }, { - "advantages": -8.004052460819366e-07, - "completion_length": 847.5, - "delta_ref_entropy_loss": 0.03466796875, - "delta_ref_ppl": -0.027099609375, - "entropy_loss": -0.048095703125, + "advantages": -6.968634988879785e-05, + "completion_length": 883.0, + "delta_ref_entropy_loss": 0.04296875, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.0947265625, + "epoch": 0.993, + "grad_norm": 1.5595577309945972, + "k1_kl": 0.0703125, + "k3_kl": 0.037841796875, + "kimi_kl": 0.0927734375, + "learning_rate": 3.5e-09, + "loss": 0.0016, + "ppl": 0.033935546875, + "reward": 0.9102392792701721, + "reward_std": 0.0014891967875882983, + "rewards/perpo_ocr_edit_distance_reward": 0.9102393984794617, + "step": 4965, + "temperature": 0.9 + }, + { + "advantages": -1.7029898913278885e-07, + "completion_length": 824.0, + "delta_ref_entropy_loss": -0.142578125, + "delta_ref_ppl": -0.037109375, + "entropy_loss": -0.369140625, "epoch": 0.9932, - "grad_norm": 2.5194844705224013, - "k1_kl": 0.02716064453125, - "k3_kl": 0.015472412109375, - "kimi_kl": 0.03289794921875, + "grad_norm": 2.072084836121308, + "k1_kl": 0.03759765625, + "k3_kl": 0.05224609375, + "kimi_kl": 0.1357421875, "learning_rate": 3.4e-09, - "loss": 0.0006, - "ppl": 0.0234375, - "reward": 0.776016891002655, - "reward_std": 0.03943532519042492, - "rewards/perpo_ocr_edit_distance_reward": 0.7760169208049774, - "step": 2483, + "loss": 0.0021, + "ppl": 0.1708984375, + "reward": 0.623171865940094, + "reward_std": 0.17231827974319458, + "rewards/perpo_ocr_edit_distance_reward": 0.6231719255447388, + "step": 4966, "temperature": 0.9 }, { - "advantages": 1.4305115554336112e-06, - "completion_length": 676.0, - "delta_ref_entropy_loss": 0.1192626953125, - "delta_ref_ppl": -0.06915283203125, - "entropy_loss": -0.14178466796875, + "advantages": 4.947185516357422e-06, + "completion_length": 776.0, + "delta_ref_entropy_loss": 0.0264892578125, + "delta_ref_ppl": -0.0400390625, + "entropy_loss": -0.07470703125, + "epoch": 0.9934, + "grad_norm": 0.8829797746095006, + "k1_kl": 0.0400390625, + "k3_kl": 0.02490234375, + "kimi_kl": 0.061279296875, + "learning_rate": 3.2999999999999998e-09, + "loss": 0.001, + "ppl": 0.02490234375, + "reward": 0.9806634187698364, + "reward_std": 0.003345932811498642, + "rewards/perpo_ocr_edit_distance_reward": 0.9806634783744812, + "step": 4967, + "temperature": 0.9 + }, + { + "advantages": -2.2649765014648438e-06, + "completion_length": 592.0, + "delta_ref_entropy_loss": -0.0030670166015625, + "delta_ref_ppl": -0.12158203125, + "entropy_loss": -0.5078125, "epoch": 0.9936, - "grad_norm": 1.2214643527202618, - "k1_kl": 0.06915283203125, - "k3_kl": 0.03839111328125, - "kimi_kl": 0.0892333984375, + "grad_norm": 2.4890301619437327, + "k1_kl": 0.12158203125, + "k3_kl": 0.09423828125, + "kimi_kl": 0.2275390625, "learning_rate": 3.2e-09, - "loss": 0.0015, - "ppl": 0.075836181640625, - "reward": 0.826344758272171, - "reward_std": 0.012304799281992018, - "rewards/perpo_ocr_edit_distance_reward": 0.8263447880744934, - "step": 2484, + "loss": 0.0038, + "ppl": 0.244140625, + "reward": 0.507095992565155, + "reward_std": 0.009337538853287697, + "rewards/perpo_ocr_edit_distance_reward": 0.507095992565155, + "step": 4968, "temperature": 0.9 }, { - "advantages": -0.00016335505279130302, - "completion_length": 795.5, - "delta_ref_entropy_loss": 0.0260009765625, - "delta_ref_ppl": -0.01275634765625, - "entropy_loss": -0.021240234375, + "advantages": -1.7600401406525634e-05, + "completion_length": 605.0, + "delta_ref_entropy_loss": 0.03759765625, + "delta_ref_ppl": -0.049072265625, + "entropy_loss": -0.05517578125, + "epoch": 0.9938, + "grad_norm": 0.676123314031508, + "k1_kl": 0.049072265625, + "k3_kl": 0.027587890625, + "kimi_kl": 0.07275390625, + "learning_rate": 3.0999999999999996e-09, + "loss": 0.0011, + "ppl": 0.0205078125, + "reward": 0.9952208399772644, + "reward_std": 0.0013526624534279108, + "rewards/perpo_ocr_edit_distance_reward": 0.9952208995819092, + "step": 4969, + "temperature": 0.9 + }, + { + "advantages": -2.315640631422866e-05, + "completion_length": 1084.0, + "delta_ref_entropy_loss": 0.00469970703125, + "delta_ref_ppl": -0.035888671875, + "entropy_loss": -0.05615234375, "epoch": 0.994, - "grad_norm": 0.4347185645669548, - "k1_kl": 0.01275634765625, - "k3_kl": 0.006378173828125, - "kimi_kl": 0.01123046875, + "grad_norm": 0.4096047891928738, + "k1_kl": 0.035888671875, + "k3_kl": 0.027587890625, + "kimi_kl": 0.060302734375, "learning_rate": 3e-09, - "loss": 0.0004, - "ppl": 0.0098876953125, - "reward": 0.9990850389003754, - "reward_std": 0.0003758339153137058, - "rewards/perpo_ocr_edit_distance_reward": 0.9990851283073425, - "step": 2485, + "loss": 0.0011, + "ppl": 0.0191650390625, + "reward": 0.9914394617080688, + "reward_std": 0.0006357455276884139, + "rewards/perpo_ocr_edit_distance_reward": 0.9914395809173584, + "step": 4970, "temperature": 0.9 }, { - "advantages": -2.250501120215631e-05, - "completion_length": 570.5, - "delta_ref_entropy_loss": 0.0576171875, - "delta_ref_ppl": -0.06256103515625, - "entropy_loss": -0.052978515625, + "advantages": 1.7200197817146545e-06, + "completion_length": 638.0, + "delta_ref_entropy_loss": 0.0211181640625, + "delta_ref_ppl": -0.0537109375, + "entropy_loss": -0.0986328125, + "epoch": 0.9942, + "grad_norm": 1.6429416506676482, + "k1_kl": 0.0537109375, + "k3_kl": 0.037353515625, + "kimi_kl": 0.10107421875, + "learning_rate": 2.8999999999999995e-09, + "loss": 0.0015, + "ppl": 0.044189453125, + "reward": 0.9851347804069519, + "reward_std": 0.009815791621804237, + "rewards/perpo_ocr_edit_distance_reward": 0.9851348400115967, + "step": 4971, + "temperature": 0.9 + }, + { + "advantages": -1.393045749864541e-05, + "completion_length": 579.0, + "delta_ref_entropy_loss": 0.0322265625, + "delta_ref_ppl": -0.10791015625, + "entropy_loss": -0.162109375, "epoch": 0.9944, - "grad_norm": 0.5997283807086806, - "k1_kl": 0.06256103515625, - "k3_kl": 0.041778564453125, - "kimi_kl": 0.1270751953125, + "grad_norm": 0.9579470774920542, + "k1_kl": 0.1083984375, + "k3_kl": 0.07470703125, + "kimi_kl": 0.2197265625, "learning_rate": 2.8e-09, - "loss": 0.0017, - "ppl": 0.02655029296875, - "reward": 0.9905325472354889, - "reward_std": 0.001010551437502727, - "rewards/perpo_ocr_edit_distance_reward": 0.9905325770378113, - "step": 2486, + "loss": 0.003, + "ppl": 0.061767578125, + "reward": 0.9771538972854614, + "reward_std": 0.00234555103816092, + "rewards/perpo_ocr_edit_distance_reward": 0.9771539568901062, + "step": 4972, "temperature": 0.9 }, { - "advantages": -2.731595773752815e-05, - "completion_length": 515.0, - "delta_ref_entropy_loss": 0.05401611328125, - "delta_ref_ppl": -0.059326171875, - "entropy_loss": -0.082763671875, + "advantages": -3.4059798537100505e-08, + "completion_length": 1033.0, + "delta_ref_entropy_loss": -0.953125, + "delta_ref_ppl": 0.042724609375, + "entropy_loss": -1.6640625, + "epoch": 0.9946, + "grad_norm": 7.662627047711819, + "k1_kl": -0.042724609375, + "k3_kl": 0.1845703125, + "kimi_kl": 0.251953125, + "learning_rate": 2.7e-09, + "loss": 0.0074, + "ppl": 0.78515625, + "reward": 0.6340488195419312, + "reward_std": 0.22954018414020538, + "rewards/perpo_ocr_edit_distance_reward": 0.6340488791465759, + "step": 4973, + "temperature": 0.9 + }, + { + "advantages": -7.237706995510962e-06, + "completion_length": 77.0, + "delta_ref_entropy_loss": -0.0185546875, + "delta_ref_ppl": -0.4453125, + "entropy_loss": -0.3125, "epoch": 0.9948, - "grad_norm": 2.171121274176849, - "k1_kl": 0.05902099609375, - "k3_kl": 0.03729248046875, - "kimi_kl": 0.106689453125, + "grad_norm": 4.0640650612783435, + "k1_kl": 0.4453125, + "k3_kl": 0.359375, + "kimi_kl": 1.4375, "learning_rate": 2.5999999999999997e-09, + "loss": 0.0143, + "ppl": 0.1298828125, + "reward": 0.9471544623374939, + "reward_std": 0.008130076341331005, + "rewards/perpo_ocr_edit_distance_reward": 0.9471545219421387, + "step": 4974, + "temperature": 0.9 + }, + { + "advantages": -8.290154801215976e-05, + "completion_length": 797.0, + "delta_ref_entropy_loss": 0.01470947265625, + "delta_ref_ppl": -0.061279296875, + "entropy_loss": -0.0869140625, + "epoch": 0.995, + "grad_norm": 0.3947846292013174, + "k1_kl": 0.061279296875, + "k3_kl": 0.034912109375, + "kimi_kl": 0.08251953125, + "learning_rate": 2.5e-09, "loss": 0.0015, - "ppl": 0.0379638671875, - "reward": 0.9585049450397491, - "reward_std": 0.029639346175827086, - "rewards/perpo_ocr_edit_distance_reward": 0.9585050046443939, - "step": 2487, + "ppl": 0.030029296875, + "reward": 0.9953007102012634, + "reward_std": 0.0008241940522566438, + "rewards/perpo_ocr_edit_distance_reward": 0.995300829410553, + "step": 4975, "temperature": 0.9 }, { - "advantages": -0.00011526474190759473, - "completion_length": 997.5, - "delta_ref_entropy_loss": 0.01812744140625, - "delta_ref_ppl": -0.019256591796875, - "entropy_loss": -0.02508544921875, + "advantages": -1.9414084817981347e-05, + "completion_length": 1351.0, + "delta_ref_entropy_loss": 0.0167236328125, + "delta_ref_ppl": -0.0311279296875, + "entropy_loss": -0.06005859375, "epoch": 0.9952, - "grad_norm": 0.5658132299984877, - "k1_kl": 0.019317626953125, - "k3_kl": 0.0118408203125, - "kimi_kl": 0.026123046875, + "grad_norm": 1.9565578666016616, + "k1_kl": 0.031005859375, + "k3_kl": 0.0235595703125, + "kimi_kl": 0.06640625, "learning_rate": 2.3999999999999996e-09, - "loss": 0.0006, - "ppl": 0.01025390625, - "reward": 0.9992066621780396, - "reward_std": 0.00045146449338062666, - "rewards/perpo_ocr_edit_distance_reward": 0.9992067217826843, - "step": 2488, + "loss": 0.001, + "ppl": 0.02685546875, + "reward": 0.9951894879341125, + "reward_std": 0.000777638575527817, + "rewards/perpo_ocr_edit_distance_reward": 0.9951895475387573, + "step": 4976, "temperature": 0.9 }, { - "advantages": 2.946172571682837e-06, - "completion_length": 687.5, - "delta_ref_entropy_loss": 0.0462646484375, - "delta_ref_ppl": -0.0306396484375, - "entropy_loss": -0.0289306640625, + "advantages": -2.1270343495416455e-05, + "completion_length": 1042.0, + "delta_ref_entropy_loss": 0.031982421875, + "delta_ref_ppl": -0.056640625, + "entropy_loss": -0.095703125, + "epoch": 0.9954, + "grad_norm": 1.3719476512734665, + "k1_kl": 0.056640625, + "k3_kl": 0.03466796875, + "kimi_kl": 0.0771484375, + "learning_rate": 2.3e-09, + "loss": 0.0014, + "ppl": 0.036865234375, + "reward": 0.9926977157592773, + "reward_std": 0.003502659033983946, + "rewards/perpo_ocr_edit_distance_reward": 0.9926977753639221, + "step": 4977, + "temperature": 0.9 + }, + { + "advantages": -5.320140553521924e-05, + "completion_length": 437.0, + "delta_ref_entropy_loss": 0.028076171875, + "delta_ref_ppl": -0.10693359375, + "entropy_loss": -0.11865234375, "epoch": 0.9956, - "grad_norm": 1.1512985265863755, - "k1_kl": 0.03070068359375, - "k3_kl": 0.017059326171875, - "kimi_kl": 0.05029296875, + "grad_norm": 0.939546383048227, + "k1_kl": 0.10693359375, + "k3_kl": 0.07666015625, + "kimi_kl": 0.263671875, "learning_rate": 2.2e-09, - "loss": 0.0007, - "ppl": 0.0122222900390625, - "reward": 0.9809485375881195, - "reward_std": 0.001374217332340777, - "rewards/perpo_ocr_edit_distance_reward": 0.9809485077857971, - "step": 2489, + "loss": 0.0031, + "ppl": 0.04833984375, + "reward": 0.9573121070861816, + "reward_std": 0.0014999237610027194, + "rewards/perpo_ocr_edit_distance_reward": 0.9573122262954712, + "step": 4978, "temperature": 0.9 }, { - "advantages": -0.00010259876034979243, - "completion_length": 570.0, - "delta_ref_entropy_loss": 0.03045654296875, - "delta_ref_ppl": -0.031494140625, - "entropy_loss": -0.0252685546875, + "advantages": -3.912193642463535e-05, + "completion_length": 542.0, + "delta_ref_entropy_loss": 0.023193359375, + "delta_ref_ppl": -0.095703125, + "entropy_loss": -0.09375, + "epoch": 0.9958, + "grad_norm": 1.9257818022191269, + "k1_kl": 0.095703125, + "k3_kl": 0.06640625, + "kimi_kl": 0.2236328125, + "learning_rate": 2.0999999999999998e-09, + "loss": 0.0027, + "ppl": 0.03515625, + "reward": 0.8981401920318604, + "reward_std": 0.0012053867103531957, + "rewards/perpo_ocr_edit_distance_reward": 0.8981402516365051, + "step": 4979, + "temperature": 0.9 + }, + { + "advantages": -3.084114723606035e-05, + "completion_length": 657.0, + "delta_ref_entropy_loss": 0.0234375, + "delta_ref_ppl": -0.058349609375, + "entropy_loss": -0.053466796875, "epoch": 0.996, - "grad_norm": 0.31214347451416213, - "k1_kl": 0.031494140625, - "k3_kl": 0.020355224609375, - "kimi_kl": 0.0655517578125, + "grad_norm": 0.5780855979909519, + "k1_kl": 0.058349609375, + "k3_kl": 0.04248046875, + "kimi_kl": 0.1435546875, "learning_rate": 2e-09, - "loss": 0.0009, - "ppl": 0.010772705078125, - "reward": 0.953401118516922, - "reward_std": 0.0003757432132260874, - "rewards/perpo_ocr_edit_distance_reward": 0.9534012079238892, - "step": 2490, + "loss": 0.0017, + "ppl": 0.0213623046875, + "reward": 0.9949104189872742, + "reward_std": 0.0010050866985693574, + "rewards/perpo_ocr_edit_distance_reward": 0.9949104189872742, + "step": 4980, "temperature": 0.9 }, { - "advantages": -0.0005960464477539062, - "completion_length": 434.0, - "delta_ref_entropy_loss": 0.0263671875, - "delta_ref_ppl": -0.02630615234375, - "entropy_loss": -0.012939453125, + "advantages": -3.4144948585890234e-05, + "completion_length": 580.0, + "delta_ref_entropy_loss": 0.02490234375, + "delta_ref_ppl": -0.0703125, + "entropy_loss": -0.046875, + "epoch": 0.9962, + "grad_norm": 0.670050243109062, + "k1_kl": 0.0703125, + "k3_kl": 0.05126953125, + "kimi_kl": 0.216796875, + "learning_rate": 1.9e-09, + "loss": 0.0021, + "ppl": 0.0106201171875, + "reward": 0.997408390045166, + "reward_std": 0.0006479094736278057, + "rewards/perpo_ocr_edit_distance_reward": 0.9974084496498108, + "step": 4981, + "temperature": 0.9 + }, + { + "advantages": -4.1084633267018944e-05, + "completion_length": 333.0, + "delta_ref_entropy_loss": 0.0308837890625, + "delta_ref_ppl": -0.080078125, + "entropy_loss": -0.04638671875, "epoch": 0.9964, - "grad_norm": 0.010967939199813094, - "k1_kl": 0.0262451171875, - "k3_kl": 0.016448974609375, - "kimi_kl": 0.05267333984375, + "grad_norm": 0.9593690037580999, + "k1_kl": 0.07958984375, + "k3_kl": 0.05224609375, + "kimi_kl": 0.1884765625, "learning_rate": 1.7999999999999998e-09, - "loss": 0.0013, - "ppl": 0.0039520263671875, - "reward": 0.9983405470848083, - "reward_std": 0.0, - "rewards/perpo_ocr_edit_distance_reward": 0.9983406364917755, - "step": 2491, + "loss": 0.0021, + "ppl": 0.016357421875, + "reward": 0.9875544905662537, + "reward_std": 0.0021799677051603794, + "rewards/perpo_ocr_edit_distance_reward": 0.9875546097755432, + "step": 4982, + "temperature": 0.9 + }, + { + "advantages": -0.00011671441461658105, + "completion_length": 490.0, + "delta_ref_entropy_loss": 0.031005859375, + "delta_ref_ppl": -0.07958984375, + "entropy_loss": -0.06494140625, + "epoch": 0.9966, + "grad_norm": 0.40232453161340637, + "k1_kl": 0.07958984375, + "k3_kl": 0.056640625, + "kimi_kl": 0.216796875, + "learning_rate": 1.7e-09, + "loss": 0.0024, + "ppl": 0.0181884765625, + "reward": 0.9953080415725708, + "reward_std": 0.00048361116205342114, + "rewards/perpo_ocr_edit_distance_reward": 0.9953081011772156, + "step": 4983, "temperature": 0.9 }, { - "advantages": -0.00032852803087735083, - "completion_length": 791.5, - "delta_ref_entropy_loss": 0.03643798828125, - "delta_ref_ppl": -0.03045654296875, - "entropy_loss": -0.04302978515625, + "advantages": -3.358296089572832e-05, + "completion_length": 1171.0, + "delta_ref_entropy_loss": 0.035400390625, + "delta_ref_ppl": -0.05712890625, + "entropy_loss": -0.1337890625, "epoch": 0.9968, - "grad_norm": 1.1828364630586718, - "k1_kl": 0.0303955078125, - "k3_kl": 0.017059326171875, - "kimi_kl": 0.03936767578125, + "grad_norm": 1.8510801588335424, + "k1_kl": 0.05712890625, + "k3_kl": 0.03515625, + "kimi_kl": 0.06494140625, "learning_rate": 1.6e-09, - "loss": 0.001, - "ppl": 0.022430419921875, - "reward": 0.866706132888794, - "reward_std": 0.00029890512814745307, - "rewards/perpo_ocr_edit_distance_reward": 0.8667061626911163, - "step": 2492, + "loss": 0.0014, + "ppl": 0.0625, + "reward": 0.7177821397781372, + "reward_std": 0.0021799250971525908, + "rewards/perpo_ocr_edit_distance_reward": 0.717782199382782, + "step": 4984, "temperature": 0.9 }, { - "advantages": -8.037686711759306e-05, - "completion_length": 626.5, - "delta_ref_entropy_loss": 0.0369873046875, - "delta_ref_ppl": -0.029052734375, - "entropy_loss": -0.02093505859375, + "advantages": -9.599754412192851e-05, + "completion_length": 733.0, + "delta_ref_entropy_loss": 0.0230712890625, + "delta_ref_ppl": -0.05615234375, + "entropy_loss": -0.048583984375, + "epoch": 0.997, + "grad_norm": 0.3981089646209349, + "k1_kl": 0.05615234375, + "k3_kl": 0.036376953125, + "kimi_kl": 0.11767578125, + "learning_rate": 1.5e-09, + "loss": 0.0015, + "ppl": 0.01409912109375, + "reward": 0.9976156949996948, + "reward_std": 0.0006982760969549417, + "rewards/perpo_ocr_edit_distance_reward": 0.9976158142089844, + "step": 4985, + "temperature": 0.9 + }, + { + "advantages": -6.624630623264238e-05, + "completion_length": 579.0, + "delta_ref_entropy_loss": 0.033203125, + "delta_ref_ppl": -0.0810546875, + "entropy_loss": -0.0986328125, "epoch": 0.9972, - "grad_norm": 0.9892113333523245, - "k1_kl": 0.029052734375, - "k3_kl": 0.01763916015625, - "kimi_kl": 0.0491943359375, + "grad_norm": 0.5570545909585201, + "k1_kl": 0.08154296875, + "k3_kl": 0.05078125, + "kimi_kl": 0.154296875, "learning_rate": 1.4e-09, - "loss": 0.0008, - "ppl": 0.0106201171875, - "reward": 0.9984061419963837, - "reward_std": 0.000834090038551949, - "rewards/perpo_ocr_edit_distance_reward": 0.998406171798706, - "step": 2493, + "loss": 0.0021, + "ppl": 0.0263671875, + "reward": 0.9911850690841675, + "reward_std": 0.0007996959611773491, + "rewards/perpo_ocr_edit_distance_reward": 0.991185188293457, + "step": 4986, "temperature": 0.9 }, { - "advantages": -0.00010260088311042637, - "completion_length": 565.5, - "delta_ref_entropy_loss": 0.02337646484375, - "delta_ref_ppl": -0.0201416015625, - "entropy_loss": -0.02545166015625, + "advantages": -1.2176377822470386e-05, + "completion_length": 455.0, + "delta_ref_entropy_loss": 0.0079345703125, + "delta_ref_ppl": -0.04541015625, + "entropy_loss": -0.054931640625, + "epoch": 0.9974, + "grad_norm": 0.6488870066626878, + "k1_kl": 0.04541015625, + "k3_kl": 0.0291748046875, + "kimi_kl": 0.07958984375, + "learning_rate": 1.2999999999999999e-09, + "loss": 0.0012, + "ppl": 0.01434326171875, + "reward": 0.999481201171875, + "reward_std": 0.0005989153287373483, + "rewards/perpo_ocr_edit_distance_reward": 0.9994812607765198, + "step": 4987, + "temperature": 0.9 + }, + { + "advantages": 3.069639205932617e-05, + "completion_length": 716.0, + "delta_ref_entropy_loss": 0.0081787109375, + "delta_ref_ppl": -0.036865234375, + "entropy_loss": -0.0712890625, "epoch": 0.9976, - "grad_norm": 0.2527819558543986, - "k1_kl": 0.0201416015625, - "k3_kl": 0.01214599609375, - "kimi_kl": 0.03057861328125, + "grad_norm": 0.649340873767202, + "k1_kl": 0.036865234375, + "k3_kl": 0.024169921875, + "kimi_kl": 0.0517578125, "learning_rate": 1.1999999999999998e-09, - "loss": 0.0006, - "ppl": 0.0133056640625, - "reward": 0.999576061964035, - "reward_std": 0.0001574783818796277, - "rewards/perpo_ocr_edit_distance_reward": 0.9995760917663574, - "step": 2494, + "loss": 0.0009, + "ppl": 0.0263671875, + "reward": 0.9964093565940857, + "reward_std": 0.000732959306333214, + "rewards/perpo_ocr_edit_distance_reward": 0.9964092969894409, + "step": 4988, + "temperature": 0.9 + }, + { + "advantages": -1.221043748955708e-05, + "completion_length": 595.0, + "delta_ref_entropy_loss": 0.01318359375, + "delta_ref_ppl": -0.0751953125, + "entropy_loss": -0.1123046875, + "epoch": 0.9978, + "grad_norm": 0.9178150964727336, + "k1_kl": 0.0751953125, + "k3_kl": 0.052734375, + "kimi_kl": 0.1875, + "learning_rate": 1.1e-09, + "loss": 0.0021, + "ppl": 0.0306396484375, + "reward": 0.9513545632362366, + "reward_std": 0.0019923881627619267, + "rewards/perpo_ocr_edit_distance_reward": 0.9513546228408813, + "step": 4989, "temperature": 0.9 }, { - "advantages": -1.9797258636877757e-06, - "completion_length": 404.5, - "delta_ref_entropy_loss": 0.068115234375, - "delta_ref_ppl": -0.0540771484375, - "entropy_loss": -0.05999755859375, + "advantages": -4.291534423828125e-06, + "completion_length": 261.0, + "delta_ref_entropy_loss": 0.057373046875, + "delta_ref_ppl": -0.1240234375, + "entropy_loss": -0.197265625, "epoch": 0.998, - "grad_norm": 1.1839690479552298, - "k1_kl": 0.0540771484375, - "k3_kl": 0.03179931640625, - "kimi_kl": 0.0948486328125, + "grad_norm": 1.9238518857003726, + "k1_kl": 0.1240234375, + "k3_kl": 0.08349609375, + "kimi_kl": 0.205078125, "learning_rate": 1e-09, - "loss": 0.0013, - "ppl": 0.02655029296875, - "reward": 0.6403990536928177, - "reward_std": 0.002024983405135572, - "rewards/perpo_ocr_edit_distance_reward": 0.6403990983963013, - "step": 2495, + "loss": 0.0033, + "ppl": 0.078125, + "reward": 0.3533570468425751, + "reward_std": 0.001884043449535966, + "rewards/perpo_ocr_edit_distance_reward": 0.3533570468425751, + "step": 4990, "temperature": 0.9 }, { - "advantages": -2.704135165143562e-05, - "completion_length": 1298.5, - "delta_ref_entropy_loss": 0.0313720703125, - "delta_ref_ppl": -0.02423095703125, - "entropy_loss": -0.022216796875, + "advantages": -5.4691521654604e-05, + "completion_length": 1274.0, + "delta_ref_entropy_loss": 0.022216796875, + "delta_ref_ppl": -0.0322265625, + "entropy_loss": -0.07568359375, + "epoch": 0.9982, + "grad_norm": 1.9420799052305797, + "k1_kl": 0.0322265625, + "k3_kl": 0.024169921875, + "kimi_kl": 0.040771484375, + "learning_rate": 8.999999999999999e-10, + "loss": 0.001, + "ppl": 0.03857421875, + "reward": 0.9859917163848877, + "reward_std": 0.0008342328364960849, + "rewards/perpo_ocr_edit_distance_reward": 0.9859917163848877, + "step": 4991, + "temperature": 0.9 + }, + { + "advantages": -1.5565328794764355e-05, + "completion_length": 1600.0, + "delta_ref_entropy_loss": 0.0341796875, + "delta_ref_ppl": -0.0673828125, + "entropy_loss": -0.07666015625, "epoch": 0.9984, - "grad_norm": 0.2682439184580755, - "k1_kl": 0.02423095703125, - "k3_kl": 0.01342010498046875, - "kimi_kl": 0.0416259765625, + "grad_norm": 1.4139504041831379, + "k1_kl": 0.0673828125, + "k3_kl": 0.03955078125, + "kimi_kl": 0.1220703125, "learning_rate": 8e-10, - "loss": 0.0006, - "ppl": 0.0102386474609375, - "reward": 0.998653769493103, - "reward_std": 0.00042787602433236316, - "rewards/perpo_ocr_edit_distance_reward": 0.9986538290977478, - "step": 2496, + "loss": 0.0016, + "ppl": 0.027587890625, + "reward": 0.9925916790962219, + "reward_std": 0.003726609982550144, + "rewards/perpo_ocr_edit_distance_reward": 0.9925918579101562, + "step": 4992, "temperature": 0.9 }, { - "advantages": -1.6944750313996337e-06, - "completion_length": 674.5, - "delta_ref_entropy_loss": 0.038330078125, - "delta_ref_ppl": -0.0286865234375, - "entropy_loss": -0.02978515625, + "advantages": -5.09023702761624e-05, + "completion_length": 1093.0, + "delta_ref_entropy_loss": 0.031494140625, + "delta_ref_ppl": -0.0859375, + "entropy_loss": -0.08984375, + "epoch": 0.9986, + "grad_norm": 0.7658680455394057, + "k1_kl": 0.0859375, + "k3_kl": 0.0556640625, + "kimi_kl": 0.2021484375, + "learning_rate": 7e-10, + "loss": 0.0023, + "ppl": 0.03564453125, + "reward": 0.9472190737724304, + "reward_std": 0.0015728509752079844, + "rewards/perpo_ocr_edit_distance_reward": 0.94721919298172, + "step": 4993, + "temperature": 0.9 + }, + { + "advantages": -8.566039468860254e-06, + "completion_length": 344.0, + "delta_ref_entropy_loss": 0.004180908203125, + "delta_ref_ppl": -0.09326171875, + "entropy_loss": -0.1328125, "epoch": 0.9988, - "grad_norm": 0.40331150047419945, - "k1_kl": 0.02874755859375, - "k3_kl": 0.0169525146484375, - "kimi_kl": 0.06427001953125, + "grad_norm": 1.1721732364557076, + "k1_kl": 0.09326171875, + "k3_kl": 0.0712890625, + "kimi_kl": 0.2333984375, "learning_rate": 5.999999999999999e-10, - "loss": 0.0007, - "ppl": 0.014739990234375, - "reward": 0.9767457842826843, - "reward_std": 0.0037075404543429613, - "rewards/perpo_ocr_edit_distance_reward": 0.9767458438873291, - "step": 2497, + "loss": 0.0029, + "ppl": 0.048583984375, + "reward": 0.96335768699646, + "reward_std": 0.006848557852208614, + "rewards/perpo_ocr_edit_distance_reward": 0.9633578062057495, + "step": 4994, "temperature": 0.9 }, { - "advantages": -2.1857875026398688e-05, - "completion_length": 957.5, - "delta_ref_entropy_loss": 0.01611328125, - "delta_ref_ppl": -0.02252197265625, - "entropy_loss": -0.03131103515625, + "advantages": -1.9754684217332397e-06, + "completion_length": 1672.0, + "delta_ref_entropy_loss": -0.0196533203125, + "delta_ref_ppl": -0.021484375, + "entropy_loss": -0.07421875, + "epoch": 0.999, + "grad_norm": 0.8998940954246116, + "k1_kl": 0.0216064453125, + "k3_kl": 0.020263671875, + "kimi_kl": 0.043701171875, + "learning_rate": 5e-10, + "loss": 0.0008, + "ppl": 0.036865234375, + "reward": 0.8474091291427612, + "reward_std": 0.008555538021028042, + "rewards/perpo_ocr_edit_distance_reward": 0.8474091291427612, + "step": 4995, + "temperature": 0.9 + }, + { + "advantages": -5.4819247452542186e-05, + "completion_length": 327.0, + "delta_ref_entropy_loss": 0.0025177001953125, + "delta_ref_ppl": -0.10693359375, + "entropy_loss": -0.06640625, "epoch": 0.9992, - "grad_norm": 1.7232174125967605, - "k1_kl": 0.02252197265625, - "k3_kl": 0.013763427734375, - "kimi_kl": 0.0294189453125, + "grad_norm": 0.6537372558628084, + "k1_kl": 0.10693359375, + "k3_kl": 0.0859375, + "kimi_kl": 0.55859375, "learning_rate": 4e-10, - "loss": 0.0006, - "ppl": 0.02020263671875, - "reward": 0.9839839935302734, - "reward_std": 0.003971924685174599, - "rewards/perpo_ocr_edit_distance_reward": 0.9839840531349182, - "step": 2498, + "loss": 0.0035, + "ppl": 0.02001953125, + "reward": 0.9976502060890198, + "reward_std": 0.0006771455518901348, + "rewards/perpo_ocr_edit_distance_reward": 0.9976502656936646, + "step": 4996, "temperature": 0.9 }, { - "advantages": -3.3106123737525195e-05, - "completion_length": 684.0, - "delta_ref_entropy_loss": 0.02783203125, - "delta_ref_ppl": -0.04541015625, - "entropy_loss": -0.027587890625, + "advantages": -3.0704908567713574e-05, + "completion_length": 569.0, + "delta_ref_entropy_loss": 0.046875, + "delta_ref_ppl": -0.1552734375, + "entropy_loss": -0.09619140625, + "epoch": 0.9994, + "grad_norm": 0.8057458087612076, + "k1_kl": 0.1552734375, + "k3_kl": 0.11474609375, + "kimi_kl": 0.408203125, + "learning_rate": 2.9999999999999995e-10, + "loss": 0.0046, + "ppl": 0.041259765625, + "reward": 0.976569414138794, + "reward_std": 0.0018405800219625235, + "rewards/perpo_ocr_edit_distance_reward": 0.9765694737434387, + "step": 4997, + "temperature": 0.9 + }, + { + "advantages": -5.908523598918691e-05, + "completion_length": 899.0, + "delta_ref_entropy_loss": 0.00799560546875, + "delta_ref_ppl": -0.0390625, + "entropy_loss": -0.05224609375, "epoch": 0.9996, - "grad_norm": 0.6158116561818618, - "k1_kl": 0.04547119140625, - "k3_kl": 0.033935546875, - "kimi_kl": 0.14501953125, + "grad_norm": 0.46271087004032446, + "k1_kl": 0.0390625, + "k3_kl": 0.025146484375, + "kimi_kl": 0.076171875, "learning_rate": 2e-10, - "loss": 0.0014, - "ppl": 0.012542724609375, - "reward": 0.9899111986160278, - "reward_std": 0.0014309996913652867, - "rewards/perpo_ocr_edit_distance_reward": 0.989911288022995, - "step": 2499, + "loss": 0.0011, + "ppl": 0.0169677734375, + "reward": 0.9972739219665527, + "reward_std": 0.0010529550490900874, + "rewards/perpo_ocr_edit_distance_reward": 0.9972740411758423, + "step": 4998, "temperature": 0.9 }, { - "advantages": -0.7864032685756683, - "completion_length": 887.5, - "delta_ref_entropy_loss": 0.0250244140625, - "delta_ref_ppl": -0.0213623046875, - "entropy_loss": -0.02777099609375, + "advantages": -4.381793041829951e-05, + "completion_length": 1148.0, + "delta_ref_entropy_loss": 0.018798828125, + "delta_ref_ppl": -0.049072265625, + "entropy_loss": -0.0693359375, + "epoch": 0.9998, + "grad_norm": 2.5227546694209444, + "k1_kl": 0.049072265625, + "k3_kl": 0.0390625, + "kimi_kl": 0.08447265625, + "learning_rate": 1e-10, + "loss": 0.0016, + "ppl": 0.031005859375, + "reward": 0.9964830875396729, + "reward_std": 0.0008715693838894367, + "rewards/perpo_ocr_edit_distance_reward": 0.9964831471443176, + "step": 4999, + "temperature": 0.9 + }, + { + "advantages": 0.2696757912635803, + "completion_length": 708.0, + "delta_ref_entropy_loss": 0.0023193359375, + "delta_ref_ppl": -0.0537109375, + "entropy_loss": -0.06689453125, "epoch": 1.0, - "grad_norm": 0.3098248922502048, - "k1_kl": 0.021240234375, - "k3_kl": 0.01416015625, - "kimi_kl": 0.037353515625, + "grad_norm": 0.7460783081542957, + "k1_kl": 0.0537109375, + "k3_kl": 0.03662109375, + "kimi_kl": 0.1044921875, "learning_rate": 0.0, - "loss": 0.0005, - "ppl": 0.012725830078125, - "reward": 0.9983441829681396, - "reward_std": 0.0003253469767514616, - "rewards/perpo_ocr_edit_distance_reward": 0.998699963092804, - "step": 2500, + "loss": 0.0017, + "ppl": 0.0208740234375, + "reward": 0.9931955337524414, + "reward_std": 0.0050437659956514835, + "rewards/perpo_ocr_edit_distance_reward": 0.9918084144592285, + "step": 5000, "temperature": 0.9 } ], "logging_steps": 1.0, - "max_steps": 2500, + "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 1, - "save_steps": 100, + "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": {