| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.9979494190020506, | |
| "eval_steps": 100, | |
| "global_step": 1095, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.02734107997265892, | |
| "grad_norm": 706.2386474609375, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 3.6456, | |
| "mean_token_accuracy": 0.43679042160511017, | |
| "num_tokens": 171126.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.05468215994531784, | |
| "grad_norm": 40.858299255371094, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 1.9492, | |
| "mean_token_accuracy": 0.6433063127100468, | |
| "num_tokens": 340272.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.08202323991797676, | |
| "grad_norm": 10.141822814941406, | |
| "learning_rate": 6e-06, | |
| "loss": 1.3747, | |
| "mean_token_accuracy": 0.7164170637726783, | |
| "num_tokens": 512138.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.10936431989063568, | |
| "grad_norm": 5.630201816558838, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 1.2492, | |
| "mean_token_accuracy": 0.7324269533157348, | |
| "num_tokens": 682029.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.1367053998632946, | |
| "grad_norm": 4.846619129180908, | |
| "learning_rate": 1e-05, | |
| "loss": 1.1189, | |
| "mean_token_accuracy": 0.7480411291122436, | |
| "num_tokens": 850280.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.16404647983595352, | |
| "grad_norm": 2.89782452583313, | |
| "learning_rate": 1.2e-05, | |
| "loss": 0.9996, | |
| "mean_token_accuracy": 0.7543315425515175, | |
| "num_tokens": 1018134.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.19138755980861244, | |
| "grad_norm": 2.9257192611694336, | |
| "learning_rate": 1.4e-05, | |
| "loss": 0.9432, | |
| "mean_token_accuracy": 0.7699216932058335, | |
| "num_tokens": 1187053.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.21872863978127136, | |
| "grad_norm": 2.4271352291107178, | |
| "learning_rate": 1.6000000000000003e-05, | |
| "loss": 0.8963, | |
| "mean_token_accuracy": 0.7777562007308007, | |
| "num_tokens": 1355716.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.24606971975393027, | |
| "grad_norm": 3.1461400985717773, | |
| "learning_rate": 1.8e-05, | |
| "loss": 0.8491, | |
| "mean_token_accuracy": 0.780939394235611, | |
| "num_tokens": 1523424.0, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.2734107997265892, | |
| "grad_norm": 1.6774334907531738, | |
| "learning_rate": 2e-05, | |
| "loss": 0.8691, | |
| "mean_token_accuracy": 0.7762249544262886, | |
| "num_tokens": 1692335.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.2734107997265892, | |
| "eval_loss": 0.8699743747711182, | |
| "eval_mean_token_accuracy": 0.7742338619349193, | |
| "eval_num_tokens": 1692335.0, | |
| "eval_runtime": 14.2578, | |
| "eval_samples_per_second": 364.783, | |
| "eval_steps_per_second": 11.432, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.3007518796992481, | |
| "grad_norm": 1.4909802675247192, | |
| "learning_rate": 1.999501589126174e-05, | |
| "loss": 0.8914, | |
| "mean_token_accuracy": 0.7690163850784302, | |
| "num_tokens": 1866084.0, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.32809295967190705, | |
| "grad_norm": 6.061567783355713, | |
| "learning_rate": 1.9980068533314937e-05, | |
| "loss": 0.9556, | |
| "mean_token_accuracy": 0.7586096897721291, | |
| "num_tokens": 2037177.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.35543403964456594, | |
| "grad_norm": 1.5067142248153687, | |
| "learning_rate": 1.995517282601106e-05, | |
| "loss": 0.9423, | |
| "mean_token_accuracy": 0.7592512294650078, | |
| "num_tokens": 2206650.0, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.3827751196172249, | |
| "grad_norm": 1.3846155405044556, | |
| "learning_rate": 1.992035358593258e-05, | |
| "loss": 0.8981, | |
| "mean_token_accuracy": 0.7691041335463524, | |
| "num_tokens": 2377519.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.4101161995898838, | |
| "grad_norm": 1.385062575340271, | |
| "learning_rate": 1.987564552165524e-05, | |
| "loss": 0.8706, | |
| "mean_token_accuracy": 0.7754243835806847, | |
| "num_tokens": 2543887.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.4374572795625427, | |
| "grad_norm": 1.2568836212158203, | |
| "learning_rate": 1.9821093199149806e-05, | |
| "loss": 0.9014, | |
| "mean_token_accuracy": 0.767622135579586, | |
| "num_tokens": 2717105.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.46479835953520166, | |
| "grad_norm": 1.3307958841323853, | |
| "learning_rate": 1.9756750997357738e-05, | |
| "loss": 0.8707, | |
| "mean_token_accuracy": 0.7744425281882286, | |
| "num_tokens": 2885656.0, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.49213943950786054, | |
| "grad_norm": 1.3584340810775757, | |
| "learning_rate": 1.9682683053985073e-05, | |
| "loss": 0.8685, | |
| "mean_token_accuracy": 0.7746870696544648, | |
| "num_tokens": 3055414.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.5194805194805194, | |
| "grad_norm": 1.4741311073303223, | |
| "learning_rate": 1.959896320156857e-05, | |
| "loss": 0.8758, | |
| "mean_token_accuracy": 0.7729167580604553, | |
| "num_tokens": 3224208.0, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.5468215994531784, | |
| "grad_norm": 1.252274990081787, | |
| "learning_rate": 1.950567489387783e-05, | |
| "loss": 0.8767, | |
| "mean_token_accuracy": 0.7731393739581108, | |
| "num_tokens": 3394113.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.5468215994531784, | |
| "eval_loss": 0.8808718323707581, | |
| "eval_mean_token_accuracy": 0.7713644226635892, | |
| "eval_num_tokens": 3394113.0, | |
| "eval_runtime": 14.1481, | |
| "eval_samples_per_second": 367.611, | |
| "eval_steps_per_second": 11.521, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.5741626794258373, | |
| "grad_norm": 1.2876965999603271, | |
| "learning_rate": 1.9402911122726756e-05, | |
| "loss": 0.8797, | |
| "mean_token_accuracy": 0.7735408559441567, | |
| "num_tokens": 3561161.0, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.6015037593984962, | |
| "grad_norm": 2.8176612854003906, | |
| "learning_rate": 1.9290774325277305e-05, | |
| "loss": 0.8902, | |
| "mean_token_accuracy": 0.7713056340813637, | |
| "num_tokens": 3732561.0, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.6288448393711552, | |
| "grad_norm": 1.2533715963363647, | |
| "learning_rate": 1.916937628192789e-05, | |
| "loss": 0.8588, | |
| "mean_token_accuracy": 0.7773521527647972, | |
| "num_tokens": 3900320.0, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.6561859193438141, | |
| "grad_norm": 1.3737971782684326, | |
| "learning_rate": 1.903883800488824e-05, | |
| "loss": 0.8796, | |
| "mean_token_accuracy": 0.7744974941015244, | |
| "num_tokens": 4068430.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.683526999316473, | |
| "grad_norm": 1.3005715608596802, | |
| "learning_rate": 1.8899289617551803e-05, | |
| "loss": 0.8767, | |
| "mean_token_accuracy": 0.7730578362941742, | |
| "num_tokens": 4238745.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.7108680792891319, | |
| "grad_norm": 1.2251440286636353, | |
| "learning_rate": 1.875087022478594e-05, | |
| "loss": 0.872, | |
| "mean_token_accuracy": 0.7743289664387702, | |
| "num_tokens": 4408233.0, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.7382091592617909, | |
| "grad_norm": 1.1912628412246704, | |
| "learning_rate": 1.8593727774269122e-05, | |
| "loss": 0.8857, | |
| "mean_token_accuracy": 0.7705358847975731, | |
| "num_tokens": 4581540.0, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.7655502392344498, | |
| "grad_norm": 1.1127012968063354, | |
| "learning_rate": 1.842801890901351e-05, | |
| "loss": 0.8716, | |
| "mean_token_accuracy": 0.7735901325941086, | |
| "num_tokens": 4752315.0, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.7928913192071086, | |
| "grad_norm": 1.1501351594924927, | |
| "learning_rate": 1.8253908811219764e-05, | |
| "loss": 0.8572, | |
| "mean_token_accuracy": 0.7767810940742492, | |
| "num_tokens": 4922291.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.8202323991797676, | |
| "grad_norm": 1.1722129583358765, | |
| "learning_rate": 1.8071571037619856e-05, | |
| "loss": 0.8814, | |
| "mean_token_accuracy": 0.7732006222009659, | |
| "num_tokens": 5091028.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.8202323991797676, | |
| "eval_loss": 0.8661420345306396, | |
| "eval_mean_token_accuracy": 0.774554943745853, | |
| "eval_num_tokens": 5091028.0, | |
| "eval_runtime": 14.1269, | |
| "eval_samples_per_second": 368.164, | |
| "eval_steps_per_second": 11.538, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.8475734791524265, | |
| "grad_norm": 1.1900874376296997, | |
| "learning_rate": 1.7881187346471924e-05, | |
| "loss": 0.878, | |
| "mean_token_accuracy": 0.7729042619466782, | |
| "num_tokens": 5263685.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.8749145591250854, | |
| "grad_norm": 1.1753484010696411, | |
| "learning_rate": 1.7682947516379706e-05, | |
| "loss": 0.8649, | |
| "mean_token_accuracy": 0.775020281970501, | |
| "num_tokens": 5433324.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.9022556390977443, | |
| "grad_norm": 1.0580289363861084, | |
| "learning_rate": 1.7477049157117093e-05, | |
| "loss": 0.873, | |
| "mean_token_accuracy": 0.7735655456781387, | |
| "num_tokens": 5602221.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.9295967190704033, | |
| "grad_norm": 1.3067166805267334, | |
| "learning_rate": 1.7263697512646397e-05, | |
| "loss": 0.8579, | |
| "mean_token_accuracy": 0.7792476788163185, | |
| "num_tokens": 5770719.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.9569377990430622, | |
| "grad_norm": 1.161084532737732, | |
| "learning_rate": 1.7043105256526723e-05, | |
| "loss": 0.8507, | |
| "mean_token_accuracy": 0.7788742691278457, | |
| "num_tokens": 5939298.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.9842788790157211, | |
| "grad_norm": 1.1978732347488403, | |
| "learning_rate": 1.681549227991634e-05, | |
| "loss": 0.8597, | |
| "mean_token_accuracy": 0.7768805012106895, | |
| "num_tokens": 6109792.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.0136705399863295, | |
| "grad_norm": 1.2675771713256836, | |
| "learning_rate": 1.658108547238038e-05, | |
| "loss": 0.803, | |
| "mean_token_accuracy": 0.8084554207034227, | |
| "num_tokens": 6283339.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.0410116199589883, | |
| "grad_norm": 1.265122890472412, | |
| "learning_rate": 1.634011849572239e-05, | |
| "loss": 0.5533, | |
| "mean_token_accuracy": 0.8468987360596657, | |
| "num_tokens": 6451470.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.0683526999316473, | |
| "grad_norm": 1.2544903755187988, | |
| "learning_rate": 1.609283155106517e-05, | |
| "loss": 0.534, | |
| "mean_token_accuracy": 0.8496995747089386, | |
| "num_tokens": 6620528.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.0956937799043063, | |
| "grad_norm": 1.2964367866516113, | |
| "learning_rate": 1.5839471139413065e-05, | |
| "loss": 0.5464, | |
| "mean_token_accuracy": 0.8472872495651245, | |
| "num_tokens": 6789027.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.0956937799043063, | |
| "eval_loss": 0.923022449016571, | |
| "eval_mean_token_accuracy": 0.7719546443114251, | |
| "eval_num_tokens": 6789027.0, | |
| "eval_runtime": 14.2222, | |
| "eval_samples_per_second": 365.696, | |
| "eval_steps_per_second": 11.461, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.123034859876965, | |
| "grad_norm": 1.214882254600525, | |
| "learning_rate": 1.55802898159344e-05, | |
| "loss": 0.5407, | |
| "mean_token_accuracy": 0.848370036482811, | |
| "num_tokens": 6958294.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.150375939849624, | |
| "grad_norm": 1.1957063674926758, | |
| "learning_rate": 1.5315545938209016e-05, | |
| "loss": 0.5303, | |
| "mean_token_accuracy": 0.8504750013351441, | |
| "num_tokens": 7126382.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.177717019822283, | |
| "grad_norm": 1.2198673486709595, | |
| "learning_rate": 1.5045503408691776e-05, | |
| "loss": 0.5361, | |
| "mean_token_accuracy": 0.8481929019093514, | |
| "num_tokens": 7296187.0, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.2050580997949418, | |
| "grad_norm": 1.2924712896347046, | |
| "learning_rate": 1.4770431411648898e-05, | |
| "loss": 0.5327, | |
| "mean_token_accuracy": 0.8494196251034737, | |
| "num_tokens": 7467368.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.2323991797676008, | |
| "grad_norm": 1.169443130493164, | |
| "learning_rate": 1.4490604144829204e-05, | |
| "loss": 0.5543, | |
| "mean_token_accuracy": 0.8449285939335823, | |
| "num_tokens": 7638452.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.2597402597402598, | |
| "grad_norm": 1.2594044208526611, | |
| "learning_rate": 1.4206300546137844e-05, | |
| "loss": 0.5341, | |
| "mean_token_accuracy": 0.8485873684287071, | |
| "num_tokens": 7810489.0, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.2870813397129186, | |
| "grad_norm": 1.2319672107696533, | |
| "learning_rate": 1.3917804015584932e-05, | |
| "loss": 0.5496, | |
| "mean_token_accuracy": 0.8453727856278419, | |
| "num_tokens": 7980694.0, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.3144224196855776, | |
| "grad_norm": 1.2721346616744995, | |
| "learning_rate": 1.3625402132786247e-05, | |
| "loss": 0.5241, | |
| "mean_token_accuracy": 0.8517711386084557, | |
| "num_tokens": 8150413.0, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.3417634996582364, | |
| "grad_norm": 1.2490406036376953, | |
| "learning_rate": 1.3329386370297615e-05, | |
| "loss": 0.5359, | |
| "mean_token_accuracy": 0.848786735534668, | |
| "num_tokens": 8320550.0, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.3691045796308954, | |
| "grad_norm": 1.2758678197860718, | |
| "learning_rate": 1.3030051803068729e-05, | |
| "loss": 0.5357, | |
| "mean_token_accuracy": 0.8479269713163375, | |
| "num_tokens": 8490464.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.3691045796308954, | |
| "eval_loss": 0.8972102999687195, | |
| "eval_mean_token_accuracy": 0.7732659333322677, | |
| "eval_num_tokens": 8490464.0, | |
| "eval_runtime": 14.2312, | |
| "eval_samples_per_second": 365.464, | |
| "eval_steps_per_second": 11.454, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.3964456596035544, | |
| "grad_norm": 1.197234034538269, | |
| "learning_rate": 1.2727696814306034e-05, | |
| "loss": 0.5424, | |
| "mean_token_accuracy": 0.8464474871754646, | |
| "num_tokens": 8658228.0, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.4237867395762134, | |
| "grad_norm": 1.2052682638168335, | |
| "learning_rate": 1.2422622798037833e-05, | |
| "loss": 0.5344, | |
| "mean_token_accuracy": 0.849498587846756, | |
| "num_tokens": 8827640.0, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.4511278195488722, | |
| "grad_norm": 1.2217556238174438, | |
| "learning_rate": 1.2115133858678192e-05, | |
| "loss": 0.5329, | |
| "mean_token_accuracy": 0.8495344504714012, | |
| "num_tokens": 8998630.0, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.4784688995215312, | |
| "grad_norm": 1.2305293083190918, | |
| "learning_rate": 1.1805536507889021e-05, | |
| "loss": 0.5324, | |
| "mean_token_accuracy": 0.8484964698553086, | |
| "num_tokens": 9165678.0, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.50580997949419, | |
| "grad_norm": 1.2119832038879395, | |
| "learning_rate": 1.1494139359042612e-05, | |
| "loss": 0.5365, | |
| "mean_token_accuracy": 0.8479975119233132, | |
| "num_tokens": 9336809.0, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.533151059466849, | |
| "grad_norm": 1.2407721281051636, | |
| "learning_rate": 1.1181252819589081e-05, | |
| "loss": 0.5316, | |
| "mean_token_accuracy": 0.8489836812019348, | |
| "num_tokens": 9507494.0, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.560492139439508, | |
| "grad_norm": 1.2563666105270386, | |
| "learning_rate": 1.086718878163551e-05, | |
| "loss": 0.5372, | |
| "mean_token_accuracy": 0.8482470452785492, | |
| "num_tokens": 9678069.0, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.587833219412167, | |
| "grad_norm": 1.1326807737350464, | |
| "learning_rate": 1.0552260311045082e-05, | |
| "loss": 0.5207, | |
| "mean_token_accuracy": 0.8524438634514808, | |
| "num_tokens": 9844214.0, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.6151742993848257, | |
| "grad_norm": 1.2195802927017212, | |
| "learning_rate": 1.0236781335366239e-05, | |
| "loss": 0.5437, | |
| "mean_token_accuracy": 0.8467420265078545, | |
| "num_tokens": 10013802.0, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.6425153793574845, | |
| "grad_norm": 1.2044719457626343, | |
| "learning_rate": 9.92106633090287e-06, | |
| "loss": 0.5234, | |
| "mean_token_accuracy": 0.8509897753596306, | |
| "num_tokens": 10183182.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.6425153793574845, | |
| "eval_loss": 0.8916485905647278, | |
| "eval_mean_token_accuracy": 0.7742225652092074, | |
| "eval_num_tokens": 10183182.0, | |
| "eval_runtime": 14.1444, | |
| "eval_samples_per_second": 367.707, | |
| "eval_steps_per_second": 11.524, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.6698564593301435, | |
| "grad_norm": 1.2906745672225952, | |
| "learning_rate": 9.605430009237474e-06, | |
| "loss": 0.549, | |
| "mean_token_accuracy": 0.8446809872984886, | |
| "num_tokens": 10355702.0, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.6971975393028025, | |
| "grad_norm": 1.1947541236877441, | |
| "learning_rate": 9.290187003519841e-06, | |
| "loss": 0.5447, | |
| "mean_token_accuracy": 0.8470652237534523, | |
| "num_tokens": 10527901.0, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.7245386192754615, | |
| "grad_norm": 1.2121537923812866, | |
| "learning_rate": 8.975651554833869e-06, | |
| "loss": 0.5352, | |
| "mean_token_accuracy": 0.8479515925049782, | |
| "num_tokens": 10698378.0, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.7518796992481203, | |
| "grad_norm": 1.274902105331421, | |
| "learning_rate": 8.662137198955211e-06, | |
| "loss": 0.5317, | |
| "mean_token_accuracy": 0.8490415424108505, | |
| "num_tokens": 10863945.0, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.7792207792207793, | |
| "grad_norm": 1.270011305809021, | |
| "learning_rate": 8.349956453812009e-06, | |
| "loss": 0.5193, | |
| "mean_token_accuracy": 0.8525262147188186, | |
| "num_tokens": 11032272.0, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.806561859193438, | |
| "grad_norm": 1.1416285037994385, | |
| "learning_rate": 8.03942050796022e-06, | |
| "loss": 0.538, | |
| "mean_token_accuracy": 0.8476535528898239, | |
| "num_tokens": 11200924.0, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.833902939166097, | |
| "grad_norm": 1.182759165763855, | |
| "learning_rate": 7.730838910384098e-06, | |
| "loss": 0.5182, | |
| "mean_token_accuracy": 0.8533438310027123, | |
| "num_tokens": 11369904.0, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.861244019138756, | |
| "grad_norm": 1.2786790132522583, | |
| "learning_rate": 7.424519261931036e-06, | |
| "loss": 0.5427, | |
| "mean_token_accuracy": 0.846244253218174, | |
| "num_tokens": 11541083.0, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.888585099111415, | |
| "grad_norm": 1.2706736326217651, | |
| "learning_rate": 7.1207669086883366e-06, | |
| "loss": 0.5438, | |
| "mean_token_accuracy": 0.8467564389109612, | |
| "num_tokens": 11715458.0, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.9159261790840738, | |
| "grad_norm": 1.23281991481781, | |
| "learning_rate": 6.819884637607619e-06, | |
| "loss": 0.522, | |
| "mean_token_accuracy": 0.85111885368824, | |
| "num_tokens": 11885424.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.9159261790840738, | |
| "eval_loss": 0.8901246190071106, | |
| "eval_mean_token_accuracy": 0.7761346054223418, | |
| "eval_num_tokens": 11885424.0, | |
| "eval_runtime": 14.2093, | |
| "eval_samples_per_second": 366.027, | |
| "eval_steps_per_second": 11.471, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.9432672590567326, | |
| "grad_norm": 1.181417465209961, | |
| "learning_rate": 6.522172374680177e-06, | |
| "loss": 0.5152, | |
| "mean_token_accuracy": 0.8532154947519303, | |
| "num_tokens": 12053676.0, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.9706083390293916, | |
| "grad_norm": 1.2707254886627197, | |
| "learning_rate": 6.2279268859642396e-06, | |
| "loss": 0.5259, | |
| "mean_token_accuracy": 0.8510755389928818, | |
| "num_tokens": 12221973.0, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.9979494190020506, | |
| "grad_norm": 1.197986125946045, | |
| "learning_rate": 5.937441481762112e-06, | |
| "loss": 0.5305, | |
| "mean_token_accuracy": 0.8508959412574768, | |
| "num_tokens": 12391740.0, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 2.027341079972659, | |
| "grad_norm": 1.7105273008346558, | |
| "learning_rate": 5.651005724242072e-06, | |
| "loss": 0.331, | |
| "mean_token_accuracy": 0.9215623707306094, | |
| "num_tokens": 12566261.0, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 2.054682159945318, | |
| "grad_norm": 1.1565046310424805, | |
| "learning_rate": 5.368905138796523e-06, | |
| "loss": 0.2543, | |
| "mean_token_accuracy": 0.9277019128203392, | |
| "num_tokens": 12734646.0, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 2.0820232399179766, | |
| "grad_norm": 1.153232216835022, | |
| "learning_rate": 5.091420929424065e-06, | |
| "loss": 0.2451, | |
| "mean_token_accuracy": 0.9302394583821296, | |
| "num_tokens": 12905690.0, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 2.1093643198906356, | |
| "grad_norm": 1.2222003936767578, | |
| "learning_rate": 4.818829698419225e-06, | |
| "loss": 0.2436, | |
| "mean_token_accuracy": 0.9303286671638489, | |
| "num_tokens": 13079445.0, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 2.1367053998632946, | |
| "grad_norm": 1.2984336614608765, | |
| "learning_rate": 4.551403170649299e-06, | |
| "loss": 0.2528, | |
| "mean_token_accuracy": 0.9280653029680253, | |
| "num_tokens": 13249001.0, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 2.1640464798359536, | |
| "grad_norm": 1.2998665571212769, | |
| "learning_rate": 4.289407922693053e-06, | |
| "loss": 0.2474, | |
| "mean_token_accuracy": 0.9294205904006958, | |
| "num_tokens": 13418283.0, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 2.1913875598086126, | |
| "grad_norm": 1.1589492559432983, | |
| "learning_rate": 4.033105117111441e-06, | |
| "loss": 0.244, | |
| "mean_token_accuracy": 0.9301661103963852, | |
| "num_tokens": 13591481.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.1913875598086126, | |
| "eval_loss": 1.0690910816192627, | |
| "eval_mean_token_accuracy": 0.7694480528860735, | |
| "eval_num_tokens": 13591481.0, | |
| "eval_runtime": 14.1576, | |
| "eval_samples_per_second": 367.366, | |
| "eval_steps_per_second": 11.513, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.2187286397812715, | |
| "grad_norm": 1.2524975538253784, | |
| "learning_rate": 3.7827502421150497e-06, | |
| "loss": 0.2465, | |
| "mean_token_accuracy": 0.9289533108472824, | |
| "num_tokens": 13759603.0, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 2.24606971975393, | |
| "grad_norm": 1.2752741575241089, | |
| "learning_rate": 3.5385928568879012e-06, | |
| "loss": 0.2489, | |
| "mean_token_accuracy": 0.9280223950743676, | |
| "num_tokens": 13928891.0, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 2.273410799726589, | |
| "grad_norm": 1.1992969512939453, | |
| "learning_rate": 3.300876342821451e-06, | |
| "loss": 0.2555, | |
| "mean_token_accuracy": 0.9266896471381187, | |
| "num_tokens": 14099181.0, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 2.300751879699248, | |
| "grad_norm": 1.2335748672485352, | |
| "learning_rate": 3.0698376609066828e-06, | |
| "loss": 0.2531, | |
| "mean_token_accuracy": 0.9276198267936706, | |
| "num_tokens": 14269195.0, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 2.328092959671907, | |
| "grad_norm": 1.2644842863082886, | |
| "learning_rate": 2.8457071155262885e-06, | |
| "loss": 0.2509, | |
| "mean_token_accuracy": 0.9287231177091598, | |
| "num_tokens": 14438098.0, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 2.355434039644566, | |
| "grad_norm": 1.2391464710235596, | |
| "learning_rate": 2.628708124882212e-06, | |
| "loss": 0.2467, | |
| "mean_token_accuracy": 0.9289599403738975, | |
| "num_tokens": 14609269.0, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 2.382775119617225, | |
| "grad_norm": 1.1992994546890259, | |
| "learning_rate": 2.419056998287547e-06, | |
| "loss": 0.2471, | |
| "mean_token_accuracy": 0.9298149168491363, | |
| "num_tokens": 14775550.0, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 2.4101161995898837, | |
| "grad_norm": 1.2357077598571777, | |
| "learning_rate": 2.216962720544703e-06, | |
| "loss": 0.2499, | |
| "mean_token_accuracy": 0.9283073455095291, | |
| "num_tokens": 14946021.0, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 2.4374572795625427, | |
| "grad_norm": 1.1988837718963623, | |
| "learning_rate": 2.022626743624807e-06, | |
| "loss": 0.2461, | |
| "mean_token_accuracy": 0.9295058876276017, | |
| "num_tokens": 15116026.0, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 2.4647983595352017, | |
| "grad_norm": 1.1734275817871094, | |
| "learning_rate": 1.8362427858560094e-06, | |
| "loss": 0.2454, | |
| "mean_token_accuracy": 0.9299230113625526, | |
| "num_tokens": 15287643.0, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.4647983595352017, | |
| "eval_loss": 1.0691540241241455, | |
| "eval_mean_token_accuracy": 0.7695598562070929, | |
| "eval_num_tokens": 15287643.0, | |
| "eval_runtime": 14.2047, | |
| "eval_samples_per_second": 366.146, | |
| "eval_steps_per_second": 11.475, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.4921394395078607, | |
| "grad_norm": 1.1861553192138672, | |
| "learning_rate": 1.6579966388208257e-06, | |
| "loss": 0.2415, | |
| "mean_token_accuracy": 0.9315058842301369, | |
| "num_tokens": 15455425.0, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 2.5194805194805197, | |
| "grad_norm": 1.1142207384109497, | |
| "learning_rate": 1.4880659821550547e-06, | |
| "loss": 0.2387, | |
| "mean_token_accuracy": 0.931833279132843, | |
| "num_tokens": 15625317.0, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 2.546821599453178, | |
| "grad_norm": 1.2019150257110596, | |
| "learning_rate": 1.3266202064328548e-06, | |
| "loss": 0.2424, | |
| "mean_token_accuracy": 0.9311563596129417, | |
| "num_tokens": 15795191.0, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 2.574162679425837, | |
| "grad_norm": 1.2547236680984497, | |
| "learning_rate": 1.1738202443145307e-06, | |
| "loss": 0.2449, | |
| "mean_token_accuracy": 0.9302171498537064, | |
| "num_tokens": 15966331.0, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 2.601503759398496, | |
| "grad_norm": 1.1526901721954346, | |
| "learning_rate": 1.029818410125365e-06, | |
| "loss": 0.2457, | |
| "mean_token_accuracy": 0.9300970509648323, | |
| "num_tokens": 16132046.0, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 2.628844839371155, | |
| "grad_norm": 1.288517951965332, | |
| "learning_rate": 8.94758248025378e-07, | |
| "loss": 0.2487, | |
| "mean_token_accuracy": 0.9283649578690529, | |
| "num_tokens": 16300373.0, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 2.656185919343814, | |
| "grad_norm": 1.21402108669281, | |
| "learning_rate": 7.687743889213939e-07, | |
| "loss": 0.2415, | |
| "mean_token_accuracy": 0.930217319726944, | |
| "num_tokens": 16469055.0, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 2.6835269993164728, | |
| "grad_norm": 1.2375903129577637, | |
| "learning_rate": 6.519924162640168e-07, | |
| "loss": 0.2408, | |
| "mean_token_accuracy": 0.9317809835076332, | |
| "num_tokens": 16638392.0, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 2.7108680792891318, | |
| "grad_norm": 1.1859582662582397, | |
| "learning_rate": 5.445287408633304e-07, | |
| "loss": 0.2416, | |
| "mean_token_accuracy": 0.9303795635700226, | |
| "num_tokens": 16807749.0, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 2.7382091592617908, | |
| "grad_norm": 1.150184154510498, | |
| "learning_rate": 4.464904848480522e-07, | |
| "loss": 0.2423, | |
| "mean_token_accuracy": 0.9296372309327126, | |
| "num_tokens": 16982293.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 2.7382091592617908, | |
| "eval_loss": 1.0727962255477905, | |
| "eval_mean_token_accuracy": 0.7691918337271989, | |
| "eval_num_tokens": 16982293.0, | |
| "eval_runtime": 14.2647, | |
| "eval_samples_per_second": 364.607, | |
| "eval_steps_per_second": 11.427, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 2.7655502392344498, | |
| "grad_norm": 1.2391895055770874, | |
| "learning_rate": 3.5797537488388326e-07, | |
| "loss": 0.2407, | |
| "mean_token_accuracy": 0.9306937634944916, | |
| "num_tokens": 17151239.0, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 2.7928913192071088, | |
| "grad_norm": 1.3776847124099731, | |
| "learning_rate": 2.790716447574304e-07, | |
| "loss": 0.2454, | |
| "mean_token_accuracy": 0.9301917076110839, | |
| "num_tokens": 17320214.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 2.8202323991797678, | |
| "grad_norm": 1.2018030881881714, | |
| "learning_rate": 2.098579474228546e-07, | |
| "loss": 0.2423, | |
| "mean_token_accuracy": 0.9306885093450546, | |
| "num_tokens": 17490016.0, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 2.8475734791524268, | |
| "grad_norm": 1.2227405309677124, | |
| "learning_rate": 1.504032765988961e-07, | |
| "loss": 0.2388, | |
| "mean_token_accuracy": 0.9320418834686279, | |
| "num_tokens": 17661813.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 2.8749145591250853, | |
| "grad_norm": 1.286917805671692, | |
| "learning_rate": 1.0076689799442874e-07, | |
| "loss": 0.2444, | |
| "mean_token_accuracy": 0.9299480274319649, | |
| "num_tokens": 17829411.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 2.9022556390977443, | |
| "grad_norm": 1.239208459854126, | |
| "learning_rate": 6.099829023112236e-08, | |
| "loss": 0.249, | |
| "mean_token_accuracy": 0.9299103036522866, | |
| "num_tokens": 17998820.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 2.9295967190704033, | |
| "grad_norm": 1.1208025217056274, | |
| "learning_rate": 3.1137095522068006e-08, | |
| "loss": 0.2402, | |
| "mean_token_accuracy": 0.9313350632786751, | |
| "num_tokens": 18167974.0, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 2.9569377990430623, | |
| "grad_norm": 1.7012122869491577, | |
| "learning_rate": 1.1213080155564327e-08, | |
| "loss": 0.2473, | |
| "mean_token_accuracy": 0.9298618510365486, | |
| "num_tokens": 18335256.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 2.9842788790157213, | |
| "grad_norm": 1.2129054069519043, | |
| "learning_rate": 1.246104823426908e-09, | |
| "loss": 0.2468, | |
| "mean_token_accuracy": 0.9303196057677269, | |
| "num_tokens": 18506697.0, | |
| "step": 1090 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1095, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.4034604245899018e+18, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |