{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 30.0, "eval_steps": 500, "global_step": 9810, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0030581039755351682, "grad_norm": null, "learning_rate": 0.0, "loss": 15.1701, "step": 1 }, { "epoch": 0.03058103975535168, "grad_norm": 22.001853942871094, "learning_rate": 4.5e-06, "loss": 18.0699, "step": 10 }, { "epoch": 0.06116207951070336, "grad_norm": 31.736783981323242, "learning_rate": 9.5e-06, "loss": 16.5865, "step": 20 }, { "epoch": 0.09174311926605505, "grad_norm": 39.22868347167969, "learning_rate": 1.45e-05, "loss": 12.788, "step": 30 }, { "epoch": 0.12232415902140673, "grad_norm": 23.849876403808594, "learning_rate": 1.9500000000000003e-05, "loss": 8.3233, "step": 40 }, { "epoch": 0.1529051987767584, "grad_norm": 8.918281555175781, "learning_rate": 2.45e-05, "loss": 4.9971, "step": 50 }, { "epoch": 0.1834862385321101, "grad_norm": 7.4376420974731445, "learning_rate": 2.95e-05, "loss": 4.2418, "step": 60 }, { "epoch": 0.21406727828746178, "grad_norm": 3.3368353843688965, "learning_rate": 3.45e-05, "loss": 4.0631, "step": 70 }, { "epoch": 0.24464831804281345, "grad_norm": 3.5972115993499756, "learning_rate": 3.9500000000000005e-05, "loss": 3.8988, "step": 80 }, { "epoch": 0.27522935779816515, "grad_norm": 1.5172040462493896, "learning_rate": 4.4500000000000004e-05, "loss": 3.8347, "step": 90 }, { "epoch": 0.3058103975535168, "grad_norm": 2.7769057750701904, "learning_rate": 4.9500000000000004e-05, "loss": 3.7567, "step": 100 }, { "epoch": 0.3363914373088685, "grad_norm": 0.8358258605003357, "learning_rate": 5.45e-05, "loss": 3.6505, "step": 110 }, { "epoch": 0.3669724770642202, "grad_norm": 0.9350353479385376, "learning_rate": 5.95e-05, "loss": 3.5953, "step": 120 }, { "epoch": 0.39755351681957185, "grad_norm": 0.7625614404678345, "learning_rate": 6.450000000000001e-05, "loss": 3.6017, "step": 130 
}, { "epoch": 0.42813455657492355, "grad_norm": 0.8317046761512756, "learning_rate": 6.95e-05, "loss": 3.6038, "step": 140 }, { "epoch": 0.45871559633027525, "grad_norm": 3.3260114192962646, "learning_rate": 7.450000000000001e-05, "loss": 3.6195, "step": 150 }, { "epoch": 0.4892966360856269, "grad_norm": 3.696671485900879, "learning_rate": 7.950000000000001e-05, "loss": 3.5211, "step": 160 }, { "epoch": 0.5198776758409785, "grad_norm": 1.0185092687606812, "learning_rate": 8.450000000000001e-05, "loss": 3.5306, "step": 170 }, { "epoch": 0.5504587155963303, "grad_norm": 0.7007076740264893, "learning_rate": 8.950000000000001e-05, "loss": 3.5614, "step": 180 }, { "epoch": 0.581039755351682, "grad_norm": 1.0399439334869385, "learning_rate": 9.449999999999999e-05, "loss": 3.5602, "step": 190 }, { "epoch": 0.6116207951070336, "grad_norm": 0.945879340171814, "learning_rate": 9.95e-05, "loss": 3.5759, "step": 200 }, { "epoch": 0.6422018348623854, "grad_norm": 0.6479012966156006, "learning_rate": 9.99063475546306e-05, "loss": 3.5129, "step": 210 }, { "epoch": 0.672782874617737, "grad_norm": 0.9196483492851257, "learning_rate": 9.980228928199792e-05, "loss": 3.4773, "step": 220 }, { "epoch": 0.7033639143730887, "grad_norm": 0.8915090560913086, "learning_rate": 9.969823100936525e-05, "loss": 3.5257, "step": 230 }, { "epoch": 0.7339449541284404, "grad_norm": 0.7033812403678894, "learning_rate": 9.959417273673257e-05, "loss": 3.5344, "step": 240 }, { "epoch": 0.764525993883792, "grad_norm": 2.416579246520996, "learning_rate": 9.94901144640999e-05, "loss": 3.5397, "step": 250 }, { "epoch": 0.7951070336391437, "grad_norm": 0.5419871211051941, "learning_rate": 9.938605619146722e-05, "loss": 3.4923, "step": 260 }, { "epoch": 0.8256880733944955, "grad_norm": 1.1078801155090332, "learning_rate": 9.928199791883456e-05, "loss": 3.4991, "step": 270 }, { "epoch": 0.8562691131498471, "grad_norm": 0.8176569938659668, "learning_rate": 9.917793964620187e-05, "loss": 3.5006, "step": 280 }, { 
"epoch": 0.8868501529051988, "grad_norm": 1.468253493309021, "learning_rate": 9.90738813735692e-05, "loss": 3.5245, "step": 290 }, { "epoch": 0.9174311926605505, "grad_norm": 1.12344491481781, "learning_rate": 9.896982310093654e-05, "loss": 3.576, "step": 300 }, { "epoch": 0.9480122324159022, "grad_norm": 1.7714366912841797, "learning_rate": 9.886576482830385e-05, "loss": 3.4923, "step": 310 }, { "epoch": 0.9785932721712538, "grad_norm": 0.7636281251907349, "learning_rate": 9.876170655567119e-05, "loss": 3.4947, "step": 320 }, { "epoch": 1.0091743119266054, "grad_norm": 1.032314658164978, "learning_rate": 9.865764828303851e-05, "loss": 3.4876, "step": 330 }, { "epoch": 1.039755351681957, "grad_norm": 1.217951774597168, "learning_rate": 9.855359001040582e-05, "loss": 3.4614, "step": 340 }, { "epoch": 1.070336391437309, "grad_norm": 2.222341299057007, "learning_rate": 9.844953173777316e-05, "loss": 3.3953, "step": 350 }, { "epoch": 1.1009174311926606, "grad_norm": 2.0637242794036865, "learning_rate": 9.834547346514049e-05, "loss": 3.3575, "step": 360 }, { "epoch": 1.1314984709480123, "grad_norm": 2.882171392440796, "learning_rate": 9.824141519250781e-05, "loss": 3.2256, "step": 370 }, { "epoch": 1.162079510703364, "grad_norm": 2.4455676078796387, "learning_rate": 9.813735691987514e-05, "loss": 3.1562, "step": 380 }, { "epoch": 1.1926605504587156, "grad_norm": 2.7748982906341553, "learning_rate": 9.803329864724245e-05, "loss": 2.9223, "step": 390 }, { "epoch": 1.2232415902140672, "grad_norm": 3.552553415298462, "learning_rate": 9.792924037460979e-05, "loss": 2.8258, "step": 400 }, { "epoch": 1.2538226299694188, "grad_norm": 7.971360206604004, "learning_rate": 9.782518210197711e-05, "loss": 2.711, "step": 410 }, { "epoch": 1.2844036697247707, "grad_norm": 4.942215442657471, "learning_rate": 9.772112382934444e-05, "loss": 2.6485, "step": 420 }, { "epoch": 1.3149847094801224, "grad_norm": 2.9292726516723633, "learning_rate": 9.761706555671176e-05, "loss": 2.6137, "step": 
430 }, { "epoch": 1.345565749235474, "grad_norm": 3.012336015701294, "learning_rate": 9.751300728407909e-05, "loss": 2.4069, "step": 440 }, { "epoch": 1.3761467889908257, "grad_norm": 3.4564368724823, "learning_rate": 9.740894901144641e-05, "loss": 2.2903, "step": 450 }, { "epoch": 1.4067278287461773, "grad_norm": 5.477275848388672, "learning_rate": 9.730489073881374e-05, "loss": 2.2817, "step": 460 }, { "epoch": 1.4373088685015292, "grad_norm": 4.544290065765381, "learning_rate": 9.720083246618106e-05, "loss": 2.2782, "step": 470 }, { "epoch": 1.4678899082568808, "grad_norm": 3.39713454246521, "learning_rate": 9.709677419354839e-05, "loss": 2.2134, "step": 480 }, { "epoch": 1.4984709480122325, "grad_norm": 11.243364334106445, "learning_rate": 9.699271592091573e-05, "loss": 2.0284, "step": 490 }, { "epoch": 1.529051987767584, "grad_norm": 7.737382411956787, "learning_rate": 9.688865764828304e-05, "loss": 2.0166, "step": 500 }, { "epoch": 1.529051987767584, "eval_loss": 1.865260124206543, "eval_runtime": 17.067, "eval_samples_per_second": 17.05, "eval_steps_per_second": 8.555, "step": 500 }, { "epoch": 1.5596330275229358, "grad_norm": 7.2488694190979, "learning_rate": 9.678459937565036e-05, "loss": 2.0329, "step": 510 }, { "epoch": 1.5902140672782874, "grad_norm": 4.852959632873535, "learning_rate": 9.66805411030177e-05, "loss": 2.015, "step": 520 }, { "epoch": 1.620795107033639, "grad_norm": 5.300124645233154, "learning_rate": 9.657648283038501e-05, "loss": 2.0602, "step": 530 }, { "epoch": 1.6513761467889907, "grad_norm": 3.9786946773529053, "learning_rate": 9.647242455775235e-05, "loss": 1.8694, "step": 540 }, { "epoch": 1.6819571865443423, "grad_norm": 10.176203727722168, "learning_rate": 9.636836628511968e-05, "loss": 1.8715, "step": 550 }, { "epoch": 1.7125382262996942, "grad_norm": 4.76723051071167, "learning_rate": 9.626430801248699e-05, "loss": 1.8218, "step": 560 }, { "epoch": 1.7431192660550459, "grad_norm": 7.081244468688965, "learning_rate": 
9.616024973985433e-05, "loss": 1.8937, "step": 570 }, { "epoch": 1.7737003058103975, "grad_norm": 5.473796367645264, "learning_rate": 9.605619146722165e-05, "loss": 1.961, "step": 580 }, { "epoch": 1.8042813455657494, "grad_norm": 6.410043239593506, "learning_rate": 9.595213319458898e-05, "loss": 1.7208, "step": 590 }, { "epoch": 1.834862385321101, "grad_norm": 6.149592876434326, "learning_rate": 9.58480749219563e-05, "loss": 1.7194, "step": 600 }, { "epoch": 1.8654434250764527, "grad_norm": 8.549052238464355, "learning_rate": 9.574401664932363e-05, "loss": 1.7529, "step": 610 }, { "epoch": 1.8960244648318043, "grad_norm": 9.397221565246582, "learning_rate": 9.563995837669095e-05, "loss": 1.8058, "step": 620 }, { "epoch": 1.926605504587156, "grad_norm": 5.5874104499816895, "learning_rate": 9.553590010405828e-05, "loss": 1.7526, "step": 630 }, { "epoch": 1.9571865443425076, "grad_norm": 6.131716251373291, "learning_rate": 9.54318418314256e-05, "loss": 1.6377, "step": 640 }, { "epoch": 1.9877675840978593, "grad_norm": 14.399773597717285, "learning_rate": 9.532778355879293e-05, "loss": 1.7617, "step": 650 }, { "epoch": 2.018348623853211, "grad_norm": 3.3374037742614746, "learning_rate": 9.522372528616027e-05, "loss": 1.6114, "step": 660 }, { "epoch": 2.0489296636085625, "grad_norm": 4.952519416809082, "learning_rate": 9.511966701352758e-05, "loss": 1.4885, "step": 670 }, { "epoch": 2.079510703363914, "grad_norm": 13.499002456665039, "learning_rate": 9.50156087408949e-05, "loss": 1.4875, "step": 680 }, { "epoch": 2.1100917431192663, "grad_norm": 5.6005473136901855, "learning_rate": 9.491155046826224e-05, "loss": 1.5514, "step": 690 }, { "epoch": 2.140672782874618, "grad_norm": 7.581764221191406, "learning_rate": 9.480749219562955e-05, "loss": 1.5322, "step": 700 }, { "epoch": 2.1712538226299696, "grad_norm": 4.7952046394348145, "learning_rate": 9.470343392299689e-05, "loss": 1.5022, "step": 710 }, { "epoch": 2.2018348623853212, "grad_norm": 5.151154041290283, 
"learning_rate": 9.45993756503642e-05, "loss": 1.3642, "step": 720 }, { "epoch": 2.232415902140673, "grad_norm": 4.606308937072754, "learning_rate": 9.449531737773153e-05, "loss": 1.4213, "step": 730 }, { "epoch": 2.2629969418960245, "grad_norm": 6.860902309417725, "learning_rate": 9.439125910509887e-05, "loss": 1.4115, "step": 740 }, { "epoch": 2.293577981651376, "grad_norm": 10.510758399963379, "learning_rate": 9.428720083246618e-05, "loss": 1.5298, "step": 750 }, { "epoch": 2.324159021406728, "grad_norm": 4.704411029815674, "learning_rate": 9.418314255983352e-05, "loss": 1.4785, "step": 760 }, { "epoch": 2.3547400611620795, "grad_norm": 4.206416130065918, "learning_rate": 9.407908428720084e-05, "loss": 1.3483, "step": 770 }, { "epoch": 2.385321100917431, "grad_norm": 8.520705223083496, "learning_rate": 9.397502601456815e-05, "loss": 1.2905, "step": 780 }, { "epoch": 2.4159021406727827, "grad_norm": 7.525352478027344, "learning_rate": 9.387096774193549e-05, "loss": 1.3142, "step": 790 }, { "epoch": 2.4464831804281344, "grad_norm": 8.8917818069458, "learning_rate": 9.376690946930282e-05, "loss": 1.4135, "step": 800 }, { "epoch": 2.477064220183486, "grad_norm": 7.770375728607178, "learning_rate": 9.366285119667014e-05, "loss": 1.3812, "step": 810 }, { "epoch": 2.5076452599388377, "grad_norm": 4.642964839935303, "learning_rate": 9.355879292403747e-05, "loss": 1.2248, "step": 820 }, { "epoch": 2.5382262996941893, "grad_norm": 7.788937091827393, "learning_rate": 9.345473465140479e-05, "loss": 1.3524, "step": 830 }, { "epoch": 2.5688073394495414, "grad_norm": 9.559152603149414, "learning_rate": 9.335067637877212e-05, "loss": 1.3342, "step": 840 }, { "epoch": 2.599388379204893, "grad_norm": 6.956350803375244, "learning_rate": 9.324661810613944e-05, "loss": 1.3578, "step": 850 }, { "epoch": 2.6299694189602447, "grad_norm": 4.262228012084961, "learning_rate": 9.314255983350677e-05, "loss": 1.3014, "step": 860 }, { "epoch": 2.6605504587155964, "grad_norm": 
6.106989860534668, "learning_rate": 9.303850156087409e-05, "loss": 1.1986, "step": 870 }, { "epoch": 2.691131498470948, "grad_norm": 9.121493339538574, "learning_rate": 9.293444328824143e-05, "loss": 1.2134, "step": 880 }, { "epoch": 2.7217125382262997, "grad_norm": 6.555141925811768, "learning_rate": 9.283038501560874e-05, "loss": 1.2174, "step": 890 }, { "epoch": 2.7522935779816513, "grad_norm": 14.706107139587402, "learning_rate": 9.272632674297607e-05, "loss": 1.3036, "step": 900 }, { "epoch": 2.782874617737003, "grad_norm": 3.802637815475464, "learning_rate": 9.26222684703434e-05, "loss": 1.1968, "step": 910 }, { "epoch": 2.8134556574923546, "grad_norm": 5.46152925491333, "learning_rate": 9.251821019771072e-05, "loss": 1.2127, "step": 920 }, { "epoch": 2.8440366972477067, "grad_norm": 15.203720092773438, "learning_rate": 9.241415192507805e-05, "loss": 1.2008, "step": 930 }, { "epoch": 2.8746177370030583, "grad_norm": 7.590661525726318, "learning_rate": 9.231009365244538e-05, "loss": 1.2146, "step": 940 }, { "epoch": 2.90519877675841, "grad_norm": 5.586644172668457, "learning_rate": 9.220603537981269e-05, "loss": 1.2314, "step": 950 }, { "epoch": 2.9357798165137616, "grad_norm": 3.6150929927825928, "learning_rate": 9.210197710718003e-05, "loss": 1.2373, "step": 960 }, { "epoch": 2.9663608562691133, "grad_norm": 4.534612655639648, "learning_rate": 9.199791883454734e-05, "loss": 1.1448, "step": 970 }, { "epoch": 2.996941896024465, "grad_norm": 7.266969203948975, "learning_rate": 9.189386056191468e-05, "loss": 1.22, "step": 980 }, { "epoch": 3.0275229357798166, "grad_norm": 4.0980987548828125, "learning_rate": 9.1789802289282e-05, "loss": 1.0809, "step": 990 }, { "epoch": 3.058103975535168, "grad_norm": 7.4049201011657715, "learning_rate": 9.168574401664933e-05, "loss": 1.0163, "step": 1000 }, { "epoch": 3.058103975535168, "eval_loss": 0.9477021098136902, "eval_runtime": 15.4641, "eval_samples_per_second": 18.818, "eval_steps_per_second": 9.441, "step": 1000 }, { 
"epoch": 3.08868501529052, "grad_norm": 6.115686416625977, "learning_rate": 9.158168574401665e-05, "loss": 1.1167, "step": 1010 }, { "epoch": 3.1192660550458715, "grad_norm": 4.0201005935668945, "learning_rate": 9.147762747138398e-05, "loss": 1.0734, "step": 1020 }, { "epoch": 3.149847094801223, "grad_norm": 5.6173176765441895, "learning_rate": 9.13735691987513e-05, "loss": 1.1944, "step": 1030 }, { "epoch": 3.180428134556575, "grad_norm": 6.7310590744018555, "learning_rate": 9.126951092611863e-05, "loss": 1.0825, "step": 1040 }, { "epoch": 3.2110091743119265, "grad_norm": 3.8579742908477783, "learning_rate": 9.116545265348595e-05, "loss": 0.9264, "step": 1050 }, { "epoch": 3.241590214067278, "grad_norm": 6.208363056182861, "learning_rate": 9.106139438085328e-05, "loss": 1.0291, "step": 1060 }, { "epoch": 3.2721712538226297, "grad_norm": 8.63094711303711, "learning_rate": 9.09573361082206e-05, "loss": 1.0673, "step": 1070 }, { "epoch": 3.302752293577982, "grad_norm": 8.783857345581055, "learning_rate": 9.085327783558793e-05, "loss": 1.1216, "step": 1080 }, { "epoch": 3.3333333333333335, "grad_norm": 3.576794147491455, "learning_rate": 9.074921956295525e-05, "loss": 1.0424, "step": 1090 }, { "epoch": 3.363914373088685, "grad_norm": 5.663135528564453, "learning_rate": 9.06451612903226e-05, "loss": 0.9512, "step": 1100 }, { "epoch": 3.3944954128440368, "grad_norm": 5.0192108154296875, "learning_rate": 9.05411030176899e-05, "loss": 0.9458, "step": 1110 }, { "epoch": 3.4250764525993884, "grad_norm": 9.276246070861816, "learning_rate": 9.043704474505723e-05, "loss": 0.9878, "step": 1120 }, { "epoch": 3.45565749235474, "grad_norm": 6.982539653778076, "learning_rate": 9.033298647242457e-05, "loss": 1.0918, "step": 1130 }, { "epoch": 3.4862385321100917, "grad_norm": 4.335603713989258, "learning_rate": 9.022892819979188e-05, "loss": 0.9489, "step": 1140 }, { "epoch": 3.5168195718654434, "grad_norm": 4.5788092613220215, "learning_rate": 9.012486992715922e-05, "loss": 0.9649, 
"step": 1150 }, { "epoch": 3.547400611620795, "grad_norm": 4.774909496307373, "learning_rate": 9.002081165452654e-05, "loss": 0.9678, "step": 1160 }, { "epoch": 3.5779816513761467, "grad_norm": 9.079809188842773, "learning_rate": 8.991675338189387e-05, "loss": 1.049, "step": 1170 }, { "epoch": 3.6085626911314987, "grad_norm": 5.256271839141846, "learning_rate": 8.98126951092612e-05, "loss": 1.039, "step": 1180 }, { "epoch": 3.6391437308868504, "grad_norm": 5.269153118133545, "learning_rate": 8.970863683662852e-05, "loss": 0.9424, "step": 1190 }, { "epoch": 3.669724770642202, "grad_norm": 10.608845710754395, "learning_rate": 8.960457856399584e-05, "loss": 0.9202, "step": 1200 }, { "epoch": 3.7003058103975537, "grad_norm": 5.39443302154541, "learning_rate": 8.950052029136317e-05, "loss": 0.9395, "step": 1210 }, { "epoch": 3.7308868501529053, "grad_norm": 10.286017417907715, "learning_rate": 8.93964620187305e-05, "loss": 0.9538, "step": 1220 }, { "epoch": 3.761467889908257, "grad_norm": 7.594948768615723, "learning_rate": 8.929240374609782e-05, "loss": 1.0294, "step": 1230 }, { "epoch": 3.7920489296636086, "grad_norm": 6.064659595489502, "learning_rate": 8.918834547346514e-05, "loss": 0.9486, "step": 1240 }, { "epoch": 3.8226299694189603, "grad_norm": 5.366261959075928, "learning_rate": 8.908428720083247e-05, "loss": 0.9198, "step": 1250 }, { "epoch": 3.853211009174312, "grad_norm": 5.648370265960693, "learning_rate": 8.89802289281998e-05, "loss": 0.9617, "step": 1260 }, { "epoch": 3.8837920489296636, "grad_norm": 10.285215377807617, "learning_rate": 8.887617065556713e-05, "loss": 0.9628, "step": 1270 }, { "epoch": 3.914373088685015, "grad_norm": 5.895651340484619, "learning_rate": 8.877211238293444e-05, "loss": 0.9326, "step": 1280 }, { "epoch": 3.944954128440367, "grad_norm": 3.7789382934570312, "learning_rate": 8.866805411030177e-05, "loss": 0.8489, "step": 1290 }, { "epoch": 3.9755351681957185, "grad_norm": 8.380213737487793, "learning_rate": 8.85639958376691e-05, 
"loss": 0.8593, "step": 1300 }, { "epoch": 4.00611620795107, "grad_norm": 4.148671627044678, "learning_rate": 8.845993756503642e-05, "loss": 0.9416, "step": 1310 }, { "epoch": 4.036697247706422, "grad_norm": 4.745501518249512, "learning_rate": 8.835587929240376e-05, "loss": 0.7869, "step": 1320 }, { "epoch": 4.0672782874617734, "grad_norm": 6.28198766708374, "learning_rate": 8.825182101977107e-05, "loss": 0.8129, "step": 1330 }, { "epoch": 4.097859327217125, "grad_norm": 7.487410545349121, "learning_rate": 8.814776274713841e-05, "loss": 0.8523, "step": 1340 }, { "epoch": 4.128440366972477, "grad_norm": 7.346683979034424, "learning_rate": 8.804370447450573e-05, "loss": 0.8112, "step": 1350 }, { "epoch": 4.159021406727828, "grad_norm": 5.072343349456787, "learning_rate": 8.793964620187304e-05, "loss": 0.9141, "step": 1360 }, { "epoch": 4.18960244648318, "grad_norm": 6.268205165863037, "learning_rate": 8.783558792924038e-05, "loss": 0.7041, "step": 1370 }, { "epoch": 4.220183486238533, "grad_norm": 6.782501697540283, "learning_rate": 8.773152965660771e-05, "loss": 0.7478, "step": 1380 }, { "epoch": 4.250764525993883, "grad_norm": 6.050166606903076, "learning_rate": 8.762747138397503e-05, "loss": 0.8574, "step": 1390 }, { "epoch": 4.281345565749236, "grad_norm": 5.380760669708252, "learning_rate": 8.752341311134236e-05, "loss": 0.7921, "step": 1400 }, { "epoch": 4.3119266055045875, "grad_norm": 4.861249923706055, "learning_rate": 8.741935483870968e-05, "loss": 0.9141, "step": 1410 }, { "epoch": 4.342507645259939, "grad_norm": 7.817598819732666, "learning_rate": 8.731529656607701e-05, "loss": 0.7957, "step": 1420 }, { "epoch": 4.373088685015291, "grad_norm": 4.377925395965576, "learning_rate": 8.721123829344433e-05, "loss": 0.7507, "step": 1430 }, { "epoch": 4.4036697247706424, "grad_norm": 5.141635417938232, "learning_rate": 8.710718002081166e-05, "loss": 0.7613, "step": 1440 }, { "epoch": 4.434250764525994, "grad_norm": 4.599380970001221, "learning_rate": 
8.700312174817898e-05, "loss": 0.8004, "step": 1450 }, { "epoch": 4.464831804281346, "grad_norm": 4.748518466949463, "learning_rate": 8.689906347554631e-05, "loss": 0.902, "step": 1460 }, { "epoch": 4.495412844036697, "grad_norm": 5.193148612976074, "learning_rate": 8.679500520291363e-05, "loss": 0.7295, "step": 1470 }, { "epoch": 4.525993883792049, "grad_norm": 8.4327392578125, "learning_rate": 8.669094693028096e-05, "loss": 0.7065, "step": 1480 }, { "epoch": 4.556574923547401, "grad_norm": 7.619236469268799, "learning_rate": 8.65868886576483e-05, "loss": 0.8132, "step": 1490 }, { "epoch": 4.587155963302752, "grad_norm": 5.090475559234619, "learning_rate": 8.648283038501561e-05, "loss": 0.8103, "step": 1500 }, { "epoch": 4.587155963302752, "eval_loss": 0.6886358857154846, "eval_runtime": 17.1364, "eval_samples_per_second": 16.981, "eval_steps_per_second": 8.52, "step": 1500 }, { "epoch": 4.617737003058104, "grad_norm": 4.3185133934021, "learning_rate": 8.637877211238295e-05, "loss": 0.8655, "step": 1510 }, { "epoch": 4.648318042813456, "grad_norm": 3.376910448074341, "learning_rate": 8.627471383975027e-05, "loss": 0.7238, "step": 1520 }, { "epoch": 4.678899082568807, "grad_norm": 3.3280067443847656, "learning_rate": 8.617065556711758e-05, "loss": 0.7003, "step": 1530 }, { "epoch": 4.709480122324159, "grad_norm": 9.140504837036133, "learning_rate": 8.606659729448492e-05, "loss": 0.7757, "step": 1540 }, { "epoch": 4.740061162079511, "grad_norm": 4.642270088195801, "learning_rate": 8.596253902185225e-05, "loss": 0.7487, "step": 1550 }, { "epoch": 4.770642201834862, "grad_norm": 4.702116966247559, "learning_rate": 8.585848074921957e-05, "loss": 0.8316, "step": 1560 }, { "epoch": 4.801223241590214, "grad_norm": 7.388566970825195, "learning_rate": 8.57544224765869e-05, "loss": 0.6916, "step": 1570 }, { "epoch": 4.8318042813455655, "grad_norm": 8.01981258392334, "learning_rate": 8.565036420395421e-05, "loss": 0.7335, "step": 1580 }, { "epoch": 4.862385321100917, 
"grad_norm": 16.20261573791504, "learning_rate": 8.554630593132155e-05, "loss": 0.75, "step": 1590 }, { "epoch": 4.892966360856269, "grad_norm": 10.875531196594238, "learning_rate": 8.544224765868887e-05, "loss": 0.7442, "step": 1600 }, { "epoch": 4.92354740061162, "grad_norm": 3.217120409011841, "learning_rate": 8.53381893860562e-05, "loss": 0.8253, "step": 1610 }, { "epoch": 4.954128440366972, "grad_norm": 5.443752765655518, "learning_rate": 8.523413111342352e-05, "loss": 0.7353, "step": 1620 }, { "epoch": 4.984709480122325, "grad_norm": 6.295218467712402, "learning_rate": 8.513007284079085e-05, "loss": 0.7714, "step": 1630 }, { "epoch": 5.015290519877676, "grad_norm": 4.707128524780273, "learning_rate": 8.502601456815817e-05, "loss": 0.7602, "step": 1640 }, { "epoch": 5.045871559633028, "grad_norm": 6.996250152587891, "learning_rate": 8.49219562955255e-05, "loss": 0.6049, "step": 1650 }, { "epoch": 5.07645259938838, "grad_norm": 5.075336933135986, "learning_rate": 8.481789802289282e-05, "loss": 0.6293, "step": 1660 }, { "epoch": 5.107033639143731, "grad_norm": 5.5144476890563965, "learning_rate": 8.471383975026015e-05, "loss": 0.6873, "step": 1670 }, { "epoch": 5.137614678899083, "grad_norm": 6.9491167068481445, "learning_rate": 8.460978147762747e-05, "loss": 0.6847, "step": 1680 }, { "epoch": 5.1681957186544345, "grad_norm": 4.417248249053955, "learning_rate": 8.45057232049948e-05, "loss": 0.7481, "step": 1690 }, { "epoch": 5.198776758409786, "grad_norm": 4.493064880371094, "learning_rate": 8.440166493236212e-05, "loss": 0.5935, "step": 1700 }, { "epoch": 5.229357798165138, "grad_norm": 5.201015472412109, "learning_rate": 8.429760665972946e-05, "loss": 0.6575, "step": 1710 }, { "epoch": 5.259938837920489, "grad_norm": 6.402657985687256, "learning_rate": 8.419354838709677e-05, "loss": 0.6748, "step": 1720 }, { "epoch": 5.290519877675841, "grad_norm": 12.927905082702637, "learning_rate": 8.408949011446411e-05, "loss": 0.7321, "step": 1730 }, { "epoch": 
5.321100917431193, "grad_norm": 3.7228381633758545, "learning_rate": 8.398543184183144e-05, "loss": 0.7307, "step": 1740 }, { "epoch": 5.351681957186544, "grad_norm": 5.624021530151367, "learning_rate": 8.388137356919875e-05, "loss": 0.5725, "step": 1750 }, { "epoch": 5.382262996941896, "grad_norm": 7.779863357543945, "learning_rate": 8.377731529656609e-05, "loss": 0.6315, "step": 1760 }, { "epoch": 5.412844036697248, "grad_norm": 10.124401092529297, "learning_rate": 8.367325702393341e-05, "loss": 0.6838, "step": 1770 }, { "epoch": 5.443425076452599, "grad_norm": 6.292506217956543, "learning_rate": 8.356919875130074e-05, "loss": 0.6162, "step": 1780 }, { "epoch": 5.474006116207951, "grad_norm": 5.369190692901611, "learning_rate": 8.346514047866806e-05, "loss": 0.6983, "step": 1790 }, { "epoch": 5.504587155963303, "grad_norm": 4.705302715301514, "learning_rate": 8.336108220603539e-05, "loss": 0.5728, "step": 1800 }, { "epoch": 5.535168195718654, "grad_norm": 4.299398422241211, "learning_rate": 8.325702393340271e-05, "loss": 0.626, "step": 1810 }, { "epoch": 5.565749235474006, "grad_norm": 6.148344993591309, "learning_rate": 8.315296566077004e-05, "loss": 0.6327, "step": 1820 }, { "epoch": 5.5963302752293576, "grad_norm": 5.602970600128174, "learning_rate": 8.304890738813736e-05, "loss": 0.6552, "step": 1830 }, { "epoch": 5.626911314984709, "grad_norm": 3.46807861328125, "learning_rate": 8.294484911550469e-05, "loss": 0.7196, "step": 1840 }, { "epoch": 5.657492354740061, "grad_norm": 3.249087333679199, "learning_rate": 8.284079084287201e-05, "loss": 0.6087, "step": 1850 }, { "epoch": 5.6880733944954125, "grad_norm": 7.348158836364746, "learning_rate": 8.273673257023934e-05, "loss": 0.5612, "step": 1860 }, { "epoch": 5.718654434250764, "grad_norm": 5.426358222961426, "learning_rate": 8.263267429760666e-05, "loss": 0.6494, "step": 1870 }, { "epoch": 5.749235474006117, "grad_norm": 7.073860168457031, "learning_rate": 8.2528616024974e-05, "loss": 0.6407, "step": 1880 }, 
{ "epoch": 5.779816513761467, "grad_norm": 4.780755519866943, "learning_rate": 8.242455775234131e-05, "loss": 0.6997, "step": 1890 }, { "epoch": 5.81039755351682, "grad_norm": 5.361114978790283, "learning_rate": 8.232049947970865e-05, "loss": 0.6212, "step": 1900 }, { "epoch": 5.840978593272172, "grad_norm": 5.250582695007324, "learning_rate": 8.221644120707596e-05, "loss": 0.6705, "step": 1910 }, { "epoch": 5.871559633027523, "grad_norm": 4.780821323394775, "learning_rate": 8.211238293444329e-05, "loss": 0.6421, "step": 1920 }, { "epoch": 5.902140672782875, "grad_norm": 6.975560188293457, "learning_rate": 8.200832466181062e-05, "loss": 0.7433, "step": 1930 }, { "epoch": 5.9327217125382266, "grad_norm": 3.3447937965393066, "learning_rate": 8.190426638917794e-05, "loss": 0.6109, "step": 1940 }, { "epoch": 5.963302752293578, "grad_norm": 9.269634246826172, "learning_rate": 8.180020811654527e-05, "loss": 0.5586, "step": 1950 }, { "epoch": 5.99388379204893, "grad_norm": 7.32548713684082, "learning_rate": 8.16961498439126e-05, "loss": 0.6575, "step": 1960 }, { "epoch": 6.0244648318042815, "grad_norm": 4.461253643035889, "learning_rate": 8.159209157127991e-05, "loss": 0.5831, "step": 1970 }, { "epoch": 6.055045871559633, "grad_norm": 4.997203350067139, "learning_rate": 8.148803329864725e-05, "loss": 0.5412, "step": 1980 }, { "epoch": 6.085626911314985, "grad_norm": 5.2825751304626465, "learning_rate": 8.138397502601457e-05, "loss": 0.5012, "step": 1990 }, { "epoch": 6.116207951070336, "grad_norm": 5.710301399230957, "learning_rate": 8.12799167533819e-05, "loss": 0.5548, "step": 2000 }, { "epoch": 6.116207951070336, "eval_loss": 0.5635126233100891, "eval_runtime": 17.154, "eval_samples_per_second": 16.964, "eval_steps_per_second": 8.511, "step": 2000 }, { "epoch": 6.146788990825688, "grad_norm": 6.4876179695129395, "learning_rate": 8.117585848074922e-05, "loss": 0.6589, "step": 2010 }, { "epoch": 6.17737003058104, "grad_norm": 3.520118474960327, "learning_rate": 
8.107180020811655e-05, "loss": 0.5431, "step": 2020 }, { "epoch": 6.207951070336391, "grad_norm": 4.1320648193359375, "learning_rate": 8.096774193548387e-05, "loss": 0.5069, "step": 2030 }, { "epoch": 6.238532110091743, "grad_norm": 6.541991710662842, "learning_rate": 8.08636836628512e-05, "loss": 0.5477, "step": 2040 }, { "epoch": 6.269113149847095, "grad_norm": 6.10244607925415, "learning_rate": 8.075962539021852e-05, "loss": 0.5639, "step": 2050 }, { "epoch": 6.299694189602446, "grad_norm": 6.1276469230651855, "learning_rate": 8.065556711758585e-05, "loss": 0.5773, "step": 2060 }, { "epoch": 6.330275229357798, "grad_norm": 3.827613353729248, "learning_rate": 8.055150884495319e-05, "loss": 0.5283, "step": 2070 }, { "epoch": 6.36085626911315, "grad_norm": 5.0163421630859375, "learning_rate": 8.04474505723205e-05, "loss": 0.5174, "step": 2080 }, { "epoch": 6.391437308868501, "grad_norm": 5.943496227264404, "learning_rate": 8.034339229968782e-05, "loss": 0.5364, "step": 2090 }, { "epoch": 6.422018348623853, "grad_norm": 7.260120391845703, "learning_rate": 8.023933402705516e-05, "loss": 0.6289, "step": 2100 }, { "epoch": 6.4525993883792045, "grad_norm": 12.01782512664795, "learning_rate": 8.013527575442247e-05, "loss": 0.617, "step": 2110 }, { "epoch": 6.483180428134556, "grad_norm": 6.774069309234619, "learning_rate": 8.003121748178981e-05, "loss": 0.579, "step": 2120 }, { "epoch": 6.513761467889909, "grad_norm": 7.0621442794799805, "learning_rate": 7.992715920915714e-05, "loss": 0.4988, "step": 2130 }, { "epoch": 6.5443425076452595, "grad_norm": 9.868996620178223, "learning_rate": 7.982310093652445e-05, "loss": 0.53, "step": 2140 }, { "epoch": 6.574923547400612, "grad_norm": 20.389284133911133, "learning_rate": 7.971904266389179e-05, "loss": 0.5561, "step": 2150 }, { "epoch": 6.605504587155964, "grad_norm": 7.879746913909912, "learning_rate": 7.961498439125911e-05, "loss": 0.582, "step": 2160 }, { "epoch": 6.636085626911315, "grad_norm": 3.271695137023926, 
"learning_rate": 7.951092611862644e-05, "loss": 0.564, "step": 2170 }, { "epoch": 6.666666666666667, "grad_norm": 4.457621097564697, "learning_rate": 7.940686784599376e-05, "loss": 0.4886, "step": 2180 }, { "epoch": 6.697247706422019, "grad_norm": 4.690130710601807, "learning_rate": 7.930280957336107e-05, "loss": 0.4988, "step": 2190 }, { "epoch": 6.72782874617737, "grad_norm": 17.970327377319336, "learning_rate": 7.919875130072841e-05, "loss": 0.5308, "step": 2200 }, { "epoch": 6.758409785932722, "grad_norm": 8.012701034545898, "learning_rate": 7.909469302809574e-05, "loss": 0.6291, "step": 2210 }, { "epoch": 6.7889908256880735, "grad_norm": 4.772938251495361, "learning_rate": 7.899063475546306e-05, "loss": 0.58, "step": 2220 }, { "epoch": 6.819571865443425, "grad_norm": 3.9855034351348877, "learning_rate": 7.888657648283039e-05, "loss": 0.4787, "step": 2230 }, { "epoch": 6.850152905198777, "grad_norm": 10.868754386901855, "learning_rate": 7.878251821019771e-05, "loss": 0.5189, "step": 2240 }, { "epoch": 6.8807339449541285, "grad_norm": 4.059515476226807, "learning_rate": 7.867845993756504e-05, "loss": 0.5256, "step": 2250 }, { "epoch": 6.91131498470948, "grad_norm": 4.996955871582031, "learning_rate": 7.857440166493236e-05, "loss": 0.6207, "step": 2260 }, { "epoch": 6.941896024464832, "grad_norm": 3.1101465225219727, "learning_rate": 7.847034339229969e-05, "loss": 0.4958, "step": 2270 }, { "epoch": 6.972477064220183, "grad_norm": 8.754892349243164, "learning_rate": 7.836628511966701e-05, "loss": 0.5257, "step": 2280 }, { "epoch": 7.003058103975535, "grad_norm": 3.744485378265381, "learning_rate": 7.826222684703435e-05, "loss": 0.6034, "step": 2290 }, { "epoch": 7.033639143730887, "grad_norm": 4.851626396179199, "learning_rate": 7.815816857440166e-05, "loss": 0.3741, "step": 2300 }, { "epoch": 7.064220183486238, "grad_norm": 7.789270877838135, "learning_rate": 7.805411030176899e-05, "loss": 0.4436, "step": 2310 }, { "epoch": 7.09480122324159, "grad_norm": 
7.35498571395874, "learning_rate": 7.795005202913633e-05, "loss": 0.4353, "step": 2320 }, { "epoch": 7.125382262996942, "grad_norm": 6.431797981262207, "learning_rate": 7.784599375650364e-05, "loss": 0.4976, "step": 2330 }, { "epoch": 7.155963302752293, "grad_norm": 4.586643218994141, "learning_rate": 7.774193548387098e-05, "loss": 0.6474, "step": 2340 }, { "epoch": 7.186544342507645, "grad_norm": 5.96403169631958, "learning_rate": 7.76378772112383e-05, "loss": 0.4536, "step": 2350 }, { "epoch": 7.217125382262997, "grad_norm": 4.443822860717773, "learning_rate": 7.753381893860561e-05, "loss": 0.4222, "step": 2360 }, { "epoch": 7.247706422018348, "grad_norm": 5.37199592590332, "learning_rate": 7.742976066597295e-05, "loss": 0.4821, "step": 2370 }, { "epoch": 7.2782874617737, "grad_norm": 5.589402675628662, "learning_rate": 7.732570239334028e-05, "loss": 0.4723, "step": 2380 }, { "epoch": 7.3088685015290515, "grad_norm": 2.493727207183838, "learning_rate": 7.72216441207076e-05, "loss": 0.5624, "step": 2390 }, { "epoch": 7.339449541284404, "grad_norm": 4.967494010925293, "learning_rate": 7.711758584807493e-05, "loss": 0.4189, "step": 2400 }, { "epoch": 7.370030581039756, "grad_norm": 4.298621654510498, "learning_rate": 7.701352757544225e-05, "loss": 0.4328, "step": 2410 }, { "epoch": 7.400611620795107, "grad_norm": 6.013545989990234, "learning_rate": 7.690946930280958e-05, "loss": 0.4803, "step": 2420 }, { "epoch": 7.431192660550459, "grad_norm": 5.232725620269775, "learning_rate": 7.68054110301769e-05, "loss": 0.4758, "step": 2430 }, { "epoch": 7.461773700305811, "grad_norm": 5.801144123077393, "learning_rate": 7.670135275754423e-05, "loss": 0.495, "step": 2440 }, { "epoch": 7.492354740061162, "grad_norm": 2.754412889480591, "learning_rate": 7.659729448491155e-05, "loss": 0.4522, "step": 2450 }, { "epoch": 7.522935779816514, "grad_norm": 6.403210163116455, "learning_rate": 7.649323621227889e-05, "loss": 0.4281, "step": 2460 }, { "epoch": 7.553516819571866, 
"grad_norm": 5.0712056159973145, "learning_rate": 7.63891779396462e-05, "loss": 0.4274, "step": 2470 }, { "epoch": 7.584097859327217, "grad_norm": 6.147752285003662, "learning_rate": 7.628511966701353e-05, "loss": 0.4729, "step": 2480 }, { "epoch": 7.614678899082569, "grad_norm": 3.0379574298858643, "learning_rate": 7.618106139438087e-05, "loss": 0.5294, "step": 2490 }, { "epoch": 7.6452599388379205, "grad_norm": 3.286498785018921, "learning_rate": 7.607700312174818e-05, "loss": 0.4543, "step": 2500 }, { "epoch": 7.6452599388379205, "eval_loss": 0.48799338936805725, "eval_runtime": 16.6393, "eval_samples_per_second": 17.489, "eval_steps_per_second": 8.774, "step": 2500 }, { "epoch": 7.675840978593272, "grad_norm": 9.105060577392578, "learning_rate": 7.597294484911552e-05, "loss": 0.428, "step": 2510 }, { "epoch": 7.706422018348624, "grad_norm": 4.746338844299316, "learning_rate": 7.586888657648283e-05, "loss": 0.4439, "step": 2520 }, { "epoch": 7.7370030581039755, "grad_norm": 4.770055770874023, "learning_rate": 7.576482830385015e-05, "loss": 0.4902, "step": 2530 }, { "epoch": 7.767584097859327, "grad_norm": 5.230956554412842, "learning_rate": 7.566077003121749e-05, "loss": 0.4818, "step": 2540 }, { "epoch": 7.798165137614679, "grad_norm": 5.195344924926758, "learning_rate": 7.55567117585848e-05, "loss": 0.4143, "step": 2550 }, { "epoch": 7.82874617737003, "grad_norm": 3.9105961322784424, "learning_rate": 7.545265348595214e-05, "loss": 0.4193, "step": 2560 }, { "epoch": 7.859327217125382, "grad_norm": 9.038330078125, "learning_rate": 7.534859521331947e-05, "loss": 0.4578, "step": 2570 }, { "epoch": 7.889908256880734, "grad_norm": 8.360684394836426, "learning_rate": 7.524453694068678e-05, "loss": 0.4504, "step": 2580 }, { "epoch": 7.920489296636085, "grad_norm": 4.143953800201416, "learning_rate": 7.514047866805412e-05, "loss": 0.5283, "step": 2590 }, { "epoch": 7.951070336391437, "grad_norm": 3.5100886821746826, "learning_rate": 7.503642039542144e-05, "loss": 0.45, 
"step": 2600 }, { "epoch": 7.981651376146789, "grad_norm": 4.23678731918335, "learning_rate": 7.493236212278877e-05, "loss": 0.4842, "step": 2610 }, { "epoch": 8.01223241590214, "grad_norm": 3.595038414001465, "learning_rate": 7.482830385015609e-05, "loss": 0.4797, "step": 2620 }, { "epoch": 8.042813455657493, "grad_norm": 4.2236504554748535, "learning_rate": 7.472424557752342e-05, "loss": 0.3455, "step": 2630 }, { "epoch": 8.073394495412844, "grad_norm": 3.604245901107788, "learning_rate": 7.462018730489074e-05, "loss": 0.3759, "step": 2640 }, { "epoch": 8.103975535168196, "grad_norm": 5.744846820831299, "learning_rate": 7.451612903225807e-05, "loss": 0.4431, "step": 2650 }, { "epoch": 8.134556574923547, "grad_norm": 6.683863639831543, "learning_rate": 7.441207075962539e-05, "loss": 0.5102, "step": 2660 }, { "epoch": 8.1651376146789, "grad_norm": 3.7429959774017334, "learning_rate": 7.430801248699272e-05, "loss": 0.5118, "step": 2670 }, { "epoch": 8.19571865443425, "grad_norm": 4.308508396148682, "learning_rate": 7.420395421436005e-05, "loss": 0.3776, "step": 2680 }, { "epoch": 8.226299694189603, "grad_norm": 4.801083564758301, "learning_rate": 7.409989594172737e-05, "loss": 0.378, "step": 2690 }, { "epoch": 8.256880733944953, "grad_norm": 5.005183219909668, "learning_rate": 7.399583766909469e-05, "loss": 0.4783, "step": 2700 }, { "epoch": 8.287461773700306, "grad_norm": 6.771042346954346, "learning_rate": 7.389177939646203e-05, "loss": 0.4222, "step": 2710 }, { "epoch": 8.318042813455657, "grad_norm": 3.7550055980682373, "learning_rate": 7.378772112382934e-05, "loss": 0.4088, "step": 2720 }, { "epoch": 8.34862385321101, "grad_norm": 3.91751766204834, "learning_rate": 7.368366285119668e-05, "loss": 0.3798, "step": 2730 }, { "epoch": 8.37920489296636, "grad_norm": 5.044673919677734, "learning_rate": 7.3579604578564e-05, "loss": 0.3783, "step": 2740 }, { "epoch": 8.409785932721713, "grad_norm": 10.467144012451172, "learning_rate": 7.347554630593132e-05, "loss": 
0.4106, "step": 2750 }, { "epoch": 8.440366972477065, "grad_norm": 8.879876136779785, "learning_rate": 7.337148803329866e-05, "loss": 0.4369, "step": 2760 }, { "epoch": 8.470948012232416, "grad_norm": 2.9480531215667725, "learning_rate": 7.326742976066597e-05, "loss": 0.4218, "step": 2770 }, { "epoch": 8.501529051987767, "grad_norm": 5.187920093536377, "learning_rate": 7.31633714880333e-05, "loss": 0.3972, "step": 2780 }, { "epoch": 8.53211009174312, "grad_norm": 9.206551551818848, "learning_rate": 7.305931321540063e-05, "loss": 0.4019, "step": 2790 }, { "epoch": 8.562691131498472, "grad_norm": 15.84662914276123, "learning_rate": 7.295525494276796e-05, "loss": 0.3883, "step": 2800 }, { "epoch": 8.593272171253822, "grad_norm": 8.379307746887207, "learning_rate": 7.285119667013528e-05, "loss": 0.4943, "step": 2810 }, { "epoch": 8.623853211009175, "grad_norm": 4.381606578826904, "learning_rate": 7.27471383975026e-05, "loss": 0.4541, "step": 2820 }, { "epoch": 8.654434250764526, "grad_norm": 6.217578411102295, "learning_rate": 7.264308012486993e-05, "loss": 0.4034, "step": 2830 }, { "epoch": 8.685015290519878, "grad_norm": 4.155258655548096, "learning_rate": 7.253902185223726e-05, "loss": 0.3874, "step": 2840 }, { "epoch": 8.715596330275229, "grad_norm": 5.202000617980957, "learning_rate": 7.243496357960458e-05, "loss": 0.4272, "step": 2850 }, { "epoch": 8.746177370030582, "grad_norm": 7.137246131896973, "learning_rate": 7.23309053069719e-05, "loss": 0.4251, "step": 2860 }, { "epoch": 8.776758409785932, "grad_norm": 3.7905514240264893, "learning_rate": 7.222684703433923e-05, "loss": 0.5237, "step": 2870 }, { "epoch": 8.807339449541285, "grad_norm": 4.002864837646484, "learning_rate": 7.212278876170656e-05, "loss": 0.3235, "step": 2880 }, { "epoch": 8.837920489296636, "grad_norm": 9.481928825378418, "learning_rate": 7.201873048907388e-05, "loss": 0.4215, "step": 2890 }, { "epoch": 8.868501529051988, "grad_norm": 3.8038759231567383, "learning_rate": 
7.191467221644122e-05, "loss": 0.3849, "step": 2900 }, { "epoch": 8.899082568807339, "grad_norm": 15.837183952331543, "learning_rate": 7.181061394380853e-05, "loss": 0.4646, "step": 2910 }, { "epoch": 8.929663608562691, "grad_norm": 5.814904689788818, "learning_rate": 7.170655567117586e-05, "loss": 0.4505, "step": 2920 }, { "epoch": 8.960244648318042, "grad_norm": 5.111677169799805, "learning_rate": 7.16024973985432e-05, "loss": 0.3565, "step": 2930 }, { "epoch": 8.990825688073395, "grad_norm": 20.767641067504883, "learning_rate": 7.14984391259105e-05, "loss": 0.4058, "step": 2940 }, { "epoch": 9.021406727828746, "grad_norm": 5.076265811920166, "learning_rate": 7.139438085327784e-05, "loss": 0.3846, "step": 2950 }, { "epoch": 9.051987767584098, "grad_norm": 3.0534474849700928, "learning_rate": 7.129032258064517e-05, "loss": 0.2875, "step": 2960 }, { "epoch": 9.082568807339449, "grad_norm": 3.4252254962921143, "learning_rate": 7.11862643080125e-05, "loss": 0.3941, "step": 2970 }, { "epoch": 9.113149847094801, "grad_norm": 7.061174392700195, "learning_rate": 7.108220603537982e-05, "loss": 0.439, "step": 2980 }, { "epoch": 9.143730886850152, "grad_norm": 5.309439659118652, "learning_rate": 7.097814776274714e-05, "loss": 0.3839, "step": 2990 }, { "epoch": 9.174311926605505, "grad_norm": 2.809702157974243, "learning_rate": 7.087408949011447e-05, "loss": 0.3664, "step": 3000 }, { "epoch": 9.174311926605505, "eval_loss": 0.48507484793663025, "eval_runtime": 17.7814, "eval_samples_per_second": 16.365, "eval_steps_per_second": 8.211, "step": 3000 }, { "epoch": 9.204892966360855, "grad_norm": 4.209444522857666, "learning_rate": 7.07700312174818e-05, "loss": 0.3277, "step": 3010 }, { "epoch": 9.235474006116208, "grad_norm": 4.111099720001221, "learning_rate": 7.066597294484912e-05, "loss": 0.3804, "step": 3020 }, { "epoch": 9.26605504587156, "grad_norm": 7.38637638092041, "learning_rate": 7.056191467221644e-05, "loss": 0.3965, "step": 3030 }, { "epoch": 9.296636085626911, 
"grad_norm": 4.757504463195801, "learning_rate": 7.045785639958377e-05, "loss": 0.3981, "step": 3040 }, { "epoch": 9.327217125382264, "grad_norm": 3.749709367752075, "learning_rate": 7.03537981269511e-05, "loss": 0.405, "step": 3050 }, { "epoch": 9.357798165137615, "grad_norm": 3.167997121810913, "learning_rate": 7.024973985431842e-05, "loss": 0.292, "step": 3060 }, { "epoch": 9.388379204892967, "grad_norm": 4.194080352783203, "learning_rate": 7.014568158168576e-05, "loss": 0.3458, "step": 3070 }, { "epoch": 9.418960244648318, "grad_norm": 4.001060962677002, "learning_rate": 7.004162330905307e-05, "loss": 0.3434, "step": 3080 }, { "epoch": 9.44954128440367, "grad_norm": 7.514442443847656, "learning_rate": 6.99375650364204e-05, "loss": 0.3827, "step": 3090 }, { "epoch": 9.480122324159021, "grad_norm": 3.734842300415039, "learning_rate": 6.983350676378772e-05, "loss": 0.3885, "step": 3100 }, { "epoch": 9.510703363914374, "grad_norm": 5.834413051605225, "learning_rate": 6.972944849115504e-05, "loss": 0.3644, "step": 3110 }, { "epoch": 9.541284403669724, "grad_norm": 13.83302116394043, "learning_rate": 6.962539021852238e-05, "loss": 0.3304, "step": 3120 }, { "epoch": 9.571865443425077, "grad_norm": 4.693499565124512, "learning_rate": 6.95213319458897e-05, "loss": 0.3597, "step": 3130 }, { "epoch": 9.602446483180428, "grad_norm": 8.545695304870605, "learning_rate": 6.941727367325703e-05, "loss": 0.4419, "step": 3140 }, { "epoch": 9.63302752293578, "grad_norm": 3.0196399688720703, "learning_rate": 6.931321540062436e-05, "loss": 0.3519, "step": 3150 }, { "epoch": 9.663608562691131, "grad_norm": 4.620004177093506, "learning_rate": 6.920915712799167e-05, "loss": 0.3464, "step": 3160 }, { "epoch": 9.694189602446484, "grad_norm": 3.939121961593628, "learning_rate": 6.910509885535901e-05, "loss": 0.3331, "step": 3170 }, { "epoch": 9.724770642201834, "grad_norm": 6.618518352508545, "learning_rate": 6.900104058272633e-05, "loss": 0.3621, "step": 3180 }, { "epoch": 
9.755351681957187, "grad_norm": 6.519993782043457, "learning_rate": 6.889698231009366e-05, "loss": 0.3965, "step": 3190 }, { "epoch": 9.785932721712538, "grad_norm": 4.361577033996582, "learning_rate": 6.879292403746098e-05, "loss": 0.4112, "step": 3200 }, { "epoch": 9.81651376146789, "grad_norm": 4.898428916931152, "learning_rate": 6.868886576482831e-05, "loss": 0.3391, "step": 3210 }, { "epoch": 9.84709480122324, "grad_norm": 7.772880554199219, "learning_rate": 6.858480749219563e-05, "loss": 0.35, "step": 3220 }, { "epoch": 9.877675840978593, "grad_norm": 7.611955642700195, "learning_rate": 6.848074921956296e-05, "loss": 0.4289, "step": 3230 }, { "epoch": 9.908256880733944, "grad_norm": 4.031200885772705, "learning_rate": 6.837669094693028e-05, "loss": 0.4108, "step": 3240 }, { "epoch": 9.938837920489297, "grad_norm": 3.0720908641815186, "learning_rate": 6.827263267429761e-05, "loss": 0.371, "step": 3250 }, { "epoch": 9.96941896024465, "grad_norm": 9.898725509643555, "learning_rate": 6.816857440166493e-05, "loss": 0.3057, "step": 3260 }, { "epoch": 10.0, "grad_norm": 10.916779518127441, "learning_rate": 6.806451612903226e-05, "loss": 0.3364, "step": 3270 }, { "epoch": 10.030581039755353, "grad_norm": 7.887121200561523, "learning_rate": 6.796045785639958e-05, "loss": 0.257, "step": 3280 }, { "epoch": 10.061162079510703, "grad_norm": 5.914815425872803, "learning_rate": 6.785639958376692e-05, "loss": 0.3088, "step": 3290 }, { "epoch": 10.091743119266056, "grad_norm": 4.333220481872559, "learning_rate": 6.775234131113423e-05, "loss": 0.3107, "step": 3300 }, { "epoch": 10.122324159021407, "grad_norm": 3.6494174003601074, "learning_rate": 6.764828303850156e-05, "loss": 0.3632, "step": 3310 }, { "epoch": 10.15290519877676, "grad_norm": 10.518165588378906, "learning_rate": 6.75442247658689e-05, "loss": 0.4544, "step": 3320 }, { "epoch": 10.18348623853211, "grad_norm": 8.803187370300293, "learning_rate": 6.744016649323621e-05, "loss": 0.3239, "step": 3330 }, { "epoch": 
10.214067278287462, "grad_norm": 4.997848033905029, "learning_rate": 6.733610822060355e-05, "loss": 0.2882, "step": 3340 }, { "epoch": 10.244648318042813, "grad_norm": 4.853428363800049, "learning_rate": 6.723204994797087e-05, "loss": 0.3277, "step": 3350 }, { "epoch": 10.275229357798166, "grad_norm": 8.042211532592773, "learning_rate": 6.71279916753382e-05, "loss": 0.3872, "step": 3360 }, { "epoch": 10.305810397553516, "grad_norm": 6.132227897644043, "learning_rate": 6.702393340270552e-05, "loss": 0.3562, "step": 3370 }, { "epoch": 10.336391437308869, "grad_norm": 3.8088772296905518, "learning_rate": 6.691987513007283e-05, "loss": 0.3054, "step": 3380 }, { "epoch": 10.36697247706422, "grad_norm": 6.178166389465332, "learning_rate": 6.681581685744017e-05, "loss": 0.3423, "step": 3390 }, { "epoch": 10.397553516819572, "grad_norm": 6.065817832946777, "learning_rate": 6.67117585848075e-05, "loss": 0.3252, "step": 3400 }, { "epoch": 10.428134556574923, "grad_norm": 5.649379730224609, "learning_rate": 6.660770031217482e-05, "loss": 0.3581, "step": 3410 }, { "epoch": 10.458715596330276, "grad_norm": 6.1458563804626465, "learning_rate": 6.650364203954215e-05, "loss": 0.453, "step": 3420 }, { "epoch": 10.489296636085626, "grad_norm": 3.4970128536224365, "learning_rate": 6.639958376690947e-05, "loss": 0.3148, "step": 3430 }, { "epoch": 10.519877675840979, "grad_norm": 2.8173370361328125, "learning_rate": 6.62955254942768e-05, "loss": 0.2592, "step": 3440 }, { "epoch": 10.55045871559633, "grad_norm": 8.554411888122559, "learning_rate": 6.619146722164412e-05, "loss": 0.3032, "step": 3450 }, { "epoch": 10.581039755351682, "grad_norm": 7.391510009765625, "learning_rate": 6.608740894901145e-05, "loss": 0.3607, "step": 3460 }, { "epoch": 10.611620795107033, "grad_norm": 7.969365119934082, "learning_rate": 6.598335067637877e-05, "loss": 0.3567, "step": 3470 }, { "epoch": 10.642201834862385, "grad_norm": 4.969130039215088, "learning_rate": 6.58792924037461e-05, "loss": 0.3078, 
"step": 3480 }, { "epoch": 10.672782874617736, "grad_norm": 4.509185314178467, "learning_rate": 6.577523413111342e-05, "loss": 0.3299, "step": 3490 }, { "epoch": 10.703363914373089, "grad_norm": 4.516345024108887, "learning_rate": 6.567117585848075e-05, "loss": 0.3525, "step": 3500 }, { "epoch": 10.703363914373089, "eval_loss": 0.45856353640556335, "eval_runtime": 17.4638, "eval_samples_per_second": 16.663, "eval_steps_per_second": 8.36, "step": 3500 }, { "epoch": 10.73394495412844, "grad_norm": 6.109631538391113, "learning_rate": 6.556711758584809e-05, "loss": 0.35, "step": 3510 }, { "epoch": 10.764525993883792, "grad_norm": 5.233398914337158, "learning_rate": 6.54630593132154e-05, "loss": 0.3717, "step": 3520 }, { "epoch": 10.795107033639145, "grad_norm": 3.2442190647125244, "learning_rate": 6.535900104058274e-05, "loss": 0.2845, "step": 3530 }, { "epoch": 10.825688073394495, "grad_norm": 4.037608623504639, "learning_rate": 6.525494276795006e-05, "loss": 0.2987, "step": 3540 }, { "epoch": 10.856269113149848, "grad_norm": 3.441417932510376, "learning_rate": 6.515088449531737e-05, "loss": 0.34, "step": 3550 }, { "epoch": 10.886850152905199, "grad_norm": 7.213795185089111, "learning_rate": 6.504682622268471e-05, "loss": 0.3387, "step": 3560 }, { "epoch": 10.917431192660551, "grad_norm": 5.970945358276367, "learning_rate": 6.494276795005204e-05, "loss": 0.4408, "step": 3570 }, { "epoch": 10.948012232415902, "grad_norm": 6.310761451721191, "learning_rate": 6.483870967741936e-05, "loss": 0.2882, "step": 3580 }, { "epoch": 10.978593272171254, "grad_norm": 12.499755859375, "learning_rate": 6.473465140478669e-05, "loss": 0.2837, "step": 3590 }, { "epoch": 11.009174311926605, "grad_norm": 4.7203369140625, "learning_rate": 6.463059313215401e-05, "loss": 0.3552, "step": 3600 }, { "epoch": 11.039755351681958, "grad_norm": 5.568735599517822, "learning_rate": 6.452653485952134e-05, "loss": 0.2643, "step": 3610 }, { "epoch": 11.070336391437309, "grad_norm": 5.771465301513672, 
"learning_rate": 6.442247658688866e-05, "loss": 0.2558, "step": 3620 }, { "epoch": 11.100917431192661, "grad_norm": 3.8241806030273438, "learning_rate": 6.431841831425599e-05, "loss": 0.2871, "step": 3630 }, { "epoch": 11.131498470948012, "grad_norm": 4.5878777503967285, "learning_rate": 6.421436004162331e-05, "loss": 0.3779, "step": 3640 }, { "epoch": 11.162079510703364, "grad_norm": 4.807687759399414, "learning_rate": 6.411030176899064e-05, "loss": 0.3694, "step": 3650 }, { "epoch": 11.192660550458715, "grad_norm": 4.102357387542725, "learning_rate": 6.400624349635796e-05, "loss": 0.2575, "step": 3660 }, { "epoch": 11.223241590214068, "grad_norm": 4.665652275085449, "learning_rate": 6.390218522372529e-05, "loss": 0.3022, "step": 3670 }, { "epoch": 11.253822629969418, "grad_norm": 4.404171943664551, "learning_rate": 6.379812695109262e-05, "loss": 0.2951, "step": 3680 }, { "epoch": 11.284403669724771, "grad_norm": 6.702642917633057, "learning_rate": 6.369406867845994e-05, "loss": 0.2925, "step": 3690 }, { "epoch": 11.314984709480122, "grad_norm": 3.5163631439208984, "learning_rate": 6.359001040582727e-05, "loss": 0.3496, "step": 3700 }, { "epoch": 11.345565749235474, "grad_norm": 4.392800331115723, "learning_rate": 6.348595213319459e-05, "loss": 0.2605, "step": 3710 }, { "epoch": 11.376146788990825, "grad_norm": 5.423502445220947, "learning_rate": 6.338189386056191e-05, "loss": 0.2948, "step": 3720 }, { "epoch": 11.406727828746178, "grad_norm": 4.449337959289551, "learning_rate": 6.327783558792925e-05, "loss": 0.2971, "step": 3730 }, { "epoch": 11.437308868501528, "grad_norm": 6.444003582000732, "learning_rate": 6.317377731529656e-05, "loss": 0.3255, "step": 3740 }, { "epoch": 11.46788990825688, "grad_norm": 2.682093858718872, "learning_rate": 6.30697190426639e-05, "loss": 0.3969, "step": 3750 }, { "epoch": 11.498470948012232, "grad_norm": 4.033663272857666, "learning_rate": 6.296566077003122e-05, "loss": 0.2669, "step": 3760 }, { "epoch": 11.529051987767584, 
"grad_norm": 3.125478506088257, "learning_rate": 6.286160249739854e-05, "loss": 0.3093, "step": 3770 }, { "epoch": 11.559633027522935, "grad_norm": 5.048392295837402, "learning_rate": 6.275754422476587e-05, "loss": 0.2829, "step": 3780 }, { "epoch": 11.590214067278287, "grad_norm": 3.8091633319854736, "learning_rate": 6.26534859521332e-05, "loss": 0.3207, "step": 3790 }, { "epoch": 11.62079510703364, "grad_norm": 5.138482093811035, "learning_rate": 6.254942767950052e-05, "loss": 0.3514, "step": 3800 }, { "epoch": 11.65137614678899, "grad_norm": 3.7958028316497803, "learning_rate": 6.244536940686785e-05, "loss": 0.2416, "step": 3810 }, { "epoch": 11.681957186544343, "grad_norm": 5.143332481384277, "learning_rate": 6.234131113423518e-05, "loss": 0.2647, "step": 3820 }, { "epoch": 11.712538226299694, "grad_norm": 6.322818756103516, "learning_rate": 6.22372528616025e-05, "loss": 0.323, "step": 3830 }, { "epoch": 11.743119266055047, "grad_norm": 5.7363996505737305, "learning_rate": 6.213319458896983e-05, "loss": 0.3049, "step": 3840 }, { "epoch": 11.773700305810397, "grad_norm": 4.893774032592773, "learning_rate": 6.202913631633715e-05, "loss": 0.3502, "step": 3850 }, { "epoch": 11.80428134556575, "grad_norm": 3.848017930984497, "learning_rate": 6.192507804370448e-05, "loss": 0.2771, "step": 3860 }, { "epoch": 11.8348623853211, "grad_norm": 5.943398952484131, "learning_rate": 6.182101977107181e-05, "loss": 0.301, "step": 3870 }, { "epoch": 11.865443425076453, "grad_norm": 3.520704984664917, "learning_rate": 6.171696149843913e-05, "loss": 0.2926, "step": 3880 }, { "epoch": 11.896024464831804, "grad_norm": 4.850738048553467, "learning_rate": 6.161290322580645e-05, "loss": 0.2805, "step": 3890 }, { "epoch": 11.926605504587156, "grad_norm": 2.6821794509887695, "learning_rate": 6.150884495317379e-05, "loss": 0.3268, "step": 3900 }, { "epoch": 11.957186544342507, "grad_norm": 4.901425838470459, "learning_rate": 6.14047866805411e-05, "loss": 0.2954, "step": 3910 }, { "epoch": 
11.98776758409786, "grad_norm": 4.856095314025879, "learning_rate": 6.130072840790844e-05, "loss": 0.3739, "step": 3920 }, { "epoch": 12.01834862385321, "grad_norm": 5.4404616355896, "learning_rate": 6.119667013527576e-05, "loss": 0.28, "step": 3930 }, { "epoch": 12.048929663608563, "grad_norm": 4.431248188018799, "learning_rate": 6.109261186264308e-05, "loss": 0.2479, "step": 3940 }, { "epoch": 12.079510703363914, "grad_norm": 7.073538780212402, "learning_rate": 6.0988553590010414e-05, "loss": 0.266, "step": 3950 }, { "epoch": 12.110091743119266, "grad_norm": 6.206909656524658, "learning_rate": 6.088449531737773e-05, "loss": 0.3466, "step": 3960 }, { "epoch": 12.140672782874617, "grad_norm": 6.207326412200928, "learning_rate": 6.0780437044745064e-05, "loss": 0.3262, "step": 3970 }, { "epoch": 12.17125382262997, "grad_norm": 3.017399787902832, "learning_rate": 6.067637877211239e-05, "loss": 0.2778, "step": 3980 }, { "epoch": 12.20183486238532, "grad_norm": 3.116441488265991, "learning_rate": 6.057232049947971e-05, "loss": 0.2127, "step": 3990 }, { "epoch": 12.232415902140673, "grad_norm": 5.957989692687988, "learning_rate": 6.046826222684704e-05, "loss": 0.2901, "step": 4000 }, { "epoch": 12.232415902140673, "eval_loss": 0.4600982069969177, "eval_runtime": 16.8374, "eval_samples_per_second": 17.283, "eval_steps_per_second": 8.671, "step": 4000 }, { "epoch": 12.262996941896024, "grad_norm": 3.1144094467163086, "learning_rate": 6.036420395421436e-05, "loss": 0.2878, "step": 4010 }, { "epoch": 12.293577981651376, "grad_norm": 5.286678314208984, "learning_rate": 6.026014568158169e-05, "loss": 0.298, "step": 4020 }, { "epoch": 12.324159021406729, "grad_norm": 3.7269203662872314, "learning_rate": 6.0156087408949014e-05, "loss": 0.2888, "step": 4030 }, { "epoch": 12.35474006116208, "grad_norm": 3.3287158012390137, "learning_rate": 6.005202913631633e-05, "loss": 0.2063, "step": 4040 }, { "epoch": 12.385321100917432, "grad_norm": 4.839067459106445, "learning_rate": 
5.9947970863683664e-05, "loss": 0.2779, "step": 4050 }, { "epoch": 12.415902140672783, "grad_norm": 5.016460418701172, "learning_rate": 5.984391259105099e-05, "loss": 0.2527, "step": 4060 }, { "epoch": 12.446483180428135, "grad_norm": 5.951940536499023, "learning_rate": 5.973985431841832e-05, "loss": 0.3517, "step": 4070 }, { "epoch": 12.477064220183486, "grad_norm": 3.960165023803711, "learning_rate": 5.963579604578564e-05, "loss": 0.317, "step": 4080 }, { "epoch": 12.507645259938839, "grad_norm": 2.830717086791992, "learning_rate": 5.953173777315297e-05, "loss": 0.3019, "step": 4090 }, { "epoch": 12.53822629969419, "grad_norm": 2.6632418632507324, "learning_rate": 5.9427679500520296e-05, "loss": 0.2186, "step": 4100 }, { "epoch": 12.568807339449542, "grad_norm": 5.966037273406982, "learning_rate": 5.9323621227887614e-05, "loss": 0.2752, "step": 4110 }, { "epoch": 12.599388379204893, "grad_norm": 7.089763164520264, "learning_rate": 5.9219562955254946e-05, "loss": 0.3096, "step": 4120 }, { "epoch": 12.629969418960245, "grad_norm": 2.9808754920959473, "learning_rate": 5.911550468262227e-05, "loss": 0.262, "step": 4130 }, { "epoch": 12.660550458715596, "grad_norm": 3.30267596244812, "learning_rate": 5.90114464099896e-05, "loss": 0.2317, "step": 4140 }, { "epoch": 12.691131498470948, "grad_norm": 3.521730422973633, "learning_rate": 5.890738813735692e-05, "loss": 0.2719, "step": 4150 }, { "epoch": 12.7217125382263, "grad_norm": 4.6262078285217285, "learning_rate": 5.8803329864724246e-05, "loss": 0.2598, "step": 4160 }, { "epoch": 12.752293577981652, "grad_norm": 8.410408973693848, "learning_rate": 5.869927159209158e-05, "loss": 0.3356, "step": 4170 }, { "epoch": 12.782874617737003, "grad_norm": 3.9074435234069824, "learning_rate": 5.8595213319458896e-05, "loss": 0.2509, "step": 4180 }, { "epoch": 12.813455657492355, "grad_norm": 3.5229334831237793, "learning_rate": 5.849115504682623e-05, "loss": 0.2223, "step": 4190 }, { "epoch": 12.844036697247706, "grad_norm": 
6.239946365356445, "learning_rate": 5.838709677419355e-05, "loss": 0.2412, "step": 4200 }, { "epoch": 12.874617737003058, "grad_norm": 6.538459300994873, "learning_rate": 5.828303850156087e-05, "loss": 0.259, "step": 4210 }, { "epoch": 12.905198776758409, "grad_norm": 5.794137954711914, "learning_rate": 5.81789802289282e-05, "loss": 0.2934, "step": 4220 }, { "epoch": 12.935779816513762, "grad_norm": 4.787842273712158, "learning_rate": 5.807492195629553e-05, "loss": 0.3066, "step": 4230 }, { "epoch": 12.966360856269112, "grad_norm": 2.687736988067627, "learning_rate": 5.797086368366286e-05, "loss": 0.2459, "step": 4240 }, { "epoch": 12.996941896024465, "grad_norm": 4.587925910949707, "learning_rate": 5.786680541103018e-05, "loss": 0.2548, "step": 4250 }, { "epoch": 13.027522935779816, "grad_norm": 3.2574503421783447, "learning_rate": 5.776274713839751e-05, "loss": 0.2463, "step": 4260 }, { "epoch": 13.058103975535168, "grad_norm": 4.4264397621154785, "learning_rate": 5.7658688865764835e-05, "loss": 0.2214, "step": 4270 }, { "epoch": 13.08868501529052, "grad_norm": 3.6344454288482666, "learning_rate": 5.755463059313215e-05, "loss": 0.2442, "step": 4280 }, { "epoch": 13.119266055045872, "grad_norm": 5.120426654815674, "learning_rate": 5.7450572320499485e-05, "loss": 0.2977, "step": 4290 }, { "epoch": 13.149847094801224, "grad_norm": 5.738251686096191, "learning_rate": 5.73465140478668e-05, "loss": 0.3104, "step": 4300 }, { "epoch": 13.180428134556575, "grad_norm": 3.5654373168945312, "learning_rate": 5.724245577523414e-05, "loss": 0.2432, "step": 4310 }, { "epoch": 13.211009174311927, "grad_norm": 4.370543956756592, "learning_rate": 5.713839750260146e-05, "loss": 0.2373, "step": 4320 }, { "epoch": 13.241590214067278, "grad_norm": 6.025704383850098, "learning_rate": 5.703433922996878e-05, "loss": 0.2658, "step": 4330 }, { "epoch": 13.27217125382263, "grad_norm": 4.021401882171631, "learning_rate": 5.693028095733611e-05, "loss": 0.2805, "step": 4340 }, { "epoch": 
13.302752293577981, "grad_norm": 8.499091148376465, "learning_rate": 5.6826222684703435e-05, "loss": 0.3026, "step": 4350 }, { "epoch": 13.333333333333334, "grad_norm": 2.7453525066375732, "learning_rate": 5.672216441207077e-05, "loss": 0.2622, "step": 4360 }, { "epoch": 13.363914373088685, "grad_norm": 5.434438228607178, "learning_rate": 5.6618106139438085e-05, "loss": 0.2392, "step": 4370 }, { "epoch": 13.394495412844037, "grad_norm": 6.497437000274658, "learning_rate": 5.651404786680541e-05, "loss": 0.243, "step": 4380 }, { "epoch": 13.425076452599388, "grad_norm": 5.23800802230835, "learning_rate": 5.640998959417274e-05, "loss": 0.3129, "step": 4390 }, { "epoch": 13.45565749235474, "grad_norm": 4.006523609161377, "learning_rate": 5.630593132154006e-05, "loss": 0.2886, "step": 4400 }, { "epoch": 13.486238532110091, "grad_norm": 9.827424049377441, "learning_rate": 5.620187304890739e-05, "loss": 0.2281, "step": 4410 }, { "epoch": 13.516819571865444, "grad_norm": 4.516007900238037, "learning_rate": 5.609781477627472e-05, "loss": 0.2079, "step": 4420 }, { "epoch": 13.547400611620795, "grad_norm": 4.366119384765625, "learning_rate": 5.599375650364205e-05, "loss": 0.2361, "step": 4430 }, { "epoch": 13.577981651376147, "grad_norm": 5.7396111488342285, "learning_rate": 5.588969823100937e-05, "loss": 0.2765, "step": 4440 }, { "epoch": 13.608562691131498, "grad_norm": 4.654730796813965, "learning_rate": 5.578563995837669e-05, "loss": 0.3094, "step": 4450 }, { "epoch": 13.63914373088685, "grad_norm": 2.6884396076202393, "learning_rate": 5.5681581685744024e-05, "loss": 0.2124, "step": 4460 }, { "epoch": 13.669724770642201, "grad_norm": 6.302408695220947, "learning_rate": 5.557752341311134e-05, "loss": 0.2303, "step": 4470 }, { "epoch": 13.700305810397554, "grad_norm": 2.9143168926239014, "learning_rate": 5.5473465140478674e-05, "loss": 0.2771, "step": 4480 }, { "epoch": 13.730886850152904, "grad_norm": 4.365865707397461, "learning_rate": 5.5369406867846e-05, "loss": 0.2519, 
"step": 4490 }, { "epoch": 13.761467889908257, "grad_norm": 5.209056854248047, "learning_rate": 5.526534859521332e-05, "loss": 0.3578, "step": 4500 }, { "epoch": 13.761467889908257, "eval_loss": 0.44830259680747986, "eval_runtime": 17.1439, "eval_samples_per_second": 16.974, "eval_steps_per_second": 8.516, "step": 4500 }, { "epoch": 13.792048929663608, "grad_norm": 3.6942572593688965, "learning_rate": 5.516129032258065e-05, "loss": 0.2407, "step": 4510 }, { "epoch": 13.82262996941896, "grad_norm": 3.4600281715393066, "learning_rate": 5.5057232049947974e-05, "loss": 0.2362, "step": 4520 }, { "epoch": 13.853211009174313, "grad_norm": 5.58060359954834, "learning_rate": 5.4953173777315306e-05, "loss": 0.2432, "step": 4530 }, { "epoch": 13.883792048929664, "grad_norm": 6.685091018676758, "learning_rate": 5.4849115504682624e-05, "loss": 0.2267, "step": 4540 }, { "epoch": 13.914373088685016, "grad_norm": 5.045437335968018, "learning_rate": 5.474505723204994e-05, "loss": 0.3498, "step": 4550 }, { "epoch": 13.944954128440367, "grad_norm": 3.360764741897583, "learning_rate": 5.464099895941728e-05, "loss": 0.2323, "step": 4560 }, { "epoch": 13.97553516819572, "grad_norm": 8.18934440612793, "learning_rate": 5.45369406867846e-05, "loss": 0.2403, "step": 4570 }, { "epoch": 14.00611620795107, "grad_norm": 2.808126926422119, "learning_rate": 5.443288241415193e-05, "loss": 0.3151, "step": 4580 }, { "epoch": 14.036697247706423, "grad_norm": 4.768661975860596, "learning_rate": 5.432882414151925e-05, "loss": 0.2096, "step": 4590 }, { "epoch": 14.067278287461773, "grad_norm": 3.9848787784576416, "learning_rate": 5.422476586888659e-05, "loss": 0.187, "step": 4600 }, { "epoch": 14.097859327217126, "grad_norm": 8.465381622314453, "learning_rate": 5.4120707596253906e-05, "loss": 0.2029, "step": 4610 }, { "epoch": 14.128440366972477, "grad_norm": 5.414556503295898, "learning_rate": 5.4016649323621224e-05, "loss": 0.2401, "step": 4620 }, { "epoch": 14.15902140672783, "grad_norm": 
3.5335428714752197, "learning_rate": 5.3912591050988556e-05, "loss": 0.3059, "step": 4630 }, { "epoch": 14.18960244648318, "grad_norm": 3.0602972507476807, "learning_rate": 5.380853277835588e-05, "loss": 0.2089, "step": 4640 }, { "epoch": 14.220183486238533, "grad_norm": 4.8753557205200195, "learning_rate": 5.370447450572321e-05, "loss": 0.2213, "step": 4650 }, { "epoch": 14.250764525993883, "grad_norm": 4.4842987060546875, "learning_rate": 5.360041623309053e-05, "loss": 0.2155, "step": 4660 }, { "epoch": 14.281345565749236, "grad_norm": 4.742305278778076, "learning_rate": 5.3496357960457856e-05, "loss": 0.2515, "step": 4670 }, { "epoch": 14.311926605504587, "grad_norm": 2.4470579624176025, "learning_rate": 5.339229968782519e-05, "loss": 0.3141, "step": 4680 }, { "epoch": 14.34250764525994, "grad_norm": 2.6575069427490234, "learning_rate": 5.3288241415192506e-05, "loss": 0.1942, "step": 4690 }, { "epoch": 14.37308868501529, "grad_norm": 6.382633686065674, "learning_rate": 5.318418314255984e-05, "loss": 0.24, "step": 4700 }, { "epoch": 14.403669724770642, "grad_norm": 4.684356212615967, "learning_rate": 5.308012486992716e-05, "loss": 0.2164, "step": 4710 }, { "epoch": 14.434250764525993, "grad_norm": 6.916840076446533, "learning_rate": 5.297606659729448e-05, "loss": 0.2048, "step": 4720 }, { "epoch": 14.464831804281346, "grad_norm": 3.9860119819641113, "learning_rate": 5.287200832466181e-05, "loss": 0.2869, "step": 4730 }, { "epoch": 14.495412844036696, "grad_norm": 4.020848274230957, "learning_rate": 5.276795005202914e-05, "loss": 0.204, "step": 4740 }, { "epoch": 14.525993883792049, "grad_norm": 4.997827053070068, "learning_rate": 5.266389177939647e-05, "loss": 0.2295, "step": 4750 }, { "epoch": 14.5565749235474, "grad_norm": 4.601790904998779, "learning_rate": 5.255983350676379e-05, "loss": 0.2526, "step": 4760 }, { "epoch": 14.587155963302752, "grad_norm": 5.125568389892578, "learning_rate": 5.245577523413111e-05, "loss": 0.2559, "step": 4770 }, { "epoch": 
14.617737003058103, "grad_norm": 4.057760715484619, "learning_rate": 5.2351716961498445e-05, "loss": 0.3112, "step": 4780 }, { "epoch": 14.648318042813456, "grad_norm": 3.295330047607422, "learning_rate": 5.224765868886576e-05, "loss": 0.2071, "step": 4790 }, { "epoch": 14.678899082568808, "grad_norm": 3.893869638442993, "learning_rate": 5.2143600416233095e-05, "loss": 0.1763, "step": 4800 }, { "epoch": 14.709480122324159, "grad_norm": 8.921279907226562, "learning_rate": 5.203954214360042e-05, "loss": 0.2331, "step": 4810 }, { "epoch": 14.740061162079511, "grad_norm": 3.6646783351898193, "learning_rate": 5.193548387096775e-05, "loss": 0.246, "step": 4820 }, { "epoch": 14.770642201834862, "grad_norm": 2.544281482696533, "learning_rate": 5.183142559833507e-05, "loss": 0.3041, "step": 4830 }, { "epoch": 14.801223241590215, "grad_norm": 2.496971607208252, "learning_rate": 5.1727367325702395e-05, "loss": 0.1993, "step": 4840 }, { "epoch": 14.831804281345565, "grad_norm": 5.756928443908691, "learning_rate": 5.162330905306973e-05, "loss": 0.2215, "step": 4850 }, { "epoch": 14.862385321100918, "grad_norm": 4.231186866760254, "learning_rate": 5.1519250780437045e-05, "loss": 0.1927, "step": 4860 }, { "epoch": 14.892966360856269, "grad_norm": 9.099519729614258, "learning_rate": 5.141519250780438e-05, "loss": 0.2556, "step": 4870 }, { "epoch": 14.923547400611621, "grad_norm": 3.435333728790283, "learning_rate": 5.13111342351717e-05, "loss": 0.2672, "step": 4880 }, { "epoch": 14.954128440366972, "grad_norm": 4.523071765899658, "learning_rate": 5.120707596253902e-05, "loss": 0.2166, "step": 4890 }, { "epoch": 14.984709480122325, "grad_norm": 7.479111671447754, "learning_rate": 5.110301768990635e-05, "loss": 0.2489, "step": 4900 }, { "epoch": 15.015290519877675, "grad_norm": 6.366982936859131, "learning_rate": 5.099895941727367e-05, "loss": 0.2488, "step": 4910 }, { "epoch": 15.045871559633028, "grad_norm": 3.5568156242370605, "learning_rate": 5.0894901144641e-05, "loss": 0.1866, 
"step": 4920 }, { "epoch": 15.076452599388379, "grad_norm": 4.744721412658691, "learning_rate": 5.079084287200833e-05, "loss": 0.2186, "step": 4930 }, { "epoch": 15.107033639143731, "grad_norm": 4.74827241897583, "learning_rate": 5.0686784599375645e-05, "loss": 0.2372, "step": 4940 }, { "epoch": 15.137614678899082, "grad_norm": 14.134720802307129, "learning_rate": 5.058272632674298e-05, "loss": 0.235, "step": 4950 }, { "epoch": 15.168195718654435, "grad_norm": 5.064000606536865, "learning_rate": 5.04786680541103e-05, "loss": 0.2712, "step": 4960 }, { "epoch": 15.198776758409785, "grad_norm": 3.46882700920105, "learning_rate": 5.0374609781477634e-05, "loss": 0.1872, "step": 4970 }, { "epoch": 15.229357798165138, "grad_norm": 6.42874002456665, "learning_rate": 5.027055150884495e-05, "loss": 0.1819, "step": 4980 }, { "epoch": 15.259938837920489, "grad_norm": 8.690680503845215, "learning_rate": 5.0166493236212284e-05, "loss": 0.2267, "step": 4990 }, { "epoch": 15.290519877675841, "grad_norm": 5.257503032684326, "learning_rate": 5.006243496357961e-05, "loss": 0.218, "step": 5000 }, { "epoch": 15.290519877675841, "eval_loss": 0.45569944381713867, "eval_runtime": 17.4347, "eval_samples_per_second": 16.691, "eval_steps_per_second": 8.374, "step": 5000 }, { "epoch": 15.321100917431192, "grad_norm": 3.1005799770355225, "learning_rate": 4.9958376690946934e-05, "loss": 0.2364, "step": 5010 }, { "epoch": 15.351681957186544, "grad_norm": 6.173751354217529, "learning_rate": 4.985431841831426e-05, "loss": 0.1919, "step": 5020 }, { "epoch": 15.382262996941897, "grad_norm": 3.07297682762146, "learning_rate": 4.9750260145681584e-05, "loss": 0.154, "step": 5030 }, { "epoch": 15.412844036697248, "grad_norm": 5.775254249572754, "learning_rate": 4.964620187304891e-05, "loss": 0.2291, "step": 5040 }, { "epoch": 15.4434250764526, "grad_norm": 3.3316097259521484, "learning_rate": 4.9542143600416234e-05, "loss": 0.2434, "step": 5050 }, { "epoch": 15.474006116207951, "grad_norm": 
3.749220848083496, "learning_rate": 4.943808532778356e-05, "loss": 0.2795, "step": 5060 }, { "epoch": 15.504587155963304, "grad_norm": 2.5141384601593018, "learning_rate": 4.933402705515089e-05, "loss": 0.1833, "step": 5070 }, { "epoch": 15.535168195718654, "grad_norm": 3.9967784881591797, "learning_rate": 4.922996878251821e-05, "loss": 0.1838, "step": 5080 }, { "epoch": 15.565749235474007, "grad_norm": 3.8655612468719482, "learning_rate": 4.9125910509885534e-05, "loss": 0.2571, "step": 5090 }, { "epoch": 15.596330275229358, "grad_norm": 8.397068977355957, "learning_rate": 4.9021852237252866e-05, "loss": 0.2467, "step": 5100 }, { "epoch": 15.62691131498471, "grad_norm": 4.210080623626709, "learning_rate": 4.891779396462019e-05, "loss": 0.2353, "step": 5110 }, { "epoch": 15.65749235474006, "grad_norm": 3.7114758491516113, "learning_rate": 4.8813735691987516e-05, "loss": 0.1764, "step": 5120 }, { "epoch": 15.688073394495413, "grad_norm": 4.410521507263184, "learning_rate": 4.870967741935484e-05, "loss": 0.2373, "step": 5130 }, { "epoch": 15.718654434250764, "grad_norm": 4.798542022705078, "learning_rate": 4.8605619146722166e-05, "loss": 0.2254, "step": 5140 }, { "epoch": 15.749235474006117, "grad_norm": 5.082320690155029, "learning_rate": 4.850156087408949e-05, "loss": 0.2278, "step": 5150 }, { "epoch": 15.779816513761467, "grad_norm": 3.4736669063568115, "learning_rate": 4.8397502601456816e-05, "loss": 0.2479, "step": 5160 }, { "epoch": 15.81039755351682, "grad_norm": 8.030004501342773, "learning_rate": 4.829344432882415e-05, "loss": 0.1571, "step": 5170 }, { "epoch": 15.84097859327217, "grad_norm": 4.511882305145264, "learning_rate": 4.818938605619147e-05, "loss": 0.2471, "step": 5180 }, { "epoch": 15.871559633027523, "grad_norm": 4.218798637390137, "learning_rate": 4.80853277835588e-05, "loss": 0.2226, "step": 5190 }, { "epoch": 15.902140672782874, "grad_norm": 9.834012985229492, "learning_rate": 4.7981269510926116e-05, "loss": 0.2012, "step": 5200 }, { "epoch": 
15.932721712538227, "grad_norm": 3.036139965057373, "learning_rate": 4.787721123829345e-05, "loss": 0.2287, "step": 5210 }, { "epoch": 15.963302752293577, "grad_norm": 4.578927516937256, "learning_rate": 4.777315296566077e-05, "loss": 0.1971, "step": 5220 }, { "epoch": 15.99388379204893, "grad_norm": 26.36978530883789, "learning_rate": 4.76690946930281e-05, "loss": 0.2227, "step": 5230 }, { "epoch": 16.02446483180428, "grad_norm": 3.6557741165161133, "learning_rate": 4.756503642039542e-05, "loss": 0.1704, "step": 5240 }, { "epoch": 16.05504587155963, "grad_norm": 4.2049031257629395, "learning_rate": 4.746097814776275e-05, "loss": 0.1698, "step": 5250 }, { "epoch": 16.085626911314986, "grad_norm": 6.779061794281006, "learning_rate": 4.735691987513007e-05, "loss": 0.2246, "step": 5260 }, { "epoch": 16.116207951070336, "grad_norm": 4.245777130126953, "learning_rate": 4.72528616024974e-05, "loss": 0.2008, "step": 5270 }, { "epoch": 16.146788990825687, "grad_norm": 10.013638496398926, "learning_rate": 4.714880332986473e-05, "loss": 0.2578, "step": 5280 }, { "epoch": 16.17737003058104, "grad_norm": 2.074145555496216, "learning_rate": 4.7044745057232055e-05, "loss": 0.1921, "step": 5290 }, { "epoch": 16.207951070336392, "grad_norm": 4.0752434730529785, "learning_rate": 4.694068678459938e-05, "loss": 0.1822, "step": 5300 }, { "epoch": 16.238532110091743, "grad_norm": 4.347853660583496, "learning_rate": 4.6836628511966705e-05, "loss": 0.1856, "step": 5310 }, { "epoch": 16.269113149847094, "grad_norm": 4.117323398590088, "learning_rate": 4.673257023933403e-05, "loss": 0.1841, "step": 5320 }, { "epoch": 16.299694189602448, "grad_norm": 4.033961772918701, "learning_rate": 4.6628511966701355e-05, "loss": 0.2984, "step": 5330 }, { "epoch": 16.3302752293578, "grad_norm": 2.2842674255371094, "learning_rate": 4.652445369406868e-05, "loss": 0.1907, "step": 5340 }, { "epoch": 16.36085626911315, "grad_norm": 3.3764960765838623, "learning_rate": 4.6420395421436005e-05, "loss": 0.1756, 
"step": 5350 }, { "epoch": 16.3914373088685, "grad_norm": 6.444164276123047, "learning_rate": 4.6316337148803337e-05, "loss": 0.2153, "step": 5360 }, { "epoch": 16.422018348623855, "grad_norm": 8.567184448242188, "learning_rate": 4.6212278876170655e-05, "loss": 0.2273, "step": 5370 }, { "epoch": 16.452599388379205, "grad_norm": 5.575711250305176, "learning_rate": 4.610822060353798e-05, "loss": 0.2563, "step": 5380 }, { "epoch": 16.483180428134556, "grad_norm": 3.3823914527893066, "learning_rate": 4.600416233090531e-05, "loss": 0.2264, "step": 5390 }, { "epoch": 16.513761467889907, "grad_norm": 3.6987643241882324, "learning_rate": 4.590010405827264e-05, "loss": 0.1638, "step": 5400 }, { "epoch": 16.54434250764526, "grad_norm": 2.492995262145996, "learning_rate": 4.579604578563996e-05, "loss": 0.2032, "step": 5410 }, { "epoch": 16.574923547400612, "grad_norm": 4.453232288360596, "learning_rate": 4.569198751300729e-05, "loss": 0.2184, "step": 5420 }, { "epoch": 16.605504587155963, "grad_norm": 7.290363788604736, "learning_rate": 4.558792924037461e-05, "loss": 0.2135, "step": 5430 }, { "epoch": 16.636085626911314, "grad_norm": 4.6387810707092285, "learning_rate": 4.548387096774194e-05, "loss": 0.2016, "step": 5440 }, { "epoch": 16.666666666666668, "grad_norm": 5.628077983856201, "learning_rate": 4.537981269510926e-05, "loss": 0.1585, "step": 5450 }, { "epoch": 16.69724770642202, "grad_norm": 3.2144572734832764, "learning_rate": 4.5275754422476594e-05, "loss": 0.2066, "step": 5460 }, { "epoch": 16.72782874617737, "grad_norm": 16.819278717041016, "learning_rate": 4.517169614984392e-05, "loss": 0.2294, "step": 5470 }, { "epoch": 16.75840978593272, "grad_norm": 3.323765516281128, "learning_rate": 4.506763787721124e-05, "loss": 0.2446, "step": 5480 }, { "epoch": 16.788990825688074, "grad_norm": 3.800917625427246, "learning_rate": 4.496357960457856e-05, "loss": 0.2067, "step": 5490 }, { "epoch": 16.819571865443425, "grad_norm": 4.269155025482178, "learning_rate": 
4.4859521331945894e-05, "loss": 0.1493, "step": 5500 }, { "epoch": 16.819571865443425, "eval_loss": 0.45923885703086853, "eval_runtime": 17.6763, "eval_samples_per_second": 16.463, "eval_steps_per_second": 8.26, "step": 5500 }, { "epoch": 16.850152905198776, "grad_norm": 4.534053802490234, "learning_rate": 4.475546305931322e-05, "loss": 0.1816, "step": 5510 }, { "epoch": 16.88073394495413, "grad_norm": 3.5566651821136475, "learning_rate": 4.4651404786680544e-05, "loss": 0.1771, "step": 5520 }, { "epoch": 16.91131498470948, "grad_norm": 7.620478630065918, "learning_rate": 4.454734651404787e-05, "loss": 0.2447, "step": 5530 }, { "epoch": 16.941896024464832, "grad_norm": 1.7376987934112549, "learning_rate": 4.4443288241415194e-05, "loss": 0.1855, "step": 5540 }, { "epoch": 16.972477064220183, "grad_norm": 4.00967264175415, "learning_rate": 4.433922996878252e-05, "loss": 0.2166, "step": 5550 }, { "epoch": 17.003058103975537, "grad_norm": 2.8900110721588135, "learning_rate": 4.4235171696149844e-05, "loss": 0.2166, "step": 5560 }, { "epoch": 17.033639143730888, "grad_norm": 3.8202905654907227, "learning_rate": 4.4131113423517176e-05, "loss": 0.174, "step": 5570 }, { "epoch": 17.06422018348624, "grad_norm": 3.6928577423095703, "learning_rate": 4.40270551508845e-05, "loss": 0.1657, "step": 5580 }, { "epoch": 17.09480122324159, "grad_norm": 5.6383957862854, "learning_rate": 4.392299687825182e-05, "loss": 0.1918, "step": 5590 }, { "epoch": 17.125382262996943, "grad_norm": 4.009942054748535, "learning_rate": 4.381893860561915e-05, "loss": 0.1959, "step": 5600 }, { "epoch": 17.155963302752294, "grad_norm": 2.3905227184295654, "learning_rate": 4.3714880332986476e-05, "loss": 0.2115, "step": 5610 }, { "epoch": 17.186544342507645, "grad_norm": 4.9664740562438965, "learning_rate": 4.36108220603538e-05, "loss": 0.171, "step": 5620 }, { "epoch": 17.217125382262996, "grad_norm": 4.175055980682373, "learning_rate": 4.3506763787721126e-05, "loss": 0.1552, "step": 5630 }, { "epoch": 
17.24770642201835, "grad_norm": 5.902973651885986, "learning_rate": 4.340270551508846e-05, "loss": 0.1792, "step": 5640 }, { "epoch": 17.2782874617737, "grad_norm": 4.277851104736328, "learning_rate": 4.3298647242455776e-05, "loss": 0.225, "step": 5650 }, { "epoch": 17.30886850152905, "grad_norm": 3.0731680393218994, "learning_rate": 4.31945889698231e-05, "loss": 0.2421, "step": 5660 }, { "epoch": 17.339449541284402, "grad_norm": 3.9331276416778564, "learning_rate": 4.3090530697190426e-05, "loss": 0.1605, "step": 5670 }, { "epoch": 17.370030581039757, "grad_norm": 5.0467424392700195, "learning_rate": 4.298647242455776e-05, "loss": 0.1699, "step": 5680 }, { "epoch": 17.400611620795107, "grad_norm": 4.776144504547119, "learning_rate": 4.288241415192508e-05, "loss": 0.167, "step": 5690 }, { "epoch": 17.431192660550458, "grad_norm": 23.452909469604492, "learning_rate": 4.27783558792924e-05, "loss": 0.1966, "step": 5700 }, { "epoch": 17.46177370030581, "grad_norm": 3.5143637657165527, "learning_rate": 4.267429760665973e-05, "loss": 0.2255, "step": 5710 }, { "epoch": 17.492354740061163, "grad_norm": 3.8650062084198, "learning_rate": 4.257023933402706e-05, "loss": 0.1401, "step": 5720 }, { "epoch": 17.522935779816514, "grad_norm": 4.191610336303711, "learning_rate": 4.246618106139438e-05, "loss": 0.192, "step": 5730 }, { "epoch": 17.553516819571865, "grad_norm": 5.632226943969727, "learning_rate": 4.236212278876171e-05, "loss": 0.1822, "step": 5740 }, { "epoch": 17.584097859327215, "grad_norm": 6.700896739959717, "learning_rate": 4.225806451612904e-05, "loss": 0.2147, "step": 5750 }, { "epoch": 17.61467889908257, "grad_norm": 2.0719738006591797, "learning_rate": 4.215400624349636e-05, "loss": 0.2758, "step": 5760 }, { "epoch": 17.64525993883792, "grad_norm": 3.314340829849243, "learning_rate": 4.204994797086368e-05, "loss": 0.1544, "step": 5770 }, { "epoch": 17.67584097859327, "grad_norm": 3.5887811183929443, "learning_rate": 4.1945889698231015e-05, "loss": 0.1905, 
"step": 5780 }, { "epoch": 17.706422018348626, "grad_norm": 4.045060634613037, "learning_rate": 4.184183142559834e-05, "loss": 0.1851, "step": 5790 }, { "epoch": 17.737003058103976, "grad_norm": 5.480162620544434, "learning_rate": 4.1737773152965665e-05, "loss": 0.1968, "step": 5800 }, { "epoch": 17.767584097859327, "grad_norm": 2.7789804935455322, "learning_rate": 4.163371488033298e-05, "loss": 0.2081, "step": 5810 }, { "epoch": 17.798165137614678, "grad_norm": 3.0406036376953125, "learning_rate": 4.1529656607700315e-05, "loss": 0.16, "step": 5820 }, { "epoch": 17.828746177370032, "grad_norm": 4.462724208831787, "learning_rate": 4.142559833506764e-05, "loss": 0.1612, "step": 5830 }, { "epoch": 17.859327217125383, "grad_norm": 6.905117511749268, "learning_rate": 4.1321540062434965e-05, "loss": 0.2294, "step": 5840 }, { "epoch": 17.889908256880734, "grad_norm": 5.394728660583496, "learning_rate": 4.121748178980229e-05, "loss": 0.172, "step": 5850 }, { "epoch": 17.920489296636084, "grad_norm": 4.424241065979004, "learning_rate": 4.111342351716962e-05, "loss": 0.2024, "step": 5860 }, { "epoch": 17.95107033639144, "grad_norm": 4.529147148132324, "learning_rate": 4.100936524453694e-05, "loss": 0.1698, "step": 5870 }, { "epoch": 17.98165137614679, "grad_norm": 4.008469581604004, "learning_rate": 4.0905306971904265e-05, "loss": 0.2036, "step": 5880 }, { "epoch": 18.01223241590214, "grad_norm": 2.6121578216552734, "learning_rate": 4.0801248699271597e-05, "loss": 0.2187, "step": 5890 }, { "epoch": 18.04281345565749, "grad_norm": 2.463407278060913, "learning_rate": 4.069719042663892e-05, "loss": 0.1473, "step": 5900 }, { "epoch": 18.073394495412845, "grad_norm": 4.79568338394165, "learning_rate": 4.0593132154006247e-05, "loss": 0.1553, "step": 5910 }, { "epoch": 18.103975535168196, "grad_norm": 3.8196969032287598, "learning_rate": 4.048907388137357e-05, "loss": 0.1382, "step": 5920 }, { "epoch": 18.134556574923547, "grad_norm": 2.5826215744018555, "learning_rate": 
4.03850156087409e-05, "loss": 0.1984, "step": 5930 }, { "epoch": 18.165137614678898, "grad_norm": 5.074633598327637, "learning_rate": 4.028095733610822e-05, "loss": 0.2118, "step": 5940 }, { "epoch": 18.195718654434252, "grad_norm": 2.8331735134124756, "learning_rate": 4.017689906347555e-05, "loss": 0.1504, "step": 5950 }, { "epoch": 18.226299694189603, "grad_norm": 6.5332207679748535, "learning_rate": 4.007284079084287e-05, "loss": 0.1873, "step": 5960 }, { "epoch": 18.256880733944953, "grad_norm": 5.0922651290893555, "learning_rate": 3.9968782518210203e-05, "loss": 0.1896, "step": 5970 }, { "epoch": 18.287461773700304, "grad_norm": 6.440834045410156, "learning_rate": 3.986472424557752e-05, "loss": 0.2065, "step": 5980 }, { "epoch": 18.31804281345566, "grad_norm": 3.152738332748413, "learning_rate": 3.976066597294485e-05, "loss": 0.2464, "step": 5990 }, { "epoch": 18.34862385321101, "grad_norm": 2.717283248901367, "learning_rate": 3.965660770031218e-05, "loss": 0.1428, "step": 6000 }, { "epoch": 18.34862385321101, "eval_loss": 0.44961076974868774, "eval_runtime": 17.3868, "eval_samples_per_second": 16.737, "eval_steps_per_second": 8.397, "step": 6000 }, { "epoch": 18.37920489296636, "grad_norm": 5.773160457611084, "learning_rate": 3.9552549427679504e-05, "loss": 0.204, "step": 6010 }, { "epoch": 18.40978593272171, "grad_norm": 7.26622200012207, "learning_rate": 3.944849115504683e-05, "loss": 0.1919, "step": 6020 }, { "epoch": 18.440366972477065, "grad_norm": 4.916703224182129, "learning_rate": 3.9344432882414154e-05, "loss": 0.1785, "step": 6030 }, { "epoch": 18.470948012232416, "grad_norm": 2.300110101699829, "learning_rate": 3.924037460978148e-05, "loss": 0.1954, "step": 6040 }, { "epoch": 18.501529051987767, "grad_norm": 3.844594955444336, "learning_rate": 3.9136316337148804e-05, "loss": 0.1527, "step": 6050 }, { "epoch": 18.53211009174312, "grad_norm": 2.7383902072906494, "learning_rate": 3.903225806451613e-05, "loss": 0.1831, "step": 6060 }, { "epoch": 
18.56269113149847, "grad_norm": 22.698638916015625, "learning_rate": 3.892819979188346e-05, "loss": 0.2091, "step": 6070 }, { "epoch": 18.593272171253822, "grad_norm": 8.911148071289062, "learning_rate": 3.8824141519250785e-05, "loss": 0.2182, "step": 6080 }, { "epoch": 18.623853211009173, "grad_norm": 2.808556079864502, "learning_rate": 3.872008324661811e-05, "loss": 0.2172, "step": 6090 }, { "epoch": 18.654434250764528, "grad_norm": 2.2882444858551025, "learning_rate": 3.861602497398543e-05, "loss": 0.1442, "step": 6100 }, { "epoch": 18.68501529051988, "grad_norm": 2.9601521492004395, "learning_rate": 3.851196670135276e-05, "loss": 0.1747, "step": 6110 }, { "epoch": 18.71559633027523, "grad_norm": 3.576287269592285, "learning_rate": 3.8407908428720086e-05, "loss": 0.1767, "step": 6120 }, { "epoch": 18.74617737003058, "grad_norm": 4.117494583129883, "learning_rate": 3.830385015608741e-05, "loss": 0.179, "step": 6130 }, { "epoch": 18.776758409785934, "grad_norm": 3.8094818592071533, "learning_rate": 3.8199791883454736e-05, "loss": 0.2504, "step": 6140 }, { "epoch": 18.807339449541285, "grad_norm": 2.3974802494049072, "learning_rate": 3.809573361082206e-05, "loss": 0.1616, "step": 6150 }, { "epoch": 18.837920489296636, "grad_norm": 3.671895742416382, "learning_rate": 3.7991675338189386e-05, "loss": 0.1324, "step": 6160 }, { "epoch": 18.868501529051986, "grad_norm": 3.833237648010254, "learning_rate": 3.788761706555671e-05, "loss": 0.1597, "step": 6170 }, { "epoch": 18.89908256880734, "grad_norm": 5.964929580688477, "learning_rate": 3.778355879292404e-05, "loss": 0.2068, "step": 6180 }, { "epoch": 18.92966360856269, "grad_norm": 3.1389944553375244, "learning_rate": 3.767950052029137e-05, "loss": 0.2317, "step": 6190 }, { "epoch": 18.960244648318042, "grad_norm": 4.046453952789307, "learning_rate": 3.757544224765869e-05, "loss": 0.1787, "step": 6200 }, { "epoch": 18.990825688073393, "grad_norm": 3.8653130531311035, "learning_rate": 3.747138397502602e-05, "loss": 
0.1851, "step": 6210 }, { "epoch": 19.021406727828747, "grad_norm": 2.5173943042755127, "learning_rate": 3.736732570239334e-05, "loss": 0.1761, "step": 6220 }, { "epoch": 19.051987767584098, "grad_norm": 2.8177404403686523, "learning_rate": 3.726326742976067e-05, "loss": 0.1356, "step": 6230 }, { "epoch": 19.08256880733945, "grad_norm": 4.992142200469971, "learning_rate": 3.715920915712799e-05, "loss": 0.1783, "step": 6240 }, { "epoch": 19.1131498470948, "grad_norm": 3.925229072570801, "learning_rate": 3.7055150884495324e-05, "loss": 0.1689, "step": 6250 }, { "epoch": 19.143730886850154, "grad_norm": 4.234835624694824, "learning_rate": 3.695109261186264e-05, "loss": 0.1858, "step": 6260 }, { "epoch": 19.174311926605505, "grad_norm": 6.015228748321533, "learning_rate": 3.684703433922997e-05, "loss": 0.1984, "step": 6270 }, { "epoch": 19.204892966360855, "grad_norm": 3.5733656883239746, "learning_rate": 3.674297606659729e-05, "loss": 0.1424, "step": 6280 }, { "epoch": 19.235474006116206, "grad_norm": 2.8804519176483154, "learning_rate": 3.6638917793964624e-05, "loss": 0.1838, "step": 6290 }, { "epoch": 19.26605504587156, "grad_norm": 6.07488489151001, "learning_rate": 3.653485952133195e-05, "loss": 0.1532, "step": 6300 }, { "epoch": 19.29663608562691, "grad_norm": 4.1511054039001465, "learning_rate": 3.6430801248699275e-05, "loss": 0.2108, "step": 6310 }, { "epoch": 19.327217125382262, "grad_norm": 3.896618366241455, "learning_rate": 3.63267429760666e-05, "loss": 0.1359, "step": 6320 }, { "epoch": 19.357798165137616, "grad_norm": 3.186441659927368, "learning_rate": 3.6222684703433925e-05, "loss": 0.1386, "step": 6330 }, { "epoch": 19.388379204892967, "grad_norm": 5.969933986663818, "learning_rate": 3.611862643080125e-05, "loss": 0.1675, "step": 6340 }, { "epoch": 19.418960244648318, "grad_norm": 5.708403587341309, "learning_rate": 3.6014568158168575e-05, "loss": 0.1671, "step": 6350 }, { "epoch": 19.44954128440367, "grad_norm": 6.588597297668457, "learning_rate": 
3.5910509885535906e-05, "loss": 0.1699, "step": 6360 }, { "epoch": 19.480122324159023, "grad_norm": 3.4485206604003906, "learning_rate": 3.580645161290323e-05, "loss": 0.1553, "step": 6370 }, { "epoch": 19.510703363914374, "grad_norm": 2.0754480361938477, "learning_rate": 3.570239334027055e-05, "loss": 0.1325, "step": 6380 }, { "epoch": 19.541284403669724, "grad_norm": 5.2881693840026855, "learning_rate": 3.5598335067637875e-05, "loss": 0.1833, "step": 6390 }, { "epoch": 19.571865443425075, "grad_norm": 7.182536602020264, "learning_rate": 3.5494276795005206e-05, "loss": 0.1694, "step": 6400 }, { "epoch": 19.60244648318043, "grad_norm": 6.095035076141357, "learning_rate": 3.539021852237253e-05, "loss": 0.2415, "step": 6410 }, { "epoch": 19.63302752293578, "grad_norm": 3.0211124420166016, "learning_rate": 3.5286160249739857e-05, "loss": 0.198, "step": 6420 }, { "epoch": 19.66360856269113, "grad_norm": 3.4166922569274902, "learning_rate": 3.518210197710718e-05, "loss": 0.1311, "step": 6430 }, { "epoch": 19.69418960244648, "grad_norm": 2.852652072906494, "learning_rate": 3.5078043704474507e-05, "loss": 0.1563, "step": 6440 }, { "epoch": 19.724770642201836, "grad_norm": 7.064545154571533, "learning_rate": 3.497398543184183e-05, "loss": 0.1824, "step": 6450 }, { "epoch": 19.755351681957187, "grad_norm": 6.215057849884033, "learning_rate": 3.4869927159209157e-05, "loss": 0.1797, "step": 6460 }, { "epoch": 19.785932721712538, "grad_norm": 3.3205864429473877, "learning_rate": 3.476586888657649e-05, "loss": 0.1543, "step": 6470 }, { "epoch": 19.81651376146789, "grad_norm": 4.110857963562012, "learning_rate": 3.4661810613943813e-05, "loss": 0.1745, "step": 6480 }, { "epoch": 19.847094801223243, "grad_norm": 3.430475950241089, "learning_rate": 3.455775234131113e-05, "loss": 0.1741, "step": 6490 }, { "epoch": 19.877675840978593, "grad_norm": 4.670803546905518, "learning_rate": 3.4453694068678463e-05, "loss": 0.1856, "step": 6500 }, { "epoch": 19.877675840978593, "eval_loss": 
0.44353702664375305, "eval_runtime": 16.8697, "eval_samples_per_second": 17.25, "eval_steps_per_second": 8.655, "step": 6500 }, { "epoch": 19.908256880733944, "grad_norm": 5.290159702301025, "learning_rate": 3.434963579604579e-05, "loss": 0.2133, "step": 6510 }, { "epoch": 19.938837920489295, "grad_norm": 3.216883420944214, "learning_rate": 3.4245577523413113e-05, "loss": 0.1675, "step": 6520 }, { "epoch": 19.96941896024465, "grad_norm": 3.890711784362793, "learning_rate": 3.414151925078044e-05, "loss": 0.1533, "step": 6530 }, { "epoch": 20.0, "grad_norm": 12.24163818359375, "learning_rate": 3.403746097814777e-05, "loss": 0.2362, "step": 6540 }, { "epoch": 20.03058103975535, "grad_norm": 4.171586990356445, "learning_rate": 3.393340270551509e-05, "loss": 0.1078, "step": 6550 }, { "epoch": 20.061162079510705, "grad_norm": 4.116422176361084, "learning_rate": 3.3829344432882414e-05, "loss": 0.1466, "step": 6560 }, { "epoch": 20.091743119266056, "grad_norm": 4.5232625007629395, "learning_rate": 3.372528616024974e-05, "loss": 0.1598, "step": 6570 }, { "epoch": 20.122324159021407, "grad_norm": 5.76244592666626, "learning_rate": 3.362122788761707e-05, "loss": 0.1706, "step": 6580 }, { "epoch": 20.152905198776757, "grad_norm": 6.140753746032715, "learning_rate": 3.3517169614984395e-05, "loss": 0.2467, "step": 6590 }, { "epoch": 20.18348623853211, "grad_norm": 3.3037118911743164, "learning_rate": 3.3413111342351714e-05, "loss": 0.1593, "step": 6600 }, { "epoch": 20.214067278287462, "grad_norm": 7.814075469970703, "learning_rate": 3.3309053069719045e-05, "loss": 0.1354, "step": 6610 }, { "epoch": 20.244648318042813, "grad_norm": 4.812346458435059, "learning_rate": 3.320499479708637e-05, "loss": 0.1493, "step": 6620 }, { "epoch": 20.275229357798164, "grad_norm": 4.187191009521484, "learning_rate": 3.3100936524453695e-05, "loss": 0.1723, "step": 6630 }, { "epoch": 20.30581039755352, "grad_norm": 8.240633010864258, "learning_rate": 3.299687825182102e-05, "loss": 0.249, "step": 
6640 }, { "epoch": 20.33639143730887, "grad_norm": 2.668346881866455, "learning_rate": 3.289281997918835e-05, "loss": 0.1191, "step": 6650 }, { "epoch": 20.36697247706422, "grad_norm": 3.798555850982666, "learning_rate": 3.278876170655567e-05, "loss": 0.1465, "step": 6660 }, { "epoch": 20.39755351681957, "grad_norm": 5.0725417137146, "learning_rate": 3.2684703433922996e-05, "loss": 0.161, "step": 6670 }, { "epoch": 20.428134556574925, "grad_norm": 2.8009026050567627, "learning_rate": 3.258064516129033e-05, "loss": 0.2017, "step": 6680 }, { "epoch": 20.458715596330276, "grad_norm": 4.939574718475342, "learning_rate": 3.247658688865765e-05, "loss": 0.1874, "step": 6690 }, { "epoch": 20.489296636085626, "grad_norm": 3.652327060699463, "learning_rate": 3.237252861602498e-05, "loss": 0.1294, "step": 6700 }, { "epoch": 20.519877675840977, "grad_norm": 4.777719020843506, "learning_rate": 3.2268470343392296e-05, "loss": 0.1049, "step": 6710 }, { "epoch": 20.55045871559633, "grad_norm": 4.19663667678833, "learning_rate": 3.216441207075963e-05, "loss": 0.1251, "step": 6720 }, { "epoch": 20.581039755351682, "grad_norm": 7.416243553161621, "learning_rate": 3.206035379812695e-05, "loss": 0.186, "step": 6730 }, { "epoch": 20.611620795107033, "grad_norm": 16.07038688659668, "learning_rate": 3.195629552549428e-05, "loss": 0.201, "step": 6740 }, { "epoch": 20.642201834862384, "grad_norm": 5.279475212097168, "learning_rate": 3.18522372528616e-05, "loss": 0.1347, "step": 6750 }, { "epoch": 20.672782874617738, "grad_norm": 4.603892803192139, "learning_rate": 3.1748178980228934e-05, "loss": 0.1489, "step": 6760 }, { "epoch": 20.70336391437309, "grad_norm": 6.145246982574463, "learning_rate": 3.164412070759625e-05, "loss": 0.1138, "step": 6770 }, { "epoch": 20.73394495412844, "grad_norm": 5.94775915145874, "learning_rate": 3.154006243496358e-05, "loss": 0.1769, "step": 6780 }, { "epoch": 20.764525993883794, "grad_norm": 6.3351826667785645, "learning_rate": 3.143600416233091e-05, "loss": 
0.1765, "step": 6790 }, { "epoch": 20.795107033639145, "grad_norm": 1.8415037393569946, "learning_rate": 3.1331945889698234e-05, "loss": 0.131, "step": 6800 }, { "epoch": 20.825688073394495, "grad_norm": 2.168361186981201, "learning_rate": 3.122788761706556e-05, "loss": 0.1462, "step": 6810 }, { "epoch": 20.856269113149846, "grad_norm": 7.718116760253906, "learning_rate": 3.1123829344432884e-05, "loss": 0.1667, "step": 6820 }, { "epoch": 20.8868501529052, "grad_norm": 4.486772060394287, "learning_rate": 3.101977107180021e-05, "loss": 0.1617, "step": 6830 }, { "epoch": 20.91743119266055, "grad_norm": 9.251279830932617, "learning_rate": 3.0915712799167534e-05, "loss": 0.2185, "step": 6840 }, { "epoch": 20.948012232415902, "grad_norm": 2.624577522277832, "learning_rate": 3.081165452653486e-05, "loss": 0.1468, "step": 6850 }, { "epoch": 20.978593272171253, "grad_norm": 4.017003059387207, "learning_rate": 3.0707596253902185e-05, "loss": 0.1703, "step": 6860 }, { "epoch": 21.009174311926607, "grad_norm": 5.159716606140137, "learning_rate": 3.0603537981269516e-05, "loss": 0.1809, "step": 6870 }, { "epoch": 21.039755351681958, "grad_norm": 3.0058741569519043, "learning_rate": 3.0499479708636835e-05, "loss": 0.1118, "step": 6880 }, { "epoch": 21.07033639143731, "grad_norm": 2.598550796508789, "learning_rate": 3.0395421436004163e-05, "loss": 0.1684, "step": 6890 }, { "epoch": 21.10091743119266, "grad_norm": 4.881763935089111, "learning_rate": 3.0291363163371488e-05, "loss": 0.1274, "step": 6900 }, { "epoch": 21.131498470948014, "grad_norm": 6.203042984008789, "learning_rate": 3.0187304890738816e-05, "loss": 0.1916, "step": 6910 }, { "epoch": 21.162079510703364, "grad_norm": 2.4897990226745605, "learning_rate": 3.008324661810614e-05, "loss": 0.2015, "step": 6920 }, { "epoch": 21.192660550458715, "grad_norm": 2.7538557052612305, "learning_rate": 2.997918834547347e-05, "loss": 0.1044, "step": 6930 }, { "epoch": 21.223241590214066, "grad_norm": 7.675657272338867, 
"learning_rate": 2.987513007284079e-05, "loss": 0.1387, "step": 6940 }, { "epoch": 21.25382262996942, "grad_norm": 3.915666341781616, "learning_rate": 2.9771071800208116e-05, "loss": 0.1505, "step": 6950 }, { "epoch": 21.28440366972477, "grad_norm": 6.621923446655273, "learning_rate": 2.9667013527575445e-05, "loss": 0.2056, "step": 6960 }, { "epoch": 21.31498470948012, "grad_norm": 2.253448486328125, "learning_rate": 2.956295525494277e-05, "loss": 0.1935, "step": 6970 }, { "epoch": 21.345565749235472, "grad_norm": 3.531188726425171, "learning_rate": 2.94588969823101e-05, "loss": 0.1359, "step": 6980 }, { "epoch": 21.376146788990827, "grad_norm": 4.113672256469727, "learning_rate": 2.9354838709677417e-05, "loss": 0.1521, "step": 6990 }, { "epoch": 21.406727828746178, "grad_norm": 4.267278671264648, "learning_rate": 2.9250780437044745e-05, "loss": 0.1129, "step": 7000 }, { "epoch": 21.406727828746178, "eval_loss": 0.4731081426143646, "eval_runtime": 16.9507, "eval_samples_per_second": 17.167, "eval_steps_per_second": 8.613, "step": 7000 }, { "epoch": 21.43730886850153, "grad_norm": 5.572481155395508, "learning_rate": 2.914672216441207e-05, "loss": 0.1351, "step": 7010 }, { "epoch": 21.46788990825688, "grad_norm": 2.6719911098480225, "learning_rate": 2.90426638917794e-05, "loss": 0.1672, "step": 7020 }, { "epoch": 21.498470948012233, "grad_norm": 6.055787086486816, "learning_rate": 2.8938605619146723e-05, "loss": 0.1013, "step": 7030 }, { "epoch": 21.529051987767584, "grad_norm": 4.567263603210449, "learning_rate": 2.8834547346514052e-05, "loss": 0.1349, "step": 7040 }, { "epoch": 21.559633027522935, "grad_norm": 5.775519847869873, "learning_rate": 2.8730489073881373e-05, "loss": 0.1357, "step": 7050 }, { "epoch": 21.59021406727829, "grad_norm": 7.519472122192383, "learning_rate": 2.86264308012487e-05, "loss": 0.1715, "step": 7060 }, { "epoch": 21.62079510703364, "grad_norm": 2.7148330211639404, "learning_rate": 2.8522372528616027e-05, "loss": 0.1807, "step": 7070 }, 
{ "epoch": 21.65137614678899, "grad_norm": 3.665200710296631, "learning_rate": 2.8418314255983352e-05, "loss": 0.1299, "step": 7080 }, { "epoch": 21.68195718654434, "grad_norm": 3.740471601486206, "learning_rate": 2.831425598335068e-05, "loss": 0.1203, "step": 7090 }, { "epoch": 21.712538226299696, "grad_norm": 6.551999092102051, "learning_rate": 2.8210197710718005e-05, "loss": 0.1833, "step": 7100 }, { "epoch": 21.743119266055047, "grad_norm": 9.063637733459473, "learning_rate": 2.8106139438085327e-05, "loss": 0.1705, "step": 7110 }, { "epoch": 21.773700305810397, "grad_norm": 4.627596855163574, "learning_rate": 2.8002081165452655e-05, "loss": 0.1727, "step": 7120 }, { "epoch": 21.804281345565748, "grad_norm": 2.3520357608795166, "learning_rate": 2.789802289281998e-05, "loss": 0.125, "step": 7130 }, { "epoch": 21.834862385321102, "grad_norm": 5.15336275100708, "learning_rate": 2.779396462018731e-05, "loss": 0.1442, "step": 7140 }, { "epoch": 21.865443425076453, "grad_norm": 9.608550071716309, "learning_rate": 2.7689906347554634e-05, "loss": 0.1595, "step": 7150 }, { "epoch": 21.896024464831804, "grad_norm": 6.136108875274658, "learning_rate": 2.7585848074921955e-05, "loss": 0.1819, "step": 7160 }, { "epoch": 21.926605504587155, "grad_norm": 2.7484302520751953, "learning_rate": 2.748178980228928e-05, "loss": 0.1706, "step": 7170 }, { "epoch": 21.95718654434251, "grad_norm": 4.393002510070801, "learning_rate": 2.737773152965661e-05, "loss": 0.1237, "step": 7180 }, { "epoch": 21.98776758409786, "grad_norm": 2.5787153244018555, "learning_rate": 2.7273673257023934e-05, "loss": 0.1456, "step": 7190 }, { "epoch": 22.01834862385321, "grad_norm": 5.6595377922058105, "learning_rate": 2.7169614984391262e-05, "loss": 0.1583, "step": 7200 }, { "epoch": 22.04892966360856, "grad_norm": 2.430314779281616, "learning_rate": 2.7065556711758587e-05, "loss": 0.1023, "step": 7210 }, { "epoch": 22.079510703363916, "grad_norm": 4.03518533706665, "learning_rate": 2.696149843912591e-05, 
"loss": 0.1829, "step": 7220 }, { "epoch": 22.110091743119266, "grad_norm": 5.839175701141357, "learning_rate": 2.6857440166493237e-05, "loss": 0.1378, "step": 7230 }, { "epoch": 22.140672782874617, "grad_norm": 3.737353563308716, "learning_rate": 2.6753381893860562e-05, "loss": 0.1801, "step": 7240 }, { "epoch": 22.171253822629968, "grad_norm": 1.6595263481140137, "learning_rate": 2.664932362122789e-05, "loss": 0.1422, "step": 7250 }, { "epoch": 22.201834862385322, "grad_norm": 2.8589601516723633, "learning_rate": 2.6545265348595216e-05, "loss": 0.1056, "step": 7260 }, { "epoch": 22.232415902140673, "grad_norm": 2.1064066886901855, "learning_rate": 2.6441207075962544e-05, "loss": 0.1541, "step": 7270 }, { "epoch": 22.262996941896024, "grad_norm": 3.4547598361968994, "learning_rate": 2.6337148803329862e-05, "loss": 0.1684, "step": 7280 }, { "epoch": 22.293577981651374, "grad_norm": 5.208943843841553, "learning_rate": 2.623309053069719e-05, "loss": 0.1583, "step": 7290 }, { "epoch": 22.32415902140673, "grad_norm": 2.59063458442688, "learning_rate": 2.6129032258064516e-05, "loss": 0.147, "step": 7300 }, { "epoch": 22.35474006116208, "grad_norm": 2.2470364570617676, "learning_rate": 2.6024973985431844e-05, "loss": 0.1587, "step": 7310 }, { "epoch": 22.38532110091743, "grad_norm": 3.6315572261810303, "learning_rate": 2.592091571279917e-05, "loss": 0.1373, "step": 7320 }, { "epoch": 22.415902140672785, "grad_norm": 6.476180076599121, "learning_rate": 2.581685744016649e-05, "loss": 0.194, "step": 7330 }, { "epoch": 22.446483180428135, "grad_norm": 5.465244293212891, "learning_rate": 2.571279916753382e-05, "loss": 0.1674, "step": 7340 }, { "epoch": 22.477064220183486, "grad_norm": 3.601835250854492, "learning_rate": 2.5608740894901144e-05, "loss": 0.1335, "step": 7350 }, { "epoch": 22.507645259938837, "grad_norm": 2.8011441230773926, "learning_rate": 2.5504682622268473e-05, "loss": 0.0966, "step": 7360 }, { "epoch": 22.53822629969419, "grad_norm": 3.8074159622192383, 
"learning_rate": 2.5400624349635798e-05, "loss": 0.1206, "step": 7370 }, { "epoch": 22.568807339449542, "grad_norm": 2.789634943008423, "learning_rate": 2.5296566077003126e-05, "loss": 0.1234, "step": 7380 }, { "epoch": 22.599388379204893, "grad_norm": 3.4443519115448, "learning_rate": 2.5192507804370448e-05, "loss": 0.1578, "step": 7390 }, { "epoch": 22.629969418960243, "grad_norm": 3.9452195167541504, "learning_rate": 2.5088449531737773e-05, "loss": 0.1737, "step": 7400 }, { "epoch": 22.660550458715598, "grad_norm": 2.4158213138580322, "learning_rate": 2.49843912591051e-05, "loss": 0.1387, "step": 7410 }, { "epoch": 22.69113149847095, "grad_norm": 6.3911356925964355, "learning_rate": 2.4880332986472426e-05, "loss": 0.1519, "step": 7420 }, { "epoch": 22.7217125382263, "grad_norm": 2.70169734954834, "learning_rate": 2.477627471383975e-05, "loss": 0.16, "step": 7430 }, { "epoch": 22.75229357798165, "grad_norm": 4.864673614501953, "learning_rate": 2.4672216441207076e-05, "loss": 0.133, "step": 7440 }, { "epoch": 22.782874617737004, "grad_norm": 1.9068208932876587, "learning_rate": 2.4568158168574405e-05, "loss": 0.1434, "step": 7450 }, { "epoch": 22.813455657492355, "grad_norm": 1.8495513200759888, "learning_rate": 2.4464099895941726e-05, "loss": 0.1397, "step": 7460 }, { "epoch": 22.844036697247706, "grad_norm": 4.008700370788574, "learning_rate": 2.4360041623309055e-05, "loss": 0.1333, "step": 7470 }, { "epoch": 22.874617737003057, "grad_norm": 3.1677236557006836, "learning_rate": 2.425598335067638e-05, "loss": 0.1479, "step": 7480 }, { "epoch": 22.90519877675841, "grad_norm": 3.1551225185394287, "learning_rate": 2.4151925078043705e-05, "loss": 0.1471, "step": 7490 }, { "epoch": 22.93577981651376, "grad_norm": 3.30476450920105, "learning_rate": 2.4047866805411033e-05, "loss": 0.1454, "step": 7500 }, { "epoch": 22.93577981651376, "eval_loss": 0.459464430809021, "eval_runtime": 16.9937, "eval_samples_per_second": 17.124, "eval_steps_per_second": 8.591, "step": 7500 
}, { "epoch": 22.966360856269112, "grad_norm": 2.7632486820220947, "learning_rate": 2.3943808532778355e-05, "loss": 0.1611, "step": 7510 }, { "epoch": 22.996941896024463, "grad_norm": 6.361327171325684, "learning_rate": 2.3839750260145683e-05, "loss": 0.1678, "step": 7520 }, { "epoch": 23.027522935779817, "grad_norm": 3.0553674697875977, "learning_rate": 2.373569198751301e-05, "loss": 0.1319, "step": 7530 }, { "epoch": 23.058103975535168, "grad_norm": 5.1710357666015625, "learning_rate": 2.3631633714880333e-05, "loss": 0.1498, "step": 7540 }, { "epoch": 23.08868501529052, "grad_norm": 5.715560436248779, "learning_rate": 2.352757544224766e-05, "loss": 0.1304, "step": 7550 }, { "epoch": 23.119266055045873, "grad_norm": 6.18096923828125, "learning_rate": 2.3423517169614987e-05, "loss": 0.1642, "step": 7560 }, { "epoch": 23.149847094801224, "grad_norm": 18.369741439819336, "learning_rate": 2.3319458896982312e-05, "loss": 0.1894, "step": 7570 }, { "epoch": 23.180428134556575, "grad_norm": 2.6054799556732178, "learning_rate": 2.3215400624349637e-05, "loss": 0.0919, "step": 7580 }, { "epoch": 23.211009174311926, "grad_norm": 5.282476425170898, "learning_rate": 2.3111342351716965e-05, "loss": 0.0982, "step": 7590 }, { "epoch": 23.24159021406728, "grad_norm": 5.387862205505371, "learning_rate": 2.3007284079084287e-05, "loss": 0.1567, "step": 7600 }, { "epoch": 23.27217125382263, "grad_norm": 3.5458264350891113, "learning_rate": 2.2903225806451615e-05, "loss": 0.1508, "step": 7610 }, { "epoch": 23.30275229357798, "grad_norm": 5.379571914672852, "learning_rate": 2.279916753381894e-05, "loss": 0.2044, "step": 7620 }, { "epoch": 23.333333333333332, "grad_norm": 3.6042699813842773, "learning_rate": 2.2695109261186265e-05, "loss": 0.119, "step": 7630 }, { "epoch": 23.363914373088686, "grad_norm": 2.713719606399536, "learning_rate": 2.259105098855359e-05, "loss": 0.1356, "step": 7640 }, { "epoch": 23.394495412844037, "grad_norm": 4.7480597496032715, "learning_rate": 
2.248699271592092e-05, "loss": 0.1428, "step": 7650 }, { "epoch": 23.425076452599388, "grad_norm": 5.461562156677246, "learning_rate": 2.2382934443288244e-05, "loss": 0.1555, "step": 7660 }, { "epoch": 23.45565749235474, "grad_norm": 8.009157180786133, "learning_rate": 2.227887617065557e-05, "loss": 0.1491, "step": 7670 }, { "epoch": 23.486238532110093, "grad_norm": 2.9787027835845947, "learning_rate": 2.2174817898022894e-05, "loss": 0.1062, "step": 7680 }, { "epoch": 23.516819571865444, "grad_norm": 3.0121078491210938, "learning_rate": 2.207075962539022e-05, "loss": 0.1247, "step": 7690 }, { "epoch": 23.547400611620795, "grad_norm": 6.196150302886963, "learning_rate": 2.1966701352757547e-05, "loss": 0.1115, "step": 7700 }, { "epoch": 23.577981651376145, "grad_norm": 3.6438441276550293, "learning_rate": 2.186264308012487e-05, "loss": 0.1245, "step": 7710 }, { "epoch": 23.6085626911315, "grad_norm": 9.425365447998047, "learning_rate": 2.1758584807492197e-05, "loss": 0.1827, "step": 7720 }, { "epoch": 23.63914373088685, "grad_norm": 3.6670544147491455, "learning_rate": 2.1654526534859522e-05, "loss": 0.1519, "step": 7730 }, { "epoch": 23.6697247706422, "grad_norm": 3.302945137023926, "learning_rate": 2.1550468262226847e-05, "loss": 0.1221, "step": 7740 }, { "epoch": 23.700305810397552, "grad_norm": 3.1169800758361816, "learning_rate": 2.1446409989594172e-05, "loss": 0.1274, "step": 7750 }, { "epoch": 23.730886850152906, "grad_norm": 3.494316577911377, "learning_rate": 2.13423517169615e-05, "loss": 0.2026, "step": 7760 }, { "epoch": 23.761467889908257, "grad_norm": 4.792800426483154, "learning_rate": 2.1238293444328826e-05, "loss": 0.1336, "step": 7770 }, { "epoch": 23.792048929663608, "grad_norm": 1.7731387615203857, "learning_rate": 2.113423517169615e-05, "loss": 0.0959, "step": 7780 }, { "epoch": 23.822629969418962, "grad_norm": 4.196662425994873, "learning_rate": 2.103017689906348e-05, "loss": 0.1162, "step": 7790 }, { "epoch": 23.853211009174313, "grad_norm": 
5.490009784698486, "learning_rate": 2.09261186264308e-05, "loss": 0.1544, "step": 7800 }, { "epoch": 23.883792048929664, "grad_norm": 2.1621978282928467, "learning_rate": 2.082206035379813e-05, "loss": 0.1212, "step": 7810 }, { "epoch": 23.914373088685014, "grad_norm": 4.357117176055908, "learning_rate": 2.071800208116545e-05, "loss": 0.165, "step": 7820 }, { "epoch": 23.94495412844037, "grad_norm": 4.021772861480713, "learning_rate": 2.061394380853278e-05, "loss": 0.1099, "step": 7830 }, { "epoch": 23.97553516819572, "grad_norm": 2.0271944999694824, "learning_rate": 2.0509885535900104e-05, "loss": 0.1267, "step": 7840 }, { "epoch": 24.00611620795107, "grad_norm": 3.9538087844848633, "learning_rate": 2.040582726326743e-05, "loss": 0.1773, "step": 7850 }, { "epoch": 24.03669724770642, "grad_norm": 4.695821285247803, "learning_rate": 2.0301768990634758e-05, "loss": 0.0898, "step": 7860 }, { "epoch": 24.067278287461775, "grad_norm": 3.817899227142334, "learning_rate": 2.0197710718002083e-05, "loss": 0.1105, "step": 7870 }, { "epoch": 24.097859327217126, "grad_norm": 6.2242536544799805, "learning_rate": 2.0093652445369408e-05, "loss": 0.1275, "step": 7880 }, { "epoch": 24.128440366972477, "grad_norm": 6.821825981140137, "learning_rate": 1.9989594172736733e-05, "loss": 0.1637, "step": 7890 }, { "epoch": 24.159021406727827, "grad_norm": 1.6077946424484253, "learning_rate": 1.988553590010406e-05, "loss": 0.1687, "step": 7900 }, { "epoch": 24.189602446483182, "grad_norm": 3.4850125312805176, "learning_rate": 1.9781477627471383e-05, "loss": 0.1124, "step": 7910 }, { "epoch": 24.220183486238533, "grad_norm": 7.470705509185791, "learning_rate": 1.967741935483871e-05, "loss": 0.106, "step": 7920 }, { "epoch": 24.250764525993883, "grad_norm": 7.171663761138916, "learning_rate": 1.9573361082206036e-05, "loss": 0.1532, "step": 7930 }, { "epoch": 24.281345565749234, "grad_norm": 1.5503133535385132, "learning_rate": 1.946930280957336e-05, "loss": 0.1559, "step": 7940 }, { "epoch": 
24.31192660550459, "grad_norm": 2.006579875946045, "learning_rate": 1.936524453694069e-05, "loss": 0.1435, "step": 7950 }, { "epoch": 24.34250764525994, "grad_norm": 5.5806756019592285, "learning_rate": 1.926118626430801e-05, "loss": 0.1127, "step": 7960 }, { "epoch": 24.37308868501529, "grad_norm": 4.723907470703125, "learning_rate": 1.915712799167534e-05, "loss": 0.1293, "step": 7970 }, { "epoch": 24.40366972477064, "grad_norm": 4.086251258850098, "learning_rate": 1.9053069719042665e-05, "loss": 0.1407, "step": 7980 }, { "epoch": 24.434250764525995, "grad_norm": 4.814991474151611, "learning_rate": 1.894901144640999e-05, "loss": 0.1398, "step": 7990 }, { "epoch": 24.464831804281346, "grad_norm": 2.7642455101013184, "learning_rate": 1.8844953173777315e-05, "loss": 0.1645, "step": 8000 }, { "epoch": 24.464831804281346, "eval_loss": 0.4339035153388977, "eval_runtime": 17.7992, "eval_samples_per_second": 16.349, "eval_steps_per_second": 8.203, "step": 8000 }, { "epoch": 24.495412844036696, "grad_norm": 2.2885043621063232, "learning_rate": 1.8740894901144643e-05, "loss": 0.1168, "step": 8010 }, { "epoch": 24.525993883792047, "grad_norm": 4.910865783691406, "learning_rate": 1.8636836628511968e-05, "loss": 0.1254, "step": 8020 }, { "epoch": 24.5565749235474, "grad_norm": 3.811713457107544, "learning_rate": 1.8532778355879293e-05, "loss": 0.1535, "step": 8030 }, { "epoch": 24.587155963302752, "grad_norm": 3.527937650680542, "learning_rate": 1.842872008324662e-05, "loss": 0.1472, "step": 8040 }, { "epoch": 24.617737003058103, "grad_norm": 1.450607180595398, "learning_rate": 1.8324661810613943e-05, "loss": 0.2001, "step": 8050 }, { "epoch": 24.648318042813457, "grad_norm": 3.5444912910461426, "learning_rate": 1.822060353798127e-05, "loss": 0.1147, "step": 8060 }, { "epoch": 24.678899082568808, "grad_norm": 6.098001480102539, "learning_rate": 1.8116545265348597e-05, "loss": 0.1144, "step": 8070 }, { "epoch": 24.70948012232416, "grad_norm": 2.527642011642456, "learning_rate": 
1.8012486992715922e-05, "loss": 0.1105, "step": 8080 }, { "epoch": 24.74006116207951, "grad_norm": 3.0823781490325928, "learning_rate": 1.7908428720083247e-05, "loss": 0.1126, "step": 8090 }, { "epoch": 24.770642201834864, "grad_norm": 2.1876089572906494, "learning_rate": 1.7804370447450572e-05, "loss": 0.1586, "step": 8100 }, { "epoch": 24.801223241590215, "grad_norm": 2.728623390197754, "learning_rate": 1.77003121748179e-05, "loss": 0.1086, "step": 8110 }, { "epoch": 24.831804281345565, "grad_norm": 1.5861458778381348, "learning_rate": 1.7596253902185225e-05, "loss": 0.0836, "step": 8120 }, { "epoch": 24.862385321100916, "grad_norm": 2.1710517406463623, "learning_rate": 1.749219562955255e-05, "loss": 0.1129, "step": 8130 }, { "epoch": 24.89296636085627, "grad_norm": 4.549278736114502, "learning_rate": 1.7388137356919875e-05, "loss": 0.0956, "step": 8140 }, { "epoch": 24.92354740061162, "grad_norm": 2.860056161880493, "learning_rate": 1.7284079084287204e-05, "loss": 0.1406, "step": 8150 }, { "epoch": 24.954128440366972, "grad_norm": 2.902945041656494, "learning_rate": 1.7180020811654525e-05, "loss": 0.1217, "step": 8160 }, { "epoch": 24.984709480122323, "grad_norm": 6.512890815734863, "learning_rate": 1.7075962539021854e-05, "loss": 0.1534, "step": 8170 }, { "epoch": 25.015290519877677, "grad_norm": 2.204719066619873, "learning_rate": 1.697190426638918e-05, "loss": 0.1133, "step": 8180 }, { "epoch": 25.045871559633028, "grad_norm": 4.600652694702148, "learning_rate": 1.6867845993756504e-05, "loss": 0.1018, "step": 8190 }, { "epoch": 25.07645259938838, "grad_norm": 3.88714861869812, "learning_rate": 1.676378772112383e-05, "loss": 0.1141, "step": 8200 }, { "epoch": 25.10703363914373, "grad_norm": 2.694763660430908, "learning_rate": 1.6659729448491157e-05, "loss": 0.105, "step": 8210 }, { "epoch": 25.137614678899084, "grad_norm": 5.237843990325928, "learning_rate": 1.6555671175858482e-05, "loss": 0.134, "step": 8220 }, { "epoch": 25.168195718654435, "grad_norm": 
2.7006750106811523, "learning_rate": 1.6451612903225807e-05, "loss": 0.1189, "step": 8230 }, { "epoch": 25.198776758409785, "grad_norm": 5.1693115234375, "learning_rate": 1.6347554630593136e-05, "loss": 0.1248, "step": 8240 }, { "epoch": 25.229357798165136, "grad_norm": 4.986877918243408, "learning_rate": 1.6243496357960457e-05, "loss": 0.1507, "step": 8250 }, { "epoch": 25.25993883792049, "grad_norm": 1.4471940994262695, "learning_rate": 1.6139438085327786e-05, "loss": 0.1265, "step": 8260 }, { "epoch": 25.29051987767584, "grad_norm": 5.540067672729492, "learning_rate": 1.6035379812695107e-05, "loss": 0.1422, "step": 8270 }, { "epoch": 25.321100917431192, "grad_norm": 4.367918014526367, "learning_rate": 1.5931321540062436e-05, "loss": 0.1546, "step": 8280 }, { "epoch": 25.351681957186543, "grad_norm": 2.7890803813934326, "learning_rate": 1.582726326742976e-05, "loss": 0.0769, "step": 8290 }, { "epoch": 25.382262996941897, "grad_norm": 1.6187372207641602, "learning_rate": 1.5723204994797086e-05, "loss": 0.1068, "step": 8300 }, { "epoch": 25.412844036697248, "grad_norm": 3.0006723403930664, "learning_rate": 1.5619146722164414e-05, "loss": 0.1262, "step": 8310 }, { "epoch": 25.4434250764526, "grad_norm": 4.67561674118042, "learning_rate": 1.551508844953174e-05, "loss": 0.1522, "step": 8320 }, { "epoch": 25.474006116207953, "grad_norm": 1.59788179397583, "learning_rate": 1.5411030176899064e-05, "loss": 0.1875, "step": 8330 }, { "epoch": 25.504587155963304, "grad_norm": 2.5928938388824463, "learning_rate": 1.530697190426639e-05, "loss": 0.0929, "step": 8340 }, { "epoch": 25.535168195718654, "grad_norm": 3.5686800479888916, "learning_rate": 1.5202913631633716e-05, "loss": 0.1185, "step": 8350 }, { "epoch": 25.565749235474005, "grad_norm": 5.571007251739502, "learning_rate": 1.5098855359001041e-05, "loss": 0.1214, "step": 8360 }, { "epoch": 25.59633027522936, "grad_norm": 9.126275062561035, "learning_rate": 1.4994797086368368e-05, "loss": 0.1516, "step": 8370 }, { 
"epoch": 25.62691131498471, "grad_norm": 2.9007301330566406, "learning_rate": 1.4890738813735694e-05, "loss": 0.1372, "step": 8380 }, { "epoch": 25.65749235474006, "grad_norm": 3.1054575443267822, "learning_rate": 1.4786680541103018e-05, "loss": 0.1055, "step": 8390 }, { "epoch": 25.68807339449541, "grad_norm": 4.602766990661621, "learning_rate": 1.4682622268470344e-05, "loss": 0.1056, "step": 8400 }, { "epoch": 25.718654434250766, "grad_norm": 2.9134981632232666, "learning_rate": 1.4578563995837668e-05, "loss": 0.1074, "step": 8410 }, { "epoch": 25.749235474006117, "grad_norm": 4.738123893737793, "learning_rate": 1.4474505723204994e-05, "loss": 0.1139, "step": 8420 }, { "epoch": 25.779816513761467, "grad_norm": 5.146528720855713, "learning_rate": 1.4370447450572321e-05, "loss": 0.1333, "step": 8430 }, { "epoch": 25.810397553516818, "grad_norm": 1.8651458024978638, "learning_rate": 1.4266389177939646e-05, "loss": 0.1101, "step": 8440 }, { "epoch": 25.840978593272173, "grad_norm": 3.372619152069092, "learning_rate": 1.4162330905306973e-05, "loss": 0.1285, "step": 8450 }, { "epoch": 25.871559633027523, "grad_norm": 4.9253363609313965, "learning_rate": 1.40582726326743e-05, "loss": 0.127, "step": 8460 }, { "epoch": 25.902140672782874, "grad_norm": 4.081017971038818, "learning_rate": 1.3954214360041623e-05, "loss": 0.1061, "step": 8470 }, { "epoch": 25.932721712538225, "grad_norm": 1.752782940864563, "learning_rate": 1.385015608740895e-05, "loss": 0.138, "step": 8480 }, { "epoch": 25.96330275229358, "grad_norm": 1.8544362783432007, "learning_rate": 1.3746097814776276e-05, "loss": 0.0969, "step": 8490 }, { "epoch": 25.99388379204893, "grad_norm": 6.175636291503906, "learning_rate": 1.36420395421436e-05, "loss": 0.1628, "step": 8500 }, { "epoch": 25.99388379204893, "eval_loss": 0.46008336544036865, "eval_runtime": 17.473, "eval_samples_per_second": 16.654, "eval_steps_per_second": 8.356, "step": 8500 }, { "epoch": 26.02446483180428, "grad_norm": 4.042583465576172, 
"learning_rate": 1.3537981269510926e-05, "loss": 0.116, "step": 8510 }, { "epoch": 26.05504587155963, "grad_norm": 3.8415026664733887, "learning_rate": 1.3433922996878253e-05, "loss": 0.1106, "step": 8520 }, { "epoch": 26.085626911314986, "grad_norm": 3.7799582481384277, "learning_rate": 1.3329864724245578e-05, "loss": 0.0886, "step": 8530 }, { "epoch": 26.116207951070336, "grad_norm": 6.632818698883057, "learning_rate": 1.3225806451612905e-05, "loss": 0.1219, "step": 8540 }, { "epoch": 26.146788990825687, "grad_norm": 6.225608825683594, "learning_rate": 1.3121748178980228e-05, "loss": 0.1348, "step": 8550 }, { "epoch": 26.17737003058104, "grad_norm": 5.72769832611084, "learning_rate": 1.3017689906347555e-05, "loss": 0.1163, "step": 8560 }, { "epoch": 26.207951070336392, "grad_norm": 3.572054147720337, "learning_rate": 1.2913631633714882e-05, "loss": 0.0977, "step": 8570 }, { "epoch": 26.238532110091743, "grad_norm": 5.419766426086426, "learning_rate": 1.2809573361082205e-05, "loss": 0.1244, "step": 8580 }, { "epoch": 26.269113149847094, "grad_norm": 4.254158973693848, "learning_rate": 1.2705515088449532e-05, "loss": 0.1376, "step": 8590 }, { "epoch": 26.299694189602448, "grad_norm": 7.157454013824463, "learning_rate": 1.2601456815816858e-05, "loss": 0.182, "step": 8600 }, { "epoch": 26.3302752293578, "grad_norm": 1.7647346258163452, "learning_rate": 1.2497398543184183e-05, "loss": 0.1129, "step": 8610 }, { "epoch": 26.36085626911315, "grad_norm": 3.3649981021881104, "learning_rate": 1.239334027055151e-05, "loss": 0.0961, "step": 8620 }, { "epoch": 26.3914373088685, "grad_norm": 7.596597194671631, "learning_rate": 1.2289281997918835e-05, "loss": 0.1253, "step": 8630 }, { "epoch": 26.422018348623855, "grad_norm": 4.112176418304443, "learning_rate": 1.2185223725286162e-05, "loss": 0.129, "step": 8640 }, { "epoch": 26.452599388379205, "grad_norm": 2.766486644744873, "learning_rate": 1.2081165452653487e-05, "loss": 0.1485, "step": 8650 }, { "epoch": 26.483180428134556, 
"grad_norm": 3.5616629123687744, "learning_rate": 1.1977107180020812e-05, "loss": 0.1298, "step": 8660 }, { "epoch": 26.513761467889907, "grad_norm": 2.5189101696014404, "learning_rate": 1.1873048907388137e-05, "loss": 0.1013, "step": 8670 }, { "epoch": 26.54434250764526, "grad_norm": 2.5541913509368896, "learning_rate": 1.1768990634755464e-05, "loss": 0.1185, "step": 8680 }, { "epoch": 26.574923547400612, "grad_norm": 11.571924209594727, "learning_rate": 1.1664932362122789e-05, "loss": 0.1364, "step": 8690 }, { "epoch": 26.605504587155963, "grad_norm": 6.36802339553833, "learning_rate": 1.1560874089490115e-05, "loss": 0.1866, "step": 8700 }, { "epoch": 26.636085626911314, "grad_norm": 3.861549139022827, "learning_rate": 1.1456815816857442e-05, "loss": 0.0996, "step": 8710 }, { "epoch": 26.666666666666668, "grad_norm": 1.4031004905700684, "learning_rate": 1.1352757544224767e-05, "loss": 0.0686, "step": 8720 }, { "epoch": 26.69724770642202, "grad_norm": 3.146599054336548, "learning_rate": 1.1248699271592092e-05, "loss": 0.1375, "step": 8730 }, { "epoch": 26.72782874617737, "grad_norm": 3.636478900909424, "learning_rate": 1.1144640998959417e-05, "loss": 0.1234, "step": 8740 }, { "epoch": 26.75840978593272, "grad_norm": 2.630683660507202, "learning_rate": 1.1040582726326744e-05, "loss": 0.1324, "step": 8750 }, { "epoch": 26.788990825688074, "grad_norm": 2.7682950496673584, "learning_rate": 1.0936524453694069e-05, "loss": 0.08, "step": 8760 }, { "epoch": 26.819571865443425, "grad_norm": 2.6922128200531006, "learning_rate": 1.0832466181061394e-05, "loss": 0.1352, "step": 8770 }, { "epoch": 26.850152905198776, "grad_norm": 6.487355709075928, "learning_rate": 1.072840790842872e-05, "loss": 0.1123, "step": 8780 }, { "epoch": 26.88073394495413, "grad_norm": 2.877392530441284, "learning_rate": 1.0624349635796047e-05, "loss": 0.1286, "step": 8790 }, { "epoch": 26.91131498470948, "grad_norm": 4.019444465637207, "learning_rate": 1.0520291363163372e-05, "loss": 0.1137, "step": 
8800 }, { "epoch": 26.941896024464832, "grad_norm": 2.5582876205444336, "learning_rate": 1.0416233090530697e-05, "loss": 0.1026, "step": 8810 }, { "epoch": 26.972477064220183, "grad_norm": 2.6786487102508545, "learning_rate": 1.0312174817898024e-05, "loss": 0.1093, "step": 8820 }, { "epoch": 27.003058103975537, "grad_norm": 1.792412281036377, "learning_rate": 1.0208116545265349e-05, "loss": 0.1588, "step": 8830 }, { "epoch": 27.033639143730888, "grad_norm": 1.3316484689712524, "learning_rate": 1.0104058272632674e-05, "loss": 0.0828, "step": 8840 }, { "epoch": 27.06422018348624, "grad_norm": 2.7321407794952393, "learning_rate": 1e-05, "loss": 0.0917, "step": 8850 }, { "epoch": 27.09480122324159, "grad_norm": 1.614942193031311, "learning_rate": 9.895941727367326e-06, "loss": 0.1273, "step": 8860 }, { "epoch": 27.125382262996943, "grad_norm": 6.456067085266113, "learning_rate": 9.791883454734651e-06, "loss": 0.1361, "step": 8870 }, { "epoch": 27.155963302752294, "grad_norm": 1.9082986116409302, "learning_rate": 9.687825182101978e-06, "loss": 0.1529, "step": 8880 }, { "epoch": 27.186544342507645, "grad_norm": 3.1089670658111572, "learning_rate": 9.583766909469304e-06, "loss": 0.0918, "step": 8890 }, { "epoch": 27.217125382262996, "grad_norm": 2.7883081436157227, "learning_rate": 9.47970863683663e-06, "loss": 0.1191, "step": 8900 }, { "epoch": 27.24770642201835, "grad_norm": 3.825026035308838, "learning_rate": 9.375650364203954e-06, "loss": 0.113, "step": 8910 }, { "epoch": 27.2782874617737, "grad_norm": 9.698514938354492, "learning_rate": 9.271592091571281e-06, "loss": 0.0969, "step": 8920 }, { "epoch": 27.30886850152905, "grad_norm": 1.5716568231582642, "learning_rate": 9.167533818938606e-06, "loss": 0.1342, "step": 8930 }, { "epoch": 27.339449541284402, "grad_norm": 2.928072690963745, "learning_rate": 9.063475546305931e-06, "loss": 0.0834, "step": 8940 }, { "epoch": 27.370030581039757, "grad_norm": 2.3197057247161865, "learning_rate": 8.959417273673256e-06, "loss": 
0.1149, "step": 8950 }, { "epoch": 27.400611620795107, "grad_norm": 7.332393646240234, "learning_rate": 8.855359001040583e-06, "loss": 0.1016, "step": 8960 }, { "epoch": 27.431192660550458, "grad_norm": 4.15636682510376, "learning_rate": 8.75130072840791e-06, "loss": 0.1038, "step": 8970 }, { "epoch": 27.46177370030581, "grad_norm": 1.6034491062164307, "learning_rate": 8.647242455775235e-06, "loss": 0.1807, "step": 8980 }, { "epoch": 27.492354740061163, "grad_norm": 4.26979398727417, "learning_rate": 8.543184183142561e-06, "loss": 0.1017, "step": 8990 }, { "epoch": 27.522935779816514, "grad_norm": 2.539623975753784, "learning_rate": 8.439125910509886e-06, "loss": 0.0951, "step": 9000 }, { "epoch": 27.522935779816514, "eval_loss": 0.4583536684513092, "eval_runtime": 17.3431, "eval_samples_per_second": 16.779, "eval_steps_per_second": 8.418, "step": 9000 }, { "epoch": 27.553516819571865, "grad_norm": 7.396114349365234, "learning_rate": 8.335067637877211e-06, "loss": 0.1293, "step": 9010 }, { "epoch": 27.584097859327215, "grad_norm": 3.8735439777374268, "learning_rate": 8.231009365244538e-06, "loss": 0.0991, "step": 9020 }, { "epoch": 27.61467889908257, "grad_norm": 2.752223491668701, "learning_rate": 8.126951092611863e-06, "loss": 0.1317, "step": 9030 }, { "epoch": 27.64525993883792, "grad_norm": 3.082855224609375, "learning_rate": 8.022892819979188e-06, "loss": 0.0865, "step": 9040 }, { "epoch": 27.67584097859327, "grad_norm": 3.818155527114868, "learning_rate": 7.918834547346513e-06, "loss": 0.0893, "step": 9050 }, { "epoch": 27.706422018348626, "grad_norm": 4.239770412445068, "learning_rate": 7.81477627471384e-06, "loss": 0.1284, "step": 9060 }, { "epoch": 27.737003058103976, "grad_norm": 2.538876533508301, "learning_rate": 7.710718002081167e-06, "loss": 0.1326, "step": 9070 }, { "epoch": 27.767584097859327, "grad_norm": 2.477917432785034, "learning_rate": 7.6066597294484915e-06, "loss": 0.1679, "step": 9080 }, { "epoch": 27.798165137614678, "grad_norm": 
6.487174987792969, "learning_rate": 7.502601456815818e-06, "loss": 0.0955, "step": 9090 }, { "epoch": 27.828746177370032, "grad_norm": 3.586022138595581, "learning_rate": 7.398543184183143e-06, "loss": 0.1289, "step": 9100 }, { "epoch": 27.859327217125383, "grad_norm": 4.1030049324035645, "learning_rate": 7.294484911550468e-06, "loss": 0.1064, "step": 9110 }, { "epoch": 27.889908256880734, "grad_norm": 2.473414421081543, "learning_rate": 7.190426638917794e-06, "loss": 0.1148, "step": 9120 }, { "epoch": 27.920489296636084, "grad_norm": 2.4675068855285645, "learning_rate": 7.086368366285121e-06, "loss": 0.1222, "step": 9130 }, { "epoch": 27.95107033639144, "grad_norm": 2.7934279441833496, "learning_rate": 6.982310093652446e-06, "loss": 0.0951, "step": 9140 }, { "epoch": 27.98165137614679, "grad_norm": 7.207830429077148, "learning_rate": 6.878251821019771e-06, "loss": 0.1336, "step": 9150 }, { "epoch": 28.01223241590214, "grad_norm": 2.382178544998169, "learning_rate": 6.774193548387098e-06, "loss": 0.0944, "step": 9160 }, { "epoch": 28.04281345565749, "grad_norm": 3.0280263423919678, "learning_rate": 6.670135275754423e-06, "loss": 0.0803, "step": 9170 }, { "epoch": 28.073394495412845, "grad_norm": 2.1945552825927734, "learning_rate": 6.5660770031217485e-06, "loss": 0.0828, "step": 9180 }, { "epoch": 28.103975535168196, "grad_norm": 4.57396936416626, "learning_rate": 6.4620187304890735e-06, "loss": 0.124, "step": 9190 }, { "epoch": 28.134556574923547, "grad_norm": 9.008644104003906, "learning_rate": 6.3579604578564e-06, "loss": 0.1269, "step": 9200 }, { "epoch": 28.165137614678898, "grad_norm": 2.750105857849121, "learning_rate": 6.253902185223725e-06, "loss": 0.1416, "step": 9210 }, { "epoch": 28.195718654434252, "grad_norm": 5.317841529846191, "learning_rate": 6.149843912591052e-06, "loss": 0.1053, "step": 9220 }, { "epoch": 28.226299694189603, "grad_norm": 2.429410219192505, "learning_rate": 6.045785639958377e-06, "loss": 0.1105, "step": 9230 }, { "epoch": 
28.256880733944953, "grad_norm": 4.838024139404297, "learning_rate": 5.941727367325703e-06, "loss": 0.1138, "step": 9240 }, { "epoch": 28.287461773700304, "grad_norm": 12.658930778503418, "learning_rate": 5.837669094693028e-06, "loss": 0.13, "step": 9250 }, { "epoch": 28.31804281345566, "grad_norm": 1.7371057271957397, "learning_rate": 5.733610822060355e-06, "loss": 0.1251, "step": 9260 }, { "epoch": 28.34862385321101, "grad_norm": 2.2298595905303955, "learning_rate": 5.62955254942768e-06, "loss": 0.0757, "step": 9270 }, { "epoch": 28.37920489296636, "grad_norm": 4.572708606719971, "learning_rate": 5.5254942767950055e-06, "loss": 0.1127, "step": 9280 }, { "epoch": 28.40978593272171, "grad_norm": 3.1214189529418945, "learning_rate": 5.421436004162331e-06, "loss": 0.1395, "step": 9290 }, { "epoch": 28.440366972477065, "grad_norm": 3.5031135082244873, "learning_rate": 5.317377731529656e-06, "loss": 0.1255, "step": 9300 }, { "epoch": 28.470948012232416, "grad_norm": 2.455829620361328, "learning_rate": 5.213319458896983e-06, "loss": 0.0986, "step": 9310 }, { "epoch": 28.501529051987767, "grad_norm": 2.4105734825134277, "learning_rate": 5.109261186264308e-06, "loss": 0.1033, "step": 9320 }, { "epoch": 28.53211009174312, "grad_norm": 5.415170669555664, "learning_rate": 5.005202913631634e-06, "loss": 0.106, "step": 9330 }, { "epoch": 28.56269113149847, "grad_norm": 1.5614656209945679, "learning_rate": 4.901144640998959e-06, "loss": 0.1119, "step": 9340 }, { "epoch": 28.593272171253822, "grad_norm": 11.242796897888184, "learning_rate": 4.797086368366286e-06, "loss": 0.1031, "step": 9350 }, { "epoch": 28.623853211009173, "grad_norm": 1.98354172706604, "learning_rate": 4.693028095733612e-06, "loss": 0.1192, "step": 9360 }, { "epoch": 28.654434250764528, "grad_norm": 3.658066749572754, "learning_rate": 4.588969823100937e-06, "loss": 0.08, "step": 9370 }, { "epoch": 28.68501529051988, "grad_norm": 2.7829060554504395, "learning_rate": 4.4849115504682625e-06, "loss": 0.0842, 
"step": 9380 }, { "epoch": 28.71559633027523, "grad_norm": 3.9607882499694824, "learning_rate": 4.3808532778355875e-06, "loss": 0.0955, "step": 9390 }, { "epoch": 28.74617737003058, "grad_norm": 5.205819606781006, "learning_rate": 4.276795005202914e-06, "loss": 0.1385, "step": 9400 }, { "epoch": 28.776758409785934, "grad_norm": 2.6670114994049072, "learning_rate": 4.172736732570239e-06, "loss": 0.1485, "step": 9410 }, { "epoch": 28.807339449541285, "grad_norm": 6.059555530548096, "learning_rate": 4.068678459937565e-06, "loss": 0.0743, "step": 9420 }, { "epoch": 28.837920489296636, "grad_norm": 2.8076841831207275, "learning_rate": 3.964620187304891e-06, "loss": 0.0958, "step": 9430 }, { "epoch": 28.868501529051986, "grad_norm": 6.3077616691589355, "learning_rate": 3.860561914672217e-06, "loss": 0.0761, "step": 9440 }, { "epoch": 28.89908256880734, "grad_norm": 7.713011264801025, "learning_rate": 3.7565036420395423e-06, "loss": 0.0894, "step": 9450 }, { "epoch": 28.92966360856269, "grad_norm": 2.3206980228424072, "learning_rate": 3.6524453694068677e-06, "loss": 0.1241, "step": 9460 }, { "epoch": 28.960244648318042, "grad_norm": 3.4398224353790283, "learning_rate": 3.5483870967741936e-06, "loss": 0.0933, "step": 9470 }, { "epoch": 28.990825688073393, "grad_norm": 7.903160572052002, "learning_rate": 3.44432882414152e-06, "loss": 0.1263, "step": 9480 }, { "epoch": 29.021406727828747, "grad_norm": 1.9907540082931519, "learning_rate": 3.340270551508845e-06, "loss": 0.0967, "step": 9490 }, { "epoch": 29.051987767584098, "grad_norm": 1.7297402620315552, "learning_rate": 3.236212278876171e-06, "loss": 0.0935, "step": 9500 }, { "epoch": 29.051987767584098, "eval_loss": 0.46150296926498413, "eval_runtime": 17.3886, "eval_samples_per_second": 16.735, "eval_steps_per_second": 8.396, "step": 9500 }, { "epoch": 29.08256880733945, "grad_norm": 4.011406421661377, "learning_rate": 3.1321540062434962e-06, "loss": 0.1097, "step": 9510 }, { "epoch": 29.1131498470948, "grad_norm": 
3.3061842918395996, "learning_rate": 3.0280957336108225e-06, "loss": 0.1327, "step": 9520 }, { "epoch": 29.143730886850154, "grad_norm": 11.67180061340332, "learning_rate": 2.924037460978148e-06, "loss": 0.1092, "step": 9530 }, { "epoch": 29.174311926605505, "grad_norm": 4.720720291137695, "learning_rate": 2.819979188345474e-06, "loss": 0.1067, "step": 9540 }, { "epoch": 29.204892966360855, "grad_norm": 3.8615598678588867, "learning_rate": 2.7159209157127993e-06, "loss": 0.0915, "step": 9550 }, { "epoch": 29.235474006116206, "grad_norm": 4.769123554229736, "learning_rate": 2.6118626430801247e-06, "loss": 0.1029, "step": 9560 }, { "epoch": 29.26605504587156, "grad_norm": 10.71865177154541, "learning_rate": 2.5078043704474506e-06, "loss": 0.1094, "step": 9570 }, { "epoch": 29.29663608562691, "grad_norm": 4.421198844909668, "learning_rate": 2.4037460978147764e-06, "loss": 0.1504, "step": 9580 }, { "epoch": 29.327217125382262, "grad_norm": 4.271670341491699, "learning_rate": 2.2996878251821023e-06, "loss": 0.1025, "step": 9590 }, { "epoch": 29.357798165137616, "grad_norm": 3.598496913909912, "learning_rate": 2.1956295525494278e-06, "loss": 0.1062, "step": 9600 }, { "epoch": 29.388379204892967, "grad_norm": 2.3889999389648438, "learning_rate": 2.0915712799167536e-06, "loss": 0.0964, "step": 9610 }, { "epoch": 29.418960244648318, "grad_norm": 4.520473957061768, "learning_rate": 1.987513007284079e-06, "loss": 0.1036, "step": 9620 }, { "epoch": 29.44954128440367, "grad_norm": 6.458251476287842, "learning_rate": 1.8834547346514047e-06, "loss": 0.129, "step": 9630 }, { "epoch": 29.480122324159023, "grad_norm": 2.3070294857025146, "learning_rate": 1.7793964620187304e-06, "loss": 0.1162, "step": 9640 }, { "epoch": 29.510703363914374, "grad_norm": 2.367161512374878, "learning_rate": 1.6753381893860565e-06, "loss": 0.104, "step": 9650 }, { "epoch": 29.541284403669724, "grad_norm": 6.095667839050293, "learning_rate": 1.5712799167533821e-06, "loss": 0.0903, "step": 9660 }, { 
"epoch": 29.571865443425075, "grad_norm": 10.448101043701172, "learning_rate": 1.4672216441207078e-06, "loss": 0.1061, "step": 9670 }, { "epoch": 29.60244648318043, "grad_norm": 3.1639466285705566, "learning_rate": 1.3631633714880334e-06, "loss": 0.1307, "step": 9680 }, { "epoch": 29.63302752293578, "grad_norm": 2.028094530105591, "learning_rate": 1.259105098855359e-06, "loss": 0.1, "step": 9690 }, { "epoch": 29.66360856269113, "grad_norm": 4.976256847381592, "learning_rate": 1.1550468262226847e-06, "loss": 0.0844, "step": 9700 }, { "epoch": 29.69418960244648, "grad_norm": 4.531723499298096, "learning_rate": 1.0509885535900104e-06, "loss": 0.1207, "step": 9710 }, { "epoch": 29.724770642201836, "grad_norm": 3.63309645652771, "learning_rate": 9.46930280957336e-07, "loss": 0.0812, "step": 9720 }, { "epoch": 29.755351681957187, "grad_norm": 14.414322853088379, "learning_rate": 8.428720083246619e-07, "loss": 0.1532, "step": 9730 }, { "epoch": 29.785932721712538, "grad_norm": 3.6692416667938232, "learning_rate": 7.388137356919876e-07, "loss": 0.1036, "step": 9740 }, { "epoch": 29.81651376146789, "grad_norm": 2.8107922077178955, "learning_rate": 6.347554630593132e-07, "loss": 0.0884, "step": 9750 }, { "epoch": 29.847094801223243, "grad_norm": 9.009915351867676, "learning_rate": 5.306971904266389e-07, "loss": 0.0975, "step": 9760 }, { "epoch": 29.877675840978593, "grad_norm": 4.969494819641113, "learning_rate": 4.2663891779396464e-07, "loss": 0.1096, "step": 9770 }, { "epoch": 29.908256880733944, "grad_norm": 3.147620916366577, "learning_rate": 3.2258064516129035e-07, "loss": 0.1088, "step": 9780 }, { "epoch": 29.938837920489295, "grad_norm": 12.04923152923584, "learning_rate": 2.1852237252861603e-07, "loss": 0.1043, "step": 9790 }, { "epoch": 29.96941896024465, "grad_norm": 4.238800048828125, "learning_rate": 1.1446409989594173e-07, "loss": 0.1298, "step": 9800 }, { "epoch": 30.0, "grad_norm": 3.123566150665283, "learning_rate": 1.040582726326743e-08, "loss": 0.099, 
"step": 9810 } ], "logging_steps": 10, "max_steps": 9810, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.5687795117499023e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }