{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 2820, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03546099290780142, "grad_norm": 12.458266258239746, "learning_rate": 6.3829787234042555e-06, "loss": 1.452, "step": 10 }, { "epoch": 0.07092198581560284, "grad_norm": 2.127192974090576, "learning_rate": 1.347517730496454e-05, "loss": 0.5036, "step": 20 }, { "epoch": 0.10638297872340426, "grad_norm": 1.6429086923599243, "learning_rate": 2.0567375886524822e-05, "loss": 0.3388, "step": 30 }, { "epoch": 0.14184397163120568, "grad_norm": 1.6117417812347412, "learning_rate": 2.765957446808511e-05, "loss": 0.3197, "step": 40 }, { "epoch": 0.1773049645390071, "grad_norm": 1.5715168714523315, "learning_rate": 3.4751773049645395e-05, "loss": 0.244, "step": 50 }, { "epoch": 0.2127659574468085, "grad_norm": 1.6177239418029785, "learning_rate": 4.1843971631205674e-05, "loss": 0.2038, "step": 60 }, { "epoch": 0.24822695035460993, "grad_norm": 1.159698724746704, "learning_rate": 4.893617021276596e-05, "loss": 0.1898, "step": 70 }, { "epoch": 0.28368794326241137, "grad_norm": 1.3677239418029785, "learning_rate": 5.602836879432625e-05, "loss": 0.1535, "step": 80 }, { "epoch": 0.3191489361702128, "grad_norm": 0.9369192719459534, "learning_rate": 6.312056737588653e-05, "loss": 0.148, "step": 90 }, { "epoch": 0.3546099290780142, "grad_norm": 1.7364780902862549, "learning_rate": 7.021276595744681e-05, "loss": 0.1408, "step": 100 }, { "epoch": 0.3900709219858156, "grad_norm": 1.2476744651794434, "learning_rate": 7.73049645390071e-05, "loss": 0.1224, "step": 110 }, { "epoch": 0.425531914893617, "grad_norm": 1.190326452255249, "learning_rate": 8.439716312056739e-05, "loss": 0.1166, "step": 120 }, { "epoch": 0.46099290780141844, "grad_norm": 1.6969983577728271, "learning_rate": 9.148936170212766e-05, "loss": 0.1248, "step": 130 }, { "epoch": 0.49645390070921985, "grad_norm": 2.593116044998169, "learning_rate": 9.858156028368794e-05, "loss": 0.1202, "step": 140 }, { "epoch": 0.5319148936170213, "grad_norm": 1.28589928150177, "learning_rate": 9.999779975463078e-05, "loss": 0.1123, "step": 150 }, { "epoch": 0.5673758865248227, "grad_norm": 0.9029638171195984, "learning_rate": 9.9988861589696e-05, "loss": 0.1015, "step": 160 }, { "epoch": 0.6028368794326241, "grad_norm": 1.8789817094802856, "learning_rate": 9.99730492180442e-05, "loss": 0.1087, "step": 170 }, { "epoch": 0.6382978723404256, "grad_norm": 1.440181851387024, "learning_rate": 9.995036481411004e-05, "loss": 0.128, "step": 180 }, { "epoch": 0.6737588652482269, "grad_norm": 0.8681181073188782, "learning_rate": 9.992081149733404e-05, "loss": 0.1248, "step": 190 }, { "epoch": 0.7092198581560284, "grad_norm": 0.7192113399505615, "learning_rate": 9.988439333173374e-05, "loss": 0.1062, "step": 200 }, { "epoch": 0.7446808510638298, "grad_norm": 1.1343145370483398, "learning_rate": 9.98411153253447e-05, "loss": 0.1067, "step": 210 }, { "epoch": 0.7801418439716312, "grad_norm": 0.8589990139007568, "learning_rate": 9.979098342953199e-05, "loss": 0.1008, "step": 220 }, { "epoch": 0.8156028368794326, "grad_norm": 0.9460039734840393, "learning_rate": 9.97340045381716e-05, "loss": 0.0987, "step": 230 }, { "epoch": 0.851063829787234, "grad_norm": 1.0302399396896362, "learning_rate": 9.967018648670259e-05, "loss": 0.101, "step": 240 }, { "epoch": 0.8865248226950354, "grad_norm": 1.2364697456359863, "learning_rate": 9.959953805104952e-05, "loss": 0.0961, "step": 250 }, { "epoch": 0.9219858156028369, "grad_norm": 1.174009919166565, "learning_rate": 9.952206894641565e-05, "loss": 0.0933, "step": 260 }, { "epoch": 0.9574468085106383, "grad_norm": 1.2122607231140137, "learning_rate": 9.943778982594694e-05, "loss": 0.0892, "step": 270 }, { "epoch": 0.9929078014184397, "grad_norm": 1.4150590896606445, "learning_rate": 9.934671227926712e-05, "loss": 0.0856, "step": 280 }, { "epoch": 1.0283687943262412, "grad_norm": 0.9654158353805542, "learning_rate": 9.924884883088391e-05, "loss": 0.0918, "step": 290 }, { "epoch": 1.0638297872340425, "grad_norm": 1.0064071416854858, "learning_rate": 9.914421293846674e-05, "loss": 0.0861, "step": 300 }, { "epoch": 1.099290780141844, "grad_norm": 1.430910348892212, "learning_rate": 9.90328189909961e-05, "loss": 0.085, "step": 310 }, { "epoch": 1.1347517730496455, "grad_norm": 1.0088697671890259, "learning_rate": 9.891468230678487e-05, "loss": 0.0927, "step": 320 }, { "epoch": 1.1702127659574468, "grad_norm": 0.7953307032585144, "learning_rate": 9.878981913137179e-05, "loss": 0.0962, "step": 330 }, { "epoch": 1.2056737588652482, "grad_norm": 0.7745844721794128, "learning_rate": 9.86582466352875e-05, "loss": 0.0806, "step": 340 }, { "epoch": 1.2411347517730495, "grad_norm": 0.39872488379478455, "learning_rate": 9.85199829116933e-05, "loss": 0.0751, "step": 350 }, { "epoch": 1.2765957446808511, "grad_norm": 0.9581345915794373, "learning_rate": 9.837504697389311e-05, "loss": 0.0839, "step": 360 }, { "epoch": 1.3120567375886525, "grad_norm": 0.9904010891914368, "learning_rate": 9.822345875271883e-05, "loss": 0.0747, "step": 370 }, { "epoch": 1.3475177304964538, "grad_norm": 1.0076277256011963, "learning_rate": 9.806523909378956e-05, "loss": 0.072, "step": 380 }, { "epoch": 1.3829787234042552, "grad_norm": 1.1363801956176758, "learning_rate": 9.790040975464503e-05, "loss": 0.09, "step": 390 }, { "epoch": 1.4184397163120568, "grad_norm": 0.936143159866333, "learning_rate": 9.772899340175361e-05, "loss": 0.0956, "step": 400 }, { "epoch": 1.4539007092198581, "grad_norm": 0.5812869071960449, "learning_rate": 9.755101360739536e-05, "loss": 0.09, "step": 410 }, { "epoch": 1.4893617021276595, "grad_norm": 1.0588104724884033, "learning_rate": 9.736649484642043e-05, "loss": 0.0771, "step": 420 }, { "epoch": 1.524822695035461, "grad_norm": 1.1113085746765137, "learning_rate": 9.71754624928835e-05, "loss": 0.0681, "step": 430 }, { "epoch": 1.5602836879432624, "grad_norm": 0.7930032014846802, "learning_rate": 9.697794281655438e-05, "loss": 0.0752, "step": 440 }, { "epoch": 1.5957446808510638, "grad_norm": 0.8521007895469666, "learning_rate": 9.67739629793056e-05, "loss": 0.0754, "step": 450 }, { "epoch": 1.6312056737588652, "grad_norm": 0.7623393535614014, "learning_rate": 9.65635510313772e-05, "loss": 0.0858, "step": 460 }, { "epoch": 1.6666666666666665, "grad_norm": 1.1308192014694214, "learning_rate": 9.634673590751944e-05, "loss": 0.0819, "step": 470 }, { "epoch": 1.702127659574468, "grad_norm": 0.8282614350318909, "learning_rate": 9.612354742301382e-05, "loss": 0.0769, "step": 480 }, { "epoch": 1.7375886524822695, "grad_norm": 0.8726593852043152, "learning_rate": 9.589401626957308e-05, "loss": 0.0654, "step": 490 }, { "epoch": 1.773049645390071, "grad_norm": 0.6696294546127319, "learning_rate": 9.56581740111206e-05, "loss": 0.062, "step": 500 }, { "epoch": 1.8085106382978724, "grad_norm": 1.136682391166687, "learning_rate": 9.54160530794499e-05, "loss": 0.0733, "step": 510 }, { "epoch": 1.8439716312056738, "grad_norm": 0.6757348775863647, "learning_rate": 9.516768676976475e-05, "loss": 0.0786, "step": 520 }, { "epoch": 1.8794326241134751, "grad_norm": 0.6466293931007385, "learning_rate": 9.49131092361007e-05, "loss": 0.0707, "step": 530 }, { "epoch": 1.9148936170212765, "grad_norm": 0.49455875158309937, "learning_rate": 9.46523554866283e-05, "loss": 0.0719, "step": 540 }, { "epoch": 1.950354609929078, "grad_norm": 0.5616086721420288, "learning_rate": 9.438546137883897e-05, "loss": 0.0753, "step": 550 }, { "epoch": 1.9858156028368794, "grad_norm": 0.5382352471351624, "learning_rate": 9.41124636146141e-05, "loss": 0.0722, "step": 560 }, { "epoch": 2.021276595744681, "grad_norm": 0.5831167101860046, "learning_rate": 9.383339973517795e-05, "loss": 0.0628, "step": 570 }, { "epoch": 2.0567375886524824, "grad_norm": 0.6010406613349915, "learning_rate": 9.354830811593527e-05, "loss": 0.0643, "step": 580 }, { "epoch": 2.0921985815602837, "grad_norm": 0.6855683922767639, "learning_rate": 9.325722796119397e-05, "loss": 0.0734, "step": 590 }, { "epoch": 2.127659574468085, "grad_norm": 0.5798056125640869, "learning_rate": 9.29601992987741e-05, "loss": 0.0683, "step": 600 }, { "epoch": 2.1631205673758864, "grad_norm": 0.5442080497741699, "learning_rate": 9.265726297450333e-05, "loss": 0.0683, "step": 610 }, { "epoch": 2.198581560283688, "grad_norm": 0.5252788662910461, "learning_rate": 9.234846064660009e-05, "loss": 0.0776, "step": 620 }, { "epoch": 2.2340425531914896, "grad_norm": 0.7919511198997498, "learning_rate": 9.203383477994494e-05, "loss": 0.0611, "step": 630 }, { "epoch": 2.269503546099291, "grad_norm": 0.7639462947845459, "learning_rate": 9.171342864024101e-05, "loss": 0.0588, "step": 640 }, { "epoch": 2.3049645390070923, "grad_norm": 0.9514329433441162, "learning_rate": 9.138728628806439e-05, "loss": 0.0725, "step": 650 }, { "epoch": 2.3404255319148937, "grad_norm": 0.8672018051147461, "learning_rate": 9.105545257280501e-05, "loss": 0.0568, "step": 660 }, { "epoch": 2.375886524822695, "grad_norm": 0.7574223279953003, "learning_rate": 9.071797312649935e-05, "loss": 0.0752, "step": 670 }, { "epoch": 2.4113475177304964, "grad_norm": 1.0806403160095215, "learning_rate": 9.037489435755524e-05, "loss": 0.0682, "step": 680 }, { "epoch": 2.4468085106382977, "grad_norm": 0.8450857400894165, "learning_rate": 9.00262634443701e-05, "loss": 0.0669, "step": 690 }, { "epoch": 2.482269503546099, "grad_norm": 1.4907677173614502, "learning_rate": 8.96721283288432e-05, "loss": 0.0754, "step": 700 }, { "epoch": 2.5177304964539005, "grad_norm": 1.3826724290847778, "learning_rate": 8.93125377097828e-05, "loss": 0.07, "step": 710 }, { "epoch": 2.5531914893617023, "grad_norm": 1.0240296125411987, "learning_rate": 8.894754103620964e-05, "loss": 0.077, "step": 720 }, { "epoch": 2.5886524822695036, "grad_norm": 0.8244665265083313, "learning_rate": 8.857718850055662e-05, "loss": 0.0731, "step": 730 }, { "epoch": 2.624113475177305, "grad_norm": 0.5112609267234802, "learning_rate": 8.820153103176693e-05, "loss": 0.0737, "step": 740 }, { "epoch": 2.6595744680851063, "grad_norm": 0.6853227019309998, "learning_rate": 8.782062028829028e-05, "loss": 0.077, "step": 750 }, { "epoch": 2.6950354609929077, "grad_norm": 0.6311427354812622, "learning_rate": 8.743450865097929e-05, "loss": 0.0635, "step": 760 }, { "epoch": 2.7304964539007095, "grad_norm": 0.5705634355545044, "learning_rate": 8.704324921588632e-05, "loss": 0.0699, "step": 770 }, { "epoch": 2.7659574468085104, "grad_norm": 1.1000028848648071, "learning_rate": 8.664689578696188e-05, "loss": 0.0761, "step": 780 }, { "epoch": 2.801418439716312, "grad_norm": 0.7304648160934448, "learning_rate": 8.624550286865591e-05, "loss": 0.0679, "step": 790 }, { "epoch": 2.8368794326241136, "grad_norm": 0.6147809028625488, "learning_rate": 8.583912565842257e-05, "loss": 0.0675, "step": 800 }, { "epoch": 2.872340425531915, "grad_norm": 0.5900303721427917, "learning_rate": 8.542782003912972e-05, "loss": 0.0649, "step": 810 }, { "epoch": 2.9078014184397163, "grad_norm": 0.5605118274688721, "learning_rate": 8.50116425713743e-05, "loss": 0.0701, "step": 820 }, { "epoch": 2.9432624113475176, "grad_norm": 0.691639244556427, "learning_rate": 8.459065048570434e-05, "loss": 0.0679, "step": 830 }, { "epoch": 2.978723404255319, "grad_norm": 0.6846038103103638, "learning_rate": 8.416490167474895e-05, "loss": 0.0693, "step": 840 }, { "epoch": 3.0141843971631204, "grad_norm": 0.5968617796897888, "learning_rate": 8.373445468525719e-05, "loss": 0.0575, "step": 850 }, { "epoch": 3.049645390070922, "grad_norm": 1.1188421249389648, "learning_rate": 8.329936871004703e-05, "loss": 0.0715, "step": 860 }, { "epoch": 3.0851063829787235, "grad_norm": 0.888618528842926, "learning_rate": 8.285970357986559e-05, "loss": 0.057, "step": 870 }, { "epoch": 3.120567375886525, "grad_norm": 0.8040549159049988, "learning_rate": 8.241551975516133e-05, "loss": 0.068, "step": 880 }, { "epoch": 3.1560283687943262, "grad_norm": 0.8645780086517334, "learning_rate": 8.196687831776996e-05, "loss": 0.0666, "step": 890 }, { "epoch": 3.1914893617021276, "grad_norm": 0.9035763144493103, "learning_rate": 8.15138409625148e-05, "loss": 0.0616, "step": 900 }, { "epoch": 3.226950354609929, "grad_norm": 0.6809470057487488, "learning_rate": 8.105646998872274e-05, "loss": 0.0608, "step": 910 }, { "epoch": 3.2624113475177303, "grad_norm": 0.4224311113357544, "learning_rate": 8.059482829165728e-05, "loss": 0.0547, "step": 920 }, { "epoch": 3.297872340425532, "grad_norm": 0.5534777045249939, "learning_rate": 8.012897935386937e-05, "loss": 0.0578, "step": 930 }, { "epoch": 3.3333333333333335, "grad_norm": 0.6809782981872559, "learning_rate": 7.965898723646776e-05, "loss": 0.0537, "step": 940 }, { "epoch": 3.368794326241135, "grad_norm": 0.6213827729225159, "learning_rate": 7.918491657030955e-05, "loss": 0.0574, "step": 950 }, { "epoch": 3.404255319148936, "grad_norm": 1.0936675071716309, "learning_rate": 7.870683254711255e-05, "loss": 0.0655, "step": 960 }, { "epoch": 3.4397163120567376, "grad_norm": 0.879903256893158, "learning_rate": 7.822480091049041e-05, "loss": 0.0655, "step": 970 }, { "epoch": 3.475177304964539, "grad_norm": 0.576164722442627, "learning_rate": 7.773888794691191e-05, "loss": 0.0582, "step": 980 }, { "epoch": 3.5106382978723403, "grad_norm": 0.7445582747459412, "learning_rate": 7.724916047658568e-05, "loss": 0.0636, "step": 990 }, { "epoch": 3.546099290780142, "grad_norm": 0.6598033905029297, "learning_rate": 7.675568584427125e-05, "loss": 0.0642, "step": 1000 }, { "epoch": 3.581560283687943, "grad_norm": 0.8604206442832947, "learning_rate": 7.625853191001838e-05, "loss": 0.0665, "step": 1010 }, { "epoch": 3.617021276595745, "grad_norm": 0.7154328227043152, "learning_rate": 7.575776703983508e-05, "loss": 0.0557, "step": 1020 }, { "epoch": 3.652482269503546, "grad_norm": 0.49133405089378357, "learning_rate": 7.525346009628646e-05, "loss": 0.0551, "step": 1030 }, { "epoch": 3.6879432624113475, "grad_norm": 0.8421124815940857, "learning_rate": 7.474568042902496e-05, "loss": 0.0612, "step": 1040 }, { "epoch": 3.723404255319149, "grad_norm": 0.7970739603042603, "learning_rate": 7.42344978652539e-05, "loss": 0.0598, "step": 1050 }, { "epoch": 3.7588652482269502, "grad_norm": 0.8975237011909485, "learning_rate": 7.371998270012504e-05, "loss": 0.0574, "step": 1060 }, { "epoch": 3.794326241134752, "grad_norm": 0.7334581613540649, "learning_rate": 7.320220568707206e-05, "loss": 0.0571, "step": 1070 }, { "epoch": 3.829787234042553, "grad_norm": 0.3972456753253937, "learning_rate": 7.268123802808096e-05, "loss": 0.0564, "step": 1080 }, { "epoch": 3.8652482269503547, "grad_norm": 0.7282454967498779, "learning_rate": 7.215715136389862e-05, "loss": 0.0537, "step": 1090 }, { "epoch": 3.900709219858156, "grad_norm": 0.5621278285980225, "learning_rate": 7.163001776418122e-05, "loss": 0.0628, "step": 1100 }, { "epoch": 3.9361702127659575, "grad_norm": 0.4074684977531433, "learning_rate": 7.109990971758354e-05, "loss": 0.05, "step": 1110 }, { "epoch": 3.971631205673759, "grad_norm": 0.9387255311012268, "learning_rate": 7.056690012179077e-05, "loss": 0.0617, "step": 1120 }, { "epoch": 4.00709219858156, "grad_norm": 0.8795057535171509, "learning_rate": 7.003106227349399e-05, "loss": 0.0555, "step": 1130 }, { "epoch": 4.042553191489362, "grad_norm": 0.6610236763954163, "learning_rate": 6.949246985831068e-05, "loss": 0.0526, "step": 1140 }, { "epoch": 4.078014184397163, "grad_norm": 0.519904613494873, "learning_rate": 6.895119694065204e-05, "loss": 0.0508, "step": 1150 }, { "epoch": 4.113475177304965, "grad_norm": 0.7559794187545776, "learning_rate": 6.840731795353787e-05, "loss": 0.0589, "step": 1160 }, { "epoch": 4.148936170212766, "grad_norm": 0.5495615601539612, "learning_rate": 6.786090768836108e-05, "loss": 0.054, "step": 1170 }, { "epoch": 4.184397163120567, "grad_norm": 0.38352513313293457, "learning_rate": 6.731204128460264e-05, "loss": 0.0496, "step": 1180 }, { "epoch": 4.219858156028369, "grad_norm": 2.50418758392334, "learning_rate": 6.676079421949887e-05, "loss": 0.0542, "step": 1190 }, { "epoch": 4.25531914893617, "grad_norm": 0.6766555905342102, "learning_rate": 6.620724229766218e-05, "loss": 0.0512, "step": 1200 }, { "epoch": 4.290780141843972, "grad_norm": 0.6073281764984131, "learning_rate": 6.565146164065683e-05, "loss": 0.0515, "step": 1210 }, { "epoch": 4.326241134751773, "grad_norm": 0.5780677795410156, "learning_rate": 6.509352867653106e-05, "loss": 0.0601, "step": 1220 }, { "epoch": 4.361702127659575, "grad_norm": 1.2549879550933838, "learning_rate": 6.453352012930712e-05, "loss": 0.0538, "step": 1230 }, { "epoch": 4.397163120567376, "grad_norm": 1.9051744937896729, "learning_rate": 6.397151300843065e-05, "loss": 0.0544, "step": 1240 }, { "epoch": 4.432624113475177, "grad_norm": 0.48807811737060547, "learning_rate": 6.340758459818059e-05, "loss": 0.0514, "step": 1250 }, { "epoch": 4.468085106382979, "grad_norm": 0.7081537246704102, "learning_rate": 6.284181244704161e-05, "loss": 0.0595, "step": 1260 }, { "epoch": 4.50354609929078, "grad_norm": 0.6502603888511658, "learning_rate": 6.227427435703997e-05, "loss": 0.0629, "step": 1270 }, { "epoch": 4.539007092198582, "grad_norm": 0.9540507197380066, "learning_rate": 6.170504837304458e-05, "loss": 0.045, "step": 1280 }, { "epoch": 4.574468085106383, "grad_norm": 0.5669839382171631, "learning_rate": 6.11342127720347e-05, "loss": 0.059, "step": 1290 }, { "epoch": 4.609929078014185, "grad_norm": 0.7695325016975403, "learning_rate": 6.056184605233576e-05, "loss": 0.0558, "step": 1300 }, { "epoch": 4.6453900709219855, "grad_norm": 0.6503219604492188, "learning_rate": 5.9988026922824534e-05, "loss": 0.0528, "step": 1310 }, { "epoch": 4.680851063829787, "grad_norm": 0.8653897643089294, "learning_rate": 5.9412834292105676e-05, "loss": 0.0521, "step": 1320 }, { "epoch": 4.716312056737589, "grad_norm": 1.1591063737869263, "learning_rate": 5.883634725766048e-05, "loss": 0.0568, "step": 1330 }, { "epoch": 4.75177304964539, "grad_norm": 0.5562894940376282, "learning_rate": 5.825864509496991e-05, "loss": 0.0515, "step": 1340 }, { "epoch": 4.787234042553192, "grad_norm": 0.6316646337509155, "learning_rate": 5.767980724661295e-05, "loss": 0.045, "step": 1350 }, { "epoch": 4.822695035460993, "grad_norm": 0.8003104329109192, "learning_rate": 5.7099913311342234e-05, "loss": 0.0457, "step": 1360 }, { "epoch": 4.858156028368795, "grad_norm": 0.4407203793525696, "learning_rate": 5.651904303313784e-05, "loss": 0.055, "step": 1370 }, { "epoch": 4.8936170212765955, "grad_norm": 0.5941787362098694, "learning_rate": 5.593727629024148e-05, "loss": 0.0476, "step": 1380 }, { "epoch": 4.929078014184397, "grad_norm": 0.4574081599712372, "learning_rate": 5.535469308417198e-05, "loss": 0.0435, "step": 1390 }, { "epoch": 4.964539007092198, "grad_norm": 0.7371949553489685, "learning_rate": 5.4771373528723924e-05, "loss": 0.0576, "step": 1400 }, { "epoch": 5.0, "grad_norm": 0.4565671682357788, "learning_rate": 5.418739783895078e-05, "loss": 0.0415, "step": 1410 }, { "epoch": 5.035460992907802, "grad_norm": 0.6150527000427246, "learning_rate": 5.360284632013421e-05, "loss": 0.0435, "step": 1420 }, { "epoch": 5.070921985815603, "grad_norm": 1.273917317390442, "learning_rate": 5.301779935674087e-05, "loss": 0.0475, "step": 1430 }, { "epoch": 5.1063829787234045, "grad_norm": 0.5376979112625122, "learning_rate": 5.243233740136833e-05, "loss": 0.0476, "step": 1440 }, { "epoch": 5.141843971631205, "grad_norm": 0.4023731052875519, "learning_rate": 5.1846540963681725e-05, "loss": 0.0441, "step": 1450 }, { "epoch": 5.177304964539007, "grad_norm": 0.630713939666748, "learning_rate": 5.1260490599342395e-05, "loss": 0.0454, "step": 1460 }, { "epoch": 5.212765957446808, "grad_norm": 0.9711371660232544, "learning_rate": 5.067426689893042e-05, "loss": 0.0485, "step": 1470 }, { "epoch": 5.24822695035461, "grad_norm": 0.5721754431724548, "learning_rate": 5.00879504768621e-05, "loss": 0.0482, "step": 1480 }, { "epoch": 5.283687943262412, "grad_norm": 1.25078547000885, "learning_rate": 4.9501621960304323e-05, "loss": 0.0495, "step": 1490 }, { "epoch": 5.319148936170213, "grad_norm": 0.5160683989524841, "learning_rate": 4.891536197808719e-05, "loss": 0.0433, "step": 1500 }, { "epoch": 5.3546099290780145, "grad_norm": 0.5063398480415344, "learning_rate": 4.832925114961629e-05, "loss": 0.0512, "step": 1510 }, { "epoch": 5.390070921985815, "grad_norm": 0.5604034066200256, "learning_rate": 4.774337007378633e-05, "loss": 0.042, "step": 1520 }, { "epoch": 5.425531914893617, "grad_norm": 0.5415745377540588, "learning_rate": 4.715779931789776e-05, "loss": 0.0451, "step": 1530 }, { "epoch": 5.460992907801418, "grad_norm": 0.43733975291252136, "learning_rate": 4.657261940657732e-05, "loss": 0.0512, "step": 1540 }, { "epoch": 5.49645390070922, "grad_norm": 0.39489609003067017, "learning_rate": 4.5987910810704925e-05, "loss": 0.0408, "step": 1550 }, { "epoch": 5.531914893617021, "grad_norm": 0.5629368424415588, "learning_rate": 4.540375393634762e-05, "loss": 0.0425, "step": 1560 }, { "epoch": 5.567375886524823, "grad_norm": 0.764885425567627, "learning_rate": 4.48202291137026e-05, "loss": 0.0531, "step": 1570 }, { "epoch": 5.602836879432624, "grad_norm": 0.6770176291465759, "learning_rate": 4.423741658605066e-05, "loss": 0.0433, "step": 1580 }, { "epoch": 5.638297872340425, "grad_norm": 0.8399835228919983, "learning_rate": 4.365539649872146e-05, "loss": 0.0353, "step": 1590 }, { "epoch": 5.673758865248227, "grad_norm": 0.8981763124465942, "learning_rate": 4.307424888807242e-05, "loss": 0.0488, "step": 1600 }, { "epoch": 5.709219858156028, "grad_norm": 0.8662058711051941, "learning_rate": 4.249405367048254e-05, "loss": 0.0495, "step": 1610 }, { "epoch": 5.74468085106383, "grad_norm": 0.6840683221817017, "learning_rate": 4.191489063136274e-05, "loss": 0.0502, "step": 1620 }, { "epoch": 5.780141843971631, "grad_norm": 0.511677622795105, "learning_rate": 4.133683941418411e-05, "loss": 0.0419, "step": 1630 }, { "epoch": 5.815602836879433, "grad_norm": 1.7016257047653198, "learning_rate": 4.0759979509525824e-05, "loss": 0.0469, "step": 1640 }, { "epoch": 5.851063829787234, "grad_norm": 0.4668084681034088, "learning_rate": 4.018439024414398e-05, "loss": 0.049, "step": 1650 }, { "epoch": 5.886524822695035, "grad_norm": 0.8224870562553406, "learning_rate": 3.9610150770063015e-05, "loss": 0.044, "step": 1660 }, { "epoch": 5.921985815602837, "grad_norm": 0.41005393862724304, "learning_rate": 3.903734005369115e-05, "loss": 0.0443, "step": 1670 }, { "epoch": 5.957446808510638, "grad_norm": 0.3030180335044861, "learning_rate": 3.8466036864961314e-05, "loss": 0.0381, "step": 1680 }, { "epoch": 5.99290780141844, "grad_norm": 1.0716583728790283, "learning_rate": 3.789631976649907e-05, "loss": 0.0426, "step": 1690 }, { "epoch": 6.028368794326241, "grad_norm": 0.5316461324691772, "learning_rate": 3.7328267102819225e-05, "loss": 0.0535, "step": 1700 }, { "epoch": 6.0638297872340425, "grad_norm": 0.3643524944782257, "learning_rate": 3.676195698955214e-05, "loss": 0.0413, "step": 1710 }, { "epoch": 6.099290780141844, "grad_norm": 0.4204830527305603, "learning_rate": 3.619746730270185e-05, "loss": 0.0381, "step": 1720 }, { "epoch": 6.134751773049645, "grad_norm": 0.42800143361091614, "learning_rate": 3.56348756679368e-05, "loss": 0.0377, "step": 1730 }, { "epoch": 6.170212765957447, "grad_norm": 0.446856290102005, "learning_rate": 3.5074259449915284e-05, "loss": 0.0441, "step": 1740 }, { "epoch": 6.205673758865248, "grad_norm": 0.5602040886878967, "learning_rate": 3.4515695741646584e-05, "loss": 0.0489, "step": 1750 }, { "epoch": 6.24113475177305, "grad_norm": 0.8359407782554626, "learning_rate": 3.3959261353889605e-05, "loss": 0.0471, "step": 1760 }, { "epoch": 6.276595744680851, "grad_norm": 0.41129371523857117, "learning_rate": 3.3405032804590244e-05, "loss": 0.0411, "step": 1770 }, { "epoch": 6.3120567375886525, "grad_norm": 0.5542031526565552, "learning_rate": 3.2853086308359025e-05, "loss": 0.0419, "step": 1780 }, { "epoch": 6.347517730496454, "grad_norm": 0.3114689886569977, "learning_rate": 3.230349776599044e-05, "loss": 0.0399, "step": 1790 }, { "epoch": 6.382978723404255, "grad_norm": 0.5866858959197998, "learning_rate": 3.1756342754025546e-05, "loss": 0.0392, "step": 1800 }, { "epoch": 6.418439716312057, "grad_norm": 0.7895340323448181, "learning_rate": 3.121169651435903e-05, "loss": 0.0364, "step": 1810 }, { "epoch": 6.453900709219858, "grad_norm": 0.6045717597007751, "learning_rate": 3.0669633943892295e-05, "loss": 0.0377, "step": 1820 }, { "epoch": 6.48936170212766, "grad_norm": 0.505060076713562, "learning_rate": 3.0130229584234115e-05, "loss": 0.0382, "step": 1830 }, { "epoch": 6.524822695035461, "grad_norm": 0.5746676325798035, "learning_rate": 2.9593557611450008e-05, "loss": 0.0332, "step": 1840 }, { "epoch": 6.560283687943262, "grad_norm": 0.3892854154109955, "learning_rate": 2.9059691825861922e-05, "loss": 0.0374, "step": 1850 }, { "epoch": 6.595744680851064, "grad_norm": 2.366185188293457, "learning_rate": 2.8528705641899667e-05, "loss": 0.0448, "step": 1860 }, { "epoch": 6.631205673758865, "grad_norm": 0.6133996248245239, "learning_rate": 2.8000672078005274e-05, "loss": 0.04, "step": 1870 }, { "epoch": 6.666666666666667, "grad_norm": 0.45607250928878784, "learning_rate": 2.7475663746591906e-05, "loss": 0.0465, "step": 1880 }, { "epoch": 6.702127659574468, "grad_norm": 0.6318036317825317, "learning_rate": 2.69537528440586e-05, "loss": 0.0382, "step": 1890 }, { "epoch": 6.73758865248227, "grad_norm": 0.409967839717865, "learning_rate": 2.6435011140862166e-05, "loss": 0.045, "step": 1900 }, { "epoch": 6.773049645390071, "grad_norm": 0.6712722778320312, "learning_rate": 2.59195099716477e-05, "loss": 0.0367, "step": 1910 }, { "epoch": 6.808510638297872, "grad_norm": 0.8447416424751282, "learning_rate": 2.5407320225439047e-05, "loss": 0.0379, "step": 1920 }, { "epoch": 6.843971631205674, "grad_norm": 1.3314627408981323, "learning_rate": 2.4898512335890422e-05, "loss": 0.0409, "step": 1930 }, { "epoch": 6.879432624113475, "grad_norm": 0.5814293026924133, "learning_rate": 2.439315627160085e-05, "loss": 0.0471, "step": 1940 }, { "epoch": 6.914893617021277, "grad_norm": 0.7060461640357971, "learning_rate": 2.3891321526492427e-05, "loss": 0.0429, "step": 1950 }, { "epoch": 6.950354609929078, "grad_norm": 0.8079804182052612, "learning_rate": 2.3393077110253835e-05, "loss": 0.0392, "step": 1960 }, { "epoch": 6.98581560283688, "grad_norm": 0.5584299564361572, "learning_rate": 2.2898491538850475e-05, "loss": 0.0432, "step": 1970 }, { "epoch": 7.0212765957446805, "grad_norm": 0.6511906385421753, "learning_rate": 2.2407632825102603e-05, "loss": 0.0376, "step": 1980 }, { "epoch": 7.056737588652482, "grad_norm": 0.30886492133140564, "learning_rate": 2.192056846933246e-05, "loss": 0.0338, "step": 1990 }, { "epoch": 7.092198581560283, "grad_norm": 0.432918518781662, "learning_rate": 2.1437365450082114e-05, "loss": 0.0436, "step": 2000 }, { "epoch": 7.127659574468085, "grad_norm": 0.3994809687137604, "learning_rate": 2.0958090214902733e-05, "loss": 0.0308, "step": 2010 }, { "epoch": 7.163120567375887, "grad_norm": 0.43582919239997864, "learning_rate": 2.0482808671217217e-05, "loss": 0.0375, "step": 2020 }, { "epoch": 7.198581560283688, "grad_norm": 0.5949472784996033, "learning_rate": 2.001158617725692e-05, "loss": 0.0387, "step": 2030 }, { "epoch": 7.23404255319149, "grad_norm": 0.3335230350494385, "learning_rate": 1.9544487533073887e-05, "loss": 0.0348, "step": 2040 }, { "epoch": 7.2695035460992905, "grad_norm": 0.6592429280281067, "learning_rate": 1.9081576971629927e-05, "loss": 0.0419, "step": 2050 }, { "epoch": 7.304964539007092, "grad_norm": 1.4891948699951172, "learning_rate": 1.8622918149963626e-05, "loss": 0.0347, "step": 2060 }, { "epoch": 7.340425531914893, "grad_norm": 0.45448461174964905, "learning_rate": 1.816857414043655e-05, "loss": 0.0367, "step": 2070 }, { "epoch": 7.375886524822695, "grad_norm": 0.38540035486221313, "learning_rate": 1.771860742205988e-05, "loss": 0.0319, "step": 2080 }, { "epoch": 7.411347517730497, "grad_norm": 1.0512962341308594, "learning_rate": 1.727307987190262e-05, "loss": 0.0425, "step": 2090 }, { "epoch": 7.446808510638298, "grad_norm": 0.8107235431671143, "learning_rate": 1.6832052756582584e-05, "loss": 0.038, "step": 2100 }, { "epoch": 7.4822695035460995, "grad_norm": 0.6066608428955078, "learning_rate": 1.6395586723841327e-05, "loss": 0.0326, "step": 2110 }, { "epoch": 7.5177304964539005, "grad_norm": 0.32806602120399475, "learning_rate": 1.5963741794204207e-05, "loss": 0.0328, "step": 2120 }, { "epoch": 7.553191489361702, "grad_norm": 0.5049278140068054, "learning_rate": 1.5536577352726607e-05, "loss": 0.0431, "step": 2130 }, { "epoch": 7.588652482269503, "grad_norm": 0.5972107648849487, "learning_rate": 1.5114152140827742e-05, "loss": 0.0339, "step": 2140 }, { "epoch": 7.624113475177305, "grad_norm": 0.5267333984375, "learning_rate": 1.4696524248212746e-05, "loss": 0.034, "step": 2150 }, { "epoch": 7.659574468085106, "grad_norm": 0.45704758167266846, "learning_rate": 1.4283751104884446e-05, "loss": 0.03, "step": 2160 }, { "epoch": 7.695035460992908, "grad_norm": 0.25589045882225037, "learning_rate": 1.3875889473245995e-05, "loss": 0.0324, "step": 2170 }, { "epoch": 7.7304964539007095, "grad_norm": 0.3722244203090668, "learning_rate": 1.3472995440295182e-05, "loss": 0.0309, "step": 2180 }, { "epoch": 7.76595744680851, "grad_norm": 0.4319707155227661, "learning_rate": 1.3075124409911581e-05, "loss": 0.0354, "step": 2190 }, { "epoch": 7.801418439716312, "grad_norm": 0.5372457504272461, "learning_rate": 1.2682331095237698e-05, "loss": 0.0374, "step": 2200 }, { "epoch": 7.836879432624113, "grad_norm": 0.3582875430583954, "learning_rate": 1.2294669511155193e-05, "loss": 0.0366, "step": 2210 }, { "epoch": 7.872340425531915, "grad_norm": 0.4589178264141083, "learning_rate": 1.1912192966856961e-05, "loss": 0.0356, "step": 2220 }, { "epoch": 7.907801418439716, "grad_norm": 0.4580075442790985, "learning_rate": 1.1534954058516356e-05, "loss": 0.0403, "step": 2230 }, { "epoch": 7.943262411347518, "grad_norm": 0.8937917351722717, "learning_rate": 1.1163004662054432e-05, "loss": 0.0308, "step": 2240 }, { "epoch": 7.9787234042553195, "grad_norm": 0.4087877869606018, "learning_rate": 1.0796395926006259e-05, "loss": 0.0366, "step": 2250 }, { "epoch": 8.01418439716312, "grad_norm": 0.4792381823062897, "learning_rate": 1.0435178264487206e-05, "loss": 0.0321, "step": 2260 }, { "epoch": 8.049645390070921, "grad_norm": 0.49857097864151, "learning_rate": 1.0079401350260287e-05, "loss": 0.0309, "step": 2270 }, { "epoch": 8.085106382978724, "grad_norm": 1.709592342376709, "learning_rate": 9.729114107905417e-06, "loss": 0.0379, "step": 2280 }, { "epoch": 8.120567375886525, "grad_norm": 0.8211184144020081, "learning_rate": 9.384364707091559e-06, "loss": 0.0375, "step": 2290 }, { "epoch": 8.156028368794326, "grad_norm": 0.4921300411224365, "learning_rate": 9.04520055595266e-06, "loss": 0.0385, "step": 2300 }, { "epoch": 8.191489361702128, "grad_norm": 0.9879147410392761, "learning_rate": 8.71166829456837e-06, "loss": 0.0295, "step": 2310 }, { "epoch": 8.22695035460993, "grad_norm": 0.9006680846214294, "learning_rate": 8.383813788550326e-06, "loss": 0.0368, "step": 2320 }, { "epoch": 8.26241134751773, "grad_norm": 0.5323331952095032, "learning_rate": 8.061682122734937e-06, "loss": 0.0418, "step": 2330 }, { "epoch": 8.297872340425531, "grad_norm": 0.37056395411491394, "learning_rate": 7.745317594983598e-06, "loss": 0.0347, "step": 2340 }, { "epoch": 8.333333333333334, "grad_norm": 0.399641215801239, "learning_rate": 7.434763710090992e-06, "loss": 0.0335, "step": 2350 }, { "epoch": 8.368794326241135, "grad_norm": 0.36943164467811584, "learning_rate": 7.130063173802637e-06, "loss": 0.0304, "step": 2360 }, { "epoch": 8.404255319148936, "grad_norm": 0.8294058442115784, "learning_rate": 6.8312578869422e-06, "loss": 0.0348, "step": 2370 }, { "epoch": 8.439716312056738, "grad_norm": 0.8177503347396851, "learning_rate": 6.538388939649443e-06, "loss": 0.0307, "step": 2380 }, { "epoch": 8.47517730496454, "grad_norm": 0.5157350301742554, "learning_rate": 6.251496605729773e-06, "loss": 0.0308, "step": 2390 }, { "epoch": 8.51063829787234, "grad_norm": 0.6393640041351318, "learning_rate": 5.970620337116012e-06, "loss": 0.0323, "step": 2400 }, { "epoch": 8.546099290780141, "grad_norm": 0.6659883260726929, "learning_rate": 5.695798758443133e-06, "loss": 0.0359, "step": 2410 }, { "epoch": 8.581560283687944, "grad_norm": 0.40989336371421814, "learning_rate": 5.427069661736872e-06, "loss": 0.034, "step": 2420 }, { "epoch": 8.617021276595745, "grad_norm": 0.6710225343704224, "learning_rate": 5.164470001216659e-06, "loss": 0.0368, "step": 2430 }, { "epoch": 8.652482269503546, "grad_norm": 0.3472451865673065, "learning_rate": 4.908035888213963e-06, "loss": 0.0335, "step": 2440 }, { "epoch": 8.687943262411348, "grad_norm": 0.5614815950393677, "learning_rate": 4.657802586206411e-06, "loss": 0.0333, "step": 2450 }, { "epoch": 8.72340425531915, "grad_norm": 0.8335484266281128, "learning_rate": 4.413804505968533e-06, "loss": 0.0364, "step": 2460 }, { "epoch": 8.75886524822695, "grad_norm": 0.3221122920513153, "learning_rate": 4.17607520083979e-06, "loss": 0.0342, "step": 2470 }, { "epoch": 8.794326241134751, "grad_norm": 0.5895943641662598, "learning_rate": 3.944647362110487e-06, "loss": 0.0305, "step": 2480 }, { "epoch": 8.829787234042554, "grad_norm": 0.9525559544563293, "learning_rate": 3.7195528145262337e-06, "loss": 0.0273, "step": 2490 }, { "epoch": 8.865248226950355, "grad_norm": 1.2005960941314697, "learning_rate": 3.5008225119115777e-06, "loss": 0.0447, "step": 2500 }, { "epoch": 8.900709219858156, "grad_norm": 0.5620400309562683, "learning_rate": 3.2884865329133986e-06, "loss": 0.0283, "step": 2510 }, { "epoch": 8.936170212765958, "grad_norm": 0.8444902896881104, "learning_rate": 3.082574076864636e-06, "loss": 0.0323, "step": 2520 }, { "epoch": 8.97163120567376, "grad_norm": 0.21826264262199402, "learning_rate": 2.88311345976896e-06, "loss": 0.0312, "step": 2530 }, { "epoch": 9.00709219858156, "grad_norm": 0.6866464614868164, "learning_rate": 2.6901321104069287e-06, "loss": 0.0279, "step": 2540 }, { "epoch": 9.042553191489361, "grad_norm": 0.5157074332237244, "learning_rate": 2.5036565665640444e-06, "loss": 0.0314, "step": 2550 }, { "epoch": 9.078014184397164, "grad_norm": 0.4928205907344818, "learning_rate": 2.3237124713815285e-06, "loss": 0.0304, "step": 2560 }, { "epoch": 9.113475177304965, "grad_norm": 0.6988965272903442, "learning_rate": 2.1503245698299314e-06, "loss": 0.035, "step": 2570 }, { "epoch": 9.148936170212766, "grad_norm": 0.5993123650550842, "learning_rate": 1.9835167053063376e-06, "loss": 0.036, "step": 2580 }, { "epoch": 9.184397163120567, "grad_norm": 0.429684579372406, "learning_rate": 1.8233118163555828e-06, "loss": 0.0333, "step": 2590 }, { "epoch": 9.21985815602837, "grad_norm": 0.3981442153453827, "learning_rate": 1.6697319335158613e-06, "loss": 0.0275, "step": 2600 }, { "epoch": 9.25531914893617, "grad_norm": 0.4685744345188141, "learning_rate": 1.5227981762891586e-06, "loss": 0.0322, "step": 2610 }, { "epoch": 9.290780141843971, "grad_norm": 0.32031360268592834, "learning_rate": 1.3825307502370488e-06, "loss": 0.0275, "step": 2620 }, { "epoch": 9.326241134751774, "grad_norm": 0.316394567489624, "learning_rate": 1.2489489442021274e-06, "loss": 0.0282, "step": 2630 }, { "epoch": 9.361702127659575, "grad_norm": 0.5381686091423035, "learning_rate": 1.1220711276554773e-06, "loss": 0.0342, "step": 2640 }, { "epoch": 9.397163120567376, "grad_norm": 0.5553480386734009, "learning_rate": 1.0019147481706625e-06, "loss": 0.0284, "step": 2650 }, { "epoch": 9.432624113475176, "grad_norm": 0.868281364440918, "learning_rate": 8.884963290243408e-07, "loss": 0.0314, "step": 2660 }, { "epoch": 9.46808510638298, "grad_norm": 0.31793954968452454, "learning_rate": 7.818314669241544e-07, "loss": 0.0319, "step": 2670 }, { "epoch": 9.50354609929078, "grad_norm": 0.6224974989891052, "learning_rate": 6.819348298638839e-07, "loss": 0.0291, "step": 2680 }, { "epoch": 9.539007092198581, "grad_norm": 0.646937370300293, "learning_rate": 5.888201551064288e-07, "loss": 0.0325, "step": 2690 }, { "epoch": 9.574468085106384, "grad_norm": 0.3932403326034546, "learning_rate": 5.025002472947071e-07, "loss": 0.032, "step": 2700 }, { "epoch": 9.609929078014185, "grad_norm": 0.3964429795742035, "learning_rate": 4.2298697669084785e-07, "loss": 0.0329, "step": 2710 }, { "epoch": 9.645390070921986, "grad_norm": 0.9335715770721436, "learning_rate": 3.502912775438183e-07, "loss": 0.0267, "step": 2720 }, { "epoch": 9.680851063829786, "grad_norm": 0.4232543408870697, "learning_rate": 2.8442314658584936e-07, "loss": 0.0398, "step": 2730 }, { "epoch": 9.71631205673759, "grad_norm": 0.8966019153594971, "learning_rate": 2.2539164165770177e-07, "loss": 0.03, "step": 2740 }, { "epoch": 9.75177304964539, "grad_norm": 1.149170160293579, "learning_rate": 1.732048804630959e-07, "loss": 0.0275, "step": 2750 }, { "epoch": 9.787234042553191, "grad_norm": 0.6613313555717468, "learning_rate": 1.2787003945239906e-07, "loss": 0.0251, "step": 2760 }, { "epoch": 9.822695035460994, "grad_norm": 0.5298823118209839, "learning_rate": 8.939335283577599e-08, "loss": 0.0296, "step": 2770 }, { "epoch": 9.858156028368795, "grad_norm": 0.3377326428890228, "learning_rate": 5.778011172586362e-08, "loss": 0.0348, "step": 2780 }, { "epoch": 9.893617021276595, "grad_norm": 0.6410892009735107, "learning_rate": 3.3034663410191904e-08, "loss": 0.0287, "step": 2790 }, { "epoch": 9.929078014184396, "grad_norm": 1.0103349685668945, "learning_rate": 1.51604107533454e-08, "loss": 0.0292, "step": 2800 }, { "epoch": 9.964539007092199, "grad_norm": 0.25607502460479736, "learning_rate": 4.159811729037566e-09, "loss": 0.0289, "step": 2810 }, { "epoch": 10.0, "grad_norm": 0.471729040145874, "learning_rate": 3.437908209780183e-11, "loss": 0.0342, "step": 2820 }, { "epoch": 10.0, "step": 2820, "total_flos": 0.0, "train_loss": 0.0648594191099735, "train_runtime": 2953.1296, "train_samples_per_second": 46.774, "train_steps_per_second": 0.955 } ], "logging_steps": 10, "max_steps": 2820, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 49, "trial_name": null, "trial_params": null }