{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.170974487662066, "eval_steps": 500, "global_step": 7000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001672940192388122, "grad_norm": 5.676821708679199, "learning_rate": 1.6722408026755853e-07, "loss": 0.5306, "step": 10 }, { "epoch": 0.003345880384776244, "grad_norm": 5.614320755004883, "learning_rate": 3.3444816053511706e-07, "loss": 0.5273, "step": 20 }, { "epoch": 0.005018820577164366, "grad_norm": 4.935384750366211, "learning_rate": 5.016722408026756e-07, "loss": 0.4967, "step": 30 }, { "epoch": 0.006691760769552488, "grad_norm": 3.304027557373047, "learning_rate": 6.688963210702341e-07, "loss": 0.4004, "step": 40 }, { "epoch": 0.008364700961940611, "grad_norm": 1.907157301902771, "learning_rate": 8.361204013377927e-07, "loss": 0.2762, "step": 50 }, { "epoch": 0.010037641154328732, "grad_norm": 1.6795320510864258, "learning_rate": 1.0033444816053512e-06, "loss": 0.1914, "step": 60 }, { "epoch": 0.011710581346716854, "grad_norm": 0.7994567155838013, "learning_rate": 1.1705685618729099e-06, "loss": 0.1291, "step": 70 }, { "epoch": 0.013383521539104977, "grad_norm": 0.520195722579956, "learning_rate": 1.3377926421404683e-06, "loss": 0.0983, "step": 80 }, { "epoch": 0.015056461731493099, "grad_norm": 0.4581398069858551, "learning_rate": 1.505016722408027e-06, "loss": 0.0812, "step": 90 }, { "epoch": 0.016729401923881223, "grad_norm": 0.3682957887649536, "learning_rate": 1.6722408026755855e-06, "loss": 0.063, "step": 100 }, { "epoch": 0.018402342116269343, "grad_norm": 0.4702453017234802, "learning_rate": 1.8394648829431439e-06, "loss": 0.0554, "step": 110 }, { "epoch": 0.020075282308657464, "grad_norm": 0.38541579246520996, "learning_rate": 2.0066889632107025e-06, "loss": 0.048, "step": 120 }, { "epoch": 0.021748222501045588, "grad_norm": 0.32105550169944763, "learning_rate": 2.173913043478261e-06, "loss": 0.0449, "step": 130 }, { "epoch": 0.02342116269343371, "grad_norm": 0.32155075669288635, "learning_rate": 2.3411371237458197e-06, "loss": 0.0402, "step": 140 }, { "epoch": 0.025094102885821833, "grad_norm": 0.3209912180900574, "learning_rate": 2.5083612040133783e-06, "loss": 0.0364, "step": 150 }, { "epoch": 0.026767043078209953, "grad_norm": 0.3099250793457031, "learning_rate": 2.6755852842809365e-06, "loss": 0.035, "step": 160 }, { "epoch": 0.028439983270598077, "grad_norm": 0.38229283690452576, "learning_rate": 2.842809364548495e-06, "loss": 0.0323, "step": 170 }, { "epoch": 0.030112923462986198, "grad_norm": 0.3536956012248993, "learning_rate": 3.010033444816054e-06, "loss": 0.0305, "step": 180 }, { "epoch": 0.03178586365537432, "grad_norm": 0.3397085666656494, "learning_rate": 3.1772575250836123e-06, "loss": 0.0299, "step": 190 }, { "epoch": 0.033458803847762446, "grad_norm": 0.5954955220222473, "learning_rate": 3.344481605351171e-06, "loss": 0.0254, "step": 200 }, { "epoch": 0.03513174404015056, "grad_norm": 0.3390955328941345, "learning_rate": 3.511705685618729e-06, "loss": 0.0259, "step": 210 }, { "epoch": 0.03680468423253869, "grad_norm": 0.2665148377418518, "learning_rate": 3.6789297658862878e-06, "loss": 0.0261, "step": 220 }, { "epoch": 0.03847762442492681, "grad_norm": 0.29703593254089355, "learning_rate": 3.846153846153847e-06, "loss": 0.0234, "step": 230 }, { "epoch": 0.04015056461731493, "grad_norm": 0.27780285477638245, "learning_rate": 4.013377926421405e-06, "loss": 0.02, "step": 240 }, { "epoch": 0.04182350480970305, "grad_norm": 0.2706936299800873, "learning_rate": 4.180602006688963e-06, "loss": 0.0221, "step": 250 }, { "epoch": 0.043496445002091176, "grad_norm": 0.27815213799476624, "learning_rate": 4.347826086956522e-06, "loss": 0.0195, "step": 260 }, { "epoch": 0.0451693851944793, "grad_norm": 0.27958977222442627, "learning_rate": 4.51505016722408e-06, "loss": 0.018, "step": 270 }, { "epoch": 0.04684232538686742, "grad_norm": 0.24512343108654022, "learning_rate": 4.6822742474916394e-06, "loss": 0.0172, "step": 280 }, { "epoch": 0.04851526557925554, "grad_norm": 0.2707979381084442, "learning_rate": 4.849498327759198e-06, "loss": 0.0169, "step": 290 }, { "epoch": 0.050188205771643665, "grad_norm": 0.28464800119400024, "learning_rate": 5.016722408026757e-06, "loss": 0.0171, "step": 300 }, { "epoch": 0.05186114596403179, "grad_norm": 0.21844109892845154, "learning_rate": 5.183946488294315e-06, "loss": 0.017, "step": 310 }, { "epoch": 0.053534086156419906, "grad_norm": 0.28929072618484497, "learning_rate": 5.351170568561873e-06, "loss": 0.0156, "step": 320 }, { "epoch": 0.05520702634880803, "grad_norm": 0.23370195925235748, "learning_rate": 5.518394648829431e-06, "loss": 0.0147, "step": 330 }, { "epoch": 0.056879966541196154, "grad_norm": 0.3077922463417053, "learning_rate": 5.68561872909699e-06, "loss": 0.0152, "step": 340 }, { "epoch": 0.05855290673358427, "grad_norm": 0.2194584608078003, "learning_rate": 5.852842809364549e-06, "loss": 0.0139, "step": 350 }, { "epoch": 0.060225846925972396, "grad_norm": 0.2424791306257248, "learning_rate": 6.020066889632108e-06, "loss": 0.0139, "step": 360 }, { "epoch": 0.06189878711836052, "grad_norm": 0.2439364641904831, "learning_rate": 6.1872909698996665e-06, "loss": 0.0143, "step": 370 }, { "epoch": 0.06357172731074864, "grad_norm": 0.239066019654274, "learning_rate": 6.354515050167225e-06, "loss": 0.0125, "step": 380 }, { "epoch": 0.06524466750313676, "grad_norm": 0.21066606044769287, "learning_rate": 6.521739130434783e-06, "loss": 0.0126, "step": 390 }, { "epoch": 0.06691760769552489, "grad_norm": 0.24605374038219452, "learning_rate": 6.688963210702342e-06, "loss": 0.0125, "step": 400 }, { "epoch": 0.06859054788791301, "grad_norm": 0.23474596440792084, "learning_rate": 6.8561872909699e-06, "loss": 0.0124, "step": 410 }, { "epoch": 0.07026348808030113, "grad_norm": 0.22990989685058594, "learning_rate": 7.023411371237458e-06, "loss": 0.012, "step": 420 }, { "epoch": 0.07193642827268926, "grad_norm": 0.20189732313156128, "learning_rate": 7.190635451505017e-06, "loss": 0.0112, "step": 430 }, { "epoch": 0.07360936846507737, "grad_norm": 0.23320627212524414, "learning_rate": 7.3578595317725755e-06, "loss": 0.0109, "step": 440 }, { "epoch": 0.07528230865746549, "grad_norm": 0.24466697871685028, "learning_rate": 7.5250836120401346e-06, "loss": 0.0115, "step": 450 }, { "epoch": 0.07695524884985362, "grad_norm": 0.20141980051994324, "learning_rate": 7.692307692307694e-06, "loss": 0.0112, "step": 460 }, { "epoch": 0.07862818904224174, "grad_norm": 0.24387459456920624, "learning_rate": 7.859531772575253e-06, "loss": 0.0091, "step": 470 }, { "epoch": 0.08030112923462986, "grad_norm": 0.20812922716140747, "learning_rate": 8.02675585284281e-06, "loss": 0.0096, "step": 480 }, { "epoch": 0.08197406942701799, "grad_norm": 0.17994938790798187, "learning_rate": 8.193979933110369e-06, "loss": 0.0097, "step": 490 }, { "epoch": 0.0836470096194061, "grad_norm": 0.2290204018354416, "learning_rate": 8.361204013377926e-06, "loss": 0.0102, "step": 500 }, { "epoch": 0.08531994981179424, "grad_norm": 0.22309386730194092, "learning_rate": 8.528428093645485e-06, "loss": 0.0091, "step": 510 }, { "epoch": 0.08699289000418235, "grad_norm": 0.20844189822673798, "learning_rate": 8.695652173913044e-06, "loss": 0.0094, "step": 520 }, { "epoch": 0.08866583019657047, "grad_norm": 0.22521355748176575, "learning_rate": 8.862876254180602e-06, "loss": 0.0092, "step": 530 }, { "epoch": 0.0903387703889586, "grad_norm": 0.20999939739704132, "learning_rate": 9.03010033444816e-06, "loss": 0.0096, "step": 540 }, { "epoch": 0.09201171058134672, "grad_norm": 0.15172499418258667, "learning_rate": 9.19732441471572e-06, "loss": 0.009, "step": 550 }, { "epoch": 0.09368465077373483, "grad_norm": 0.17486289143562317, "learning_rate": 9.364548494983279e-06, "loss": 0.0077, "step": 560 }, { "epoch": 0.09535759096612297, "grad_norm": 0.16407956182956696, "learning_rate": 9.531772575250838e-06, "loss": 0.009, "step": 570 }, { "epoch": 0.09703053115851108, "grad_norm": 0.18756012618541718, "learning_rate": 9.698996655518395e-06, "loss": 0.0089, "step": 580 }, { "epoch": 0.0987034713508992, "grad_norm": 0.184035986661911, "learning_rate": 9.866220735785954e-06, "loss": 0.0087, "step": 590 }, { "epoch": 0.10037641154328733, "grad_norm": 0.17904379963874817, "learning_rate": 1.0033444816053513e-05, "loss": 0.0077, "step": 600 }, { "epoch": 0.10204935173567545, "grad_norm": 0.13281692564487457, "learning_rate": 1.020066889632107e-05, "loss": 0.0092, "step": 610 }, { "epoch": 0.10372229192806358, "grad_norm": 0.14460720121860504, "learning_rate": 1.036789297658863e-05, "loss": 0.0088, "step": 620 }, { "epoch": 0.1053952321204517, "grad_norm": 0.22102676331996918, "learning_rate": 1.0535117056856187e-05, "loss": 0.0081, "step": 630 }, { "epoch": 0.10706817231283981, "grad_norm": 0.17085184156894684, "learning_rate": 1.0702341137123746e-05, "loss": 0.0079, "step": 640 }, { "epoch": 0.10874111250522794, "grad_norm": 0.1741437315940857, "learning_rate": 1.0869565217391305e-05, "loss": 0.0084, "step": 650 }, { "epoch": 0.11041405269761606, "grad_norm": 0.1678648442029953, "learning_rate": 1.1036789297658862e-05, "loss": 0.0071, "step": 660 }, { "epoch": 0.11208699289000418, "grad_norm": 0.21275614202022552, "learning_rate": 1.1204013377926421e-05, "loss": 0.0077, "step": 670 }, { "epoch": 0.11375993308239231, "grad_norm": 0.15497252345085144, "learning_rate": 1.137123745819398e-05, "loss": 0.0071, "step": 680 }, { "epoch": 0.11543287327478043, "grad_norm": 0.18764236569404602, "learning_rate": 1.1538461538461538e-05, "loss": 0.0077, "step": 690 }, { "epoch": 0.11710581346716854, "grad_norm": 0.14283548295497894, "learning_rate": 1.1705685618729099e-05, "loss": 0.0072, "step": 700 }, { "epoch": 0.11877875365955667, "grad_norm": 0.1618950366973877, "learning_rate": 1.1872909698996658e-05, "loss": 0.0075, "step": 710 }, { "epoch": 0.12045169385194479, "grad_norm": 0.15630610287189484, "learning_rate": 1.2040133779264217e-05, "loss": 0.0067, "step": 720 }, { "epoch": 0.12212463404433292, "grad_norm": 0.12766779959201813, "learning_rate": 1.2207357859531774e-05, "loss": 0.0071, "step": 730 }, { "epoch": 0.12379757423672104, "grad_norm": 0.15148226916790009, "learning_rate": 1.2374581939799333e-05, "loss": 0.0065, "step": 740 }, { "epoch": 0.12547051442910917, "grad_norm": 0.12982220947742462, "learning_rate": 1.254180602006689e-05, "loss": 0.0061, "step": 750 }, { "epoch": 0.1271434546214973, "grad_norm": 0.12766385078430176, "learning_rate": 1.270903010033445e-05, "loss": 0.0081, "step": 760 }, { "epoch": 0.1288163948138854, "grad_norm": 0.14440308511257172, "learning_rate": 1.2876254180602008e-05, "loss": 0.0069, "step": 770 }, { "epoch": 0.13048933500627352, "grad_norm": 0.11201298236846924, "learning_rate": 1.3043478260869566e-05, "loss": 0.0063, "step": 780 }, { "epoch": 0.13216227519866164, "grad_norm": 0.14001229405403137, "learning_rate": 1.3210702341137125e-05, "loss": 0.006, "step": 790 }, { "epoch": 0.13383521539104978, "grad_norm": 0.1446337252855301, "learning_rate": 1.3377926421404684e-05, "loss": 0.0057, "step": 800 }, { "epoch": 0.1355081555834379, "grad_norm": 0.14818094670772552, "learning_rate": 1.3545150501672241e-05, "loss": 0.0069, "step": 810 }, { "epoch": 0.13718109577582602, "grad_norm": 0.1410125344991684, "learning_rate": 1.37123745819398e-05, "loss": 0.0068, "step": 820 }, { "epoch": 0.13885403596821413, "grad_norm": 0.1553238183259964, "learning_rate": 1.387959866220736e-05, "loss": 0.0068, "step": 830 }, { "epoch": 0.14052697616060225, "grad_norm": 0.13678601384162903, "learning_rate": 1.4046822742474917e-05, "loss": 0.0069, "step": 840 }, { "epoch": 0.14219991635299037, "grad_norm": 0.12651897966861725, "learning_rate": 1.4214046822742476e-05, "loss": 0.0063, "step": 850 }, { "epoch": 0.1438728565453785, "grad_norm": 0.10169650614261627, "learning_rate": 1.4381270903010035e-05, "loss": 0.0064, "step": 860 }, { "epoch": 0.14554579673776663, "grad_norm": 0.11214001476764679, "learning_rate": 1.4548494983277592e-05, "loss": 0.0067, "step": 870 }, { "epoch": 0.14721873693015475, "grad_norm": 0.10438688844442368, "learning_rate": 1.4715719063545151e-05, "loss": 0.0062, "step": 880 }, { "epoch": 0.14889167712254286, "grad_norm": 0.14689168334007263, "learning_rate": 1.4882943143812708e-05, "loss": 0.0065, "step": 890 }, { "epoch": 0.15056461731493098, "grad_norm": 0.15179641544818878, "learning_rate": 1.5050167224080269e-05, "loss": 0.0057, "step": 900 }, { "epoch": 0.15223755750731913, "grad_norm": 0.1304953694343567, "learning_rate": 1.5217391304347828e-05, "loss": 0.0059, "step": 910 }, { "epoch": 0.15391049769970724, "grad_norm": 0.1492306888103485, "learning_rate": 1.5384615384615387e-05, "loss": 0.0059, "step": 920 }, { "epoch": 0.15558343789209536, "grad_norm": 0.10364175587892532, "learning_rate": 1.5551839464882946e-05, "loss": 0.0063, "step": 930 }, { "epoch": 0.15725637808448348, "grad_norm": 0.12704779207706451, "learning_rate": 1.5719063545150505e-05, "loss": 0.006, "step": 940 }, { "epoch": 0.1589293182768716, "grad_norm": 0.10850061476230621, "learning_rate": 1.588628762541806e-05, "loss": 0.006, "step": 950 }, { "epoch": 0.1606022584692597, "grad_norm": 0.15113425254821777, "learning_rate": 1.605351170568562e-05, "loss": 0.0061, "step": 960 }, { "epoch": 0.16227519866164786, "grad_norm": 0.09442683309316635, "learning_rate": 1.622073578595318e-05, "loss": 0.0058, "step": 970 }, { "epoch": 0.16394813885403597, "grad_norm": 0.10059680789709091, "learning_rate": 1.6387959866220738e-05, "loss": 0.0052, "step": 980 }, { "epoch": 0.1656210790464241, "grad_norm": 0.11736520379781723, "learning_rate": 1.6555183946488297e-05, "loss": 0.0057, "step": 990 }, { "epoch": 0.1672940192388122, "grad_norm": 0.11506947129964828, "learning_rate": 1.6722408026755853e-05, "loss": 0.0055, "step": 1000 }, { "epoch": 0.16896695943120033, "grad_norm": 0.10081777721643448, "learning_rate": 1.6889632107023412e-05, "loss": 0.0053, "step": 1010 }, { "epoch": 0.17063989962358847, "grad_norm": 0.10682202875614166, "learning_rate": 1.705685618729097e-05, "loss": 0.0058, "step": 1020 }, { "epoch": 0.1723128398159766, "grad_norm": 0.11874356865882874, "learning_rate": 1.722408026755853e-05, "loss": 0.0061, "step": 1030 }, { "epoch": 0.1739857800083647, "grad_norm": 0.10397034883499146, "learning_rate": 1.739130434782609e-05, "loss": 0.0055, "step": 1040 }, { "epoch": 0.17565872020075282, "grad_norm": 0.11033766716718674, "learning_rate": 1.7558528428093648e-05, "loss": 0.0131, "step": 1050 }, { "epoch": 0.17733166039314094, "grad_norm": 0.13374905288219452, "learning_rate": 1.7725752508361204e-05, "loss": 0.0068, "step": 1060 }, { "epoch": 0.17900460058552906, "grad_norm": 0.1060086116194725, "learning_rate": 1.7892976588628763e-05, "loss": 0.0064, "step": 1070 }, { "epoch": 0.1806775407779172, "grad_norm": 0.11733996123075485, "learning_rate": 1.806020066889632e-05, "loss": 0.0055, "step": 1080 }, { "epoch": 0.18235048097030532, "grad_norm": 0.10346407443284988, "learning_rate": 1.822742474916388e-05, "loss": 0.0059, "step": 1090 }, { "epoch": 0.18402342116269343, "grad_norm": 0.0990874171257019, "learning_rate": 1.839464882943144e-05, "loss": 0.0059, "step": 1100 }, { "epoch": 0.18569636135508155, "grad_norm": 0.10037863999605179, "learning_rate": 1.8561872909699e-05, "loss": 0.0054, "step": 1110 }, { "epoch": 0.18736930154746967, "grad_norm": 0.10332591086626053, "learning_rate": 1.8729096989966558e-05, "loss": 0.005, "step": 1120 }, { "epoch": 0.1890422417398578, "grad_norm": 0.10546638071537018, "learning_rate": 1.8896321070234117e-05, "loss": 0.0052, "step": 1130 }, { "epoch": 0.19071518193224593, "grad_norm": 0.1075560674071312, "learning_rate": 1.9063545150501676e-05, "loss": 0.0046, "step": 1140 }, { "epoch": 0.19238812212463405, "grad_norm": 0.12965404987335205, "learning_rate": 1.923076923076923e-05, "loss": 0.0059, "step": 1150 }, { "epoch": 0.19406106231702216, "grad_norm": 0.11760328710079193, "learning_rate": 1.939799331103679e-05, "loss": 0.005, "step": 1160 }, { "epoch": 0.19573400250941028, "grad_norm": 0.11101441830396652, "learning_rate": 1.956521739130435e-05, "loss": 0.0051, "step": 1170 }, { "epoch": 0.1974069427017984, "grad_norm": 0.0942314863204956, "learning_rate": 1.973244147157191e-05, "loss": 0.0057, "step": 1180 }, { "epoch": 0.19907988289418654, "grad_norm": 0.09319698065519333, "learning_rate": 1.9899665551839468e-05, "loss": 0.0051, "step": 1190 }, { "epoch": 0.20075282308657466, "grad_norm": 0.11335150897502899, "learning_rate": 1.9999993177766133e-05, "loss": 0.0049, "step": 1200 }, { "epoch": 0.20242576327896278, "grad_norm": 0.06988847255706787, "learning_rate": 1.999991642774201e-05, "loss": 0.0054, "step": 1210 }, { "epoch": 0.2040987034713509, "grad_norm": 0.08338000625371933, "learning_rate": 1.9999754400558125e-05, "loss": 0.0049, "step": 1220 }, { "epoch": 0.205771643663739, "grad_norm": 0.10430186986923218, "learning_rate": 1.9999507097596203e-05, "loss": 0.0054, "step": 1230 }, { "epoch": 0.20744458385612716, "grad_norm": 0.1048823744058609, "learning_rate": 1.9999174520965194e-05, "loss": 0.0047, "step": 1240 }, { "epoch": 0.20911752404851527, "grad_norm": 0.1247139424085617, "learning_rate": 1.999875667350124e-05, "loss": 0.0056, "step": 1250 }, { "epoch": 0.2107904642409034, "grad_norm": 0.07579420506954193, "learning_rate": 1.9998253558767658e-05, "loss": 0.0052, "step": 1260 }, { "epoch": 0.2124634044332915, "grad_norm": 0.09081237763166428, "learning_rate": 1.9997665181054903e-05, "loss": 0.0046, "step": 1270 }, { "epoch": 0.21413634462567963, "grad_norm": 0.09489645808935165, "learning_rate": 1.9996991545380534e-05, "loss": 0.0047, "step": 1280 }, { "epoch": 0.21580928481806774, "grad_norm": 0.07921761274337769, "learning_rate": 1.999623265748917e-05, "loss": 0.0045, "step": 1290 }, { "epoch": 0.2174822250104559, "grad_norm": 0.08877990394830704, "learning_rate": 1.9995388523852445e-05, "loss": 0.0051, "step": 1300 }, { "epoch": 0.219155165202844, "grad_norm": 0.08277109265327454, "learning_rate": 1.9994459151668958e-05, "loss": 0.0043, "step": 1310 }, { "epoch": 0.22082810539523212, "grad_norm": 0.09742449223995209, "learning_rate": 1.9993444548864195e-05, "loss": 0.005, "step": 1320 }, { "epoch": 0.22250104558762024, "grad_norm": 0.093328095972538, "learning_rate": 1.9992344724090467e-05, "loss": 0.0038, "step": 1330 }, { "epoch": 0.22417398578000836, "grad_norm": 0.0758630782365799, "learning_rate": 1.999115968672685e-05, "loss": 0.0046, "step": 1340 }, { "epoch": 0.2258469259723965, "grad_norm": 0.07586976885795593, "learning_rate": 1.9989889446879092e-05, "loss": 0.0041, "step": 1350 }, { "epoch": 0.22751986616478462, "grad_norm": 0.10360102355480194, "learning_rate": 1.998853401537953e-05, "loss": 0.0043, "step": 1360 }, { "epoch": 0.22919280635717273, "grad_norm": 0.07323385030031204, "learning_rate": 1.9987093403786987e-05, "loss": 0.004, "step": 1370 }, { "epoch": 0.23086574654956085, "grad_norm": 0.08378604054450989, "learning_rate": 1.9985567624386695e-05, "loss": 0.0043, "step": 1380 }, { "epoch": 0.23253868674194897, "grad_norm": 0.08974593132734299, "learning_rate": 1.998395669019018e-05, "loss": 0.0042, "step": 1390 }, { "epoch": 0.23421162693433709, "grad_norm": 0.07128868997097015, "learning_rate": 1.9982260614935146e-05, "loss": 0.004, "step": 1400 }, { "epoch": 0.23588456712672523, "grad_norm": 0.09208974242210388, "learning_rate": 1.998047941308536e-05, "loss": 0.0042, "step": 1410 }, { "epoch": 0.23755750731911335, "grad_norm": 0.08087027817964554, "learning_rate": 1.997861309983053e-05, "loss": 0.0038, "step": 1420 }, { "epoch": 0.23923044751150147, "grad_norm": 0.10293801873922348, "learning_rate": 1.997666169108618e-05, "loss": 0.0042, "step": 1430 }, { "epoch": 0.24090338770388958, "grad_norm": 0.09307019412517548, "learning_rate": 1.9974625203493515e-05, "loss": 0.0041, "step": 1440 }, { "epoch": 0.2425763278962777, "grad_norm": 0.06533504277467728, "learning_rate": 1.9972503654419258e-05, "loss": 0.0039, "step": 1450 }, { "epoch": 0.24424926808866584, "grad_norm": 0.08785014599561691, "learning_rate": 1.9970297061955533e-05, "loss": 0.0037, "step": 1460 }, { "epoch": 0.24592220828105396, "grad_norm": 0.08904122561216354, "learning_rate": 1.9968005444919695e-05, "loss": 0.0048, "step": 1470 }, { "epoch": 0.24759514847344208, "grad_norm": 0.06951840966939926, "learning_rate": 1.9965628822854155e-05, "loss": 0.0039, "step": 1480 }, { "epoch": 0.2492680886658302, "grad_norm": 0.16835308074951172, "learning_rate": 1.996316721602625e-05, "loss": 0.0417, "step": 1490 }, { "epoch": 0.25094102885821834, "grad_norm": 1.1013474464416504, "learning_rate": 1.9960620645428038e-05, "loss": 0.1569, "step": 1500 }, { "epoch": 0.25261396905060646, "grad_norm": 0.2059653103351593, "learning_rate": 1.9957989132776132e-05, "loss": 0.0207, "step": 1510 }, { "epoch": 0.2542869092429946, "grad_norm": 0.15886810421943665, "learning_rate": 1.9955272700511506e-05, "loss": 0.0086, "step": 1520 }, { "epoch": 0.2559598494353827, "grad_norm": 0.1458134800195694, "learning_rate": 1.9952471371799325e-05, "loss": 0.0065, "step": 1530 }, { "epoch": 0.2576327896277708, "grad_norm": 0.1110554188489914, "learning_rate": 1.9949585170528717e-05, "loss": 0.0068, "step": 1540 }, { "epoch": 0.2593057298201589, "grad_norm": 0.12432471662759781, "learning_rate": 1.9946614121312595e-05, "loss": 0.0055, "step": 1550 }, { "epoch": 0.26097867001254704, "grad_norm": 0.08449316769838333, "learning_rate": 1.9943558249487437e-05, "loss": 0.0055, "step": 1560 }, { "epoch": 0.26265161020493516, "grad_norm": 0.07353685796260834, "learning_rate": 1.9940417581113062e-05, "loss": 0.0054, "step": 1570 }, { "epoch": 0.2643245503973233, "grad_norm": 0.07474063336849213, "learning_rate": 1.9937192142972427e-05, "loss": 0.0043, "step": 1580 }, { "epoch": 0.2659974905897114, "grad_norm": 0.0787465050816536, "learning_rate": 1.9933881962571382e-05, "loss": 0.0047, "step": 1590 }, { "epoch": 0.26767043078209957, "grad_norm": 0.10569368302822113, "learning_rate": 1.9930487068138437e-05, "loss": 0.0044, "step": 1600 }, { "epoch": 0.2693433709744877, "grad_norm": 0.07662618160247803, "learning_rate": 1.9927007488624535e-05, "loss": 0.0044, "step": 1610 }, { "epoch": 0.2710163111668758, "grad_norm": 0.0629221647977829, "learning_rate": 1.9923443253702784e-05, "loss": 0.0044, "step": 1620 }, { "epoch": 0.2726892513592639, "grad_norm": 0.0779779925942421, "learning_rate": 1.9919794393768225e-05, "loss": 0.0041, "step": 1630 }, { "epoch": 0.27436219155165203, "grad_norm": 0.08380386233329773, "learning_rate": 1.9916060939937562e-05, "loss": 0.0046, "step": 1640 }, { "epoch": 0.27603513174404015, "grad_norm": 0.08056768029928207, "learning_rate": 1.9912242924048886e-05, "loss": 0.0041, "step": 1650 }, { "epoch": 0.27770807193642827, "grad_norm": 0.09446844458580017, "learning_rate": 1.990834037866143e-05, "loss": 0.005, "step": 1660 }, { "epoch": 0.2793810121288164, "grad_norm": 0.06388242542743683, "learning_rate": 1.990435333705527e-05, "loss": 0.0043, "step": 1670 }, { "epoch": 0.2810539523212045, "grad_norm": 0.0716908797621727, "learning_rate": 1.990028183323105e-05, "loss": 0.0045, "step": 1680 }, { "epoch": 0.2827268925135926, "grad_norm": 0.07838106900453568, "learning_rate": 1.989612590190969e-05, "loss": 0.0044, "step": 1690 }, { "epoch": 0.28439983270598074, "grad_norm": 0.07069439440965652, "learning_rate": 1.9891885578532082e-05, "loss": 0.0038, "step": 1700 }, { "epoch": 0.2860727728983689, "grad_norm": 0.06536339223384857, "learning_rate": 1.9887560899258806e-05, "loss": 0.0044, "step": 1710 }, { "epoch": 0.287745713090757, "grad_norm": 0.0823822170495987, "learning_rate": 1.9883151900969805e-05, "loss": 0.0037, "step": 1720 }, { "epoch": 0.28941865328314514, "grad_norm": 0.0675312727689743, "learning_rate": 1.987865862126408e-05, "loss": 0.0037, "step": 1730 }, { "epoch": 0.29109159347553326, "grad_norm": 0.0691055878996849, "learning_rate": 1.9874081098459362e-05, "loss": 0.0036, "step": 1740 }, { "epoch": 0.2927645336679214, "grad_norm": 0.10043274611234665, "learning_rate": 1.986941937159179e-05, "loss": 0.0038, "step": 1750 }, { "epoch": 0.2944374738603095, "grad_norm": 0.09526947140693665, "learning_rate": 1.986467348041559e-05, "loss": 0.0036, "step": 1760 }, { "epoch": 0.2961104140526976, "grad_norm": 0.07462775707244873, "learning_rate": 1.9859843465402697e-05, "loss": 0.0034, "step": 1770 }, { "epoch": 0.29778335424508573, "grad_norm": 0.06207914277911186, "learning_rate": 1.9854929367742462e-05, "loss": 0.0041, "step": 1780 }, { "epoch": 0.29945629443747385, "grad_norm": 0.05888642743229866, "learning_rate": 1.9849931229341258e-05, "loss": 0.0037, "step": 1790 }, { "epoch": 0.30112923462986196, "grad_norm": 0.0687062069773674, "learning_rate": 1.984484909282215e-05, "loss": 0.0035, "step": 1800 }, { "epoch": 0.3028021748222501, "grad_norm": 0.06710348278284073, "learning_rate": 1.9839683001524516e-05, "loss": 0.0031, "step": 1810 }, { "epoch": 0.30447511501463825, "grad_norm": 0.05600139498710632, "learning_rate": 1.9834432999503684e-05, "loss": 0.0032, "step": 1820 }, { "epoch": 0.30614805520702637, "grad_norm": 0.05627594515681267, "learning_rate": 1.9829099131530553e-05, "loss": 0.0037, "step": 1830 }, { "epoch": 0.3078209953994145, "grad_norm": 0.07693224400281906, "learning_rate": 1.9823681443091215e-05, "loss": 0.0034, "step": 1840 }, { "epoch": 0.3094939355918026, "grad_norm": 0.07977078855037689, "learning_rate": 1.9818179980386564e-05, "loss": 0.0039, "step": 1850 }, { "epoch": 0.3111668757841907, "grad_norm": 0.05494411662220955, "learning_rate": 1.9812594790331902e-05, "loss": 0.0033, "step": 1860 }, { "epoch": 0.31283981597657884, "grad_norm": 0.06616249680519104, "learning_rate": 1.9806925920556546e-05, "loss": 0.0036, "step": 1870 }, { "epoch": 0.31451275616896696, "grad_norm": 0.08185126632452011, "learning_rate": 1.9801173419403406e-05, "loss": 0.0039, "step": 1880 }, { "epoch": 0.3161856963613551, "grad_norm": 0.06396634876728058, "learning_rate": 1.9795337335928593e-05, "loss": 0.004, "step": 1890 }, { "epoch": 0.3178586365537432, "grad_norm": 0.06563973426818848, "learning_rate": 1.978941771990098e-05, "loss": 0.0036, "step": 1900 }, { "epoch": 0.3195315767461313, "grad_norm": 0.06523874402046204, "learning_rate": 1.9783414621801797e-05, "loss": 0.0029, "step": 1910 }, { "epoch": 0.3212045169385194, "grad_norm": 0.06342514604330063, "learning_rate": 1.9777328092824187e-05, "loss": 0.0038, "step": 1920 }, { "epoch": 0.3228774571309076, "grad_norm": 0.05971871688961983, "learning_rate": 1.977115818487277e-05, "loss": 0.0034, "step": 1930 }, { "epoch": 0.3245503973232957, "grad_norm": 0.06687960028648376, "learning_rate": 1.9764904950563215e-05, "loss": 0.0033, "step": 1940 }, { "epoch": 0.32622333751568383, "grad_norm": 0.050653185695409775, "learning_rate": 1.975856844322177e-05, "loss": 0.0034, "step": 1950 }, { "epoch": 0.32789627770807195, "grad_norm": 0.07198058813810349, "learning_rate": 1.9752148716884822e-05, "loss": 0.0031, "step": 1960 }, { "epoch": 0.32956921790046007, "grad_norm": 0.07705294340848923, "learning_rate": 1.974564582629843e-05, "loss": 0.0034, "step": 1970 }, { "epoch": 0.3312421580928482, "grad_norm": 0.04596183821558952, "learning_rate": 1.9739059826917866e-05, "loss": 0.0028, "step": 1980 }, { "epoch": 0.3329150982852363, "grad_norm": 0.09095992147922516, "learning_rate": 1.9732390774907126e-05, "loss": 0.003, "step": 1990 }, { "epoch": 0.3345880384776244, "grad_norm": 0.06304284930229187, "learning_rate": 1.9725638727138466e-05, "loss": 0.0035, "step": 2000 }, { "epoch": 0.33626097867001253, "grad_norm": 0.05771150067448616, "learning_rate": 1.9718803741191918e-05, "loss": 0.0033, "step": 2010 }, { "epoch": 0.33793391886240065, "grad_norm": 0.09310884773731232, "learning_rate": 1.9711885875354787e-05, "loss": 0.0032, "step": 2020 }, { "epoch": 0.33960685905478877, "grad_norm": 0.05958499014377594, "learning_rate": 1.970488518862116e-05, "loss": 0.0038, "step": 2030 }, { "epoch": 0.34127979924717694, "grad_norm": 0.05149533972144127, "learning_rate": 1.9697801740691402e-05, "loss": 0.0032, "step": 2040 }, { "epoch": 0.34295273943956506, "grad_norm": 0.05784996226429939, "learning_rate": 1.9690635591971655e-05, "loss": 0.0038, "step": 2050 }, { "epoch": 0.3446256796319532, "grad_norm": 0.08538904786109924, "learning_rate": 1.9683386803573303e-05, "loss": 0.0033, "step": 2060 }, { "epoch": 0.3462986198243413, "grad_norm": 0.07361331582069397, "learning_rate": 1.967605543731248e-05, "loss": 0.0033, "step": 2070 }, { "epoch": 0.3479715600167294, "grad_norm": 0.05513383448123932, "learning_rate": 1.966864155570951e-05, "loss": 0.0031, "step": 2080 }, { "epoch": 0.3496445002091175, "grad_norm": 0.06754180043935776, "learning_rate": 1.96611452219884e-05, "loss": 0.0028, "step": 2090 }, { "epoch": 0.35131744040150564, "grad_norm": 0.051086556166410446, "learning_rate": 1.9653566500076293e-05, "loss": 0.003, "step": 2100 }, { "epoch": 0.35299038059389376, "grad_norm": 0.0629250630736351, "learning_rate": 1.9645905454602916e-05, "loss": 0.0029, "step": 2110 }, { "epoch": 0.3546633207862819, "grad_norm": 0.06138652190566063, "learning_rate": 1.9638162150900028e-05, "loss": 0.0035, "step": 2120 }, { "epoch": 0.35633626097867, "grad_norm": 0.04756954684853554, "learning_rate": 1.9630336655000887e-05, "loss": 0.0032, "step": 2130 }, { "epoch": 0.3580092011710581, "grad_norm": 0.05386006087064743, "learning_rate": 1.962242903363965e-05, "loss": 0.0027, "step": 2140 }, { "epoch": 0.3596821413634463, "grad_norm": 0.06012064963579178, "learning_rate": 1.9614439354250825e-05, "loss": 0.0033, "step": 2150 }, { "epoch": 0.3613550815558344, "grad_norm": 0.04432355612516403, "learning_rate": 1.960636768496871e-05, "loss": 0.0028, "step": 2160 }, { "epoch": 0.3630280217482225, "grad_norm": 0.050032179802656174, "learning_rate": 1.959821409462677e-05, "loss": 0.003, "step": 2170 }, { "epoch": 0.36470096194061064, "grad_norm": 0.051682740449905396, "learning_rate": 1.95899786527571e-05, "loss": 0.0027, "step": 2180 }, { "epoch": 0.36637390213299875, "grad_norm": 0.06810294091701508, "learning_rate": 1.9581661429589794e-05, "loss": 0.0032, "step": 2190 }, { "epoch": 0.36804684232538687, "grad_norm": 0.06511209160089493, "learning_rate": 1.9573262496052357e-05, "loss": 0.0029, "step": 2200 }, { "epoch": 0.369719782517775, "grad_norm": 0.06401335448026657, "learning_rate": 1.956478192376911e-05, "loss": 0.0035, "step": 2210 }, { "epoch": 0.3713927227101631, "grad_norm": 0.07354243099689484, "learning_rate": 1.9556219785060567e-05, "loss": 0.0031, "step": 2220 }, { "epoch": 0.3730656629025512, "grad_norm": 0.057497262954711914, "learning_rate": 1.954757615294283e-05, "loss": 0.0029, "step": 2230 }, { "epoch": 0.37473860309493934, "grad_norm": 0.08023671060800552, "learning_rate": 1.9538851101126948e-05, "loss": 0.0034, "step": 2240 }, { "epoch": 0.37641154328732745, "grad_norm": 0.057895828038454056, "learning_rate": 1.9530044704018313e-05, "loss": 0.0033, "step": 2250 }, { "epoch": 0.3780844834797156, "grad_norm": 0.08360813558101654, "learning_rate": 1.9521157036716003e-05, "loss": 0.0034, "step": 2260 }, { "epoch": 0.37975742367210374, "grad_norm": 0.06430185586214066, "learning_rate": 1.9512188175012155e-05, "loss": 0.0027, "step": 2270 }, { "epoch": 0.38143036386449186, "grad_norm": 0.0604422502219677, "learning_rate": 1.9503138195391314e-05, "loss": 0.0029, "step": 2280 }, { "epoch": 0.38310330405688, "grad_norm": 0.0626441016793251, "learning_rate": 1.9494007175029776e-05, "loss": 0.0032, "step": 2290 }, { "epoch": 0.3847762442492681, "grad_norm": 0.05957617238163948, "learning_rate": 1.9484795191794944e-05, "loss": 0.0028, "step": 2300 }, { "epoch": 0.3864491844416562, "grad_norm": 0.060407839715480804, "learning_rate": 1.947550232424465e-05, "loss": 0.0029, "step": 2310 }, { "epoch": 0.38812212463404433, "grad_norm": 0.04406201094388962, "learning_rate": 1.94661286516265e-05, "loss": 0.003, "step": 2320 }, { "epoch": 0.38979506482643245, "grad_norm": 0.04771639406681061, "learning_rate": 1.9456674253877162e-05, "loss": 0.0031, "step": 2330 }, { "epoch": 0.39146800501882056, "grad_norm": 0.04817575588822365, "learning_rate": 1.944713921162174e-05, "loss": 0.0031, "step": 2340 }, { "epoch": 0.3931409452112087, "grad_norm": 0.05885681137442589, "learning_rate": 1.9437523606173048e-05, "loss": 0.0028, "step": 2350 }, { "epoch": 0.3948138854035968, "grad_norm": 0.06338904052972794, "learning_rate": 1.942782751953092e-05, "loss": 0.0034, "step": 2360 }, { "epoch": 0.39648682559598497, "grad_norm": 0.06422600150108337, "learning_rate": 1.9418051034381513e-05, "loss": 0.0027, "step": 2370 }, { "epoch": 0.3981597657883731, "grad_norm": 0.0508551187813282, "learning_rate": 1.9408194234096625e-05, "loss": 0.0026, "step": 2380 }, { "epoch": 0.3998327059807612, "grad_norm": 0.056066691875457764, "learning_rate": 1.939825720273294e-05, "loss": 0.0029, "step": 2390 }, { "epoch": 0.4015056461731493, "grad_norm": 0.06123988702893257, "learning_rate": 1.9388240025031356e-05, "loss": 0.0034, "step": 2400 }, { "epoch": 0.40317858636553744, "grad_norm": 0.052961982786655426, "learning_rate": 1.9378142786416232e-05, "loss": 0.0028, "step": 2410 }, { "epoch": 0.40485152655792556, "grad_norm": 0.05537119880318642, "learning_rate": 1.9367965572994667e-05, "loss": 0.0027, "step": 2420 }, { "epoch": 0.4065244667503137, "grad_norm": 0.053024958819150925, "learning_rate": 1.9357708471555772e-05, "loss": 0.0028, "step": 2430 }, { "epoch": 0.4081974069427018, "grad_norm": 0.05063009634613991, "learning_rate": 1.934737156956993e-05, "loss": 0.0026, "step": 2440 }, { "epoch": 0.4098703471350899, "grad_norm": 0.053828030824661255, "learning_rate": 1.9336954955188042e-05, "loss": 0.0027, "step": 2450 }, { "epoch": 0.411543287327478, "grad_norm": 0.08630996942520142, "learning_rate": 1.932645871724077e-05, "loss": 0.003, "step": 2460 }, { "epoch": 0.41321622751986614, "grad_norm": 0.04268857091665268, "learning_rate": 1.9315882945237808e-05, "loss": 0.0026, "step": 2470 }, { "epoch": 0.4148891677122543, "grad_norm": 0.06442458927631378, "learning_rate": 1.9305227729367088e-05, "loss": 0.0028, "step": 2480 }, { "epoch": 0.41656210790464243, "grad_norm": 0.04712328687310219, "learning_rate": 1.929449316049402e-05, "loss": 0.0027, "step": 2490 }, { "epoch": 0.41823504809703055, "grad_norm": 0.04693415388464928, "learning_rate": 1.9283679330160726e-05, "loss": 0.0031, "step": 2500 }, { "epoch": 0.41990798828941867, "grad_norm": 0.08450551331043243, "learning_rate": 1.927278633058525e-05, "loss": 0.0033, "step": 2510 }, { "epoch": 0.4215809284818068, "grad_norm": 0.06886743009090424, "learning_rate": 1.9261814254660778e-05, "loss": 0.0028, "step": 2520 }, { "epoch": 0.4232538686741949, "grad_norm": 0.09345709532499313, "learning_rate": 1.9250763195954832e-05, "loss": 0.0032, "step": 2530 }, { "epoch": 0.424926808866583, "grad_norm": 0.06122855469584465, "learning_rate": 1.92396332487085e-05, "loss": 0.0029, "step": 2540 }, { "epoch": 0.42659974905897113, "grad_norm": 0.0528654046356678, "learning_rate": 1.922842450783559e-05, "loss": 0.0029, "step": 2550 }, { "epoch": 0.42827268925135925, "grad_norm": 0.048696111887693405, "learning_rate": 1.9217137068921875e-05, "loss": 0.0028, "step": 2560 }, { "epoch": 0.42994562944374737, "grad_norm": 0.04957443103194237, "learning_rate": 1.920577102822422e-05, "loss": 0.0032, "step": 2570 }, { "epoch": 0.4316185696361355, "grad_norm": 0.05680517107248306, "learning_rate": 1.9194326482669807e-05, "loss": 0.0026, "step": 2580 }, { "epoch": 0.43329150982852366, "grad_norm": 0.04418332129716873, "learning_rate": 1.9182803529855287e-05, "loss": 0.0033, "step": 2590 }, { "epoch": 0.4349644500209118, "grad_norm": 0.07163397967815399, "learning_rate": 1.917120226804595e-05, "loss": 0.0034, "step": 2600 }, { "epoch": 0.4366373902132999, "grad_norm": 0.05478557571768761, "learning_rate": 1.9159522796174882e-05, "loss": 0.0029, "step": 2610 }, { "epoch": 0.438310330405688, "grad_norm": 0.06399893760681152, "learning_rate": 1.9147765213842145e-05, "loss": 0.0024, "step": 2620 }, { "epoch": 0.4399832705980761, "grad_norm": 0.11805685609579086, "learning_rate": 1.913592962131389e-05, "loss": 0.0028, "step": 2630 }, { "epoch": 0.44165621079046424, "grad_norm": 0.07779286801815033, "learning_rate": 1.9124016119521535e-05, "loss": 0.0035, "step": 2640 }, { "epoch": 0.44332915098285236, "grad_norm": 0.07101720571517944, "learning_rate": 1.9112024810060892e-05, "loss": 0.0029, "step": 2650 }, { "epoch": 0.4450020911752405, "grad_norm": 0.07000937312841415, "learning_rate": 1.9099955795191296e-05, "loss": 0.003, "step": 2660 }, { "epoch": 0.4466750313676286, "grad_norm": 0.038181502372026443, "learning_rate": 1.908780917783473e-05, "loss": 0.0025, "step": 2670 }, { "epoch": 0.4483479715600167, "grad_norm": 0.053493183106184006, "learning_rate": 1.907558506157498e-05, "loss": 0.0027, "step": 2680 }, { "epoch": 0.45002091175240483, "grad_norm": 0.04917820170521736, "learning_rate": 1.906328355065669e-05, "loss": 0.0029, "step": 2690 }, { "epoch": 0.451693851944793, "grad_norm": 0.0457712821662426, "learning_rate": 1.9050904749984532e-05, "loss": 0.0027, "step": 2700 }, { "epoch": 0.4533667921371811, "grad_norm": 0.04898484796285629, "learning_rate": 1.9038448765122295e-05, "loss": 0.0029, "step": 2710 }, { "epoch": 0.45503973232956924, "grad_norm": 0.05471361428499222, "learning_rate": 1.902591570229196e-05, "loss": 0.0036, "step": 2720 }, { "epoch": 0.45671267252195735, "grad_norm": 0.0681205689907074, "learning_rate": 1.9013305668372818e-05, "loss": 0.0029, "step": 2730 }, { "epoch": 0.45838561271434547, "grad_norm": 0.04269558563828468, "learning_rate": 1.900061877090056e-05, "loss": 0.0026, "step": 2740 }, { "epoch": 0.4600585529067336, "grad_norm": 0.04693697392940521, "learning_rate": 1.898785511806635e-05, "loss": 0.0027, "step": 2750 }, { "epoch": 0.4617314930991217, "grad_norm": 0.04651269689202309, "learning_rate": 1.89750148187159e-05, "loss": 0.0034, "step": 2760 }, { "epoch": 0.4634044332915098, "grad_norm": 0.05927550420165062, "learning_rate": 1.896209798234855e-05, "loss": 0.0026, "step": 2770 }, { "epoch": 0.46507737348389794, "grad_norm": 0.058397527784109116, "learning_rate": 1.8949104719116334e-05, "loss": 0.0026, "step": 2780 }, { "epoch": 0.46675031367628605, "grad_norm": 0.04576428234577179, "learning_rate": 1.8936035139823037e-05, "loss": 0.0029, "step": 2790 }, { "epoch": 0.46842325386867417, "grad_norm": 0.057682596147060394, "learning_rate": 1.8922889355923253e-05, "loss": 0.0026, "step": 2800 }, { "epoch": 0.47009619406106234, "grad_norm": 0.06394952535629272, "learning_rate": 1.8909667479521427e-05, "loss": 0.0025, "step": 2810 }, { "epoch": 0.47176913425345046, "grad_norm": 0.05278909206390381, "learning_rate": 1.889636962337091e-05, "loss": 0.003, "step": 2820 }, { "epoch": 0.4734420744458386, "grad_norm": 0.03546448051929474, "learning_rate": 1.8882995900872985e-05, "loss": 0.0022, "step": 2830 }, { "epoch": 0.4751150146382267, "grad_norm": 0.06166864186525345, "learning_rate": 1.886954642607592e-05, "loss": 0.0026, "step": 2840 }, { "epoch": 0.4767879548306148, "grad_norm": 0.05084056407213211, "learning_rate": 1.8856021313673967e-05, "loss": 0.0028, "step": 2850 }, { "epoch": 0.47846089502300293, "grad_norm": 0.07769855856895447, "learning_rate": 1.8842420679006414e-05, "loss": 0.0027, "step": 2860 }, { "epoch": 0.48013383521539105, "grad_norm": 0.06483329087495804, "learning_rate": 1.8828744638056575e-05, "loss": 0.0027, "step": 2870 }, { "epoch": 0.48180677540777916, "grad_norm": 0.058500535786151886, "learning_rate": 1.8814993307450818e-05, "loss": 0.0034, "step": 2880 }, { "epoch": 0.4834797156001673, "grad_norm": 0.057272814214229584, "learning_rate": 1.880116680445757e-05, "loss": 0.0032, "step": 2890 }, { "epoch": 0.4851526557925554, "grad_norm": 0.05039543658494949, "learning_rate": 1.87872652469863e-05, "loss": 0.0034, "step": 2900 }, { "epoch": 0.4868255959849435, "grad_norm": 0.06207288056612015, "learning_rate": 1.877328875358653e-05, "loss": 0.0026, "step": 2910 }, { "epoch": 0.4884985361773317, "grad_norm": 0.04906097799539566, "learning_rate": 1.8759237443446836e-05, "loss": 0.0023, "step": 2920 }, { "epoch": 0.4901714763697198, "grad_norm": 0.04154697060585022, "learning_rate": 1.8745111436393788e-05, "loss": 0.0023, "step": 2930 }, { "epoch": 0.4918444165621079, "grad_norm": 0.03479761257767677, "learning_rate": 1.8730910852890974e-05, "loss": 0.0034, "step": 2940 }, { "epoch": 0.49351735675449604, "grad_norm": 0.0442926287651062, "learning_rate": 1.871663581403795e-05, "loss": 0.003, "step": 2950 }, { "epoch": 0.49519029694688416, "grad_norm": 0.06912216544151306, "learning_rate": 1.8702286441569206e-05, "loss": 0.0032, "step": 2960 }, { "epoch": 0.4968632371392723, "grad_norm": 0.058648865669965744, "learning_rate": 1.8687862857853143e-05, "loss": 0.0028, "step": 2970 }, { "epoch": 0.4985361773316604, "grad_norm": 0.07963908463716507, "learning_rate": 1.867336518589101e-05, "loss": 0.0037, "step": 2980 }, { "epoch": 0.5002091175240485, "grad_norm": 0.057127002626657486, "learning_rate": 1.865879354931587e-05, "loss": 0.0025, "step": 2990 }, { "epoch": 0.5018820577164367, "grad_norm": 0.04185406491160393, "learning_rate": 1.864414807239154e-05, "loss": 0.003, "step": 3000 }, { "epoch": 0.5035549979088247, "grad_norm": 0.0446491576731205, "learning_rate": 1.862942888001153e-05, "loss": 0.0027, "step": 3010 }, { "epoch": 0.5052279381012129, "grad_norm": 0.07425040751695633, "learning_rate": 1.8614636097697984e-05, "loss": 0.0028, "step": 3020 }, { "epoch": 0.506900878293601, "grad_norm": 0.044719237834215164, "learning_rate": 1.85997698516006e-05, "loss": 0.0024, "step": 3030 }, { "epoch": 0.5085738184859891, "grad_norm": 0.034016042947769165, "learning_rate": 1.8584830268495564e-05, "loss": 0.0022, "step": 3040 }, { "epoch": 0.5102467586783772, "grad_norm": 0.04879382997751236, "learning_rate": 1.856981747578446e-05, "loss": 0.0025, "step": 3050 }, { "epoch": 0.5119196988707654, "grad_norm": 0.06697077304124832, "learning_rate": 1.855473160149319e-05, "loss": 0.003, "step": 3060 }, { "epoch": 0.5135926390631534, "grad_norm": 0.06290528178215027, "learning_rate": 1.853957277427088e-05, "loss": 0.0025, "step": 3070 }, { "epoch": 0.5152655792555416, "grad_norm": 0.0789925828576088, "learning_rate": 1.852434112338879e-05, "loss": 0.0027, "step": 3080 }, { "epoch": 0.5169385194479298, "grad_norm": 0.049853019416332245, "learning_rate": 1.8509036778739188e-05, "loss": 0.0023, "step": 3090 }, { "epoch": 0.5186114596403179, "grad_norm": 0.06557432562112808, "learning_rate": 1.8493659870834278e-05, "loss": 0.0023, "step": 3100 }, { "epoch": 0.520284399832706, "grad_norm": 0.03741566464304924, "learning_rate": 1.847821053080505e-05, "loss": 0.0027, "step": 3110 }, { "epoch": 0.5219573400250941, "grad_norm": 0.054189976304769516, "learning_rate": 1.8462688890400196e-05, "loss": 0.0029, "step": 3120 }, { "epoch": 0.5236302802174823, "grad_norm": 0.05733225867152214, "learning_rate": 1.8447095081984964e-05, "loss": 0.0027, "step": 3130 }, { "epoch": 0.5253032204098703, "grad_norm": 0.06136534363031387, "learning_rate": 1.843142923854003e-05, "loss": 0.0026, "step": 3140 }, { "epoch": 0.5269761606022585, "grad_norm": 0.05807511880993843, "learning_rate": 1.8415691493660372e-05, "loss": 0.0025, "step": 3150 }, { "epoch": 0.5286491007946466, "grad_norm": 0.05741124972701073, "learning_rate": 1.8399881981554137e-05, "loss": 0.0032, "step": 3160 }, { "epoch": 0.5303220409870347, "grad_norm": 0.058726195245981216, "learning_rate": 1.8384000837041478e-05, "loss": 0.0027, "step": 3170 }, { "epoch": 0.5319949811794228, "grad_norm": 0.05113428831100464, "learning_rate": 1.8368048195553415e-05, "loss": 0.0022, "step": 3180 }, { "epoch": 0.533667921371811, "grad_norm": 0.044133204966783524, "learning_rate": 1.8352024193130677e-05, "loss": 0.0021, "step": 3190 }, { "epoch": 0.5353408615641991, "grad_norm": 0.039649948477745056, "learning_rate": 1.8335928966422555e-05, "loss": 0.0024, "step": 3200 }, { "epoch": 0.5370138017565872, "grad_norm": 0.0497254878282547, "learning_rate": 1.8319762652685716e-05, "loss": 0.0031, "step": 3210 }, { "epoch": 0.5386867419489754, "grad_norm": 0.05728726461529732, "learning_rate": 1.8303525389783045e-05, "loss": 0.0026, "step": 3220 }, { "epoch": 0.5403596821413634, "grad_norm": 0.0611911416053772, "learning_rate": 1.828721731618246e-05, "loss": 0.003, "step": 3230 }, { "epoch": 0.5420326223337516, "grad_norm": 0.043818481266498566, "learning_rate": 1.8270838570955747e-05, "loss": 0.0024, "step": 3240 }, { "epoch": 0.5437055625261397, "grad_norm": 0.03561480715870857, "learning_rate": 1.825438929377736e-05, "loss": 0.0026, "step": 3250 }, { "epoch": 0.5453785027185278, "grad_norm": 0.048015352338552475, "learning_rate": 1.8237869624923238e-05, "loss": 0.0025, "step": 3260 }, { "epoch": 0.5470514429109159, "grad_norm": 0.07016065716743469, "learning_rate": 1.8221279705269597e-05, "loss": 0.0027, "step": 3270 }, { "epoch": 0.5487243831033041, "grad_norm": 0.040604934096336365, "learning_rate": 1.8204619676291746e-05, "loss": 0.0028, "step": 3280 }, { "epoch": 0.5503973232956921, "grad_norm": 0.034456051886081696, "learning_rate": 1.8187889680062864e-05, "loss": 0.0021, "step": 3290 }, { "epoch": 0.5520702634880803, "grad_norm": 0.03523964062333107, "learning_rate": 1.8171089859252803e-05, "loss": 0.0024, "step": 3300 }, { "epoch": 0.5537432036804685, "grad_norm": 0.042577095329761505, "learning_rate": 1.8154220357126862e-05, "loss": 0.0023, "step": 3310 }, { "epoch": 0.5554161438728565, "grad_norm": 0.05949290096759796, "learning_rate": 1.8137281317544562e-05, "loss": 0.0024, "step": 3320 }, { "epoch": 0.5570890840652447, "grad_norm": 0.0492510050535202, "learning_rate": 1.812027288495843e-05, "loss": 0.0024, "step": 3330 }, { "epoch": 0.5587620242576328, "grad_norm": 0.0665510818362236, "learning_rate": 1.8103195204412763e-05, "loss": 0.0026, "step": 3340 }, { "epoch": 0.560434964450021, "grad_norm": 0.045212406665086746, "learning_rate": 1.8086048421542383e-05, "loss": 0.0033, "step": 3350 }, { "epoch": 0.562107904642409, "grad_norm": 0.03701690956950188, "learning_rate": 1.806883268257141e-05, "loss": 0.0024, "step": 3360 }, { "epoch": 0.5637808448347972, "grad_norm": 0.04281214624643326, "learning_rate": 1.8051548134312e-05, "loss": 0.0025, "step": 3370 }, { "epoch": 0.5654537850271852, "grad_norm": 0.05213675647974014, "learning_rate": 1.8034194924163105e-05, "loss": 0.0023, "step": 3380 }, { "epoch": 0.5671267252195734, "grad_norm": 0.05859183147549629, "learning_rate": 1.801677320010921e-05, "loss": 0.0022, "step": 3390 }, { "epoch": 0.5687996654119615, "grad_norm": 0.034795377403497696, "learning_rate": 1.799928311071907e-05, "loss": 0.0022, "step": 3400 }, { "epoch": 0.5704726056043496, "grad_norm": 0.05178803950548172, "learning_rate": 1.7981724805144444e-05, "loss": 0.0025, "step": 3410 }, { "epoch": 0.5721455457967378, "grad_norm": 0.04766694828867912, "learning_rate": 1.7964098433118833e-05, "loss": 0.0022, "step": 3420 }, { "epoch": 0.5738184859891259, "grad_norm": 0.04999823123216629, "learning_rate": 1.7946404144956183e-05, "loss": 0.0028, "step": 3430 }, { "epoch": 0.575491426181514, "grad_norm": 0.039944496005773544, "learning_rate": 1.7928642091549616e-05, "loss": 0.0024, "step": 3440 }, { "epoch": 0.5771643663739021, "grad_norm": 0.041734714061021805, "learning_rate": 1.7910812424370148e-05, "loss": 0.0023, "step": 3450 }, { "epoch": 0.5788373065662903, "grad_norm": 0.04880327731370926, "learning_rate": 1.7892915295465387e-05, "loss": 0.0026, "step": 3460 }, { "epoch": 0.5805102467586783, "grad_norm": 0.031567804515361786, "learning_rate": 1.7874950857458243e-05, "loss": 0.0024, "step": 3470 }, { "epoch": 0.5821831869510665, "grad_norm": 0.04270700365304947, "learning_rate": 1.7856919263545615e-05, "loss": 0.0022, "step": 3480 }, { "epoch": 0.5838561271434546, "grad_norm": 0.04917534068226814, "learning_rate": 1.7838820667497108e-05, "loss": 0.0027, "step": 3490 }, { "epoch": 0.5855290673358428, "grad_norm": 0.04890109598636627, "learning_rate": 1.782065522365369e-05, "loss": 0.0023, "step": 3500 }, { "epoch": 0.5872020075282308, "grad_norm": 0.03596416488289833, "learning_rate": 1.780242308692641e-05, "loss": 0.0026, "step": 3510 }, { "epoch": 0.588874947720619, "grad_norm": 0.04228467866778374, "learning_rate": 1.7784124412795046e-05, "loss": 0.0023, "step": 3520 }, { "epoch": 0.5905478879130072, "grad_norm": 0.050580546259880066, "learning_rate": 1.7765759357306795e-05, "loss": 0.0029, "step": 3530 }, { "epoch": 0.5922208281053952, "grad_norm": 0.04836999252438545, "learning_rate": 1.7747328077074945e-05, "loss": 0.0025, "step": 3540 }, { "epoch": 0.5938937682977834, "grad_norm": 0.07261595875024796, "learning_rate": 1.772883072927754e-05, "loss": 0.0026, "step": 3550 }, { "epoch": 0.5955667084901715, "grad_norm": 0.059777967631816864, "learning_rate": 1.7710267471656014e-05, "loss": 0.0024, "step": 3560 }, { "epoch": 0.5972396486825596, "grad_norm": 0.06525000929832458, "learning_rate": 1.769163846251389e-05, "loss": 0.003, "step": 3570 }, { "epoch": 0.5989125888749477, "grad_norm": 0.04377102479338646, "learning_rate": 1.7672943860715385e-05, "loss": 0.0028, "step": 3580 }, { "epoch": 0.6005855290673359, "grad_norm": 0.041901711374521255, "learning_rate": 1.7654183825684093e-05, "loss": 0.0026, "step": 3590 }, { "epoch": 0.6022584692597239, "grad_norm": 0.06845422089099884, "learning_rate": 1.7635358517401594e-05, "loss": 0.0027, "step": 3600 }, { "epoch": 0.6039314094521121, "grad_norm": 0.047201305627822876, "learning_rate": 1.7616468096406124e-05, "loss": 0.0027, "step": 3610 }, { "epoch": 0.6056043496445002, "grad_norm": 0.04882335290312767, "learning_rate": 1.7597512723791163e-05, "loss": 0.0028, "step": 3620 }, { "epoch": 0.6072772898368883, "grad_norm": 0.039089955389499664, "learning_rate": 1.75784925612041e-05, "loss": 0.002, "step": 3630 }, { "epoch": 0.6089502300292765, "grad_norm": 0.04400509223341942, "learning_rate": 1.7559407770844833e-05, "loss": 0.0023, "step": 3640 }, { "epoch": 0.6106231702216646, "grad_norm": 0.06384477019309998, "learning_rate": 1.7540258515464398e-05, "loss": 0.0021, "step": 3650 }, { "epoch": 0.6122961104140527, "grad_norm": 0.028682034462690353, "learning_rate": 1.7521044958363567e-05, "loss": 0.0025, "step": 3660 }, { "epoch": 0.6139690506064408, "grad_norm": 0.03143743798136711, "learning_rate": 1.750176726339147e-05, "loss": 0.0025, "step": 3670 }, { "epoch": 0.615641990798829, "grad_norm": 0.04500582441687584, "learning_rate": 1.7482425594944184e-05, "loss": 0.0022, "step": 3680 }, { "epoch": 0.617314930991217, "grad_norm": 0.030934732407331467, "learning_rate": 1.7463020117963344e-05, "loss": 0.0024, "step": 3690 }, { "epoch": 0.6189878711836052, "grad_norm": 0.044432759284973145, "learning_rate": 1.7443550997934733e-05, "loss": 0.0022, "step": 3700 }, { "epoch": 0.6206608113759933, "grad_norm": 0.04397678002715111, "learning_rate": 1.742401840088686e-05, "loss": 0.0025, "step": 3710 }, { "epoch": 0.6223337515683814, "grad_norm": 0.05255619063973427, "learning_rate": 1.740442249338956e-05, "loss": 0.0022, "step": 3720 }, { "epoch": 0.6240066917607695, "grad_norm": 0.051583826541900635, "learning_rate": 1.738476344255256e-05, "loss": 0.0026, "step": 3730 }, { "epoch": 0.6256796319531577, "grad_norm": 0.07572329789400101, "learning_rate": 1.7365041416024067e-05, "loss": 0.0024, "step": 3740 }, { "epoch": 0.6273525721455459, "grad_norm": 0.05568915978074074, "learning_rate": 1.7345256581989322e-05, "loss": 0.0025, "step": 3750 }, { "epoch": 0.6290255123379339, "grad_norm": 0.04794057458639145, "learning_rate": 1.7325409109169184e-05, "loss": 0.0024, "step": 3760 }, { "epoch": 0.6306984525303221, "grad_norm": 0.0429316982626915, "learning_rate": 1.730549916681868e-05, "loss": 0.0022, "step": 3770 }, { "epoch": 0.6323713927227101, "grad_norm": 0.04290391504764557, "learning_rate": 1.728552692472556e-05, "loss": 0.002, "step": 3780 }, { "epoch": 0.6340443329150983, "grad_norm": 0.03937142342329025, "learning_rate": 1.7265492553208853e-05, "loss": 0.0023, "step": 3790 }, { "epoch": 0.6357172731074864, "grad_norm": 0.0411640964448452, "learning_rate": 1.724539622311742e-05, "loss": 0.0023, "step": 3800 }, { "epoch": 0.6373902132998746, "grad_norm": 0.05297888442873955, "learning_rate": 1.722523810582849e-05, "loss": 0.0022, "step": 3810 }, { "epoch": 0.6390631534922626, "grad_norm": 0.05264117568731308, "learning_rate": 1.7205018373246194e-05, "loss": 0.0018, "step": 3820 }, { "epoch": 0.6407360936846508, "grad_norm": 0.041034262627363205, "learning_rate": 1.7184737197800116e-05, "loss": 0.0021, "step": 3830 }, { "epoch": 0.6424090338770388, "grad_norm": 0.04253237321972847, "learning_rate": 1.7164394752443797e-05, "loss": 0.0023, "step": 3840 }, { "epoch": 0.644081974069427, "grad_norm": 0.04876425489783287, "learning_rate": 1.714399121065329e-05, "loss": 0.0021, "step": 3850 }, { "epoch": 0.6457549142618152, "grad_norm": 0.07671269029378891, "learning_rate": 1.712352674642565e-05, "loss": 0.0021, "step": 3860 }, { "epoch": 0.6474278544542033, "grad_norm": 0.06842278689146042, "learning_rate": 1.7103001534277488e-05, "loss": 0.0019, "step": 3870 }, { "epoch": 0.6491007946465914, "grad_norm": 0.03893548622727394, "learning_rate": 1.7082415749243436e-05, "loss": 0.0019, "step": 3880 }, { "epoch": 0.6507737348389795, "grad_norm": 0.031787365674972534, "learning_rate": 1.706176956687469e-05, "loss": 0.002, "step": 3890 }, { "epoch": 0.6524466750313677, "grad_norm": 0.05576930195093155, "learning_rate": 1.70410631632375e-05, "loss": 0.0023, "step": 3900 }, { "epoch": 0.6541196152237557, "grad_norm": 0.05747951567173004, "learning_rate": 1.7020296714911677e-05, "loss": 0.0021, "step": 3910 }, { "epoch": 0.6557925554161439, "grad_norm": 0.05123858526349068, "learning_rate": 1.699947039898907e-05, "loss": 0.0024, "step": 3920 }, { "epoch": 0.657465495608532, "grad_norm": 0.06327506899833679, "learning_rate": 1.6978584393072063e-05, "loss": 0.0025, "step": 3930 }, { "epoch": 0.6591384358009201, "grad_norm": 0.04733841493725777, "learning_rate": 1.6957638875272086e-05, "loss": 0.0023, "step": 3940 }, { "epoch": 0.6608113759933082, "grad_norm": 0.05361257493495941, "learning_rate": 1.693663402420805e-05, "loss": 0.0022, "step": 3950 }, { "epoch": 0.6624843161856964, "grad_norm": 0.0419698990881443, "learning_rate": 1.6915570019004854e-05, "loss": 0.0021, "step": 3960 }, { "epoch": 0.6641572563780845, "grad_norm": 0.033510979264974594, "learning_rate": 1.6894447039291853e-05, "loss": 0.0021, "step": 3970 }, { "epoch": 0.6658301965704726, "grad_norm": 0.042343463748693466, "learning_rate": 1.687326526520133e-05, "loss": 0.0021, "step": 3980 }, { "epoch": 0.6675031367628608, "grad_norm": 0.04558710381388664, "learning_rate": 1.6852024877366945e-05, "loss": 0.002, "step": 3990 }, { "epoch": 0.6691760769552488, "grad_norm": 0.0489596351981163, "learning_rate": 1.6830726056922206e-05, "loss": 0.0028, "step": 4000 }, { "epoch": 0.670849017147637, "grad_norm": 0.06211019307374954, "learning_rate": 1.680936898549892e-05, "loss": 0.0025, "step": 4010 }, { "epoch": 0.6725219573400251, "grad_norm": 0.038762129843235016, "learning_rate": 1.678795384522565e-05, "loss": 0.0029, "step": 4020 }, { "epoch": 0.6741948975324132, "grad_norm": 0.05437132343649864, "learning_rate": 1.676648081872616e-05, "loss": 0.0019, "step": 4030 }, { "epoch": 0.6758678377248013, "grad_norm": 0.06182729825377464, "learning_rate": 1.6744950089117847e-05, "loss": 0.0031, "step": 4040 }, { "epoch": 0.6775407779171895, "grad_norm": 0.13201574981212616, "learning_rate": 1.6723361840010194e-05, "loss": 0.0022, "step": 4050 }, { "epoch": 0.6792137181095775, "grad_norm": 0.042076628655195236, "learning_rate": 1.6701716255503193e-05, "loss": 0.0021, "step": 4060 }, { "epoch": 0.6808866583019657, "grad_norm": 0.04640534520149231, "learning_rate": 1.6680013520185788e-05, "loss": 0.0023, "step": 4070 }, { "epoch": 0.6825595984943539, "grad_norm": 0.06874120235443115, "learning_rate": 1.6658253819134288e-05, "loss": 0.0021, "step": 4080 }, { "epoch": 0.6842325386867419, "grad_norm": 0.059104256331920624, "learning_rate": 1.663643733791079e-05, "loss": 0.0023, "step": 4090 }, { "epoch": 0.6859054788791301, "grad_norm": 0.044120050966739655, "learning_rate": 1.661456426256161e-05, "loss": 0.0024, "step": 4100 }, { "epoch": 0.6875784190715182, "grad_norm": 0.04526931047439575, "learning_rate": 1.6592634779615678e-05, "loss": 0.0024, "step": 4110 }, { "epoch": 0.6892513592639063, "grad_norm": 0.02994440123438835, "learning_rate": 1.6570649076082957e-05, "loss": 0.002, "step": 4120 }, { "epoch": 0.6909242994562944, "grad_norm": 0.0492926687002182, "learning_rate": 1.6548607339452853e-05, "loss": 0.0025, "step": 4130 }, { "epoch": 0.6925972396486826, "grad_norm": 0.03923075646162033, "learning_rate": 1.6526509757692604e-05, "loss": 0.003, "step": 4140 }, { "epoch": 0.6942701798410706, "grad_norm": 0.05586059018969536, "learning_rate": 1.6504356519245685e-05, "loss": 0.0022, "step": 4150 }, { "epoch": 0.6959431200334588, "grad_norm": 0.037069130688905716, "learning_rate": 1.6482147813030203e-05, "loss": 0.0024, "step": 4160 }, { "epoch": 0.6976160602258469, "grad_norm": 0.043507181107997894, "learning_rate": 1.645988382843728e-05, "loss": 0.0022, "step": 4170 }, { "epoch": 0.699289000418235, "grad_norm": 0.05088340491056442, "learning_rate": 1.643756475532944e-05, "loss": 0.0021, "step": 4180 }, { "epoch": 0.7009619406106232, "grad_norm": 0.06906166672706604, "learning_rate": 1.6415190784038983e-05, "loss": 0.0023, "step": 4190 }, { "epoch": 0.7026348808030113, "grad_norm": 0.04218286648392677, "learning_rate": 1.6392762105366385e-05, "loss": 0.0019, "step": 4200 }, { "epoch": 0.7043078209953995, "grad_norm": 0.03700464963912964, "learning_rate": 1.6370278910578644e-05, "loss": 0.002, "step": 4210 }, { "epoch": 0.7059807611877875, "grad_norm": 0.0531776063144207, "learning_rate": 1.6347741391407655e-05, "loss": 0.0028, "step": 4220 }, { "epoch": 0.7076537013801757, "grad_norm": 0.06566280126571655, "learning_rate": 1.6325149740048594e-05, "loss": 0.0029, "step": 4230 }, { "epoch": 0.7093266415725638, "grad_norm": 0.03531255945563316, "learning_rate": 1.6302504149158256e-05, "loss": 0.0022, "step": 4240 }, { "epoch": 0.7109995817649519, "grad_norm": 0.04077228158712387, "learning_rate": 1.627980481185342e-05, "loss": 0.0021, "step": 4250 }, { "epoch": 0.71267252195734, "grad_norm": 0.15944042801856995, "learning_rate": 1.6257051921709205e-05, "loss": 0.0023, "step": 4260 }, { "epoch": 0.7143454621497282, "grad_norm": 0.04040149971842766, "learning_rate": 1.623424567275742e-05, "loss": 0.0019, "step": 4270 }, { "epoch": 0.7160184023421162, "grad_norm": 0.030274273827672005, "learning_rate": 1.62113862594849e-05, "loss": 0.0026, "step": 4280 }, { "epoch": 0.7176913425345044, "grad_norm": 0.037320125848054886, "learning_rate": 1.6188473876831863e-05, "loss": 0.0024, "step": 4290 }, { "epoch": 0.7193642827268926, "grad_norm": 0.051040168851614, "learning_rate": 1.616550872019023e-05, "loss": 0.0025, "step": 4300 }, { "epoch": 0.7210372229192806, "grad_norm": 0.05426260828971863, "learning_rate": 1.6142490985401972e-05, "loss": 0.002, "step": 4310 }, { "epoch": 0.7227101631116688, "grad_norm": 0.04254674166440964, "learning_rate": 1.6119420868757433e-05, "loss": 0.0021, "step": 4320 }, { "epoch": 0.7243831033040569, "grad_norm": 0.03362365439534187, "learning_rate": 1.609629856699366e-05, "loss": 0.0024, "step": 4330 }, { "epoch": 0.726056043496445, "grad_norm": 0.0613832026720047, "learning_rate": 1.6073124277292728e-05, "loss": 0.002, "step": 4340 }, { "epoch": 0.7277289836888331, "grad_norm": 0.028343969956040382, "learning_rate": 1.604989819728004e-05, "loss": 0.0025, "step": 4350 }, { "epoch": 0.7294019238812213, "grad_norm": 0.07086797803640366, "learning_rate": 1.602662052502267e-05, "loss": 0.0022, "step": 4360 }, { "epoch": 0.7310748640736093, "grad_norm": 0.03145402669906616, "learning_rate": 1.6003291459027654e-05, "loss": 0.0022, "step": 4370 }, { "epoch": 0.7327478042659975, "grad_norm": 0.0429244339466095, "learning_rate": 1.597991119824031e-05, "loss": 0.0025, "step": 4380 }, { "epoch": 0.7344207444583856, "grad_norm": 0.055743567645549774, "learning_rate": 1.5956479942042517e-05, "loss": 0.0023, "step": 4390 }, { "epoch": 0.7360936846507737, "grad_norm": 0.05092700943350792, "learning_rate": 1.593299789025104e-05, "loss": 0.0024, "step": 4400 }, { "epoch": 0.7377666248431619, "grad_norm": 0.041956719011068344, "learning_rate": 1.5909465243115835e-05, "loss": 0.0021, "step": 4410 }, { "epoch": 0.73943956503555, "grad_norm": 0.03763611987233162, "learning_rate": 1.5885882201318302e-05, "loss": 0.0021, "step": 4420 }, { "epoch": 0.7411125052279381, "grad_norm": 0.04014522209763527, "learning_rate": 1.5862248965969604e-05, "loss": 0.0022, "step": 4430 }, { "epoch": 0.7427854454203262, "grad_norm": 0.03660379722714424, "learning_rate": 1.5838565738608948e-05, "loss": 0.0029, "step": 4440 }, { "epoch": 0.7444583856127144, "grad_norm": 0.038071878254413605, "learning_rate": 1.5814832721201853e-05, "loss": 0.0019, "step": 4450 }, { "epoch": 0.7461313258051024, "grad_norm": 0.0340408980846405, "learning_rate": 1.579105011613844e-05, "loss": 0.0028, "step": 4460 }, { "epoch": 0.7478042659974906, "grad_norm": 0.04271842539310455, "learning_rate": 1.5767218126231717e-05, "loss": 0.0021, "step": 4470 }, { "epoch": 0.7494772061898787, "grad_norm": 0.0902421623468399, "learning_rate": 1.574333695471581e-05, "loss": 0.002, "step": 4480 }, { "epoch": 0.7511501463822668, "grad_norm": 0.04056239128112793, "learning_rate": 1.5719406805244276e-05, "loss": 0.002, "step": 4490 }, { "epoch": 0.7528230865746549, "grad_norm": 0.05462808534502983, "learning_rate": 1.5695427881888335e-05, "loss": 0.0022, "step": 4500 }, { "epoch": 0.7544960267670431, "grad_norm": 0.05215197801589966, "learning_rate": 1.5671400389135153e-05, "loss": 0.0021, "step": 4510 }, { "epoch": 0.7561689669594313, "grad_norm": 0.057196639478206635, "learning_rate": 1.5647324531886066e-05, "loss": 0.0019, "step": 4520 }, { "epoch": 0.7578419071518193, "grad_norm": 0.04054103419184685, "learning_rate": 1.5623200515454863e-05, "loss": 0.0022, "step": 4530 }, { "epoch": 0.7595148473442075, "grad_norm": 0.025195389986038208, "learning_rate": 1.5599028545566028e-05, "loss": 0.0023, "step": 4540 }, { "epoch": 0.7611877875365956, "grad_norm": 0.05260899290442467, "learning_rate": 1.5574808828352978e-05, "loss": 0.0021, "step": 4550 }, { "epoch": 0.7628607277289837, "grad_norm": 0.043028589338064194, "learning_rate": 1.55505415703563e-05, "loss": 0.0019, "step": 4560 }, { "epoch": 0.7645336679213718, "grad_norm": 0.036908626556396484, "learning_rate": 1.552622697852202e-05, "loss": 0.0018, "step": 4570 }, { "epoch": 0.76620660811376, "grad_norm": 0.052433595061302185, "learning_rate": 1.5501865260199796e-05, "loss": 0.0019, "step": 4580 }, { "epoch": 0.767879548306148, "grad_norm": 0.03188903257250786, "learning_rate": 1.547745662314118e-05, "loss": 0.0021, "step": 4590 }, { "epoch": 0.7695524884985362, "grad_norm": 0.06399247795343399, "learning_rate": 1.5453001275497834e-05, "loss": 0.0022, "step": 4600 }, { "epoch": 0.7712254286909243, "grad_norm": 0.04007547348737717, "learning_rate": 1.5428499425819767e-05, "loss": 0.0019, "step": 4610 }, { "epoch": 0.7728983688833124, "grad_norm": 0.03243298456072807, "learning_rate": 1.5403951283053528e-05, "loss": 0.0018, "step": 4620 }, { "epoch": 0.7745713090757006, "grad_norm": 0.04403265565633774, "learning_rate": 1.5379357056540465e-05, "loss": 0.0018, "step": 4630 }, { "epoch": 0.7762442492680887, "grad_norm": 0.08717833459377289, "learning_rate": 1.535471695601491e-05, "loss": 0.0018, "step": 4640 }, { "epoch": 0.7779171894604768, "grad_norm": 0.03721160441637039, "learning_rate": 1.5330031191602395e-05, "loss": 0.002, "step": 4650 }, { "epoch": 0.7795901296528649, "grad_norm": 0.0505303256213665, "learning_rate": 1.5305299973817863e-05, "loss": 0.0022, "step": 4660 }, { "epoch": 0.7812630698452531, "grad_norm": 0.04627244174480438, "learning_rate": 1.5280523513563886e-05, "loss": 0.0018, "step": 4670 }, { "epoch": 0.7829360100376411, "grad_norm": 0.042483508586883545, "learning_rate": 1.525570202212884e-05, "loss": 0.0021, "step": 4680 }, { "epoch": 0.7846089502300293, "grad_norm": 0.047481536865234375, "learning_rate": 1.5230835711185121e-05, "loss": 0.002, "step": 4690 }, { "epoch": 0.7862818904224174, "grad_norm": 0.035280562937259674, "learning_rate": 1.5205924792787345e-05, "loss": 0.0021, "step": 4700 }, { "epoch": 0.7879548306148055, "grad_norm": 0.07199792563915253, "learning_rate": 1.518096947937052e-05, "loss": 0.0019, "step": 4710 }, { "epoch": 0.7896277708071936, "grad_norm": 0.053782444447278976, "learning_rate": 1.5155969983748251e-05, "loss": 0.003, "step": 4720 }, { "epoch": 0.7913007109995818, "grad_norm": 0.030725333839654922, "learning_rate": 1.5130926519110915e-05, "loss": 0.0017, "step": 4730 }, { "epoch": 0.7929736511919699, "grad_norm": 0.04464433342218399, "learning_rate": 1.5105839299023852e-05, "loss": 0.0018, "step": 4740 }, { "epoch": 0.794646591384358, "grad_norm": 0.03293019160628319, "learning_rate": 1.5080708537425542e-05, "loss": 0.0017, "step": 4750 }, { "epoch": 0.7963195315767462, "grad_norm": 0.04540516436100006, "learning_rate": 1.5055534448625766e-05, "loss": 0.002, "step": 4760 }, { "epoch": 0.7979924717691342, "grad_norm": 0.034682005643844604, "learning_rate": 1.50303172473038e-05, "loss": 0.0018, "step": 4770 }, { "epoch": 0.7996654119615224, "grad_norm": 0.040400758385658264, "learning_rate": 1.5005057148506574e-05, "loss": 0.0021, "step": 4780 }, { "epoch": 0.8013383521539105, "grad_norm": 0.05537265166640282, "learning_rate": 1.4979754367646835e-05, "loss": 0.0021, "step": 4790 }, { "epoch": 0.8030112923462986, "grad_norm": 0.04965697228908539, "learning_rate": 1.4954409120501313e-05, "loss": 0.0025, "step": 4800 }, { "epoch": 0.8046842325386867, "grad_norm": 0.034185994416475296, "learning_rate": 1.4929021623208885e-05, "loss": 0.0021, "step": 4810 }, { "epoch": 0.8063571727310749, "grad_norm": 0.04387965425848961, "learning_rate": 1.4903592092268728e-05, "loss": 0.0021, "step": 4820 }, { "epoch": 0.8080301129234629, "grad_norm": 0.0392509289085865, "learning_rate": 1.487812074453847e-05, "loss": 0.0022, "step": 4830 }, { "epoch": 0.8097030531158511, "grad_norm": 0.04664000868797302, "learning_rate": 1.4852607797232343e-05, "loss": 0.0018, "step": 4840 }, { "epoch": 0.8113759933082393, "grad_norm": 0.060999516397714615, "learning_rate": 1.482705346791934e-05, "loss": 0.0018, "step": 4850 }, { "epoch": 0.8130489335006273, "grad_norm": 0.03260907530784607, "learning_rate": 1.4801457974521336e-05, "loss": 0.002, "step": 4860 }, { "epoch": 0.8147218736930155, "grad_norm": 0.051893096417188644, "learning_rate": 1.477582153531126e-05, "loss": 0.0027, "step": 4870 }, { "epoch": 0.8163948138854036, "grad_norm": 0.027669653296470642, "learning_rate": 1.475014436891121e-05, "loss": 0.0023, "step": 4880 }, { "epoch": 0.8180677540777918, "grad_norm": 0.052082087844610214, "learning_rate": 1.4724426694290596e-05, "loss": 0.0022, "step": 4890 }, { "epoch": 0.8197406942701798, "grad_norm": 0.0576699897646904, "learning_rate": 1.4698668730764278e-05, "loss": 0.002, "step": 4900 }, { "epoch": 0.821413634462568, "grad_norm": 0.04373374581336975, "learning_rate": 1.4672870697990686e-05, "loss": 0.002, "step": 4910 }, { "epoch": 0.823086574654956, "grad_norm": 0.05360044538974762, "learning_rate": 1.4647032815969957e-05, "loss": 0.0022, "step": 4920 }, { "epoch": 0.8247595148473442, "grad_norm": 0.04683142527937889, "learning_rate": 1.4621155305042053e-05, "loss": 0.0025, "step": 4930 }, { "epoch": 0.8264324550397323, "grad_norm": 0.05963839590549469, "learning_rate": 1.459523838588488e-05, "loss": 0.0023, "step": 4940 }, { "epoch": 0.8281053952321205, "grad_norm": 0.037882860749959946, "learning_rate": 1.4569282279512406e-05, "loss": 0.0021, "step": 4950 }, { "epoch": 0.8297783354245086, "grad_norm": 0.04986179992556572, "learning_rate": 1.4543287207272792e-05, "loss": 0.0021, "step": 4960 }, { "epoch": 0.8314512756168967, "grad_norm": 0.05136257782578468, "learning_rate": 1.4517253390846481e-05, "loss": 0.0019, "step": 4970 }, { "epoch": 0.8331242158092849, "grad_norm": 0.043505389243364334, "learning_rate": 1.4491181052244317e-05, "loss": 0.0019, "step": 4980 }, { "epoch": 0.8347971560016729, "grad_norm": 0.04339772090315819, "learning_rate": 1.4465070413805657e-05, "loss": 0.0022, "step": 4990 }, { "epoch": 0.8364700961940611, "grad_norm": 0.039579328149557114, "learning_rate": 1.4438921698196477e-05, "loss": 0.0026, "step": 5000 }, { "epoch": 0.8381430363864492, "grad_norm": 0.0351238027215004, "learning_rate": 1.4412735128407453e-05, "loss": 0.0023, "step": 5010 }, { "epoch": 0.8398159765788373, "grad_norm": 0.03863988071680069, "learning_rate": 1.4386510927752086e-05, "loss": 0.0034, "step": 5020 }, { "epoch": 0.8414889167712254, "grad_norm": 0.041700851172208786, "learning_rate": 1.4360249319864777e-05, "loss": 0.0021, "step": 5030 }, { "epoch": 0.8431618569636136, "grad_norm": 0.05484633147716522, "learning_rate": 1.433395052869893e-05, "loss": 0.0018, "step": 5040 }, { "epoch": 0.8448347971560016, "grad_norm": 0.04248538240790367, "learning_rate": 1.4307614778525043e-05, "loss": 0.0019, "step": 5050 }, { "epoch": 0.8465077373483898, "grad_norm": 0.049268871545791626, "learning_rate": 1.428124229392879e-05, "loss": 0.0019, "step": 5060 }, { "epoch": 0.848180677540778, "grad_norm": 0.024322429671883583, "learning_rate": 1.425483329980911e-05, "loss": 0.0019, "step": 5070 }, { "epoch": 0.849853617733166, "grad_norm": 0.040467243641614914, "learning_rate": 1.4228388021376282e-05, "loss": 0.0017, "step": 5080 }, { "epoch": 0.8515265579255542, "grad_norm": 0.06459440290927887, "learning_rate": 1.420190668415002e-05, "loss": 0.0021, "step": 5090 }, { "epoch": 0.8531994981179423, "grad_norm": 0.03809395432472229, "learning_rate": 1.4175389513957529e-05, "loss": 0.0018, "step": 5100 }, { "epoch": 0.8548724383103304, "grad_norm": 0.054389119148254395, "learning_rate": 1.4148836736931598e-05, "loss": 0.0019, "step": 5110 }, { "epoch": 0.8565453785027185, "grad_norm": 0.039873525500297546, "learning_rate": 1.4122248579508658e-05, "loss": 0.002, "step": 5120 }, { "epoch": 0.8582183186951067, "grad_norm": 0.07599814236164093, "learning_rate": 1.4095625268426856e-05, "loss": 0.0017, "step": 5130 }, { "epoch": 0.8598912588874947, "grad_norm": 0.05019741132855415, "learning_rate": 1.4068967030724131e-05, "loss": 0.0017, "step": 5140 }, { "epoch": 0.8615641990798829, "grad_norm": 0.03371502831578255, "learning_rate": 1.4042274093736259e-05, "loss": 0.0018, "step": 5150 }, { "epoch": 0.863237139272271, "grad_norm": 0.04091212898492813, "learning_rate": 1.401554668509493e-05, "loss": 0.0016, "step": 5160 }, { "epoch": 0.8649100794646591, "grad_norm": 0.02863364852964878, "learning_rate": 1.3988785032725795e-05, "loss": 0.0019, "step": 5170 }, { "epoch": 0.8665830196570473, "grad_norm": 0.04318336024880409, "learning_rate": 1.3961989364846533e-05, "loss": 0.0015, "step": 5180 }, { "epoch": 0.8682559598494354, "grad_norm": 0.037916697561740875, "learning_rate": 1.3935159909964901e-05, "loss": 0.0019, "step": 5190 }, { "epoch": 0.8699289000418235, "grad_norm": 0.04094701260328293, "learning_rate": 1.3908296896876778e-05, "loss": 0.0021, "step": 5200 }, { "epoch": 0.8716018402342116, "grad_norm": 0.0430636890232563, "learning_rate": 1.3881400554664229e-05, "loss": 0.0017, "step": 5210 }, { "epoch": 0.8732747804265998, "grad_norm": 0.036845896393060684, "learning_rate": 1.3854471112693536e-05, "loss": 0.0018, "step": 5220 }, { "epoch": 0.8749477206189878, "grad_norm": 0.03569072112441063, "learning_rate": 1.382750880061325e-05, "loss": 0.0019, "step": 5230 }, { "epoch": 0.876620660811376, "grad_norm": 0.02674141153693199, "learning_rate": 1.380051384835223e-05, "loss": 0.0018, "step": 5240 }, { "epoch": 0.8782936010037641, "grad_norm": 0.04462336748838425, "learning_rate": 1.3773486486117692e-05, "loss": 0.0019, "step": 5250 }, { "epoch": 0.8799665411961523, "grad_norm": 0.02786438912153244, "learning_rate": 1.3746426944393225e-05, "loss": 0.0017, "step": 5260 }, { "epoch": 0.8816394813885403, "grad_norm": 0.0349322184920311, "learning_rate": 1.3719335453936845e-05, "loss": 0.0018, "step": 5270 }, { "epoch": 0.8833124215809285, "grad_norm": 0.026458587497472763, "learning_rate": 1.3692212245779022e-05, "loss": 0.0019, "step": 5280 }, { "epoch": 0.8849853617733167, "grad_norm": 0.03487461432814598, "learning_rate": 1.3665057551220702e-05, "loss": 0.0019, "step": 5290 }, { "epoch": 0.8866583019657047, "grad_norm": 0.03653564304113388, "learning_rate": 1.363787160183134e-05, "loss": 0.0019, "step": 5300 }, { "epoch": 0.8883312421580929, "grad_norm": 0.04243851080536842, "learning_rate": 1.3610654629446938e-05, "loss": 0.0017, "step": 5310 }, { "epoch": 0.890004182350481, "grad_norm": 0.02937169373035431, "learning_rate": 1.3583406866168036e-05, "loss": 0.0019, "step": 5320 }, { "epoch": 0.8916771225428691, "grad_norm": 0.04182581603527069, "learning_rate": 1.3556128544357762e-05, "loss": 0.0018, "step": 5330 }, { "epoch": 0.8933500627352572, "grad_norm": 0.03566766530275345, "learning_rate": 1.352881989663985e-05, "loss": 0.0021, "step": 5340 }, { "epoch": 0.8950230029276454, "grad_norm": 0.05481470003724098, "learning_rate": 1.3501481155896635e-05, "loss": 0.0016, "step": 5350 }, { "epoch": 0.8966959431200334, "grad_norm": 0.04701324924826622, "learning_rate": 1.3474112555267071e-05, "loss": 0.0019, "step": 5360 }, { "epoch": 0.8983688833124216, "grad_norm": 0.03247402235865593, "learning_rate": 1.3446714328144771e-05, "loss": 0.0019, "step": 5370 }, { "epoch": 0.9000418235048097, "grad_norm": 0.027220681309700012, "learning_rate": 1.3419286708175989e-05, "loss": 0.0018, "step": 5380 }, { "epoch": 0.9017147636971978, "grad_norm": 0.02662051096558571, "learning_rate": 1.3391829929257624e-05, "loss": 0.0016, "step": 5390 }, { "epoch": 0.903387703889586, "grad_norm": 0.02885391376912594, "learning_rate": 1.3364344225535255e-05, "loss": 0.0021, "step": 5400 }, { "epoch": 0.9050606440819741, "grad_norm": 0.04084537550806999, "learning_rate": 1.3336829831401108e-05, "loss": 0.0017, "step": 5410 }, { "epoch": 0.9067335842743622, "grad_norm": 0.030064033344388008, "learning_rate": 1.3309286981492084e-05, "loss": 0.0019, "step": 5420 }, { "epoch": 0.9084065244667503, "grad_norm": 0.036757320165634155, "learning_rate": 1.3281715910687749e-05, "loss": 0.0017, "step": 5430 }, { "epoch": 0.9100794646591385, "grad_norm": 0.03228607401251793, "learning_rate": 1.325411685410833e-05, "loss": 0.0017, "step": 5440 }, { "epoch": 0.9117524048515265, "grad_norm": 0.029094552621245384, "learning_rate": 1.3226490047112703e-05, "loss": 0.0014, "step": 5450 }, { "epoch": 0.9134253450439147, "grad_norm": 0.037664901465177536, "learning_rate": 1.3198835725296407e-05, "loss": 0.0018, "step": 5460 }, { "epoch": 0.9150982852363028, "grad_norm": 0.042677655816078186, "learning_rate": 1.3171154124489614e-05, "loss": 0.0019, "step": 5470 }, { "epoch": 0.9167712254286909, "grad_norm": 0.038974203169345856, "learning_rate": 1.3143445480755123e-05, "loss": 0.0022, "step": 5480 }, { "epoch": 0.918444165621079, "grad_norm": 0.058354251086711884, "learning_rate": 1.3115710030386358e-05, "loss": 0.0017, "step": 5490 }, { "epoch": 0.9201171058134672, "grad_norm": 0.03196679800748825, "learning_rate": 1.3087948009905334e-05, "loss": 0.0017, "step": 5500 }, { "epoch": 0.9217900460058553, "grad_norm": 0.034976791590452194, "learning_rate": 1.3060159656060654e-05, "loss": 0.0018, "step": 5510 }, { "epoch": 0.9234629861982434, "grad_norm": 0.037620075047016144, "learning_rate": 1.3032345205825495e-05, "loss": 0.0019, "step": 5520 }, { "epoch": 0.9251359263906316, "grad_norm": 0.02885134518146515, "learning_rate": 1.3004504896395564e-05, "loss": 0.0019, "step": 5530 }, { "epoch": 0.9268088665830196, "grad_norm": 0.04703853279352188, "learning_rate": 1.2976638965187095e-05, "loss": 0.0022, "step": 5540 }, { "epoch": 0.9284818067754078, "grad_norm": 0.03883330523967743, "learning_rate": 1.294874764983483e-05, "loss": 0.0018, "step": 5550 }, { "epoch": 0.9301547469677959, "grad_norm": 0.0379517637193203, "learning_rate": 1.2920831188189967e-05, "loss": 0.0018, "step": 5560 }, { "epoch": 0.931827687160184, "grad_norm": 0.04439307376742363, "learning_rate": 1.289288981831815e-05, "loss": 0.0019, "step": 5570 }, { "epoch": 0.9335006273525721, "grad_norm": 0.04029279202222824, "learning_rate": 1.2864923778497439e-05, "loss": 0.0017, "step": 5580 }, { "epoch": 0.9351735675449603, "grad_norm": 0.037751927971839905, "learning_rate": 1.2836933307216268e-05, "loss": 0.0017, "step": 5590 }, { "epoch": 0.9368465077373483, "grad_norm": 0.03357413783669472, "learning_rate": 1.2808918643171424e-05, "loss": 0.0019, "step": 5600 }, { "epoch": 0.9385194479297365, "grad_norm": 0.030700083822011948, "learning_rate": 1.2780880025266007e-05, "loss": 0.0018, "step": 5610 }, { "epoch": 0.9401923881221247, "grad_norm": 0.025235312059521675, "learning_rate": 1.2752817692607373e-05, "loss": 0.0017, "step": 5620 }, { "epoch": 0.9418653283145128, "grad_norm": 0.026510193943977356, "learning_rate": 1.2724731884505134e-05, "loss": 0.0017, "step": 5630 }, { "epoch": 0.9435382685069009, "grad_norm": 0.03207218274474144, "learning_rate": 1.2696622840469084e-05, "loss": 0.0018, "step": 5640 }, { "epoch": 0.945211208699289, "grad_norm": 0.042601097375154495, "learning_rate": 1.2668490800207169e-05, "loss": 0.0017, "step": 5650 }, { "epoch": 0.9468841488916772, "grad_norm": 0.0320468544960022, "learning_rate": 1.2640336003623443e-05, "loss": 0.0016, "step": 5660 }, { "epoch": 0.9485570890840652, "grad_norm": 0.030902091413736343, "learning_rate": 1.2612158690816024e-05, "loss": 0.0021, "step": 5670 }, { "epoch": 0.9502300292764534, "grad_norm": 0.031640663743019104, "learning_rate": 1.2583959102075035e-05, "loss": 0.0019, "step": 5680 }, { "epoch": 0.9519029694688415, "grad_norm": 0.05038362741470337, "learning_rate": 1.2555737477880577e-05, "loss": 0.0015, "step": 5690 }, { "epoch": 0.9535759096612296, "grad_norm": 0.017514105886220932, "learning_rate": 1.2527494058900649e-05, "loss": 0.0016, "step": 5700 }, { "epoch": 0.9552488498536177, "grad_norm": 0.03043646179139614, "learning_rate": 1.2499229085989124e-05, "loss": 0.0018, "step": 5710 }, { "epoch": 0.9569217900460059, "grad_norm": 0.026818247511982918, "learning_rate": 1.2470942800183676e-05, "loss": 0.0015, "step": 5720 }, { "epoch": 0.958594730238394, "grad_norm": 0.025088932365179062, "learning_rate": 1.2442635442703734e-05, "loss": 0.0019, "step": 5730 }, { "epoch": 0.9602676704307821, "grad_norm": 0.0770934596657753, "learning_rate": 1.2414307254948416e-05, "loss": 0.0017, "step": 5740 }, { "epoch": 0.9619406106231703, "grad_norm": 0.04576810821890831, "learning_rate": 1.2385958478494487e-05, "loss": 0.002, "step": 5750 }, { "epoch": 0.9636135508155583, "grad_norm": 0.0292399600148201, "learning_rate": 1.2357589355094275e-05, "loss": 0.0016, "step": 5760 }, { "epoch": 0.9652864910079465, "grad_norm": 0.0422188900411129, "learning_rate": 1.2329200126673629e-05, "loss": 0.0017, "step": 5770 }, { "epoch": 0.9669594312003346, "grad_norm": 0.03836767002940178, "learning_rate": 1.2300791035329854e-05, "loss": 0.0024, "step": 5780 }, { "epoch": 0.9686323713927227, "grad_norm": 0.03259129449725151, "learning_rate": 1.2272362323329632e-05, "loss": 0.0018, "step": 5790 }, { "epoch": 0.9703053115851108, "grad_norm": 0.03140456974506378, "learning_rate": 1.224391423310697e-05, "loss": 0.0018, "step": 5800 }, { "epoch": 0.971978251777499, "grad_norm": 0.02620626799762249, "learning_rate": 1.2215447007261134e-05, "loss": 0.0015, "step": 5810 }, { "epoch": 0.973651191969887, "grad_norm": 0.029441634193062782, "learning_rate": 1.218696088855457e-05, "loss": 0.0016, "step": 5820 }, { "epoch": 0.9753241321622752, "grad_norm": 0.02403791807591915, "learning_rate": 1.2158456119910826e-05, "loss": 0.0015, "step": 5830 }, { "epoch": 0.9769970723546634, "grad_norm": 0.030648913234472275, "learning_rate": 1.2129932944412518e-05, "loss": 0.0016, "step": 5840 }, { "epoch": 0.9786700125470514, "grad_norm": 0.043718162924051285, "learning_rate": 1.2101391605299215e-05, "loss": 0.0016, "step": 5850 }, { "epoch": 0.9803429527394396, "grad_norm": 0.044535037130117416, "learning_rate": 1.2072832345965381e-05, "loss": 0.0019, "step": 5860 }, { "epoch": 0.9820158929318277, "grad_norm": 0.03934067487716675, "learning_rate": 1.2044255409958305e-05, "loss": 0.0015, "step": 5870 }, { "epoch": 0.9836888331242158, "grad_norm": 0.02708713710308075, "learning_rate": 1.201566104097602e-05, "loss": 0.0016, "step": 5880 }, { "epoch": 0.9853617733166039, "grad_norm": 0.03255411237478256, "learning_rate": 1.1987049482865212e-05, "loss": 0.0016, "step": 5890 }, { "epoch": 0.9870347135089921, "grad_norm": 0.029331672936677933, "learning_rate": 1.1958420979619176e-05, "loss": 0.0016, "step": 5900 }, { "epoch": 0.9887076537013801, "grad_norm": 0.03698103129863739, "learning_rate": 1.1929775775375685e-05, "loss": 0.0019, "step": 5910 }, { "epoch": 0.9903805938937683, "grad_norm": 0.0296563021838665, "learning_rate": 1.190111411441495e-05, "loss": 0.0015, "step": 5920 }, { "epoch": 0.9920535340861564, "grad_norm": 0.037474025040864944, "learning_rate": 1.1872436241157519e-05, "loss": 0.0016, "step": 5930 }, { "epoch": 0.9937264742785445, "grad_norm": 0.04677669703960419, "learning_rate": 1.1843742400162193e-05, "loss": 0.0014, "step": 5940 }, { "epoch": 0.9953994144709327, "grad_norm": 0.029643068090081215, "learning_rate": 1.1815032836123943e-05, "loss": 0.0015, "step": 5950 }, { "epoch": 0.9970723546633208, "grad_norm": 0.032378487288951874, "learning_rate": 1.1786307793871825e-05, "loss": 0.0014, "step": 5960 }, { "epoch": 0.998745294855709, "grad_norm": 0.033199042081832886, "learning_rate": 1.1757567518366883e-05, "loss": 0.0014, "step": 5970 }, { "epoch": 1.0003345880384775, "grad_norm": 0.017692681401968002, "learning_rate": 1.1728812254700074e-05, "loss": 0.0018, "step": 5980 }, { "epoch": 1.0020075282308658, "grad_norm": 0.017270682379603386, "learning_rate": 1.1700042248090175e-05, "loss": 0.0014, "step": 5990 }, { "epoch": 1.0036804684232539, "grad_norm": 0.017059732228517532, "learning_rate": 1.1671257743881675e-05, "loss": 0.0016, "step": 6000 }, { "epoch": 1.005353408615642, "grad_norm": 0.013884035870432854, "learning_rate": 1.16424589875427e-05, "loss": 0.0015, "step": 6010 }, { "epoch": 1.0070263488080302, "grad_norm": 0.018221765756607056, "learning_rate": 1.1613646224662922e-05, "loss": 0.0013, "step": 6020 }, { "epoch": 1.0086992890004183, "grad_norm": 0.028090432286262512, "learning_rate": 1.1584819700951451e-05, "loss": 0.0014, "step": 6030 }, { "epoch": 1.0103722291928063, "grad_norm": 0.024853363633155823, "learning_rate": 1.1555979662234746e-05, "loss": 0.0014, "step": 6040 }, { "epoch": 1.0120451693851944, "grad_norm": 0.017815180122852325, "learning_rate": 1.1527126354454526e-05, "loss": 0.0012, "step": 6050 }, { "epoch": 1.0137181095775827, "grad_norm": 0.025194672867655754, "learning_rate": 1.1498260023665657e-05, "loss": 0.0013, "step": 6060 }, { "epoch": 1.0153910497699707, "grad_norm": 0.013006742112338543, "learning_rate": 1.1469380916034068e-05, "loss": 0.0014, "step": 6070 }, { "epoch": 1.0170639899623588, "grad_norm": 0.02811233513057232, "learning_rate": 1.1440489277834645e-05, "loss": 0.0013, "step": 6080 }, { "epoch": 1.0187369301547469, "grad_norm": 0.08740483224391937, "learning_rate": 1.1411585355449131e-05, "loss": 0.0017, "step": 6090 }, { "epoch": 1.0204098703471352, "grad_norm": 0.020205924287438393, "learning_rate": 1.1382669395364026e-05, "loss": 0.0014, "step": 6100 }, { "epoch": 1.0220828105395232, "grad_norm": 0.018776023760437965, "learning_rate": 1.1353741644168489e-05, "loss": 0.0017, "step": 6110 }, { "epoch": 1.0237557507319113, "grad_norm": 0.02919163554906845, "learning_rate": 1.1324802348552224e-05, "loss": 0.0017, "step": 6120 }, { "epoch": 1.0254286909242996, "grad_norm": 0.0847286656498909, "learning_rate": 1.1295851755303388e-05, "loss": 0.002, "step": 6130 }, { "epoch": 1.0271016311166876, "grad_norm": 0.024996411055326462, "learning_rate": 1.1266890111306484e-05, "loss": 0.0014, "step": 6140 }, { "epoch": 1.0287745713090757, "grad_norm": 0.04961877316236496, "learning_rate": 1.1237917663540243e-05, "loss": 0.0013, "step": 6150 }, { "epoch": 1.0304475115014637, "grad_norm": 0.049556002020835876, "learning_rate": 1.1208934659075543e-05, "loss": 0.0016, "step": 6160 }, { "epoch": 1.032120451693852, "grad_norm": 0.016331976279616356, "learning_rate": 1.1179941345073278e-05, "loss": 0.0014, "step": 6170 }, { "epoch": 1.03379339188624, "grad_norm": 0.024324314668774605, "learning_rate": 1.1150937968782256e-05, "loss": 0.0014, "step": 6180 }, { "epoch": 1.0354663320786281, "grad_norm": 0.020227763801813126, "learning_rate": 1.1121924777537108e-05, "loss": 0.0014, "step": 6190 }, { "epoch": 1.0371392722710162, "grad_norm": 0.031031692400574684, "learning_rate": 1.1092902018756151e-05, "loss": 0.0019, "step": 6200 }, { "epoch": 1.0388122124634045, "grad_norm": 0.020849648863077164, "learning_rate": 1.1063869939939296e-05, "loss": 0.0016, "step": 6210 }, { "epoch": 1.0404851526557926, "grad_norm": 0.023178936913609505, "learning_rate": 1.1034828788665936e-05, "loss": 0.0013, "step": 6220 }, { "epoch": 1.0421580928481806, "grad_norm": 0.019570866599678993, "learning_rate": 1.1005778812592834e-05, "loss": 0.0013, "step": 6230 }, { "epoch": 1.043831033040569, "grad_norm": 0.016351137310266495, "learning_rate": 1.0976720259452e-05, "loss": 0.0013, "step": 6240 }, { "epoch": 1.045503973232957, "grad_norm": 0.01905253529548645, "learning_rate": 1.0947653377048597e-05, "loss": 0.0015, "step": 6250 }, { "epoch": 1.047176913425345, "grad_norm": 0.022501403465867043, "learning_rate": 1.0918578413258812e-05, "loss": 0.0016, "step": 6260 }, { "epoch": 1.048849853617733, "grad_norm": 0.02250710316002369, "learning_rate": 1.0889495616027748e-05, "loss": 0.0017, "step": 6270 }, { "epoch": 1.0505227938101214, "grad_norm": 0.048828549683094025, "learning_rate": 1.0860405233367317e-05, "loss": 0.0011, "step": 6280 }, { "epoch": 1.0521957340025094, "grad_norm": 0.02636878937482834, "learning_rate": 1.0831307513354113e-05, "loss": 0.0015, "step": 6290 }, { "epoch": 1.0538686741948975, "grad_norm": 0.016637520864605904, "learning_rate": 1.0802202704127293e-05, "loss": 0.0014, "step": 6300 }, { "epoch": 1.0555416143872856, "grad_norm": 0.01669999770820141, "learning_rate": 1.0773091053886488e-05, "loss": 0.0012, "step": 6310 }, { "epoch": 1.0572145545796738, "grad_norm": 0.026662765070796013, "learning_rate": 1.0743972810889656e-05, "loss": 0.0013, "step": 6320 }, { "epoch": 1.058887494772062, "grad_norm": 0.014320231042802334, "learning_rate": 1.0714848223450975e-05, "loss": 0.0013, "step": 6330 }, { "epoch": 1.06056043496445, "grad_norm": 0.011061044409871101, "learning_rate": 1.0685717539938733e-05, "loss": 0.0014, "step": 6340 }, { "epoch": 1.0622333751568382, "grad_norm": 0.02683224156498909, "learning_rate": 1.06565810087732e-05, "loss": 0.0014, "step": 6350 }, { "epoch": 1.0639063153492263, "grad_norm": 0.02099456638097763, "learning_rate": 1.0627438878424512e-05, "loss": 0.0014, "step": 6360 }, { "epoch": 1.0655792555416144, "grad_norm": 0.03347095474600792, "learning_rate": 1.0598291397410563e-05, "loss": 0.0017, "step": 6370 }, { "epoch": 1.0672521957340024, "grad_norm": 0.019640810787677765, "learning_rate": 1.0569138814294864e-05, "loss": 0.0013, "step": 6380 }, { "epoch": 1.0689251359263907, "grad_norm": 0.03604935482144356, "learning_rate": 1.0539981377684443e-05, "loss": 0.0015, "step": 6390 }, { "epoch": 1.0705980761187788, "grad_norm": 0.016895698383450508, "learning_rate": 1.0510819336227717e-05, "loss": 0.0013, "step": 6400 }, { "epoch": 1.0722710163111668, "grad_norm": 0.016278522089123726, "learning_rate": 1.0481652938612374e-05, "loss": 0.0016, "step": 6410 }, { "epoch": 1.073943956503555, "grad_norm": 0.01755623146891594, "learning_rate": 1.0452482433563241e-05, "loss": 0.0015, "step": 6420 }, { "epoch": 1.0756168966959432, "grad_norm": 0.026648730039596558, "learning_rate": 1.0423308069840183e-05, "loss": 0.0016, "step": 6430 }, { "epoch": 1.0772898368883312, "grad_norm": 0.016650011762976646, "learning_rate": 1.0394130096235966e-05, "loss": 0.0013, "step": 6440 }, { "epoch": 1.0789627770807193, "grad_norm": 0.024689314886927605, "learning_rate": 1.0364948761574142e-05, "loss": 0.0012, "step": 6450 }, { "epoch": 1.0806357172731076, "grad_norm": 0.019947919994592667, "learning_rate": 1.0335764314706925e-05, "loss": 0.0013, "step": 6460 }, { "epoch": 1.0823086574654956, "grad_norm": 0.02537013776600361, "learning_rate": 1.0306577004513065e-05, "loss": 0.0013, "step": 6470 }, { "epoch": 1.0839815976578837, "grad_norm": 0.019749384373426437, "learning_rate": 1.0277387079895736e-05, "loss": 0.0016, "step": 6480 }, { "epoch": 1.0856545378502718, "grad_norm": 0.0292816199362278, "learning_rate": 1.0248194789780406e-05, "loss": 0.0015, "step": 6490 }, { "epoch": 1.08732747804266, "grad_norm": 0.027332182973623276, "learning_rate": 1.0219000383112714e-05, "loss": 0.0016, "step": 6500 }, { "epoch": 1.0890004182350481, "grad_norm": 0.022406267002224922, "learning_rate": 1.0189804108856347e-05, "loss": 0.0014, "step": 6510 }, { "epoch": 1.0906733584274362, "grad_norm": 0.01899942196905613, "learning_rate": 1.0160606215990922e-05, "loss": 0.0012, "step": 6520 }, { "epoch": 1.0923462986198242, "grad_norm": 0.016422558575868607, "learning_rate": 1.0131406953509856e-05, "loss": 0.0013, "step": 6530 }, { "epoch": 1.0940192388122125, "grad_norm": 0.0951632708311081, "learning_rate": 1.010220657041825e-05, "loss": 0.0014, "step": 6540 }, { "epoch": 1.0956921790046006, "grad_norm": 0.02774394303560257, "learning_rate": 1.0073005315730757e-05, "loss": 0.0012, "step": 6550 }, { "epoch": 1.0973651191969886, "grad_norm": 0.02383301593363285, "learning_rate": 1.0043803438469462e-05, "loss": 0.0014, "step": 6560 }, { "epoch": 1.099038059389377, "grad_norm": 0.02542874775826931, "learning_rate": 1.001460118766176e-05, "loss": 0.0018, "step": 6570 }, { "epoch": 1.100710999581765, "grad_norm": 0.037845779210329056, "learning_rate": 9.98539881233824e-06, "loss": 0.0014, "step": 6580 }, { "epoch": 1.102383939774153, "grad_norm": 0.04525664076209068, "learning_rate": 9.956196561530542e-06, "loss": 0.0011, "step": 6590 }, { "epoch": 1.1040568799665411, "grad_norm": 0.030203547328710556, "learning_rate": 9.92699468426925e-06, "loss": 0.0011, "step": 6600 }, { "epoch": 1.1057298201589294, "grad_norm": 0.011778382584452629, "learning_rate": 9.89779342958175e-06, "loss": 0.0017, "step": 6610 }, { "epoch": 1.1074027603513175, "grad_norm": 0.011243265122175217, "learning_rate": 9.868593046490145e-06, "loss": 0.0015, "step": 6620 }, { "epoch": 1.1090757005437055, "grad_norm": 0.020497525110840797, "learning_rate": 9.839393784009078e-06, "loss": 0.0015, "step": 6630 }, { "epoch": 1.1107486407360936, "grad_norm": 0.022749830037355423, "learning_rate": 9.810195891143656e-06, "loss": 0.0013, "step": 6640 }, { "epoch": 1.1124215809284819, "grad_norm": 0.030486248433589935, "learning_rate": 9.78099961688729e-06, "loss": 0.0012, "step": 6650 }, { "epoch": 1.11409452112087, "grad_norm": 0.01755516044795513, "learning_rate": 9.751805210219595e-06, "loss": 0.0013, "step": 6660 }, { "epoch": 1.115767461313258, "grad_norm": 0.027382057160139084, "learning_rate": 9.722612920104267e-06, "loss": 0.0017, "step": 6670 }, { "epoch": 1.1174404015056463, "grad_norm": 0.019617881625890732, "learning_rate": 9.693422995486938e-06, "loss": 0.0015, "step": 6680 }, { "epoch": 1.1191133416980343, "grad_norm": 0.02035129815340042, "learning_rate": 9.664235685293079e-06, "loss": 0.0013, "step": 6690 }, { "epoch": 1.1207862818904224, "grad_norm": 0.029037421569228172, "learning_rate": 9.635051238425863e-06, "loss": 0.0015, "step": 6700 }, { "epoch": 1.1224592220828105, "grad_norm": 0.020131433382630348, "learning_rate": 9.605869903764037e-06, "loss": 0.0018, "step": 6710 }, { "epoch": 1.1241321622751987, "grad_norm": 0.03933103010058403, "learning_rate": 9.57669193015982e-06, "loss": 0.0019, "step": 6720 }, { "epoch": 1.1258051024675868, "grad_norm": 0.020689697936177254, "learning_rate": 9.547517566436764e-06, "loss": 0.0014, "step": 6730 }, { "epoch": 1.1274780426599749, "grad_norm": 0.024334974586963654, "learning_rate": 9.518347061387629e-06, "loss": 0.0013, "step": 6740 }, { "epoch": 1.1291509828523631, "grad_norm": 0.011828083544969559, "learning_rate": 9.489180663772284e-06, "loss": 0.0015, "step": 6750 }, { "epoch": 1.1308239230447512, "grad_norm": 0.01996997743844986, "learning_rate": 9.460018622315557e-06, "loss": 0.0014, "step": 6760 }, { "epoch": 1.1324968632371393, "grad_norm": 0.02487107180058956, "learning_rate": 9.430861185705138e-06, "loss": 0.0016, "step": 6770 }, { "epoch": 1.1341698034295273, "grad_norm": 0.029872538521885872, "learning_rate": 9.401708602589442e-06, "loss": 0.0015, "step": 6780 }, { "epoch": 1.1358427436219154, "grad_norm": 0.025728629902005196, "learning_rate": 9.37256112157549e-06, "loss": 0.0016, "step": 6790 }, { "epoch": 1.1375156838143037, "grad_norm": 0.024806654080748558, "learning_rate": 9.343418991226803e-06, "loss": 0.0014, "step": 6800 }, { "epoch": 1.1391886240066917, "grad_norm": 0.022717388346791267, "learning_rate": 9.314282460061272e-06, "loss": 0.0015, "step": 6810 }, { "epoch": 1.1408615641990798, "grad_norm": 0.0174353439360857, "learning_rate": 9.285151776549026e-06, "loss": 0.0013, "step": 6820 }, { "epoch": 1.142534504391468, "grad_norm": 0.016155697405338287, "learning_rate": 9.256027189110346e-06, "loss": 0.0013, "step": 6830 }, { "epoch": 1.1442074445838561, "grad_norm": 0.026408901438117027, "learning_rate": 9.226908946113511e-06, "loss": 0.0012, "step": 6840 }, { "epoch": 1.1458803847762442, "grad_norm": 0.01538822427392006, "learning_rate": 9.197797295872709e-06, "loss": 0.0015, "step": 6850 }, { "epoch": 1.1475533249686323, "grad_norm": 0.019588863477110863, "learning_rate": 9.168692486645894e-06, "loss": 0.0018, "step": 6860 }, { "epoch": 1.1492262651610206, "grad_norm": 0.03321785107254982, "learning_rate": 9.139594766632685e-06, "loss": 0.0024, "step": 6870 }, { "epoch": 1.1508992053534086, "grad_norm": 0.02110890857875347, "learning_rate": 9.110504383972256e-06, "loss": 0.0013, "step": 6880 }, { "epoch": 1.1525721455457967, "grad_norm": 0.015745697543025017, "learning_rate": 9.08142158674119e-06, "loss": 0.0012, "step": 6890 }, { "epoch": 1.154245085738185, "grad_norm": 0.021603936329483986, "learning_rate": 9.052346622951407e-06, "loss": 0.0015, "step": 6900 }, { "epoch": 1.155918025930573, "grad_norm": 0.019249098375439644, "learning_rate": 9.023279740548002e-06, "loss": 0.0012, "step": 6910 }, { "epoch": 1.157590966122961, "grad_norm": 0.02481662854552269, "learning_rate": 8.994221187407168e-06, "loss": 0.0014, "step": 6920 }, { "epoch": 1.1592639063153491, "grad_norm": 0.07811404019594193, "learning_rate": 8.965171211334066e-06, "loss": 0.0018, "step": 6930 }, { "epoch": 1.1609368465077374, "grad_norm": 0.03448295593261719, "learning_rate": 8.936130060060709e-06, "loss": 0.0016, "step": 6940 }, { "epoch": 1.1626097867001255, "grad_norm": 0.013815466314554214, "learning_rate": 8.907097981243852e-06, "loss": 0.0013, "step": 6950 }, { "epoch": 1.1642827268925136, "grad_norm": 0.051691070199012756, "learning_rate": 8.878075222462896e-06, "loss": 0.0018, "step": 6960 }, { "epoch": 1.1659556670849018, "grad_norm": 0.018571842461824417, "learning_rate": 8.849062031217744e-06, "loss": 0.0015, "step": 6970 }, { "epoch": 1.16762860727729, "grad_norm": 0.017027465626597404, "learning_rate": 8.820058654926726e-06, "loss": 0.0012, "step": 6980 }, { "epoch": 1.169301547469678, "grad_norm": 0.016351548954844475, "learning_rate": 8.791065340924462e-06, "loss": 0.0013, "step": 6990 }, { "epoch": 1.170974487662066, "grad_norm": 0.021529076620936394, "learning_rate": 8.762082336459758e-06, "loss": 0.0015, "step": 7000 } ], "logging_steps": 10, "max_steps": 11954, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.6670376742559416e+19, "train_batch_size": 12, "trial_name": null, "trial_params": null }