{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.99986375403061, "eval_steps": 500, "global_step": 33027, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009083064626004814, "grad_norm": 3.338594913482666, "learning_rate": 3.027550711474417e-07, "loss": 2.0348, "step": 10 }, { "epoch": 0.0018166129252009627, "grad_norm": 3.300436019897461, "learning_rate": 6.055101422948834e-07, "loss": 1.7787, "step": 20 }, { "epoch": 0.0027249193878014443, "grad_norm": 3.069028854370117, "learning_rate": 9.082652134423252e-07, "loss": 2.189, "step": 30 }, { "epoch": 0.0036332258504019254, "grad_norm": 2.9285471439361572, "learning_rate": 1.2110202845897668e-06, "loss": 2.061, "step": 40 }, { "epoch": 0.004541532313002407, "grad_norm": 5.388884544372559, "learning_rate": 1.5137753557372086e-06, "loss": 1.9378, "step": 50 }, { "epoch": 0.005449838775602889, "grad_norm": 2.258514165878296, "learning_rate": 1.8165304268846505e-06, "loss": 1.9294, "step": 60 }, { "epoch": 0.00635814523820337, "grad_norm": 5.507309436798096, "learning_rate": 2.119285498032092e-06, "loss": 2.1704, "step": 70 }, { "epoch": 0.007266451700803851, "grad_norm": 3.9765431880950928, "learning_rate": 2.4220405691795337e-06, "loss": 1.7043, "step": 80 }, { "epoch": 0.008174758163404332, "grad_norm": 3.954787254333496, "learning_rate": 2.7247956403269755e-06, "loss": 1.8261, "step": 90 }, { "epoch": 0.009083064626004814, "grad_norm": 3.983407974243164, "learning_rate": 3.0275507114744173e-06, "loss": 1.9785, "step": 100 }, { "epoch": 0.009991371088605296, "grad_norm": 4.271358489990234, "learning_rate": 3.330305782621859e-06, "loss": 1.6443, "step": 110 }, { "epoch": 0.010899677551205777, "grad_norm": 3.3456268310546875, "learning_rate": 3.633060853769301e-06, "loss": 1.7081, "step": 120 }, { "epoch": 0.011807984013806259, "grad_norm": 5.413731575012207, "learning_rate": 3.935815924916743e-06, "loss": 1.4213, "step": 130 }, { "epoch": 0.01271629047640674, "grad_norm": 2.4782180786132812, "learning_rate": 4.238570996064184e-06, "loss": 1.4716, "step": 140 }, { "epoch": 0.01362459693900722, "grad_norm": 2.1670308113098145, "learning_rate": 4.541326067211626e-06, "loss": 1.2754, "step": 150 }, { "epoch": 0.014532903401607702, "grad_norm": 2.0522806644439697, "learning_rate": 4.844081138359067e-06, "loss": 1.336, "step": 160 }, { "epoch": 0.015441209864208183, "grad_norm": 2.745317220687866, "learning_rate": 5.1468362095065096e-06, "loss": 1.0416, "step": 170 }, { "epoch": 0.016349516326808665, "grad_norm": 2.3295204639434814, "learning_rate": 5.449591280653951e-06, "loss": 1.0337, "step": 180 }, { "epoch": 0.017257822789409148, "grad_norm": 2.544881582260132, "learning_rate": 5.752346351801393e-06, "loss": 1.0808, "step": 190 }, { "epoch": 0.018166129252009628, "grad_norm": 3.7135374546051025, "learning_rate": 6.055101422948835e-06, "loss": 1.2941, "step": 200 }, { "epoch": 0.019074435714610108, "grad_norm": 2.5924196243286133, "learning_rate": 6.357856494096276e-06, "loss": 0.8495, "step": 210 }, { "epoch": 0.01998274217721059, "grad_norm": 2.905205011367798, "learning_rate": 6.660611565243718e-06, "loss": 0.9497, "step": 220 }, { "epoch": 0.02089104863981107, "grad_norm": 2.4818644523620605, "learning_rate": 6.96336663639116e-06, "loss": 1.1603, "step": 230 }, { "epoch": 0.021799355102411554, "grad_norm": 1.5569452047348022, "learning_rate": 7.266121707538602e-06, "loss": 1.1973, "step": 240 }, { "epoch": 0.022707661565012034, "grad_norm": 2.8538522720336914, "learning_rate": 7.568876778686043e-06, "loss": 1.0413, "step": 250 }, { "epoch": 0.023615968027612518, "grad_norm": 2.4298572540283203, "learning_rate": 7.871631849833486e-06, "loss": 1.1573, "step": 260 }, { "epoch": 0.024524274490212997, "grad_norm": 3.1789722442626953, "learning_rate": 8.174386920980927e-06, "loss": 1.0138, "step": 270 }, { "epoch": 0.02543258095281348, "grad_norm": 2.8161139488220215, "learning_rate": 8.477141992128367e-06, "loss": 0.8869, "step": 280 }, { "epoch": 0.02634088741541396, "grad_norm": 3.413029670715332, "learning_rate": 8.77989706327581e-06, "loss": 1.0096, "step": 290 }, { "epoch": 0.02724919387801444, "grad_norm": 4.742816925048828, "learning_rate": 9.082652134423252e-06, "loss": 1.0573, "step": 300 }, { "epoch": 0.028157500340614924, "grad_norm": 2.580552577972412, "learning_rate": 9.385407205570694e-06, "loss": 0.8935, "step": 310 }, { "epoch": 0.029065806803215403, "grad_norm": 2.899801731109619, "learning_rate": 9.688162276718135e-06, "loss": 0.8834, "step": 320 }, { "epoch": 0.029974113265815887, "grad_norm": 4.4719109535217285, "learning_rate": 9.990917347865577e-06, "loss": 1.038, "step": 330 }, { "epoch": 0.030882419728416367, "grad_norm": 3.345717430114746, "learning_rate": 1.0293672419013019e-05, "loss": 0.7849, "step": 340 }, { "epoch": 0.031790726191016846, "grad_norm": 3.3921029567718506, "learning_rate": 1.0596427490160461e-05, "loss": 0.8869, "step": 350 }, { "epoch": 0.03269903265361733, "grad_norm": 2.750105142593384, "learning_rate": 1.0899182561307902e-05, "loss": 0.7852, "step": 360 }, { "epoch": 0.03360733911621781, "grad_norm": 3.8593287467956543, "learning_rate": 1.1201937632455344e-05, "loss": 1.0161, "step": 370 }, { "epoch": 0.034515645578818296, "grad_norm": 2.9770772457122803, "learning_rate": 1.1504692703602786e-05, "loss": 0.7652, "step": 380 }, { "epoch": 0.03542395204141877, "grad_norm": 3.784400463104248, "learning_rate": 1.1807447774750227e-05, "loss": 1.0225, "step": 390 }, { "epoch": 0.036332258504019256, "grad_norm": 2.728053092956543, "learning_rate": 1.211020284589767e-05, "loss": 0.7553, "step": 400 }, { "epoch": 0.03724056496661974, "grad_norm": 4.710628032684326, "learning_rate": 1.2412957917045111e-05, "loss": 0.7877, "step": 410 }, { "epoch": 0.038148871429220216, "grad_norm": 3.4088709354400635, "learning_rate": 1.2715712988192552e-05, "loss": 0.8358, "step": 420 }, { "epoch": 0.0390571778918207, "grad_norm": 4.734980583190918, "learning_rate": 1.3018468059339994e-05, "loss": 0.7867, "step": 430 }, { "epoch": 0.03996548435442118, "grad_norm": 3.5638625621795654, "learning_rate": 1.3321223130487436e-05, "loss": 0.8306, "step": 440 }, { "epoch": 0.040873790817021666, "grad_norm": 3.2275116443634033, "learning_rate": 1.3623978201634879e-05, "loss": 0.9478, "step": 450 }, { "epoch": 0.04178209727962214, "grad_norm": 4.820329189300537, "learning_rate": 1.392673327278232e-05, "loss": 0.8788, "step": 460 }, { "epoch": 0.042690403742222625, "grad_norm": 6.218160629272461, "learning_rate": 1.4229488343929761e-05, "loss": 0.7633, "step": 470 }, { "epoch": 0.04359871020482311, "grad_norm": 6.764943599700928, "learning_rate": 1.4532243415077204e-05, "loss": 0.6725, "step": 480 }, { "epoch": 0.04450701666742359, "grad_norm": 4.656595706939697, "learning_rate": 1.4834998486224644e-05, "loss": 0.7497, "step": 490 }, { "epoch": 0.04541532313002407, "grad_norm": 4.804786205291748, "learning_rate": 1.5137753557372086e-05, "loss": 0.736, "step": 500 }, { "epoch": 0.04541532313002407, "eval_loss": 0.6253027319908142, "eval_runtime": 1100.1983, "eval_samples_per_second": 8.896, "eval_steps_per_second": 8.896, "step": 500 }, { "epoch": 0.04632362959262455, "grad_norm": 4.544654369354248, "learning_rate": 1.544050862851953e-05, "loss": 0.6512, "step": 510 }, { "epoch": 0.047231936055225035, "grad_norm": 2.6357240676879883, "learning_rate": 1.5743263699666973e-05, "loss": 0.5626, "step": 520 }, { "epoch": 0.04814024251782551, "grad_norm": 6.021533012390137, "learning_rate": 1.604601877081441e-05, "loss": 0.8268, "step": 530 }, { "epoch": 0.049048548980425995, "grad_norm": 3.877450942993164, "learning_rate": 1.6348773841961854e-05, "loss": 0.6177, "step": 540 }, { "epoch": 0.04995685544302648, "grad_norm": 4.196897983551025, "learning_rate": 1.6651528913109298e-05, "loss": 0.6995, "step": 550 }, { "epoch": 0.05086516190562696, "grad_norm": 6.918705463409424, "learning_rate": 1.6954283984256735e-05, "loss": 0.7371, "step": 560 }, { "epoch": 0.05177346836822744, "grad_norm": 3.3960323333740234, "learning_rate": 1.725703905540418e-05, "loss": 0.8573, "step": 570 }, { "epoch": 0.05268177483082792, "grad_norm": 5.59420108795166, "learning_rate": 1.755979412655162e-05, "loss": 0.6923, "step": 580 }, { "epoch": 0.053590081293428404, "grad_norm": 7.072421550750732, "learning_rate": 1.7862549197699063e-05, "loss": 0.6268, "step": 590 }, { "epoch": 0.05449838775602888, "grad_norm": 9.044671058654785, "learning_rate": 1.8165304268846504e-05, "loss": 0.5251, "step": 600 }, { "epoch": 0.055406694218629364, "grad_norm": 5.206499099731445, "learning_rate": 1.8468059339993944e-05, "loss": 0.4026, "step": 610 }, { "epoch": 0.05631500068122985, "grad_norm": 5.055967807769775, "learning_rate": 1.8770814411141388e-05, "loss": 0.5343, "step": 620 }, { "epoch": 0.05722330714383033, "grad_norm": 3.5918002128601074, "learning_rate": 1.9073569482288832e-05, "loss": 0.5754, "step": 630 }, { "epoch": 0.05813161360643081, "grad_norm": 5.792262554168701, "learning_rate": 1.937632455343627e-05, "loss": 0.4651, "step": 640 }, { "epoch": 0.05903992006903129, "grad_norm": 8.631296157836914, "learning_rate": 1.9679079624583713e-05, "loss": 0.6393, "step": 650 }, { "epoch": 0.059948226531631774, "grad_norm": 8.974030494689941, "learning_rate": 1.9981834695731154e-05, "loss": 0.7645, "step": 660 }, { "epoch": 0.06085653299423226, "grad_norm": 6.88407039642334, "learning_rate": 2.0284589766878594e-05, "loss": 0.499, "step": 670 }, { "epoch": 0.06176483945683273, "grad_norm": 3.689136505126953, "learning_rate": 2.0587344838026038e-05, "loss": 0.4638, "step": 680 }, { "epoch": 0.06267314591943322, "grad_norm": 2.9718682765960693, "learning_rate": 2.089009990917348e-05, "loss": 0.4591, "step": 690 }, { "epoch": 0.06358145238203369, "grad_norm": 2.2784411907196045, "learning_rate": 2.1192854980320923e-05, "loss": 0.4373, "step": 700 }, { "epoch": 0.06448975884463418, "grad_norm": 9.02578067779541, "learning_rate": 2.1495610051468363e-05, "loss": 0.3688, "step": 710 }, { "epoch": 0.06539806530723466, "grad_norm": 4.688114643096924, "learning_rate": 2.1798365122615804e-05, "loss": 0.4679, "step": 720 }, { "epoch": 0.06630637176983514, "grad_norm": 9.619386672973633, "learning_rate": 2.2101120193763248e-05, "loss": 0.3789, "step": 730 }, { "epoch": 0.06721467823243563, "grad_norm": 10.355782508850098, "learning_rate": 2.2403875264910688e-05, "loss": 0.5481, "step": 740 }, { "epoch": 0.06812298469503611, "grad_norm": 6.707198619842529, "learning_rate": 2.270663033605813e-05, "loss": 0.3771, "step": 750 }, { "epoch": 0.06903129115763659, "grad_norm": 8.621395111083984, "learning_rate": 2.3009385407205573e-05, "loss": 0.4958, "step": 760 }, { "epoch": 0.06993959762023706, "grad_norm": 3.129951000213623, "learning_rate": 2.3312140478353013e-05, "loss": 0.4799, "step": 770 }, { "epoch": 0.07084790408283755, "grad_norm": 5.378387451171875, "learning_rate": 2.3614895549500454e-05, "loss": 0.4629, "step": 780 }, { "epoch": 0.07175621054543803, "grad_norm": 2.839278221130371, "learning_rate": 2.3917650620647898e-05, "loss": 0.4689, "step": 790 }, { "epoch": 0.07266451700803851, "grad_norm": 7.540127754211426, "learning_rate": 2.422040569179534e-05, "loss": 0.4241, "step": 800 }, { "epoch": 0.073572823470639, "grad_norm": 8.50964069366455, "learning_rate": 2.4523160762942782e-05, "loss": 0.4131, "step": 810 }, { "epoch": 0.07448112993323948, "grad_norm": 3.257770538330078, "learning_rate": 2.4825915834090223e-05, "loss": 0.5464, "step": 820 }, { "epoch": 0.07538943639583996, "grad_norm": 9.130038261413574, "learning_rate": 2.512867090523766e-05, "loss": 0.4016, "step": 830 }, { "epoch": 0.07629774285844043, "grad_norm": 4.686695098876953, "learning_rate": 2.5431425976385104e-05, "loss": 0.3708, "step": 840 }, { "epoch": 0.07720604932104091, "grad_norm": 9.499434471130371, "learning_rate": 2.5734181047532548e-05, "loss": 0.3776, "step": 850 }, { "epoch": 0.0781143557836414, "grad_norm": 4.375223636627197, "learning_rate": 2.603693611867999e-05, "loss": 0.3831, "step": 860 }, { "epoch": 0.07902266224624188, "grad_norm": 6.465493202209473, "learning_rate": 2.6339691189827432e-05, "loss": 0.5323, "step": 870 }, { "epoch": 0.07993096870884236, "grad_norm": 3.532640218734741, "learning_rate": 2.6642446260974873e-05, "loss": 0.3452, "step": 880 }, { "epoch": 0.08083927517144285, "grad_norm": 4.774614334106445, "learning_rate": 2.6945201332122317e-05, "loss": 0.3781, "step": 890 }, { "epoch": 0.08174758163404333, "grad_norm": 5.908001899719238, "learning_rate": 2.7247956403269757e-05, "loss": 0.3255, "step": 900 }, { "epoch": 0.0826558880966438, "grad_norm": 7.525854110717773, "learning_rate": 2.7550711474417194e-05, "loss": 0.5784, "step": 910 }, { "epoch": 0.08356419455924428, "grad_norm": 4.886518478393555, "learning_rate": 2.785346654556464e-05, "loss": 0.4912, "step": 920 }, { "epoch": 0.08447250102184477, "grad_norm": 5.826849937438965, "learning_rate": 2.8156221616712082e-05, "loss": 0.3143, "step": 930 }, { "epoch": 0.08538080748444525, "grad_norm": 5.070947647094727, "learning_rate": 2.8458976687859523e-05, "loss": 0.4231, "step": 940 }, { "epoch": 0.08628911394704573, "grad_norm": 7.089649677276611, "learning_rate": 2.8761731759006967e-05, "loss": 0.397, "step": 950 }, { "epoch": 0.08719742040964622, "grad_norm": 4.6128010749816895, "learning_rate": 2.9064486830154407e-05, "loss": 0.411, "step": 960 }, { "epoch": 0.0881057268722467, "grad_norm": 5.077552318572998, "learning_rate": 2.936724190130185e-05, "loss": 0.385, "step": 970 }, { "epoch": 0.08901403333484718, "grad_norm": 10.358783721923828, "learning_rate": 2.966999697244929e-05, "loss": 0.2776, "step": 980 }, { "epoch": 0.08992233979744765, "grad_norm": 5.617259979248047, "learning_rate": 2.997275204359673e-05, "loss": 0.3867, "step": 990 }, { "epoch": 0.09083064626004814, "grad_norm": 5.202264308929443, "learning_rate": 3.0275507114744173e-05, "loss": 0.4861, "step": 1000 }, { "epoch": 0.09083064626004814, "eval_loss": 0.3906579315662384, "eval_runtime": 1101.7131, "eval_samples_per_second": 8.883, "eval_steps_per_second": 8.883, "step": 1000 }, { "epoch": 0.09173895272264862, "grad_norm": 5.670671463012695, "learning_rate": 3.0578262185891613e-05, "loss": 0.4029, "step": 1010 }, { "epoch": 0.0926472591852491, "grad_norm": 7.460565090179443, "learning_rate": 3.088101725703906e-05, "loss": 0.3051, "step": 1020 }, { "epoch": 0.09355556564784959, "grad_norm": 5.6009931564331055, "learning_rate": 3.11837723281865e-05, "loss": 0.3726, "step": 1030 }, { "epoch": 0.09446387211045007, "grad_norm": 10.718893051147461, "learning_rate": 3.1486527399333945e-05, "loss": 0.3665, "step": 1040 }, { "epoch": 0.09537217857305055, "grad_norm": 4.965304851531982, "learning_rate": 3.178928247048138e-05, "loss": 0.4164, "step": 1050 }, { "epoch": 0.09628048503565102, "grad_norm": 6.058121681213379, "learning_rate": 3.209203754162882e-05, "loss": 0.436, "step": 1060 }, { "epoch": 0.0971887914982515, "grad_norm": 4.10232400894165, "learning_rate": 3.2394792612776263e-05, "loss": 0.4748, "step": 1070 }, { "epoch": 0.09809709796085199, "grad_norm": 4.127154350280762, "learning_rate": 3.269754768392371e-05, "loss": 0.3472, "step": 1080 }, { "epoch": 0.09900540442345247, "grad_norm": 3.687777519226074, "learning_rate": 3.300030275507115e-05, "loss": 0.3024, "step": 1090 }, { "epoch": 0.09991371088605296, "grad_norm": 3.852682113647461, "learning_rate": 3.3303057826218595e-05, "loss": 0.3624, "step": 1100 }, { "epoch": 0.10082201734865344, "grad_norm": 3.8656630516052246, "learning_rate": 3.360581289736603e-05, "loss": 0.4335, "step": 1110 }, { "epoch": 0.10173032381125392, "grad_norm": 6.087235927581787, "learning_rate": 3.390856796851347e-05, "loss": 0.4863, "step": 1120 }, { "epoch": 0.10263863027385439, "grad_norm": 4.87807035446167, "learning_rate": 3.4211323039660914e-05, "loss": 0.3193, "step": 1130 }, { "epoch": 0.10354693673645488, "grad_norm": 2.795630931854248, "learning_rate": 3.451407811080836e-05, "loss": 0.3322, "step": 1140 }, { "epoch": 0.10445524319905536, "grad_norm": 3.009498119354248, "learning_rate": 3.48168331819558e-05, "loss": 0.3875, "step": 1150 }, { "epoch": 0.10536354966165584, "grad_norm": 4.550341606140137, "learning_rate": 3.511958825310324e-05, "loss": 0.3504, "step": 1160 }, { "epoch": 0.10627185612425633, "grad_norm": 3.3924739360809326, "learning_rate": 3.542234332425068e-05, "loss": 0.5231, "step": 1170 }, { "epoch": 0.10718016258685681, "grad_norm": 4.267087459564209, "learning_rate": 3.5725098395398126e-05, "loss": 0.3416, "step": 1180 }, { "epoch": 0.10808846904945729, "grad_norm": 3.5114586353302, "learning_rate": 3.6027853466545564e-05, "loss": 0.4291, "step": 1190 }, { "epoch": 0.10899677551205776, "grad_norm": 7.757761478424072, "learning_rate": 3.633060853769301e-05, "loss": 0.3437, "step": 1200 }, { "epoch": 0.10990508197465824, "grad_norm": 2.9646482467651367, "learning_rate": 3.663336360884045e-05, "loss": 0.3831, "step": 1210 }, { "epoch": 0.11081338843725873, "grad_norm": 2.7294435501098633, "learning_rate": 3.693611867998789e-05, "loss": 0.3572, "step": 1220 }, { "epoch": 0.11172169489985921, "grad_norm": 2.723872184753418, "learning_rate": 3.723887375113533e-05, "loss": 0.2803, "step": 1230 }, { "epoch": 0.1126300013624597, "grad_norm": 2.8666157722473145, "learning_rate": 3.7541628822282776e-05, "loss": 0.3511, "step": 1240 }, { "epoch": 0.11353830782506018, "grad_norm": 8.346423149108887, "learning_rate": 3.784438389343022e-05, "loss": 0.3304, "step": 1250 }, { "epoch": 0.11444661428766066, "grad_norm": 5.239733695983887, "learning_rate": 3.8147138964577664e-05, "loss": 0.3271, "step": 1260 }, { "epoch": 0.11535492075026114, "grad_norm": 8.23116397857666, "learning_rate": 3.84498940357251e-05, "loss": 0.3625, "step": 1270 }, { "epoch": 0.11626322721286161, "grad_norm": 2.809619426727295, "learning_rate": 3.875264910687254e-05, "loss": 0.3142, "step": 1280 }, { "epoch": 0.1171715336754621, "grad_norm": 2.8086495399475098, "learning_rate": 3.905540417801998e-05, "loss": 0.4077, "step": 1290 }, { "epoch": 0.11807984013806258, "grad_norm": 2.7639265060424805, "learning_rate": 3.9358159249167426e-05, "loss": 0.4292, "step": 1300 }, { "epoch": 0.11898814660066306, "grad_norm": 6.900950908660889, "learning_rate": 3.966091432031487e-05, "loss": 0.4495, "step": 1310 }, { "epoch": 0.11989645306326355, "grad_norm": 1.8697035312652588, "learning_rate": 3.996366939146231e-05, "loss": 0.3122, "step": 1320 }, { "epoch": 0.12080475952586403, "grad_norm": 7.288318634033203, "learning_rate": 4.026642446260975e-05, "loss": 0.4207, "step": 1330 }, { "epoch": 0.12171306598846451, "grad_norm": 3.6624181270599365, "learning_rate": 4.056917953375719e-05, "loss": 0.3835, "step": 1340 }, { "epoch": 0.12262137245106498, "grad_norm": 3.2489120960235596, "learning_rate": 4.087193460490463e-05, "loss": 0.336, "step": 1350 }, { "epoch": 0.12352967891366547, "grad_norm": 4.533010005950928, "learning_rate": 4.1174689676052077e-05, "loss": 0.2819, "step": 1360 }, { "epoch": 0.12443798537626595, "grad_norm": 6.301357746124268, "learning_rate": 4.147744474719952e-05, "loss": 0.3306, "step": 1370 }, { "epoch": 0.12534629183886645, "grad_norm": 3.648895263671875, "learning_rate": 4.178019981834696e-05, "loss": 0.4164, "step": 1380 }, { "epoch": 0.1262545983014669, "grad_norm": 3.5831856727600098, "learning_rate": 4.20829548894944e-05, "loss": 0.349, "step": 1390 }, { "epoch": 0.12716290476406739, "grad_norm": 2.155290365219116, "learning_rate": 4.2385709960641845e-05, "loss": 0.3208, "step": 1400 }, { "epoch": 0.12807121122666787, "grad_norm": 1.8411073684692383, "learning_rate": 4.268846503178928e-05, "loss": 0.2674, "step": 1410 }, { "epoch": 0.12897951768926835, "grad_norm": 3.39276123046875, "learning_rate": 4.2991220102936727e-05, "loss": 0.3717, "step": 1420 }, { "epoch": 0.12988782415186884, "grad_norm": 6.457704544067383, "learning_rate": 4.3293975174084164e-05, "loss": 0.4465, "step": 1430 }, { "epoch": 0.13079613061446932, "grad_norm": 3.7800941467285156, "learning_rate": 4.359673024523161e-05, "loss": 0.2991, "step": 1440 }, { "epoch": 0.1317044370770698, "grad_norm": 7.212883472442627, "learning_rate": 4.389948531637905e-05, "loss": 0.2393, "step": 1450 }, { "epoch": 0.13261274353967029, "grad_norm": 9.63516616821289, "learning_rate": 4.4202240387526495e-05, "loss": 0.4212, "step": 1460 }, { "epoch": 0.13352105000227077, "grad_norm": 3.1892004013061523, "learning_rate": 4.450499545867394e-05, "loss": 0.2943, "step": 1470 }, { "epoch": 0.13442935646487125, "grad_norm": 1.7803316116333008, "learning_rate": 4.4807750529821377e-05, "loss": 0.3232, "step": 1480 }, { "epoch": 0.13533766292747174, "grad_norm": 6.318504333496094, "learning_rate": 4.5110505600968814e-05, "loss": 0.2998, "step": 1490 }, { "epoch": 0.13624596939007222, "grad_norm": 3.7782177925109863, "learning_rate": 4.541326067211626e-05, "loss": 0.4181, "step": 1500 }, { "epoch": 0.13624596939007222, "eval_loss": 0.35027948021888733, "eval_runtime": 1110.5264, "eval_samples_per_second": 8.813, "eval_steps_per_second": 8.813, "step": 1500 }, { "epoch": 0.1371542758526727, "grad_norm": 3.8431742191314697, "learning_rate": 4.57160157432637e-05, "loss": 0.3391, "step": 1510 }, { "epoch": 0.13806258231527319, "grad_norm": 2.8375356197357178, "learning_rate": 4.6018770814411146e-05, "loss": 0.3778, "step": 1520 }, { "epoch": 0.13897088877787364, "grad_norm": 2.583806037902832, "learning_rate": 4.632152588555859e-05, "loss": 0.2592, "step": 1530 }, { "epoch": 0.13987919524047412, "grad_norm": 6.339893817901611, "learning_rate": 4.662428095670603e-05, "loss": 0.3111, "step": 1540 }, { "epoch": 0.1407875017030746, "grad_norm": 5.245959758758545, "learning_rate": 4.692703602785347e-05, "loss": 0.3159, "step": 1550 }, { "epoch": 0.1416958081656751, "grad_norm": 4.119424343109131, "learning_rate": 4.722979109900091e-05, "loss": 0.405, "step": 1560 }, { "epoch": 0.14260411462827557, "grad_norm": 2.2902514934539795, "learning_rate": 4.753254617014835e-05, "loss": 0.341, "step": 1570 }, { "epoch": 0.14351242109087606, "grad_norm": 5.703978538513184, "learning_rate": 4.7835301241295796e-05, "loss": 0.331, "step": 1580 }, { "epoch": 0.14442072755347654, "grad_norm": 3.5796759128570557, "learning_rate": 4.813805631244323e-05, "loss": 0.4025, "step": 1590 }, { "epoch": 0.14532903401607702, "grad_norm": 6.113958358764648, "learning_rate": 4.844081138359068e-05, "loss": 0.2938, "step": 1600 }, { "epoch": 0.1462373404786775, "grad_norm": 7.59998893737793, "learning_rate": 4.874356645473812e-05, "loss": 0.3189, "step": 1610 }, { "epoch": 0.147145646941278, "grad_norm": 4.103620529174805, "learning_rate": 4.9046321525885565e-05, "loss": 0.4239, "step": 1620 }, { "epoch": 0.14805395340387847, "grad_norm": 6.133818626403809, "learning_rate": 4.9349076597033e-05, "loss": 0.3855, "step": 1630 }, { "epoch": 0.14896225986647896, "grad_norm": 3.1838295459747314, "learning_rate": 4.9651831668180446e-05, "loss": 0.4001, "step": 1640 }, { "epoch": 0.14987056632907944, "grad_norm": 2.063427209854126, "learning_rate": 4.995458673932788e-05, "loss": 0.3609, "step": 1650 }, { "epoch": 0.15077887279167992, "grad_norm": 2.0234663486480713, "learning_rate": 5.025734181047532e-05, "loss": 0.2213, "step": 1660 }, { "epoch": 0.1516871792542804, "grad_norm": 3.5450401306152344, "learning_rate": 5.0560096881622764e-05, "loss": 0.3957, "step": 1670 }, { "epoch": 0.15259548571688086, "grad_norm": 4.300881385803223, "learning_rate": 5.086285195277021e-05, "loss": 0.2907, "step": 1680 }, { "epoch": 0.15350379217948135, "grad_norm": 3.1789603233337402, "learning_rate": 5.116560702391765e-05, "loss": 0.4167, "step": 1690 }, { "epoch": 0.15441209864208183, "grad_norm": 6.079739570617676, "learning_rate": 5.1468362095065096e-05, "loss": 0.3793, "step": 1700 }, { "epoch": 0.1553204051046823, "grad_norm": 1.4587153196334839, "learning_rate": 5.177111716621253e-05, "loss": 0.2525, "step": 1710 }, { "epoch": 0.1562287115672828, "grad_norm": 3.1632964611053467, "learning_rate": 5.207387223735998e-05, "loss": 0.3123, "step": 1720 }, { "epoch": 0.15713701802988328, "grad_norm": 3.547654151916504, "learning_rate": 5.237662730850742e-05, "loss": 0.2736, "step": 1730 }, { "epoch": 0.15804532449248376, "grad_norm": 4.630010604858398, "learning_rate": 5.2679382379654865e-05, "loss": 0.2871, "step": 1740 }, { "epoch": 0.15895363095508425, "grad_norm": 4.889133930206299, "learning_rate": 5.29821374508023e-05, "loss": 0.278, "step": 1750 }, { "epoch": 0.15986193741768473, "grad_norm": 3.6746017932891846, "learning_rate": 5.3284892521949746e-05, "loss": 0.3517, "step": 1760 }, { "epoch": 0.1607702438802852, "grad_norm": 7.743109703063965, "learning_rate": 5.358764759309719e-05, "loss": 0.3534, "step": 1770 }, { "epoch": 0.1616785503428857, "grad_norm": 3.4851949214935303, "learning_rate": 5.3890402664244634e-05, "loss": 0.4232, "step": 1780 }, { "epoch": 0.16258685680548618, "grad_norm": 4.265970706939697, "learning_rate": 5.419315773539208e-05, "loss": 0.4902, "step": 1790 }, { "epoch": 0.16349516326808666, "grad_norm": 2.101942300796509, "learning_rate": 5.4495912806539515e-05, "loss": 0.32, "step": 1800 }, { "epoch": 0.16440346973068715, "grad_norm": 4.498259544372559, "learning_rate": 5.479866787768695e-05, "loss": 0.353, "step": 1810 }, { "epoch": 0.1653117761932876, "grad_norm": 4.671932220458984, "learning_rate": 5.510142294883439e-05, "loss": 0.2265, "step": 1820 }, { "epoch": 0.16622008265588809, "grad_norm": 4.008241176605225, "learning_rate": 5.540417801998183e-05, "loss": 0.2063, "step": 1830 }, { "epoch": 0.16712838911848857, "grad_norm": 7.81430196762085, "learning_rate": 5.570693309112928e-05, "loss": 0.3475, "step": 1840 }, { "epoch": 0.16803669558108905, "grad_norm": 2.110200881958008, "learning_rate": 5.600968816227672e-05, "loss": 0.3679, "step": 1850 }, { "epoch": 0.16894500204368953, "grad_norm": 4.210465431213379, "learning_rate": 5.6312443233424165e-05, "loss": 0.2803, "step": 1860 }, { "epoch": 0.16985330850629002, "grad_norm": 4.099610328674316, "learning_rate": 5.66151983045716e-05, "loss": 0.2945, "step": 1870 }, { "epoch": 0.1707616149688905, "grad_norm": 4.1514692306518555, "learning_rate": 5.6917953375719046e-05, "loss": 0.3243, "step": 1880 }, { "epoch": 0.17166992143149098, "grad_norm": 3.7215261459350586, "learning_rate": 5.722070844686649e-05, "loss": 0.2885, "step": 1890 }, { "epoch": 0.17257822789409147, "grad_norm": 4.979272365570068, "learning_rate": 5.7523463518013934e-05, "loss": 0.3249, "step": 1900 }, { "epoch": 0.17348653435669195, "grad_norm": 2.9677441120147705, "learning_rate": 5.782621858916137e-05, "loss": 0.1708, "step": 1910 }, { "epoch": 0.17439484081929243, "grad_norm": 3.196699857711792, "learning_rate": 5.8128973660308815e-05, "loss": 0.4306, "step": 1920 }, { "epoch": 0.17530314728189292, "grad_norm": 5.171126842498779, "learning_rate": 5.843172873145626e-05, "loss": 0.4302, "step": 1930 }, { "epoch": 0.1762114537444934, "grad_norm": 3.0857436656951904, "learning_rate": 5.87344838026037e-05, "loss": 0.2819, "step": 1940 }, { "epoch": 0.17711976020709388, "grad_norm": 8.951590538024902, "learning_rate": 5.903723887375113e-05, "loss": 0.2783, "step": 1950 }, { "epoch": 0.17802806666969437, "grad_norm": 2.1123290061950684, "learning_rate": 5.933999394489858e-05, "loss": 0.3496, "step": 1960 }, { "epoch": 0.17893637313229482, "grad_norm": 6.788313388824463, "learning_rate": 5.964274901604602e-05, "loss": 0.3005, "step": 1970 }, { "epoch": 0.1798446795948953, "grad_norm": 0.5559670329093933, "learning_rate": 5.994550408719346e-05, "loss": 0.3418, "step": 1980 }, { "epoch": 0.1807529860574958, "grad_norm": 5.687252998352051, "learning_rate": 6.02482591583409e-05, "loss": 0.2575, "step": 1990 }, { "epoch": 0.18166129252009627, "grad_norm": 1.0116881132125854, "learning_rate": 6.0551014229488346e-05, "loss": 0.3028, "step": 2000 }, { "epoch": 0.18166129252009627, "eval_loss": 0.33436936140060425, "eval_runtime": 1103.6248, "eval_samples_per_second": 8.868, "eval_steps_per_second": 8.868, "step": 2000 }, { "epoch": 0.18256959898269676, "grad_norm": 8.560587882995605, "learning_rate": 6.085376930063579e-05, "loss": 0.2988, "step": 2010 }, { "epoch": 0.18347790544529724, "grad_norm": 2.860863447189331, "learning_rate": 6.115652437178323e-05, "loss": 0.3376, "step": 2020 }, { "epoch": 0.18438621190789772, "grad_norm": 1.7168954610824585, "learning_rate": 6.145927944293068e-05, "loss": 0.3494, "step": 2030 }, { "epoch": 0.1852945183704982, "grad_norm": 1.7139688730239868, "learning_rate": 6.176203451407811e-05, "loss": 0.3514, "step": 2040 }, { "epoch": 0.1862028248330987, "grad_norm": 2.753044605255127, "learning_rate": 6.206478958522555e-05, "loss": 0.3175, "step": 2050 }, { "epoch": 0.18711113129569917, "grad_norm": 2.4018144607543945, "learning_rate": 6.2367544656373e-05, "loss": 0.3021, "step": 2060 }, { "epoch": 0.18801943775829966, "grad_norm": 2.9217660427093506, "learning_rate": 6.267029972752044e-05, "loss": 0.3569, "step": 2070 }, { "epoch": 0.18892774422090014, "grad_norm": 1.668607234954834, "learning_rate": 6.297305479866789e-05, "loss": 0.3829, "step": 2080 }, { "epoch": 0.18983605068350062, "grad_norm": 4.287295341491699, "learning_rate": 6.327580986981533e-05, "loss": 0.3784, "step": 2090 }, { "epoch": 0.1907443571461011, "grad_norm": 1.7109812498092651, "learning_rate": 6.357856494096276e-05, "loss": 0.2066, "step": 2100 }, { "epoch": 0.19165266360870156, "grad_norm": 4.245151042938232, "learning_rate": 6.38813200121102e-05, "loss": 0.3775, "step": 2110 }, { "epoch": 0.19256097007130205, "grad_norm": 3.4276931285858154, "learning_rate": 6.418407508325764e-05, "loss": 0.3553, "step": 2120 }, { "epoch": 0.19346927653390253, "grad_norm": 1.5890930891036987, "learning_rate": 6.448683015440509e-05, "loss": 0.2415, "step": 2130 }, { "epoch": 0.194377582996503, "grad_norm": 0.9807523488998413, "learning_rate": 6.478958522555253e-05, "loss": 0.223, "step": 2140 }, { "epoch": 0.1952858894591035, "grad_norm": 3.1109156608581543, "learning_rate": 6.509234029669998e-05, "loss": 0.3229, "step": 2150 }, { "epoch": 0.19619419592170398, "grad_norm": 1.1516168117523193, "learning_rate": 6.539509536784741e-05, "loss": 0.379, "step": 2160 }, { "epoch": 0.19710250238430446, "grad_norm": 1.3079661130905151, "learning_rate": 6.569785043899485e-05, "loss": 0.2762, "step": 2170 }, { "epoch": 0.19801080884690495, "grad_norm": 3.513230562210083, "learning_rate": 6.60006055101423e-05, "loss": 0.2875, "step": 2180 }, { "epoch": 0.19891911530950543, "grad_norm": 1.116614818572998, "learning_rate": 6.630336058128974e-05, "loss": 0.257, "step": 2190 }, { "epoch": 0.1998274217721059, "grad_norm": 4.153254985809326, "learning_rate": 6.660611565243719e-05, "loss": 0.3651, "step": 2200 }, { "epoch": 0.2007357282347064, "grad_norm": 5.723878860473633, "learning_rate": 6.690887072358463e-05, "loss": 0.3572, "step": 2210 }, { "epoch": 0.20164403469730688, "grad_norm": 1.8387700319290161, "learning_rate": 6.721162579473206e-05, "loss": 0.3496, "step": 2220 }, { "epoch": 0.20255234115990736, "grad_norm": 3.0848801136016846, "learning_rate": 6.751438086587952e-05, "loss": 0.2295, "step": 2230 }, { "epoch": 0.20346064762250785, "grad_norm": 2.2742481231689453, "learning_rate": 6.781713593702694e-05, "loss": 0.3496, "step": 2240 }, { "epoch": 0.20436895408510833, "grad_norm": 5.9683966636657715, "learning_rate": 6.811989100817439e-05, "loss": 0.3947, "step": 2250 }, { "epoch": 0.20527726054770878, "grad_norm": 3.069723129272461, "learning_rate": 6.842264607932183e-05, "loss": 0.3708, "step": 2260 }, { "epoch": 0.20618556701030927, "grad_norm": 2.689054012298584, "learning_rate": 6.872540115046926e-05, "loss": 0.4205, "step": 2270 }, { "epoch": 0.20709387347290975, "grad_norm": 4.613401889801025, "learning_rate": 6.902815622161671e-05, "loss": 0.3423, "step": 2280 }, { "epoch": 0.20800217993551023, "grad_norm": 1.7353312969207764, "learning_rate": 6.933091129276415e-05, "loss": 0.2756, "step": 2290 }, { "epoch": 0.20891048639811072, "grad_norm": 1.450920581817627, "learning_rate": 6.96336663639116e-05, "loss": 0.3051, "step": 2300 }, { "epoch": 0.2098187928607112, "grad_norm": 3.726011037826538, "learning_rate": 6.993642143505904e-05, "loss": 0.4165, "step": 2310 }, { "epoch": 0.21072709932331168, "grad_norm": 5.8684797286987305, "learning_rate": 7.023917650620648e-05, "loss": 0.2586, "step": 2320 }, { "epoch": 0.21163540578591217, "grad_norm": 3.6498560905456543, "learning_rate": 7.054193157735393e-05, "loss": 0.3039, "step": 2330 }, { "epoch": 0.21254371224851265, "grad_norm": 2.1884047985076904, "learning_rate": 7.084468664850136e-05, "loss": 0.248, "step": 2340 }, { "epoch": 0.21345201871111313, "grad_norm": 4.936244010925293, "learning_rate": 7.114744171964882e-05, "loss": 0.2127, "step": 2350 }, { "epoch": 0.21436032517371362, "grad_norm": 5.229279041290283, "learning_rate": 7.145019679079625e-05, "loss": 0.3108, "step": 2360 }, { "epoch": 0.2152686316363141, "grad_norm": 1.1564161777496338, "learning_rate": 7.175295186194369e-05, "loss": 0.1903, "step": 2370 }, { "epoch": 0.21617693809891458, "grad_norm": 0.7484591603279114, "learning_rate": 7.205570693309113e-05, "loss": 0.2961, "step": 2380 }, { "epoch": 0.21708524456151507, "grad_norm": 3.334428310394287, "learning_rate": 7.235846200423856e-05, "loss": 0.275, "step": 2390 }, { "epoch": 0.21799355102411552, "grad_norm": 4.915581703186035, "learning_rate": 7.266121707538601e-05, "loss": 0.4109, "step": 2400 }, { "epoch": 0.218901857486716, "grad_norm": 0.6211265325546265, "learning_rate": 7.296397214653345e-05, "loss": 0.2372, "step": 2410 }, { "epoch": 0.2198101639493165, "grad_norm": 3.541180372238159, "learning_rate": 7.32667272176809e-05, "loss": 0.3583, "step": 2420 }, { "epoch": 0.22071847041191697, "grad_norm": 2.2823076248168945, "learning_rate": 7.356948228882834e-05, "loss": 0.3384, "step": 2430 }, { "epoch": 0.22162677687451746, "grad_norm": 3.7065205574035645, "learning_rate": 7.387223735997578e-05, "loss": 0.269, "step": 2440 }, { "epoch": 0.22253508333711794, "grad_norm": 3.4792094230651855, "learning_rate": 7.417499243112323e-05, "loss": 0.2912, "step": 2450 }, { "epoch": 0.22344338979971842, "grad_norm": 8.721348762512207, "learning_rate": 7.447774750227067e-05, "loss": 0.2726, "step": 2460 }, { "epoch": 0.2243516962623189, "grad_norm": 2.321892738342285, "learning_rate": 7.478050257341812e-05, "loss": 0.2639, "step": 2470 }, { "epoch": 0.2252600027249194, "grad_norm": 1.896389365196228, "learning_rate": 7.508325764456555e-05, "loss": 0.4415, "step": 2480 }, { "epoch": 0.22616830918751987, "grad_norm": 1.4781962633132935, "learning_rate": 7.538601271571299e-05, "loss": 0.2188, "step": 2490 }, { "epoch": 0.22707661565012036, "grad_norm": 3.1824159622192383, "learning_rate": 7.568876778686044e-05, "loss": 0.3214, "step": 2500 }, { "epoch": 0.22707661565012036, "eval_loss": 0.3015625774860382, "eval_runtime": 1097.3322, "eval_samples_per_second": 8.919, "eval_steps_per_second": 8.919, "step": 2500 }, { "epoch": 0.22798492211272084, "grad_norm": 5.825647354125977, "learning_rate": 7.599152285800788e-05, "loss": 0.243, "step": 2510 }, { "epoch": 0.22889322857532132, "grad_norm": 3.5803263187408447, "learning_rate": 7.629427792915533e-05, "loss": 0.4338, "step": 2520 }, { "epoch": 0.2298015350379218, "grad_norm": 3.526430606842041, "learning_rate": 7.659703300030275e-05, "loss": 0.3041, "step": 2530 }, { "epoch": 0.2307098415005223, "grad_norm": 1.5475349426269531, "learning_rate": 7.68997880714502e-05, "loss": 0.2197, "step": 2540 }, { "epoch": 0.23161814796312274, "grad_norm": 1.0156314373016357, "learning_rate": 7.720254314259764e-05, "loss": 0.2578, "step": 2550 }, { "epoch": 0.23252645442572323, "grad_norm": 0.788859486579895, "learning_rate": 7.750529821374508e-05, "loss": 0.2129, "step": 2560 }, { "epoch": 0.2334347608883237, "grad_norm": 10.085786819458008, "learning_rate": 7.780805328489253e-05, "loss": 0.3049, "step": 2570 }, { "epoch": 0.2343430673509242, "grad_norm": 3.3765556812286377, "learning_rate": 7.811080835603997e-05, "loss": 0.348, "step": 2580 }, { "epoch": 0.23525137381352468, "grad_norm": 6.121445178985596, "learning_rate": 7.84135634271874e-05, "loss": 0.3223, "step": 2590 }, { "epoch": 0.23615968027612516, "grad_norm": 5.216106414794922, "learning_rate": 7.871631849833485e-05, "loss": 0.352, "step": 2600 }, { "epoch": 0.23706798673872564, "grad_norm": 3.919224977493286, "learning_rate": 7.901907356948229e-05, "loss": 0.2632, "step": 2610 }, { "epoch": 0.23797629320132613, "grad_norm": 0.17887432873249054, "learning_rate": 7.932182864062974e-05, "loss": 0.2934, "step": 2620 }, { "epoch": 0.2388845996639266, "grad_norm": 3.7668051719665527, "learning_rate": 7.962458371177718e-05, "loss": 0.3656, "step": 2630 }, { "epoch": 0.2397929061265271, "grad_norm": 2.110879898071289, "learning_rate": 7.992733878292462e-05, "loss": 0.3019, "step": 2640 }, { "epoch": 0.24070121258912758, "grad_norm": 5.825246334075928, "learning_rate": 8.023009385407207e-05, "loss": 0.356, "step": 2650 }, { "epoch": 0.24160951905172806, "grad_norm": 2.2753820419311523, "learning_rate": 8.05328489252195e-05, "loss": 0.2906, "step": 2660 }, { "epoch": 0.24251782551432854, "grad_norm": 4.666349411010742, "learning_rate": 8.083560399636694e-05, "loss": 0.3519, "step": 2670 }, { "epoch": 0.24342613197692903, "grad_norm": 0.09160958975553513, "learning_rate": 8.113835906751438e-05, "loss": 0.2944, "step": 2680 }, { "epoch": 0.2443344384395295, "grad_norm": 6.469869613647461, "learning_rate": 8.144111413866183e-05, "loss": 0.3283, "step": 2690 }, { "epoch": 0.24524274490212997, "grad_norm": 7.446476459503174, "learning_rate": 8.174386920980927e-05, "loss": 0.3447, "step": 2700 }, { "epoch": 0.24615105136473045, "grad_norm": 6.067270755767822, "learning_rate": 8.20466242809567e-05, "loss": 0.3355, "step": 2710 }, { "epoch": 0.24705935782733093, "grad_norm": 7.6380486488342285, "learning_rate": 8.234937935210415e-05, "loss": 0.3478, "step": 2720 }, { "epoch": 0.24796766428993142, "grad_norm": 4.286864757537842, "learning_rate": 8.265213442325159e-05, "loss": 0.3191, "step": 2730 }, { "epoch": 0.2488759707525319, "grad_norm": 3.1768646240234375, "learning_rate": 8.295488949439904e-05, "loss": 0.318, "step": 2740 }, { "epoch": 0.24978427721513238, "grad_norm": 1.57326340675354, "learning_rate": 8.325764456554648e-05, "loss": 0.3255, "step": 2750 }, { "epoch": 0.2506925836777329, "grad_norm": 3.070631742477417, "learning_rate": 8.356039963669392e-05, "loss": 0.2671, "step": 2760 }, { "epoch": 0.2516008901403333, "grad_norm": 2.6445627212524414, "learning_rate": 8.386315470784137e-05, "loss": 0.3714, "step": 2770 }, { "epoch": 0.2525091966029338, "grad_norm": 0.6450175642967224, "learning_rate": 8.41659097789888e-05, "loss": 0.1934, "step": 2780 }, { "epoch": 0.2534175030655343, "grad_norm": 2.299419641494751, "learning_rate": 8.446866485013625e-05, "loss": 0.3637, "step": 2790 }, { "epoch": 0.25432580952813477, "grad_norm": 2.9132511615753174, "learning_rate": 8.477141992128369e-05, "loss": 0.2826, "step": 2800 }, { "epoch": 0.25523411599073526, "grad_norm": 1.469300627708435, "learning_rate": 8.507417499243113e-05, "loss": 0.2916, "step": 2810 }, { "epoch": 0.25614242245333574, "grad_norm": 1.5228191614151, "learning_rate": 8.537693006357857e-05, "loss": 0.2122, "step": 2820 }, { "epoch": 0.2570507289159362, "grad_norm": 1.6211310625076294, "learning_rate": 8.5679685134726e-05, "loss": 0.1918, "step": 2830 }, { "epoch": 0.2579590353785367, "grad_norm": 1.7209854125976562, "learning_rate": 8.598244020587345e-05, "loss": 0.3668, "step": 2840 }, { "epoch": 0.2588673418411372, "grad_norm": 2.655238389968872, "learning_rate": 8.628519527702089e-05, "loss": 0.2345, "step": 2850 }, { "epoch": 0.25977564830373767, "grad_norm": 0.6278648972511292, "learning_rate": 8.658795034816833e-05, "loss": 0.2641, "step": 2860 }, { "epoch": 0.26068395476633816, "grad_norm": 1.4019625186920166, "learning_rate": 8.689070541931578e-05, "loss": 0.1962, "step": 2870 }, { "epoch": 0.26159226122893864, "grad_norm": 3.3027048110961914, "learning_rate": 8.719346049046322e-05, "loss": 0.3615, "step": 2880 }, { "epoch": 0.2625005676915391, "grad_norm": 1.3872700929641724, "learning_rate": 8.749621556161067e-05, "loss": 0.2218, "step": 2890 }, { "epoch": 0.2634088741541396, "grad_norm": 1.7197564840316772, "learning_rate": 8.77989706327581e-05, "loss": 0.4656, "step": 2900 }, { "epoch": 0.2643171806167401, "grad_norm": 1.6212080717086792, "learning_rate": 8.810172570390554e-05, "loss": 0.3432, "step": 2910 }, { "epoch": 0.26522548707934057, "grad_norm": 2.277846097946167, "learning_rate": 8.840448077505299e-05, "loss": 0.3147, "step": 2920 }, { "epoch": 0.26613379354194105, "grad_norm": 1.2557038068771362, "learning_rate": 8.870723584620043e-05, "loss": 0.2799, "step": 2930 }, { "epoch": 0.26704210000454154, "grad_norm": 1.531083345413208, "learning_rate": 8.900999091734788e-05, "loss": 0.2445, "step": 2940 }, { "epoch": 0.267950406467142, "grad_norm": 4.8993659019470215, "learning_rate": 8.931274598849532e-05, "loss": 0.29, "step": 2950 }, { "epoch": 0.2688587129297425, "grad_norm": 0.481231153011322, "learning_rate": 8.961550105964275e-05, "loss": 0.3209, "step": 2960 }, { "epoch": 0.269767019392343, "grad_norm": 3.6143953800201416, "learning_rate": 8.991825613079019e-05, "loss": 0.2892, "step": 2970 }, { "epoch": 0.27067532585494347, "grad_norm": 6.258852481842041, "learning_rate": 9.022101120193763e-05, "loss": 0.3126, "step": 2980 }, { "epoch": 0.27158363231754395, "grad_norm": 2.198021411895752, "learning_rate": 9.052376627308508e-05, "loss": 0.3837, "step": 2990 }, { "epoch": 0.27249193878014444, "grad_norm": 2.3235702514648438, "learning_rate": 9.082652134423252e-05, "loss": 0.303, "step": 3000 }, { "epoch": 0.27249193878014444, "eval_loss": 0.29772886633872986, "eval_runtime": 1100.9904, "eval_samples_per_second": 8.889, "eval_steps_per_second": 8.889, "step": 3000 }, { "epoch": 0.2734002452427449, "grad_norm": 2.203789234161377, "learning_rate": 9.112927641537997e-05, "loss": 0.2456, "step": 3010 }, { "epoch": 0.2743085517053454, "grad_norm": 4.2476806640625, "learning_rate": 9.14320314865274e-05, "loss": 0.2913, "step": 3020 }, { "epoch": 0.2752168581679459, "grad_norm": 2.3045849800109863, "learning_rate": 9.173478655767484e-05, "loss": 0.2695, "step": 3030 }, { "epoch": 0.27612516463054637, "grad_norm": 2.152757167816162, "learning_rate": 9.203754162882229e-05, "loss": 0.3419, "step": 3040 }, { "epoch": 0.27703347109314685, "grad_norm": 1.9420047998428345, "learning_rate": 9.234029669996973e-05, "loss": 0.3086, "step": 3050 }, { "epoch": 0.2779417775557473, "grad_norm": 2.193958282470703, "learning_rate": 9.264305177111718e-05, "loss": 0.2528, "step": 3060 }, { "epoch": 0.27885008401834777, "grad_norm": 3.7951338291168213, "learning_rate": 9.294580684226462e-05, "loss": 0.2541, "step": 3070 }, { "epoch": 0.27975839048094825, "grad_norm": 5.23020601272583, "learning_rate": 9.324856191341205e-05, "loss": 0.1835, "step": 3080 }, { "epoch": 0.28066669694354873, "grad_norm": 3.660047769546509, "learning_rate": 9.35513169845595e-05, "loss": 0.4929, "step": 3090 }, { "epoch": 0.2815750034061492, "grad_norm": 1.9879060983657837, "learning_rate": 9.385407205570694e-05, "loss": 0.2224, "step": 3100 }, { "epoch": 0.2824833098687497, "grad_norm": 1.6106420755386353, "learning_rate": 9.415682712685438e-05, "loss": 0.3835, "step": 3110 }, { "epoch": 0.2833916163313502, "grad_norm": 1.8096975088119507, "learning_rate": 9.445958219800182e-05, "loss": 0.2302, "step": 3120 }, { "epoch": 0.28429992279395067, "grad_norm": 2.0133962631225586, "learning_rate": 9.476233726914927e-05, "loss": 0.2194, "step": 3130 }, { "epoch": 0.28520822925655115, "grad_norm": 3.797724485397339, "learning_rate": 9.50650923402967e-05, "loss": 0.2873, "step": 3140 }, { "epoch": 0.28611653571915163, "grad_norm": 3.9310216903686523, "learning_rate": 9.536784741144414e-05, "loss": 0.2767, "step": 3150 }, { "epoch": 0.2870248421817521, "grad_norm": 2.2894582748413086, "learning_rate": 9.567060248259159e-05, "loss": 0.2854, "step": 3160 }, { "epoch": 0.2879331486443526, "grad_norm": 4.461854934692383, "learning_rate": 9.597335755373903e-05, "loss": 0.3234, "step": 3170 }, { "epoch": 0.2888414551069531, "grad_norm": 2.8144259452819824, "learning_rate": 9.627611262488647e-05, "loss": 0.3457, "step": 3180 }, { "epoch": 0.28974976156955357, "grad_norm": 1.186907410621643, "learning_rate": 9.657886769603392e-05, "loss": 0.2968, "step": 3190 }, { "epoch": 0.29065806803215405, "grad_norm": 2.9434351921081543, "learning_rate": 9.688162276718135e-05, "loss": 0.1845, "step": 3200 }, { "epoch": 0.29156637449475453, "grad_norm": 2.204430103302002, "learning_rate": 9.71843778383288e-05, "loss": 0.2282, "step": 3210 }, { "epoch": 0.292474680957355, "grad_norm": 3.3270576000213623, "learning_rate": 9.748713290947624e-05, "loss": 0.3016, "step": 3220 }, { "epoch": 0.2933829874199555, "grad_norm": 6.7817912101745605, "learning_rate": 9.778988798062368e-05, "loss": 0.2479, "step": 3230 }, { "epoch": 0.294291293882556, "grad_norm": 2.370866298675537, "learning_rate": 9.809264305177113e-05, "loss": 0.331, "step": 3240 }, { "epoch": 0.29519960034515647, "grad_norm": 4.51743745803833, "learning_rate": 9.839539812291855e-05, "loss": 0.3349, "step": 3250 }, { "epoch": 0.29610790680775695, "grad_norm": 2.5231852531433105, "learning_rate": 9.8698153194066e-05, "loss": 0.3447, "step": 3260 }, { "epoch": 0.29701621327035743, "grad_norm": 3.4839766025543213, "learning_rate": 9.900090826521344e-05, "loss": 0.3707, "step": 3270 }, { "epoch": 0.2979245197329579, "grad_norm": 2.039285898208618, "learning_rate": 9.930366333636089e-05, "loss": 0.3509, "step": 3280 }, { "epoch": 0.2988328261955584, "grad_norm": 1.6068166494369507, "learning_rate": 9.960641840750833e-05, "loss": 0.3274, "step": 3290 }, { "epoch": 0.2997411326581589, "grad_norm": 0.9441395998001099, "learning_rate": 9.990917347865577e-05, "loss": 0.2235, "step": 3300 }, { "epoch": 0.30064943912075937, "grad_norm": 2.695178985595703, "learning_rate": 9.999998631574025e-05, "loss": 0.3749, "step": 3310 }, { "epoch": 0.30155774558335985, "grad_norm": 2.3268465995788574, "learning_rate": 9.999991929081261e-05, "loss": 0.3521, "step": 3320 }, { "epoch": 0.30246605204596033, "grad_norm": 1.5726791620254517, "learning_rate": 9.999979641185637e-05, "loss": 0.2681, "step": 3330 }, { "epoch": 0.3033743585085608, "grad_norm": 1.4084157943725586, "learning_rate": 9.99996176790088e-05, "loss": 0.3769, "step": 3340 }, { "epoch": 0.30428266497116124, "grad_norm": 5.386907577514648, "learning_rate": 9.999938309246957e-05, "loss": 0.316, "step": 3350 }, { "epoch": 0.3051909714337617, "grad_norm": 1.4777470827102661, "learning_rate": 9.999909265250072e-05, "loss": 0.2847, "step": 3360 }, { "epoch": 0.3060992778963622, "grad_norm": 0.7825295925140381, "learning_rate": 9.999874635942672e-05, "loss": 0.3056, "step": 3370 }, { "epoch": 0.3070075843589627, "grad_norm": 3.5879859924316406, "learning_rate": 9.999834421363436e-05, "loss": 0.3526, "step": 3380 }, { "epoch": 0.3079158908215632, "grad_norm": 1.4638277292251587, "learning_rate": 9.999788621557293e-05, "loss": 0.2637, "step": 3390 }, { "epoch": 0.30882419728416366, "grad_norm": 0.931563675403595, "learning_rate": 9.999737236575402e-05, "loss": 0.2315, "step": 3400 }, { "epoch": 0.30973250374676414, "grad_norm": 1.7772397994995117, "learning_rate": 9.999680266475162e-05, "loss": 0.2689, "step": 3410 }, { "epoch": 0.3106408102093646, "grad_norm": 3.6969714164733887, "learning_rate": 9.999617711320217e-05, "loss": 0.3863, "step": 3420 }, { "epoch": 0.3115491166719651, "grad_norm": 2.8069283962249756, "learning_rate": 9.999549571180444e-05, "loss": 0.293, "step": 3430 }, { "epoch": 0.3124574231345656, "grad_norm": 1.4433598518371582, "learning_rate": 9.999475846131965e-05, "loss": 0.194, "step": 3440 }, { "epoch": 0.3133657295971661, "grad_norm": 2.5882062911987305, "learning_rate": 9.999396536257131e-05, "loss": 0.425, "step": 3450 }, { "epoch": 0.31427403605976656, "grad_norm": 3.6749954223632812, "learning_rate": 9.999311641644542e-05, "loss": 0.2902, "step": 3460 }, { "epoch": 0.31518234252236704, "grad_norm": 3.1010360717773438, "learning_rate": 9.999221162389031e-05, "loss": 0.2909, "step": 3470 }, { "epoch": 0.3160906489849675, "grad_norm": 1.1431852579116821, "learning_rate": 9.99912509859167e-05, "loss": 0.2991, "step": 3480 }, { "epoch": 0.316998955447568, "grad_norm": 3.0429959297180176, "learning_rate": 9.999023450359773e-05, "loss": 0.2989, "step": 3490 }, { "epoch": 0.3179072619101685, "grad_norm": 1.453582763671875, "learning_rate": 9.998916217806886e-05, "loss": 0.3694, "step": 3500 }, { "epoch": 0.3179072619101685, "eval_loss": 0.29943642020225525, "eval_runtime": 1106.8441, "eval_samples_per_second": 8.842, "eval_steps_per_second": 8.842, "step": 3500 }, { "epoch": 0.318815568372769, "grad_norm": 2.698784112930298, "learning_rate": 9.998803401052799e-05, "loss": 0.3728, "step": 3510 }, { "epoch": 0.31972387483536946, "grad_norm": 1.2175794839859009, "learning_rate": 9.998685000223535e-05, "loss": 0.3165, "step": 3520 }, { "epoch": 0.32063218129796994, "grad_norm": 1.9806511402130127, "learning_rate": 9.99856101545136e-05, "loss": 0.3981, "step": 3530 }, { "epoch": 0.3215404877605704, "grad_norm": 2.456066608428955, "learning_rate": 9.998431446874774e-05, "loss": 0.3321, "step": 3540 }, { "epoch": 0.3224487942231709, "grad_norm": 0.7148177623748779, "learning_rate": 9.998296294638517e-05, "loss": 0.3147, "step": 3550 }, { "epoch": 0.3233571006857714, "grad_norm": 4.362259864807129, "learning_rate": 9.998155558893563e-05, "loss": 0.2929, "step": 3560 }, { "epoch": 0.3242654071483719, "grad_norm": 1.3332549333572388, "learning_rate": 9.998009239797126e-05, "loss": 0.2724, "step": 3570 }, { "epoch": 0.32517371361097236, "grad_norm": 4.9125752449035645, "learning_rate": 9.997857337512659e-05, "loss": 0.3963, "step": 3580 }, { "epoch": 0.32608202007357284, "grad_norm": 4.234802722930908, "learning_rate": 9.997699852209847e-05, "loss": 0.4195, "step": 3590 }, { "epoch": 0.3269903265361733, "grad_norm": 3.908548593521118, "learning_rate": 9.997536784064614e-05, "loss": 0.3581, "step": 3600 }, { "epoch": 0.3278986329987738, "grad_norm": 2.4295222759246826, "learning_rate": 9.997368133259122e-05, "loss": 0.3439, "step": 3610 }, { "epoch": 0.3288069394613743, "grad_norm": 2.70206618309021, "learning_rate": 9.997193899981766e-05, "loss": 0.3291, "step": 3620 }, { "epoch": 0.3297152459239748, "grad_norm": 0.8036856651306152, "learning_rate": 9.997014084427181e-05, "loss": 0.2642, "step": 3630 }, { "epoch": 0.3306235523865752, "grad_norm": 1.8172664642333984, "learning_rate": 9.996828686796233e-05, "loss": 0.3763, "step": 3640 }, { "epoch": 0.3315318588491757, "grad_norm": 1.267970085144043, "learning_rate": 9.996637707296031e-05, "loss": 0.3076, "step": 3650 }, { "epoch": 0.33244016531177617, "grad_norm": 2.740004301071167, "learning_rate": 9.996441146139909e-05, "loss": 0.2893, "step": 3660 }, { "epoch": 0.33334847177437665, "grad_norm": 4.538939476013184, "learning_rate": 9.996239003547446e-05, "loss": 0.2606, "step": 3670 }, { "epoch": 0.33425677823697714, "grad_norm": 1.9399051666259766, "learning_rate": 9.99603127974445e-05, "loss": 0.2179, "step": 3680 }, { "epoch": 0.3351650846995776, "grad_norm": 2.1359503269195557, "learning_rate": 9.995817974962968e-05, "loss": 0.2504, "step": 3690 }, { "epoch": 0.3360733911621781, "grad_norm": 2.596465587615967, "learning_rate": 9.995599089441276e-05, "loss": 0.3373, "step": 3700 }, { "epoch": 0.3369816976247786, "grad_norm": 2.419083595275879, "learning_rate": 9.995374623423891e-05, "loss": 0.2222, "step": 3710 }, { "epoch": 0.33789000408737907, "grad_norm": 4.043828964233398, "learning_rate": 9.995144577161556e-05, "loss": 0.3632, "step": 3720 }, { "epoch": 0.33879831054997955, "grad_norm": 2.252554178237915, "learning_rate": 9.994908950911254e-05, "loss": 0.25, "step": 3730 }, { "epoch": 0.33970661701258004, "grad_norm": 3.0803115367889404, "learning_rate": 9.994667744936197e-05, "loss": 0.3297, "step": 3740 }, { "epoch": 0.3406149234751805, "grad_norm": 2.867170572280884, "learning_rate": 9.994420959505834e-05, "loss": 0.279, "step": 3750 }, { "epoch": 0.341523229937781, "grad_norm": 3.165942668914795, "learning_rate": 9.994168594895844e-05, "loss": 0.2931, "step": 3760 }, { "epoch": 0.3424315364003815, "grad_norm": 1.7621864080429077, "learning_rate": 9.993910651388138e-05, "loss": 0.3384, "step": 3770 }, { "epoch": 0.34333984286298197, "grad_norm": 0.8255841135978699, "learning_rate": 9.993647129270862e-05, "loss": 0.2419, "step": 3780 }, { "epoch": 0.34424814932558245, "grad_norm": 1.530321478843689, "learning_rate": 9.993378028838392e-05, "loss": 0.289, "step": 3790 }, { "epoch": 0.34515645578818294, "grad_norm": 2.41097354888916, "learning_rate": 9.993103350391332e-05, "loss": 0.2646, "step": 3800 }, { "epoch": 0.3460647622507834, "grad_norm": 4.3905134201049805, "learning_rate": 9.992823094236524e-05, "loss": 0.2599, "step": 3810 }, { "epoch": 0.3469730687133839, "grad_norm": 1.826992392539978, "learning_rate": 9.992537260687036e-05, "loss": 0.2041, "step": 3820 }, { "epoch": 0.3478813751759844, "grad_norm": 2.0689048767089844, "learning_rate": 9.992245850062168e-05, "loss": 0.2571, "step": 3830 }, { "epoch": 0.34878968163858487, "grad_norm": 2.390310287475586, "learning_rate": 9.991948862687448e-05, "loss": 0.1969, "step": 3840 }, { "epoch": 0.34969798810118535, "grad_norm": 0.1661413162946701, "learning_rate": 9.991646298894637e-05, "loss": 0.2791, "step": 3850 }, { "epoch": 0.35060629456378584, "grad_norm": 0.9088653922080994, "learning_rate": 9.991338159021723e-05, "loss": 0.3214, "step": 3860 }, { "epoch": 0.3515146010263863, "grad_norm": 2.2035303115844727, "learning_rate": 9.991024443412924e-05, "loss": 0.2663, "step": 3870 }, { "epoch": 0.3524229074889868, "grad_norm": 2.0854992866516113, "learning_rate": 9.990705152418687e-05, "loss": 0.2426, "step": 3880 }, { "epoch": 0.3533312139515873, "grad_norm": 0.47039660811424255, "learning_rate": 9.990380286395684e-05, "loss": 0.2594, "step": 3890 }, { "epoch": 0.35423952041418777, "grad_norm": 3.758791446685791, "learning_rate": 9.99004984570682e-05, "loss": 0.2216, "step": 3900 }, { "epoch": 0.35514782687678825, "grad_norm": 5.700037479400635, "learning_rate": 9.989713830721222e-05, "loss": 0.3209, "step": 3910 }, { "epoch": 0.35605613333938874, "grad_norm": 2.671236038208008, "learning_rate": 9.989372241814248e-05, "loss": 0.393, "step": 3920 }, { "epoch": 0.35696443980198916, "grad_norm": 5.253303527832031, "learning_rate": 9.98902507936748e-05, "loss": 0.2609, "step": 3930 }, { "epoch": 0.35787274626458965, "grad_norm": 1.3937228918075562, "learning_rate": 9.988672343768728e-05, "loss": 0.2647, "step": 3940 }, { "epoch": 0.35878105272719013, "grad_norm": 2.714449405670166, "learning_rate": 9.988314035412026e-05, "loss": 0.261, "step": 3950 }, { "epoch": 0.3596893591897906, "grad_norm": 2.035243034362793, "learning_rate": 9.987950154697634e-05, "loss": 0.2401, "step": 3960 }, { "epoch": 0.3605976656523911, "grad_norm": 0.7483572959899902, "learning_rate": 9.987580702032036e-05, "loss": 0.2704, "step": 3970 }, { "epoch": 0.3615059721149916, "grad_norm": 1.0530205965042114, "learning_rate": 9.987205677827943e-05, "loss": 0.2611, "step": 3980 }, { "epoch": 0.36241427857759206, "grad_norm": 2.2220864295959473, "learning_rate": 9.986825082504288e-05, "loss": 0.2674, "step": 3990 }, { "epoch": 0.36332258504019255, "grad_norm": 3.886570930480957, "learning_rate": 9.986438916486223e-05, "loss": 0.3416, "step": 4000 }, { "epoch": 0.36332258504019255, "eval_loss": 0.29881542921066284, "eval_runtime": 1141.6629, "eval_samples_per_second": 8.573, "eval_steps_per_second": 8.573, "step": 4000 }, { "epoch": 0.36423089150279303, "grad_norm": 2.3018815517425537, "learning_rate": 9.986047180205132e-05, "loss": 0.2859, "step": 4010 }, { "epoch": 0.3651391979653935, "grad_norm": 4.607203006744385, "learning_rate": 9.985649874098615e-05, "loss": 0.307, "step": 4020 }, { "epoch": 0.366047504427994, "grad_norm": 1.433944582939148, "learning_rate": 9.985246998610496e-05, "loss": 0.2622, "step": 4030 }, { "epoch": 0.3669558108905945, "grad_norm": 2.0020110607147217, "learning_rate": 9.984838554190821e-05, "loss": 0.2806, "step": 4040 }, { "epoch": 0.36786411735319496, "grad_norm": 1.358676791191101, "learning_rate": 9.984424541295852e-05, "loss": 0.3102, "step": 4050 }, { "epoch": 0.36877242381579545, "grad_norm": 1.7420293092727661, "learning_rate": 9.98400496038808e-05, "loss": 0.1811, "step": 4060 }, { "epoch": 0.36968073027839593, "grad_norm": 0.5398093461990356, "learning_rate": 9.98357981193621e-05, "loss": 0.2214, "step": 4070 }, { "epoch": 0.3705890367409964, "grad_norm": 5.648110389709473, "learning_rate": 9.983149096415169e-05, "loss": 0.3171, "step": 4080 }, { "epoch": 0.3714973432035969, "grad_norm": 2.0738329887390137, "learning_rate": 9.982712814306099e-05, "loss": 0.3258, "step": 4090 }, { "epoch": 0.3724056496661974, "grad_norm": 2.0598042011260986, "learning_rate": 9.982270966096364e-05, "loss": 0.2615, "step": 4100 }, { "epoch": 0.37331395612879786, "grad_norm": 2.4509778022766113, "learning_rate": 9.981823552279545e-05, "loss": 0.2315, "step": 4110 }, { "epoch": 0.37422226259139835, "grad_norm": 1.6241751909255981, "learning_rate": 9.981370573355442e-05, "loss": 0.2736, "step": 4120 }, { "epoch": 0.37513056905399883, "grad_norm": 2.6666295528411865, "learning_rate": 9.980912029830068e-05, "loss": 0.3559, "step": 4130 }, { "epoch": 0.3760388755165993, "grad_norm": 0.8722617030143738, "learning_rate": 9.980447922215653e-05, "loss": 0.2654, "step": 4140 }, { "epoch": 0.3769471819791998, "grad_norm": 2.1647086143493652, "learning_rate": 9.979978251030645e-05, "loss": 0.2361, "step": 4150 }, { "epoch": 0.3778554884418003, "grad_norm": 2.6035797595977783, "learning_rate": 9.979503016799705e-05, "loss": 0.3007, "step": 4160 }, { "epoch": 0.37876379490440076, "grad_norm": 1.8636099100112915, "learning_rate": 9.979022220053708e-05, "loss": 0.2035, "step": 4170 }, { "epoch": 0.37967210136700125, "grad_norm": 4.108850955963135, "learning_rate": 9.978535861329746e-05, "loss": 0.2916, "step": 4180 }, { "epoch": 0.38058040782960173, "grad_norm": 2.4220221042633057, "learning_rate": 9.978043941171118e-05, "loss": 0.2674, "step": 4190 }, { "epoch": 0.3814887142922022, "grad_norm": 1.6163685321807861, "learning_rate": 9.977546460127343e-05, "loss": 0.2509, "step": 4200 }, { "epoch": 0.3823970207548027, "grad_norm": 3.880574941635132, "learning_rate": 9.977043418754146e-05, "loss": 0.3137, "step": 4210 }, { "epoch": 0.3833053272174031, "grad_norm": 3.2876386642456055, "learning_rate": 9.976534817613467e-05, "loss": 0.2928, "step": 4220 }, { "epoch": 0.3842136336800036, "grad_norm": 1.216164469718933, "learning_rate": 9.976020657273452e-05, "loss": 0.4365, "step": 4230 }, { "epoch": 0.3851219401426041, "grad_norm": 3.832597494125366, "learning_rate": 9.975500938308466e-05, "loss": 0.2025, "step": 4240 }, { "epoch": 0.3860302466052046, "grad_norm": 1.8185060024261475, "learning_rate": 9.974975661299075e-05, "loss": 0.2029, "step": 4250 }, { "epoch": 0.38693855306780506, "grad_norm": 3.0377776622772217, "learning_rate": 9.974444826832056e-05, "loss": 0.3456, "step": 4260 }, { "epoch": 0.38784685953040554, "grad_norm": 0.983494758605957, "learning_rate": 9.973908435500395e-05, "loss": 0.2795, "step": 4270 }, { "epoch": 0.388755165993006, "grad_norm": 1.9286140203475952, "learning_rate": 9.973366487903287e-05, "loss": 0.2719, "step": 4280 }, { "epoch": 0.3896634724556065, "grad_norm": 3.8136041164398193, "learning_rate": 9.972818984646131e-05, "loss": 0.2604, "step": 4290 }, { "epoch": 0.390571778918207, "grad_norm": 0.22050902247428894, "learning_rate": 9.972265926340532e-05, "loss": 0.2715, "step": 4300 }, { "epoch": 0.3914800853808075, "grad_norm": 2.356178045272827, "learning_rate": 9.971707313604303e-05, "loss": 0.3466, "step": 4310 }, { "epoch": 0.39238839184340796, "grad_norm": 2.370565414428711, "learning_rate": 9.971143147061462e-05, "loss": 0.3667, "step": 4320 }, { "epoch": 0.39329669830600844, "grad_norm": 0.28996747732162476, "learning_rate": 9.970573427342225e-05, "loss": 0.2214, "step": 4330 }, { "epoch": 0.3942050047686089, "grad_norm": 0.8521168231964111, "learning_rate": 9.969998155083021e-05, "loss": 0.3029, "step": 4340 }, { "epoch": 0.3951133112312094, "grad_norm": 0.692247748374939, "learning_rate": 9.969417330926474e-05, "loss": 0.2133, "step": 4350 }, { "epoch": 0.3960216176938099, "grad_norm": 1.6133493185043335, "learning_rate": 9.968830955521412e-05, "loss": 0.3374, "step": 4360 }, { "epoch": 0.3969299241564104, "grad_norm": 2.2590017318725586, "learning_rate": 9.968239029522866e-05, "loss": 0.2487, "step": 4370 }, { "epoch": 0.39783823061901086, "grad_norm": 1.1666592359542847, "learning_rate": 9.967641553592064e-05, "loss": 0.1924, "step": 4380 }, { "epoch": 0.39874653708161134, "grad_norm": 0.9639243483543396, "learning_rate": 9.967038528396437e-05, "loss": 0.3526, "step": 4390 }, { "epoch": 0.3996548435442118, "grad_norm": 2.7259087562561035, "learning_rate": 9.966429954609615e-05, "loss": 0.1871, "step": 4400 }, { "epoch": 0.4005631500068123, "grad_norm": 3.2307636737823486, "learning_rate": 9.965815832911425e-05, "loss": 0.2268, "step": 4410 }, { "epoch": 0.4014714564694128, "grad_norm": 1.6424212455749512, "learning_rate": 9.965196163987889e-05, "loss": 0.3152, "step": 4420 }, { "epoch": 0.4023797629320133, "grad_norm": 1.5231444835662842, "learning_rate": 9.964570948531231e-05, "loss": 0.3417, "step": 4430 }, { "epoch": 0.40328806939461376, "grad_norm": 2.430523633956909, "learning_rate": 9.963940187239867e-05, "loss": 0.308, "step": 4440 }, { "epoch": 0.40419637585721424, "grad_norm": 3.5801784992218018, "learning_rate": 9.963303880818407e-05, "loss": 0.365, "step": 4450 }, { "epoch": 0.4051046823198147, "grad_norm": 3.5012221336364746, "learning_rate": 9.962662029977663e-05, "loss": 0.2037, "step": 4460 }, { "epoch": 0.4060129887824152, "grad_norm": 0.7117727398872375, "learning_rate": 9.962014635434631e-05, "loss": 0.2598, "step": 4470 }, { "epoch": 0.4069212952450157, "grad_norm": 2.0606374740600586, "learning_rate": 9.961361697912504e-05, "loss": 0.4256, "step": 4480 }, { "epoch": 0.4078296017076162, "grad_norm": 1.2288482189178467, "learning_rate": 9.96070321814067e-05, "loss": 0.2701, "step": 4490 }, { "epoch": 0.40873790817021666, "grad_norm": 2.6544997692108154, "learning_rate": 9.960039196854702e-05, "loss": 0.266, "step": 4500 }, { "epoch": 0.40873790817021666, "eval_loss": 0.27328890562057495, "eval_runtime": 1107.5917, "eval_samples_per_second": 8.836, "eval_steps_per_second": 8.836, "step": 4500 }, { "epoch": 0.4096462146328171, "grad_norm": 0.4306527376174927, "learning_rate": 9.95936963479637e-05, "loss": 0.2993, "step": 4510 }, { "epoch": 0.41055452109541757, "grad_norm": 0.9289443492889404, "learning_rate": 9.958694532713626e-05, "loss": 0.2737, "step": 4520 }, { "epoch": 0.41146282755801805, "grad_norm": 2.4210355281829834, "learning_rate": 9.958013891360618e-05, "loss": 0.2897, "step": 4530 }, { "epoch": 0.41237113402061853, "grad_norm": 2.8126158714294434, "learning_rate": 9.957327711497678e-05, "loss": 0.2599, "step": 4540 }, { "epoch": 0.413279440483219, "grad_norm": 0.22515113651752472, "learning_rate": 9.956635993891322e-05, "loss": 0.1617, "step": 4550 }, { "epoch": 0.4141877469458195, "grad_norm": 1.239336371421814, "learning_rate": 9.955938739314258e-05, "loss": 0.2971, "step": 4560 }, { "epoch": 0.41509605340842, "grad_norm": 0.627289891242981, "learning_rate": 9.955235948545377e-05, "loss": 0.2073, "step": 4570 }, { "epoch": 0.41600435987102047, "grad_norm": 1.3213613033294678, "learning_rate": 9.954527622369753e-05, "loss": 0.1608, "step": 4580 }, { "epoch": 0.41691266633362095, "grad_norm": 1.2792463302612305, "learning_rate": 9.953813761578648e-05, "loss": 0.2393, "step": 4590 }, { "epoch": 0.41782097279622143, "grad_norm": 2.0013575553894043, "learning_rate": 9.9530943669695e-05, "loss": 0.3523, "step": 4600 }, { "epoch": 0.4187292792588219, "grad_norm": 0.6592528820037842, "learning_rate": 9.952369439345933e-05, "loss": 0.2399, "step": 4610 }, { "epoch": 0.4196375857214224, "grad_norm": 0.7856512069702148, "learning_rate": 9.95163897951775e-05, "loss": 0.2266, "step": 4620 }, { "epoch": 0.4205458921840229, "grad_norm": 2.3692429065704346, "learning_rate": 9.950902988300937e-05, "loss": 0.2537, "step": 4630 }, { "epoch": 0.42145419864662337, "grad_norm": 0.5489498972892761, "learning_rate": 9.950161466517655e-05, "loss": 0.2197, "step": 4640 }, { "epoch": 0.42236250510922385, "grad_norm": 3.102792501449585, "learning_rate": 9.949414414996245e-05, "loss": 0.2083, "step": 4650 }, { "epoch": 0.42327081157182433, "grad_norm": 3.8738179206848145, "learning_rate": 9.948661834571226e-05, "loss": 0.3517, "step": 4660 }, { "epoch": 0.4241791180344248, "grad_norm": 1.5303255319595337, "learning_rate": 9.947903726083292e-05, "loss": 0.3526, "step": 4670 }, { "epoch": 0.4250874244970253, "grad_norm": 1.8834445476531982, "learning_rate": 9.947140090379311e-05, "loss": 0.2831, "step": 4680 }, { "epoch": 0.4259957309596258, "grad_norm": 1.3233453035354614, "learning_rate": 9.94637092831233e-05, "loss": 0.3124, "step": 4690 }, { "epoch": 0.42690403742222627, "grad_norm": 1.8146551847457886, "learning_rate": 9.945596240741563e-05, "loss": 0.266, "step": 4700 }, { "epoch": 0.42781234388482675, "grad_norm": 0.647864818572998, "learning_rate": 9.944816028532404e-05, "loss": 0.2485, "step": 4710 }, { "epoch": 0.42872065034742723, "grad_norm": 1.021762490272522, "learning_rate": 9.94403029255641e-05, "loss": 0.2522, "step": 4720 }, { "epoch": 0.4296289568100277, "grad_norm": 0.8479620218276978, "learning_rate": 9.943239033691315e-05, "loss": 0.2427, "step": 4730 }, { "epoch": 0.4305372632726282, "grad_norm": 0.9768635034561157, "learning_rate": 9.94244225282102e-05, "loss": 0.2062, "step": 4740 }, { "epoch": 0.4314455697352287, "grad_norm": 1.0540388822555542, "learning_rate": 9.941639950835595e-05, "loss": 0.314, "step": 4750 }, { "epoch": 0.43235387619782917, "grad_norm": 2.2613306045532227, "learning_rate": 9.940832128631279e-05, "loss": 0.3078, "step": 4760 }, { "epoch": 0.43326218266042965, "grad_norm": 1.2481287717819214, "learning_rate": 9.940018787110472e-05, "loss": 0.2687, "step": 4770 }, { "epoch": 0.43417048912303013, "grad_norm": 1.1386662721633911, "learning_rate": 9.939199927181747e-05, "loss": 0.2565, "step": 4780 }, { "epoch": 0.4350787955856306, "grad_norm": 0.341910719871521, "learning_rate": 9.938375549759838e-05, "loss": 0.291, "step": 4790 }, { "epoch": 0.43598710204823105, "grad_norm": 1.2025765180587769, "learning_rate": 9.937545655765639e-05, "loss": 0.2385, "step": 4800 }, { "epoch": 0.43689540851083153, "grad_norm": 0.7784351110458374, "learning_rate": 9.936710246126213e-05, "loss": 0.3387, "step": 4810 }, { "epoch": 0.437803714973432, "grad_norm": 1.882187843322754, "learning_rate": 9.93586932177478e-05, "loss": 0.2237, "step": 4820 }, { "epoch": 0.4387120214360325, "grad_norm": 2.0169270038604736, "learning_rate": 9.935022883650723e-05, "loss": 0.2394, "step": 4830 }, { "epoch": 0.439620327898633, "grad_norm": 1.947173833847046, "learning_rate": 9.934170932699582e-05, "loss": 0.2457, "step": 4840 }, { "epoch": 0.44052863436123346, "grad_norm": 2.0706264972686768, "learning_rate": 9.933313469873058e-05, "loss": 0.2615, "step": 4850 }, { "epoch": 0.44143694082383395, "grad_norm": 8.043557167053223, "learning_rate": 9.932450496129007e-05, "loss": 0.2624, "step": 4860 }, { "epoch": 0.44234524728643443, "grad_norm": 3.470097780227661, "learning_rate": 9.93158201243144e-05, "loss": 0.3107, "step": 4870 }, { "epoch": 0.4432535537490349, "grad_norm": 1.3256440162658691, "learning_rate": 9.930708019750526e-05, "loss": 0.3491, "step": 4880 }, { "epoch": 0.4441618602116354, "grad_norm": 0.5626578330993652, "learning_rate": 9.929828519062586e-05, "loss": 0.2477, "step": 4890 }, { "epoch": 0.4450701666742359, "grad_norm": 1.9002071619033813, "learning_rate": 9.928943511350098e-05, "loss": 0.1801, "step": 4900 }, { "epoch": 0.44597847313683636, "grad_norm": 5.107746601104736, "learning_rate": 9.928052997601683e-05, "loss": 0.1939, "step": 4910 }, { "epoch": 0.44688677959943685, "grad_norm": 2.4780921936035156, "learning_rate": 9.927156978812125e-05, "loss": 0.2763, "step": 4920 }, { "epoch": 0.44779508606203733, "grad_norm": 2.579906463623047, "learning_rate": 9.926255455982344e-05, "loss": 0.1929, "step": 4930 }, { "epoch": 0.4487033925246378, "grad_norm": 1.4029031991958618, "learning_rate": 9.925348430119419e-05, "loss": 0.3113, "step": 4940 }, { "epoch": 0.4496116989872383, "grad_norm": 1.3975722789764404, "learning_rate": 9.924435902236572e-05, "loss": 0.309, "step": 4950 }, { "epoch": 0.4505200054498388, "grad_norm": 1.163485050201416, "learning_rate": 9.923517873353174e-05, "loss": 0.2066, "step": 4960 }, { "epoch": 0.45142831191243926, "grad_norm": 1.0222816467285156, "learning_rate": 9.922594344494734e-05, "loss": 0.2625, "step": 4970 }, { "epoch": 0.45233661837503975, "grad_norm": 1.226617455482483, "learning_rate": 9.921665316692915e-05, "loss": 0.2855, "step": 4980 }, { "epoch": 0.45324492483764023, "grad_norm": 1.081939935684204, "learning_rate": 9.920730790985512e-05, "loss": 0.3243, "step": 4990 }, { "epoch": 0.4541532313002407, "grad_norm": 0.5114734768867493, "learning_rate": 9.919790768416472e-05, "loss": 0.2433, "step": 5000 }, { "epoch": 0.4541532313002407, "eval_loss": 0.28244665265083313, "eval_runtime": 1123.0073, "eval_samples_per_second": 8.715, "eval_steps_per_second": 8.715, "step": 5000 }, { "epoch": 0.4550615377628412, "grad_norm": 0.6946306824684143, "learning_rate": 9.918845250035877e-05, "loss": 0.2688, "step": 5010 }, { "epoch": 0.4559698442254417, "grad_norm": 2.392451524734497, "learning_rate": 9.917894236899948e-05, "loss": 0.3144, "step": 5020 }, { "epoch": 0.45687815068804216, "grad_norm": 1.639190673828125, "learning_rate": 9.916937730071044e-05, "loss": 0.3433, "step": 5030 }, { "epoch": 0.45778645715064264, "grad_norm": 2.897522211074829, "learning_rate": 9.915975730617662e-05, "loss": 0.2835, "step": 5040 }, { "epoch": 0.45869476361324313, "grad_norm": 3.3496289253234863, "learning_rate": 9.915008239614435e-05, "loss": 0.2266, "step": 5050 }, { "epoch": 0.4596030700758436, "grad_norm": 2.0436525344848633, "learning_rate": 9.914035258142131e-05, "loss": 0.2279, "step": 5060 }, { "epoch": 0.4605113765384441, "grad_norm": 1.277806282043457, "learning_rate": 9.913056787287651e-05, "loss": 0.2484, "step": 5070 }, { "epoch": 0.4614196830010446, "grad_norm": 1.3939757347106934, "learning_rate": 9.912072828144025e-05, "loss": 0.2695, "step": 5080 }, { "epoch": 0.46232798946364506, "grad_norm": 2.1752147674560547, "learning_rate": 9.91108338181042e-05, "loss": 0.3264, "step": 5090 }, { "epoch": 0.4632362959262455, "grad_norm": 3.932366132736206, "learning_rate": 9.910088449392127e-05, "loss": 0.3119, "step": 5100 }, { "epoch": 0.464144602388846, "grad_norm": 1.4198850393295288, "learning_rate": 9.909088032000567e-05, "loss": 0.2321, "step": 5110 }, { "epoch": 0.46505290885144646, "grad_norm": 4.705433368682861, "learning_rate": 9.908082130753288e-05, "loss": 0.2542, "step": 5120 }, { "epoch": 0.46596121531404694, "grad_norm": 2.429344654083252, "learning_rate": 9.907070746773966e-05, "loss": 0.2434, "step": 5130 }, { "epoch": 0.4668695217766474, "grad_norm": 0.5079476237297058, "learning_rate": 9.906053881192401e-05, "loss": 0.281, "step": 5140 }, { "epoch": 0.4677778282392479, "grad_norm": 1.0167567729949951, "learning_rate": 9.905031535144513e-05, "loss": 0.2869, "step": 5150 }, { "epoch": 0.4686861347018484, "grad_norm": 3.0708744525909424, "learning_rate": 9.904003709772351e-05, "loss": 0.2674, "step": 5160 }, { "epoch": 0.4695944411644489, "grad_norm": 2.0558860301971436, "learning_rate": 9.902970406224075e-05, "loss": 0.2209, "step": 5170 }, { "epoch": 0.47050274762704936, "grad_norm": 0.37494325637817383, "learning_rate": 9.901931625653975e-05, "loss": 0.2413, "step": 5180 }, { "epoch": 0.47141105408964984, "grad_norm": 1.0552047491073608, "learning_rate": 9.900887369222451e-05, "loss": 0.1717, "step": 5190 }, { "epoch": 0.4723193605522503, "grad_norm": 2.227484941482544, "learning_rate": 9.899837638096026e-05, "loss": 0.2449, "step": 5200 }, { "epoch": 0.4732276670148508, "grad_norm": 5.132054328918457, "learning_rate": 9.898782433447335e-05, "loss": 0.276, "step": 5210 }, { "epoch": 0.4741359734774513, "grad_norm": 1.6538174152374268, "learning_rate": 9.897721756455129e-05, "loss": 0.2124, "step": 5220 }, { "epoch": 0.4750442799400518, "grad_norm": 0.051094092428684235, "learning_rate": 9.89665560830427e-05, "loss": 0.2461, "step": 5230 }, { "epoch": 0.47595258640265226, "grad_norm": 0.728874683380127, "learning_rate": 9.895583990185736e-05, "loss": 0.2107, "step": 5240 }, { "epoch": 0.47686089286525274, "grad_norm": 2.5067851543426514, "learning_rate": 9.89450690329661e-05, "loss": 0.2838, "step": 5250 }, { "epoch": 0.4777691993278532, "grad_norm": 0.6972713470458984, "learning_rate": 9.893424348840091e-05, "loss": 0.158, "step": 5260 }, { "epoch": 0.4786775057904537, "grad_norm": 1.4032970666885376, "learning_rate": 9.892336328025476e-05, "loss": 0.2499, "step": 5270 }, { "epoch": 0.4795858122530542, "grad_norm": 1.8530218601226807, "learning_rate": 9.891242842068176e-05, "loss": 0.1962, "step": 5280 }, { "epoch": 0.4804941187156547, "grad_norm": 2.4675230979919434, "learning_rate": 9.890143892189704e-05, "loss": 0.2028, "step": 5290 }, { "epoch": 0.48140242517825516, "grad_norm": 0.8103827834129333, "learning_rate": 9.889039479617683e-05, "loss": 0.2537, "step": 5300 }, { "epoch": 0.48231073164085564, "grad_norm": 2.1776299476623535, "learning_rate": 9.887929605585827e-05, "loss": 0.3369, "step": 5310 }, { "epoch": 0.4832190381034561, "grad_norm": 1.612938642501831, "learning_rate": 9.886814271333958e-05, "loss": 0.3198, "step": 5320 }, { "epoch": 0.4841273445660566, "grad_norm": 2.2068440914154053, "learning_rate": 9.885693478107995e-05, "loss": 0.2812, "step": 5330 }, { "epoch": 0.4850356510286571, "grad_norm": 1.6196894645690918, "learning_rate": 9.88456722715996e-05, "loss": 0.3035, "step": 5340 }, { "epoch": 0.48594395749125757, "grad_norm": 1.1289561986923218, "learning_rate": 9.883435519747966e-05, "loss": 0.2722, "step": 5350 }, { "epoch": 0.48685226395385806, "grad_norm": 0.007345729973167181, "learning_rate": 9.882298357136221e-05, "loss": 0.1863, "step": 5360 }, { "epoch": 0.48776057041645854, "grad_norm": 1.0124266147613525, "learning_rate": 9.881155740595035e-05, "loss": 0.275, "step": 5370 }, { "epoch": 0.488668876879059, "grad_norm": 1.4120680093765259, "learning_rate": 9.880007671400799e-05, "loss": 0.1681, "step": 5380 }, { "epoch": 0.48957718334165945, "grad_norm": 2.2796225547790527, "learning_rate": 9.878854150836006e-05, "loss": 0.3377, "step": 5390 }, { "epoch": 0.49048548980425993, "grad_norm": 3.280296802520752, "learning_rate": 9.87769518018923e-05, "loss": 0.2857, "step": 5400 }, { "epoch": 0.4913937962668604, "grad_norm": 2.663513660430908, "learning_rate": 9.876530760755138e-05, "loss": 0.2713, "step": 5410 }, { "epoch": 0.4923021027294609, "grad_norm": 3.103539228439331, "learning_rate": 9.87536089383448e-05, "loss": 0.1616, "step": 5420 }, { "epoch": 0.4932104091920614, "grad_norm": 3.4854400157928467, "learning_rate": 9.874185580734098e-05, "loss": 0.2304, "step": 5430 }, { "epoch": 0.49411871565466187, "grad_norm": 1.5638200044631958, "learning_rate": 9.873004822766909e-05, "loss": 0.3132, "step": 5440 }, { "epoch": 0.49502702211726235, "grad_norm": 1.3070992231369019, "learning_rate": 9.871818621251921e-05, "loss": 0.2623, "step": 5450 }, { "epoch": 0.49593532857986283, "grad_norm": 1.6122404336929321, "learning_rate": 9.870626977514217e-05, "loss": 0.2165, "step": 5460 }, { "epoch": 0.4968436350424633, "grad_norm": 2.62027907371521, "learning_rate": 9.869429892884961e-05, "loss": 0.2426, "step": 5470 }, { "epoch": 0.4977519415050638, "grad_norm": 0.023721417412161827, "learning_rate": 9.868227368701394e-05, "loss": 0.2272, "step": 5480 }, { "epoch": 0.4986602479676643, "grad_norm": 0.9095699787139893, "learning_rate": 9.867019406306839e-05, "loss": 0.2498, "step": 5490 }, { "epoch": 0.49956855443026477, "grad_norm": 1.8093737363815308, "learning_rate": 9.865806007050684e-05, "loss": 0.2216, "step": 5500 }, { "epoch": 0.49956855443026477, "eval_loss": 0.2608264088630676, "eval_runtime": 1111.9631, "eval_samples_per_second": 8.802, "eval_steps_per_second": 8.802, "step": 5500 }, { "epoch": 0.5004768608928652, "grad_norm": 2.9160618782043457, "learning_rate": 9.8645871722884e-05, "loss": 0.2234, "step": 5510 }, { "epoch": 0.5013851673554658, "grad_norm": 5.796836853027344, "learning_rate": 9.863362903381523e-05, "loss": 0.206, "step": 5520 }, { "epoch": 0.5022934738180662, "grad_norm": 0.7747005224227905, "learning_rate": 9.862133201697663e-05, "loss": 0.1996, "step": 5530 }, { "epoch": 0.5032017802806666, "grad_norm": 2.962594747543335, "learning_rate": 9.8608980686105e-05, "loss": 0.2792, "step": 5540 }, { "epoch": 0.5041100867432672, "grad_norm": 3.1700422763824463, "learning_rate": 9.859657505499778e-05, "loss": 0.2468, "step": 5550 }, { "epoch": 0.5050183932058676, "grad_norm": 2.210045576095581, "learning_rate": 9.858411513751307e-05, "loss": 0.3052, "step": 5560 }, { "epoch": 0.5059266996684681, "grad_norm": 1.3153102397918701, "learning_rate": 9.857160094756964e-05, "loss": 0.2534, "step": 5570 }, { "epoch": 0.5068350061310686, "grad_norm": 0.9745922684669495, "learning_rate": 9.855903249914685e-05, "loss": 0.2347, "step": 5580 }, { "epoch": 0.5077433125936691, "grad_norm": 0.4881088435649872, "learning_rate": 9.854640980628472e-05, "loss": 0.2033, "step": 5590 }, { "epoch": 0.5086516190562695, "grad_norm": 1.641475796699524, "learning_rate": 9.853373288308382e-05, "loss": 0.2071, "step": 5600 }, { "epoch": 0.5095599255188701, "grad_norm": 1.9331648349761963, "learning_rate": 9.852100174370535e-05, "loss": 0.1719, "step": 5610 }, { "epoch": 0.5104682319814705, "grad_norm": 0.5361711382865906, "learning_rate": 9.850821640237098e-05, "loss": 0.1529, "step": 5620 }, { "epoch": 0.511376538444071, "grad_norm": 2.0810163021087646, "learning_rate": 9.849537687336305e-05, "loss": 0.2581, "step": 5630 }, { "epoch": 0.5122848449066715, "grad_norm": 1.8923650979995728, "learning_rate": 9.848248317102433e-05, "loss": 0.2599, "step": 5640 }, { "epoch": 0.513193151369272, "grad_norm": 1.4301342964172363, "learning_rate": 9.846953530975817e-05, "loss": 0.2457, "step": 5650 }, { "epoch": 0.5141014578318724, "grad_norm": 3.2128632068634033, "learning_rate": 9.84565333040284e-05, "loss": 0.2221, "step": 5660 }, { "epoch": 0.515009764294473, "grad_norm": 1.5607028007507324, "learning_rate": 9.844347716835932e-05, "loss": 0.2714, "step": 5670 }, { "epoch": 0.5159180707570734, "grad_norm": 1.9585386514663696, "learning_rate": 9.84303669173357e-05, "loss": 0.2928, "step": 5680 }, { "epoch": 0.516826377219674, "grad_norm": 3.728541851043701, "learning_rate": 9.841720256560279e-05, "loss": 0.1941, "step": 5690 }, { "epoch": 0.5177346836822744, "grad_norm": 0.6061636805534363, "learning_rate": 9.840398412786625e-05, "loss": 0.2759, "step": 5700 }, { "epoch": 0.5186429901448749, "grad_norm": 4.226606369018555, "learning_rate": 9.839071161889215e-05, "loss": 0.2467, "step": 5710 }, { "epoch": 0.5195512966074753, "grad_norm": 2.029357433319092, "learning_rate": 9.8377385053507e-05, "loss": 0.2084, "step": 5720 }, { "epoch": 0.5204596030700759, "grad_norm": 1.3528738021850586, "learning_rate": 9.836400444659765e-05, "loss": 0.2198, "step": 5730 }, { "epoch": 0.5213679095326763, "grad_norm": 2.6634912490844727, "learning_rate": 9.835056981311135e-05, "loss": 0.2778, "step": 5740 }, { "epoch": 0.5222762159952768, "grad_norm": 2.9520325660705566, "learning_rate": 9.833708116805569e-05, "loss": 0.2814, "step": 5750 }, { "epoch": 0.5231845224578773, "grad_norm": 1.276153326034546, "learning_rate": 9.832353852649861e-05, "loss": 0.226, "step": 5760 }, { "epoch": 0.5240928289204778, "grad_norm": 1.506553292274475, "learning_rate": 9.830994190356832e-05, "loss": 0.2314, "step": 5770 }, { "epoch": 0.5250011353830782, "grad_norm": 1.3607161045074463, "learning_rate": 9.829629131445342e-05, "loss": 0.2619, "step": 5780 }, { "epoch": 0.5259094418456788, "grad_norm": 6.06449556350708, "learning_rate": 9.82825867744027e-05, "loss": 0.3264, "step": 5790 }, { "epoch": 0.5268177483082792, "grad_norm": 1.3633052110671997, "learning_rate": 9.826882829872528e-05, "loss": 0.2581, "step": 5800 }, { "epoch": 0.5277260547708797, "grad_norm": 1.0127356052398682, "learning_rate": 9.82550159027905e-05, "loss": 0.1694, "step": 5810 }, { "epoch": 0.5286343612334802, "grad_norm": 0.6645146608352661, "learning_rate": 9.824114960202795e-05, "loss": 0.2712, "step": 5820 }, { "epoch": 0.5295426676960806, "grad_norm": 2.104870319366455, "learning_rate": 9.822722941192744e-05, "loss": 0.2452, "step": 5830 }, { "epoch": 0.5304509741586811, "grad_norm": 2.603210926055908, "learning_rate": 9.821325534803895e-05, "loss": 0.2074, "step": 5840 }, { "epoch": 0.5313592806212816, "grad_norm": 2.764094591140747, "learning_rate": 9.819922742597267e-05, "loss": 0.2349, "step": 5850 }, { "epoch": 0.5322675870838821, "grad_norm": 0.8582749962806702, "learning_rate": 9.818514566139895e-05, "loss": 0.3232, "step": 5860 }, { "epoch": 0.5331758935464825, "grad_norm": 1.8156553506851196, "learning_rate": 9.817101007004828e-05, "loss": 0.2991, "step": 5870 }, { "epoch": 0.5340842000090831, "grad_norm": 2.806356191635132, "learning_rate": 9.815682066771126e-05, "loss": 0.2452, "step": 5880 }, { "epoch": 0.5349925064716835, "grad_norm": 2.2366864681243896, "learning_rate": 9.814257747023866e-05, "loss": 0.3195, "step": 5890 }, { "epoch": 0.535900812934284, "grad_norm": 1.18310546875, "learning_rate": 9.812828049354124e-05, "loss": 0.2675, "step": 5900 }, { "epoch": 0.5368091193968845, "grad_norm": 0.7333977222442627, "learning_rate": 9.811392975358997e-05, "loss": 0.2349, "step": 5910 }, { "epoch": 0.537717425859485, "grad_norm": 1.0801305770874023, "learning_rate": 9.809952526641578e-05, "loss": 0.192, "step": 5920 }, { "epoch": 0.5386257323220854, "grad_norm": 2.4547624588012695, "learning_rate": 9.808506704810967e-05, "loss": 0.2399, "step": 5930 }, { "epoch": 0.539534038784686, "grad_norm": 1.488930344581604, "learning_rate": 9.807055511482265e-05, "loss": 0.1919, "step": 5940 }, { "epoch": 0.5404423452472864, "grad_norm": 1.2081831693649292, "learning_rate": 9.805598948276575e-05, "loss": 0.2074, "step": 5950 }, { "epoch": 0.5413506517098869, "grad_norm": 1.3577630519866943, "learning_rate": 9.804137016821e-05, "loss": 0.1835, "step": 5960 }, { "epoch": 0.5422589581724874, "grad_norm": 0.5049612522125244, "learning_rate": 9.802669718748637e-05, "loss": 0.2082, "step": 5970 }, { "epoch": 0.5431672646350879, "grad_norm": 0.7716193795204163, "learning_rate": 9.801197055698577e-05, "loss": 0.1238, "step": 5980 }, { "epoch": 0.5440755710976883, "grad_norm": 0.6192967891693115, "learning_rate": 9.799719029315908e-05, "loss": 0.3721, "step": 5990 }, { "epoch": 0.5449838775602889, "grad_norm": 1.347855567932129, "learning_rate": 9.798235641251705e-05, "loss": 0.2781, "step": 6000 }, { "epoch": 0.5449838775602889, "eval_loss": 0.25950294733047485, "eval_runtime": 1112.9914, "eval_samples_per_second": 8.793, "eval_steps_per_second": 8.793, "step": 6000 }, { "epoch": 0.5458921840228893, "grad_norm": 2.2206993103027344, "learning_rate": 9.796746893163038e-05, "loss": 0.3382, "step": 6010 }, { "epoch": 0.5468004904854898, "grad_norm": 2.0331504344940186, "learning_rate": 9.795252786712958e-05, "loss": 0.2334, "step": 6020 }, { "epoch": 0.5477087969480903, "grad_norm": 1.056046485900879, "learning_rate": 9.793753323570507e-05, "loss": 0.1936, "step": 6030 }, { "epoch": 0.5486171034106908, "grad_norm": 1.3401196002960205, "learning_rate": 9.792248505410708e-05, "loss": 0.2467, "step": 6040 }, { "epoch": 0.5495254098732912, "grad_norm": 0.7586830258369446, "learning_rate": 9.790738333914566e-05, "loss": 0.1746, "step": 6050 }, { "epoch": 0.5504337163358918, "grad_norm": 0.8465110063552856, "learning_rate": 9.78922281076907e-05, "loss": 0.2168, "step": 6060 }, { "epoch": 0.5513420227984922, "grad_norm": 3.791752815246582, "learning_rate": 9.787701937667181e-05, "loss": 0.3545, "step": 6070 }, { "epoch": 0.5522503292610927, "grad_norm": 0.9363363981246948, "learning_rate": 9.78617571630784e-05, "loss": 0.1947, "step": 6080 }, { "epoch": 0.5531586357236932, "grad_norm": 1.3840748071670532, "learning_rate": 9.784644148395964e-05, "loss": 0.2851, "step": 6090 }, { "epoch": 0.5540669421862937, "grad_norm": 1.4701730012893677, "learning_rate": 9.783107235642439e-05, "loss": 0.2634, "step": 6100 }, { "epoch": 0.5549752486488941, "grad_norm": 0.6601725220680237, "learning_rate": 9.781564979764122e-05, "loss": 0.1616, "step": 6110 }, { "epoch": 0.5558835551114946, "grad_norm": 2.042914867401123, "learning_rate": 9.780017382483844e-05, "loss": 0.3109, "step": 6120 }, { "epoch": 0.5567918615740951, "grad_norm": 1.4042184352874756, "learning_rate": 9.778464445530393e-05, "loss": 0.2867, "step": 6130 }, { "epoch": 0.5577001680366955, "grad_norm": 1.3017420768737793, "learning_rate": 9.776906170638531e-05, "loss": 0.2557, "step": 6140 }, { "epoch": 0.5586084744992961, "grad_norm": 2.8851115703582764, "learning_rate": 9.775342559548978e-05, "loss": 0.2709, "step": 6150 }, { "epoch": 0.5595167809618965, "grad_norm": 1.527881145477295, "learning_rate": 9.77377361400842e-05, "loss": 0.2002, "step": 6160 }, { "epoch": 0.560425087424497, "grad_norm": 2.1142618656158447, "learning_rate": 9.772199335769494e-05, "loss": 0.2364, "step": 6170 }, { "epoch": 0.5613333938870975, "grad_norm": 1.199766993522644, "learning_rate": 9.7706197265908e-05, "loss": 0.2643, "step": 6180 }, { "epoch": 0.562241700349698, "grad_norm": 1.5001795291900635, "learning_rate": 9.769034788236892e-05, "loss": 0.1383, "step": 6190 }, { "epoch": 0.5631500068122984, "grad_norm": 3.9307475090026855, "learning_rate": 9.767444522478276e-05, "loss": 0.2575, "step": 6200 }, { "epoch": 0.564058313274899, "grad_norm": 3.859506607055664, "learning_rate": 9.765848931091409e-05, "loss": 0.2765, "step": 6210 }, { "epoch": 0.5649666197374994, "grad_norm": 2.1432902812957764, "learning_rate": 9.764248015858702e-05, "loss": 0.2293, "step": 6220 }, { "epoch": 0.5658749262000999, "grad_norm": 1.0950026512145996, "learning_rate": 9.762641778568502e-05, "loss": 0.2712, "step": 6230 }, { "epoch": 0.5667832326627004, "grad_norm": 1.8098639249801636, "learning_rate": 9.761030221015117e-05, "loss": 0.227, "step": 6240 }, { "epoch": 0.5676915391253009, "grad_norm": 2.006303548812866, "learning_rate": 9.759413344998784e-05, "loss": 0.2554, "step": 6250 }, { "epoch": 0.5685998455879013, "grad_norm": 3.10978102684021, "learning_rate": 9.757791152325688e-05, "loss": 0.2668, "step": 6260 }, { "epoch": 0.5695081520505019, "grad_norm": 2.424288511276245, "learning_rate": 9.756163644807951e-05, "loss": 0.1804, "step": 6270 }, { "epoch": 0.5704164585131023, "grad_norm": 2.52396559715271, "learning_rate": 9.754530824263635e-05, "loss": 0.2134, "step": 6280 }, { "epoch": 0.5713247649757028, "grad_norm": 1.2024539709091187, "learning_rate": 9.752892692516732e-05, "loss": 0.2082, "step": 6290 }, { "epoch": 0.5722330714383033, "grad_norm": 0.8746946454048157, "learning_rate": 9.751249251397173e-05, "loss": 0.321, "step": 6300 }, { "epoch": 0.5731413779009038, "grad_norm": 0.837874174118042, "learning_rate": 9.749600502740817e-05, "loss": 0.1795, "step": 6310 }, { "epoch": 0.5740496843635042, "grad_norm": 2.4251363277435303, "learning_rate": 9.74794644838945e-05, "loss": 0.2261, "step": 6320 }, { "epoch": 0.5749579908261048, "grad_norm": 3.0766308307647705, "learning_rate": 9.746287090190789e-05, "loss": 0.2807, "step": 6330 }, { "epoch": 0.5758662972887052, "grad_norm": 2.7274250984191895, "learning_rate": 9.744622429998472e-05, "loss": 0.1933, "step": 6340 }, { "epoch": 0.5767746037513057, "grad_norm": 2.2885406017303467, "learning_rate": 9.742952469672062e-05, "loss": 0.2472, "step": 6350 }, { "epoch": 0.5776829102139062, "grad_norm": 2.623743772506714, "learning_rate": 9.741277211077043e-05, "loss": 0.2099, "step": 6360 }, { "epoch": 0.5785912166765067, "grad_norm": 0.9801775217056274, "learning_rate": 9.739596656084818e-05, "loss": 0.2183, "step": 6370 }, { "epoch": 0.5794995231391071, "grad_norm": 6.600966453552246, "learning_rate": 9.737910806572703e-05, "loss": 0.2931, "step": 6380 }, { "epoch": 0.5804078296017077, "grad_norm": 2.9125006198883057, "learning_rate": 9.736219664423931e-05, "loss": 0.2255, "step": 6390 }, { "epoch": 0.5813161360643081, "grad_norm": 2.9835238456726074, "learning_rate": 9.734523231527648e-05, "loss": 0.2697, "step": 6400 }, { "epoch": 0.5822244425269085, "grad_norm": 1.1131948232650757, "learning_rate": 9.732821509778908e-05, "loss": 0.2298, "step": 6410 }, { "epoch": 0.5831327489895091, "grad_norm": 1.360267996788025, "learning_rate": 9.731114501078678e-05, "loss": 0.2656, "step": 6420 }, { "epoch": 0.5840410554521095, "grad_norm": 1.0155123472213745, "learning_rate": 9.729402207333823e-05, "loss": 0.3565, "step": 6430 }, { "epoch": 0.58494936191471, "grad_norm": 2.719919204711914, "learning_rate": 9.727684630457119e-05, "loss": 0.1983, "step": 6440 }, { "epoch": 0.5858576683773105, "grad_norm": 1.197110891342163, "learning_rate": 9.72596177236724e-05, "loss": 0.2323, "step": 6450 }, { "epoch": 0.586765974839911, "grad_norm": 1.2769802808761597, "learning_rate": 9.724233634988759e-05, "loss": 0.1439, "step": 6460 }, { "epoch": 0.5876742813025114, "grad_norm": 1.3792847394943237, "learning_rate": 9.722500220252148e-05, "loss": 0.1868, "step": 6470 }, { "epoch": 0.588582587765112, "grad_norm": 1.365533709526062, "learning_rate": 9.720761530093777e-05, "loss": 0.2186, "step": 6480 }, { "epoch": 0.5894908942277124, "grad_norm": 2.5519909858703613, "learning_rate": 9.719017566455904e-05, "loss": 0.2048, "step": 6490 }, { "epoch": 0.5903992006903129, "grad_norm": 1.212759017944336, "learning_rate": 9.717268331286679e-05, "loss": 0.2206, "step": 6500 }, { "epoch": 0.5903992006903129, "eval_loss": 0.24948085844516754, "eval_runtime": 1103.5254, "eval_samples_per_second": 8.869, "eval_steps_per_second": 8.869, "step": 6500 }, { "epoch": 0.5913075071529134, "grad_norm": 1.5450005531311035, "learning_rate": 9.715513826540145e-05, "loss": 0.186, "step": 6510 }, { "epoch": 0.5922158136155139, "grad_norm": 4.07082986831665, "learning_rate": 9.713754054176224e-05, "loss": 0.2576, "step": 6520 }, { "epoch": 0.5931241200781143, "grad_norm": 0.9052543044090271, "learning_rate": 9.711989016160731e-05, "loss": 0.3067, "step": 6530 }, { "epoch": 0.5940324265407149, "grad_norm": 2.1204185485839844, "learning_rate": 9.710218714465355e-05, "loss": 0.2156, "step": 6540 }, { "epoch": 0.5949407330033153, "grad_norm": 0.48529449105262756, "learning_rate": 9.708443151067673e-05, "loss": 0.2866, "step": 6550 }, { "epoch": 0.5958490394659158, "grad_norm": 3.084003448486328, "learning_rate": 9.706662327951132e-05, "loss": 0.3112, "step": 6560 }, { "epoch": 0.5967573459285163, "grad_norm": 0.9320728778839111, "learning_rate": 9.70487624710506e-05, "loss": 0.1771, "step": 6570 }, { "epoch": 0.5976656523911168, "grad_norm": 2.5907680988311768, "learning_rate": 9.703084910524655e-05, "loss": 0.2637, "step": 6580 }, { "epoch": 0.5985739588537172, "grad_norm": 0.9873548746109009, "learning_rate": 9.701288320210989e-05, "loss": 0.303, "step": 6590 }, { "epoch": 0.5994822653163178, "grad_norm": 2.4340357780456543, "learning_rate": 9.699486478170999e-05, "loss": 0.3162, "step": 6600 }, { "epoch": 0.6003905717789182, "grad_norm": 0.7254656553268433, "learning_rate": 9.697679386417495e-05, "loss": 0.2066, "step": 6610 }, { "epoch": 0.6012988782415187, "grad_norm": 1.029218316078186, "learning_rate": 9.695867046969143e-05, "loss": 0.2487, "step": 6620 }, { "epoch": 0.6022071847041192, "grad_norm": 0.993603527545929, "learning_rate": 9.694049461850478e-05, "loss": 0.1654, "step": 6630 }, { "epoch": 0.6031154911667197, "grad_norm": 2.798999071121216, "learning_rate": 9.692226633091894e-05, "loss": 0.1904, "step": 6640 }, { "epoch": 0.6040237976293201, "grad_norm": 0.6649394631385803, "learning_rate": 9.690398562729636e-05, "loss": 0.1618, "step": 6650 }, { "epoch": 0.6049321040919207, "grad_norm": 3.0210683345794678, "learning_rate": 9.688565252805814e-05, "loss": 0.2695, "step": 6660 }, { "epoch": 0.6058404105545211, "grad_norm": 0.8739848732948303, "learning_rate": 9.686726705368383e-05, "loss": 0.3201, "step": 6670 }, { "epoch": 0.6067487170171216, "grad_norm": 0.7374640703201294, "learning_rate": 9.684882922471152e-05, "loss": 0.2405, "step": 6680 }, { "epoch": 0.6076570234797221, "grad_norm": 1.9094116687774658, "learning_rate": 9.683033906173781e-05, "loss": 0.2535, "step": 6690 }, { "epoch": 0.6085653299423225, "grad_norm": 3.5535993576049805, "learning_rate": 9.681179658541771e-05, "loss": 0.215, "step": 6700 }, { "epoch": 0.609473636404923, "grad_norm": 2.945466995239258, "learning_rate": 9.679320181646472e-05, "loss": 0.2593, "step": 6710 }, { "epoch": 0.6103819428675235, "grad_norm": 1.4983636140823364, "learning_rate": 9.677455477565069e-05, "loss": 0.3455, "step": 6720 }, { "epoch": 0.611290249330124, "grad_norm": 1.007995843887329, "learning_rate": 9.675585548380592e-05, "loss": 0.238, "step": 6730 }, { "epoch": 0.6121985557927244, "grad_norm": 0.4167982339859009, "learning_rate": 9.673710396181907e-05, "loss": 0.2556, "step": 6740 }, { "epoch": 0.613106862255325, "grad_norm": 0.880611777305603, "learning_rate": 9.671830023063714e-05, "loss": 0.1877, "step": 6750 }, { "epoch": 0.6140151687179254, "grad_norm": 1.4742501974105835, "learning_rate": 9.669944431126541e-05, "loss": 0.3174, "step": 6760 }, { "epoch": 0.6149234751805259, "grad_norm": 1.873968482017517, "learning_rate": 9.668053622476754e-05, "loss": 0.1189, "step": 6770 }, { "epoch": 0.6158317816431264, "grad_norm": 1.190991997718811, "learning_rate": 9.666157599226537e-05, "loss": 0.2078, "step": 6780 }, { "epoch": 0.6167400881057269, "grad_norm": 1.1962559223175049, "learning_rate": 9.664256363493908e-05, "loss": 0.2511, "step": 6790 }, { "epoch": 0.6176483945683273, "grad_norm": 0.8577046990394592, "learning_rate": 9.662349917402704e-05, "loss": 0.2116, "step": 6800 }, { "epoch": 0.6185567010309279, "grad_norm": 2.0124294757843018, "learning_rate": 9.660438263082581e-05, "loss": 0.2539, "step": 6810 }, { "epoch": 0.6194650074935283, "grad_norm": 1.6729375123977661, "learning_rate": 9.658521402669014e-05, "loss": 0.2806, "step": 6820 }, { "epoch": 0.6203733139561288, "grad_norm": 1.027794599533081, "learning_rate": 9.656599338303294e-05, "loss": 0.2489, "step": 6830 }, { "epoch": 0.6212816204187293, "grad_norm": 0.4414721429347992, "learning_rate": 9.654672072132527e-05, "loss": 0.2257, "step": 6840 }, { "epoch": 0.6221899268813298, "grad_norm": 1.4292662143707275, "learning_rate": 9.652739606309627e-05, "loss": 0.258, "step": 6850 }, { "epoch": 0.6230982333439302, "grad_norm": 3.6433029174804688, "learning_rate": 9.650801942993316e-05, "loss": 0.2465, "step": 6860 }, { "epoch": 0.6240065398065308, "grad_norm": 1.6662176847457886, "learning_rate": 9.648859084348125e-05, "loss": 0.176, "step": 6870 }, { "epoch": 0.6249148462691312, "grad_norm": 1.251931071281433, "learning_rate": 9.646911032544388e-05, "loss": 0.2184, "step": 6880 }, { "epoch": 0.6258231527317317, "grad_norm": 2.469770908355713, "learning_rate": 9.644957789758238e-05, "loss": 0.2532, "step": 6890 }, { "epoch": 0.6267314591943322, "grad_norm": 2.033341646194458, "learning_rate": 9.642999358171608e-05, "loss": 0.2029, "step": 6900 }, { "epoch": 0.6276397656569327, "grad_norm": 0.603538990020752, "learning_rate": 9.641035739972228e-05, "loss": 0.2182, "step": 6910 }, { "epoch": 0.6285480721195331, "grad_norm": 1.6671717166900635, "learning_rate": 9.639066937353622e-05, "loss": 0.2099, "step": 6920 }, { "epoch": 0.6294563785821337, "grad_norm": 0.7601290345191956, "learning_rate": 9.637092952515104e-05, "loss": 0.1639, "step": 6930 }, { "epoch": 0.6303646850447341, "grad_norm": 0.6287315487861633, "learning_rate": 9.635113787661776e-05, "loss": 0.2795, "step": 6940 }, { "epoch": 0.6312729915073346, "grad_norm": 1.7777360677719116, "learning_rate": 9.63312944500453e-05, "loss": 0.2564, "step": 6950 }, { "epoch": 0.632181297969935, "grad_norm": 0.8745569586753845, "learning_rate": 9.63113992676004e-05, "loss": 0.2093, "step": 6960 }, { "epoch": 0.6330896044325356, "grad_norm": 1.011582851409912, "learning_rate": 9.629145235150763e-05, "loss": 0.2208, "step": 6970 }, { "epoch": 0.633997910895136, "grad_norm": 4.2249860763549805, "learning_rate": 9.627145372404931e-05, "loss": 0.2254, "step": 6980 }, { "epoch": 0.6349062173577364, "grad_norm": 1.3456746339797974, "learning_rate": 9.625140340756558e-05, "loss": 0.2417, "step": 6990 }, { "epoch": 0.635814523820337, "grad_norm": 0.7527981400489807, "learning_rate": 9.623130142445429e-05, "loss": 0.2403, "step": 7000 }, { "epoch": 0.635814523820337, "eval_loss": 0.24407260119915009, "eval_runtime": 1103.4318, "eval_samples_per_second": 8.87, "eval_steps_per_second": 8.87, "step": 7000 }, { "epoch": 0.6367228302829374, "grad_norm": 1.4642101526260376, "learning_rate": 9.621114779717099e-05, "loss": 0.2203, "step": 7010 }, { "epoch": 0.637631136745538, "grad_norm": 1.5340478420257568, "learning_rate": 9.619094254822896e-05, "loss": 0.2979, "step": 7020 }, { "epoch": 0.6385394432081384, "grad_norm": 1.92472243309021, "learning_rate": 9.617068570019913e-05, "loss": 0.1837, "step": 7030 }, { "epoch": 0.6394477496707389, "grad_norm": 2.593146562576294, "learning_rate": 9.615037727571006e-05, "loss": 0.2556, "step": 7040 }, { "epoch": 0.6403560561333393, "grad_norm": 1.4368683099746704, "learning_rate": 9.613001729744794e-05, "loss": 0.1771, "step": 7050 }, { "epoch": 0.6412643625959399, "grad_norm": 1.3653099536895752, "learning_rate": 9.610960578815653e-05, "loss": 0.2204, "step": 7060 }, { "epoch": 0.6421726690585403, "grad_norm": 2.0947265625, "learning_rate": 9.608914277063719e-05, "loss": 0.2314, "step": 7070 }, { "epoch": 0.6430809755211409, "grad_norm": 0.8618085384368896, "learning_rate": 9.606862826774877e-05, "loss": 0.1944, "step": 7080 }, { "epoch": 0.6439892819837413, "grad_norm": 0.3051355183124542, "learning_rate": 9.604806230240768e-05, "loss": 0.1482, "step": 7090 }, { "epoch": 0.6448975884463418, "grad_norm": 2.4838311672210693, "learning_rate": 9.602744489758779e-05, "loss": 0.2123, "step": 7100 }, { "epoch": 0.6458058949089422, "grad_norm": 0.260320782661438, "learning_rate": 9.600677607632042e-05, "loss": 0.1186, "step": 7110 }, { "epoch": 0.6467142013715428, "grad_norm": 2.1934142112731934, "learning_rate": 9.598605586169438e-05, "loss": 0.2318, "step": 7120 }, { "epoch": 0.6476225078341432, "grad_norm": 0.09602408111095428, "learning_rate": 9.596528427685585e-05, "loss": 0.1327, "step": 7130 }, { "epoch": 0.6485308142967438, "grad_norm": 1.6230735778808594, "learning_rate": 9.594446134500838e-05, "loss": 0.1874, "step": 7140 }, { "epoch": 0.6494391207593442, "grad_norm": 1.9948903322219849, "learning_rate": 9.592358708941292e-05, "loss": 0.218, "step": 7150 }, { "epoch": 0.6503474272219447, "grad_norm": 1.3965662717819214, "learning_rate": 9.590266153338771e-05, "loss": 0.2193, "step": 7160 }, { "epoch": 0.6512557336845451, "grad_norm": 1.3388559818267822, "learning_rate": 9.588168470030836e-05, "loss": 0.2216, "step": 7170 }, { "epoch": 0.6521640401471457, "grad_norm": 0.9695103764533997, "learning_rate": 9.586065661360768e-05, "loss": 0.2258, "step": 7180 }, { "epoch": 0.6530723466097461, "grad_norm": 1.8458356857299805, "learning_rate": 9.58395772967758e-05, "loss": 0.1717, "step": 7190 }, { "epoch": 0.6539806530723467, "grad_norm": 1.3312608003616333, "learning_rate": 9.581844677336004e-05, "loss": 0.1215, "step": 7200 }, { "epoch": 0.6548889595349471, "grad_norm": 1.283759355545044, "learning_rate": 9.579726506696494e-05, "loss": 0.2369, "step": 7210 }, { "epoch": 0.6557972659975476, "grad_norm": 2.1736867427825928, "learning_rate": 9.57760322012522e-05, "loss": 0.2131, "step": 7220 }, { "epoch": 0.656705572460148, "grad_norm": 1.9659581184387207, "learning_rate": 9.57547481999407e-05, "loss": 0.1816, "step": 7230 }, { "epoch": 0.6576138789227486, "grad_norm": 2.669267177581787, "learning_rate": 9.573341308680642e-05, "loss": 0.2185, "step": 7240 }, { "epoch": 0.658522185385349, "grad_norm": 2.4059908390045166, "learning_rate": 9.571202688568242e-05, "loss": 0.252, "step": 7250 }, { "epoch": 0.6594304918479496, "grad_norm": 4.869415760040283, "learning_rate": 9.569058962045888e-05, "loss": 0.2746, "step": 7260 }, { "epoch": 0.66033879831055, "grad_norm": 6.012903213500977, "learning_rate": 9.566910131508296e-05, "loss": 0.2514, "step": 7270 }, { "epoch": 0.6612471047731504, "grad_norm": 2.16634202003479, "learning_rate": 9.564756199355888e-05, "loss": 0.1952, "step": 7280 }, { "epoch": 0.662155411235751, "grad_norm": 0.16177774965763092, "learning_rate": 9.562597167994783e-05, "loss": 0.2132, "step": 7290 }, { "epoch": 0.6630637176983514, "grad_norm": 1.3175703287124634, "learning_rate": 9.560433039836798e-05, "loss": 0.1817, "step": 7300 }, { "epoch": 0.6639720241609519, "grad_norm": 1.8357609510421753, "learning_rate": 9.558263817299442e-05, "loss": 0.2641, "step": 7310 }, { "epoch": 0.6648803306235523, "grad_norm": 1.472636342048645, "learning_rate": 9.556089502805914e-05, "loss": 0.3063, "step": 7320 }, { "epoch": 0.6657886370861529, "grad_norm": 1.4188315868377686, "learning_rate": 9.553910098785104e-05, "loss": 0.1704, "step": 7330 }, { "epoch": 0.6666969435487533, "grad_norm": 1.5052870512008667, "learning_rate": 9.551725607671586e-05, "loss": 0.3198, "step": 7340 }, { "epoch": 0.6676052500113538, "grad_norm": 1.7425658702850342, "learning_rate": 9.549536031905616e-05, "loss": 0.2311, "step": 7350 }, { "epoch": 0.6685135564739543, "grad_norm": 1.1218266487121582, "learning_rate": 9.54734137393313e-05, "loss": 0.2061, "step": 7360 }, { "epoch": 0.6694218629365548, "grad_norm": 0.3878643214702606, "learning_rate": 9.545141636205741e-05, "loss": 0.2449, "step": 7370 }, { "epoch": 0.6703301693991552, "grad_norm": 0.5777683258056641, "learning_rate": 9.54293682118074e-05, "loss": 0.1409, "step": 7380 }, { "epoch": 0.6712384758617558, "grad_norm": 2.2519898414611816, "learning_rate": 9.540726931321084e-05, "loss": 0.2348, "step": 7390 }, { "epoch": 0.6721467823243562, "grad_norm": 0.20855630934238434, "learning_rate": 9.538511969095406e-05, "loss": 0.2309, "step": 7400 }, { "epoch": 0.6730550887869567, "grad_norm": 1.3928900957107544, "learning_rate": 9.536291936977994e-05, "loss": 0.1898, "step": 7410 }, { "epoch": 0.6739633952495572, "grad_norm": 3.349761486053467, "learning_rate": 9.534066837448815e-05, "loss": 0.1839, "step": 7420 }, { "epoch": 0.6748717017121577, "grad_norm": 2.336822986602783, "learning_rate": 9.531836672993484e-05, "loss": 0.2895, "step": 7430 }, { "epoch": 0.6757800081747581, "grad_norm": 2.3935656547546387, "learning_rate": 9.529601446103278e-05, "loss": 0.2083, "step": 7440 }, { "epoch": 0.6766883146373587, "grad_norm": 2.8584275245666504, "learning_rate": 9.527361159275132e-05, "loss": 0.3306, "step": 7450 }, { "epoch": 0.6775966210999591, "grad_norm": 1.4423718452453613, "learning_rate": 9.525115815011631e-05, "loss": 0.1515, "step": 7460 }, { "epoch": 0.6785049275625596, "grad_norm": 1.7885206937789917, "learning_rate": 9.522865415821007e-05, "loss": 0.1882, "step": 7470 }, { "epoch": 0.6794132340251601, "grad_norm": 4.284626483917236, "learning_rate": 9.520609964217142e-05, "loss": 0.2271, "step": 7480 }, { "epoch": 0.6803215404877606, "grad_norm": 1.6797765493392944, "learning_rate": 9.518349462719562e-05, "loss": 0.183, "step": 7490 }, { "epoch": 0.681229846950361, "grad_norm": 3.036764144897461, "learning_rate": 9.516083913853432e-05, "loss": 0.2681, "step": 7500 }, { "epoch": 0.681229846950361, "eval_loss": 0.24869385361671448, "eval_runtime": 1103.8515, "eval_samples_per_second": 8.866, "eval_steps_per_second": 8.866, "step": 7500 }, { "epoch": 0.6821381534129616, "grad_norm": 1.4593957662582397, "learning_rate": 9.51381332014956e-05, "loss": 0.2646, "step": 7510 }, { "epoch": 0.683046459875562, "grad_norm": 1.5315800905227661, "learning_rate": 9.511537684144383e-05, "loss": 0.2353, "step": 7520 }, { "epoch": 0.6839547663381625, "grad_norm": 0.5604353547096252, "learning_rate": 9.509257008379975e-05, "loss": 0.2759, "step": 7530 }, { "epoch": 0.684863072800763, "grad_norm": 1.865754246711731, "learning_rate": 9.506971295404039e-05, "loss": 0.2194, "step": 7540 }, { "epoch": 0.6857713792633635, "grad_norm": 1.0088040828704834, "learning_rate": 9.504680547769904e-05, "loss": 0.1789, "step": 7550 }, { "epoch": 0.6866796857259639, "grad_norm": 2.306342124938965, "learning_rate": 9.502384768036525e-05, "loss": 0.2481, "step": 7560 }, { "epoch": 0.6875879921885644, "grad_norm": 1.8559297323226929, "learning_rate": 9.500083958768474e-05, "loss": 0.284, "step": 7570 }, { "epoch": 0.6884962986511649, "grad_norm": 2.734429121017456, "learning_rate": 9.497778122535948e-05, "loss": 0.1936, "step": 7580 }, { "epoch": 0.6894046051137653, "grad_norm": 1.900835394859314, "learning_rate": 9.495467261914754e-05, "loss": 0.2385, "step": 7590 }, { "epoch": 0.6903129115763659, "grad_norm": 1.3376544713974, "learning_rate": 9.493151379486316e-05, "loss": 0.2921, "step": 7600 }, { "epoch": 0.6912212180389663, "grad_norm": 1.479772925376892, "learning_rate": 9.490830477837662e-05, "loss": 0.1887, "step": 7610 }, { "epoch": 0.6921295245015668, "grad_norm": 4.5334272384643555, "learning_rate": 9.488504559561433e-05, "loss": 0.3755, "step": 7620 }, { "epoch": 0.6930378309641673, "grad_norm": 0.20442070066928864, "learning_rate": 9.48617362725587e-05, "loss": 0.2389, "step": 7630 }, { "epoch": 0.6939461374267678, "grad_norm": 0.1492093950510025, "learning_rate": 9.483837683524816e-05, "loss": 0.189, "step": 7640 }, { "epoch": 0.6948544438893682, "grad_norm": 2.4135525226593018, "learning_rate": 9.481496730977716e-05, "loss": 0.1782, "step": 7650 }, { "epoch": 0.6957627503519688, "grad_norm": 0.7992739677429199, "learning_rate": 9.479150772229603e-05, "loss": 0.2782, "step": 7660 }, { "epoch": 0.6966710568145692, "grad_norm": 3.2786824703216553, "learning_rate": 9.476799809901107e-05, "loss": 0.2814, "step": 7670 }, { "epoch": 0.6975793632771697, "grad_norm": 1.2455403804779053, "learning_rate": 9.474443846618446e-05, "loss": 0.2169, "step": 7680 }, { "epoch": 0.6984876697397702, "grad_norm": 4.209232330322266, "learning_rate": 9.472082885013427e-05, "loss": 0.2646, "step": 7690 }, { "epoch": 0.6993959762023707, "grad_norm": 0.9699862599372864, "learning_rate": 9.469716927723438e-05, "loss": 0.2452, "step": 7700 }, { "epoch": 0.7003042826649711, "grad_norm": 3.7773053646087646, "learning_rate": 9.467345977391449e-05, "loss": 0.2834, "step": 7710 }, { "epoch": 0.7012125891275717, "grad_norm": 1.6733121871948242, "learning_rate": 9.464970036666003e-05, "loss": 0.2092, "step": 7720 }, { "epoch": 0.7021208955901721, "grad_norm": 1.6446547508239746, "learning_rate": 9.462589108201225e-05, "loss": 0.3291, "step": 7730 }, { "epoch": 0.7030292020527726, "grad_norm": 1.978885293006897, "learning_rate": 9.460203194656809e-05, "loss": 0.2599, "step": 7740 }, { "epoch": 0.7039375085153731, "grad_norm": 1.4183902740478516, "learning_rate": 9.457812298698014e-05, "loss": 0.2014, "step": 7750 }, { "epoch": 0.7048458149779736, "grad_norm": 1.088576316833496, "learning_rate": 9.455416422995669e-05, "loss": 0.2298, "step": 7760 }, { "epoch": 0.705754121440574, "grad_norm": 1.2726986408233643, "learning_rate": 9.453015570226165e-05, "loss": 0.2202, "step": 7770 }, { "epoch": 0.7066624279031746, "grad_norm": 2.182227611541748, "learning_rate": 9.450609743071452e-05, "loss": 0.262, "step": 7780 }, { "epoch": 0.707570734365775, "grad_norm": 0.8476855754852295, "learning_rate": 9.448198944219035e-05, "loss": 0.2037, "step": 7790 }, { "epoch": 0.7084790408283755, "grad_norm": 0.8422141075134277, "learning_rate": 9.445783176361979e-05, "loss": 0.2587, "step": 7800 }, { "epoch": 0.709387347290976, "grad_norm": 1.266040325164795, "learning_rate": 9.443362442198894e-05, "loss": 0.2202, "step": 7810 }, { "epoch": 0.7102956537535765, "grad_norm": 3.881195068359375, "learning_rate": 9.44093674443394e-05, "loss": 0.2143, "step": 7820 }, { "epoch": 0.7112039602161769, "grad_norm": 1.3000882863998413, "learning_rate": 9.438506085776816e-05, "loss": 0.2615, "step": 7830 }, { "epoch": 0.7121122666787775, "grad_norm": 2.4487392902374268, "learning_rate": 9.436070468942777e-05, "loss": 0.3171, "step": 7840 }, { "epoch": 0.7130205731413779, "grad_norm": 2.0511229038238525, "learning_rate": 9.4336298966526e-05, "loss": 0.2106, "step": 7850 }, { "epoch": 0.7139288796039783, "grad_norm": 2.3195440769195557, "learning_rate": 9.431184371632609e-05, "loss": 0.2588, "step": 7860 }, { "epoch": 0.7148371860665789, "grad_norm": 0.9786276817321777, "learning_rate": 9.428733896614656e-05, "loss": 0.2383, "step": 7870 }, { "epoch": 0.7157454925291793, "grad_norm": 0.5176436305046082, "learning_rate": 9.426278474336125e-05, "loss": 0.2048, "step": 7880 }, { "epoch": 0.7166537989917798, "grad_norm": 2.0800962448120117, "learning_rate": 9.423818107539921e-05, "loss": 0.1807, "step": 7890 }, { "epoch": 0.7175621054543803, "grad_norm": 2.6597819328308105, "learning_rate": 9.421352798974481e-05, "loss": 0.2706, "step": 7900 }, { "epoch": 0.7184704119169808, "grad_norm": 0.8298594355583191, "learning_rate": 9.418882551393755e-05, "loss": 0.1775, "step": 7910 }, { "epoch": 0.7193787183795812, "grad_norm": 2.659057855606079, "learning_rate": 9.416407367557214e-05, "loss": 0.167, "step": 7920 }, { "epoch": 0.7202870248421818, "grad_norm": 2.0961992740631104, "learning_rate": 9.413927250229841e-05, "loss": 0.229, "step": 7930 }, { "epoch": 0.7211953313047822, "grad_norm": 2.4123411178588867, "learning_rate": 9.411442202182133e-05, "loss": 0.2554, "step": 7940 }, { "epoch": 0.7221036377673827, "grad_norm": 2.938190221786499, "learning_rate": 9.408952226190091e-05, "loss": 0.185, "step": 7950 }, { "epoch": 0.7230119442299832, "grad_norm": 2.564750909805298, "learning_rate": 9.406457325035227e-05, "loss": 0.2422, "step": 7960 }, { "epoch": 0.7239202506925837, "grad_norm": 2.1012978553771973, "learning_rate": 9.403957501504548e-05, "loss": 0.3118, "step": 7970 }, { "epoch": 0.7248285571551841, "grad_norm": 0.5318172574043274, "learning_rate": 9.401452758390564e-05, "loss": 0.2829, "step": 7980 }, { "epoch": 0.7257368636177847, "grad_norm": 1.624374270439148, "learning_rate": 9.39894309849128e-05, "loss": 0.3355, "step": 7990 }, { "epoch": 0.7266451700803851, "grad_norm": 0.9793020486831665, "learning_rate": 9.39642852461019e-05, "loss": 0.2041, "step": 8000 }, { "epoch": 0.7266451700803851, "eval_loss": 0.23091359436511993, "eval_runtime": 1096.6103, "eval_samples_per_second": 8.925, "eval_steps_per_second": 8.925, "step": 8000 }, { "epoch": 0.7275534765429856, "grad_norm": 0.5689694881439209, "learning_rate": 9.393909039556284e-05, "loss": 0.1837, "step": 8010 }, { "epoch": 0.7284617830055861, "grad_norm": 1.9560484886169434, "learning_rate": 9.391384646144032e-05, "loss": 0.312, "step": 8020 }, { "epoch": 0.7293700894681866, "grad_norm": 3.5065605640411377, "learning_rate": 9.38885534719339e-05, "loss": 0.2439, "step": 8030 }, { "epoch": 0.730278395930787, "grad_norm": 0.42073142528533936, "learning_rate": 9.386321145529795e-05, "loss": 0.1941, "step": 8040 }, { "epoch": 0.7311867023933876, "grad_norm": 0.20554228127002716, "learning_rate": 9.383782043984156e-05, "loss": 0.1775, "step": 8050 }, { "epoch": 0.732095008855988, "grad_norm": 1.7402093410491943, "learning_rate": 9.381238045392858e-05, "loss": 0.1928, "step": 8060 }, { "epoch": 0.7330033153185885, "grad_norm": 1.731754183769226, "learning_rate": 9.378689152597761e-05, "loss": 0.2881, "step": 8070 }, { "epoch": 0.733911621781189, "grad_norm": 2.5363688468933105, "learning_rate": 9.376135368446184e-05, "loss": 0.2606, "step": 8080 }, { "epoch": 0.7348199282437895, "grad_norm": 0.22740615904331207, "learning_rate": 9.373576695790919e-05, "loss": 0.1175, "step": 8090 }, { "epoch": 0.7357282347063899, "grad_norm": 3.015138864517212, "learning_rate": 9.37101313749021e-05, "loss": 0.2333, "step": 8100 }, { "epoch": 0.7366365411689905, "grad_norm": 2.1438097953796387, "learning_rate": 9.368444696407764e-05, "loss": 0.1901, "step": 8110 }, { "epoch": 0.7375448476315909, "grad_norm": 1.901610016822815, "learning_rate": 9.365871375412742e-05, "loss": 0.215, "step": 8120 }, { "epoch": 0.7384531540941914, "grad_norm": 3.088996648788452, "learning_rate": 9.363293177379755e-05, "loss": 0.215, "step": 8130 }, { "epoch": 0.7393614605567919, "grad_norm": 0.8592635989189148, "learning_rate": 9.360710105188863e-05, "loss": 0.2579, "step": 8140 }, { "epoch": 0.7402697670193923, "grad_norm": 3.367539644241333, "learning_rate": 9.358122161725569e-05, "loss": 0.1227, "step": 8150 }, { "epoch": 0.7411780734819928, "grad_norm": 1.7342684268951416, "learning_rate": 9.35552934988082e-05, "loss": 0.3778, "step": 8160 }, { "epoch": 0.7420863799445933, "grad_norm": 1.781141996383667, "learning_rate": 9.352931672551001e-05, "loss": 0.247, "step": 8170 }, { "epoch": 0.7429946864071938, "grad_norm": 1.672813892364502, "learning_rate": 9.350329132637932e-05, "loss": 0.293, "step": 8180 }, { "epoch": 0.7439029928697942, "grad_norm": 2.259763479232788, "learning_rate": 9.347721733048863e-05, "loss": 0.2491, "step": 8190 }, { "epoch": 0.7448112993323948, "grad_norm": 1.0862449407577515, "learning_rate": 9.345109476696474e-05, "loss": 0.1667, "step": 8200 }, { "epoch": 0.7457196057949952, "grad_norm": 1.804565668106079, "learning_rate": 9.342492366498871e-05, "loss": 0.2021, "step": 8210 }, { "epoch": 0.7466279122575957, "grad_norm": 3.848090171813965, "learning_rate": 9.339870405379583e-05, "loss": 0.2029, "step": 8220 }, { "epoch": 0.7475362187201962, "grad_norm": 1.75103759765625, "learning_rate": 9.337243596267555e-05, "loss": 0.2151, "step": 8230 }, { "epoch": 0.7484445251827967, "grad_norm": 2.160886526107788, "learning_rate": 9.33461194209715e-05, "loss": 0.2547, "step": 8240 }, { "epoch": 0.7493528316453971, "grad_norm": 0.5682860612869263, "learning_rate": 9.33197544580814e-05, "loss": 0.1904, "step": 8250 }, { "epoch": 0.7502611381079977, "grad_norm": 2.168586492538452, "learning_rate": 9.329334110345711e-05, "loss": 0.3289, "step": 8260 }, { "epoch": 0.7511694445705981, "grad_norm": 1.3806037902832031, "learning_rate": 9.326687938660454e-05, "loss": 0.2395, "step": 8270 }, { "epoch": 0.7520777510331986, "grad_norm": 2.1950950622558594, "learning_rate": 9.324036933708356e-05, "loss": 0.2355, "step": 8280 }, { "epoch": 0.752986057495799, "grad_norm": 0.7293787598609924, "learning_rate": 9.321381098450812e-05, "loss": 0.1931, "step": 8290 }, { "epoch": 0.7538943639583996, "grad_norm": 1.1452808380126953, "learning_rate": 9.318720435854608e-05, "loss": 0.2347, "step": 8300 }, { "epoch": 0.754802670421, "grad_norm": 6.051717758178711, "learning_rate": 9.316054948891922e-05, "loss": 0.2068, "step": 8310 }, { "epoch": 0.7557109768836006, "grad_norm": 4.357767105102539, "learning_rate": 9.313384640540321e-05, "loss": 0.1929, "step": 8320 }, { "epoch": 0.756619283346201, "grad_norm": 2.124784231185913, "learning_rate": 9.310709513782764e-05, "loss": 0.3339, "step": 8330 }, { "epoch": 0.7575275898088015, "grad_norm": 0.2706786096096039, "learning_rate": 9.308029571607586e-05, "loss": 0.1664, "step": 8340 }, { "epoch": 0.758435896271402, "grad_norm": 3.0264675617218018, "learning_rate": 9.3053448170085e-05, "loss": 0.2866, "step": 8350 }, { "epoch": 0.7593442027340025, "grad_norm": 1.7086448669433594, "learning_rate": 9.302655252984603e-05, "loss": 0.2216, "step": 8360 }, { "epoch": 0.7602525091966029, "grad_norm": 4.075212478637695, "learning_rate": 9.299960882540355e-05, "loss": 0.2643, "step": 8370 }, { "epoch": 0.7611608156592035, "grad_norm": 2.3809549808502197, "learning_rate": 9.297261708685594e-05, "loss": 0.2688, "step": 8380 }, { "epoch": 0.7620691221218039, "grad_norm": 2.6645851135253906, "learning_rate": 9.294557734435516e-05, "loss": 0.2579, "step": 8390 }, { "epoch": 0.7629774285844044, "grad_norm": 0.9961481094360352, "learning_rate": 9.291848962810685e-05, "loss": 0.329, "step": 8400 }, { "epoch": 0.7638857350470049, "grad_norm": 2.3113629817962646, "learning_rate": 9.289135396837021e-05, "loss": 0.2874, "step": 8410 }, { "epoch": 0.7647940415096054, "grad_norm": 1.3125301599502563, "learning_rate": 9.286417039545802e-05, "loss": 0.1152, "step": 8420 }, { "epoch": 0.7657023479722058, "grad_norm": 2.619159460067749, "learning_rate": 9.283693893973658e-05, "loss": 0.219, "step": 8430 }, { "epoch": 0.7666106544348062, "grad_norm": 4.5893144607543945, "learning_rate": 9.280965963162562e-05, "loss": 0.2376, "step": 8440 }, { "epoch": 0.7675189608974068, "grad_norm": 0.9832370281219482, "learning_rate": 9.278233250159842e-05, "loss": 0.2602, "step": 8450 }, { "epoch": 0.7684272673600072, "grad_norm": 1.7947461605072021, "learning_rate": 9.275495758018161e-05, "loss": 0.2003, "step": 8460 }, { "epoch": 0.7693355738226078, "grad_norm": 0.46976587176322937, "learning_rate": 9.272753489795524e-05, "loss": 0.1675, "step": 8470 }, { "epoch": 0.7702438802852082, "grad_norm": 2.38234281539917, "learning_rate": 9.270006448555274e-05, "loss": 0.3082, "step": 8480 }, { "epoch": 0.7711521867478087, "grad_norm": 1.9783036708831787, "learning_rate": 9.267254637366078e-05, "loss": 0.2026, "step": 8490 }, { "epoch": 0.7720604932104091, "grad_norm": 1.4464579820632935, "learning_rate": 9.264498059301936e-05, "loss": 0.2982, "step": 8500 }, { "epoch": 0.7720604932104091, "eval_loss": 0.23714645206928253, "eval_runtime": 1103.8995, "eval_samples_per_second": 8.866, "eval_steps_per_second": 8.866, "step": 8500 }, { "epoch": 0.7729687996730097, "grad_norm": 1.7964836359024048, "learning_rate": 9.261736717442174e-05, "loss": 0.2256, "step": 8510 }, { "epoch": 0.7738771061356101, "grad_norm": 1.7671520709991455, "learning_rate": 9.258970614871439e-05, "loss": 0.2457, "step": 8520 }, { "epoch": 0.7747854125982107, "grad_norm": 2.169093132019043, "learning_rate": 9.256199754679691e-05, "loss": 0.2481, "step": 8530 }, { "epoch": 0.7756937190608111, "grad_norm": 3.6101789474487305, "learning_rate": 9.253424139962217e-05, "loss": 0.2718, "step": 8540 }, { "epoch": 0.7766020255234116, "grad_norm": 2.7137603759765625, "learning_rate": 9.2506437738196e-05, "loss": 0.2412, "step": 8550 }, { "epoch": 0.777510331986012, "grad_norm": 2.3219659328460693, "learning_rate": 9.247858659357743e-05, "loss": 0.271, "step": 8560 }, { "epoch": 0.7784186384486126, "grad_norm": 1.831635594367981, "learning_rate": 9.245068799687843e-05, "loss": 0.1369, "step": 8570 }, { "epoch": 0.779326944911213, "grad_norm": 1.4570703506469727, "learning_rate": 9.24227419792641e-05, "loss": 0.1386, "step": 8580 }, { "epoch": 0.7802352513738136, "grad_norm": 1.2180925607681274, "learning_rate": 9.23947485719524e-05, "loss": 0.1569, "step": 8590 }, { "epoch": 0.781143557836414, "grad_norm": 2.0030696392059326, "learning_rate": 9.236670780621426e-05, "loss": 0.186, "step": 8600 }, { "epoch": 0.7820518642990145, "grad_norm": 0.6773229837417603, "learning_rate": 9.233861971337354e-05, "loss": 0.2954, "step": 8610 }, { "epoch": 0.782960170761615, "grad_norm": 1.1342360973358154, "learning_rate": 9.231048432480697e-05, "loss": 0.2028, "step": 8620 }, { "epoch": 0.7838684772242155, "grad_norm": 3.299285650253296, "learning_rate": 9.22823016719441e-05, "loss": 0.2138, "step": 8630 }, { "epoch": 0.7847767836868159, "grad_norm": 2.7393975257873535, "learning_rate": 9.225407178626723e-05, "loss": 0.3108, "step": 8640 }, { "epoch": 0.7856850901494165, "grad_norm": 0.9661497473716736, "learning_rate": 9.222579469931149e-05, "loss": 0.2024, "step": 8650 }, { "epoch": 0.7865933966120169, "grad_norm": 1.0596673488616943, "learning_rate": 9.219747044266475e-05, "loss": 0.253, "step": 8660 }, { "epoch": 0.7875017030746174, "grad_norm": 1.3396559953689575, "learning_rate": 9.216909904796749e-05, "loss": 0.1868, "step": 8670 }, { "epoch": 0.7884100095372178, "grad_norm": 0.6418445706367493, "learning_rate": 9.214068054691291e-05, "loss": 0.2638, "step": 8680 }, { "epoch": 0.7893183159998184, "grad_norm": 1.649557113647461, "learning_rate": 9.211221497124685e-05, "loss": 0.1393, "step": 8690 }, { "epoch": 0.7902266224624188, "grad_norm": 3.325786590576172, "learning_rate": 9.208370235276764e-05, "loss": 0.3127, "step": 8700 }, { "epoch": 0.7911349289250194, "grad_norm": 1.4678473472595215, "learning_rate": 9.205514272332627e-05, "loss": 0.2431, "step": 8710 }, { "epoch": 0.7920432353876198, "grad_norm": 1.3743597269058228, "learning_rate": 9.202653611482619e-05, "loss": 0.2404, "step": 8720 }, { "epoch": 0.7929515418502202, "grad_norm": 0.5885980129241943, "learning_rate": 9.199788255922333e-05, "loss": 0.1981, "step": 8730 }, { "epoch": 0.7938598483128207, "grad_norm": 0.9716159105300903, "learning_rate": 9.196918208852607e-05, "loss": 0.191, "step": 8740 }, { "epoch": 0.7947681547754212, "grad_norm": 0.36479026079177856, "learning_rate": 9.19404347347952e-05, "loss": 0.1458, "step": 8750 }, { "epoch": 0.7956764612380217, "grad_norm": 1.5203617811203003, "learning_rate": 9.191164053014392e-05, "loss": 0.2669, "step": 8760 }, { "epoch": 0.7965847677006221, "grad_norm": 2.338388204574585, "learning_rate": 9.188279950673768e-05, "loss": 0.1986, "step": 8770 }, { "epoch": 0.7974930741632227, "grad_norm": 1.498023509979248, "learning_rate": 9.185391169679429e-05, "loss": 0.2198, "step": 8780 }, { "epoch": 0.7984013806258231, "grad_norm": 0.6039037704467773, "learning_rate": 9.182497713258382e-05, "loss": 0.2215, "step": 8790 }, { "epoch": 0.7993096870884236, "grad_norm": 0.7324672937393188, "learning_rate": 9.179599584642857e-05, "loss": 0.3225, "step": 8800 }, { "epoch": 0.8002179935510241, "grad_norm": 1.6899704933166504, "learning_rate": 9.1766967870703e-05, "loss": 0.304, "step": 8810 }, { "epoch": 0.8011263000136246, "grad_norm": 0.8298718929290771, "learning_rate": 9.173789323783378e-05, "loss": 0.2888, "step": 8820 }, { "epoch": 0.802034606476225, "grad_norm": 0.9884745478630066, "learning_rate": 9.170877198029963e-05, "loss": 0.2259, "step": 8830 }, { "epoch": 0.8029429129388256, "grad_norm": 1.5238854885101318, "learning_rate": 9.167960413063144e-05, "loss": 0.1313, "step": 8840 }, { "epoch": 0.803851219401426, "grad_norm": 1.4259624481201172, "learning_rate": 9.165038972141206e-05, "loss": 0.1597, "step": 8850 }, { "epoch": 0.8047595258640265, "grad_norm": 2.208859920501709, "learning_rate": 9.162112878527644e-05, "loss": 0.2363, "step": 8860 }, { "epoch": 0.805667832326627, "grad_norm": 1.2908027172088623, "learning_rate": 9.15918213549114e-05, "loss": 0.174, "step": 8870 }, { "epoch": 0.8065761387892275, "grad_norm": 1.6518332958221436, "learning_rate": 9.156246746305579e-05, "loss": 0.1929, "step": 8880 }, { "epoch": 0.8074844452518279, "grad_norm": 1.9318767786026, "learning_rate": 9.153306714250032e-05, "loss": 0.191, "step": 8890 }, { "epoch": 0.8083927517144285, "grad_norm": 1.6005733013153076, "learning_rate": 9.150362042608756e-05, "loss": 0.1863, "step": 8900 }, { "epoch": 0.8093010581770289, "grad_norm": 2.4235987663269043, "learning_rate": 9.147412734671192e-05, "loss": 0.156, "step": 8910 }, { "epoch": 0.8102093646396294, "grad_norm": 1.62490975856781, "learning_rate": 9.144458793731963e-05, "loss": 0.197, "step": 8920 }, { "epoch": 0.8111176711022299, "grad_norm": 1.0505470037460327, "learning_rate": 9.141500223090858e-05, "loss": 0.227, "step": 8930 }, { "epoch": 0.8120259775648304, "grad_norm": 1.1920310258865356, "learning_rate": 9.138537026052849e-05, "loss": 0.2448, "step": 8940 }, { "epoch": 0.8129342840274308, "grad_norm": 1.228576898574829, "learning_rate": 9.13556920592807e-05, "loss": 0.1774, "step": 8950 }, { "epoch": 0.8138425904900314, "grad_norm": 0.7804567217826843, "learning_rate": 9.132596766031822e-05, "loss": 0.2492, "step": 8960 }, { "epoch": 0.8147508969526318, "grad_norm": 0.32850754261016846, "learning_rate": 9.129619709684561e-05, "loss": 0.2094, "step": 8970 }, { "epoch": 0.8156592034152323, "grad_norm": 2.0755724906921387, "learning_rate": 9.126638040211908e-05, "loss": 0.1859, "step": 8980 }, { "epoch": 0.8165675098778328, "grad_norm": 1.8687611818313599, "learning_rate": 9.123651760944633e-05, "loss": 0.2451, "step": 8990 }, { "epoch": 0.8174758163404333, "grad_norm": 0.3395299017429352, "learning_rate": 9.120660875218655e-05, "loss": 0.2233, "step": 9000 }, { "epoch": 0.8174758163404333, "eval_loss": 0.23322731256484985, "eval_runtime": 1103.1231, "eval_samples_per_second": 8.872, "eval_steps_per_second": 8.872, "step": 9000 }, { "epoch": 0.8183841228030337, "grad_norm": 2.4242618083953857, "learning_rate": 9.11766538637504e-05, "loss": 0.1773, "step": 9010 }, { "epoch": 0.8192924292656342, "grad_norm": 3.269299030303955, "learning_rate": 9.114665297759994e-05, "loss": 0.1838, "step": 9020 }, { "epoch": 0.8202007357282347, "grad_norm": 0.6432119607925415, "learning_rate": 9.111660612724867e-05, "loss": 0.1303, "step": 9030 }, { "epoch": 0.8211090421908351, "grad_norm": 2.612182140350342, "learning_rate": 9.108651334626138e-05, "loss": 0.2298, "step": 9040 }, { "epoch": 0.8220173486534357, "grad_norm": 0.9941670298576355, "learning_rate": 9.105637466825416e-05, "loss": 0.1894, "step": 9050 }, { "epoch": 0.8229256551160361, "grad_norm": 2.10341477394104, "learning_rate": 9.102619012689443e-05, "loss": 0.3614, "step": 9060 }, { "epoch": 0.8238339615786366, "grad_norm": 2.0694632530212402, "learning_rate": 9.099595975590079e-05, "loss": 0.232, "step": 9070 }, { "epoch": 0.8247422680412371, "grad_norm": 1.4041831493377686, "learning_rate": 9.096568358904306e-05, "loss": 0.1869, "step": 9080 }, { "epoch": 0.8256505745038376, "grad_norm": 3.5093994140625, "learning_rate": 9.093536166014222e-05, "loss": 0.3723, "step": 9090 }, { "epoch": 0.826558880966438, "grad_norm": 1.6450073719024658, "learning_rate": 9.090499400307036e-05, "loss": 0.2101, "step": 9100 }, { "epoch": 0.8274671874290386, "grad_norm": 3.9906795024871826, "learning_rate": 9.087458065175062e-05, "loss": 0.2049, "step": 9110 }, { "epoch": 0.828375493891639, "grad_norm": 1.7053701877593994, "learning_rate": 9.084412164015727e-05, "loss": 0.3448, "step": 9120 }, { "epoch": 0.8292838003542395, "grad_norm": 2.02923321723938, "learning_rate": 9.081361700231552e-05, "loss": 0.1998, "step": 9130 }, { "epoch": 0.83019210681684, "grad_norm": 0.8226824998855591, "learning_rate": 9.078306677230156e-05, "loss": 0.2144, "step": 9140 }, { "epoch": 0.8311004132794405, "grad_norm": 1.8329319953918457, "learning_rate": 9.075247098424249e-05, "loss": 0.179, "step": 9150 }, { "epoch": 0.8320087197420409, "grad_norm": 3.1892433166503906, "learning_rate": 9.072182967231638e-05, "loss": 0.2376, "step": 9160 }, { "epoch": 0.8329170262046415, "grad_norm": 1.4164024591445923, "learning_rate": 9.069114287075202e-05, "loss": 0.1811, "step": 9170 }, { "epoch": 0.8338253326672419, "grad_norm": 2.529266357421875, "learning_rate": 9.066041061382917e-05, "loss": 0.2622, "step": 9180 }, { "epoch": 0.8347336391298424, "grad_norm": 0.7817333340644836, "learning_rate": 9.062963293587824e-05, "loss": 0.1946, "step": 9190 }, { "epoch": 0.8356419455924429, "grad_norm": 2.304752826690674, "learning_rate": 9.059880987128049e-05, "loss": 0.1825, "step": 9200 }, { "epoch": 0.8365502520550434, "grad_norm": 4.910146236419678, "learning_rate": 9.056794145446776e-05, "loss": 0.2986, "step": 9210 }, { "epoch": 0.8374585585176438, "grad_norm": 1.7318074703216553, "learning_rate": 9.053702771992265e-05, "loss": 0.1813, "step": 9220 }, { "epoch": 0.8383668649802444, "grad_norm": 0.5127506852149963, "learning_rate": 9.050606870217835e-05, "loss": 0.2149, "step": 9230 }, { "epoch": 0.8392751714428448, "grad_norm": 1.7823703289031982, "learning_rate": 9.047506443581861e-05, "loss": 0.2311, "step": 9240 }, { "epoch": 0.8401834779054453, "grad_norm": 4.454924583435059, "learning_rate": 9.044401495547776e-05, "loss": 0.2224, "step": 9250 }, { "epoch": 0.8410917843680458, "grad_norm": 0.8864699602127075, "learning_rate": 9.041292029584063e-05, "loss": 0.172, "step": 9260 }, { "epoch": 0.8420000908306463, "grad_norm": 1.8095026016235352, "learning_rate": 9.038178049164252e-05, "loss": 0.2405, "step": 9270 }, { "epoch": 0.8429083972932467, "grad_norm": 0.5081397891044617, "learning_rate": 9.035059557766916e-05, "loss": 0.1601, "step": 9280 }, { "epoch": 0.8438167037558473, "grad_norm": 0.7696103453636169, "learning_rate": 9.031936558875664e-05, "loss": 0.283, "step": 9290 }, { "epoch": 0.8447250102184477, "grad_norm": 3.508364200592041, "learning_rate": 9.028809055979146e-05, "loss": 0.3188, "step": 9300 }, { "epoch": 0.8456333166810481, "grad_norm": 0.264516144990921, "learning_rate": 9.02567705257104e-05, "loss": 0.1855, "step": 9310 }, { "epoch": 0.8465416231436487, "grad_norm": 4.412176609039307, "learning_rate": 9.022540552150049e-05, "loss": 0.2263, "step": 9320 }, { "epoch": 0.8474499296062491, "grad_norm": 0.30030137300491333, "learning_rate": 9.019399558219907e-05, "loss": 0.1991, "step": 9330 }, { "epoch": 0.8483582360688496, "grad_norm": 2.244642972946167, "learning_rate": 9.016254074289359e-05, "loss": 0.2401, "step": 9340 }, { "epoch": 0.8492665425314501, "grad_norm": 2.5362672805786133, "learning_rate": 9.013104103872172e-05, "loss": 0.181, "step": 9350 }, { "epoch": 0.8501748489940506, "grad_norm": 2.0638644695281982, "learning_rate": 9.009949650487121e-05, "loss": 0.23, "step": 9360 }, { "epoch": 0.851083155456651, "grad_norm": 1.0114895105361938, "learning_rate": 9.006790717657991e-05, "loss": 0.2702, "step": 9370 }, { "epoch": 0.8519914619192516, "grad_norm": 2.944054365158081, "learning_rate": 9.003627308913569e-05, "loss": 0.2365, "step": 9380 }, { "epoch": 0.852899768381852, "grad_norm": 1.1059380769729614, "learning_rate": 9.000459427787646e-05, "loss": 0.2262, "step": 9390 }, { "epoch": 0.8538080748444525, "grad_norm": 1.068943738937378, "learning_rate": 8.997287077819003e-05, "loss": 0.2558, "step": 9400 }, { "epoch": 0.854716381307053, "grad_norm": 2.678361415863037, "learning_rate": 8.994110262551418e-05, "loss": 0.231, "step": 9410 }, { "epoch": 0.8556246877696535, "grad_norm": 3.0199198722839355, "learning_rate": 8.990928985533656e-05, "loss": 0.1856, "step": 9420 }, { "epoch": 0.8565329942322539, "grad_norm": 2.0723586082458496, "learning_rate": 8.987743250319463e-05, "loss": 0.1845, "step": 9430 }, { "epoch": 0.8574413006948545, "grad_norm": 2.312871217727661, "learning_rate": 8.98455306046757e-05, "loss": 0.0799, "step": 9440 }, { "epoch": 0.8583496071574549, "grad_norm": 1.4236469268798828, "learning_rate": 8.981358419541679e-05, "loss": 0.2902, "step": 9450 }, { "epoch": 0.8592579136200554, "grad_norm": 1.2188771963119507, "learning_rate": 8.97815933111047e-05, "loss": 0.2786, "step": 9460 }, { "epoch": 0.8601662200826559, "grad_norm": 0.4329122304916382, "learning_rate": 8.974955798747588e-05, "loss": 0.2394, "step": 9470 }, { "epoch": 0.8610745265452564, "grad_norm": 2.3357737064361572, "learning_rate": 8.971747826031641e-05, "loss": 0.3094, "step": 9480 }, { "epoch": 0.8619828330078568, "grad_norm": 0.93446946144104, "learning_rate": 8.9685354165462e-05, "loss": 0.2748, "step": 9490 }, { "epoch": 0.8628911394704574, "grad_norm": 0.5357255339622498, "learning_rate": 8.965318573879791e-05, "loss": 0.2416, "step": 9500 }, { "epoch": 0.8628911394704574, "eval_loss": 0.2304728627204895, "eval_runtime": 1108.0926, "eval_samples_per_second": 8.832, "eval_steps_per_second": 8.832, "step": 9500 }, { "epoch": 0.8637994459330578, "grad_norm": 2.762953281402588, "learning_rate": 8.962097301625891e-05, "loss": 0.1678, "step": 9510 }, { "epoch": 0.8647077523956583, "grad_norm": 2.364431619644165, "learning_rate": 8.958871603382929e-05, "loss": 0.2584, "step": 9520 }, { "epoch": 0.8656160588582588, "grad_norm": 1.5640538930892944, "learning_rate": 8.955641482754271e-05, "loss": 0.3015, "step": 9530 }, { "epoch": 0.8665243653208593, "grad_norm": 2.3820362091064453, "learning_rate": 8.952406943348234e-05, "loss": 0.2143, "step": 9540 }, { "epoch": 0.8674326717834597, "grad_norm": 0.7123646140098572, "learning_rate": 8.94916798877806e-05, "loss": 0.2207, "step": 9550 }, { "epoch": 0.8683409782460603, "grad_norm": 1.7668347358703613, "learning_rate": 8.945924622661929e-05, "loss": 0.2146, "step": 9560 }, { "epoch": 0.8692492847086607, "grad_norm": 1.6068856716156006, "learning_rate": 8.942676848622951e-05, "loss": 0.3253, "step": 9570 }, { "epoch": 0.8701575911712612, "grad_norm": 0.7929647564888, "learning_rate": 8.939424670289155e-05, "loss": 0.203, "step": 9580 }, { "epoch": 0.8710658976338617, "grad_norm": 2.608799934387207, "learning_rate": 8.936168091293491e-05, "loss": 0.3439, "step": 9590 }, { "epoch": 0.8719742040964621, "grad_norm": 3.2293217182159424, "learning_rate": 8.932907115273828e-05, "loss": 0.2416, "step": 9600 }, { "epoch": 0.8728825105590626, "grad_norm": 1.3854663372039795, "learning_rate": 8.929641745872943e-05, "loss": 0.2584, "step": 9610 }, { "epoch": 0.8737908170216631, "grad_norm": 0.9238001108169556, "learning_rate": 8.926371986738524e-05, "loss": 0.2378, "step": 9620 }, { "epoch": 0.8746991234842636, "grad_norm": 2.8573319911956787, "learning_rate": 8.923097841523163e-05, "loss": 0.1925, "step": 9630 }, { "epoch": 0.875607429946864, "grad_norm": 2.1525943279266357, "learning_rate": 8.919819313884346e-05, "loss": 0.1947, "step": 9640 }, { "epoch": 0.8765157364094646, "grad_norm": 0.8737854361534119, "learning_rate": 8.91653640748446e-05, "loss": 0.2878, "step": 9650 }, { "epoch": 0.877424042872065, "grad_norm": 0.5325286984443665, "learning_rate": 8.913249125990783e-05, "loss": 0.2658, "step": 9660 }, { "epoch": 0.8783323493346655, "grad_norm": 1.2082217931747437, "learning_rate": 8.909957473075478e-05, "loss": 0.1057, "step": 9670 }, { "epoch": 0.879240655797266, "grad_norm": 1.6381468772888184, "learning_rate": 8.906661452415592e-05, "loss": 0.1555, "step": 9680 }, { "epoch": 0.8801489622598665, "grad_norm": 0.3891143202781677, "learning_rate": 8.903361067693055e-05, "loss": 0.2455, "step": 9690 }, { "epoch": 0.8810572687224669, "grad_norm": 1.2576104402542114, "learning_rate": 8.900056322594663e-05, "loss": 0.2681, "step": 9700 }, { "epoch": 0.8819655751850675, "grad_norm": 0.1960395723581314, "learning_rate": 8.896747220812094e-05, "loss": 0.1896, "step": 9710 }, { "epoch": 0.8828738816476679, "grad_norm": 1.6140750646591187, "learning_rate": 8.893433766041883e-05, "loss": 0.17, "step": 9720 }, { "epoch": 0.8837821881102684, "grad_norm": 1.298567295074463, "learning_rate": 8.890115961985434e-05, "loss": 0.21, "step": 9730 }, { "epoch": 0.8846904945728689, "grad_norm": 2.849236249923706, "learning_rate": 8.886793812349008e-05, "loss": 0.2758, "step": 9740 }, { "epoch": 0.8855988010354694, "grad_norm": 2.1344239711761475, "learning_rate": 8.88346732084372e-05, "loss": 0.3511, "step": 9750 }, { "epoch": 0.8865071074980698, "grad_norm": 2.702893018722534, "learning_rate": 8.880136491185531e-05, "loss": 0.2002, "step": 9760 }, { "epoch": 0.8874154139606704, "grad_norm": 3.0220091342926025, "learning_rate": 8.876801327095258e-05, "loss": 0.2417, "step": 9770 }, { "epoch": 0.8883237204232708, "grad_norm": 3.09015154838562, "learning_rate": 8.87346183229855e-05, "loss": 0.2248, "step": 9780 }, { "epoch": 0.8892320268858713, "grad_norm": 1.957781195640564, "learning_rate": 8.870118010525898e-05, "loss": 0.1967, "step": 9790 }, { "epoch": 0.8901403333484718, "grad_norm": 1.5081753730773926, "learning_rate": 8.866769865512629e-05, "loss": 0.1877, "step": 9800 }, { "epoch": 0.8910486398110723, "grad_norm": 0.4056055545806885, "learning_rate": 8.863417400998893e-05, "loss": 0.1461, "step": 9810 }, { "epoch": 0.8919569462736727, "grad_norm": 2.704529285430908, "learning_rate": 8.860060620729673e-05, "loss": 0.2536, "step": 9820 }, { "epoch": 0.8928652527362733, "grad_norm": 2.2927613258361816, "learning_rate": 8.856699528454765e-05, "loss": 0.1867, "step": 9830 }, { "epoch": 0.8937735591988737, "grad_norm": 2.285118579864502, "learning_rate": 8.853334127928791e-05, "loss": 0.2068, "step": 9840 }, { "epoch": 0.8946818656614742, "grad_norm": 2.2203211784362793, "learning_rate": 8.849964422911177e-05, "loss": 0.268, "step": 9850 }, { "epoch": 0.8955901721240747, "grad_norm": 0.43290475010871887, "learning_rate": 8.84659041716616e-05, "loss": 0.2006, "step": 9860 }, { "epoch": 0.8964984785866752, "grad_norm": 2.152538299560547, "learning_rate": 8.843212114462787e-05, "loss": 0.2188, "step": 9870 }, { "epoch": 0.8974067850492756, "grad_norm": 2.1843667030334473, "learning_rate": 8.839829518574895e-05, "loss": 0.2163, "step": 9880 }, { "epoch": 0.898315091511876, "grad_norm": 1.2972503900527954, "learning_rate": 8.836442633281125e-05, "loss": 0.2599, "step": 9890 }, { "epoch": 0.8992233979744766, "grad_norm": 1.3606443405151367, "learning_rate": 8.833051462364908e-05, "loss": 0.1894, "step": 9900 }, { "epoch": 0.900131704437077, "grad_norm": 3.9704506397247314, "learning_rate": 8.82965600961446e-05, "loss": 0.1722, "step": 9910 }, { "epoch": 0.9010400108996776, "grad_norm": 0.5372936725616455, "learning_rate": 8.82625627882278e-05, "loss": 0.2165, "step": 9920 }, { "epoch": 0.901948317362278, "grad_norm": 1.5746445655822754, "learning_rate": 8.82285227378765e-05, "loss": 0.1843, "step": 9930 }, { "epoch": 0.9028566238248785, "grad_norm": 0.6894408464431763, "learning_rate": 8.81944399831162e-05, "loss": 0.2031, "step": 9940 }, { "epoch": 0.903764930287479, "grad_norm": 1.9152029752731323, "learning_rate": 8.816031456202021e-05, "loss": 0.1883, "step": 9950 }, { "epoch": 0.9046732367500795, "grad_norm": 2.0476081371307373, "learning_rate": 8.812614651270938e-05, "loss": 0.2455, "step": 9960 }, { "epoch": 0.9055815432126799, "grad_norm": 1.3143072128295898, "learning_rate": 8.809193587335225e-05, "loss": 0.2647, "step": 9970 }, { "epoch": 0.9064898496752805, "grad_norm": 2.7326507568359375, "learning_rate": 8.805768268216495e-05, "loss": 0.2791, "step": 9980 }, { "epoch": 0.9073981561378809, "grad_norm": 2.92396879196167, "learning_rate": 8.802338697741106e-05, "loss": 0.179, "step": 9990 }, { "epoch": 0.9083064626004814, "grad_norm": 0.8260532021522522, "learning_rate": 8.798904879740175e-05, "loss": 0.1913, "step": 10000 }, { "epoch": 0.9083064626004814, "eval_loss": 0.22879034280776978, "eval_runtime": 1106.4417, "eval_samples_per_second": 8.845, "eval_steps_per_second": 8.845, "step": 10000 }, { "epoch": 0.9092147690630819, "grad_norm": 0.7373243570327759, "learning_rate": 8.79546681804956e-05, "loss": 0.2081, "step": 10010 }, { "epoch": 0.9101230755256824, "grad_norm": 1.8286044597625732, "learning_rate": 8.792024516509856e-05, "loss": 0.2785, "step": 10020 }, { "epoch": 0.9110313819882828, "grad_norm": 0.4846707582473755, "learning_rate": 8.7885779789664e-05, "loss": 0.21, "step": 10030 }, { "epoch": 0.9119396884508834, "grad_norm": 2.613950252532959, "learning_rate": 8.785127209269257e-05, "loss": 0.2729, "step": 10040 }, { "epoch": 0.9128479949134838, "grad_norm": 1.6645597219467163, "learning_rate": 8.781672211273221e-05, "loss": 0.229, "step": 10050 }, { "epoch": 0.9137563013760843, "grad_norm": 0.6869065165519714, "learning_rate": 8.778212988837811e-05, "loss": 0.2065, "step": 10060 }, { "epoch": 0.9146646078386848, "grad_norm": 0.5552541017532349, "learning_rate": 8.77474954582726e-05, "loss": 0.1961, "step": 10070 }, { "epoch": 0.9155729143012853, "grad_norm": 0.369120329618454, "learning_rate": 8.771281886110526e-05, "loss": 0.2218, "step": 10080 }, { "epoch": 0.9164812207638857, "grad_norm": 1.620583176612854, "learning_rate": 8.767810013561262e-05, "loss": 0.2594, "step": 10090 }, { "epoch": 0.9173895272264863, "grad_norm": 0.4655068516731262, "learning_rate": 8.76433393205784e-05, "loss": 0.297, "step": 10100 }, { "epoch": 0.9182978336890867, "grad_norm": 1.7534241676330566, "learning_rate": 8.760853645483329e-05, "loss": 0.2234, "step": 10110 }, { "epoch": 0.9192061401516872, "grad_norm": 2.790292739868164, "learning_rate": 8.757369157725496e-05, "loss": 0.2384, "step": 10120 }, { "epoch": 0.9201144466142877, "grad_norm": 1.629294991493225, "learning_rate": 8.753880472676801e-05, "loss": 0.2399, "step": 10130 }, { "epoch": 0.9210227530768882, "grad_norm": 1.3590749502182007, "learning_rate": 8.75038759423439e-05, "loss": 0.2555, "step": 10140 }, { "epoch": 0.9219310595394886, "grad_norm": 0.9291559457778931, "learning_rate": 8.7468905263001e-05, "loss": 0.2373, "step": 10150 }, { "epoch": 0.9228393660020892, "grad_norm": 0.12285423278808594, "learning_rate": 8.743389272780443e-05, "loss": 0.2795, "step": 10160 }, { "epoch": 0.9237476724646896, "grad_norm": 3.487515449523926, "learning_rate": 8.739883837586604e-05, "loss": 0.1479, "step": 10170 }, { "epoch": 0.9246559789272901, "grad_norm": 2.454878568649292, "learning_rate": 8.736374224634448e-05, "loss": 0.2549, "step": 10180 }, { "epoch": 0.9255642853898906, "grad_norm": 2.414968967437744, "learning_rate": 8.732860437844497e-05, "loss": 0.1976, "step": 10190 }, { "epoch": 0.926472591852491, "grad_norm": 1.1335774660110474, "learning_rate": 8.729342481141945e-05, "loss": 0.1996, "step": 10200 }, { "epoch": 0.9273808983150915, "grad_norm": 2.0294835567474365, "learning_rate": 8.725820358456636e-05, "loss": 0.2829, "step": 10210 }, { "epoch": 0.928289204777692, "grad_norm": 0.655392587184906, "learning_rate": 8.722294073723071e-05, "loss": 0.2687, "step": 10220 }, { "epoch": 0.9291975112402925, "grad_norm": 0.6854535341262817, "learning_rate": 8.718763630880402e-05, "loss": 0.1428, "step": 10230 }, { "epoch": 0.9301058177028929, "grad_norm": 1.63532292842865, "learning_rate": 8.715229033872425e-05, "loss": 0.3072, "step": 10240 }, { "epoch": 0.9310141241654935, "grad_norm": 1.051173210144043, "learning_rate": 8.711690286647575e-05, "loss": 0.2462, "step": 10250 }, { "epoch": 0.9319224306280939, "grad_norm": 1.2058234214782715, "learning_rate": 8.708147393158925e-05, "loss": 0.1638, "step": 10260 }, { "epoch": 0.9328307370906944, "grad_norm": 0.45949092507362366, "learning_rate": 8.704600357364177e-05, "loss": 0.1719, "step": 10270 }, { "epoch": 0.9337390435532948, "grad_norm": 1.1349047422409058, "learning_rate": 8.701049183225665e-05, "loss": 0.2977, "step": 10280 }, { "epoch": 0.9346473500158954, "grad_norm": 0.8001668453216553, "learning_rate": 8.697493874710338e-05, "loss": 0.184, "step": 10290 }, { "epoch": 0.9355556564784958, "grad_norm": 1.4661996364593506, "learning_rate": 8.693934435789776e-05, "loss": 0.2457, "step": 10300 }, { "epoch": 0.9364639629410964, "grad_norm": 0.5280899405479431, "learning_rate": 8.690370870440159e-05, "loss": 0.1832, "step": 10310 }, { "epoch": 0.9373722694036968, "grad_norm": 0.6575143933296204, "learning_rate": 8.686803182642287e-05, "loss": 0.1532, "step": 10320 }, { "epoch": 0.9382805758662973, "grad_norm": 2.7703280448913574, "learning_rate": 8.683231376381557e-05, "loss": 0.2185, "step": 10330 }, { "epoch": 0.9391888823288977, "grad_norm": 0.6181463003158569, "learning_rate": 8.679655455647976e-05, "loss": 0.1778, "step": 10340 }, { "epoch": 0.9400971887914983, "grad_norm": 1.2319104671478271, "learning_rate": 8.676075424436139e-05, "loss": 0.2125, "step": 10350 }, { "epoch": 0.9410054952540987, "grad_norm": 1.0043729543685913, "learning_rate": 8.672491286745234e-05, "loss": 0.1184, "step": 10360 }, { "epoch": 0.9419138017166993, "grad_norm": 1.564584732055664, "learning_rate": 8.668903046579042e-05, "loss": 0.147, "step": 10370 }, { "epoch": 0.9428221081792997, "grad_norm": 1.3574166297912598, "learning_rate": 8.665310707945922e-05, "loss": 0.2173, "step": 10380 }, { "epoch": 0.9437304146419002, "grad_norm": 1.7842562198638916, "learning_rate": 8.66171427485881e-05, "loss": 0.2397, "step": 10390 }, { "epoch": 0.9446387211045006, "grad_norm": 1.5212817192077637, "learning_rate": 8.658113751335219e-05, "loss": 0.2324, "step": 10400 }, { "epoch": 0.9455470275671012, "grad_norm": 1.5999540090560913, "learning_rate": 8.654509141397232e-05, "loss": 0.1793, "step": 10410 }, { "epoch": 0.9464553340297016, "grad_norm": 1.168617606163025, "learning_rate": 8.650900449071492e-05, "loss": 0.1873, "step": 10420 }, { "epoch": 0.9473636404923022, "grad_norm": 1.4893817901611328, "learning_rate": 8.64728767838921e-05, "loss": 0.2039, "step": 10430 }, { "epoch": 0.9482719469549026, "grad_norm": 2.0753867626190186, "learning_rate": 8.643670833386144e-05, "loss": 0.1817, "step": 10440 }, { "epoch": 0.9491802534175031, "grad_norm": 3.420966625213623, "learning_rate": 8.64004991810261e-05, "loss": 0.1672, "step": 10450 }, { "epoch": 0.9500885598801035, "grad_norm": 2.1348934173583984, "learning_rate": 8.636424936583469e-05, "loss": 0.2417, "step": 10460 }, { "epoch": 0.9509968663427041, "grad_norm": 1.601203203201294, "learning_rate": 8.632795892878124e-05, "loss": 0.1854, "step": 10470 }, { "epoch": 0.9519051728053045, "grad_norm": 1.5149697065353394, "learning_rate": 8.629162791040514e-05, "loss": 0.2451, "step": 10480 }, { "epoch": 0.9528134792679049, "grad_norm": 2.1721978187561035, "learning_rate": 8.625525635129115e-05, "loss": 0.2174, "step": 10490 }, { "epoch": 0.9537217857305055, "grad_norm": 1.6201722621917725, "learning_rate": 8.621884429206928e-05, "loss": 0.2006, "step": 10500 }, { "epoch": 0.9537217857305055, "eval_loss": 0.23156803846359253, "eval_runtime": 1104.0346, "eval_samples_per_second": 8.865, "eval_steps_per_second": 8.865, "step": 10500 }, { "epoch": 0.9546300921931059, "grad_norm": 2.5690205097198486, "learning_rate": 8.618239177341481e-05, "loss": 0.2944, "step": 10510 }, { "epoch": 0.9555383986557064, "grad_norm": 2.0148327350616455, "learning_rate": 8.61458988360482e-05, "loss": 0.2194, "step": 10520 }, { "epoch": 0.9564467051183069, "grad_norm": 3.265207052230835, "learning_rate": 8.610936552073507e-05, "loss": 0.1721, "step": 10530 }, { "epoch": 0.9573550115809074, "grad_norm": 0.6174920201301575, "learning_rate": 8.607279186828617e-05, "loss": 0.1322, "step": 10540 }, { "epoch": 0.9582633180435078, "grad_norm": 0.3554171323776245, "learning_rate": 8.603617791955722e-05, "loss": 0.2188, "step": 10550 }, { "epoch": 0.9591716245061084, "grad_norm": 0.6742961406707764, "learning_rate": 8.599952371544909e-05, "loss": 0.2017, "step": 10560 }, { "epoch": 0.9600799309687088, "grad_norm": 1.0283615589141846, "learning_rate": 8.596282929690749e-05, "loss": 0.1277, "step": 10570 }, { "epoch": 0.9609882374313093, "grad_norm": 2.243908166885376, "learning_rate": 8.592609470492312e-05, "loss": 0.2756, "step": 10580 }, { "epoch": 0.9618965438939098, "grad_norm": 0.7960989475250244, "learning_rate": 8.588931998053156e-05, "loss": 0.149, "step": 10590 }, { "epoch": 0.9628048503565103, "grad_norm": 0.8100318312644958, "learning_rate": 8.58525051648132e-05, "loss": 0.2403, "step": 10600 }, { "epoch": 0.9637131568191107, "grad_norm": 0.3529641330242157, "learning_rate": 8.581565029889323e-05, "loss": 0.2293, "step": 10610 }, { "epoch": 0.9646214632817113, "grad_norm": 2.6964962482452393, "learning_rate": 8.577875542394156e-05, "loss": 0.2573, "step": 10620 }, { "epoch": 0.9655297697443117, "grad_norm": 0.5576412677764893, "learning_rate": 8.57418205811728e-05, "loss": 0.2526, "step": 10630 }, { "epoch": 0.9664380762069122, "grad_norm": 2.1595520973205566, "learning_rate": 8.570484581184621e-05, "loss": 0.1907, "step": 10640 }, { "epoch": 0.9673463826695127, "grad_norm": 1.0680956840515137, "learning_rate": 8.566783115726567e-05, "loss": 0.1975, "step": 10650 }, { "epoch": 0.9682546891321132, "grad_norm": 0.25677692890167236, "learning_rate": 8.56307766587796e-05, "loss": 0.1691, "step": 10660 }, { "epoch": 0.9691629955947136, "grad_norm": 0.8181679844856262, "learning_rate": 8.559368235778091e-05, "loss": 0.1945, "step": 10670 }, { "epoch": 0.9700713020573142, "grad_norm": 0.8568990230560303, "learning_rate": 8.555654829570699e-05, "loss": 0.2594, "step": 10680 }, { "epoch": 0.9709796085199146, "grad_norm": 0.25836291909217834, "learning_rate": 8.551937451403966e-05, "loss": 0.1779, "step": 10690 }, { "epoch": 0.9718879149825151, "grad_norm": 0.8394226431846619, "learning_rate": 8.548216105430508e-05, "loss": 0.167, "step": 10700 }, { "epoch": 0.9727962214451156, "grad_norm": 1.3019907474517822, "learning_rate": 8.544490795807379e-05, "loss": 0.1844, "step": 10710 }, { "epoch": 0.9737045279077161, "grad_norm": 0.02720511332154274, "learning_rate": 8.540761526696051e-05, "loss": 0.1731, "step": 10720 }, { "epoch": 0.9746128343703165, "grad_norm": 1.3506646156311035, "learning_rate": 8.537028302262426e-05, "loss": 0.2816, "step": 10730 }, { "epoch": 0.9755211408329171, "grad_norm": 2.6526601314544678, "learning_rate": 8.533291126676828e-05, "loss": 0.2141, "step": 10740 }, { "epoch": 0.9764294472955175, "grad_norm": 1.2748953104019165, "learning_rate": 8.529550004113984e-05, "loss": 0.1772, "step": 10750 }, { "epoch": 0.977337753758118, "grad_norm": 2.6770293712615967, "learning_rate": 8.525804938753042e-05, "loss": 0.2931, "step": 10760 }, { "epoch": 0.9782460602207185, "grad_norm": 1.6746790409088135, "learning_rate": 8.522055934777541e-05, "loss": 0.1571, "step": 10770 }, { "epoch": 0.9791543666833189, "grad_norm": 1.376015067100525, "learning_rate": 8.518302996375435e-05, "loss": 0.216, "step": 10780 }, { "epoch": 0.9800626731459194, "grad_norm": 0.8246263861656189, "learning_rate": 8.51454612773906e-05, "loss": 0.1972, "step": 10790 }, { "epoch": 0.9809709796085199, "grad_norm": 2.5484704971313477, "learning_rate": 8.51078533306515e-05, "loss": 0.286, "step": 10800 }, { "epoch": 0.9818792860711204, "grad_norm": 4.023249626159668, "learning_rate": 8.507020616554823e-05, "loss": 0.1933, "step": 10810 }, { "epoch": 0.9827875925337208, "grad_norm": 0.910373330116272, "learning_rate": 8.503251982413576e-05, "loss": 0.23, "step": 10820 }, { "epoch": 0.9836958989963214, "grad_norm": 0.25478243827819824, "learning_rate": 8.499479434851284e-05, "loss": 0.1455, "step": 10830 }, { "epoch": 0.9846042054589218, "grad_norm": 1.1130362749099731, "learning_rate": 8.495702978082194e-05, "loss": 0.2589, "step": 10840 }, { "epoch": 0.9855125119215223, "grad_norm": 1.3866053819656372, "learning_rate": 8.491922616324917e-05, "loss": 0.1232, "step": 10850 }, { "epoch": 0.9864208183841228, "grad_norm": 1.0908716917037964, "learning_rate": 8.488138353802433e-05, "loss": 0.1867, "step": 10860 }, { "epoch": 0.9873291248467233, "grad_norm": 4.231415748596191, "learning_rate": 8.484350194742072e-05, "loss": 0.2509, "step": 10870 }, { "epoch": 0.9882374313093237, "grad_norm": 1.4102476835250854, "learning_rate": 8.480558143375519e-05, "loss": 0.1943, "step": 10880 }, { "epoch": 0.9891457377719243, "grad_norm": 1.428128719329834, "learning_rate": 8.476762203938811e-05, "loss": 0.2913, "step": 10890 }, { "epoch": 0.9900540442345247, "grad_norm": 0.787982702255249, "learning_rate": 8.47296238067232e-05, "loss": 0.2714, "step": 10900 }, { "epoch": 0.9909623506971252, "grad_norm": 1.2297790050506592, "learning_rate": 8.469158677820768e-05, "loss": 0.2114, "step": 10910 }, { "epoch": 0.9918706571597257, "grad_norm": 1.6737028360366821, "learning_rate": 8.465351099633198e-05, "loss": 0.2155, "step": 10920 }, { "epoch": 0.9927789636223262, "grad_norm": 1.6670866012573242, "learning_rate": 8.461539650362993e-05, "loss": 0.2032, "step": 10930 }, { "epoch": 0.9936872700849266, "grad_norm": 1.5952634811401367, "learning_rate": 8.457724334267853e-05, "loss": 0.2295, "step": 10940 }, { "epoch": 0.9945955765475272, "grad_norm": 2.548560380935669, "learning_rate": 8.453905155609803e-05, "loss": 0.2407, "step": 10950 }, { "epoch": 0.9955038830101276, "grad_norm": 1.767035722732544, "learning_rate": 8.450082118655178e-05, "loss": 0.2198, "step": 10960 }, { "epoch": 0.9964121894727281, "grad_norm": 2.658242702484131, "learning_rate": 8.446255227674625e-05, "loss": 0.2102, "step": 10970 }, { "epoch": 0.9973204959353286, "grad_norm": 0.7763751745223999, "learning_rate": 8.4424244869431e-05, "loss": 0.1117, "step": 10980 }, { "epoch": 0.9982288023979291, "grad_norm": 1.7833304405212402, "learning_rate": 8.43858990073985e-05, "loss": 0.253, "step": 10990 }, { "epoch": 0.9991371088605295, "grad_norm": 2.2650444507598877, "learning_rate": 8.43475147334843e-05, "loss": 0.1846, "step": 11000 }, { "epoch": 0.9991371088605295, "eval_loss": 0.22360771894454956, "eval_runtime": 1096.4481, "eval_samples_per_second": 8.926, "eval_steps_per_second": 8.926, "step": 11000 }, { "epoch": 1.00004541532313, "grad_norm": 0.5547324419021606, "learning_rate": 8.430909209056675e-05, "loss": 0.1614, "step": 11010 }, { "epoch": 1.0009537217857305, "grad_norm": 1.994166374206543, "learning_rate": 8.427063112156712e-05, "loss": 0.2001, "step": 11020 }, { "epoch": 1.001862028248331, "grad_norm": 1.2011778354644775, "learning_rate": 8.42321318694495e-05, "loss": 0.1904, "step": 11030 }, { "epoch": 1.0027703347109316, "grad_norm": 2.2237958908081055, "learning_rate": 8.41935943772207e-05, "loss": 0.227, "step": 11040 }, { "epoch": 1.003678641173532, "grad_norm": 0.751552402973175, "learning_rate": 8.415501868793028e-05, "loss": 0.1606, "step": 11050 }, { "epoch": 1.0045869476361324, "grad_norm": 0.5514255166053772, "learning_rate": 8.411640484467047e-05, "loss": 0.2548, "step": 11060 }, { "epoch": 1.005495254098733, "grad_norm": 0.3022928237915039, "learning_rate": 8.407775289057611e-05, "loss": 0.1684, "step": 11070 }, { "epoch": 1.0064035605613333, "grad_norm": 1.3364183902740479, "learning_rate": 8.403906286882463e-05, "loss": 0.2365, "step": 11080 }, { "epoch": 1.0073118670239338, "grad_norm": 0.6467509865760803, "learning_rate": 8.400033482263593e-05, "loss": 0.1409, "step": 11090 }, { "epoch": 1.0082201734865344, "grad_norm": 0.7904669046401978, "learning_rate": 8.396156879527248e-05, "loss": 0.1543, "step": 11100 }, { "epoch": 1.009128479949135, "grad_norm": 0.4226484000682831, "learning_rate": 8.39227648300391e-05, "loss": 0.1972, "step": 11110 }, { "epoch": 1.0100367864117352, "grad_norm": 0.6842941045761108, "learning_rate": 8.3883922970283e-05, "loss": 0.152, "step": 11120 }, { "epoch": 1.0109450928743358, "grad_norm": 0.9502262473106384, "learning_rate": 8.384504325939377e-05, "loss": 0.1418, "step": 11130 }, { "epoch": 1.0118533993369363, "grad_norm": 0.03781938925385475, "learning_rate": 8.380612574080323e-05, "loss": 0.1704, "step": 11140 }, { "epoch": 1.0127617057995368, "grad_norm": 1.131725788116455, "learning_rate": 8.376717045798545e-05, "loss": 0.215, "step": 11150 }, { "epoch": 1.0136700122621372, "grad_norm": 0.3528728485107422, "learning_rate": 8.372817745445671e-05, "loss": 0.2661, "step": 11160 }, { "epoch": 1.0145783187247377, "grad_norm": 2.6319239139556885, "learning_rate": 8.368914677377539e-05, "loss": 0.1678, "step": 11170 }, { "epoch": 1.0154866251873382, "grad_norm": 1.389140248298645, "learning_rate": 8.365007845954198e-05, "loss": 0.1133, "step": 11180 }, { "epoch": 1.0163949316499388, "grad_norm": 1.3652955293655396, "learning_rate": 8.361097255539902e-05, "loss": 0.1926, "step": 11190 }, { "epoch": 1.017303238112539, "grad_norm": 0.9974254369735718, "learning_rate": 8.357182910503099e-05, "loss": 0.2172, "step": 11200 }, { "epoch": 1.0182115445751396, "grad_norm": 0.6293363571166992, "learning_rate": 8.353264815216439e-05, "loss": 0.1773, "step": 11210 }, { "epoch": 1.0191198510377402, "grad_norm": 0.5233990550041199, "learning_rate": 8.349342974056753e-05, "loss": 0.18, "step": 11220 }, { "epoch": 1.0200281575003407, "grad_norm": 0.1991007775068283, "learning_rate": 8.345417391405063e-05, "loss": 0.2228, "step": 11230 }, { "epoch": 1.020936463962941, "grad_norm": 1.3277443647384644, "learning_rate": 8.341488071646571e-05, "loss": 0.1657, "step": 11240 }, { "epoch": 1.0218447704255416, "grad_norm": 2.760777711868286, "learning_rate": 8.337555019170644e-05, "loss": 0.186, "step": 11250 }, { "epoch": 1.022753076888142, "grad_norm": 1.234932541847229, "learning_rate": 8.333618238370833e-05, "loss": 0.1759, "step": 11260 }, { "epoch": 1.0236613833507426, "grad_norm": 2.468183755874634, "learning_rate": 8.329677733644841e-05, "loss": 0.2215, "step": 11270 }, { "epoch": 1.024569689813343, "grad_norm": 0.5869355797767639, "learning_rate": 8.325733509394538e-05, "loss": 0.1479, "step": 11280 }, { "epoch": 1.0254779962759435, "grad_norm": 1.645634651184082, "learning_rate": 8.321785570025948e-05, "loss": 0.1446, "step": 11290 }, { "epoch": 1.026386302738544, "grad_norm": 0.3599882423877716, "learning_rate": 8.317833919949245e-05, "loss": 0.2628, "step": 11300 }, { "epoch": 1.0272946092011446, "grad_norm": 1.7688584327697754, "learning_rate": 8.313878563578748e-05, "loss": 0.2178, "step": 11310 }, { "epoch": 1.0282029156637449, "grad_norm": 4.138487339019775, "learning_rate": 8.309919505332913e-05, "loss": 0.2377, "step": 11320 }, { "epoch": 1.0291112221263454, "grad_norm": 2.0548083782196045, "learning_rate": 8.305956749634337e-05, "loss": 0.1805, "step": 11330 }, { "epoch": 1.030019528588946, "grad_norm": 1.28032648563385, "learning_rate": 8.301990300909741e-05, "loss": 0.1992, "step": 11340 }, { "epoch": 1.0309278350515463, "grad_norm": 2.906949758529663, "learning_rate": 8.298020163589979e-05, "loss": 0.1556, "step": 11350 }, { "epoch": 1.0318361415141468, "grad_norm": 0.3286450207233429, "learning_rate": 8.29404634211002e-05, "loss": 0.2246, "step": 11360 }, { "epoch": 1.0327444479767474, "grad_norm": 1.100372314453125, "learning_rate": 8.290068840908951e-05, "loss": 0.282, "step": 11370 }, { "epoch": 1.033652754439348, "grad_norm": 0.5059899687767029, "learning_rate": 8.286087664429966e-05, "loss": 0.1625, "step": 11380 }, { "epoch": 1.0345610609019482, "grad_norm": 2.0365524291992188, "learning_rate": 8.282102817120368e-05, "loss": 0.0861, "step": 11390 }, { "epoch": 1.0354693673645488, "grad_norm": 1.875199317932129, "learning_rate": 8.278114303431561e-05, "loss": 0.1287, "step": 11400 }, { "epoch": 1.0363776738271493, "grad_norm": 1.016701340675354, "learning_rate": 8.274122127819042e-05, "loss": 0.159, "step": 11410 }, { "epoch": 1.0372859802897498, "grad_norm": 0.9991846680641174, "learning_rate": 8.2701262947424e-05, "loss": 0.1786, "step": 11420 }, { "epoch": 1.0381942867523501, "grad_norm": 0.6201272010803223, "learning_rate": 8.266126808665311e-05, "loss": 0.1509, "step": 11430 }, { "epoch": 1.0391025932149507, "grad_norm": 4.427784442901611, "learning_rate": 8.262123674055528e-05, "loss": 0.2193, "step": 11440 }, { "epoch": 1.0400108996775512, "grad_norm": 1.440670371055603, "learning_rate": 8.258116895384885e-05, "loss": 0.1267, "step": 11450 }, { "epoch": 1.0409192061401518, "grad_norm": 0.5308876633644104, "learning_rate": 8.254106477129283e-05, "loss": 0.2048, "step": 11460 }, { "epoch": 1.041827512602752, "grad_norm": 1.6040642261505127, "learning_rate": 8.250092423768687e-05, "loss": 0.1732, "step": 11470 }, { "epoch": 1.0427358190653526, "grad_norm": 0.4870031177997589, "learning_rate": 8.24607473978713e-05, "loss": 0.1258, "step": 11480 }, { "epoch": 1.0436441255279532, "grad_norm": 0.8443437814712524, "learning_rate": 8.242053429672691e-05, "loss": 0.1585, "step": 11490 }, { "epoch": 1.0445524319905537, "grad_norm": 1.9186221361160278, "learning_rate": 8.238028497917509e-05, "loss": 0.2535, "step": 11500 }, { "epoch": 1.0445524319905537, "eval_loss": 0.22569063305854797, "eval_runtime": 1097.1843, "eval_samples_per_second": 8.92, "eval_steps_per_second": 8.92, "step": 11500 }, { "epoch": 1.045460738453154, "grad_norm": 2.797954559326172, "learning_rate": 8.23399994901776e-05, "loss": 0.1357, "step": 11510 }, { "epoch": 1.0463690449157546, "grad_norm": 1.2285041809082031, "learning_rate": 8.22996778747367e-05, "loss": 0.1908, "step": 11520 }, { "epoch": 1.047277351378355, "grad_norm": 0.02597026899456978, "learning_rate": 8.225932017789491e-05, "loss": 0.0892, "step": 11530 }, { "epoch": 1.0481856578409556, "grad_norm": 1.5627864599227905, "learning_rate": 8.22189264447351e-05, "loss": 0.2025, "step": 11540 }, { "epoch": 1.049093964303556, "grad_norm": 1.1258312463760376, "learning_rate": 8.217849672038044e-05, "loss": 0.2208, "step": 11550 }, { "epoch": 1.0500022707661565, "grad_norm": 3.862898826599121, "learning_rate": 8.213803104999423e-05, "loss": 0.1867, "step": 11560 }, { "epoch": 1.050910577228757, "grad_norm": 1.2094706296920776, "learning_rate": 8.209752947877998e-05, "loss": 0.1758, "step": 11570 }, { "epoch": 1.0518188836913576, "grad_norm": 1.2484130859375, "learning_rate": 8.205699205198127e-05, "loss": 0.1797, "step": 11580 }, { "epoch": 1.0527271901539579, "grad_norm": 1.1677626371383667, "learning_rate": 8.201641881488174e-05, "loss": 0.1511, "step": 11590 }, { "epoch": 1.0536354966165584, "grad_norm": 1.5498257875442505, "learning_rate": 8.197580981280504e-05, "loss": 0.1748, "step": 11600 }, { "epoch": 1.054543803079159, "grad_norm": 0.6957345008850098, "learning_rate": 8.193516509111476e-05, "loss": 0.1745, "step": 11610 }, { "epoch": 1.0554521095417595, "grad_norm": 1.3137191534042358, "learning_rate": 8.189448469521442e-05, "loss": 0.2001, "step": 11620 }, { "epoch": 1.0563604160043598, "grad_norm": 1.23902428150177, "learning_rate": 8.185376867054738e-05, "loss": 0.1515, "step": 11630 }, { "epoch": 1.0572687224669604, "grad_norm": 0.7945805788040161, "learning_rate": 8.181301706259676e-05, "loss": 0.1378, "step": 11640 }, { "epoch": 1.058177028929561, "grad_norm": 1.635898232460022, "learning_rate": 8.177222991688551e-05, "loss": 0.1546, "step": 11650 }, { "epoch": 1.0590853353921612, "grad_norm": 1.2486401796340942, "learning_rate": 8.173140727897619e-05, "loss": 0.2129, "step": 11660 }, { "epoch": 1.0599936418547617, "grad_norm": 4.044930934906006, "learning_rate": 8.169054919447107e-05, "loss": 0.1718, "step": 11670 }, { "epoch": 1.0609019483173623, "grad_norm": 2.327892303466797, "learning_rate": 8.164965570901198e-05, "loss": 0.1835, "step": 11680 }, { "epoch": 1.0618102547799628, "grad_norm": 0.8654083609580994, "learning_rate": 8.160872686828034e-05, "loss": 0.1332, "step": 11690 }, { "epoch": 1.0627185612425631, "grad_norm": 0.6999381184577942, "learning_rate": 8.1567762717997e-05, "loss": 0.1427, "step": 11700 }, { "epoch": 1.0636268677051637, "grad_norm": 0.27817538380622864, "learning_rate": 8.152676330392233e-05, "loss": 0.1464, "step": 11710 }, { "epoch": 1.0645351741677642, "grad_norm": 1.039597988128662, "learning_rate": 8.148572867185602e-05, "loss": 0.1576, "step": 11720 }, { "epoch": 1.0654434806303648, "grad_norm": 0.8753854632377625, "learning_rate": 8.144465886763714e-05, "loss": 0.2053, "step": 11730 }, { "epoch": 1.066351787092965, "grad_norm": 0.3564082682132721, "learning_rate": 8.140355393714407e-05, "loss": 0.1988, "step": 11740 }, { "epoch": 1.0672600935555656, "grad_norm": 0.6177548170089722, "learning_rate": 8.136241392629437e-05, "loss": 0.1399, "step": 11750 }, { "epoch": 1.0681684000181662, "grad_norm": 2.5369718074798584, "learning_rate": 8.132123888104483e-05, "loss": 0.2804, "step": 11760 }, { "epoch": 1.0690767064807667, "grad_norm": 2.0540833473205566, "learning_rate": 8.128002884739139e-05, "loss": 0.1964, "step": 11770 }, { "epoch": 1.069985012943367, "grad_norm": 2.1928141117095947, "learning_rate": 8.123878387136904e-05, "loss": 0.1773, "step": 11780 }, { "epoch": 1.0708933194059675, "grad_norm": 3.3358590602874756, "learning_rate": 8.119750399905179e-05, "loss": 0.186, "step": 11790 }, { "epoch": 1.071801625868568, "grad_norm": 1.7895925045013428, "learning_rate": 8.115618927655269e-05, "loss": 0.2159, "step": 11800 }, { "epoch": 1.0727099323311686, "grad_norm": 1.8430495262145996, "learning_rate": 8.111483975002368e-05, "loss": 0.1641, "step": 11810 }, { "epoch": 1.073618238793769, "grad_norm": 1.9761362075805664, "learning_rate": 8.107345546565558e-05, "loss": 0.1957, "step": 11820 }, { "epoch": 1.0745265452563695, "grad_norm": 1.6488392353057861, "learning_rate": 8.103203646967805e-05, "loss": 0.1983, "step": 11830 }, { "epoch": 1.07543485171897, "grad_norm": 0.5946348309516907, "learning_rate": 8.099058280835951e-05, "loss": 0.1513, "step": 11840 }, { "epoch": 1.0763431581815706, "grad_norm": 1.0522695779800415, "learning_rate": 8.094909452800714e-05, "loss": 0.1324, "step": 11850 }, { "epoch": 1.0772514646441709, "grad_norm": 1.5600851774215698, "learning_rate": 8.090757167496674e-05, "loss": 0.1592, "step": 11860 }, { "epoch": 1.0781597711067714, "grad_norm": 0.7374505996704102, "learning_rate": 8.086601429562277e-05, "loss": 0.2201, "step": 11870 }, { "epoch": 1.079068077569372, "grad_norm": 2.092637538909912, "learning_rate": 8.082442243639825e-05, "loss": 0.1822, "step": 11880 }, { "epoch": 1.0799763840319723, "grad_norm": 1.1154708862304688, "learning_rate": 8.07827961437547e-05, "loss": 0.2832, "step": 11890 }, { "epoch": 1.0808846904945728, "grad_norm": 2.218660593032837, "learning_rate": 8.074113546419213e-05, "loss": 0.1652, "step": 11900 }, { "epoch": 1.0817929969571733, "grad_norm": 2.2590107917785645, "learning_rate": 8.069944044424892e-05, "loss": 0.212, "step": 11910 }, { "epoch": 1.0827013034197739, "grad_norm": 1.2825366258621216, "learning_rate": 8.065771113050191e-05, "loss": 0.259, "step": 11920 }, { "epoch": 1.0836096098823744, "grad_norm": 1.0743293762207031, "learning_rate": 8.061594756956612e-05, "loss": 0.1666, "step": 11930 }, { "epoch": 1.0845179163449747, "grad_norm": 0.22303113341331482, "learning_rate": 8.05741498080949e-05, "loss": 0.1473, "step": 11940 }, { "epoch": 1.0854262228075753, "grad_norm": 1.0161007642745972, "learning_rate": 8.05323178927798e-05, "loss": 0.248, "step": 11950 }, { "epoch": 1.0863345292701758, "grad_norm": 0.07642332464456558, "learning_rate": 8.049045187035053e-05, "loss": 0.1606, "step": 11960 }, { "epoch": 1.0872428357327761, "grad_norm": 0.8133869171142578, "learning_rate": 8.044855178757486e-05, "loss": 0.1497, "step": 11970 }, { "epoch": 1.0881511421953767, "grad_norm": 1.0754567384719849, "learning_rate": 8.040661769125863e-05, "loss": 0.1547, "step": 11980 }, { "epoch": 1.0890594486579772, "grad_norm": 1.5715055465698242, "learning_rate": 8.036464962824572e-05, "loss": 0.1465, "step": 11990 }, { "epoch": 1.0899677551205778, "grad_norm": 0.38046783208847046, "learning_rate": 8.032264764541787e-05, "loss": 0.1195, "step": 12000 }, { "epoch": 1.0899677551205778, "eval_loss": 0.22573022544384003, "eval_runtime": 1097.8683, "eval_samples_per_second": 8.915, "eval_steps_per_second": 8.915, "step": 12000 }, { "epoch": 1.090876061583178, "grad_norm": 1.8536694049835205, "learning_rate": 8.028061178969477e-05, "loss": 0.176, "step": 12010 }, { "epoch": 1.0917843680457786, "grad_norm": 2.850680112838745, "learning_rate": 8.023854210803395e-05, "loss": 0.2531, "step": 12020 }, { "epoch": 1.0926926745083791, "grad_norm": 1.3387980461120605, "learning_rate": 8.019643864743068e-05, "loss": 0.2072, "step": 12030 }, { "epoch": 1.0936009809709797, "grad_norm": 0.9607848525047302, "learning_rate": 8.015430145491801e-05, "loss": 0.1566, "step": 12040 }, { "epoch": 1.09450928743358, "grad_norm": 2.960244655609131, "learning_rate": 8.011213057756667e-05, "loss": 0.1337, "step": 12050 }, { "epoch": 1.0954175938961805, "grad_norm": 0.07177522033452988, "learning_rate": 8.006992606248498e-05, "loss": 0.1322, "step": 12060 }, { "epoch": 1.096325900358781, "grad_norm": 0.8772838115692139, "learning_rate": 8.002768795681887e-05, "loss": 0.2135, "step": 12070 }, { "epoch": 1.0972342068213816, "grad_norm": 1.6365331411361694, "learning_rate": 7.998541630775175e-05, "loss": 0.2122, "step": 12080 }, { "epoch": 1.098142513283982, "grad_norm": 1.0378507375717163, "learning_rate": 7.994311116250462e-05, "loss": 0.1206, "step": 12090 }, { "epoch": 1.0990508197465825, "grad_norm": 0.9249830842018127, "learning_rate": 7.990077256833573e-05, "loss": 0.1753, "step": 12100 }, { "epoch": 1.099959126209183, "grad_norm": 2.957075834274292, "learning_rate": 7.98584005725408e-05, "loss": 0.2135, "step": 12110 }, { "epoch": 1.1008674326717836, "grad_norm": 2.1830015182495117, "learning_rate": 7.981599522245285e-05, "loss": 0.2355, "step": 12120 }, { "epoch": 1.1017757391343839, "grad_norm": 1.965022087097168, "learning_rate": 7.977355656544214e-05, "loss": 0.1306, "step": 12130 }, { "epoch": 1.1026840455969844, "grad_norm": 1.1296042203903198, "learning_rate": 7.973108464891617e-05, "loss": 0.204, "step": 12140 }, { "epoch": 1.103592352059585, "grad_norm": 0.9707185626029968, "learning_rate": 7.968857952031954e-05, "loss": 0.2007, "step": 12150 }, { "epoch": 1.1045006585221855, "grad_norm": 0.5664136409759521, "learning_rate": 7.964604122713397e-05, "loss": 0.161, "step": 12160 }, { "epoch": 1.1054089649847858, "grad_norm": 0.7225908041000366, "learning_rate": 7.960346981687828e-05, "loss": 0.1669, "step": 12170 }, { "epoch": 1.1063172714473863, "grad_norm": 3.470031261444092, "learning_rate": 7.95608653371082e-05, "loss": 0.1773, "step": 12180 }, { "epoch": 1.1072255779099869, "grad_norm": 0.14042674005031586, "learning_rate": 7.951822783541649e-05, "loss": 0.1387, "step": 12190 }, { "epoch": 1.1081338843725872, "grad_norm": 0.8134803771972656, "learning_rate": 7.94755573594327e-05, "loss": 0.1583, "step": 12200 }, { "epoch": 1.1090421908351877, "grad_norm": 2.221982717514038, "learning_rate": 7.94328539568233e-05, "loss": 0.1425, "step": 12210 }, { "epoch": 1.1099504972977883, "grad_norm": 0.3668789565563202, "learning_rate": 7.939011767529149e-05, "loss": 0.1585, "step": 12220 }, { "epoch": 1.1108588037603888, "grad_norm": 0.33296433091163635, "learning_rate": 7.934734856257722e-05, "loss": 0.2059, "step": 12230 }, { "epoch": 1.1117671102229894, "grad_norm": 2.481766700744629, "learning_rate": 7.930454666645715e-05, "loss": 0.2339, "step": 12240 }, { "epoch": 1.1126754166855897, "grad_norm": 2.2350234985351562, "learning_rate": 7.926171203474446e-05, "loss": 0.1881, "step": 12250 }, { "epoch": 1.1135837231481902, "grad_norm": 1.0035669803619385, "learning_rate": 7.921884471528901e-05, "loss": 0.1826, "step": 12260 }, { "epoch": 1.1144920296107907, "grad_norm": 2.884647846221924, "learning_rate": 7.91759447559771e-05, "loss": 0.1212, "step": 12270 }, { "epoch": 1.115400336073391, "grad_norm": 1.3250243663787842, "learning_rate": 7.913301220473153e-05, "loss": 0.202, "step": 12280 }, { "epoch": 1.1163086425359916, "grad_norm": 1.7402634620666504, "learning_rate": 7.909004710951151e-05, "loss": 0.1782, "step": 12290 }, { "epoch": 1.1172169489985921, "grad_norm": 2.126861810684204, "learning_rate": 7.904704951831256e-05, "loss": 0.1135, "step": 12300 }, { "epoch": 1.1181252554611927, "grad_norm": 3.196378469467163, "learning_rate": 7.900401947916656e-05, "loss": 0.1471, "step": 12310 }, { "epoch": 1.119033561923793, "grad_norm": 0.47015511989593506, "learning_rate": 7.89609570401416e-05, "loss": 0.152, "step": 12320 }, { "epoch": 1.1199418683863935, "grad_norm": 0.5880885720252991, "learning_rate": 7.891786224934194e-05, "loss": 0.1917, "step": 12330 }, { "epoch": 1.120850174848994, "grad_norm": 1.7473978996276855, "learning_rate": 7.887473515490806e-05, "loss": 0.2113, "step": 12340 }, { "epoch": 1.1217584813115946, "grad_norm": 1.5515917539596558, "learning_rate": 7.883157580501645e-05, "loss": 0.2091, "step": 12350 }, { "epoch": 1.122666787774195, "grad_norm": 1.916438341140747, "learning_rate": 7.878838424787968e-05, "loss": 0.2415, "step": 12360 }, { "epoch": 1.1235750942367955, "grad_norm": 0.898893415927887, "learning_rate": 7.874516053174625e-05, "loss": 0.1412, "step": 12370 }, { "epoch": 1.124483400699396, "grad_norm": 0.6166730523109436, "learning_rate": 7.870190470490063e-05, "loss": 0.2678, "step": 12380 }, { "epoch": 1.1253917071619965, "grad_norm": 0.24683329463005066, "learning_rate": 7.865861681566311e-05, "loss": 0.1229, "step": 12390 }, { "epoch": 1.1263000136245969, "grad_norm": 2.989305257797241, "learning_rate": 7.861529691238988e-05, "loss": 0.236, "step": 12400 }, { "epoch": 1.1272083200871974, "grad_norm": 0.8489149808883667, "learning_rate": 7.85719450434728e-05, "loss": 0.2327, "step": 12410 }, { "epoch": 1.128116626549798, "grad_norm": 2.111071825027466, "learning_rate": 7.85285612573395e-05, "loss": 0.1378, "step": 12420 }, { "epoch": 1.1290249330123985, "grad_norm": 2.457166910171509, "learning_rate": 7.848514560245324e-05, "loss": 0.1754, "step": 12430 }, { "epoch": 1.1299332394749988, "grad_norm": 1.6256060600280762, "learning_rate": 7.844169812731288e-05, "loss": 0.148, "step": 12440 }, { "epoch": 1.1308415459375993, "grad_norm": 2.5117030143737793, "learning_rate": 7.839821888045283e-05, "loss": 0.1508, "step": 12450 }, { "epoch": 1.1317498524001999, "grad_norm": 2.846806049346924, "learning_rate": 7.835470791044295e-05, "loss": 0.188, "step": 12460 }, { "epoch": 1.1326581588628004, "grad_norm": 1.792342185974121, "learning_rate": 7.831116526588864e-05, "loss": 0.2611, "step": 12470 }, { "epoch": 1.1335664653254007, "grad_norm": 0.5921221375465393, "learning_rate": 7.82675909954306e-05, "loss": 0.1823, "step": 12480 }, { "epoch": 1.1344747717880013, "grad_norm": 1.9859912395477295, "learning_rate": 7.822398514774489e-05, "loss": 0.1958, "step": 12490 }, { "epoch": 1.1353830782506018, "grad_norm": 0.8084275722503662, "learning_rate": 7.818034777154279e-05, "loss": 0.1386, "step": 12500 }, { "epoch": 1.1353830782506018, "eval_loss": 0.21974828839302063, "eval_runtime": 1098.1272, "eval_samples_per_second": 8.912, "eval_steps_per_second": 8.912, "step": 12500 }, { "epoch": 1.1362913847132021, "grad_norm": 1.0338177680969238, "learning_rate": 7.813667891557088e-05, "loss": 0.1976, "step": 12510 }, { "epoch": 1.1371996911758027, "grad_norm": 1.4476417303085327, "learning_rate": 7.809297862861088e-05, "loss": 0.1863, "step": 12520 }, { "epoch": 1.1381079976384032, "grad_norm": 0.6773595809936523, "learning_rate": 7.804924695947957e-05, "loss": 0.2142, "step": 12530 }, { "epoch": 1.1390163041010037, "grad_norm": 1.7676374912261963, "learning_rate": 7.800548395702887e-05, "loss": 0.2239, "step": 12540 }, { "epoch": 1.1399246105636043, "grad_norm": 2.792613983154297, "learning_rate": 7.796168967014563e-05, "loss": 0.2201, "step": 12550 }, { "epoch": 1.1408329170262046, "grad_norm": 2.8869688510894775, "learning_rate": 7.791786414775168e-05, "loss": 0.2277, "step": 12560 }, { "epoch": 1.1417412234888051, "grad_norm": 2.1175434589385986, "learning_rate": 7.787400743880376e-05, "loss": 0.1761, "step": 12570 }, { "epoch": 1.1426495299514057, "grad_norm": 1.9091053009033203, "learning_rate": 7.783011959229338e-05, "loss": 0.1205, "step": 12580 }, { "epoch": 1.143557836414006, "grad_norm": 1.8209257125854492, "learning_rate": 7.778620065724694e-05, "loss": 0.1252, "step": 12590 }, { "epoch": 1.1444661428766065, "grad_norm": 1.3020883798599243, "learning_rate": 7.774225068272546e-05, "loss": 0.1875, "step": 12600 }, { "epoch": 1.145374449339207, "grad_norm": 3.7230582237243652, "learning_rate": 7.76982697178247e-05, "loss": 0.2747, "step": 12610 }, { "epoch": 1.1462827558018076, "grad_norm": 0.03660500794649124, "learning_rate": 7.765425781167503e-05, "loss": 0.092, "step": 12620 }, { "epoch": 1.147191062264408, "grad_norm": 3.8817315101623535, "learning_rate": 7.761021501344135e-05, "loss": 0.1909, "step": 12630 }, { "epoch": 1.1480993687270085, "grad_norm": 0.25007522106170654, "learning_rate": 7.756614137232312e-05, "loss": 0.1765, "step": 12640 }, { "epoch": 1.149007675189609, "grad_norm": 0.5047479867935181, "learning_rate": 7.752203693755419e-05, "loss": 0.2207, "step": 12650 }, { "epoch": 1.1499159816522095, "grad_norm": 2.4738683700561523, "learning_rate": 7.747790175840291e-05, "loss": 0.1343, "step": 12660 }, { "epoch": 1.1508242881148099, "grad_norm": 1.7191436290740967, "learning_rate": 7.743373588417187e-05, "loss": 0.1636, "step": 12670 }, { "epoch": 1.1517325945774104, "grad_norm": 4.615729331970215, "learning_rate": 7.738953936419797e-05, "loss": 0.1795, "step": 12680 }, { "epoch": 1.152640901040011, "grad_norm": 3.581761360168457, "learning_rate": 7.734531224785242e-05, "loss": 0.2201, "step": 12690 }, { "epoch": 1.1535492075026115, "grad_norm": 1.8583347797393799, "learning_rate": 7.730105458454048e-05, "loss": 0.214, "step": 12700 }, { "epoch": 1.1544575139652118, "grad_norm": 1.4408388137817383, "learning_rate": 7.725676642370168e-05, "loss": 0.1619, "step": 12710 }, { "epoch": 1.1553658204278123, "grad_norm": 1.6927516460418701, "learning_rate": 7.721244781480949e-05, "loss": 0.1789, "step": 12720 }, { "epoch": 1.1562741268904129, "grad_norm": 1.9322364330291748, "learning_rate": 7.716809880737147e-05, "loss": 0.1462, "step": 12730 }, { "epoch": 1.1571824333530132, "grad_norm": 0.5202440023422241, "learning_rate": 7.712371945092912e-05, "loss": 0.1843, "step": 12740 }, { "epoch": 1.1580907398156137, "grad_norm": 0.5710748434066772, "learning_rate": 7.70793097950578e-05, "loss": 0.2109, "step": 12750 }, { "epoch": 1.1589990462782143, "grad_norm": 1.3277735710144043, "learning_rate": 7.70348698893668e-05, "loss": 0.1409, "step": 12760 }, { "epoch": 1.1599073527408148, "grad_norm": 0.9547052383422852, "learning_rate": 7.699039978349913e-05, "loss": 0.2111, "step": 12770 }, { "epoch": 1.1608156592034153, "grad_norm": 1.9616237878799438, "learning_rate": 7.694589952713158e-05, "loss": 0.1692, "step": 12780 }, { "epoch": 1.1617239656660157, "grad_norm": 3.0220189094543457, "learning_rate": 7.69013691699746e-05, "loss": 0.2552, "step": 12790 }, { "epoch": 1.1626322721286162, "grad_norm": 0.4911302328109741, "learning_rate": 7.685680876177222e-05, "loss": 0.149, "step": 12800 }, { "epoch": 1.1635405785912167, "grad_norm": 2.0093495845794678, "learning_rate": 7.681221835230215e-05, "loss": 0.1665, "step": 12810 }, { "epoch": 1.164448885053817, "grad_norm": 0.7934325337409973, "learning_rate": 7.676759799137551e-05, "loss": 0.1302, "step": 12820 }, { "epoch": 1.1653571915164176, "grad_norm": 0.7247586250305176, "learning_rate": 7.672294772883693e-05, "loss": 0.1304, "step": 12830 }, { "epoch": 1.1662654979790181, "grad_norm": 0.21352390944957733, "learning_rate": 7.667826761456445e-05, "loss": 0.2312, "step": 12840 }, { "epoch": 1.1671738044416187, "grad_norm": 1.237528920173645, "learning_rate": 7.66335576984694e-05, "loss": 0.1924, "step": 12850 }, { "epoch": 1.1680821109042192, "grad_norm": 1.0381205081939697, "learning_rate": 7.658881803049648e-05, "loss": 0.1891, "step": 12860 }, { "epoch": 1.1689904173668195, "grad_norm": 2.2249457836151123, "learning_rate": 7.654404866062358e-05, "loss": 0.1771, "step": 12870 }, { "epoch": 1.16989872382942, "grad_norm": 1.8861968517303467, "learning_rate": 7.649924963886173e-05, "loss": 0.1633, "step": 12880 }, { "epoch": 1.1708070302920206, "grad_norm": 1.0658674240112305, "learning_rate": 7.645442101525517e-05, "loss": 0.208, "step": 12890 }, { "epoch": 1.171715336754621, "grad_norm": 2.2489218711853027, "learning_rate": 7.640956283988115e-05, "loss": 0.1344, "step": 12900 }, { "epoch": 1.1726236432172215, "grad_norm": 0.9525306820869446, "learning_rate": 7.636467516284996e-05, "loss": 0.1841, "step": 12910 }, { "epoch": 1.173531949679822, "grad_norm": 1.288439154624939, "learning_rate": 7.631975803430483e-05, "loss": 0.2228, "step": 12920 }, { "epoch": 1.1744402561424225, "grad_norm": 1.2491360902786255, "learning_rate": 7.627481150442187e-05, "loss": 0.1647, "step": 12930 }, { "epoch": 1.1753485626050229, "grad_norm": 1.3447723388671875, "learning_rate": 7.622983562341009e-05, "loss": 0.1107, "step": 12940 }, { "epoch": 1.1762568690676234, "grad_norm": 2.85978364944458, "learning_rate": 7.61848304415112e-05, "loss": 0.2055, "step": 12950 }, { "epoch": 1.177165175530224, "grad_norm": 3.3469691276550293, "learning_rate": 7.613979600899976e-05, "loss": 0.1737, "step": 12960 }, { "epoch": 1.1780734819928245, "grad_norm": 1.5573810338974, "learning_rate": 7.609473237618291e-05, "loss": 0.2134, "step": 12970 }, { "epoch": 1.1789817884554248, "grad_norm": 0.9079775810241699, "learning_rate": 7.604963959340044e-05, "loss": 0.1618, "step": 12980 }, { "epoch": 1.1798900949180253, "grad_norm": 1.017450213432312, "learning_rate": 7.60045177110247e-05, "loss": 0.1925, "step": 12990 }, { "epoch": 1.1807984013806259, "grad_norm": 0.40586501359939575, "learning_rate": 7.595936677946056e-05, "loss": 0.1542, "step": 13000 }, { "epoch": 1.1807984013806259, "eval_loss": 0.23153305053710938, "eval_runtime": 1092.8445, "eval_samples_per_second": 8.956, "eval_steps_per_second": 8.956, "step": 13000 }, { "epoch": 1.1817067078432264, "grad_norm": 5.4359941482543945, "learning_rate": 7.591418684914532e-05, "loss": 0.1994, "step": 13010 }, { "epoch": 1.1826150143058267, "grad_norm": 0.8199917674064636, "learning_rate": 7.58689779705487e-05, "loss": 0.169, "step": 13020 }, { "epoch": 1.1835233207684273, "grad_norm": 1.0284796953201294, "learning_rate": 7.582374019417271e-05, "loss": 0.2228, "step": 13030 }, { "epoch": 1.1844316272310278, "grad_norm": 1.201950192451477, "learning_rate": 7.57784735705517e-05, "loss": 0.1509, "step": 13040 }, { "epoch": 1.1853399336936281, "grad_norm": 0.8256326913833618, "learning_rate": 7.57331781502522e-05, "loss": 0.1352, "step": 13050 }, { "epoch": 1.1862482401562287, "grad_norm": 0.8957222104072571, "learning_rate": 7.568785398387297e-05, "loss": 0.1448, "step": 13060 }, { "epoch": 1.1871565466188292, "grad_norm": 1.4677457809448242, "learning_rate": 7.564250112204479e-05, "loss": 0.2484, "step": 13070 }, { "epoch": 1.1880648530814297, "grad_norm": 1.7417947053909302, "learning_rate": 7.559711961543054e-05, "loss": 0.2022, "step": 13080 }, { "epoch": 1.1889731595440303, "grad_norm": 0.9710177779197693, "learning_rate": 7.555170951472513e-05, "loss": 0.2749, "step": 13090 }, { "epoch": 1.1898814660066306, "grad_norm": 1.3225151300430298, "learning_rate": 7.550627087065536e-05, "loss": 0.1321, "step": 13100 }, { "epoch": 1.1907897724692311, "grad_norm": 1.6582667827606201, "learning_rate": 7.546080373397995e-05, "loss": 0.1416, "step": 13110 }, { "epoch": 1.1916980789318317, "grad_norm": 1.2596209049224854, "learning_rate": 7.541530815548946e-05, "loss": 0.2181, "step": 13120 }, { "epoch": 1.192606385394432, "grad_norm": 0.9659709930419922, "learning_rate": 7.536978418600613e-05, "loss": 0.1467, "step": 13130 }, { "epoch": 1.1935146918570325, "grad_norm": 1.9829187393188477, "learning_rate": 7.532423187638406e-05, "loss": 0.292, "step": 13140 }, { "epoch": 1.194422998319633, "grad_norm": 1.2196447849273682, "learning_rate": 7.52786512775089e-05, "loss": 0.1675, "step": 13150 }, { "epoch": 1.1953313047822336, "grad_norm": 2.1093435287475586, "learning_rate": 7.523304244029792e-05, "loss": 0.135, "step": 13160 }, { "epoch": 1.196239611244834, "grad_norm": 0.46340057253837585, "learning_rate": 7.518740541569998e-05, "loss": 0.0978, "step": 13170 }, { "epoch": 1.1971479177074345, "grad_norm": 1.045954942703247, "learning_rate": 7.514174025469536e-05, "loss": 0.2581, "step": 13180 }, { "epoch": 1.198056224170035, "grad_norm": 2.697943687438965, "learning_rate": 7.509604700829583e-05, "loss": 0.1693, "step": 13190 }, { "epoch": 1.1989645306326355, "grad_norm": 0.9835949540138245, "learning_rate": 7.50503257275445e-05, "loss": 0.1896, "step": 13200 }, { "epoch": 1.1998728370952358, "grad_norm": 0.5497257113456726, "learning_rate": 7.500457646351581e-05, "loss": 0.1694, "step": 13210 }, { "epoch": 1.2007811435578364, "grad_norm": 1.0448310375213623, "learning_rate": 7.495879926731548e-05, "loss": 0.1943, "step": 13220 }, { "epoch": 1.201689450020437, "grad_norm": 2.794445037841797, "learning_rate": 7.491299419008039e-05, "loss": 0.2709, "step": 13230 }, { "epoch": 1.2025977564830375, "grad_norm": 2.4889235496520996, "learning_rate": 7.486716128297858e-05, "loss": 0.1297, "step": 13240 }, { "epoch": 1.2035060629456378, "grad_norm": 1.851720929145813, "learning_rate": 7.482130059720916e-05, "loss": 0.1248, "step": 13250 }, { "epoch": 1.2044143694082383, "grad_norm": 2.3769924640655518, "learning_rate": 7.477541218400233e-05, "loss": 0.2646, "step": 13260 }, { "epoch": 1.2053226758708389, "grad_norm": 0.5488000512123108, "learning_rate": 7.47294960946192e-05, "loss": 0.1538, "step": 13270 }, { "epoch": 1.2062309823334394, "grad_norm": 1.1655014753341675, "learning_rate": 7.468355238035186e-05, "loss": 0.176, "step": 13280 }, { "epoch": 1.2071392887960397, "grad_norm": 0.23356206715106964, "learning_rate": 7.463758109252322e-05, "loss": 0.1769, "step": 13290 }, { "epoch": 1.2080475952586402, "grad_norm": 2.858597993850708, "learning_rate": 7.459158228248695e-05, "loss": 0.2195, "step": 13300 }, { "epoch": 1.2089559017212408, "grad_norm": 1.1080729961395264, "learning_rate": 7.454555600162755e-05, "loss": 0.2381, "step": 13310 }, { "epoch": 1.2098642081838413, "grad_norm": 1.048413872718811, "learning_rate": 7.449950230136015e-05, "loss": 0.1736, "step": 13320 }, { "epoch": 1.2107725146464416, "grad_norm": 4.270589828491211, "learning_rate": 7.445342123313054e-05, "loss": 0.1726, "step": 13330 }, { "epoch": 1.2116808211090422, "grad_norm": 1.7768833637237549, "learning_rate": 7.440731284841506e-05, "loss": 0.1903, "step": 13340 }, { "epoch": 1.2125891275716427, "grad_norm": 3.805901288986206, "learning_rate": 7.436117719872057e-05, "loss": 0.1897, "step": 13350 }, { "epoch": 1.213497434034243, "grad_norm": 0.9229122996330261, "learning_rate": 7.43150143355844e-05, "loss": 0.2111, "step": 13360 }, { "epoch": 1.2144057404968436, "grad_norm": 1.5517085790634155, "learning_rate": 7.426882431057427e-05, "loss": 0.2328, "step": 13370 }, { "epoch": 1.2153140469594441, "grad_norm": 0.26565688848495483, "learning_rate": 7.422260717528825e-05, "loss": 0.1612, "step": 13380 }, { "epoch": 1.2162223534220447, "grad_norm": 0.5262832045555115, "learning_rate": 7.417636298135468e-05, "loss": 0.1835, "step": 13390 }, { "epoch": 1.2171306598846452, "grad_norm": 2.450317859649658, "learning_rate": 7.413009178043211e-05, "loss": 0.1947, "step": 13400 }, { "epoch": 1.2180389663472455, "grad_norm": 0.7632344961166382, "learning_rate": 7.408379362420932e-05, "loss": 0.1656, "step": 13410 }, { "epoch": 1.218947272809846, "grad_norm": 1.2247673273086548, "learning_rate": 7.403746856440513e-05, "loss": 0.238, "step": 13420 }, { "epoch": 1.2198555792724466, "grad_norm": 0.277344286441803, "learning_rate": 7.399111665276849e-05, "loss": 0.1366, "step": 13430 }, { "epoch": 1.220763885735047, "grad_norm": 1.0015743970870972, "learning_rate": 7.394473794107827e-05, "loss": 0.2331, "step": 13440 }, { "epoch": 1.2216721921976474, "grad_norm": 3.4916157722473145, "learning_rate": 7.389833248114334e-05, "loss": 0.1337, "step": 13450 }, { "epoch": 1.222580498660248, "grad_norm": 0.3495250940322876, "learning_rate": 7.385190032480239e-05, "loss": 0.2023, "step": 13460 }, { "epoch": 1.2234888051228485, "grad_norm": 0.32738739252090454, "learning_rate": 7.380544152392396e-05, "loss": 0.2042, "step": 13470 }, { "epoch": 1.2243971115854488, "grad_norm": 1.758453607559204, "learning_rate": 7.37589561304064e-05, "loss": 0.1992, "step": 13480 }, { "epoch": 1.2253054180480494, "grad_norm": 1.3445188999176025, "learning_rate": 7.37124441961777e-05, "loss": 0.1441, "step": 13490 }, { "epoch": 1.22621372451065, "grad_norm": 1.1823047399520874, "learning_rate": 7.36659057731955e-05, "loss": 0.1951, "step": 13500 }, { "epoch": 1.22621372451065, "eval_loss": 0.2194036990404129, "eval_runtime": 1094.7374, "eval_samples_per_second": 8.94, "eval_steps_per_second": 8.94, "step": 13500 }, { "epoch": 1.2271220309732505, "grad_norm": 0.8656460642814636, "learning_rate": 7.361934091344709e-05, "loss": 0.1825, "step": 13510 }, { "epoch": 1.2280303374358508, "grad_norm": 1.7109898328781128, "learning_rate": 7.357274966894923e-05, "loss": 0.1825, "step": 13520 }, { "epoch": 1.2289386438984513, "grad_norm": 0.14438281953334808, "learning_rate": 7.352613209174818e-05, "loss": 0.1612, "step": 13530 }, { "epoch": 1.2298469503610518, "grad_norm": 0.5100304484367371, "learning_rate": 7.347948823391963e-05, "loss": 0.1374, "step": 13540 }, { "epoch": 1.2307552568236524, "grad_norm": 3.461698532104492, "learning_rate": 7.343281814756858e-05, "loss": 0.1767, "step": 13550 }, { "epoch": 1.2316635632862527, "grad_norm": 2.794881582260132, "learning_rate": 7.338612188482941e-05, "loss": 0.1456, "step": 13560 }, { "epoch": 1.2325718697488532, "grad_norm": 0.6207141280174255, "learning_rate": 7.333939949786564e-05, "loss": 0.1055, "step": 13570 }, { "epoch": 1.2334801762114538, "grad_norm": 2.1050467491149902, "learning_rate": 7.329265103887005e-05, "loss": 0.1897, "step": 13580 }, { "epoch": 1.2343884826740543, "grad_norm": 1.0547115802764893, "learning_rate": 7.324587656006452e-05, "loss": 0.2025, "step": 13590 }, { "epoch": 1.2352967891366546, "grad_norm": 1.8303409814834595, "learning_rate": 7.319907611370001e-05, "loss": 0.1468, "step": 13600 }, { "epoch": 1.2362050955992552, "grad_norm": 0.7932450771331787, "learning_rate": 7.315224975205643e-05, "loss": 0.1876, "step": 13610 }, { "epoch": 1.2371134020618557, "grad_norm": 3.4797329902648926, "learning_rate": 7.310539752744273e-05, "loss": 0.1545, "step": 13620 }, { "epoch": 1.2380217085244563, "grad_norm": 0.4182676076889038, "learning_rate": 7.305851949219667e-05, "loss": 0.1106, "step": 13630 }, { "epoch": 1.2389300149870566, "grad_norm": 0.7598378658294678, "learning_rate": 7.30116156986849e-05, "loss": 0.1506, "step": 13640 }, { "epoch": 1.239838321449657, "grad_norm": 1.2624329328536987, "learning_rate": 7.296468619930281e-05, "loss": 0.1897, "step": 13650 }, { "epoch": 1.2407466279122576, "grad_norm": 3.121011734008789, "learning_rate": 7.291773104647453e-05, "loss": 0.1565, "step": 13660 }, { "epoch": 1.241654934374858, "grad_norm": 2.1170454025268555, "learning_rate": 7.287075029265281e-05, "loss": 0.1261, "step": 13670 }, { "epoch": 1.2425632408374585, "grad_norm": 0.6129699349403381, "learning_rate": 7.282374399031905e-05, "loss": 0.1603, "step": 13680 }, { "epoch": 1.243471547300059, "grad_norm": 0.9258622527122498, "learning_rate": 7.277671219198313e-05, "loss": 0.2408, "step": 13690 }, { "epoch": 1.2443798537626596, "grad_norm": 2.256784200668335, "learning_rate": 7.272965495018347e-05, "loss": 0.1725, "step": 13700 }, { "epoch": 1.2452881602252601, "grad_norm": 1.2712091207504272, "learning_rate": 7.268257231748688e-05, "loss": 0.1946, "step": 13710 }, { "epoch": 1.2461964666878604, "grad_norm": 1.1057032346725464, "learning_rate": 7.263546434648852e-05, "loss": 0.1142, "step": 13720 }, { "epoch": 1.247104773150461, "grad_norm": 2.197641134262085, "learning_rate": 7.258833108981189e-05, "loss": 0.2028, "step": 13730 }, { "epoch": 1.2480130796130615, "grad_norm": 1.916194200515747, "learning_rate": 7.254117260010873e-05, "loss": 0.1816, "step": 13740 }, { "epoch": 1.2489213860756618, "grad_norm": 0.8462644219398499, "learning_rate": 7.249398893005895e-05, "loss": 0.1887, "step": 13750 }, { "epoch": 1.2498296925382624, "grad_norm": 2.036501169204712, "learning_rate": 7.244678013237059e-05, "loss": 0.1877, "step": 13760 }, { "epoch": 1.250737999000863, "grad_norm": 1.6229236125946045, "learning_rate": 7.239954625977974e-05, "loss": 0.1497, "step": 13770 }, { "epoch": 1.2516463054634634, "grad_norm": 0.26468223333358765, "learning_rate": 7.235228736505058e-05, "loss": 0.2009, "step": 13780 }, { "epoch": 1.252554611926064, "grad_norm": 0.5471442937850952, "learning_rate": 7.230500350097518e-05, "loss": 0.195, "step": 13790 }, { "epoch": 1.2534629183886643, "grad_norm": 2.0027058124542236, "learning_rate": 7.225769472037346e-05, "loss": 0.1514, "step": 13800 }, { "epoch": 1.2543712248512648, "grad_norm": 0.7462431192398071, "learning_rate": 7.221036107609329e-05, "loss": 0.173, "step": 13810 }, { "epoch": 1.2552795313138652, "grad_norm": 1.2732115983963013, "learning_rate": 7.21630026210102e-05, "loss": 0.1766, "step": 13820 }, { "epoch": 1.2561878377764657, "grad_norm": 2.289412498474121, "learning_rate": 7.211561940802751e-05, "loss": 0.2132, "step": 13830 }, { "epoch": 1.2570961442390662, "grad_norm": 1.104158639907837, "learning_rate": 7.206821149007616e-05, "loss": 0.1904, "step": 13840 }, { "epoch": 1.2580044507016668, "grad_norm": 0.0986461266875267, "learning_rate": 7.202077892011473e-05, "loss": 0.1706, "step": 13850 }, { "epoch": 1.2589127571642673, "grad_norm": 0.8207359910011292, "learning_rate": 7.197332175112928e-05, "loss": 0.184, "step": 13860 }, { "epoch": 1.2598210636268676, "grad_norm": 0.6438146233558655, "learning_rate": 7.192584003613335e-05, "loss": 0.1825, "step": 13870 }, { "epoch": 1.2607293700894682, "grad_norm": 0.6308954358100891, "learning_rate": 7.1878333828168e-05, "loss": 0.1366, "step": 13880 }, { "epoch": 1.2616376765520687, "grad_norm": 0.6915921568870544, "learning_rate": 7.18308031803015e-05, "loss": 0.1248, "step": 13890 }, { "epoch": 1.262545983014669, "grad_norm": 1.8959964513778687, "learning_rate": 7.178324814562956e-05, "loss": 0.1867, "step": 13900 }, { "epoch": 1.2634542894772696, "grad_norm": 1.8790920972824097, "learning_rate": 7.173566877727504e-05, "loss": 0.1586, "step": 13910 }, { "epoch": 1.26436259593987, "grad_norm": 0.6636154651641846, "learning_rate": 7.168806512838802e-05, "loss": 0.1568, "step": 13920 }, { "epoch": 1.2652709024024706, "grad_norm": 1.623907208442688, "learning_rate": 7.16404372521457e-05, "loss": 0.1519, "step": 13930 }, { "epoch": 1.2661792088650712, "grad_norm": 2.208629846572876, "learning_rate": 7.159278520175232e-05, "loss": 0.1697, "step": 13940 }, { "epoch": 1.2670875153276715, "grad_norm": 0.8366792798042297, "learning_rate": 7.154510903043918e-05, "loss": 0.1321, "step": 13950 }, { "epoch": 1.267995821790272, "grad_norm": 0.6579391360282898, "learning_rate": 7.149740879146449e-05, "loss": 0.2316, "step": 13960 }, { "epoch": 1.2689041282528726, "grad_norm": 0.7884989976882935, "learning_rate": 7.144968453811331e-05, "loss": 0.111, "step": 13970 }, { "epoch": 1.269812434715473, "grad_norm": 2.2036261558532715, "learning_rate": 7.140193632369759e-05, "loss": 0.1371, "step": 13980 }, { "epoch": 1.2707207411780734, "grad_norm": 3.0888500213623047, "learning_rate": 7.1354164201556e-05, "loss": 0.1823, "step": 13990 }, { "epoch": 1.271629047640674, "grad_norm": 2.282102346420288, "learning_rate": 7.130636822505396e-05, "loss": 0.1833, "step": 14000 }, { "epoch": 1.271629047640674, "eval_loss": 0.219380721449852, "eval_runtime": 1096.0942, "eval_samples_per_second": 8.929, "eval_steps_per_second": 8.929, "step": 14000 }, { "epoch": 1.2725373541032745, "grad_norm": 2.4674768447875977, "learning_rate": 7.12585484475835e-05, "loss": 0.1941, "step": 14010 }, { "epoch": 1.273445660565875, "grad_norm": 2.7098309993743896, "learning_rate": 7.121070492256324e-05, "loss": 0.2351, "step": 14020 }, { "epoch": 1.2743539670284754, "grad_norm": 1.9002119302749634, "learning_rate": 7.116283770343838e-05, "loss": 0.2092, "step": 14030 }, { "epoch": 1.275262273491076, "grad_norm": 0.9362668991088867, "learning_rate": 7.111494684368048e-05, "loss": 0.257, "step": 14040 }, { "epoch": 1.2761705799536764, "grad_norm": 0.8297501802444458, "learning_rate": 7.106703239678763e-05, "loss": 0.1461, "step": 14050 }, { "epoch": 1.2770788864162768, "grad_norm": 2.861426830291748, "learning_rate": 7.10190944162842e-05, "loss": 0.1575, "step": 14060 }, { "epoch": 1.2779871928788773, "grad_norm": 1.420722484588623, "learning_rate": 7.097113295572084e-05, "loss": 0.2471, "step": 14070 }, { "epoch": 1.2788954993414778, "grad_norm": 0.9247449636459351, "learning_rate": 7.09231480686745e-05, "loss": 0.1739, "step": 14080 }, { "epoch": 1.2798038058040784, "grad_norm": 1.744306206703186, "learning_rate": 7.08751398087482e-05, "loss": 0.186, "step": 14090 }, { "epoch": 1.280712112266679, "grad_norm": 1.2001882791519165, "learning_rate": 7.082710822957116e-05, "loss": 0.1587, "step": 14100 }, { "epoch": 1.2816204187292792, "grad_norm": 2.6305370330810547, "learning_rate": 7.077905338479858e-05, "loss": 0.173, "step": 14110 }, { "epoch": 1.2825287251918798, "grad_norm": 2.545351982116699, "learning_rate": 7.07309753281117e-05, "loss": 0.2207, "step": 14120 }, { "epoch": 1.28343703165448, "grad_norm": 0.8456815481185913, "learning_rate": 7.068287411321768e-05, "loss": 0.2266, "step": 14130 }, { "epoch": 1.2843453381170806, "grad_norm": 2.3172006607055664, "learning_rate": 7.06347497938495e-05, "loss": 0.2003, "step": 14140 }, { "epoch": 1.2852536445796812, "grad_norm": 1.4336159229278564, "learning_rate": 7.058660242376603e-05, "loss": 0.1513, "step": 14150 }, { "epoch": 1.2861619510422817, "grad_norm": 2.412945508956909, "learning_rate": 7.053843205675181e-05, "loss": 0.1767, "step": 14160 }, { "epoch": 1.2870702575048822, "grad_norm": 0.9974110722541809, "learning_rate": 7.049023874661716e-05, "loss": 0.184, "step": 14170 }, { "epoch": 1.2879785639674826, "grad_norm": 1.3377546072006226, "learning_rate": 7.044202254719793e-05, "loss": 0.1786, "step": 14180 }, { "epoch": 1.288886870430083, "grad_norm": 0.8081949949264526, "learning_rate": 7.03937835123556e-05, "loss": 0.1966, "step": 14190 }, { "epoch": 1.2897951768926836, "grad_norm": 1.0715595483779907, "learning_rate": 7.034552169597714e-05, "loss": 0.1695, "step": 14200 }, { "epoch": 1.290703483355284, "grad_norm": 0.5568915605545044, "learning_rate": 7.0297237151975e-05, "loss": 0.1485, "step": 14210 }, { "epoch": 1.2916117898178845, "grad_norm": 3.0087709426879883, "learning_rate": 7.024892993428696e-05, "loss": 0.3131, "step": 14220 }, { "epoch": 1.292520096280485, "grad_norm": 1.0324058532714844, "learning_rate": 7.02006000968762e-05, "loss": 0.1461, "step": 14230 }, { "epoch": 1.2934284027430856, "grad_norm": 1.9866834878921509, "learning_rate": 7.01522476937311e-05, "loss": 0.129, "step": 14240 }, { "epoch": 1.294336709205686, "grad_norm": 1.8372185230255127, "learning_rate": 7.010387277886528e-05, "loss": 0.204, "step": 14250 }, { "epoch": 1.2952450156682864, "grad_norm": 0.7004584670066833, "learning_rate": 7.005547540631752e-05, "loss": 0.1359, "step": 14260 }, { "epoch": 1.296153322130887, "grad_norm": 0.6821761131286621, "learning_rate": 7.000705563015164e-05, "loss": 0.1326, "step": 14270 }, { "epoch": 1.2970616285934875, "grad_norm": 0.9309883713722229, "learning_rate": 6.995861350445656e-05, "loss": 0.1352, "step": 14280 }, { "epoch": 1.2979699350560878, "grad_norm": 0.7867736220359802, "learning_rate": 6.99101490833461e-05, "loss": 0.0806, "step": 14290 }, { "epoch": 1.2988782415186884, "grad_norm": 0.3914531469345093, "learning_rate": 6.9861662420959e-05, "loss": 0.1035, "step": 14300 }, { "epoch": 1.299786547981289, "grad_norm": 2.1396143436431885, "learning_rate": 6.98131535714589e-05, "loss": 0.1832, "step": 14310 }, { "epoch": 1.3006948544438894, "grad_norm": 0.6994645595550537, "learning_rate": 6.976462258903414e-05, "loss": 0.1681, "step": 14320 }, { "epoch": 1.30160316090649, "grad_norm": 1.629506230354309, "learning_rate": 6.971606952789784e-05, "loss": 0.2115, "step": 14330 }, { "epoch": 1.3025114673690903, "grad_norm": 0.377987265586853, "learning_rate": 6.966749444228775e-05, "loss": 0.1968, "step": 14340 }, { "epoch": 1.3034197738316908, "grad_norm": 3.5439963340759277, "learning_rate": 6.961889738646627e-05, "loss": 0.2121, "step": 14350 }, { "epoch": 1.3043280802942914, "grad_norm": 0.5641065239906311, "learning_rate": 6.95702784147203e-05, "loss": 0.1866, "step": 14360 }, { "epoch": 1.3052363867568917, "grad_norm": 2.147791862487793, "learning_rate": 6.952163758136123e-05, "loss": 0.1243, "step": 14370 }, { "epoch": 1.3061446932194922, "grad_norm": 0.7155373692512512, "learning_rate": 6.947297494072491e-05, "loss": 0.1435, "step": 14380 }, { "epoch": 1.3070529996820928, "grad_norm": 1.742484450340271, "learning_rate": 6.942429054717148e-05, "loss": 0.1247, "step": 14390 }, { "epoch": 1.3079613061446933, "grad_norm": 1.0457918643951416, "learning_rate": 6.937558445508544e-05, "loss": 0.1396, "step": 14400 }, { "epoch": 1.3088696126072936, "grad_norm": 1.2992767095565796, "learning_rate": 6.932685671887548e-05, "loss": 0.1502, "step": 14410 }, { "epoch": 1.3097779190698942, "grad_norm": 1.2253739833831787, "learning_rate": 6.927810739297454e-05, "loss": 0.2161, "step": 14420 }, { "epoch": 1.3106862255324947, "grad_norm": 0.7989453077316284, "learning_rate": 6.922933653183961e-05, "loss": 0.2311, "step": 14430 }, { "epoch": 1.311594531995095, "grad_norm": 1.1667064428329468, "learning_rate": 6.918054418995174e-05, "loss": 0.1492, "step": 14440 }, { "epoch": 1.3125028384576956, "grad_norm": 0.4417150020599365, "learning_rate": 6.913173042181601e-05, "loss": 0.1298, "step": 14450 }, { "epoch": 1.313411144920296, "grad_norm": 1.577610731124878, "learning_rate": 6.908289528196144e-05, "loss": 0.1501, "step": 14460 }, { "epoch": 1.3143194513828966, "grad_norm": 1.6019408702850342, "learning_rate": 6.903403882494088e-05, "loss": 0.2312, "step": 14470 }, { "epoch": 1.3152277578454972, "grad_norm": 1.6584210395812988, "learning_rate": 6.898516110533103e-05, "loss": 0.2627, "step": 14480 }, { "epoch": 1.3161360643080975, "grad_norm": 0.9417382478713989, "learning_rate": 6.89362621777323e-05, "loss": 0.1863, "step": 14490 }, { "epoch": 1.317044370770698, "grad_norm": 1.0271172523498535, "learning_rate": 6.888734209676886e-05, "loss": 0.1244, "step": 14500 }, { "epoch": 1.317044370770698, "eval_loss": 0.21792705357074738, "eval_runtime": 1103.0391, "eval_samples_per_second": 8.873, "eval_steps_per_second": 8.873, "step": 14500 }, { "epoch": 1.3179526772332986, "grad_norm": 0.7572372555732727, "learning_rate": 6.883840091708843e-05, "loss": 0.084, "step": 14510 }, { "epoch": 1.3188609836958989, "grad_norm": 1.5934282541275024, "learning_rate": 6.878943869336241e-05, "loss": 0.168, "step": 14520 }, { "epoch": 1.3197692901584994, "grad_norm": 1.921298623085022, "learning_rate": 6.874045548028556e-05, "loss": 0.1793, "step": 14530 }, { "epoch": 1.3206775966211, "grad_norm": 0.3300674855709076, "learning_rate": 6.86914513325762e-05, "loss": 0.2234, "step": 14540 }, { "epoch": 1.3215859030837005, "grad_norm": 1.1796507835388184, "learning_rate": 6.864242630497599e-05, "loss": 0.1688, "step": 14550 }, { "epoch": 1.322494209546301, "grad_norm": 0.7300209403038025, "learning_rate": 6.859338045224992e-05, "loss": 0.1493, "step": 14560 }, { "epoch": 1.3234025160089014, "grad_norm": 0.20270337164402008, "learning_rate": 6.854431382918627e-05, "loss": 0.1553, "step": 14570 }, { "epoch": 1.324310822471502, "grad_norm": 1.9958044290542603, "learning_rate": 6.849522649059647e-05, "loss": 0.2503, "step": 14580 }, { "epoch": 1.3252191289341024, "grad_norm": 1.0165281295776367, "learning_rate": 6.844611849131514e-05, "loss": 0.201, "step": 14590 }, { "epoch": 1.3261274353967027, "grad_norm": 0.11845185607671738, "learning_rate": 6.839698988619996e-05, "loss": 0.1686, "step": 14600 }, { "epoch": 1.3270357418593033, "grad_norm": 2.311715841293335, "learning_rate": 6.834784073013163e-05, "loss": 0.2319, "step": 14610 }, { "epoch": 1.3279440483219038, "grad_norm": 2.934135675430298, "learning_rate": 6.82986710780138e-05, "loss": 0.13, "step": 14620 }, { "epoch": 1.3288523547845044, "grad_norm": 2.8812201023101807, "learning_rate": 6.824948098477302e-05, "loss": 0.1537, "step": 14630 }, { "epoch": 1.329760661247105, "grad_norm": 0.9378868937492371, "learning_rate": 6.820027050535866e-05, "loss": 0.0893, "step": 14640 }, { "epoch": 1.3306689677097052, "grad_norm": 3.622267246246338, "learning_rate": 6.815103969474293e-05, "loss": 0.2171, "step": 14650 }, { "epoch": 1.3315772741723058, "grad_norm": 3.234666347503662, "learning_rate": 6.810178860792064e-05, "loss": 0.1535, "step": 14660 }, { "epoch": 1.3324855806349063, "grad_norm": 1.2290953397750854, "learning_rate": 6.805251729990937e-05, "loss": 0.15, "step": 14670 }, { "epoch": 1.3333938870975066, "grad_norm": 0.455130010843277, "learning_rate": 6.800322582574918e-05, "loss": 0.1863, "step": 14680 }, { "epoch": 1.3343021935601072, "grad_norm": 1.2077858448028564, "learning_rate": 6.795391424050275e-05, "loss": 0.1649, "step": 14690 }, { "epoch": 1.3352105000227077, "grad_norm": 1.2733981609344482, "learning_rate": 6.790458259925514e-05, "loss": 0.1019, "step": 14700 }, { "epoch": 1.3361188064853082, "grad_norm": 0.956073522567749, "learning_rate": 6.785523095711387e-05, "loss": 0.1531, "step": 14710 }, { "epoch": 1.3370271129479085, "grad_norm": 0.8781226873397827, "learning_rate": 6.78058593692088e-05, "loss": 0.1726, "step": 14720 }, { "epoch": 1.337935419410509, "grad_norm": 1.9246735572814941, "learning_rate": 6.775646789069207e-05, "loss": 0.196, "step": 14730 }, { "epoch": 1.3388437258731096, "grad_norm": 0.703194797039032, "learning_rate": 6.7707056576738e-05, "loss": 0.17, "step": 14740 }, { "epoch": 1.33975203233571, "grad_norm": 0.508236825466156, "learning_rate": 6.765762548254313e-05, "loss": 0.0966, "step": 14750 }, { "epoch": 1.3406603387983105, "grad_norm": 0.6846745014190674, "learning_rate": 6.760817466332605e-05, "loss": 0.1967, "step": 14760 }, { "epoch": 1.341568645260911, "grad_norm": 0.8807492852210999, "learning_rate": 6.755870417432739e-05, "loss": 0.1072, "step": 14770 }, { "epoch": 1.3424769517235116, "grad_norm": 1.7318761348724365, "learning_rate": 6.750921407080978e-05, "loss": 0.2072, "step": 14780 }, { "epoch": 1.343385258186112, "grad_norm": 1.0225956439971924, "learning_rate": 6.745970440805771e-05, "loss": 0.124, "step": 14790 }, { "epoch": 1.3442935646487124, "grad_norm": 1.6278802156448364, "learning_rate": 6.74101752413776e-05, "loss": 0.1757, "step": 14800 }, { "epoch": 1.345201871111313, "grad_norm": 2.064058780670166, "learning_rate": 6.736062662609755e-05, "loss": 0.1941, "step": 14810 }, { "epoch": 1.3461101775739135, "grad_norm": 1.4448683261871338, "learning_rate": 6.73110586175675e-05, "loss": 0.1855, "step": 14820 }, { "epoch": 1.3470184840365138, "grad_norm": 1.6704026460647583, "learning_rate": 6.726147127115897e-05, "loss": 0.2115, "step": 14830 }, { "epoch": 1.3479267904991143, "grad_norm": 1.2982239723205566, "learning_rate": 6.721186464226513e-05, "loss": 0.1655, "step": 14840 }, { "epoch": 1.3488350969617149, "grad_norm": 1.0922256708145142, "learning_rate": 6.716223878630063e-05, "loss": 0.2068, "step": 14850 }, { "epoch": 1.3497434034243154, "grad_norm": 2.5821661949157715, "learning_rate": 6.711259375870167e-05, "loss": 0.112, "step": 14860 }, { "epoch": 1.350651709886916, "grad_norm": 2.0402281284332275, "learning_rate": 6.706292961492587e-05, "loss": 0.1344, "step": 14870 }, { "epoch": 1.3515600163495163, "grad_norm": 0.24603070318698883, "learning_rate": 6.701324641045209e-05, "loss": 0.1589, "step": 14880 }, { "epoch": 1.3524683228121168, "grad_norm": 1.2240370512008667, "learning_rate": 6.696354420078062e-05, "loss": 0.1805, "step": 14890 }, { "epoch": 1.3533766292747174, "grad_norm": 0.4371587634086609, "learning_rate": 6.69138230414329e-05, "loss": 0.2486, "step": 14900 }, { "epoch": 1.3542849357373177, "grad_norm": 1.1898268461227417, "learning_rate": 6.686408298795157e-05, "loss": 0.1767, "step": 14910 }, { "epoch": 1.3551932421999182, "grad_norm": 2.5442121028900146, "learning_rate": 6.681432409590037e-05, "loss": 0.174, "step": 14920 }, { "epoch": 1.3561015486625188, "grad_norm": 0.9221041202545166, "learning_rate": 6.676454642086407e-05, "loss": 0.1375, "step": 14930 }, { "epoch": 1.3570098551251193, "grad_norm": 0.9261654019355774, "learning_rate": 6.671475001844843e-05, "loss": 0.1221, "step": 14940 }, { "epoch": 1.3579181615877198, "grad_norm": 0.9628563523292542, "learning_rate": 6.666493494428016e-05, "loss": 0.1849, "step": 14950 }, { "epoch": 1.3588264680503201, "grad_norm": 1.5712580680847168, "learning_rate": 6.661510125400676e-05, "loss": 0.1518, "step": 14960 }, { "epoch": 1.3597347745129207, "grad_norm": 0.4707755148410797, "learning_rate": 6.65652490032966e-05, "loss": 0.1658, "step": 14970 }, { "epoch": 1.360643080975521, "grad_norm": 1.8163018226623535, "learning_rate": 6.651537824783875e-05, "loss": 0.1567, "step": 14980 }, { "epoch": 1.3615513874381215, "grad_norm": 1.029680848121643, "learning_rate": 6.646548904334293e-05, "loss": 0.1749, "step": 14990 }, { "epoch": 1.362459693900722, "grad_norm": 0.6355626583099365, "learning_rate": 6.641558144553952e-05, "loss": 0.1624, "step": 15000 }, { "epoch": 1.362459693900722, "eval_loss": 0.2152995467185974, "eval_runtime": 1100.5294, "eval_samples_per_second": 8.893, "eval_steps_per_second": 8.893, "step": 15000 }, { "epoch": 1.3633680003633226, "grad_norm": 2.2520830631256104, "learning_rate": 6.636565551017939e-05, "loss": 0.162, "step": 15010 }, { "epoch": 1.3642763068259232, "grad_norm": 3.8823604583740234, "learning_rate": 6.631571129303393e-05, "loss": 0.1968, "step": 15020 }, { "epoch": 1.3651846132885235, "grad_norm": 0.9022207856178284, "learning_rate": 6.626574884989497e-05, "loss": 0.2315, "step": 15030 }, { "epoch": 1.366092919751124, "grad_norm": 1.0363821983337402, "learning_rate": 6.621576823657463e-05, "loss": 0.1429, "step": 15040 }, { "epoch": 1.3670012262137246, "grad_norm": 0.3583439588546753, "learning_rate": 6.616576950890541e-05, "loss": 0.2018, "step": 15050 }, { "epoch": 1.3679095326763249, "grad_norm": 1.9663238525390625, "learning_rate": 6.611575272273999e-05, "loss": 0.1486, "step": 15060 }, { "epoch": 1.3688178391389254, "grad_norm": 1.7342723608016968, "learning_rate": 6.606571793395125e-05, "loss": 0.1424, "step": 15070 }, { "epoch": 1.369726145601526, "grad_norm": 1.0440753698349, "learning_rate": 6.601566519843214e-05, "loss": 0.1792, "step": 15080 }, { "epoch": 1.3706344520641265, "grad_norm": 3.501249074935913, "learning_rate": 6.596559457209574e-05, "loss": 0.1905, "step": 15090 }, { "epoch": 1.371542758526727, "grad_norm": 1.4709155559539795, "learning_rate": 6.591550611087502e-05, "loss": 0.2119, "step": 15100 }, { "epoch": 1.3724510649893273, "grad_norm": 0.46741583943367004, "learning_rate": 6.586539987072294e-05, "loss": 0.1429, "step": 15110 }, { "epoch": 1.3733593714519279, "grad_norm": 1.9428534507751465, "learning_rate": 6.58152759076123e-05, "loss": 0.2149, "step": 15120 }, { "epoch": 1.3742676779145284, "grad_norm": 1.1681013107299805, "learning_rate": 6.576513427753568e-05, "loss": 0.19, "step": 15130 }, { "epoch": 1.3751759843771287, "grad_norm": 0.7512862682342529, "learning_rate": 6.571497503650542e-05, "loss": 0.1253, "step": 15140 }, { "epoch": 1.3760842908397293, "grad_norm": 0.9386667609214783, "learning_rate": 6.566479824055353e-05, "loss": 0.1846, "step": 15150 }, { "epoch": 1.3769925973023298, "grad_norm": 2.1254653930664062, "learning_rate": 6.56146039457316e-05, "loss": 0.1561, "step": 15160 }, { "epoch": 1.3779009037649304, "grad_norm": 0.9035351276397705, "learning_rate": 6.556439220811082e-05, "loss": 0.1143, "step": 15170 }, { "epoch": 1.378809210227531, "grad_norm": 2.247997999191284, "learning_rate": 6.551416308378183e-05, "loss": 0.1396, "step": 15180 }, { "epoch": 1.3797175166901312, "grad_norm": 1.5087502002716064, "learning_rate": 6.54639166288547e-05, "loss": 0.1406, "step": 15190 }, { "epoch": 1.3806258231527317, "grad_norm": 1.6437585353851318, "learning_rate": 6.541365289945884e-05, "loss": 0.1136, "step": 15200 }, { "epoch": 1.3815341296153323, "grad_norm": 2.193575382232666, "learning_rate": 6.536337195174301e-05, "loss": 0.212, "step": 15210 }, { "epoch": 1.3824424360779326, "grad_norm": 3.3469865322113037, "learning_rate": 6.531307384187515e-05, "loss": 0.2296, "step": 15220 }, { "epoch": 1.3833507425405331, "grad_norm": 0.4638090133666992, "learning_rate": 6.526275862604238e-05, "loss": 0.1743, "step": 15230 }, { "epoch": 1.3842590490031337, "grad_norm": 0.663306713104248, "learning_rate": 6.521242636045097e-05, "loss": 0.1654, "step": 15240 }, { "epoch": 1.3851673554657342, "grad_norm": 4.137877464294434, "learning_rate": 6.516207710132618e-05, "loss": 0.1731, "step": 15250 }, { "epoch": 1.3860756619283348, "grad_norm": 0.45193251967430115, "learning_rate": 6.51117109049123e-05, "loss": 0.1714, "step": 15260 }, { "epoch": 1.386983968390935, "grad_norm": 1.1134235858917236, "learning_rate": 6.50613278274725e-05, "loss": 0.1711, "step": 15270 }, { "epoch": 1.3878922748535356, "grad_norm": 0.7902368307113647, "learning_rate": 6.501092792528886e-05, "loss": 0.1316, "step": 15280 }, { "epoch": 1.388800581316136, "grad_norm": 0.5250023007392883, "learning_rate": 6.49605112546622e-05, "loss": 0.1366, "step": 15290 }, { "epoch": 1.3897088877787365, "grad_norm": 0.8733217120170593, "learning_rate": 6.491007787191208e-05, "loss": 0.1696, "step": 15300 }, { "epoch": 1.390617194241337, "grad_norm": 1.8519527912139893, "learning_rate": 6.485962783337676e-05, "loss": 0.0822, "step": 15310 }, { "epoch": 1.3915255007039375, "grad_norm": 0.8873314261436462, "learning_rate": 6.48091611954131e-05, "loss": 0.1717, "step": 15320 }, { "epoch": 1.392433807166538, "grad_norm": 0.42509889602661133, "learning_rate": 6.475867801439646e-05, "loss": 0.1067, "step": 15330 }, { "epoch": 1.3933421136291384, "grad_norm": 1.9646755456924438, "learning_rate": 6.470817834672076e-05, "loss": 0.1928, "step": 15340 }, { "epoch": 1.394250420091739, "grad_norm": 0.9863916635513306, "learning_rate": 6.465766224879825e-05, "loss": 0.2153, "step": 15350 }, { "epoch": 1.3951587265543395, "grad_norm": 0.5503681898117065, "learning_rate": 6.460712977705958e-05, "loss": 0.1622, "step": 15360 }, { "epoch": 1.3960670330169398, "grad_norm": 1.126084804534912, "learning_rate": 6.455658098795369e-05, "loss": 0.1575, "step": 15370 }, { "epoch": 1.3969753394795403, "grad_norm": 1.676141619682312, "learning_rate": 6.450601593794773e-05, "loss": 0.1735, "step": 15380 }, { "epoch": 1.3978836459421409, "grad_norm": 1.2387011051177979, "learning_rate": 6.445543468352705e-05, "loss": 0.246, "step": 15390 }, { "epoch": 1.3987919524047414, "grad_norm": 1.9781464338302612, "learning_rate": 6.440483728119505e-05, "loss": 0.2235, "step": 15400 }, { "epoch": 1.399700258867342, "grad_norm": 0.6322752833366394, "learning_rate": 6.435422378747321e-05, "loss": 0.1809, "step": 15410 }, { "epoch": 1.4006085653299423, "grad_norm": 0.5201440453529358, "learning_rate": 6.430359425890098e-05, "loss": 0.1796, "step": 15420 }, { "epoch": 1.4015168717925428, "grad_norm": 0.42512595653533936, "learning_rate": 6.42529487520357e-05, "loss": 0.143, "step": 15430 }, { "epoch": 1.4024251782551433, "grad_norm": 2.66516375541687, "learning_rate": 6.420228732345257e-05, "loss": 0.1507, "step": 15440 }, { "epoch": 1.4033334847177437, "grad_norm": 2.6636369228363037, "learning_rate": 6.415161002974459e-05, "loss": 0.2054, "step": 15450 }, { "epoch": 1.4042417911803442, "grad_norm": 1.1182680130004883, "learning_rate": 6.410091692752246e-05, "loss": 0.1475, "step": 15460 }, { "epoch": 1.4051500976429447, "grad_norm": 3.3351638317108154, "learning_rate": 6.405020807341458e-05, "loss": 0.1483, "step": 15470 }, { "epoch": 1.4060584041055453, "grad_norm": 1.442475438117981, "learning_rate": 6.399948352406686e-05, "loss": 0.1675, "step": 15480 }, { "epoch": 1.4069667105681458, "grad_norm": 0.11476827412843704, "learning_rate": 6.394874333614287e-05, "loss": 0.1433, "step": 15490 }, { "epoch": 1.4078750170307461, "grad_norm": 0.4714938700199127, "learning_rate": 6.389798756632355e-05, "loss": 0.2119, "step": 15500 }, { "epoch": 1.4078750170307461, "eval_loss": 0.21516531705856323, "eval_runtime": 1084.7463, "eval_samples_per_second": 9.022, "eval_steps_per_second": 9.022, "step": 15500 }, { "epoch": 1.4087833234933467, "grad_norm": 0.5127773880958557, "learning_rate": 6.384721627130725e-05, "loss": 0.1551, "step": 15510 }, { "epoch": 1.4096916299559472, "grad_norm": 0.22540366649627686, "learning_rate": 6.379642950780971e-05, "loss": 0.1535, "step": 15520 }, { "epoch": 1.4105999364185475, "grad_norm": 1.3485805988311768, "learning_rate": 6.374562733256393e-05, "loss": 0.1649, "step": 15530 }, { "epoch": 1.411508242881148, "grad_norm": 2.2201147079467773, "learning_rate": 6.369480980232013e-05, "loss": 0.1769, "step": 15540 }, { "epoch": 1.4124165493437486, "grad_norm": 0.37344422936439514, "learning_rate": 6.364397697384568e-05, "loss": 0.1718, "step": 15550 }, { "epoch": 1.4133248558063491, "grad_norm": 1.1102111339569092, "learning_rate": 6.359312890392501e-05, "loss": 0.1632, "step": 15560 }, { "epoch": 1.4142331622689495, "grad_norm": 1.2498908042907715, "learning_rate": 6.354226564935964e-05, "loss": 0.1368, "step": 15570 }, { "epoch": 1.41514146873155, "grad_norm": 0.8289869427680969, "learning_rate": 6.349138726696797e-05, "loss": 0.1915, "step": 15580 }, { "epoch": 1.4160497751941505, "grad_norm": 0.4715811312198639, "learning_rate": 6.344049381358539e-05, "loss": 0.1543, "step": 15590 }, { "epoch": 1.4169580816567509, "grad_norm": 0.5907606482505798, "learning_rate": 6.338958534606404e-05, "loss": 0.1432, "step": 15600 }, { "epoch": 1.4178663881193514, "grad_norm": 1.2866039276123047, "learning_rate": 6.333866192127292e-05, "loss": 0.1327, "step": 15610 }, { "epoch": 1.418774694581952, "grad_norm": 1.9341095685958862, "learning_rate": 6.328772359609765e-05, "loss": 0.1425, "step": 15620 }, { "epoch": 1.4196830010445525, "grad_norm": 2.3411977291107178, "learning_rate": 6.323677042744054e-05, "loss": 0.1206, "step": 15630 }, { "epoch": 1.420591307507153, "grad_norm": 0.3820275068283081, "learning_rate": 6.31858024722205e-05, "loss": 0.1833, "step": 15640 }, { "epoch": 1.4214996139697533, "grad_norm": 1.2570282220840454, "learning_rate": 6.31348197873729e-05, "loss": 0.1294, "step": 15650 }, { "epoch": 1.4224079204323539, "grad_norm": 0.5319797396659851, "learning_rate": 6.308382242984961e-05, "loss": 0.1669, "step": 15660 }, { "epoch": 1.4233162268949544, "grad_norm": 2.926335573196411, "learning_rate": 6.30328104566189e-05, "loss": 0.1734, "step": 15670 }, { "epoch": 1.4242245333575547, "grad_norm": 0.9096258878707886, "learning_rate": 6.298178392466533e-05, "loss": 0.1714, "step": 15680 }, { "epoch": 1.4251328398201553, "grad_norm": 1.7289985418319702, "learning_rate": 6.293074289098974e-05, "loss": 0.1564, "step": 15690 }, { "epoch": 1.4260411462827558, "grad_norm": 1.154532551765442, "learning_rate": 6.287968741260918e-05, "loss": 0.1694, "step": 15700 }, { "epoch": 1.4269494527453563, "grad_norm": 1.3784821033477783, "learning_rate": 6.282861754655682e-05, "loss": 0.1557, "step": 15710 }, { "epoch": 1.4278577592079569, "grad_norm": 0.7861212491989136, "learning_rate": 6.277753334988189e-05, "loss": 0.2071, "step": 15720 }, { "epoch": 1.4287660656705572, "grad_norm": 0.8418047428131104, "learning_rate": 6.272643487964965e-05, "loss": 0.2143, "step": 15730 }, { "epoch": 1.4296743721331577, "grad_norm": 0.6504663825035095, "learning_rate": 6.267532219294133e-05, "loss": 0.1443, "step": 15740 }, { "epoch": 1.4305826785957583, "grad_norm": 3.019587755203247, "learning_rate": 6.262419534685397e-05, "loss": 0.196, "step": 15750 }, { "epoch": 1.4314909850583586, "grad_norm": 0.05726870521903038, "learning_rate": 6.257305439850051e-05, "loss": 0.1302, "step": 15760 }, { "epoch": 1.4323992915209591, "grad_norm": 0.5308976173400879, "learning_rate": 6.252189940500958e-05, "loss": 0.1954, "step": 15770 }, { "epoch": 1.4333075979835597, "grad_norm": 2.8362884521484375, "learning_rate": 6.247073042352551e-05, "loss": 0.1874, "step": 15780 }, { "epoch": 1.4342159044461602, "grad_norm": 0.7620899081230164, "learning_rate": 6.241954751120829e-05, "loss": 0.1465, "step": 15790 }, { "epoch": 1.4351242109087607, "grad_norm": 2.2687206268310547, "learning_rate": 6.236835072523344e-05, "loss": 0.1075, "step": 15800 }, { "epoch": 1.436032517371361, "grad_norm": 2.2762129306793213, "learning_rate": 6.231714012279197e-05, "loss": 0.1511, "step": 15810 }, { "epoch": 1.4369408238339616, "grad_norm": 0.6106740832328796, "learning_rate": 6.226591576109039e-05, "loss": 0.1468, "step": 15820 }, { "epoch": 1.4378491302965621, "grad_norm": 1.7500579357147217, "learning_rate": 6.221467769735046e-05, "loss": 0.1986, "step": 15830 }, { "epoch": 1.4387574367591625, "grad_norm": 1.386141061782837, "learning_rate": 6.21634259888094e-05, "loss": 0.1549, "step": 15840 }, { "epoch": 1.439665743221763, "grad_norm": 1.2290318012237549, "learning_rate": 6.211216069271951e-05, "loss": 0.1473, "step": 15850 }, { "epoch": 1.4405740496843635, "grad_norm": 1.9283164739608765, "learning_rate": 6.206088186634842e-05, "loss": 0.1505, "step": 15860 }, { "epoch": 1.441482356146964, "grad_norm": 0.34665200114250183, "learning_rate": 6.200958956697876e-05, "loss": 0.1538, "step": 15870 }, { "epoch": 1.4423906626095644, "grad_norm": 0.5457804203033447, "learning_rate": 6.195828385190825e-05, "loss": 0.1602, "step": 15880 }, { "epoch": 1.443298969072165, "grad_norm": 2.8561136722564697, "learning_rate": 6.190696477844962e-05, "loss": 0.1749, "step": 15890 }, { "epoch": 1.4442072755347655, "grad_norm": 1.2272404432296753, "learning_rate": 6.18556324039305e-05, "loss": 0.1833, "step": 15900 }, { "epoch": 1.4451155819973658, "grad_norm": 1.1915199756622314, "learning_rate": 6.180428678569337e-05, "loss": 0.1478, "step": 15910 }, { "epoch": 1.4460238884599663, "grad_norm": 0.459766685962677, "learning_rate": 6.175292798109555e-05, "loss": 0.1502, "step": 15920 }, { "epoch": 1.4469321949225669, "grad_norm": 1.612210750579834, "learning_rate": 6.1701556047509e-05, "loss": 0.1403, "step": 15930 }, { "epoch": 1.4478405013851674, "grad_norm": 2.056227445602417, "learning_rate": 6.165017104232044e-05, "loss": 0.1509, "step": 15940 }, { "epoch": 1.448748807847768, "grad_norm": 0.5240591168403625, "learning_rate": 6.159877302293114e-05, "loss": 0.2078, "step": 15950 }, { "epoch": 1.4496571143103683, "grad_norm": 1.158430814743042, "learning_rate": 6.154736204675691e-05, "loss": 0.1185, "step": 15960 }, { "epoch": 1.4505654207729688, "grad_norm": 1.2941346168518066, "learning_rate": 6.149593817122809e-05, "loss": 0.1855, "step": 15970 }, { "epoch": 1.4514737272355693, "grad_norm": 0.4683038294315338, "learning_rate": 6.144450145378933e-05, "loss": 0.1541, "step": 15980 }, { "epoch": 1.4523820336981696, "grad_norm": 0.011105031706392765, "learning_rate": 6.139305195189972e-05, "loss": 0.1349, "step": 15990 }, { "epoch": 1.4532903401607702, "grad_norm": 4.2394232749938965, "learning_rate": 6.134158972303257e-05, "loss": 0.1696, "step": 16000 }, { "epoch": 1.4532903401607702, "eval_loss": 0.22273105382919312, "eval_runtime": 1104.6677, "eval_samples_per_second": 8.86, "eval_steps_per_second": 8.86, "step": 16000 }, { "epoch": 1.4541986466233707, "grad_norm": 0.4779214859008789, "learning_rate": 6.129011482467543e-05, "loss": 0.0992, "step": 16010 }, { "epoch": 1.4551069530859713, "grad_norm": 2.7576301097869873, "learning_rate": 6.123862731433002e-05, "loss": 0.2533, "step": 16020 }, { "epoch": 1.4560152595485718, "grad_norm": 0.37912681698799133, "learning_rate": 6.118712724951211e-05, "loss": 0.1453, "step": 16030 }, { "epoch": 1.4569235660111721, "grad_norm": 0.3018011152744293, "learning_rate": 6.113561468775154e-05, "loss": 0.1589, "step": 16040 }, { "epoch": 1.4578318724737727, "grad_norm": 3.384218454360962, "learning_rate": 6.108408968659203e-05, "loss": 0.1243, "step": 16050 }, { "epoch": 1.4587401789363732, "grad_norm": 1.5087846517562866, "learning_rate": 6.103255230359132e-05, "loss": 0.1757, "step": 16060 }, { "epoch": 1.4596484853989735, "grad_norm": 0.8756253123283386, "learning_rate": 6.098100259632087e-05, "loss": 0.1889, "step": 16070 }, { "epoch": 1.460556791861574, "grad_norm": 0.8924440741539001, "learning_rate": 6.0929440622365965e-05, "loss": 0.0819, "step": 16080 }, { "epoch": 1.4614650983241746, "grad_norm": 1.776936411857605, "learning_rate": 6.0877866439325573e-05, "loss": 0.221, "step": 16090 }, { "epoch": 1.4623734047867751, "grad_norm": 1.5033124685287476, "learning_rate": 6.082628010481229e-05, "loss": 0.1194, "step": 16100 }, { "epoch": 1.4632817112493757, "grad_norm": 0.021881716325879097, "learning_rate": 6.077468167645233e-05, "loss": 0.1156, "step": 16110 }, { "epoch": 1.464190017711976, "grad_norm": 0.9127677083015442, "learning_rate": 6.0723071211885375e-05, "loss": 0.1386, "step": 16120 }, { "epoch": 1.4650983241745765, "grad_norm": 0.850348174571991, "learning_rate": 6.067144876876455e-05, "loss": 0.2221, "step": 16130 }, { "epoch": 1.466006630637177, "grad_norm": 2.220471143722534, "learning_rate": 6.061981440475639e-05, "loss": 0.1872, "step": 16140 }, { "epoch": 1.4669149370997774, "grad_norm": 1.5224398374557495, "learning_rate": 6.056816817754074e-05, "loss": 0.12, "step": 16150 }, { "epoch": 1.467823243562378, "grad_norm": 2.4548869132995605, "learning_rate": 6.051651014481069e-05, "loss": 0.1401, "step": 16160 }, { "epoch": 1.4687315500249785, "grad_norm": 1.1147593259811401, "learning_rate": 6.046484036427249e-05, "loss": 0.2007, "step": 16170 }, { "epoch": 1.469639856487579, "grad_norm": 0.5807734131813049, "learning_rate": 6.041315889364558e-05, "loss": 0.126, "step": 16180 }, { "epoch": 1.4705481629501793, "grad_norm": 2.2539286613464355, "learning_rate": 6.0361465790662395e-05, "loss": 0.0939, "step": 16190 }, { "epoch": 1.4714564694127799, "grad_norm": 5.320196628570557, "learning_rate": 6.03097611130684e-05, "loss": 0.1442, "step": 16200 }, { "epoch": 1.4723647758753804, "grad_norm": 2.0236144065856934, "learning_rate": 6.025804491862197e-05, "loss": 0.1784, "step": 16210 }, { "epoch": 1.4732730823379807, "grad_norm": 2.13676118850708, "learning_rate": 6.020631726509437e-05, "loss": 0.1659, "step": 16220 }, { "epoch": 1.4741813888005812, "grad_norm": 0.559867262840271, "learning_rate": 6.015457821026963e-05, "loss": 0.1682, "step": 16230 }, { "epoch": 1.4750896952631818, "grad_norm": 2.7195417881011963, "learning_rate": 6.010282781194455e-05, "loss": 0.1188, "step": 16240 }, { "epoch": 1.4759980017257823, "grad_norm": 0.5787736773490906, "learning_rate": 6.005106612792858e-05, "loss": 0.1909, "step": 16250 }, { "epoch": 1.4769063081883829, "grad_norm": 0.2682819664478302, "learning_rate": 5.9999293216043784e-05, "loss": 0.1408, "step": 16260 }, { "epoch": 1.4778146146509832, "grad_norm": 0.8761358261108398, "learning_rate": 5.994750913412478e-05, "loss": 0.1817, "step": 16270 }, { "epoch": 1.4787229211135837, "grad_norm": 1.4522216320037842, "learning_rate": 5.989571394001865e-05, "loss": 0.1761, "step": 16280 }, { "epoch": 1.4796312275761843, "grad_norm": 0.0039059065748006105, "learning_rate": 5.984390769158489e-05, "loss": 0.1452, "step": 16290 }, { "epoch": 1.4805395340387846, "grad_norm": 3.1922082901000977, "learning_rate": 5.9792090446695346e-05, "loss": 0.2513, "step": 16300 }, { "epoch": 1.4814478405013851, "grad_norm": 2.2118520736694336, "learning_rate": 5.974026226323415e-05, "loss": 0.195, "step": 16310 }, { "epoch": 1.4823561469639857, "grad_norm": 0.06607570499181747, "learning_rate": 5.968842319909764e-05, "loss": 0.11, "step": 16320 }, { "epoch": 1.4832644534265862, "grad_norm": 0.6276015639305115, "learning_rate": 5.9636573312194346e-05, "loss": 0.127, "step": 16330 }, { "epoch": 1.4841727598891867, "grad_norm": 2.851334810256958, "learning_rate": 5.958471266044484e-05, "loss": 0.2351, "step": 16340 }, { "epoch": 1.485081066351787, "grad_norm": 1.1764429807662964, "learning_rate": 5.953284130178175e-05, "loss": 0.113, "step": 16350 }, { "epoch": 1.4859893728143876, "grad_norm": 1.8004001379013062, "learning_rate": 5.948095929414968e-05, "loss": 0.1765, "step": 16360 }, { "epoch": 1.4868976792769881, "grad_norm": 0.9153022766113281, "learning_rate": 5.9429066695505065e-05, "loss": 0.1196, "step": 16370 }, { "epoch": 1.4878059857395884, "grad_norm": 0.07822104543447495, "learning_rate": 5.937716356381624e-05, "loss": 0.1433, "step": 16380 }, { "epoch": 1.488714292202189, "grad_norm": 0.6560503840446472, "learning_rate": 5.9325249957063266e-05, "loss": 0.1165, "step": 16390 }, { "epoch": 1.4896225986647895, "grad_norm": 0.7428843379020691, "learning_rate": 5.927332593323791e-05, "loss": 0.1649, "step": 16400 }, { "epoch": 1.49053090512739, "grad_norm": 0.9432240724563599, "learning_rate": 5.922139155034362e-05, "loss": 0.1184, "step": 16410 }, { "epoch": 1.4914392115899906, "grad_norm": 0.31962573528289795, "learning_rate": 5.9169446866395337e-05, "loss": 0.1216, "step": 16420 }, { "epoch": 1.492347518052591, "grad_norm": 0.1385318487882614, "learning_rate": 5.9117491939419576e-05, "loss": 0.2336, "step": 16430 }, { "epoch": 1.4932558245151915, "grad_norm": 1.958909273147583, "learning_rate": 5.906552682745428e-05, "loss": 0.1509, "step": 16440 }, { "epoch": 1.4941641309777918, "grad_norm": 1.7014786005020142, "learning_rate": 5.9013551588548734e-05, "loss": 0.1288, "step": 16450 }, { "epoch": 1.4950724374403923, "grad_norm": 0.2697017788887024, "learning_rate": 5.896156628076358e-05, "loss": 0.1123, "step": 16460 }, { "epoch": 1.4959807439029928, "grad_norm": 2.162752628326416, "learning_rate": 5.8909570962170665e-05, "loss": 0.1304, "step": 16470 }, { "epoch": 1.4968890503655934, "grad_norm": 0.7966536283493042, "learning_rate": 5.885756569085307e-05, "loss": 0.1853, "step": 16480 }, { "epoch": 1.497797356828194, "grad_norm": 1.756423830986023, "learning_rate": 5.8805550524904964e-05, "loss": 0.1753, "step": 16490 }, { "epoch": 1.4987056632907942, "grad_norm": 0.158870667219162, "learning_rate": 5.875352552243154e-05, "loss": 0.1398, "step": 16500 }, { "epoch": 1.4987056632907942, "eval_loss": 0.21225804090499878, "eval_runtime": 1129.7304, "eval_samples_per_second": 8.663, "eval_steps_per_second": 8.663, "step": 16500 }, { "epoch": 1.4996139697533948, "grad_norm": 0.815665066242218, "learning_rate": 5.870149074154906e-05, "loss": 0.1495, "step": 16510 }, { "epoch": 1.5005222762159953, "grad_norm": 2.8247835636138916, "learning_rate": 5.864944624038462e-05, "loss": 0.1697, "step": 16520 }, { "epoch": 1.5014305826785956, "grad_norm": 0.9590558409690857, "learning_rate": 5.859739207707625e-05, "loss": 0.2113, "step": 16530 }, { "epoch": 1.5023388891411962, "grad_norm": 0.9341315031051636, "learning_rate": 5.854532830977271e-05, "loss": 0.1618, "step": 16540 }, { "epoch": 1.5032471956037967, "grad_norm": 0.296190470457077, "learning_rate": 5.849325499663352e-05, "loss": 0.1814, "step": 16550 }, { "epoch": 1.5041555020663973, "grad_norm": 1.6984426975250244, "learning_rate": 5.8441172195828876e-05, "loss": 0.1216, "step": 16560 }, { "epoch": 1.5050638085289978, "grad_norm": 0.49060529470443726, "learning_rate": 5.838907996553953e-05, "loss": 0.214, "step": 16570 }, { "epoch": 1.505972114991598, "grad_norm": 1.1799085140228271, "learning_rate": 5.833697836395683e-05, "loss": 0.1586, "step": 16580 }, { "epoch": 1.5068804214541986, "grad_norm": 0.367255836725235, "learning_rate": 5.828486744928254e-05, "loss": 0.2196, "step": 16590 }, { "epoch": 1.507788727916799, "grad_norm": 1.0038431882858276, "learning_rate": 5.823274727972884e-05, "loss": 0.219, "step": 16600 }, { "epoch": 1.5086970343793995, "grad_norm": 0.6448379158973694, "learning_rate": 5.8180617913518254e-05, "loss": 0.2443, "step": 16610 }, { "epoch": 1.509605340842, "grad_norm": 0.25083595514297485, "learning_rate": 5.812847940888357e-05, "loss": 0.1802, "step": 16620 }, { "epoch": 1.5105136473046006, "grad_norm": 0.5106989145278931, "learning_rate": 5.807633182406782e-05, "loss": 0.131, "step": 16630 }, { "epoch": 1.5114219537672011, "grad_norm": 1.1303683519363403, "learning_rate": 5.802417521732413e-05, "loss": 0.1809, "step": 16640 }, { "epoch": 1.5123302602298017, "grad_norm": 0.3878012001514435, "learning_rate": 5.7972009646915726e-05, "loss": 0.1466, "step": 16650 }, { "epoch": 1.513238566692402, "grad_norm": 2.5887696743011475, "learning_rate": 5.791983517111586e-05, "loss": 0.202, "step": 16660 }, { "epoch": 1.5141468731550025, "grad_norm": 2.378610372543335, "learning_rate": 5.78676518482077e-05, "loss": 0.2585, "step": 16670 }, { "epoch": 1.5150551796176028, "grad_norm": 0.47379979491233826, "learning_rate": 5.781545973648434e-05, "loss": 0.1546, "step": 16680 }, { "epoch": 1.5159634860802034, "grad_norm": 0.9635477066040039, "learning_rate": 5.776325889424865e-05, "loss": 0.1529, "step": 16690 }, { "epoch": 1.516871792542804, "grad_norm": 0.7060835957527161, "learning_rate": 5.771104937981328e-05, "loss": 0.0871, "step": 16700 }, { "epoch": 1.5177800990054044, "grad_norm": 1.9516019821166992, "learning_rate": 5.7658831251500544e-05, "loss": 0.1246, "step": 16710 }, { "epoch": 1.518688405468005, "grad_norm": 0.8044900298118591, "learning_rate": 5.7606604567642405e-05, "loss": 0.1602, "step": 16720 }, { "epoch": 1.5195967119306055, "grad_norm": 2.1173460483551025, "learning_rate": 5.755436938658039e-05, "loss": 0.1406, "step": 16730 }, { "epoch": 1.5205050183932058, "grad_norm": 1.992431402206421, "learning_rate": 5.750212576666546e-05, "loss": 0.1047, "step": 16740 }, { "epoch": 1.5214133248558064, "grad_norm": 0.8321825861930847, "learning_rate": 5.744987376625807e-05, "loss": 0.1537, "step": 16750 }, { "epoch": 1.5223216313184067, "grad_norm": 0.874643087387085, "learning_rate": 5.7397613443728004e-05, "loss": 0.1473, "step": 16760 }, { "epoch": 1.5232299377810072, "grad_norm": 1.4580740928649902, "learning_rate": 5.734534485745432e-05, "loss": 0.1418, "step": 16770 }, { "epoch": 1.5241382442436078, "grad_norm": 1.8692407608032227, "learning_rate": 5.7293068065825385e-05, "loss": 0.1393, "step": 16780 }, { "epoch": 1.5250465507062083, "grad_norm": 0.9137722253799438, "learning_rate": 5.724078312723863e-05, "loss": 0.1748, "step": 16790 }, { "epoch": 1.5259548571688089, "grad_norm": 0.5420616269111633, "learning_rate": 5.7188490100100676e-05, "loss": 0.1052, "step": 16800 }, { "epoch": 1.5268631636314094, "grad_norm": 1.2432211637496948, "learning_rate": 5.7136189042827124e-05, "loss": 0.206, "step": 16810 }, { "epoch": 1.5277714700940097, "grad_norm": 2.4206783771514893, "learning_rate": 5.708388001384258e-05, "loss": 0.1947, "step": 16820 }, { "epoch": 1.5286797765566102, "grad_norm": 2.362973213195801, "learning_rate": 5.703156307158052e-05, "loss": 0.1468, "step": 16830 }, { "epoch": 1.5295880830192106, "grad_norm": 0.7318499684333801, "learning_rate": 5.697923827448327e-05, "loss": 0.107, "step": 16840 }, { "epoch": 1.530496389481811, "grad_norm": 2.2640373706817627, "learning_rate": 5.6926905681001966e-05, "loss": 0.1301, "step": 16850 }, { "epoch": 1.5314046959444116, "grad_norm": 1.4495283365249634, "learning_rate": 5.6874565349596407e-05, "loss": 0.1683, "step": 16860 }, { "epoch": 1.5323130024070122, "grad_norm": 0.9077610969543457, "learning_rate": 5.682221733873506e-05, "loss": 0.1929, "step": 16870 }, { "epoch": 1.5332213088696127, "grad_norm": 1.4560164213180542, "learning_rate": 5.676986170689498e-05, "loss": 0.1663, "step": 16880 }, { "epoch": 1.534129615332213, "grad_norm": 3.9864859580993652, "learning_rate": 5.6717498512561695e-05, "loss": 0.1493, "step": 16890 }, { "epoch": 1.5350379217948136, "grad_norm": 3.770495653152466, "learning_rate": 5.666512781422921e-05, "loss": 0.1596, "step": 16900 }, { "epoch": 1.535946228257414, "grad_norm": 2.3761978149414062, "learning_rate": 5.6612749670399935e-05, "loss": 0.1784, "step": 16910 }, { "epoch": 1.5368545347200144, "grad_norm": 1.2143595218658447, "learning_rate": 5.6560364139584534e-05, "loss": 0.1472, "step": 16920 }, { "epoch": 1.537762841182615, "grad_norm": 0.14791597425937653, "learning_rate": 5.650797128030199e-05, "loss": 0.1128, "step": 16930 }, { "epoch": 1.5386711476452155, "grad_norm": 0.6060728430747986, "learning_rate": 5.645557115107942e-05, "loss": 0.1575, "step": 16940 }, { "epoch": 1.539579454107816, "grad_norm": 2.0245158672332764, "learning_rate": 5.640316381045211e-05, "loss": 0.0935, "step": 16950 }, { "epoch": 1.5404877605704166, "grad_norm": 1.603106141090393, "learning_rate": 5.635074931696336e-05, "loss": 0.2033, "step": 16960 }, { "epoch": 1.541396067033017, "grad_norm": 0.9883659482002258, "learning_rate": 5.629832772916448e-05, "loss": 0.1487, "step": 16970 }, { "epoch": 1.5423043734956174, "grad_norm": 2.4593303203582764, "learning_rate": 5.6245899105614695e-05, "loss": 0.2081, "step": 16980 }, { "epoch": 1.5432126799582178, "grad_norm": 1.2657506465911865, "learning_rate": 5.6193463504881105e-05, "loss": 0.1245, "step": 16990 }, { "epoch": 1.5441209864208183, "grad_norm": 2.41217303276062, "learning_rate": 5.614102098553861e-05, "loss": 0.2048, "step": 17000 }, { "epoch": 1.5441209864208183, "eval_loss": 0.21361428499221802, "eval_runtime": 1124.8895, "eval_samples_per_second": 8.7, "eval_steps_per_second": 8.7, "step": 17000 }, { "epoch": 1.5450292928834188, "grad_norm": 0.9581133723258972, "learning_rate": 5.6088571606169795e-05, "loss": 0.1726, "step": 17010 }, { "epoch": 1.5459375993460194, "grad_norm": 0.17810304462909698, "learning_rate": 5.6036115425364935e-05, "loss": 0.1713, "step": 17020 }, { "epoch": 1.54684590580862, "grad_norm": 2.186227321624756, "learning_rate": 5.598365250172194e-05, "loss": 0.1302, "step": 17030 }, { "epoch": 1.5477542122712205, "grad_norm": 1.6635900735855103, "learning_rate": 5.593118289384619e-05, "loss": 0.1691, "step": 17040 }, { "epoch": 1.5486625187338208, "grad_norm": 1.8868788480758667, "learning_rate": 5.587870666035058e-05, "loss": 0.1561, "step": 17050 }, { "epoch": 1.5495708251964213, "grad_norm": 3.2274179458618164, "learning_rate": 5.582622385985535e-05, "loss": 0.1551, "step": 17060 }, { "epoch": 1.5504791316590216, "grad_norm": 1.0453226566314697, "learning_rate": 5.5773734550988146e-05, "loss": 0.1673, "step": 17070 }, { "epoch": 1.5513874381216222, "grad_norm": 1.8084079027175903, "learning_rate": 5.572123879238383e-05, "loss": 0.1425, "step": 17080 }, { "epoch": 1.5522957445842227, "grad_norm": 2.044438123703003, "learning_rate": 5.56687366426845e-05, "loss": 0.1959, "step": 17090 }, { "epoch": 1.5532040510468232, "grad_norm": 1.3129838705062866, "learning_rate": 5.5616228160539375e-05, "loss": 0.2288, "step": 17100 }, { "epoch": 1.5541123575094238, "grad_norm": 0.8648645877838135, "learning_rate": 5.5563713404604776e-05, "loss": 0.1481, "step": 17110 }, { "epoch": 1.5550206639720243, "grad_norm": 1.6760051250457764, "learning_rate": 5.551119243354397e-05, "loss": 0.2005, "step": 17120 }, { "epoch": 1.5559289704346246, "grad_norm": 0.16565105319023132, "learning_rate": 5.545866530602723e-05, "loss": 0.1313, "step": 17130 }, { "epoch": 1.5568372768972252, "grad_norm": 0.32558730244636536, "learning_rate": 5.54061320807317e-05, "loss": 0.2001, "step": 17140 }, { "epoch": 1.5577455833598255, "grad_norm": 0.43061116337776184, "learning_rate": 5.535359281634131e-05, "loss": 0.1782, "step": 17150 }, { "epoch": 1.558653889822426, "grad_norm": 1.430277943611145, "learning_rate": 5.530104757154673e-05, "loss": 0.2024, "step": 17160 }, { "epoch": 1.5595621962850266, "grad_norm": 2.368666887283325, "learning_rate": 5.5248496405045335e-05, "loss": 0.1633, "step": 17170 }, { "epoch": 1.560470502747627, "grad_norm": 2.4180166721343994, "learning_rate": 5.519593937554113e-05, "loss": 0.1267, "step": 17180 }, { "epoch": 1.5613788092102276, "grad_norm": 0.1572272777557373, "learning_rate": 5.5143376541744606e-05, "loss": 0.0916, "step": 17190 }, { "epoch": 1.562287115672828, "grad_norm": 1.9940836429595947, "learning_rate": 5.5090807962372804e-05, "loss": 0.2066, "step": 17200 }, { "epoch": 1.5631954221354285, "grad_norm": 1.1994825601577759, "learning_rate": 5.503823369614917e-05, "loss": 0.2345, "step": 17210 }, { "epoch": 1.5641037285980288, "grad_norm": 1.2633986473083496, "learning_rate": 5.498565380180345e-05, "loss": 0.1248, "step": 17220 }, { "epoch": 1.5650120350606294, "grad_norm": 2.1845619678497314, "learning_rate": 5.4933068338071744e-05, "loss": 0.1766, "step": 17230 }, { "epoch": 1.56592034152323, "grad_norm": 1.1863033771514893, "learning_rate": 5.4880477363696345e-05, "loss": 0.1593, "step": 17240 }, { "epoch": 1.5668286479858304, "grad_norm": 1.8290281295776367, "learning_rate": 5.4827880937425704e-05, "loss": 0.1917, "step": 17250 }, { "epoch": 1.567736954448431, "grad_norm": 0.8579442501068115, "learning_rate": 5.4775279118014376e-05, "loss": 0.1289, "step": 17260 }, { "epoch": 1.5686452609110315, "grad_norm": 0.3339470624923706, "learning_rate": 5.472267196422288e-05, "loss": 0.1848, "step": 17270 }, { "epoch": 1.5695535673736318, "grad_norm": 3.161248207092285, "learning_rate": 5.4670059534817794e-05, "loss": 0.1383, "step": 17280 }, { "epoch": 1.5704618738362324, "grad_norm": 0.5326175689697266, "learning_rate": 5.46174418885715e-05, "loss": 0.1745, "step": 17290 }, { "epoch": 1.5713701802988327, "grad_norm": 0.9427816867828369, "learning_rate": 5.456481908426225e-05, "loss": 0.2096, "step": 17300 }, { "epoch": 1.5722784867614332, "grad_norm": 1.450534462928772, "learning_rate": 5.451219118067406e-05, "loss": 0.2053, "step": 17310 }, { "epoch": 1.5731867932240338, "grad_norm": 0.7135781645774841, "learning_rate": 5.445955823659663e-05, "loss": 0.093, "step": 17320 }, { "epoch": 1.5740950996866343, "grad_norm": 2.6375489234924316, "learning_rate": 5.44069203108253e-05, "loss": 0.2091, "step": 17330 }, { "epoch": 1.5750034061492348, "grad_norm": 1.333878517150879, "learning_rate": 5.435427746216095e-05, "loss": 0.1737, "step": 17340 }, { "epoch": 1.5759117126118354, "grad_norm": 0.006547876168042421, "learning_rate": 5.430162974941001e-05, "loss": 0.1266, "step": 17350 }, { "epoch": 1.5768200190744357, "grad_norm": 2.4454517364501953, "learning_rate": 5.424897723138427e-05, "loss": 0.2069, "step": 17360 }, { "epoch": 1.5777283255370362, "grad_norm": 4.454812049865723, "learning_rate": 5.4196319966900964e-05, "loss": 0.1618, "step": 17370 }, { "epoch": 1.5786366319996366, "grad_norm": 0.41032254695892334, "learning_rate": 5.4143658014782595e-05, "loss": 0.1956, "step": 17380 }, { "epoch": 1.579544938462237, "grad_norm": 0.19961416721343994, "learning_rate": 5.4090991433856865e-05, "loss": 0.2282, "step": 17390 }, { "epoch": 1.5804532449248376, "grad_norm": 1.4939336776733398, "learning_rate": 5.403832028295673e-05, "loss": 0.1844, "step": 17400 }, { "epoch": 1.5813615513874382, "grad_norm": 1.2402234077453613, "learning_rate": 5.398564462092017e-05, "loss": 0.1675, "step": 17410 }, { "epoch": 1.5822698578500387, "grad_norm": 0.5418598055839539, "learning_rate": 5.393296450659027e-05, "loss": 0.1681, "step": 17420 }, { "epoch": 1.5831781643126392, "grad_norm": 3.517535448074341, "learning_rate": 5.388027999881502e-05, "loss": 0.2183, "step": 17430 }, { "epoch": 1.5840864707752396, "grad_norm": 2.5361390113830566, "learning_rate": 5.3827591156447366e-05, "loss": 0.1691, "step": 17440 }, { "epoch": 1.58499477723784, "grad_norm": 0.7754393815994263, "learning_rate": 5.3774898038345114e-05, "loss": 0.147, "step": 17450 }, { "epoch": 1.5859030837004404, "grad_norm": 0.8850165009498596, "learning_rate": 5.372220070337077e-05, "loss": 0.184, "step": 17460 }, { "epoch": 1.586811390163041, "grad_norm": 0.2365008294582367, "learning_rate": 5.366949921039165e-05, "loss": 0.1211, "step": 17470 }, { "epoch": 1.5877196966256415, "grad_norm": 1.325543761253357, "learning_rate": 5.361679361827964e-05, "loss": 0.1239, "step": 17480 }, { "epoch": 1.588628003088242, "grad_norm": 0.1640712469816208, "learning_rate": 5.356408398591119e-05, "loss": 0.1365, "step": 17490 }, { "epoch": 1.5895363095508426, "grad_norm": 0.3232594430446625, "learning_rate": 5.3511370372167366e-05, "loss": 0.1115, "step": 17500 }, { "epoch": 1.5895363095508426, "eval_loss": 0.20820897817611694, "eval_runtime": 1121.1896, "eval_samples_per_second": 8.729, "eval_steps_per_second": 8.729, "step": 17500 }, { "epoch": 1.590444616013443, "grad_norm": 3.114145517349243, "learning_rate": 5.345865283593356e-05, "loss": 0.2004, "step": 17510 }, { "epoch": 1.5913529224760434, "grad_norm": 0.8962686657905579, "learning_rate": 5.3405931436099664e-05, "loss": 0.1958, "step": 17520 }, { "epoch": 1.5922612289386437, "grad_norm": 1.1040464639663696, "learning_rate": 5.335320623155977e-05, "loss": 0.1228, "step": 17530 }, { "epoch": 1.5931695354012443, "grad_norm": 2.3387813568115234, "learning_rate": 5.33004772812123e-05, "loss": 0.1983, "step": 17540 }, { "epoch": 1.5940778418638448, "grad_norm": 2.0895135402679443, "learning_rate": 5.324774464395984e-05, "loss": 0.1706, "step": 17550 }, { "epoch": 1.5949861483264454, "grad_norm": 1.3563517332077026, "learning_rate": 5.3195008378709064e-05, "loss": 0.134, "step": 17560 }, { "epoch": 1.595894454789046, "grad_norm": 1.6137434244155884, "learning_rate": 5.314226854437074e-05, "loss": 0.1989, "step": 17570 }, { "epoch": 1.5968027612516464, "grad_norm": 2.066648006439209, "learning_rate": 5.308952519985962e-05, "loss": 0.14, "step": 17580 }, { "epoch": 1.5977110677142468, "grad_norm": 2.6605112552642822, "learning_rate": 5.303677840409433e-05, "loss": 0.2023, "step": 17590 }, { "epoch": 1.5986193741768473, "grad_norm": 1.5196373462677002, "learning_rate": 5.2984028215997436e-05, "loss": 0.1467, "step": 17600 }, { "epoch": 1.5995276806394476, "grad_norm": 2.468952178955078, "learning_rate": 5.29312746944952e-05, "loss": 0.1448, "step": 17610 }, { "epoch": 1.6004359871020482, "grad_norm": 0.6076207160949707, "learning_rate": 5.2878517898517675e-05, "loss": 0.1687, "step": 17620 }, { "epoch": 1.6013442935646487, "grad_norm": 0.7925589084625244, "learning_rate": 5.282575788699854e-05, "loss": 0.0744, "step": 17630 }, { "epoch": 1.6022526000272492, "grad_norm": 0.4090554416179657, "learning_rate": 5.2772994718875054e-05, "loss": 0.1071, "step": 17640 }, { "epoch": 1.6031609064898498, "grad_norm": 0.5380252599716187, "learning_rate": 5.272022845308806e-05, "loss": 0.196, "step": 17650 }, { "epoch": 1.6040692129524503, "grad_norm": 1.762993574142456, "learning_rate": 5.26674591485818e-05, "loss": 0.195, "step": 17660 }, { "epoch": 1.6049775194150506, "grad_norm": 1.2474042177200317, "learning_rate": 5.2614686864303933e-05, "loss": 0.1701, "step": 17670 }, { "epoch": 1.6058858258776512, "grad_norm": 0.8812074065208435, "learning_rate": 5.2561911659205464e-05, "loss": 0.1101, "step": 17680 }, { "epoch": 1.6067941323402515, "grad_norm": 0.5742648243904114, "learning_rate": 5.250913359224063e-05, "loss": 0.1924, "step": 17690 }, { "epoch": 1.607702438802852, "grad_norm": 0.3111792206764221, "learning_rate": 5.2456352722366886e-05, "loss": 0.1937, "step": 17700 }, { "epoch": 1.6086107452654526, "grad_norm": 1.7453250885009766, "learning_rate": 5.24035691085448e-05, "loss": 0.1376, "step": 17710 }, { "epoch": 1.609519051728053, "grad_norm": 2.2020349502563477, "learning_rate": 5.2350782809738044e-05, "loss": 0.0905, "step": 17720 }, { "epoch": 1.6104273581906536, "grad_norm": 0.8055725693702698, "learning_rate": 5.229799388491323e-05, "loss": 0.0891, "step": 17730 }, { "epoch": 1.6113356646532542, "grad_norm": 2.05198073387146, "learning_rate": 5.224520239303994e-05, "loss": 0.1508, "step": 17740 }, { "epoch": 1.6122439711158545, "grad_norm": 0.6173577308654785, "learning_rate": 5.2192408393090644e-05, "loss": 0.1601, "step": 17750 }, { "epoch": 1.6131522775784548, "grad_norm": 0.3551391363143921, "learning_rate": 5.2139611944040576e-05, "loss": 0.133, "step": 17760 }, { "epoch": 1.6140605840410553, "grad_norm": 1.1566206216812134, "learning_rate": 5.208681310486771e-05, "loss": 0.1507, "step": 17770 }, { "epoch": 1.6149688905036559, "grad_norm": 0.20933657884597778, "learning_rate": 5.203401193455271e-05, "loss": 0.2093, "step": 17780 }, { "epoch": 1.6158771969662564, "grad_norm": 1.4648593664169312, "learning_rate": 5.1981208492078815e-05, "loss": 0.1697, "step": 17790 }, { "epoch": 1.616785503428857, "grad_norm": 1.3099157810211182, "learning_rate": 5.1928402836431836e-05, "loss": 0.1273, "step": 17800 }, { "epoch": 1.6176938098914575, "grad_norm": 1.3631782531738281, "learning_rate": 5.1875595026600034e-05, "loss": 0.1697, "step": 17810 }, { "epoch": 1.6186021163540578, "grad_norm": 1.4548190832138062, "learning_rate": 5.182278512157409e-05, "loss": 0.1676, "step": 17820 }, { "epoch": 1.6195104228166584, "grad_norm": 0.6393784284591675, "learning_rate": 5.176997318034702e-05, "loss": 0.1359, "step": 17830 }, { "epoch": 1.6204187292792587, "grad_norm": 1.231430172920227, "learning_rate": 5.171715926191411e-05, "loss": 0.1827, "step": 17840 }, { "epoch": 1.6213270357418592, "grad_norm": 0.6544288992881775, "learning_rate": 5.166434342527283e-05, "loss": 0.138, "step": 17850 }, { "epoch": 1.6222353422044598, "grad_norm": 1.5193817615509033, "learning_rate": 5.161152572942285e-05, "loss": 0.1763, "step": 17860 }, { "epoch": 1.6231436486670603, "grad_norm": 4.5945258140563965, "learning_rate": 5.155870623336589e-05, "loss": 0.1565, "step": 17870 }, { "epoch": 1.6240519551296608, "grad_norm": 0.1153169795870781, "learning_rate": 5.150588499610567e-05, "loss": 0.1901, "step": 17880 }, { "epoch": 1.6249602615922614, "grad_norm": 2.7777867317199707, "learning_rate": 5.145306207664785e-05, "loss": 0.1556, "step": 17890 }, { "epoch": 1.6258685680548617, "grad_norm": 0.7602540850639343, "learning_rate": 5.1400237534000006e-05, "loss": 0.1527, "step": 17900 }, { "epoch": 1.6267768745174622, "grad_norm": 2.168761730194092, "learning_rate": 5.13474114271715e-05, "loss": 0.1749, "step": 17910 }, { "epoch": 1.6276851809800625, "grad_norm": 1.6365158557891846, "learning_rate": 5.1294583815173424e-05, "loss": 0.2025, "step": 17920 }, { "epoch": 1.628593487442663, "grad_norm": 1.1597133874893188, "learning_rate": 5.124175475701858e-05, "loss": 0.1274, "step": 17930 }, { "epoch": 1.6295017939052636, "grad_norm": 1.8895697593688965, "learning_rate": 5.11889243117214e-05, "loss": 0.1543, "step": 17940 }, { "epoch": 1.6304101003678642, "grad_norm": 1.782097339630127, "learning_rate": 5.113609253829781e-05, "loss": 0.202, "step": 17950 }, { "epoch": 1.6313184068304647, "grad_norm": 2.0581226348876953, "learning_rate": 5.108325949576527e-05, "loss": 0.1988, "step": 17960 }, { "epoch": 1.6322267132930652, "grad_norm": 0.6382351517677307, "learning_rate": 5.1030425243142644e-05, "loss": 0.26, "step": 17970 }, { "epoch": 1.6331350197556656, "grad_norm": 1.6552915573120117, "learning_rate": 5.097758983945015e-05, "loss": 0.1722, "step": 17980 }, { "epoch": 1.634043326218266, "grad_norm": 0.9407916069030762, "learning_rate": 5.092475334370926e-05, "loss": 0.1194, "step": 17990 }, { "epoch": 1.6349516326808664, "grad_norm": 1.9839940071105957, "learning_rate": 5.087191581494273e-05, "loss": 0.2041, "step": 18000 }, { "epoch": 1.6349516326808664, "eval_loss": 0.2003697156906128, "eval_runtime": 1109.74, "eval_samples_per_second": 8.819, "eval_steps_per_second": 8.819, "step": 18000 }, { "epoch": 1.635859939143467, "grad_norm": 1.4751263856887817, "learning_rate": 5.081907731217438e-05, "loss": 0.1176, "step": 18010 }, { "epoch": 1.6367682456060675, "grad_norm": 1.7577786445617676, "learning_rate": 5.076623789442923e-05, "loss": 0.1159, "step": 18020 }, { "epoch": 1.637676552068668, "grad_norm": 0.6824291348457336, "learning_rate": 5.071339762073321e-05, "loss": 0.1303, "step": 18030 }, { "epoch": 1.6385848585312686, "grad_norm": 2.371260166168213, "learning_rate": 5.066055655011329e-05, "loss": 0.1825, "step": 18040 }, { "epoch": 1.6394931649938689, "grad_norm": 1.2428908348083496, "learning_rate": 5.060771474159729e-05, "loss": 0.127, "step": 18050 }, { "epoch": 1.6404014714564694, "grad_norm": 1.5517820119857788, "learning_rate": 5.055487225421387e-05, "loss": 0.2277, "step": 18060 }, { "epoch": 1.6413097779190697, "grad_norm": 1.320698618888855, "learning_rate": 5.0502029146992435e-05, "loss": 0.1429, "step": 18070 }, { "epoch": 1.6422180843816703, "grad_norm": 1.9778655767440796, "learning_rate": 5.044918547896308e-05, "loss": 0.1366, "step": 18080 }, { "epoch": 1.6431263908442708, "grad_norm": 0.936538815498352, "learning_rate": 5.0396341309156546e-05, "loss": 0.1521, "step": 18090 }, { "epoch": 1.6440346973068714, "grad_norm": 1.0394481420516968, "learning_rate": 5.034349669660412e-05, "loss": 0.1372, "step": 18100 }, { "epoch": 1.644943003769472, "grad_norm": 2.156111478805542, "learning_rate": 5.029065170033759e-05, "loss": 0.1445, "step": 18110 }, { "epoch": 1.6458513102320724, "grad_norm": 2.949735164642334, "learning_rate": 5.0237806379389166e-05, "loss": 0.1287, "step": 18120 }, { "epoch": 1.6467596166946727, "grad_norm": 2.6379358768463135, "learning_rate": 5.018496079279142e-05, "loss": 0.1993, "step": 18130 }, { "epoch": 1.6476679231572733, "grad_norm": 0.6760141253471375, "learning_rate": 5.013211499957725e-05, "loss": 0.1739, "step": 18140 }, { "epoch": 1.6485762296198736, "grad_norm": 3.688136577606201, "learning_rate": 5.0079269058779745e-05, "loss": 0.1968, "step": 18150 }, { "epoch": 1.6494845360824741, "grad_norm": 1.1812481880187988, "learning_rate": 5.002642302943215e-05, "loss": 0.1091, "step": 18160 }, { "epoch": 1.6503928425450747, "grad_norm": 2.820540189743042, "learning_rate": 4.997357697056785e-05, "loss": 0.112, "step": 18170 }, { "epoch": 1.6513011490076752, "grad_norm": 1.2089534997940063, "learning_rate": 4.9920730941220273e-05, "loss": 0.1457, "step": 18180 }, { "epoch": 1.6522094554702758, "grad_norm": 0.04097853973507881, "learning_rate": 4.986788500042276e-05, "loss": 0.1252, "step": 18190 }, { "epoch": 1.6531177619328763, "grad_norm": 2.8142266273498535, "learning_rate": 4.981503920720858e-05, "loss": 0.1375, "step": 18200 }, { "epoch": 1.6540260683954766, "grad_norm": 3.8198318481445312, "learning_rate": 4.9762193620610846e-05, "loss": 0.131, "step": 18210 }, { "epoch": 1.6549343748580772, "grad_norm": 1.6613188982009888, "learning_rate": 4.9709348299662415e-05, "loss": 0.1386, "step": 18220 }, { "epoch": 1.6558426813206775, "grad_norm": 0.3149423599243164, "learning_rate": 4.965650330339589e-05, "loss": 0.1609, "step": 18230 }, { "epoch": 1.656750987783278, "grad_norm": 0.6688880920410156, "learning_rate": 4.9603658690843466e-05, "loss": 0.1111, "step": 18240 }, { "epoch": 1.6576592942458785, "grad_norm": 0.7467014193534851, "learning_rate": 4.9550814521036934e-05, "loss": 0.2381, "step": 18250 }, { "epoch": 1.658567600708479, "grad_norm": 1.2524490356445312, "learning_rate": 4.949797085300758e-05, "loss": 0.1192, "step": 18260 }, { "epoch": 1.6594759071710796, "grad_norm": 0.8485026359558105, "learning_rate": 4.944512774578614e-05, "loss": 0.1775, "step": 18270 }, { "epoch": 1.6603842136336802, "grad_norm": 2.677157402038574, "learning_rate": 4.939228525840271e-05, "loss": 0.2332, "step": 18280 }, { "epoch": 1.6612925200962805, "grad_norm": 2.248152017593384, "learning_rate": 4.933944344988672e-05, "loss": 0.2399, "step": 18290 }, { "epoch": 1.662200826558881, "grad_norm": 0.8281689286231995, "learning_rate": 4.9286602379266804e-05, "loss": 0.1573, "step": 18300 }, { "epoch": 1.6631091330214813, "grad_norm": 1.147505283355713, "learning_rate": 4.9233762105570804e-05, "loss": 0.1391, "step": 18310 }, { "epoch": 1.6640174394840819, "grad_norm": 1.120723009109497, "learning_rate": 4.918092268782561e-05, "loss": 0.2297, "step": 18320 }, { "epoch": 1.6649257459466824, "grad_norm": 0.686523973941803, "learning_rate": 4.912808418505729e-05, "loss": 0.136, "step": 18330 }, { "epoch": 1.665834052409283, "grad_norm": 1.4850467443466187, "learning_rate": 4.907524665629075e-05, "loss": 0.1747, "step": 18340 }, { "epoch": 1.6667423588718835, "grad_norm": 0.8376705646514893, "learning_rate": 4.9022410160549865e-05, "loss": 0.1763, "step": 18350 }, { "epoch": 1.6676506653344838, "grad_norm": 2.384255886077881, "learning_rate": 4.896957475685737e-05, "loss": 0.1792, "step": 18360 }, { "epoch": 1.6685589717970843, "grad_norm": 1.2535876035690308, "learning_rate": 4.891674050423473e-05, "loss": 0.2142, "step": 18370 }, { "epoch": 1.6694672782596847, "grad_norm": 0.9733379483222961, "learning_rate": 4.88639074617022e-05, "loss": 0.1618, "step": 18380 }, { "epoch": 1.6703755847222852, "grad_norm": 2.104921340942383, "learning_rate": 4.881107568827862e-05, "loss": 0.1715, "step": 18390 }, { "epoch": 1.6712838911848857, "grad_norm": 1.5654984712600708, "learning_rate": 4.875824524298143e-05, "loss": 0.2051, "step": 18400 }, { "epoch": 1.6721921976474863, "grad_norm": 1.3740206956863403, "learning_rate": 4.8705416184826594e-05, "loss": 0.1067, "step": 18410 }, { "epoch": 1.6731005041100868, "grad_norm": 1.9825063943862915, "learning_rate": 4.865258857282852e-05, "loss": 0.1803, "step": 18420 }, { "epoch": 1.6740088105726874, "grad_norm": 2.6988258361816406, "learning_rate": 4.8599762466e-05, "loss": 0.2018, "step": 18430 }, { "epoch": 1.6749171170352877, "grad_norm": 1.344997763633728, "learning_rate": 4.8546937923352156e-05, "loss": 0.1644, "step": 18440 }, { "epoch": 1.6758254234978882, "grad_norm": 1.9116928577423096, "learning_rate": 4.849411500389435e-05, "loss": 0.2104, "step": 18450 }, { "epoch": 1.6767337299604885, "grad_norm": 0.6876057982444763, "learning_rate": 4.8441293766634126e-05, "loss": 0.1705, "step": 18460 }, { "epoch": 1.677642036423089, "grad_norm": 2.162706136703491, "learning_rate": 4.838847427057714e-05, "loss": 0.1541, "step": 18470 }, { "epoch": 1.6785503428856896, "grad_norm": 0.5765629410743713, "learning_rate": 4.8335656574727174e-05, "loss": 0.1889, "step": 18480 }, { "epoch": 1.6794586493482901, "grad_norm": 0.8108572959899902, "learning_rate": 4.828284073808591e-05, "loss": 0.1488, "step": 18490 }, { "epoch": 1.6803669558108907, "grad_norm": 1.6747187376022339, "learning_rate": 4.8230026819652995e-05, "loss": 0.2027, "step": 18500 }, { "epoch": 1.6803669558108907, "eval_loss": 0.19957363605499268, "eval_runtime": 1100.8048, "eval_samples_per_second": 8.891, "eval_steps_per_second": 8.891, "step": 18500 }, { "epoch": 1.6812752622734912, "grad_norm": 1.857726812362671, "learning_rate": 4.817721487842593e-05, "loss": 0.1137, "step": 18510 }, { "epoch": 1.6821835687360915, "grad_norm": 0.7307305335998535, "learning_rate": 4.8124404973399964e-05, "loss": 0.1599, "step": 18520 }, { "epoch": 1.683091875198692, "grad_norm": 0.25959593057632446, "learning_rate": 4.807159716356817e-05, "loss": 0.0607, "step": 18530 }, { "epoch": 1.6840001816612924, "grad_norm": 0.49963244795799255, "learning_rate": 4.80187915079212e-05, "loss": 0.1162, "step": 18540 }, { "epoch": 1.684908488123893, "grad_norm": 0.06941510736942291, "learning_rate": 4.796598806544732e-05, "loss": 0.1069, "step": 18550 }, { "epoch": 1.6858167945864935, "grad_norm": 2.940112352371216, "learning_rate": 4.791318689513231e-05, "loss": 0.1352, "step": 18560 }, { "epoch": 1.686725101049094, "grad_norm": 0.5692654848098755, "learning_rate": 4.7860388055959436e-05, "loss": 0.1469, "step": 18570 }, { "epoch": 1.6876334075116945, "grad_norm": 2.022797107696533, "learning_rate": 4.780759160690936e-05, "loss": 0.1334, "step": 18580 }, { "epoch": 1.688541713974295, "grad_norm": 1.02724289894104, "learning_rate": 4.775479760696006e-05, "loss": 0.1682, "step": 18590 }, { "epoch": 1.6894500204368954, "grad_norm": 1.0548737049102783, "learning_rate": 4.7702006115086795e-05, "loss": 0.2482, "step": 18600 }, { "epoch": 1.690358326899496, "grad_norm": 1.2822843790054321, "learning_rate": 4.764921719026199e-05, "loss": 0.1519, "step": 18610 }, { "epoch": 1.6912666333620963, "grad_norm": 0.958453357219696, "learning_rate": 4.7596430891455196e-05, "loss": 0.1878, "step": 18620 }, { "epoch": 1.6921749398246968, "grad_norm": 2.155874729156494, "learning_rate": 4.754364727763312e-05, "loss": 0.1486, "step": 18630 }, { "epoch": 1.6930832462872973, "grad_norm": 2.983290433883667, "learning_rate": 4.7490866407759374e-05, "loss": 0.1624, "step": 18640 }, { "epoch": 1.6939915527498979, "grad_norm": 1.0008611679077148, "learning_rate": 4.743808834079455e-05, "loss": 0.162, "step": 18650 }, { "epoch": 1.6948998592124984, "grad_norm": 1.1450685262680054, "learning_rate": 4.7385313135696085e-05, "loss": 0.1837, "step": 18660 }, { "epoch": 1.6958081656750987, "grad_norm": 1.9612895250320435, "learning_rate": 4.7332540851418204e-05, "loss": 0.1722, "step": 18670 }, { "epoch": 1.6967164721376993, "grad_norm": 1.3650895357131958, "learning_rate": 4.727977154691195e-05, "loss": 0.1414, "step": 18680 }, { "epoch": 1.6976247786002996, "grad_norm": 1.5244815349578857, "learning_rate": 4.722700528112495e-05, "loss": 0.1386, "step": 18690 }, { "epoch": 1.6985330850629001, "grad_norm": 1.9927470684051514, "learning_rate": 4.717424211300148e-05, "loss": 0.1474, "step": 18700 }, { "epoch": 1.6994413915255007, "grad_norm": 1.3216856718063354, "learning_rate": 4.712148210148234e-05, "loss": 0.1637, "step": 18710 }, { "epoch": 1.7003496979881012, "grad_norm": 2.4107272624969482, "learning_rate": 4.706872530550481e-05, "loss": 0.1244, "step": 18720 }, { "epoch": 1.7012580044507017, "grad_norm": 1.1190321445465088, "learning_rate": 4.7015971784002575e-05, "loss": 0.2095, "step": 18730 }, { "epoch": 1.7021663109133023, "grad_norm": 1.2570291757583618, "learning_rate": 4.696322159590567e-05, "loss": 0.1517, "step": 18740 }, { "epoch": 1.7030746173759026, "grad_norm": 1.1277302503585815, "learning_rate": 4.6910474800140403e-05, "loss": 0.1179, "step": 18750 }, { "epoch": 1.7039829238385031, "grad_norm": 0.39447256922721863, "learning_rate": 4.6857731455629256e-05, "loss": 0.1415, "step": 18760 }, { "epoch": 1.7048912303011035, "grad_norm": 0.6873247623443604, "learning_rate": 4.680499162129094e-05, "loss": 0.1385, "step": 18770 }, { "epoch": 1.705799536763704, "grad_norm": 1.6362301111221313, "learning_rate": 4.675225535604017e-05, "loss": 0.1294, "step": 18780 }, { "epoch": 1.7067078432263045, "grad_norm": 0.36843743920326233, "learning_rate": 4.66995227187877e-05, "loss": 0.1337, "step": 18790 }, { "epoch": 1.707616149688905, "grad_norm": 0.7712371945381165, "learning_rate": 4.6646793768440246e-05, "loss": 0.1307, "step": 18800 }, { "epoch": 1.7085244561515056, "grad_norm": 1.5368869304656982, "learning_rate": 4.659406856390034e-05, "loss": 0.1428, "step": 18810 }, { "epoch": 1.7094327626141061, "grad_norm": 2.649801254272461, "learning_rate": 4.654134716406643e-05, "loss": 0.098, "step": 18820 }, { "epoch": 1.7103410690767065, "grad_norm": 1.2610774040222168, "learning_rate": 4.6488629627832645e-05, "loss": 0.1662, "step": 18830 }, { "epoch": 1.711249375539307, "grad_norm": 0.7158254384994507, "learning_rate": 4.643591601408881e-05, "loss": 0.1377, "step": 18840 }, { "epoch": 1.7121576820019073, "grad_norm": 2.272498369216919, "learning_rate": 4.638320638172039e-05, "loss": 0.1727, "step": 18850 }, { "epoch": 1.7130659884645079, "grad_norm": 2.0970523357391357, "learning_rate": 4.6330500789608355e-05, "loss": 0.1386, "step": 18860 }, { "epoch": 1.7139742949271084, "grad_norm": 1.6905821561813354, "learning_rate": 4.6277799296629234e-05, "loss": 0.1472, "step": 18870 }, { "epoch": 1.714882601389709, "grad_norm": 1.3549307584762573, "learning_rate": 4.6225101961654904e-05, "loss": 0.15, "step": 18880 }, { "epoch": 1.7157909078523095, "grad_norm": 0.4119440019130707, "learning_rate": 4.6172408843552645e-05, "loss": 0.1542, "step": 18890 }, { "epoch": 1.71669921431491, "grad_norm": 2.3643224239349365, "learning_rate": 4.6119720001185e-05, "loss": 0.1707, "step": 18900 }, { "epoch": 1.7176075207775103, "grad_norm": 0.5121569037437439, "learning_rate": 4.6067035493409736e-05, "loss": 0.1726, "step": 18910 }, { "epoch": 1.7185158272401106, "grad_norm": 0.8986698389053345, "learning_rate": 4.6014355379079826e-05, "loss": 0.1523, "step": 18920 }, { "epoch": 1.7194241337027112, "grad_norm": 1.1335655450820923, "learning_rate": 4.596167971704327e-05, "loss": 0.093, "step": 18930 }, { "epoch": 1.7203324401653117, "grad_norm": 0.4689861834049225, "learning_rate": 4.590900856614314e-05, "loss": 0.2421, "step": 18940 }, { "epoch": 1.7212407466279123, "grad_norm": 3.6615822315216064, "learning_rate": 4.585634198521743e-05, "loss": 0.1682, "step": 18950 }, { "epoch": 1.7221490530905128, "grad_norm": 1.6421449184417725, "learning_rate": 4.580368003309903e-05, "loss": 0.1558, "step": 18960 }, { "epoch": 1.7230573595531133, "grad_norm": 1.2114853858947754, "learning_rate": 4.5751022768615724e-05, "loss": 0.1726, "step": 18970 }, { "epoch": 1.7239656660157137, "grad_norm": 0.8471675515174866, "learning_rate": 4.569837025059001e-05, "loss": 0.26, "step": 18980 }, { "epoch": 1.7248739724783142, "grad_norm": 1.0329508781433105, "learning_rate": 4.5645722537839056e-05, "loss": 0.203, "step": 18990 }, { "epoch": 1.7257822789409145, "grad_norm": 0.6991339325904846, "learning_rate": 4.5593079689174715e-05, "loss": 0.1198, "step": 19000 }, { "epoch": 1.7257822789409145, "eval_loss": 0.19998696446418762, "eval_runtime": 1093.593, "eval_samples_per_second": 8.949, "eval_steps_per_second": 8.949, "step": 19000 }, { "epoch": 1.726690585403515, "grad_norm": 2.0616512298583984, "learning_rate": 4.554044176340337e-05, "loss": 0.1536, "step": 19010 }, { "epoch": 1.7275988918661156, "grad_norm": 1.102402925491333, "learning_rate": 4.548780881932595e-05, "loss": 0.1646, "step": 19020 }, { "epoch": 1.7285071983287161, "grad_norm": 2.111992120742798, "learning_rate": 4.543518091573776e-05, "loss": 0.1172, "step": 19030 }, { "epoch": 1.7294155047913167, "grad_norm": 0.5133132934570312, "learning_rate": 4.538255811142852e-05, "loss": 0.1208, "step": 19040 }, { "epoch": 1.7303238112539172, "grad_norm": 0.8192662596702576, "learning_rate": 4.532994046518223e-05, "loss": 0.1424, "step": 19050 }, { "epoch": 1.7312321177165175, "grad_norm": 1.2171510457992554, "learning_rate": 4.527732803577711e-05, "loss": 0.1616, "step": 19060 }, { "epoch": 1.732140424179118, "grad_norm": 2.6749091148376465, "learning_rate": 4.5224720881985635e-05, "loss": 0.2074, "step": 19070 }, { "epoch": 1.7330487306417184, "grad_norm": 1.6938165426254272, "learning_rate": 4.51721190625743e-05, "loss": 0.221, "step": 19080 }, { "epoch": 1.733957037104319, "grad_norm": 1.204814076423645, "learning_rate": 4.5119522636303667e-05, "loss": 0.1272, "step": 19090 }, { "epoch": 1.7348653435669195, "grad_norm": 1.383202075958252, "learning_rate": 4.5066931661928274e-05, "loss": 0.1419, "step": 19100 }, { "epoch": 1.73577365002952, "grad_norm": 1.1283315420150757, "learning_rate": 4.5014346198196554e-05, "loss": 0.1782, "step": 19110 }, { "epoch": 1.7366819564921205, "grad_norm": 0.4100414514541626, "learning_rate": 4.496176630385085e-05, "loss": 0.1634, "step": 19120 }, { "epoch": 1.737590262954721, "grad_norm": 0.7582045793533325, "learning_rate": 4.49091920376272e-05, "loss": 0.1854, "step": 19130 }, { "epoch": 1.7384985694173214, "grad_norm": 1.5597150325775146, "learning_rate": 4.485662345825541e-05, "loss": 0.1671, "step": 19140 }, { "epoch": 1.739406875879922, "grad_norm": 0.48108476400375366, "learning_rate": 4.48040606244589e-05, "loss": 0.1923, "step": 19150 }, { "epoch": 1.7403151823425222, "grad_norm": 1.2744799852371216, "learning_rate": 4.475150359495467e-05, "loss": 0.1451, "step": 19160 }, { "epoch": 1.7412234888051228, "grad_norm": 0.3390663266181946, "learning_rate": 4.469895242845328e-05, "loss": 0.1247, "step": 19170 }, { "epoch": 1.7421317952677233, "grad_norm": 1.08182954788208, "learning_rate": 4.464640718365871e-05, "loss": 0.1332, "step": 19180 }, { "epoch": 1.7430401017303239, "grad_norm": 1.6732321977615356, "learning_rate": 4.459386791926831e-05, "loss": 0.1288, "step": 19190 }, { "epoch": 1.7439484081929244, "grad_norm": 0.4313673675060272, "learning_rate": 4.454133469397278e-05, "loss": 0.0749, "step": 19200 }, { "epoch": 1.7448567146555247, "grad_norm": 3.6175320148468018, "learning_rate": 4.4488807566456034e-05, "loss": 0.1516, "step": 19210 }, { "epoch": 1.7457650211181253, "grad_norm": 1.251487374305725, "learning_rate": 4.4436286595395236e-05, "loss": 0.1544, "step": 19220 }, { "epoch": 1.7466733275807256, "grad_norm": 1.0939223766326904, "learning_rate": 4.4383771839460636e-05, "loss": 0.0912, "step": 19230 }, { "epoch": 1.7475816340433261, "grad_norm": 2.881593942642212, "learning_rate": 4.4331263357315515e-05, "loss": 0.1476, "step": 19240 }, { "epoch": 1.7484899405059267, "grad_norm": 1.1641484498977661, "learning_rate": 4.427876120761619e-05, "loss": 0.271, "step": 19250 }, { "epoch": 1.7493982469685272, "grad_norm": 2.2287986278533936, "learning_rate": 4.422626544901186e-05, "loss": 0.1044, "step": 19260 }, { "epoch": 1.7503065534311277, "grad_norm": 2.247929573059082, "learning_rate": 4.417377614014466e-05, "loss": 0.1853, "step": 19270 }, { "epoch": 1.7512148598937283, "grad_norm": 2.9143810272216797, "learning_rate": 4.412129333964944e-05, "loss": 0.1728, "step": 19280 }, { "epoch": 1.7521231663563286, "grad_norm": 1.3376961946487427, "learning_rate": 4.406881710615382e-05, "loss": 0.1901, "step": 19290 }, { "epoch": 1.7530314728189291, "grad_norm": 3.1697216033935547, "learning_rate": 4.401634749827808e-05, "loss": 0.166, "step": 19300 }, { "epoch": 1.7539397792815294, "grad_norm": 1.7700920104980469, "learning_rate": 4.396388457463507e-05, "loss": 0.1121, "step": 19310 }, { "epoch": 1.75484808574413, "grad_norm": 2.7552437782287598, "learning_rate": 4.391142839383022e-05, "loss": 0.1788, "step": 19320 }, { "epoch": 1.7557563922067305, "grad_norm": 0.30734342336654663, "learning_rate": 4.385897901446141e-05, "loss": 0.1138, "step": 19330 }, { "epoch": 1.756664698669331, "grad_norm": 1.20698082447052, "learning_rate": 4.3806536495118906e-05, "loss": 0.2088, "step": 19340 }, { "epoch": 1.7575730051319316, "grad_norm": 0.2987373471260071, "learning_rate": 4.3754100894385316e-05, "loss": 0.0957, "step": 19350 }, { "epoch": 1.7584813115945321, "grad_norm": 1.089237928390503, "learning_rate": 4.370167227083552e-05, "loss": 0.1313, "step": 19360 }, { "epoch": 1.7593896180571325, "grad_norm": 1.7021182775497437, "learning_rate": 4.3649250683036654e-05, "loss": 0.1453, "step": 19370 }, { "epoch": 1.760297924519733, "grad_norm": 1.5632026195526123, "learning_rate": 4.359683618954791e-05, "loss": 0.1176, "step": 19380 }, { "epoch": 1.7612062309823333, "grad_norm": 1.0127357244491577, "learning_rate": 4.354442884892059e-05, "loss": 0.2027, "step": 19390 }, { "epoch": 1.7621145374449338, "grad_norm": 0.9349935054779053, "learning_rate": 4.349202871969804e-05, "loss": 0.1963, "step": 19400 }, { "epoch": 1.7630228439075344, "grad_norm": 1.2781739234924316, "learning_rate": 4.343963586041547e-05, "loss": 0.1905, "step": 19410 }, { "epoch": 1.763931150370135, "grad_norm": 2.331536054611206, "learning_rate": 4.3387250329600076e-05, "loss": 0.1325, "step": 19420 }, { "epoch": 1.7648394568327355, "grad_norm": 0.4662609100341797, "learning_rate": 4.333487218577079e-05, "loss": 0.148, "step": 19430 }, { "epoch": 1.765747763295336, "grad_norm": 1.134279489517212, "learning_rate": 4.3282501487438324e-05, "loss": 0.2222, "step": 19440 }, { "epoch": 1.7666560697579363, "grad_norm": 0.6657999157905579, "learning_rate": 4.323013829310504e-05, "loss": 0.1546, "step": 19450 }, { "epoch": 1.7675643762205369, "grad_norm": 1.5725640058517456, "learning_rate": 4.3177782661264937e-05, "loss": 0.2037, "step": 19460 }, { "epoch": 1.7684726826831372, "grad_norm": 0.5658276677131653, "learning_rate": 4.31254346504036e-05, "loss": 0.1394, "step": 19470 }, { "epoch": 1.7693809891457377, "grad_norm": 1.2159111499786377, "learning_rate": 4.307309431899804e-05, "loss": 0.1175, "step": 19480 }, { "epoch": 1.7702892956083383, "grad_norm": 0.6346610188484192, "learning_rate": 4.302076172551674e-05, "loss": 0.1681, "step": 19490 }, { "epoch": 1.7711976020709388, "grad_norm": 2.5392987728118896, "learning_rate": 4.2968436928419503e-05, "loss": 0.1837, "step": 19500 }, { "epoch": 1.7711976020709388, "eval_loss": 0.20140187442302704, "eval_runtime": 1086.4651, "eval_samples_per_second": 9.008, "eval_steps_per_second": 9.008, "step": 19500 }, { "epoch": 1.7721059085335393, "grad_norm": 0.3040110170841217, "learning_rate": 4.291611998615743e-05, "loss": 0.1507, "step": 19510 }, { "epoch": 1.7730142149961396, "grad_norm": 2.1719319820404053, "learning_rate": 4.286381095717288e-05, "loss": 0.1951, "step": 19520 }, { "epoch": 1.7739225214587402, "grad_norm": 0.9042419791221619, "learning_rate": 4.281150989989933e-05, "loss": 0.0938, "step": 19530 }, { "epoch": 1.7748308279213405, "grad_norm": 1.1266772747039795, "learning_rate": 4.275921687276138e-05, "loss": 0.1602, "step": 19540 }, { "epoch": 1.775739134383941, "grad_norm": 0.44641727209091187, "learning_rate": 4.270693193417464e-05, "loss": 0.1245, "step": 19550 }, { "epoch": 1.7766474408465416, "grad_norm": 0.43591219186782837, "learning_rate": 4.265465514254568e-05, "loss": 0.139, "step": 19560 }, { "epoch": 1.7775557473091421, "grad_norm": 0.7812899351119995, "learning_rate": 4.2602386556272014e-05, "loss": 0.1698, "step": 19570 }, { "epoch": 1.7784640537717427, "grad_norm": 1.0261340141296387, "learning_rate": 4.2550126233741936e-05, "loss": 0.1627, "step": 19580 }, { "epoch": 1.7793723602343432, "grad_norm": 1.3732128143310547, "learning_rate": 4.249787423333455e-05, "loss": 0.2465, "step": 19590 }, { "epoch": 1.7802806666969435, "grad_norm": 2.1889240741729736, "learning_rate": 4.244563061341963e-05, "loss": 0.153, "step": 19600 }, { "epoch": 1.781188973159544, "grad_norm": 0.43228626251220703, "learning_rate": 4.239339543235759e-05, "loss": 0.1395, "step": 19610 }, { "epoch": 1.7820972796221444, "grad_norm": 0.02405843883752823, "learning_rate": 4.234116874849946e-05, "loss": 0.1429, "step": 19620 }, { "epoch": 1.783005586084745, "grad_norm": 1.355658769607544, "learning_rate": 4.228895062018674e-05, "loss": 0.1719, "step": 19630 }, { "epoch": 1.7839138925473454, "grad_norm": 2.314791202545166, "learning_rate": 4.2236741105751366e-05, "loss": 0.164, "step": 19640 }, { "epoch": 1.784822199009946, "grad_norm": 0.3163337707519531, "learning_rate": 4.218454026351568e-05, "loss": 0.182, "step": 19650 }, { "epoch": 1.7857305054725465, "grad_norm": 1.765668511390686, "learning_rate": 4.21323481517923e-05, "loss": 0.1863, "step": 19660 }, { "epoch": 1.786638811935147, "grad_norm": 0.7630751132965088, "learning_rate": 4.2080164828884154e-05, "loss": 0.2191, "step": 19670 }, { "epoch": 1.7875471183977474, "grad_norm": 0.6257327198982239, "learning_rate": 4.202799035308429e-05, "loss": 0.1878, "step": 19680 }, { "epoch": 1.788455424860348, "grad_norm": 1.6195189952850342, "learning_rate": 4.197582478267589e-05, "loss": 0.2058, "step": 19690 }, { "epoch": 1.7893637313229482, "grad_norm": 1.6898984909057617, "learning_rate": 4.1923668175932205e-05, "loss": 0.2052, "step": 19700 }, { "epoch": 1.7902720377855488, "grad_norm": 0.3579619228839874, "learning_rate": 4.1871520591116425e-05, "loss": 0.1011, "step": 19710 }, { "epoch": 1.7911803442481493, "grad_norm": 1.7606229782104492, "learning_rate": 4.181938208648176e-05, "loss": 0.1583, "step": 19720 }, { "epoch": 1.7920886507107499, "grad_norm": 0.2673881947994232, "learning_rate": 4.176725272027117e-05, "loss": 0.1611, "step": 19730 }, { "epoch": 1.7929969571733504, "grad_norm": 1.1541194915771484, "learning_rate": 4.1715132550717475e-05, "loss": 0.1136, "step": 19740 }, { "epoch": 1.793905263635951, "grad_norm": 1.7425777912139893, "learning_rate": 4.166302163604318e-05, "loss": 0.1495, "step": 19750 }, { "epoch": 1.7948135700985512, "grad_norm": 1.0040994882583618, "learning_rate": 4.1610920034460465e-05, "loss": 0.1682, "step": 19760 }, { "epoch": 1.7957218765611518, "grad_norm": 1.230970859527588, "learning_rate": 4.155882780417114e-05, "loss": 0.1227, "step": 19770 }, { "epoch": 1.796630183023752, "grad_norm": 0.25201743841171265, "learning_rate": 4.150674500336649e-05, "loss": 0.1461, "step": 19780 }, { "epoch": 1.7975384894863526, "grad_norm": 0.6179254651069641, "learning_rate": 4.145467169022732e-05, "loss": 0.139, "step": 19790 }, { "epoch": 1.7984467959489532, "grad_norm": 0.33761659264564514, "learning_rate": 4.1402607922923774e-05, "loss": 0.1402, "step": 19800 }, { "epoch": 1.7993551024115537, "grad_norm": 1.0617928504943848, "learning_rate": 4.135055375961539e-05, "loss": 0.0971, "step": 19810 }, { "epoch": 1.8002634088741543, "grad_norm": 0.7982646226882935, "learning_rate": 4.129850925845096e-05, "loss": 0.1471, "step": 19820 }, { "epoch": 1.8011717153367546, "grad_norm": 1.8193268775939941, "learning_rate": 4.124647447756847e-05, "loss": 0.1657, "step": 19830 }, { "epoch": 1.8020800217993551, "grad_norm": 1.1244407892227173, "learning_rate": 4.119444947509507e-05, "loss": 0.1948, "step": 19840 }, { "epoch": 1.8029883282619554, "grad_norm": 4.292942047119141, "learning_rate": 4.114243430914695e-05, "loss": 0.1564, "step": 19850 }, { "epoch": 1.803896634724556, "grad_norm": 0.8418575525283813, "learning_rate": 4.109042903782934e-05, "loss": 0.1078, "step": 19860 }, { "epoch": 1.8048049411871565, "grad_norm": 1.1477673053741455, "learning_rate": 4.103843371923644e-05, "loss": 0.1453, "step": 19870 }, { "epoch": 1.805713247649757, "grad_norm": 2.114877700805664, "learning_rate": 4.098644841145127e-05, "loss": 0.1637, "step": 19880 }, { "epoch": 1.8066215541123576, "grad_norm": 0.25494253635406494, "learning_rate": 4.0934473172545726e-05, "loss": 0.1903, "step": 19890 }, { "epoch": 1.8075298605749581, "grad_norm": 0.9145417213439941, "learning_rate": 4.088250806058043e-05, "loss": 0.1343, "step": 19900 }, { "epoch": 1.8084381670375584, "grad_norm": 0.5362402200698853, "learning_rate": 4.083055313360467e-05, "loss": 0.1347, "step": 19910 }, { "epoch": 1.809346473500159, "grad_norm": 0.4404456317424774, "learning_rate": 4.077860844965639e-05, "loss": 0.1722, "step": 19920 }, { "epoch": 1.8102547799627593, "grad_norm": 2.559523582458496, "learning_rate": 4.072667406676209e-05, "loss": 0.1243, "step": 19930 }, { "epoch": 1.8111630864253598, "grad_norm": 0.8919523358345032, "learning_rate": 4.067475004293675e-05, "loss": 0.1212, "step": 19940 }, { "epoch": 1.8120713928879604, "grad_norm": 0.23532530665397644, "learning_rate": 4.0622836436183775e-05, "loss": 0.1849, "step": 19950 }, { "epoch": 1.812979699350561, "grad_norm": 0.5030931830406189, "learning_rate": 4.0570933304494946e-05, "loss": 0.1012, "step": 19960 }, { "epoch": 1.8138880058131615, "grad_norm": 0.4811929166316986, "learning_rate": 4.051904070585034e-05, "loss": 0.1714, "step": 19970 }, { "epoch": 1.814796312275762, "grad_norm": 0.1442316323518753, "learning_rate": 4.0467158698218253e-05, "loss": 0.1742, "step": 19980 }, { "epoch": 1.8157046187383623, "grad_norm": 0.44783809781074524, "learning_rate": 4.0415287339555177e-05, "loss": 0.1415, "step": 19990 }, { "epoch": 1.8166129252009628, "grad_norm": 1.5393186807632446, "learning_rate": 4.036342668780565e-05, "loss": 0.1748, "step": 20000 }, { "epoch": 1.8166129252009628, "eval_loss": 0.19821995496749878, "eval_runtime": 1092.1197, "eval_samples_per_second": 8.961, "eval_steps_per_second": 8.961, "step": 20000 }, { "epoch": 1.8175212316635632, "grad_norm": 1.9841065406799316, "learning_rate": 4.0311576800902364e-05, "loss": 0.291, "step": 20010 }, { "epoch": 1.8184295381261637, "grad_norm": 2.335050582885742, "learning_rate": 4.0259737736765864e-05, "loss": 0.1713, "step": 20020 }, { "epoch": 1.8193378445887642, "grad_norm": 0.6070078611373901, "learning_rate": 4.0207909553304665e-05, "loss": 0.1301, "step": 20030 }, { "epoch": 1.8202461510513648, "grad_norm": 2.760035991668701, "learning_rate": 4.015609230841512e-05, "loss": 0.2415, "step": 20040 }, { "epoch": 1.8211544575139653, "grad_norm": 0.6763939261436462, "learning_rate": 4.0104286059981354e-05, "loss": 0.1887, "step": 20050 }, { "epoch": 1.8220627639765659, "grad_norm": 1.4762816429138184, "learning_rate": 4.0052490865875225e-05, "loss": 0.1838, "step": 20060 }, { "epoch": 1.8229710704391662, "grad_norm": 0.8568068742752075, "learning_rate": 4.000070678395622e-05, "loss": 0.1309, "step": 20070 }, { "epoch": 1.8238793769017665, "grad_norm": 1.625036358833313, "learning_rate": 3.9948933872071434e-05, "loss": 0.1936, "step": 20080 }, { "epoch": 1.824787683364367, "grad_norm": 1.3704102039337158, "learning_rate": 3.989717218805547e-05, "loss": 0.1522, "step": 20090 }, { "epoch": 1.8256959898269676, "grad_norm": 1.006104588508606, "learning_rate": 3.9845421789730376e-05, "loss": 0.0929, "step": 20100 }, { "epoch": 1.826604296289568, "grad_norm": 0.5888611078262329, "learning_rate": 3.979368273490564e-05, "loss": 0.1496, "step": 20110 }, { "epoch": 1.8275126027521686, "grad_norm": 0.28890353441238403, "learning_rate": 3.974195508137804e-05, "loss": 0.1757, "step": 20120 }, { "epoch": 1.8284209092147692, "grad_norm": 0.34688180685043335, "learning_rate": 3.969023888693161e-05, "loss": 0.0941, "step": 20130 }, { "epoch": 1.8293292156773695, "grad_norm": 0.8682500123977661, "learning_rate": 3.9638534209337616e-05, "loss": 0.1877, "step": 20140 }, { "epoch": 1.83023752213997, "grad_norm": 1.6337122917175293, "learning_rate": 3.9586841106354416e-05, "loss": 0.1475, "step": 20150 }, { "epoch": 1.8311458286025704, "grad_norm": 1.0977455377578735, "learning_rate": 3.95351596357275e-05, "loss": 0.111, "step": 20160 }, { "epoch": 1.832054135065171, "grad_norm": 1.8054147958755493, "learning_rate": 3.948348985518932e-05, "loss": 0.1395, "step": 20170 }, { "epoch": 1.8329624415277714, "grad_norm": 0.8663193583488464, "learning_rate": 3.943183182245926e-05, "loss": 0.1316, "step": 20180 }, { "epoch": 1.833870747990372, "grad_norm": 0.9408950209617615, "learning_rate": 3.938018559524362e-05, "loss": 0.1185, "step": 20190 }, { "epoch": 1.8347790544529725, "grad_norm": 0.6649243831634521, "learning_rate": 3.9328551231235455e-05, "loss": 0.1194, "step": 20200 }, { "epoch": 1.835687360915573, "grad_norm": 2.08208966255188, "learning_rate": 3.927692878811464e-05, "loss": 0.1959, "step": 20210 }, { "epoch": 1.8365956673781734, "grad_norm": 1.9714407920837402, "learning_rate": 3.9225318323547675e-05, "loss": 0.1342, "step": 20220 }, { "epoch": 1.837503973840774, "grad_norm": 1.7497401237487793, "learning_rate": 3.917371989518772e-05, "loss": 0.1146, "step": 20230 }, { "epoch": 1.8384122803033742, "grad_norm": 2.3434858322143555, "learning_rate": 3.9122133560674445e-05, "loss": 0.1435, "step": 20240 }, { "epoch": 1.8393205867659748, "grad_norm": 1.7198643684387207, "learning_rate": 3.907055937763404e-05, "loss": 0.1045, "step": 20250 }, { "epoch": 1.8402288932285753, "grad_norm": 3.9565136432647705, "learning_rate": 3.901899740367914e-05, "loss": 0.2201, "step": 20260 }, { "epoch": 1.8411371996911758, "grad_norm": 3.5163090229034424, "learning_rate": 3.8967447696408694e-05, "loss": 0.1204, "step": 20270 }, { "epoch": 1.8420455061537764, "grad_norm": 0.8689932227134705, "learning_rate": 3.8915910313407976e-05, "loss": 0.1065, "step": 20280 }, { "epoch": 1.842953812616377, "grad_norm": 2.9760847091674805, "learning_rate": 3.88643853122485e-05, "loss": 0.1819, "step": 20290 }, { "epoch": 1.8438621190789772, "grad_norm": 1.3984076976776123, "learning_rate": 3.881287275048789e-05, "loss": 0.2207, "step": 20300 }, { "epoch": 1.8447704255415778, "grad_norm": 1.2470929622650146, "learning_rate": 3.876137268566998e-05, "loss": 0.1387, "step": 20310 }, { "epoch": 1.845678732004178, "grad_norm": 0.3486076891422272, "learning_rate": 3.870988517532457e-05, "loss": 0.1337, "step": 20320 }, { "epoch": 1.8465870384667786, "grad_norm": 0.06163781136274338, "learning_rate": 3.8658410276967444e-05, "loss": 0.1813, "step": 20330 }, { "epoch": 1.8474953449293792, "grad_norm": 1.141906499862671, "learning_rate": 3.8606948048100314e-05, "loss": 0.1522, "step": 20340 }, { "epoch": 1.8484036513919797, "grad_norm": 3.399101972579956, "learning_rate": 3.8555498546210675e-05, "loss": 0.1841, "step": 20350 }, { "epoch": 1.8493119578545802, "grad_norm": 0.007434193976223469, "learning_rate": 3.850406182877192e-05, "loss": 0.1075, "step": 20360 }, { "epoch": 1.8502202643171806, "grad_norm": 0.5582946538925171, "learning_rate": 3.845263795324309e-05, "loss": 0.1225, "step": 20370 }, { "epoch": 1.851128570779781, "grad_norm": 0.2504165470600128, "learning_rate": 3.8401226977068875e-05, "loss": 0.153, "step": 20380 }, { "epoch": 1.8520368772423814, "grad_norm": 4.088414192199707, "learning_rate": 3.834982895767958e-05, "loss": 0.1185, "step": 20390 }, { "epoch": 1.852945183704982, "grad_norm": 0.9897136688232422, "learning_rate": 3.8298443952491006e-05, "loss": 0.135, "step": 20400 }, { "epoch": 1.8538534901675825, "grad_norm": 0.9349237084388733, "learning_rate": 3.8247072018904466e-05, "loss": 0.1368, "step": 20410 }, { "epoch": 1.854761796630183, "grad_norm": 0.5944536328315735, "learning_rate": 3.819571321430663e-05, "loss": 0.14, "step": 20420 }, { "epoch": 1.8556701030927836, "grad_norm": 1.4868252277374268, "learning_rate": 3.814436759606951e-05, "loss": 0.1367, "step": 20430 }, { "epoch": 1.8565784095553841, "grad_norm": 0.46415865421295166, "learning_rate": 3.80930352215504e-05, "loss": 0.1126, "step": 20440 }, { "epoch": 1.8574867160179844, "grad_norm": 0.9753912687301636, "learning_rate": 3.8041716148091746e-05, "loss": 0.1741, "step": 20450 }, { "epoch": 1.858395022480585, "grad_norm": 0.24143315851688385, "learning_rate": 3.799041043302125e-05, "loss": 0.1381, "step": 20460 }, { "epoch": 1.8593033289431853, "grad_norm": 0.614751398563385, "learning_rate": 3.7939118133651594e-05, "loss": 0.0932, "step": 20470 }, { "epoch": 1.8602116354057858, "grad_norm": 1.255395770072937, "learning_rate": 3.78878393072805e-05, "loss": 0.1605, "step": 20480 }, { "epoch": 1.8611199418683864, "grad_norm": 1.0295631885528564, "learning_rate": 3.783657401119063e-05, "loss": 0.1464, "step": 20490 }, { "epoch": 1.862028248330987, "grad_norm": 3.3065483570098877, "learning_rate": 3.778532230264953e-05, "loss": 0.156, "step": 20500 }, { "epoch": 1.862028248330987, "eval_loss": 0.1981230229139328, "eval_runtime": 1089.9889, "eval_samples_per_second": 8.979, "eval_steps_per_second": 8.979, "step": 20500 }, { "epoch": 1.8629365547935874, "grad_norm": 0.5942928791046143, "learning_rate": 3.7734084238909625e-05, "loss": 0.1022, "step": 20510 }, { "epoch": 1.863844861256188, "grad_norm": 7.581417560577393, "learning_rate": 3.7682859877208034e-05, "loss": 0.1121, "step": 20520 }, { "epoch": 1.8647531677187883, "grad_norm": 2.224248170852661, "learning_rate": 3.763164927476658e-05, "loss": 0.2676, "step": 20530 }, { "epoch": 1.8656614741813888, "grad_norm": 0.8873066306114197, "learning_rate": 3.758045248879173e-05, "loss": 0.1558, "step": 20540 }, { "epoch": 1.8665697806439892, "grad_norm": 0.2869255542755127, "learning_rate": 3.75292695764745e-05, "loss": 0.1121, "step": 20550 }, { "epoch": 1.8674780871065897, "grad_norm": 1.6005897521972656, "learning_rate": 3.7478100594990436e-05, "loss": 0.1408, "step": 20560 }, { "epoch": 1.8683863935691902, "grad_norm": 0.8744885325431824, "learning_rate": 3.7426945601499504e-05, "loss": 0.154, "step": 20570 }, { "epoch": 1.8692947000317908, "grad_norm": 2.1276752948760986, "learning_rate": 3.7375804653146035e-05, "loss": 0.1367, "step": 20580 }, { "epoch": 1.8702030064943913, "grad_norm": 2.7978110313415527, "learning_rate": 3.732467780705869e-05, "loss": 0.2085, "step": 20590 }, { "epoch": 1.8711113129569918, "grad_norm": 0.4769912660121918, "learning_rate": 3.727356512035034e-05, "loss": 0.1132, "step": 20600 }, { "epoch": 1.8720196194195922, "grad_norm": 1.1223340034484863, "learning_rate": 3.7222466650118114e-05, "loss": 0.1969, "step": 20610 }, { "epoch": 1.8729279258821927, "grad_norm": 1.4977586269378662, "learning_rate": 3.717138245344319e-05, "loss": 0.1457, "step": 20620 }, { "epoch": 1.873836232344793, "grad_norm": 2.932445526123047, "learning_rate": 3.712031258739084e-05, "loss": 0.1195, "step": 20630 }, { "epoch": 1.8747445388073936, "grad_norm": 0.3541969656944275, "learning_rate": 3.706925710901027e-05, "loss": 0.1506, "step": 20640 }, { "epoch": 1.875652845269994, "grad_norm": 0.8997638821601868, "learning_rate": 3.7018216075334664e-05, "loss": 0.0745, "step": 20650 }, { "epoch": 1.8765611517325946, "grad_norm": 0.15389789640903473, "learning_rate": 3.69671895433811e-05, "loss": 0.1461, "step": 20660 }, { "epoch": 1.8774694581951952, "grad_norm": 1.4367059469223022, "learning_rate": 3.691617757015039e-05, "loss": 0.1856, "step": 20670 }, { "epoch": 1.8783777646577955, "grad_norm": 0.35863494873046875, "learning_rate": 3.6865180212627116e-05, "loss": 0.1678, "step": 20680 }, { "epoch": 1.879286071120396, "grad_norm": 0.6292967200279236, "learning_rate": 3.6814197527779523e-05, "loss": 0.1854, "step": 20690 }, { "epoch": 1.8801943775829963, "grad_norm": 2.561809539794922, "learning_rate": 3.676322957255946e-05, "loss": 0.1748, "step": 20700 }, { "epoch": 1.8811026840455969, "grad_norm": 1.4465985298156738, "learning_rate": 3.671227640390236e-05, "loss": 0.1284, "step": 20710 }, { "epoch": 1.8820109905081974, "grad_norm": 0.058617353439331055, "learning_rate": 3.666133807872709e-05, "loss": 0.2453, "step": 20720 }, { "epoch": 1.882919296970798, "grad_norm": 1.2535580396652222, "learning_rate": 3.661041465393596e-05, "loss": 0.1144, "step": 20730 }, { "epoch": 1.8838276034333985, "grad_norm": 1.1666874885559082, "learning_rate": 3.655950618641463e-05, "loss": 0.1162, "step": 20740 }, { "epoch": 1.884735909895999, "grad_norm": 3.6423401832580566, "learning_rate": 3.650861273303202e-05, "loss": 0.1392, "step": 20750 }, { "epoch": 1.8856442163585994, "grad_norm": 0.10743678361177444, "learning_rate": 3.645773435064037e-05, "loss": 0.1083, "step": 20760 }, { "epoch": 1.8865525228212, "grad_norm": 1.1294463872909546, "learning_rate": 3.6406871096075e-05, "loss": 0.1142, "step": 20770 }, { "epoch": 1.8874608292838002, "grad_norm": 0.5666148066520691, "learning_rate": 3.635602302615434e-05, "loss": 0.1336, "step": 20780 }, { "epoch": 1.8883691357464008, "grad_norm": 0.43875178694725037, "learning_rate": 3.6305190197679884e-05, "loss": 0.1486, "step": 20790 }, { "epoch": 1.8892774422090013, "grad_norm": 0.5840688347816467, "learning_rate": 3.625437266743606e-05, "loss": 0.1667, "step": 20800 }, { "epoch": 1.8901857486716018, "grad_norm": 0.2697092592716217, "learning_rate": 3.62035704921903e-05, "loss": 0.1276, "step": 20810 }, { "epoch": 1.8910940551342024, "grad_norm": 2.958453893661499, "learning_rate": 3.6152783728692765e-05, "loss": 0.208, "step": 20820 }, { "epoch": 1.892002361596803, "grad_norm": 2.407276153564453, "learning_rate": 3.610201243367648e-05, "loss": 0.1676, "step": 20830 }, { "epoch": 1.8929106680594032, "grad_norm": 2.1915090084075928, "learning_rate": 3.6051256663857144e-05, "loss": 0.1881, "step": 20840 }, { "epoch": 1.8938189745220038, "grad_norm": 1.5261797904968262, "learning_rate": 3.6000516475933135e-05, "loss": 0.1564, "step": 20850 }, { "epoch": 1.894727280984604, "grad_norm": 2.4481401443481445, "learning_rate": 3.594979192658544e-05, "loss": 0.1502, "step": 20860 }, { "epoch": 1.8956355874472046, "grad_norm": 0.7441235780715942, "learning_rate": 3.589908307247755e-05, "loss": 0.1213, "step": 20870 }, { "epoch": 1.8965438939098052, "grad_norm": 0.9160979986190796, "learning_rate": 3.584838997025543e-05, "loss": 0.1799, "step": 20880 }, { "epoch": 1.8974522003724057, "grad_norm": 0.7887523174285889, "learning_rate": 3.579771267654746e-05, "loss": 0.1093, "step": 20890 }, { "epoch": 1.8983605068350062, "grad_norm": 2.4992544651031494, "learning_rate": 3.574705124796431e-05, "loss": 0.1654, "step": 20900 }, { "epoch": 1.8992688132976068, "grad_norm": 0.9239526987075806, "learning_rate": 3.5696405741099036e-05, "loss": 0.135, "step": 20910 }, { "epoch": 1.900177119760207, "grad_norm": 2.8231399059295654, "learning_rate": 3.5645776212526805e-05, "loss": 0.1941, "step": 20920 }, { "epoch": 1.9010854262228076, "grad_norm": 2.9628098011016846, "learning_rate": 3.559516271880496e-05, "loss": 0.1922, "step": 20930 }, { "epoch": 1.901993732685408, "grad_norm": 1.0248701572418213, "learning_rate": 3.554456531647297e-05, "loss": 0.1705, "step": 20940 }, { "epoch": 1.9029020391480085, "grad_norm": 1.8331882953643799, "learning_rate": 3.5493984062052264e-05, "loss": 0.1273, "step": 20950 }, { "epoch": 1.903810345610609, "grad_norm": 1.0698966979980469, "learning_rate": 3.544341901204631e-05, "loss": 0.1439, "step": 20960 }, { "epoch": 1.9047186520732096, "grad_norm": 1.3020763397216797, "learning_rate": 3.539287022294042e-05, "loss": 0.0899, "step": 20970 }, { "epoch": 1.90562695853581, "grad_norm": 1.472510576248169, "learning_rate": 3.534233775120176e-05, "loss": 0.1467, "step": 20980 }, { "epoch": 1.9065352649984104, "grad_norm": 1.2080647945404053, "learning_rate": 3.529182165327925e-05, "loss": 0.1342, "step": 20990 }, { "epoch": 1.907443571461011, "grad_norm": 0.9480942487716675, "learning_rate": 3.5241321985603534e-05, "loss": 0.1704, "step": 21000 }, { "epoch": 1.907443571461011, "eval_loss": 0.1924435943365097, "eval_runtime": 1107.9341, "eval_samples_per_second": 8.834, "eval_steps_per_second": 8.834, "step": 21000 }, { "epoch": 1.9083518779236113, "grad_norm": 0.721440851688385, "learning_rate": 3.519083880458691e-05, "loss": 0.1594, "step": 21010 }, { "epoch": 1.9092601843862118, "grad_norm": 0.6426498889923096, "learning_rate": 3.514037216662325e-05, "loss": 0.1542, "step": 21020 }, { "epoch": 1.9101684908488124, "grad_norm": 0.7371048331260681, "learning_rate": 3.508992212808794e-05, "loss": 0.1201, "step": 21030 }, { "epoch": 1.911076797311413, "grad_norm": 0.9668445587158203, "learning_rate": 3.503948874533783e-05, "loss": 0.1382, "step": 21040 }, { "epoch": 1.9119851037740134, "grad_norm": 0.9656088948249817, "learning_rate": 3.4989072074711135e-05, "loss": 0.1641, "step": 21050 }, { "epoch": 1.912893410236614, "grad_norm": 0.6991058588027954, "learning_rate": 3.49386721725275e-05, "loss": 0.1565, "step": 21060 }, { "epoch": 1.9138017166992143, "grad_norm": 1.5379269123077393, "learning_rate": 3.4888289095087715e-05, "loss": 0.1843, "step": 21070 }, { "epoch": 1.9147100231618148, "grad_norm": 1.7456004619598389, "learning_rate": 3.4837922898673836e-05, "loss": 0.1241, "step": 21080 }, { "epoch": 1.9156183296244151, "grad_norm": 1.643217921257019, "learning_rate": 3.4787573639549033e-05, "loss": 0.1169, "step": 21090 }, { "epoch": 1.9165266360870157, "grad_norm": 0.5529202818870544, "learning_rate": 3.473724137395762e-05, "loss": 0.1051, "step": 21100 }, { "epoch": 1.9174349425496162, "grad_norm": 0.540319561958313, "learning_rate": 3.4686926158124855e-05, "loss": 0.1314, "step": 21110 }, { "epoch": 1.9183432490122168, "grad_norm": 2.5608975887298584, "learning_rate": 3.4636628048256995e-05, "loss": 0.1693, "step": 21120 }, { "epoch": 1.9192515554748173, "grad_norm": 1.838441014289856, "learning_rate": 3.4586347100541164e-05, "loss": 0.183, "step": 21130 }, { "epoch": 1.9201598619374178, "grad_norm": 1.1969835758209229, "learning_rate": 3.45360833711453e-05, "loss": 0.1116, "step": 21140 }, { "epoch": 1.9210681684000182, "grad_norm": 1.4026541709899902, "learning_rate": 3.448583691621817e-05, "loss": 0.1299, "step": 21150 }, { "epoch": 1.9219764748626187, "grad_norm": 1.5592889785766602, "learning_rate": 3.4435607791889176e-05, "loss": 0.1541, "step": 21160 }, { "epoch": 1.922884781325219, "grad_norm": 2.16625714302063, "learning_rate": 3.4385396054268405e-05, "loss": 0.1004, "step": 21170 }, { "epoch": 1.9237930877878195, "grad_norm": 0.8922967314720154, "learning_rate": 3.433520175944649e-05, "loss": 0.1335, "step": 21180 }, { "epoch": 1.92470139425042, "grad_norm": 1.2232451438903809, "learning_rate": 3.428502496349457e-05, "loss": 0.1269, "step": 21190 }, { "epoch": 1.9256097007130206, "grad_norm": 1.2074873447418213, "learning_rate": 3.423486572246433e-05, "loss": 0.1725, "step": 21200 }, { "epoch": 1.9265180071756212, "grad_norm": 0.27605748176574707, "learning_rate": 3.418472409238772e-05, "loss": 0.1004, "step": 21210 }, { "epoch": 1.9274263136382217, "grad_norm": 0.33344629406929016, "learning_rate": 3.413460012927707e-05, "loss": 0.102, "step": 21220 }, { "epoch": 1.928334620100822, "grad_norm": 0.6134114265441895, "learning_rate": 3.4084493889125e-05, "loss": 0.1377, "step": 21230 }, { "epoch": 1.9292429265634223, "grad_norm": 4.1631693840026855, "learning_rate": 3.403440542790427e-05, "loss": 0.2563, "step": 21240 }, { "epoch": 1.9301512330260229, "grad_norm": 5.534477710723877, "learning_rate": 3.3984334801567856e-05, "loss": 0.1859, "step": 21250 }, { "epoch": 1.9310595394886234, "grad_norm": 2.4325449466705322, "learning_rate": 3.3934282066048765e-05, "loss": 0.1428, "step": 21260 }, { "epoch": 1.931967845951224, "grad_norm": 0.7062081694602966, "learning_rate": 3.3884247277260025e-05, "loss": 0.0744, "step": 21270 }, { "epoch": 1.9328761524138245, "grad_norm": 4.87678861618042, "learning_rate": 3.38342304910946e-05, "loss": 0.1616, "step": 21280 }, { "epoch": 1.933784458876425, "grad_norm": 4.487456321716309, "learning_rate": 3.378423176342537e-05, "loss": 0.2677, "step": 21290 }, { "epoch": 1.9346927653390253, "grad_norm": 1.1641885042190552, "learning_rate": 3.3734251150105046e-05, "loss": 0.1546, "step": 21300 }, { "epoch": 1.9356010718016259, "grad_norm": 1.4061802625656128, "learning_rate": 3.3684288706966074e-05, "loss": 0.1711, "step": 21310 }, { "epoch": 1.9365093782642262, "grad_norm": 3.595653772354126, "learning_rate": 3.3634344489820625e-05, "loss": 0.1579, "step": 21320 }, { "epoch": 1.9374176847268267, "grad_norm": 0.6807557344436646, "learning_rate": 3.3584418554460495e-05, "loss": 0.1781, "step": 21330 }, { "epoch": 1.9383259911894273, "grad_norm": 2.0941009521484375, "learning_rate": 3.353451095665707e-05, "loss": 0.1449, "step": 21340 }, { "epoch": 1.9392342976520278, "grad_norm": 1.9189091920852661, "learning_rate": 3.348462175216126e-05, "loss": 0.1735, "step": 21350 }, { "epoch": 1.9401426041146284, "grad_norm": 1.290285587310791, "learning_rate": 3.3434750996703415e-05, "loss": 0.092, "step": 21360 }, { "epoch": 1.941050910577229, "grad_norm": 0.7163854241371155, "learning_rate": 3.3384898745993254e-05, "loss": 0.1389, "step": 21370 }, { "epoch": 1.9419592170398292, "grad_norm": 0.808363676071167, "learning_rate": 3.333506505571987e-05, "loss": 0.2142, "step": 21380 }, { "epoch": 1.9428675235024297, "grad_norm": 1.7106871604919434, "learning_rate": 3.328524998155157e-05, "loss": 0.1515, "step": 21390 }, { "epoch": 1.94377582996503, "grad_norm": 1.4641573429107666, "learning_rate": 3.323545357913594e-05, "loss": 0.1378, "step": 21400 }, { "epoch": 1.9446841364276306, "grad_norm": 5.427967548370361, "learning_rate": 3.3185675904099645e-05, "loss": 0.1905, "step": 21410 }, { "epoch": 1.9455924428902311, "grad_norm": 0.6180101633071899, "learning_rate": 3.3135917012048436e-05, "loss": 0.1305, "step": 21420 }, { "epoch": 1.9465007493528317, "grad_norm": 0.38064128160476685, "learning_rate": 3.308617695856711e-05, "loss": 0.1231, "step": 21430 }, { "epoch": 1.9474090558154322, "grad_norm": 1.2531800270080566, "learning_rate": 3.303645579921939e-05, "loss": 0.1891, "step": 21440 }, { "epoch": 1.9483173622780328, "grad_norm": 0.014885921031236649, "learning_rate": 3.298675358954791e-05, "loss": 0.0972, "step": 21450 }, { "epoch": 1.949225668740633, "grad_norm": 0.5043304562568665, "learning_rate": 3.293707038507415e-05, "loss": 0.1213, "step": 21460 }, { "epoch": 1.9501339752032336, "grad_norm": 1.180209994316101, "learning_rate": 3.288740624129833e-05, "loss": 0.1533, "step": 21470 }, { "epoch": 1.951042281665834, "grad_norm": 1.681359052658081, "learning_rate": 3.283776121369938e-05, "loss": 0.1097, "step": 21480 }, { "epoch": 1.9519505881284345, "grad_norm": 0.8610898852348328, "learning_rate": 3.2788135357734886e-05, "loss": 0.1479, "step": 21490 }, { "epoch": 1.952858894591035, "grad_norm": 0.6285989880561829, "learning_rate": 3.273852872884104e-05, "loss": 0.1532, "step": 21500 }, { "epoch": 1.952858894591035, "eval_loss": 0.1962735652923584, "eval_runtime": 1097.1274, "eval_samples_per_second": 8.921, "eval_steps_per_second": 8.921, "step": 21500 }, { "epoch": 1.9537672010536355, "grad_norm": 2.368990182876587, "learning_rate": 3.268894138243251e-05, "loss": 0.1507, "step": 21510 }, { "epoch": 1.954675507516236, "grad_norm": 1.8356529474258423, "learning_rate": 3.263937337390246e-05, "loss": 0.1624, "step": 21520 }, { "epoch": 1.9555838139788364, "grad_norm": 2.4216983318328857, "learning_rate": 3.2589824758622437e-05, "loss": 0.1596, "step": 21530 }, { "epoch": 1.956492120441437, "grad_norm": 0.719409167766571, "learning_rate": 3.254029559194229e-05, "loss": 0.202, "step": 21540 }, { "epoch": 1.9574004269040373, "grad_norm": 1.5214760303497314, "learning_rate": 3.249078592919024e-05, "loss": 0.1351, "step": 21550 }, { "epoch": 1.9583087333666378, "grad_norm": 1.4792649745941162, "learning_rate": 3.2441295825672616e-05, "loss": 0.1691, "step": 21560 }, { "epoch": 1.9592170398292383, "grad_norm": 0.470086008310318, "learning_rate": 3.2391825336673966e-05, "loss": 0.1204, "step": 21570 }, { "epoch": 1.9601253462918389, "grad_norm": 0.7077327370643616, "learning_rate": 3.234237451745687e-05, "loss": 0.1387, "step": 21580 }, { "epoch": 1.9610336527544394, "grad_norm": 0.8499559164047241, "learning_rate": 3.229294342326199e-05, "loss": 0.1193, "step": 21590 }, { "epoch": 1.96194195921704, "grad_norm": 1.9897165298461914, "learning_rate": 3.224353210930794e-05, "loss": 0.1114, "step": 21600 }, { "epoch": 1.9628502656796403, "grad_norm": 1.3853709697723389, "learning_rate": 3.21941406307912e-05, "loss": 0.1224, "step": 21610 }, { "epoch": 1.9637585721422408, "grad_norm": 0.8446764349937439, "learning_rate": 3.214476904288614e-05, "loss": 0.1602, "step": 21620 }, { "epoch": 1.9646668786048411, "grad_norm": 1.8705095052719116, "learning_rate": 3.2095417400744885e-05, "loss": 0.1638, "step": 21630 }, { "epoch": 1.9655751850674417, "grad_norm": 0.8816527724266052, "learning_rate": 3.2046085759497266e-05, "loss": 0.1566, "step": 21640 }, { "epoch": 1.9664834915300422, "grad_norm": 0.9782041907310486, "learning_rate": 3.199677417425082e-05, "loss": 0.1483, "step": 21650 }, { "epoch": 1.9673917979926427, "grad_norm": 1.2924858331680298, "learning_rate": 3.194748270009065e-05, "loss": 0.1484, "step": 21660 }, { "epoch": 1.9683001044552433, "grad_norm": 0.4171060621738434, "learning_rate": 3.1898211392079356e-05, "loss": 0.1116, "step": 21670 }, { "epoch": 1.9692084109178438, "grad_norm": 2.249924421310425, "learning_rate": 3.184896030525709e-05, "loss": 0.1889, "step": 21680 }, { "epoch": 1.9701167173804441, "grad_norm": 1.2682596445083618, "learning_rate": 3.179972949464133e-05, "loss": 0.1594, "step": 21690 }, { "epoch": 1.9710250238430447, "grad_norm": 0.7607709169387817, "learning_rate": 3.175051901522699e-05, "loss": 0.0829, "step": 21700 }, { "epoch": 1.971933330305645, "grad_norm": 4.253425598144531, "learning_rate": 3.170132892198621e-05, "loss": 0.1399, "step": 21710 }, { "epoch": 1.9728416367682455, "grad_norm": 0.9376879334449768, "learning_rate": 3.165215926986838e-05, "loss": 0.1461, "step": 21720 }, { "epoch": 1.973749943230846, "grad_norm": 0.9389997124671936, "learning_rate": 3.1603010113800045e-05, "loss": 0.188, "step": 21730 }, { "epoch": 1.9746582496934466, "grad_norm": 0.8276881575584412, "learning_rate": 3.1553881508684855e-05, "loss": 0.1643, "step": 21740 }, { "epoch": 1.9755665561560471, "grad_norm": 2.138993263244629, "learning_rate": 3.150477350940354e-05, "loss": 0.1293, "step": 21750 }, { "epoch": 1.9764748626186477, "grad_norm": 0.6379855871200562, "learning_rate": 3.145568617081375e-05, "loss": 0.1547, "step": 21760 }, { "epoch": 1.977383169081248, "grad_norm": 1.6409012079238892, "learning_rate": 3.1406619547750095e-05, "loss": 0.2114, "step": 21770 }, { "epoch": 1.9782914755438485, "grad_norm": 1.0750148296356201, "learning_rate": 3.1357573695024034e-05, "loss": 0.1332, "step": 21780 }, { "epoch": 1.9791997820064489, "grad_norm": 0.4999237060546875, "learning_rate": 3.130854866742382e-05, "loss": 0.1146, "step": 21790 }, { "epoch": 1.9801080884690494, "grad_norm": 1.2502321004867554, "learning_rate": 3.1259544519714454e-05, "loss": 0.1037, "step": 21800 }, { "epoch": 1.98101639493165, "grad_norm": 2.3031883239746094, "learning_rate": 3.121056130663761e-05, "loss": 0.156, "step": 21810 }, { "epoch": 1.9819247013942505, "grad_norm": 0.6641430258750916, "learning_rate": 3.116159908291156e-05, "loss": 0.1015, "step": 21820 }, { "epoch": 1.982833007856851, "grad_norm": 2.2317073345184326, "learning_rate": 3.111265790323116e-05, "loss": 0.0947, "step": 21830 }, { "epoch": 1.9837413143194513, "grad_norm": 1.7947218418121338, "learning_rate": 3.10637378222677e-05, "loss": 0.1937, "step": 21840 }, { "epoch": 1.9846496207820519, "grad_norm": 1.496112585067749, "learning_rate": 3.1014838894668986e-05, "loss": 0.159, "step": 21850 }, { "epoch": 1.9855579272446522, "grad_norm": 0.9568132162094116, "learning_rate": 3.096596117505913e-05, "loss": 0.1038, "step": 21860 }, { "epoch": 1.9864662337072527, "grad_norm": 1.1959569454193115, "learning_rate": 3.091710471803857e-05, "loss": 0.1353, "step": 21870 }, { "epoch": 1.9873745401698533, "grad_norm": 0.46885553002357483, "learning_rate": 3.0868269578184004e-05, "loss": 0.1671, "step": 21880 }, { "epoch": 1.9882828466324538, "grad_norm": 1.8575822114944458, "learning_rate": 3.081945581004827e-05, "loss": 0.1939, "step": 21890 }, { "epoch": 1.9891911530950543, "grad_norm": 0.11176953464746475, "learning_rate": 3.077066346816041e-05, "loss": 0.1178, "step": 21900 }, { "epoch": 1.9900994595576549, "grad_norm": 0.7612248659133911, "learning_rate": 3.072189260702547e-05, "loss": 0.1441, "step": 21910 }, { "epoch": 1.9910077660202552, "grad_norm": 0.25794652104377747, "learning_rate": 3.067314328112452e-05, "loss": 0.1064, "step": 21920 }, { "epoch": 1.9919160724828557, "grad_norm": 1.4852784872055054, "learning_rate": 3.062441554491458e-05, "loss": 0.1608, "step": 21930 }, { "epoch": 1.992824378945456, "grad_norm": 2.457210063934326, "learning_rate": 3.057570945282853e-05, "loss": 0.19, "step": 21940 }, { "epoch": 1.9937326854080566, "grad_norm": 1.9175115823745728, "learning_rate": 3.0527025059275104e-05, "loss": 0.1891, "step": 21950 }, { "epoch": 1.9946409918706571, "grad_norm": 0.3366178870201111, "learning_rate": 3.0478362418638772e-05, "loss": 0.1393, "step": 21960 }, { "epoch": 1.9955492983332577, "grad_norm": 2.275217294692993, "learning_rate": 3.0429721585279717e-05, "loss": 0.1267, "step": 21970 }, { "epoch": 1.9964576047958582, "grad_norm": 0.4795277714729309, "learning_rate": 3.038110261353375e-05, "loss": 0.1961, "step": 21980 }, { "epoch": 1.9973659112584587, "grad_norm": 0.8578714728355408, "learning_rate": 3.0332505557712254e-05, "loss": 0.1681, "step": 21990 }, { "epoch": 1.998274217721059, "grad_norm": 2.699740409851074, "learning_rate": 3.0283930472102174e-05, "loss": 0.1719, "step": 22000 }, { "epoch": 1.998274217721059, "eval_loss": 0.19198648631572723, "eval_runtime": 1093.7598, "eval_samples_per_second": 8.948, "eval_steps_per_second": 8.948, "step": 22000 }, { "epoch": 1.9991825241836596, "grad_norm": 0.9176803231239319, "learning_rate": 3.023537741096587e-05, "loss": 0.1218, "step": 22010 }, { "epoch": 2.00009083064626, "grad_norm": 1.5087378025054932, "learning_rate": 3.0186846428541117e-05, "loss": 0.1614, "step": 22020 }, { "epoch": 2.0009991371088605, "grad_norm": 0.11143729835748672, "learning_rate": 3.013833757904101e-05, "loss": 0.0733, "step": 22030 }, { "epoch": 2.001907443571461, "grad_norm": 0.5393356680870056, "learning_rate": 3.0089850916653907e-05, "loss": 0.1488, "step": 22040 }, { "epoch": 2.0028157500340615, "grad_norm": 1.0374938249588013, "learning_rate": 3.0041386495543444e-05, "loss": 0.0712, "step": 22050 }, { "epoch": 2.003724056496662, "grad_norm": 0.07894042879343033, "learning_rate": 2.9992944369848364e-05, "loss": 0.0883, "step": 22060 }, { "epoch": 2.0046323629592626, "grad_norm": 0.8520058989524841, "learning_rate": 2.99445245936825e-05, "loss": 0.125, "step": 22070 }, { "epoch": 2.005540669421863, "grad_norm": 0.24248750507831573, "learning_rate": 2.9896127221134733e-05, "loss": 0.0835, "step": 22080 }, { "epoch": 2.0064489758844632, "grad_norm": 0.24489916861057281, "learning_rate": 2.9847752306268907e-05, "loss": 0.0936, "step": 22090 }, { "epoch": 2.007357282347064, "grad_norm": 0.41195234656333923, "learning_rate": 2.979939990312381e-05, "loss": 0.1181, "step": 22100 }, { "epoch": 2.0082655888096643, "grad_norm": 1.1340333223342896, "learning_rate": 2.9751070065713038e-05, "loss": 0.0848, "step": 22110 }, { "epoch": 2.009173895272265, "grad_norm": 1.0265529155731201, "learning_rate": 2.970276284802501e-05, "loss": 0.1165, "step": 22120 }, { "epoch": 2.0100822017348654, "grad_norm": 1.1120764017105103, "learning_rate": 2.965447830402287e-05, "loss": 0.1408, "step": 22130 }, { "epoch": 2.010990508197466, "grad_norm": 0.7637825608253479, "learning_rate": 2.960621648764441e-05, "loss": 0.1088, "step": 22140 }, { "epoch": 2.0118988146600665, "grad_norm": 1.1629550457000732, "learning_rate": 2.9557977452802087e-05, "loss": 0.1347, "step": 22150 }, { "epoch": 2.0128071211226666, "grad_norm": 0.09417810291051865, "learning_rate": 2.9509761253382862e-05, "loss": 0.1376, "step": 22160 }, { "epoch": 2.013715427585267, "grad_norm": 2.658820390701294, "learning_rate": 2.94615679432482e-05, "loss": 0.1768, "step": 22170 }, { "epoch": 2.0146237340478677, "grad_norm": 0.6738539338111877, "learning_rate": 2.9413397576234004e-05, "loss": 0.1067, "step": 22180 }, { "epoch": 2.015532040510468, "grad_norm": 0.9092407822608948, "learning_rate": 2.9365250206150507e-05, "loss": 0.093, "step": 22190 }, { "epoch": 2.0164403469730687, "grad_norm": 0.3149075508117676, "learning_rate": 2.9317125886782336e-05, "loss": 0.1114, "step": 22200 }, { "epoch": 2.0173486534356693, "grad_norm": 0.8476216793060303, "learning_rate": 2.9269024671888305e-05, "loss": 0.0918, "step": 22210 }, { "epoch": 2.01825695989827, "grad_norm": 1.343492865562439, "learning_rate": 2.9220946615201428e-05, "loss": 0.1097, "step": 22220 }, { "epoch": 2.0191652663608703, "grad_norm": 0.8078330159187317, "learning_rate": 2.917289177042884e-05, "loss": 0.1009, "step": 22230 }, { "epoch": 2.0200735728234704, "grad_norm": 0.7090185284614563, "learning_rate": 2.912486019125179e-05, "loss": 0.0731, "step": 22240 }, { "epoch": 2.020981879286071, "grad_norm": 0.38304615020751953, "learning_rate": 2.9076851931325504e-05, "loss": 0.1358, "step": 22250 }, { "epoch": 2.0218901857486715, "grad_norm": 0.7919134497642517, "learning_rate": 2.902886704427915e-05, "loss": 0.1206, "step": 22260 }, { "epoch": 2.022798492211272, "grad_norm": 2.2521631717681885, "learning_rate": 2.8980905583715807e-05, "loss": 0.0934, "step": 22270 }, { "epoch": 2.0237067986738726, "grad_norm": 1.4108318090438843, "learning_rate": 2.8932967603212374e-05, "loss": 0.1522, "step": 22280 }, { "epoch": 2.024615105136473, "grad_norm": 0.8378875851631165, "learning_rate": 2.8885053156319524e-05, "loss": 0.1436, "step": 22290 }, { "epoch": 2.0255234115990737, "grad_norm": 0.4033800959587097, "learning_rate": 2.8837162296561642e-05, "loss": 0.1165, "step": 22300 }, { "epoch": 2.026431718061674, "grad_norm": 1.7720831632614136, "learning_rate": 2.8789295077436764e-05, "loss": 0.2077, "step": 22310 }, { "epoch": 2.0273400245242743, "grad_norm": 2.8476080894470215, "learning_rate": 2.874145155241652e-05, "loss": 0.0998, "step": 22320 }, { "epoch": 2.028248330986875, "grad_norm": 0.3608739376068115, "learning_rate": 2.869363177494604e-05, "loss": 0.1152, "step": 22330 }, { "epoch": 2.0291566374494754, "grad_norm": 0.7594811916351318, "learning_rate": 2.8645835798444e-05, "loss": 0.0967, "step": 22340 }, { "epoch": 2.030064943912076, "grad_norm": 1.7172287702560425, "learning_rate": 2.8598063676302424e-05, "loss": 0.1173, "step": 22350 }, { "epoch": 2.0309732503746765, "grad_norm": 0.0576055645942688, "learning_rate": 2.85503154618867e-05, "loss": 0.1003, "step": 22360 }, { "epoch": 2.031881556837277, "grad_norm": 2.492870569229126, "learning_rate": 2.850259120853553e-05, "loss": 0.1289, "step": 22370 }, { "epoch": 2.0327898632998775, "grad_norm": 0.6789908409118652, "learning_rate": 2.8454890969560803e-05, "loss": 0.1144, "step": 22380 }, { "epoch": 2.033698169762478, "grad_norm": 0.7932015061378479, "learning_rate": 2.8407214798247668e-05, "loss": 0.0893, "step": 22390 }, { "epoch": 2.034606476225078, "grad_norm": 0.8867520689964294, "learning_rate": 2.8359562747854308e-05, "loss": 0.1465, "step": 22400 }, { "epoch": 2.0355147826876787, "grad_norm": 1.4294145107269287, "learning_rate": 2.8311934871611982e-05, "loss": 0.1224, "step": 22410 }, { "epoch": 2.0364230891502793, "grad_norm": 0.8710873126983643, "learning_rate": 2.8264331222724967e-05, "loss": 0.1028, "step": 22420 }, { "epoch": 2.03733139561288, "grad_norm": 0.7939785718917847, "learning_rate": 2.8216751854370448e-05, "loss": 0.0867, "step": 22430 }, { "epoch": 2.0382397020754803, "grad_norm": 0.8302410244941711, "learning_rate": 2.81691968196985e-05, "loss": 0.0966, "step": 22440 }, { "epoch": 2.039148008538081, "grad_norm": 0.9981852769851685, "learning_rate": 2.8121666171832023e-05, "loss": 0.0862, "step": 22450 }, { "epoch": 2.0400563150006814, "grad_norm": 0.9211143851280212, "learning_rate": 2.8074159963866652e-05, "loss": 0.1157, "step": 22460 }, { "epoch": 2.0409646214632815, "grad_norm": 0.01429961621761322, "learning_rate": 2.8026678248870752e-05, "loss": 0.0598, "step": 22470 }, { "epoch": 2.041872927925882, "grad_norm": 1.2494324445724487, "learning_rate": 2.7979221079885275e-05, "loss": 0.1385, "step": 22480 }, { "epoch": 2.0427812343884826, "grad_norm": 1.3855364322662354, "learning_rate": 2.793178850992383e-05, "loss": 0.08, "step": 22490 }, { "epoch": 2.043689540851083, "grad_norm": 1.3549965620040894, "learning_rate": 2.788438059197249e-05, "loss": 0.0699, "step": 22500 }, { "epoch": 2.043689540851083, "eval_loss": 0.20184771716594696, "eval_runtime": 1090.3183, "eval_samples_per_second": 8.976, "eval_steps_per_second": 8.976, "step": 22500 }, { "epoch": 2.0445978473136837, "grad_norm": 0.3872697651386261, "learning_rate": 2.783699737898981e-05, "loss": 0.0974, "step": 22510 }, { "epoch": 2.045506153776284, "grad_norm": 0.015475944615900517, "learning_rate": 2.7789638923906723e-05, "loss": 0.0699, "step": 22520 }, { "epoch": 2.0464144602388847, "grad_norm": 0.4982447624206543, "learning_rate": 2.774230527962653e-05, "loss": 0.1117, "step": 22530 }, { "epoch": 2.0473227667014853, "grad_norm": 0.26215070486068726, "learning_rate": 2.7694996499024827e-05, "loss": 0.1213, "step": 22540 }, { "epoch": 2.0482310731640854, "grad_norm": 1.2340055704116821, "learning_rate": 2.764771263494941e-05, "loss": 0.1672, "step": 22550 }, { "epoch": 2.049139379626686, "grad_norm": 2.488434076309204, "learning_rate": 2.7600453740220255e-05, "loss": 0.1251, "step": 22560 }, { "epoch": 2.0500476860892864, "grad_norm": 0.8141029477119446, "learning_rate": 2.7553219867629432e-05, "loss": 0.0712, "step": 22570 }, { "epoch": 2.050955992551887, "grad_norm": 0.18456272780895233, "learning_rate": 2.7506011069941066e-05, "loss": 0.0744, "step": 22580 }, { "epoch": 2.0518642990144875, "grad_norm": 2.0731565952301025, "learning_rate": 2.7458827399891284e-05, "loss": 0.0853, "step": 22590 }, { "epoch": 2.052772605477088, "grad_norm": 0.681673526763916, "learning_rate": 2.741166891018812e-05, "loss": 0.1441, "step": 22600 }, { "epoch": 2.0536809119396886, "grad_norm": 0.8069421052932739, "learning_rate": 2.7364535653511504e-05, "loss": 0.1148, "step": 22610 }, { "epoch": 2.054589218402289, "grad_norm": 0.43028461933135986, "learning_rate": 2.7317427682513153e-05, "loss": 0.1, "step": 22620 }, { "epoch": 2.0554975248648892, "grad_norm": 2.708228588104248, "learning_rate": 2.7270345049816537e-05, "loss": 0.1181, "step": 22630 }, { "epoch": 2.0564058313274898, "grad_norm": 0.7179459929466248, "learning_rate": 2.7223287808016873e-05, "loss": 0.0707, "step": 22640 }, { "epoch": 2.0573141377900903, "grad_norm": 0.41829797625541687, "learning_rate": 2.7176256009680966e-05, "loss": 0.1057, "step": 22650 }, { "epoch": 2.058222444252691, "grad_norm": 0.9957857728004456, "learning_rate": 2.7129249707347197e-05, "loss": 0.0617, "step": 22660 }, { "epoch": 2.0591307507152914, "grad_norm": 0.5787109136581421, "learning_rate": 2.7082268953525486e-05, "loss": 0.1286, "step": 22670 }, { "epoch": 2.060039057177892, "grad_norm": 0.6710687279701233, "learning_rate": 2.7035313800697183e-05, "loss": 0.082, "step": 22680 }, { "epoch": 2.0609473636404925, "grad_norm": 0.5279198884963989, "learning_rate": 2.6988384301315095e-05, "loss": 0.0735, "step": 22690 }, { "epoch": 2.0618556701030926, "grad_norm": 0.7604414820671082, "learning_rate": 2.6941480507803328e-05, "loss": 0.1092, "step": 22700 }, { "epoch": 2.062763976565693, "grad_norm": 1.4776630401611328, "learning_rate": 2.689460247255728e-05, "loss": 0.1197, "step": 22710 }, { "epoch": 2.0636722830282936, "grad_norm": 1.9054956436157227, "learning_rate": 2.6847750247943576e-05, "loss": 0.1166, "step": 22720 }, { "epoch": 2.064580589490894, "grad_norm": 1.7560817003250122, "learning_rate": 2.680092388630001e-05, "loss": 0.1105, "step": 22730 }, { "epoch": 2.0654888959534947, "grad_norm": 0.32929155230522156, "learning_rate": 2.6754123439935487e-05, "loss": 0.1664, "step": 22740 }, { "epoch": 2.0663972024160953, "grad_norm": 1.9386165142059326, "learning_rate": 2.6707348961129965e-05, "loss": 0.0696, "step": 22750 }, { "epoch": 2.067305508878696, "grad_norm": 0.5414419174194336, "learning_rate": 2.6660600502134382e-05, "loss": 0.1071, "step": 22760 }, { "epoch": 2.0682138153412963, "grad_norm": 1.922927737236023, "learning_rate": 2.6613878115170622e-05, "loss": 0.1643, "step": 22770 }, { "epoch": 2.0691221218038964, "grad_norm": 0.6423650979995728, "learning_rate": 2.6567181852431417e-05, "loss": 0.106, "step": 22780 }, { "epoch": 2.070030428266497, "grad_norm": 0.9914568662643433, "learning_rate": 2.652051176608038e-05, "loss": 0.1071, "step": 22790 }, { "epoch": 2.0709387347290975, "grad_norm": 3.4619147777557373, "learning_rate": 2.6473867908251827e-05, "loss": 0.1314, "step": 22800 }, { "epoch": 2.071847041191698, "grad_norm": 1.7971118688583374, "learning_rate": 2.642725033105078e-05, "loss": 0.0797, "step": 22810 }, { "epoch": 2.0727553476542986, "grad_norm": 1.4912705421447754, "learning_rate": 2.6380659086552928e-05, "loss": 0.1113, "step": 22820 }, { "epoch": 2.073663654116899, "grad_norm": 0.017287755385041237, "learning_rate": 2.633409422680449e-05, "loss": 0.0783, "step": 22830 }, { "epoch": 2.0745719605794997, "grad_norm": 2.5456721782684326, "learning_rate": 2.6287555803822306e-05, "loss": 0.0894, "step": 22840 }, { "epoch": 2.0754802670421, "grad_norm": 1.420876145362854, "learning_rate": 2.6241043869593595e-05, "loss": 0.1416, "step": 22850 }, { "epoch": 2.0763885735047003, "grad_norm": 1.3691956996917725, "learning_rate": 2.619455847607603e-05, "loss": 0.1045, "step": 22860 }, { "epoch": 2.077296879967301, "grad_norm": 0.16864070296287537, "learning_rate": 2.6148099675197642e-05, "loss": 0.0611, "step": 22870 }, { "epoch": 2.0782051864299014, "grad_norm": 0.7337554693222046, "learning_rate": 2.6101667518856677e-05, "loss": 0.0746, "step": 22880 }, { "epoch": 2.079113492892502, "grad_norm": 2.690438985824585, "learning_rate": 2.605526205892173e-05, "loss": 0.0812, "step": 22890 }, { "epoch": 2.0800217993551025, "grad_norm": 0.6979029774665833, "learning_rate": 2.6008883347231515e-05, "loss": 0.0902, "step": 22900 }, { "epoch": 2.080930105817703, "grad_norm": 0.08816546201705933, "learning_rate": 2.596253143559487e-05, "loss": 0.0915, "step": 22910 }, { "epoch": 2.0818384122803035, "grad_norm": 0.5823408365249634, "learning_rate": 2.59162063757907e-05, "loss": 0.1267, "step": 22920 }, { "epoch": 2.082746718742904, "grad_norm": 1.1795340776443481, "learning_rate": 2.5869908219567894e-05, "loss": 0.0779, "step": 22930 }, { "epoch": 2.083655025205504, "grad_norm": 1.7620940208435059, "learning_rate": 2.5823637018645342e-05, "loss": 0.065, "step": 22940 }, { "epoch": 2.0845633316681047, "grad_norm": 1.8524243831634521, "learning_rate": 2.577739282471176e-05, "loss": 0.1236, "step": 22950 }, { "epoch": 2.0854716381307052, "grad_norm": 0.7068631649017334, "learning_rate": 2.5731175689425734e-05, "loss": 0.0862, "step": 22960 }, { "epoch": 2.086379944593306, "grad_norm": 1.2347184419631958, "learning_rate": 2.5684985664415606e-05, "loss": 0.1491, "step": 22970 }, { "epoch": 2.0872882510559063, "grad_norm": 0.9801735281944275, "learning_rate": 2.563882280127943e-05, "loss": 0.1482, "step": 22980 }, { "epoch": 2.088196557518507, "grad_norm": 1.5433346033096313, "learning_rate": 2.5592687151584943e-05, "loss": 0.0667, "step": 22990 }, { "epoch": 2.0891048639811074, "grad_norm": 2.6898999214172363, "learning_rate": 2.5546578766869468e-05, "loss": 0.145, "step": 23000 }, { "epoch": 2.0891048639811074, "eval_loss": 0.20789769291877747, "eval_runtime": 1105.0997, "eval_samples_per_second": 8.856, "eval_steps_per_second": 8.856, "step": 23000 }, { "epoch": 2.090013170443708, "grad_norm": 1.3069541454315186, "learning_rate": 2.5500497698639876e-05, "loss": 0.0521, "step": 23010 }, { "epoch": 2.090921476906308, "grad_norm": 0.3719736635684967, "learning_rate": 2.545444399837248e-05, "loss": 0.0822, "step": 23020 }, { "epoch": 2.0918297833689086, "grad_norm": 0.1026528924703598, "learning_rate": 2.5408417717513067e-05, "loss": 0.0889, "step": 23030 }, { "epoch": 2.092738089831509, "grad_norm": 2.1331305503845215, "learning_rate": 2.5362418907476804e-05, "loss": 0.0986, "step": 23040 }, { "epoch": 2.0936463962941096, "grad_norm": 3.5616302490234375, "learning_rate": 2.5316447619648148e-05, "loss": 0.0847, "step": 23050 }, { "epoch": 2.09455470275671, "grad_norm": 1.5330402851104736, "learning_rate": 2.5270503905380805e-05, "loss": 0.1215, "step": 23060 }, { "epoch": 2.0954630092193107, "grad_norm": 1.2096655368804932, "learning_rate": 2.5224587815997697e-05, "loss": 0.123, "step": 23070 }, { "epoch": 2.0963713156819113, "grad_norm": 0.35994240641593933, "learning_rate": 2.5178699402790852e-05, "loss": 0.1061, "step": 23080 }, { "epoch": 2.0972796221445114, "grad_norm": 1.026349425315857, "learning_rate": 2.5132838717021445e-05, "loss": 0.1394, "step": 23090 }, { "epoch": 2.098187928607112, "grad_norm": 1.6643867492675781, "learning_rate": 2.508700580991963e-05, "loss": 0.1, "step": 23100 }, { "epoch": 2.0990962350697124, "grad_norm": 1.1848859786987305, "learning_rate": 2.5041200732684523e-05, "loss": 0.1368, "step": 23110 }, { "epoch": 2.100004541532313, "grad_norm": 2.29665470123291, "learning_rate": 2.4995423536484185e-05, "loss": 0.1547, "step": 23120 }, { "epoch": 2.1009128479949135, "grad_norm": 0.7654948234558105, "learning_rate": 2.4949674272455488e-05, "loss": 0.1237, "step": 23130 }, { "epoch": 2.101821154457514, "grad_norm": 0.130772665143013, "learning_rate": 2.4903952991704165e-05, "loss": 0.0902, "step": 23140 }, { "epoch": 2.1027294609201146, "grad_norm": 2.716590404510498, "learning_rate": 2.485825974530464e-05, "loss": 0.1314, "step": 23150 }, { "epoch": 2.103637767382715, "grad_norm": 1.6494760513305664, "learning_rate": 2.481259458430005e-05, "loss": 0.1188, "step": 23160 }, { "epoch": 2.1045460738453152, "grad_norm": 1.691077470779419, "learning_rate": 2.4766957559702103e-05, "loss": 0.1116, "step": 23170 }, { "epoch": 2.1054543803079158, "grad_norm": 0.5437243580818176, "learning_rate": 2.4721348722491116e-05, "loss": 0.064, "step": 23180 }, { "epoch": 2.1063626867705163, "grad_norm": 1.4646347761154175, "learning_rate": 2.467576812361595e-05, "loss": 0.1079, "step": 23190 }, { "epoch": 2.107270993233117, "grad_norm": 1.444653868675232, "learning_rate": 2.463021581399388e-05, "loss": 0.1055, "step": 23200 }, { "epoch": 2.1081792996957174, "grad_norm": 0.10691870748996735, "learning_rate": 2.4584691844510572e-05, "loss": 0.125, "step": 23210 }, { "epoch": 2.109087606158318, "grad_norm": 2.0216217041015625, "learning_rate": 2.4539196266020065e-05, "loss": 0.1147, "step": 23220 }, { "epoch": 2.1099959126209185, "grad_norm": 2.3236241340637207, "learning_rate": 2.4493729129344643e-05, "loss": 0.1669, "step": 23230 }, { "epoch": 2.110904219083519, "grad_norm": 0.0366465225815773, "learning_rate": 2.4448290485274883e-05, "loss": 0.1017, "step": 23240 }, { "epoch": 2.111812525546119, "grad_norm": 1.2417548894882202, "learning_rate": 2.440288038456947e-05, "loss": 0.1302, "step": 23250 }, { "epoch": 2.1127208320087196, "grad_norm": 1.8536176681518555, "learning_rate": 2.4357498877955233e-05, "loss": 0.1579, "step": 23260 }, { "epoch": 2.11362913847132, "grad_norm": 2.90272855758667, "learning_rate": 2.4312146016127046e-05, "loss": 0.1144, "step": 23270 }, { "epoch": 2.1145374449339207, "grad_norm": 2.2505199909210205, "learning_rate": 2.426682184974778e-05, "loss": 0.0998, "step": 23280 }, { "epoch": 2.1154457513965212, "grad_norm": 0.6312969326972961, "learning_rate": 2.4221526429448292e-05, "loss": 0.102, "step": 23290 }, { "epoch": 2.116354057859122, "grad_norm": 0.552282452583313, "learning_rate": 2.417625980582731e-05, "loss": 0.1199, "step": 23300 }, { "epoch": 2.1172623643217223, "grad_norm": 1.9968969821929932, "learning_rate": 2.4131022029451332e-05, "loss": 0.1174, "step": 23310 }, { "epoch": 2.1181706707843224, "grad_norm": 0.34090301394462585, "learning_rate": 2.408581315085471e-05, "loss": 0.1132, "step": 23320 }, { "epoch": 2.119078977246923, "grad_norm": 0.3912825882434845, "learning_rate": 2.4040633220539456e-05, "loss": 0.0761, "step": 23330 }, { "epoch": 2.1199872837095235, "grad_norm": 0.46178194880485535, "learning_rate": 2.3995482288975314e-05, "loss": 0.1063, "step": 23340 }, { "epoch": 2.120895590172124, "grad_norm": 0.5381664037704468, "learning_rate": 2.395036040659958e-05, "loss": 0.1206, "step": 23350 }, { "epoch": 2.1218038966347246, "grad_norm": 0.2038208544254303, "learning_rate": 2.3905267623817112e-05, "loss": 0.1138, "step": 23360 }, { "epoch": 2.122712203097325, "grad_norm": 0.5528069734573364, "learning_rate": 2.386020399100026e-05, "loss": 0.1472, "step": 23370 }, { "epoch": 2.1236205095599257, "grad_norm": 0.23538106679916382, "learning_rate": 2.38151695584888e-05, "loss": 0.0973, "step": 23380 }, { "epoch": 2.124528816022526, "grad_norm": 1.398434042930603, "learning_rate": 2.3770164376589933e-05, "loss": 0.0613, "step": 23390 }, { "epoch": 2.1254371224851263, "grad_norm": 1.3447924852371216, "learning_rate": 2.372518849557814e-05, "loss": 0.103, "step": 23400 }, { "epoch": 2.126345428947727, "grad_norm": 3.080317258834839, "learning_rate": 2.3680241965695188e-05, "loss": 0.1158, "step": 23410 }, { "epoch": 2.1272537354103274, "grad_norm": 1.6821755170822144, "learning_rate": 2.3635324837150024e-05, "loss": 0.1134, "step": 23420 }, { "epoch": 2.128162041872928, "grad_norm": 2.4771881103515625, "learning_rate": 2.3590437160118834e-05, "loss": 0.1364, "step": 23430 }, { "epoch": 2.1290703483355284, "grad_norm": 0.07653086632490158, "learning_rate": 2.354557898474484e-05, "loss": 0.1409, "step": 23440 }, { "epoch": 2.129978654798129, "grad_norm": 0.5455546975135803, "learning_rate": 2.3500750361138284e-05, "loss": 0.1129, "step": 23450 }, { "epoch": 2.1308869612607295, "grad_norm": 0.9764596223831177, "learning_rate": 2.3455951339376452e-05, "loss": 0.1644, "step": 23460 }, { "epoch": 2.13179526772333, "grad_norm": 1.0811526775360107, "learning_rate": 2.3411181969503515e-05, "loss": 0.0739, "step": 23470 }, { "epoch": 2.13270357418593, "grad_norm": 1.605974555015564, "learning_rate": 2.3366442301530595e-05, "loss": 0.1093, "step": 23480 }, { "epoch": 2.1336118806485307, "grad_norm": 1.9131004810333252, "learning_rate": 2.3321732385435565e-05, "loss": 0.1302, "step": 23490 }, { "epoch": 2.1345201871111312, "grad_norm": 1.8143808841705322, "learning_rate": 2.327705227116308e-05, "loss": 0.1097, "step": 23500 }, { "epoch": 2.1345201871111312, "eval_loss": 0.20178838074207306, "eval_runtime": 1106.2057, "eval_samples_per_second": 8.847, "eval_steps_per_second": 8.847, "step": 23500 }, { "epoch": 2.1354284935737318, "grad_norm": 2.8578808307647705, "learning_rate": 2.323240200862451e-05, "loss": 0.1546, "step": 23510 }, { "epoch": 2.1363368000363323, "grad_norm": 1.8353939056396484, "learning_rate": 2.3187781647697858e-05, "loss": 0.1764, "step": 23520 }, { "epoch": 2.137245106498933, "grad_norm": 1.7028470039367676, "learning_rate": 2.3143191238227784e-05, "loss": 0.1525, "step": 23530 }, { "epoch": 2.1381534129615334, "grad_norm": 1.9111360311508179, "learning_rate": 2.3098630830025425e-05, "loss": 0.0729, "step": 23540 }, { "epoch": 2.1390617194241335, "grad_norm": 1.0834972858428955, "learning_rate": 2.3054100472868428e-05, "loss": 0.0717, "step": 23550 }, { "epoch": 2.139970025886734, "grad_norm": 1.6406643390655518, "learning_rate": 2.3009600216500877e-05, "loss": 0.1535, "step": 23560 }, { "epoch": 2.1408783323493346, "grad_norm": 2.784174919128418, "learning_rate": 2.2965130110633193e-05, "loss": 0.0943, "step": 23570 }, { "epoch": 2.141786638811935, "grad_norm": 0.0015620696358382702, "learning_rate": 2.2920690204942212e-05, "loss": 0.1268, "step": 23580 }, { "epoch": 2.1426949452745356, "grad_norm": 2.206881523132324, "learning_rate": 2.2876280549070912e-05, "loss": 0.1283, "step": 23590 }, { "epoch": 2.143603251737136, "grad_norm": 1.6371487379074097, "learning_rate": 2.2831901192628553e-05, "loss": 0.0839, "step": 23600 }, { "epoch": 2.1445115581997367, "grad_norm": 0.5806708335876465, "learning_rate": 2.2787552185190537e-05, "loss": 0.1056, "step": 23610 }, { "epoch": 2.1454198646623373, "grad_norm": 1.884461522102356, "learning_rate": 2.2743233576298327e-05, "loss": 0.1358, "step": 23620 }, { "epoch": 2.146328171124938, "grad_norm": 1.7416247129440308, "learning_rate": 2.2698945415459517e-05, "loss": 0.0756, "step": 23630 }, { "epoch": 2.147236477587538, "grad_norm": 2.3461897373199463, "learning_rate": 2.26546877521476e-05, "loss": 0.0858, "step": 23640 }, { "epoch": 2.1481447840501384, "grad_norm": 1.3122369050979614, "learning_rate": 2.2610460635802033e-05, "loss": 0.0906, "step": 23650 }, { "epoch": 2.149053090512739, "grad_norm": 2.555051565170288, "learning_rate": 2.256626411582815e-05, "loss": 0.0784, "step": 23660 }, { "epoch": 2.1499613969753395, "grad_norm": 1.419880747795105, "learning_rate": 2.2522098241597083e-05, "loss": 0.077, "step": 23670 }, { "epoch": 2.15086970343794, "grad_norm": 1.2004035711288452, "learning_rate": 2.2477963062445796e-05, "loss": 0.1398, "step": 23680 }, { "epoch": 2.1517780099005406, "grad_norm": 1.671119213104248, "learning_rate": 2.243385862767689e-05, "loss": 0.1317, "step": 23690 }, { "epoch": 2.152686316363141, "grad_norm": 2.3822245597839355, "learning_rate": 2.238978498655866e-05, "loss": 0.1285, "step": 23700 }, { "epoch": 2.153594622825741, "grad_norm": 0.9587027430534363, "learning_rate": 2.2345742188324992e-05, "loss": 0.1081, "step": 23710 }, { "epoch": 2.1545029292883418, "grad_norm": 1.0157815217971802, "learning_rate": 2.2301730282175297e-05, "loss": 0.0675, "step": 23720 }, { "epoch": 2.1554112357509423, "grad_norm": 1.1300714015960693, "learning_rate": 2.2257749317274564e-05, "loss": 0.0839, "step": 23730 }, { "epoch": 2.156319542213543, "grad_norm": 2.6904966831207275, "learning_rate": 2.221379934275309e-05, "loss": 0.0888, "step": 23740 }, { "epoch": 2.1572278486761434, "grad_norm": 0.8278287649154663, "learning_rate": 2.2169880407706632e-05, "loss": 0.0744, "step": 23750 }, { "epoch": 2.158136155138744, "grad_norm": 0.003235558047890663, "learning_rate": 2.2125992561196274e-05, "loss": 0.0621, "step": 23760 }, { "epoch": 2.1590444616013444, "grad_norm": 3.848726987838745, "learning_rate": 2.2082135852248325e-05, "loss": 0.1431, "step": 23770 }, { "epoch": 2.1599527680639445, "grad_norm": 0.25135305523872375, "learning_rate": 2.2038310329854385e-05, "loss": 0.0644, "step": 23780 }, { "epoch": 2.160861074526545, "grad_norm": 1.4133942127227783, "learning_rate": 2.199451604297114e-05, "loss": 0.1285, "step": 23790 }, { "epoch": 2.1617693809891456, "grad_norm": 1.992393970489502, "learning_rate": 2.1950753040520435e-05, "loss": 0.1278, "step": 23800 }, { "epoch": 2.162677687451746, "grad_norm": 0.732172966003418, "learning_rate": 2.190702137138914e-05, "loss": 0.1784, "step": 23810 }, { "epoch": 2.1635859939143467, "grad_norm": 0.24799641966819763, "learning_rate": 2.1863321084429112e-05, "loss": 0.0825, "step": 23820 }, { "epoch": 2.1644943003769472, "grad_norm": 1.8309744596481323, "learning_rate": 2.181965222845721e-05, "loss": 0.119, "step": 23830 }, { "epoch": 2.1654026068395478, "grad_norm": 1.0958960056304932, "learning_rate": 2.1776014852255123e-05, "loss": 0.1342, "step": 23840 }, { "epoch": 2.1663109133021483, "grad_norm": 0.4399101734161377, "learning_rate": 2.1732409004569397e-05, "loss": 0.1355, "step": 23850 }, { "epoch": 2.167219219764749, "grad_norm": 1.9591704607009888, "learning_rate": 2.1688834734111364e-05, "loss": 0.108, "step": 23860 }, { "epoch": 2.168127526227349, "grad_norm": 2.3752963542938232, "learning_rate": 2.1645292089557057e-05, "loss": 0.1052, "step": 23870 }, { "epoch": 2.1690358326899495, "grad_norm": 0.45268014073371887, "learning_rate": 2.1601781119547203e-05, "loss": 0.0671, "step": 23880 }, { "epoch": 2.16994413915255, "grad_norm": 0.7767653465270996, "learning_rate": 2.155830187268715e-05, "loss": 0.1135, "step": 23890 }, { "epoch": 2.1708524456151506, "grad_norm": 2.063584327697754, "learning_rate": 2.151485439754678e-05, "loss": 0.1075, "step": 23900 }, { "epoch": 2.171760752077751, "grad_norm": 1.1512370109558105, "learning_rate": 2.1471438742660517e-05, "loss": 0.1331, "step": 23910 }, { "epoch": 2.1726690585403516, "grad_norm": 2.316772937774658, "learning_rate": 2.14280549565272e-05, "loss": 0.1432, "step": 23920 }, { "epoch": 2.173577365002952, "grad_norm": 0.3498794138431549, "learning_rate": 2.1384703087610126e-05, "loss": 0.0985, "step": 23930 }, { "epoch": 2.1744856714655523, "grad_norm": 1.1387475728988647, "learning_rate": 2.1341383184336892e-05, "loss": 0.1182, "step": 23940 }, { "epoch": 2.175393977928153, "grad_norm": 0.4095536172389984, "learning_rate": 2.129809529509939e-05, "loss": 0.1485, "step": 23950 }, { "epoch": 2.1763022843907533, "grad_norm": 1.1297954320907593, "learning_rate": 2.1254839468253767e-05, "loss": 0.141, "step": 23960 }, { "epoch": 2.177210590853354, "grad_norm": 0.7159091830253601, "learning_rate": 2.1211615752120323e-05, "loss": 0.0694, "step": 23970 }, { "epoch": 2.1781188973159544, "grad_norm": 1.340155005455017, "learning_rate": 2.1168424194983543e-05, "loss": 0.1361, "step": 23980 }, { "epoch": 2.179027203778555, "grad_norm": 2.868708848953247, "learning_rate": 2.1125264845091935e-05, "loss": 0.0983, "step": 23990 }, { "epoch": 2.1799355102411555, "grad_norm": 3.79897403717041, "learning_rate": 2.1082137750658054e-05, "loss": 0.1007, "step": 24000 }, { "epoch": 2.1799355102411555, "eval_loss": 0.20349858701229095, "eval_runtime": 1112.3354, "eval_samples_per_second": 8.799, "eval_steps_per_second": 8.799, "step": 24000 }, { "epoch": 2.180843816703756, "grad_norm": 1.2111263275146484, "learning_rate": 2.1039042959858418e-05, "loss": 0.1179, "step": 24010 }, { "epoch": 2.181752123166356, "grad_norm": 2.7683353424072266, "learning_rate": 2.0995980520833446e-05, "loss": 0.1106, "step": 24020 }, { "epoch": 2.1826604296289567, "grad_norm": 0.4943409562110901, "learning_rate": 2.0952950481687445e-05, "loss": 0.0629, "step": 24030 }, { "epoch": 2.183568736091557, "grad_norm": 1.6524266004562378, "learning_rate": 2.0909952890488506e-05, "loss": 0.0797, "step": 24040 }, { "epoch": 2.1844770425541578, "grad_norm": 1.1796435117721558, "learning_rate": 2.0866987795268477e-05, "loss": 0.1202, "step": 24050 }, { "epoch": 2.1853853490167583, "grad_norm": 0.7489283680915833, "learning_rate": 2.0824055244022917e-05, "loss": 0.1274, "step": 24060 }, { "epoch": 2.186293655479359, "grad_norm": 0.7217639088630676, "learning_rate": 2.0781155284710996e-05, "loss": 0.0587, "step": 24070 }, { "epoch": 2.1872019619419594, "grad_norm": 0.18333062529563904, "learning_rate": 2.0738287965255547e-05, "loss": 0.0982, "step": 24080 }, { "epoch": 2.18811026840456, "grad_norm": 0.3905477821826935, "learning_rate": 2.069545333354287e-05, "loss": 0.1251, "step": 24090 }, { "epoch": 2.18901857486716, "grad_norm": 0.7578851580619812, "learning_rate": 2.065265143742278e-05, "loss": 0.1099, "step": 24100 }, { "epoch": 2.1899268813297605, "grad_norm": 2.1733808517456055, "learning_rate": 2.0609882324708523e-05, "loss": 0.0536, "step": 24110 }, { "epoch": 2.190835187792361, "grad_norm": 0.3594528138637543, "learning_rate": 2.0567146043176706e-05, "loss": 0.0871, "step": 24120 }, { "epoch": 2.1917434942549616, "grad_norm": 0.2203902304172516, "learning_rate": 2.0524442640567305e-05, "loss": 0.0758, "step": 24130 }, { "epoch": 2.192651800717562, "grad_norm": 0.8138812780380249, "learning_rate": 2.0481772164583524e-05, "loss": 0.0762, "step": 24140 }, { "epoch": 2.1935601071801627, "grad_norm": 0.5235545635223389, "learning_rate": 2.04391346628918e-05, "loss": 0.0857, "step": 24150 }, { "epoch": 2.1944684136427632, "grad_norm": 0.9495700001716614, "learning_rate": 2.0396530183121732e-05, "loss": 0.1245, "step": 24160 }, { "epoch": 2.1953767201053633, "grad_norm": 4.030699253082275, "learning_rate": 2.0353958772866035e-05, "loss": 0.0867, "step": 24170 }, { "epoch": 2.196285026567964, "grad_norm": 4.500916481018066, "learning_rate": 2.0311420479680482e-05, "loss": 0.0572, "step": 24180 }, { "epoch": 2.1971933330305644, "grad_norm": 2.904212713241577, "learning_rate": 2.026891535108385e-05, "loss": 0.1369, "step": 24190 }, { "epoch": 2.198101639493165, "grad_norm": 3.009889602661133, "learning_rate": 2.0226443434557868e-05, "loss": 0.1037, "step": 24200 }, { "epoch": 2.1990099459557655, "grad_norm": 3.9868123531341553, "learning_rate": 2.0184004777547165e-05, "loss": 0.0474, "step": 24210 }, { "epoch": 2.199918252418366, "grad_norm": 2.285261631011963, "learning_rate": 2.01415994274592e-05, "loss": 0.1392, "step": 24220 }, { "epoch": 2.2008265588809666, "grad_norm": 3.543816566467285, "learning_rate": 2.0099227431664286e-05, "loss": 0.1718, "step": 24230 }, { "epoch": 2.201734865343567, "grad_norm": 0.23021963238716125, "learning_rate": 2.0056888837495395e-05, "loss": 0.1014, "step": 24240 }, { "epoch": 2.202643171806167, "grad_norm": 0.8778946399688721, "learning_rate": 2.0014583692248247e-05, "loss": 0.112, "step": 24250 }, { "epoch": 2.2035514782687677, "grad_norm": 0.7180653214454651, "learning_rate": 1.9972312043181158e-05, "loss": 0.1, "step": 24260 }, { "epoch": 2.2044597847313683, "grad_norm": 2.5244007110595703, "learning_rate": 1.9930073937515026e-05, "loss": 0.0895, "step": 24270 }, { "epoch": 2.205368091193969, "grad_norm": 0.23595793545246124, "learning_rate": 1.9887869422433337e-05, "loss": 0.1172, "step": 24280 }, { "epoch": 2.2062763976565694, "grad_norm": 0.5414576530456543, "learning_rate": 1.984569854508199e-05, "loss": 0.0872, "step": 24290 }, { "epoch": 2.20718470411917, "grad_norm": 1.296176791191101, "learning_rate": 1.9803561352569327e-05, "loss": 0.095, "step": 24300 }, { "epoch": 2.2080930105817704, "grad_norm": 2.245436906814575, "learning_rate": 1.9761457891966066e-05, "loss": 0.0802, "step": 24310 }, { "epoch": 2.209001317044371, "grad_norm": 2.0009350776672363, "learning_rate": 1.971938821030524e-05, "loss": 0.1495, "step": 24320 }, { "epoch": 2.209909623506971, "grad_norm": 0.9442091584205627, "learning_rate": 1.9677352354582152e-05, "loss": 0.1082, "step": 24330 }, { "epoch": 2.2108179299695716, "grad_norm": 2.446464776992798, "learning_rate": 1.963535037175431e-05, "loss": 0.1377, "step": 24340 }, { "epoch": 2.211726236432172, "grad_norm": 0.15239258110523224, "learning_rate": 1.959338230874138e-05, "loss": 0.0948, "step": 24350 }, { "epoch": 2.2126345428947727, "grad_norm": 1.0134872198104858, "learning_rate": 1.9551448212425165e-05, "loss": 0.1154, "step": 24360 }, { "epoch": 2.213542849357373, "grad_norm": 1.9759843349456787, "learning_rate": 1.950954812964948e-05, "loss": 0.1695, "step": 24370 }, { "epoch": 2.2144511558199738, "grad_norm": 1.8578143119812012, "learning_rate": 1.9467682107220203e-05, "loss": 0.1134, "step": 24380 }, { "epoch": 2.2153594622825743, "grad_norm": 1.7232544422149658, "learning_rate": 1.942585019190511e-05, "loss": 0.1898, "step": 24390 }, { "epoch": 2.2162677687451744, "grad_norm": 0.6951001882553101, "learning_rate": 1.9384052430433895e-05, "loss": 0.0957, "step": 24400 }, { "epoch": 2.217176075207775, "grad_norm": 0.5040855407714844, "learning_rate": 1.9342288869498104e-05, "loss": 0.1983, "step": 24410 }, { "epoch": 2.2180843816703755, "grad_norm": 1.637534260749817, "learning_rate": 1.9300559555751064e-05, "loss": 0.1357, "step": 24420 }, { "epoch": 2.218992688132976, "grad_norm": 0.506963312625885, "learning_rate": 1.9258864535807873e-05, "loss": 0.0862, "step": 24430 }, { "epoch": 2.2199009945955765, "grad_norm": 0.7147289514541626, "learning_rate": 1.9217203856245302e-05, "loss": 0.1208, "step": 24440 }, { "epoch": 2.220809301058177, "grad_norm": 1.2968387603759766, "learning_rate": 1.917557756360176e-05, "loss": 0.0916, "step": 24450 }, { "epoch": 2.2217176075207776, "grad_norm": 4.33096170425415, "learning_rate": 1.913398570437724e-05, "loss": 0.0706, "step": 24460 }, { "epoch": 2.222625913983378, "grad_norm": 1.3926900625228882, "learning_rate": 1.9092428325033275e-05, "loss": 0.107, "step": 24470 }, { "epoch": 2.2235342204459787, "grad_norm": 2.2140443325042725, "learning_rate": 1.9050905471992875e-05, "loss": 0.1377, "step": 24480 }, { "epoch": 2.224442526908579, "grad_norm": 1.1347744464874268, "learning_rate": 1.9009417191640504e-05, "loss": 0.0974, "step": 24490 }, { "epoch": 2.2253508333711793, "grad_norm": 0.7044615745544434, "learning_rate": 1.8967963530321976e-05, "loss": 0.0622, "step": 24500 }, { "epoch": 2.2253508333711793, "eval_loss": 0.20742006599903107, "eval_runtime": 1106.5048, "eval_samples_per_second": 8.845, "eval_steps_per_second": 8.845, "step": 24500 }, { "epoch": 2.22625913983378, "grad_norm": 1.8563460111618042, "learning_rate": 1.8926544534344443e-05, "loss": 0.096, "step": 24510 }, { "epoch": 2.2271674462963804, "grad_norm": 1.2367788553237915, "learning_rate": 1.888516024997633e-05, "loss": 0.1027, "step": 24520 }, { "epoch": 2.228075752758981, "grad_norm": 1.2368099689483643, "learning_rate": 1.8843810723447318e-05, "loss": 0.0992, "step": 24530 }, { "epoch": 2.2289840592215815, "grad_norm": 2.527334690093994, "learning_rate": 1.8802496000948217e-05, "loss": 0.0815, "step": 24540 }, { "epoch": 2.229892365684182, "grad_norm": 2.596294403076172, "learning_rate": 1.8761216128630982e-05, "loss": 0.1025, "step": 24550 }, { "epoch": 2.230800672146782, "grad_norm": 0.7294000387191772, "learning_rate": 1.8719971152608606e-05, "loss": 0.1287, "step": 24560 }, { "epoch": 2.2317089786093827, "grad_norm": 0.627945065498352, "learning_rate": 1.8678761118955164e-05, "loss": 0.0806, "step": 24570 }, { "epoch": 2.232617285071983, "grad_norm": 1.841489315032959, "learning_rate": 1.8637586073705633e-05, "loss": 0.1589, "step": 24580 }, { "epoch": 2.2335255915345837, "grad_norm": 0.7897591590881348, "learning_rate": 1.859644606285594e-05, "loss": 0.0899, "step": 24590 }, { "epoch": 2.2344338979971843, "grad_norm": 0.8744959831237793, "learning_rate": 1.8555341132362864e-05, "loss": 0.0994, "step": 24600 }, { "epoch": 2.235342204459785, "grad_norm": 0.2457582801580429, "learning_rate": 1.8514271328143997e-05, "loss": 0.0852, "step": 24610 }, { "epoch": 2.2362505109223854, "grad_norm": 2.451033592224121, "learning_rate": 1.847323669607769e-05, "loss": 0.093, "step": 24620 }, { "epoch": 2.237158817384986, "grad_norm": 0.3469717800617218, "learning_rate": 1.8432237282003012e-05, "loss": 0.1069, "step": 24630 }, { "epoch": 2.238067123847586, "grad_norm": 0.9318056106567383, "learning_rate": 1.8391273131719682e-05, "loss": 0.1369, "step": 24640 }, { "epoch": 2.2389754303101865, "grad_norm": 1.1717740297317505, "learning_rate": 1.8350344290988035e-05, "loss": 0.0927, "step": 24650 }, { "epoch": 2.239883736772787, "grad_norm": 2.155811309814453, "learning_rate": 1.8309450805528937e-05, "loss": 0.1861, "step": 24660 }, { "epoch": 2.2407920432353876, "grad_norm": 1.5861257314682007, "learning_rate": 1.8268592721023818e-05, "loss": 0.1169, "step": 24670 }, { "epoch": 2.241700349697988, "grad_norm": 1.7857093811035156, "learning_rate": 1.8227770083114505e-05, "loss": 0.0982, "step": 24680 }, { "epoch": 2.2426086561605887, "grad_norm": 1.8803619146347046, "learning_rate": 1.818698293740324e-05, "loss": 0.1065, "step": 24690 }, { "epoch": 2.2435169626231892, "grad_norm": 2.937295436859131, "learning_rate": 1.8146231329452635e-05, "loss": 0.1399, "step": 24700 }, { "epoch": 2.2444252690857898, "grad_norm": 0.9232556223869324, "learning_rate": 1.810551530478557e-05, "loss": 0.1433, "step": 24710 }, { "epoch": 2.24533357554839, "grad_norm": 1.7175683975219727, "learning_rate": 1.8064834908885238e-05, "loss": 0.1128, "step": 24720 }, { "epoch": 2.2462418820109904, "grad_norm": 1.7667536735534668, "learning_rate": 1.802419018719497e-05, "loss": 0.0744, "step": 24730 }, { "epoch": 2.247150188473591, "grad_norm": 2.1238975524902344, "learning_rate": 1.798358118511827e-05, "loss": 0.1104, "step": 24740 }, { "epoch": 2.2480584949361915, "grad_norm": 0.8726353049278259, "learning_rate": 1.7943007948018743e-05, "loss": 0.0574, "step": 24750 }, { "epoch": 2.248966801398792, "grad_norm": 0.0040878173895180225, "learning_rate": 1.7902470521220028e-05, "loss": 0.0361, "step": 24760 }, { "epoch": 2.2498751078613926, "grad_norm": 2.420943021774292, "learning_rate": 1.786196895000577e-05, "loss": 0.1114, "step": 24770 }, { "epoch": 2.250783414323993, "grad_norm": 1.3466414213180542, "learning_rate": 1.7821503279619573e-05, "loss": 0.1129, "step": 24780 }, { "epoch": 2.251691720786593, "grad_norm": 5.117208957672119, "learning_rate": 1.778107355526491e-05, "loss": 0.1141, "step": 24790 }, { "epoch": 2.2526000272491937, "grad_norm": 1.1222286224365234, "learning_rate": 1.7740679822105127e-05, "loss": 0.1212, "step": 24800 }, { "epoch": 2.2535083337117943, "grad_norm": 1.194596767425537, "learning_rate": 1.770032212526332e-05, "loss": 0.1402, "step": 24810 }, { "epoch": 2.254416640174395, "grad_norm": 1.2080458402633667, "learning_rate": 1.7660000509822406e-05, "loss": 0.082, "step": 24820 }, { "epoch": 2.2553249466369953, "grad_norm": 1.0022152662277222, "learning_rate": 1.761971502082493e-05, "loss": 0.0824, "step": 24830 }, { "epoch": 2.256233253099596, "grad_norm": 2.077509880065918, "learning_rate": 1.7579465703273097e-05, "loss": 0.1249, "step": 24840 }, { "epoch": 2.2571415595621964, "grad_norm": 1.2880889177322388, "learning_rate": 1.753925260212872e-05, "loss": 0.0748, "step": 24850 }, { "epoch": 2.258049866024797, "grad_norm": 1.086443305015564, "learning_rate": 1.7499075762313123e-05, "loss": 0.1213, "step": 24860 }, { "epoch": 2.258958172487397, "grad_norm": 1.3250011205673218, "learning_rate": 1.7458935228707175e-05, "loss": 0.1664, "step": 24870 }, { "epoch": 2.2598664789499976, "grad_norm": 0.5050017833709717, "learning_rate": 1.741883104615115e-05, "loss": 0.1034, "step": 24880 }, { "epoch": 2.260774785412598, "grad_norm": 3.2333178520202637, "learning_rate": 1.737876325944472e-05, "loss": 0.1183, "step": 24890 }, { "epoch": 2.2616830918751987, "grad_norm": 1.943233609199524, "learning_rate": 1.7338731913346905e-05, "loss": 0.1356, "step": 24900 }, { "epoch": 2.262591398337799, "grad_norm": 1.0904546976089478, "learning_rate": 1.7298737052576015e-05, "loss": 0.1344, "step": 24910 }, { "epoch": 2.2634997048003997, "grad_norm": 1.4393284320831299, "learning_rate": 1.72587787218096e-05, "loss": 0.1399, "step": 24920 }, { "epoch": 2.2644080112630003, "grad_norm": 0.5537924766540527, "learning_rate": 1.721885696568441e-05, "loss": 0.098, "step": 24930 }, { "epoch": 2.265316317725601, "grad_norm": 12.018943786621094, "learning_rate": 1.7178971828796335e-05, "loss": 0.1075, "step": 24940 }, { "epoch": 2.266224624188201, "grad_norm": 0.6377115845680237, "learning_rate": 1.7139123355700366e-05, "loss": 0.1193, "step": 24950 }, { "epoch": 2.2671329306508015, "grad_norm": 0.0007066968828439713, "learning_rate": 1.7099311590910503e-05, "loss": 0.1805, "step": 24960 }, { "epoch": 2.268041237113402, "grad_norm": 2.24735426902771, "learning_rate": 1.7059536578899802e-05, "loss": 0.1394, "step": 24970 }, { "epoch": 2.2689495435760025, "grad_norm": 1.3762437105178833, "learning_rate": 1.7019798364100213e-05, "loss": 0.1215, "step": 24980 }, { "epoch": 2.269857850038603, "grad_norm": 1.3876729011535645, "learning_rate": 1.69800969909026e-05, "loss": 0.0594, "step": 24990 }, { "epoch": 2.2707661565012036, "grad_norm": 1.7208665609359741, "learning_rate": 1.6940432503656655e-05, "loss": 0.095, "step": 25000 }, { "epoch": 2.2707661565012036, "eval_loss": 0.20000511407852173, "eval_runtime": 1119.7563, "eval_samples_per_second": 8.74, "eval_steps_per_second": 8.74, "step": 25000 }, { "epoch": 2.271674462963804, "grad_norm": 0.9411871433258057, "learning_rate": 1.6900804946670873e-05, "loss": 0.1092, "step": 25010 }, { "epoch": 2.2725827694264042, "grad_norm": 1.0357754230499268, "learning_rate": 1.686121436421253e-05, "loss": 0.0885, "step": 25020 }, { "epoch": 2.273491075889005, "grad_norm": 1.6898747682571411, "learning_rate": 1.682166080050755e-05, "loss": 0.1078, "step": 25030 }, { "epoch": 2.2743993823516053, "grad_norm": 2.0479037761688232, "learning_rate": 1.6782144299740515e-05, "loss": 0.1055, "step": 25040 }, { "epoch": 2.275307688814206, "grad_norm": 0.9554092884063721, "learning_rate": 1.6742664906054623e-05, "loss": 0.1241, "step": 25050 }, { "epoch": 2.2762159952768064, "grad_norm": 1.091264247894287, "learning_rate": 1.6703222663551604e-05, "loss": 0.1225, "step": 25060 }, { "epoch": 2.277124301739407, "grad_norm": 1.7122777700424194, "learning_rate": 1.6663817616291687e-05, "loss": 0.1045, "step": 25070 }, { "epoch": 2.2780326082020075, "grad_norm": 1.7341994047164917, "learning_rate": 1.6624449808293564e-05, "loss": 0.073, "step": 25080 }, { "epoch": 2.278940914664608, "grad_norm": 0.6555615663528442, "learning_rate": 1.6585119283534318e-05, "loss": 0.0834, "step": 25090 }, { "epoch": 2.2798492211272086, "grad_norm": 1.4255918264389038, "learning_rate": 1.654582608594938e-05, "loss": 0.1019, "step": 25100 }, { "epoch": 2.2807575275898087, "grad_norm": 0.3354332745075226, "learning_rate": 1.6506570259432473e-05, "loss": 0.1294, "step": 25110 }, { "epoch": 2.281665834052409, "grad_norm": 0.9586864709854126, "learning_rate": 1.6467351847835626e-05, "loss": 0.1216, "step": 25120 }, { "epoch": 2.2825741405150097, "grad_norm": 0.2370087057352066, "learning_rate": 1.642817089496902e-05, "loss": 0.057, "step": 25130 }, { "epoch": 2.2834824469776103, "grad_norm": 1.7409640550613403, "learning_rate": 1.6389027444600996e-05, "loss": 0.1015, "step": 25140 }, { "epoch": 2.284390753440211, "grad_norm": 1.8232488632202148, "learning_rate": 1.6349921540458023e-05, "loss": 0.0847, "step": 25150 }, { "epoch": 2.2852990599028113, "grad_norm": 1.5039163827896118, "learning_rate": 1.6310853226224606e-05, "loss": 0.1416, "step": 25160 }, { "epoch": 2.286207366365412, "grad_norm": 0.8832641243934631, "learning_rate": 1.6271822545543285e-05, "loss": 0.078, "step": 25170 }, { "epoch": 2.287115672828012, "grad_norm": 1.5332778692245483, "learning_rate": 1.6232829542014544e-05, "loss": 0.1184, "step": 25180 }, { "epoch": 2.2880239792906125, "grad_norm": 0.9404165744781494, "learning_rate": 1.6193874259196774e-05, "loss": 0.1121, "step": 25190 }, { "epoch": 2.288932285753213, "grad_norm": 0.5026895999908447, "learning_rate": 1.6154956740606232e-05, "loss": 0.0652, "step": 25200 }, { "epoch": 2.2898405922158136, "grad_norm": 0.7422900199890137, "learning_rate": 1.6116077029717003e-05, "loss": 0.0593, "step": 25210 }, { "epoch": 2.290748898678414, "grad_norm": 0.36293280124664307, "learning_rate": 1.6077235169960913e-05, "loss": 0.0977, "step": 25220 }, { "epoch": 2.2916572051410147, "grad_norm": 0.1733863651752472, "learning_rate": 1.6038431204727528e-05, "loss": 0.1387, "step": 25230 }, { "epoch": 2.292565511603615, "grad_norm": 0.019097190350294113, "learning_rate": 1.5999665177364077e-05, "loss": 0.0895, "step": 25240 }, { "epoch": 2.2934738180662153, "grad_norm": 0.6307501792907715, "learning_rate": 1.596093713117539e-05, "loss": 0.1089, "step": 25250 }, { "epoch": 2.294382124528816, "grad_norm": 0.7754175066947937, "learning_rate": 1.5922247109423887e-05, "loss": 0.0962, "step": 25260 }, { "epoch": 2.2952904309914164, "grad_norm": 1.9732190370559692, "learning_rate": 1.5883595155329526e-05, "loss": 0.095, "step": 25270 }, { "epoch": 2.296198737454017, "grad_norm": 1.9207006692886353, "learning_rate": 1.5844981312069723e-05, "loss": 0.0829, "step": 25280 }, { "epoch": 2.2971070439166175, "grad_norm": 1.4236201047897339, "learning_rate": 1.5806405622779313e-05, "loss": 0.1025, "step": 25290 }, { "epoch": 2.298015350379218, "grad_norm": 1.0344629287719727, "learning_rate": 1.5767868130550516e-05, "loss": 0.0576, "step": 25300 }, { "epoch": 2.2989236568418185, "grad_norm": 1.244114875793457, "learning_rate": 1.5729368878432877e-05, "loss": 0.086, "step": 25310 }, { "epoch": 2.299831963304419, "grad_norm": 0.10455206781625748, "learning_rate": 1.5690907909433256e-05, "loss": 0.0623, "step": 25320 }, { "epoch": 2.3007402697670196, "grad_norm": 1.8939216136932373, "learning_rate": 1.5652485266515714e-05, "loss": 0.0932, "step": 25330 }, { "epoch": 2.3016485762296197, "grad_norm": 2.726426839828491, "learning_rate": 1.5614100992601506e-05, "loss": 0.1426, "step": 25340 }, { "epoch": 2.3025568826922203, "grad_norm": 0.5497615337371826, "learning_rate": 1.5575755130569025e-05, "loss": 0.0857, "step": 25350 }, { "epoch": 2.303465189154821, "grad_norm": 0.889060914516449, "learning_rate": 1.5537447723253763e-05, "loss": 0.0874, "step": 25360 }, { "epoch": 2.3043734956174213, "grad_norm": 2.511474609375, "learning_rate": 1.5499178813448245e-05, "loss": 0.1645, "step": 25370 }, { "epoch": 2.305281802080022, "grad_norm": 3.2290866374969482, "learning_rate": 1.5460948443901996e-05, "loss": 0.1336, "step": 25380 }, { "epoch": 2.3061901085426224, "grad_norm": 0.27941638231277466, "learning_rate": 1.542275665732149e-05, "loss": 0.0591, "step": 25390 }, { "epoch": 2.307098415005223, "grad_norm": 0.801520049571991, "learning_rate": 1.53846034963701e-05, "loss": 0.108, "step": 25400 }, { "epoch": 2.308006721467823, "grad_norm": 2.462456703186035, "learning_rate": 1.5346489003668028e-05, "loss": 0.1005, "step": 25410 }, { "epoch": 2.3089150279304236, "grad_norm": 2.6677913665771484, "learning_rate": 1.5308413221792338e-05, "loss": 0.1411, "step": 25420 }, { "epoch": 2.309823334393024, "grad_norm": 1.1568964719772339, "learning_rate": 1.5270376193276804e-05, "loss": 0.0833, "step": 25430 }, { "epoch": 2.3107316408556247, "grad_norm": 0.18297377228736877, "learning_rate": 1.523237796061191e-05, "loss": 0.0874, "step": 25440 }, { "epoch": 2.311639947318225, "grad_norm": 0.0032576038502156734, "learning_rate": 1.5194418566244817e-05, "loss": 0.0594, "step": 25450 }, { "epoch": 2.3125482537808257, "grad_norm": 1.8631714582443237, "learning_rate": 1.5156498052579281e-05, "loss": 0.1244, "step": 25460 }, { "epoch": 2.3134565602434263, "grad_norm": 0.9100682735443115, "learning_rate": 1.5118616461975671e-05, "loss": 0.09, "step": 25470 }, { "epoch": 2.3143648667060264, "grad_norm": 0.6241140961647034, "learning_rate": 1.5080773836750822e-05, "loss": 0.0768, "step": 25480 }, { "epoch": 2.315273173168627, "grad_norm": 1.6367753744125366, "learning_rate": 1.504297021917807e-05, "loss": 0.07, "step": 25490 }, { "epoch": 2.3161814796312274, "grad_norm": 1.6989903450012207, "learning_rate": 1.500520565148717e-05, "loss": 0.144, "step": 25500 }, { "epoch": 2.3161814796312274, "eval_loss": 0.20555782318115234, "eval_runtime": 1103.7457, "eval_samples_per_second": 8.867, "eval_steps_per_second": 8.867, "step": 25500 }, { "epoch": 2.317089786093828, "grad_norm": 0.5340821146965027, "learning_rate": 1.496748017586425e-05, "loss": 0.0775, "step": 25510 }, { "epoch": 2.3179980925564285, "grad_norm": 0.7041557431221008, "learning_rate": 1.4929793834451784e-05, "loss": 0.1128, "step": 25520 }, { "epoch": 2.318906399019029, "grad_norm": 1.8674943447113037, "learning_rate": 1.4892146669348505e-05, "loss": 0.0839, "step": 25530 }, { "epoch": 2.3198147054816296, "grad_norm": 1.2770776748657227, "learning_rate": 1.4854538722609412e-05, "loss": 0.1108, "step": 25540 }, { "epoch": 2.32072301194423, "grad_norm": 1.3001269102096558, "learning_rate": 1.481697003624567e-05, "loss": 0.0964, "step": 25550 }, { "epoch": 2.3216313184068307, "grad_norm": 2.9671037197113037, "learning_rate": 1.4779440652224586e-05, "loss": 0.1695, "step": 25560 }, { "epoch": 2.3225396248694308, "grad_norm": 2.9106175899505615, "learning_rate": 1.4741950612469596e-05, "loss": 0.1366, "step": 25570 }, { "epoch": 2.3234479313320313, "grad_norm": 2.0591511726379395, "learning_rate": 1.4704499958860163e-05, "loss": 0.0436, "step": 25580 }, { "epoch": 2.324356237794632, "grad_norm": 2.2357170581817627, "learning_rate": 1.4667088733231737e-05, "loss": 0.0769, "step": 25590 }, { "epoch": 2.3252645442572324, "grad_norm": 0.2886096239089966, "learning_rate": 1.4629716977375746e-05, "loss": 0.0768, "step": 25600 }, { "epoch": 2.326172850719833, "grad_norm": 0.37789028882980347, "learning_rate": 1.45923847330395e-05, "loss": 0.083, "step": 25610 }, { "epoch": 2.3270811571824335, "grad_norm": 0.4852542281150818, "learning_rate": 1.4555092041926227e-05, "loss": 0.125, "step": 25620 }, { "epoch": 2.327989463645034, "grad_norm": 0.005690164864063263, "learning_rate": 1.4517838945694917e-05, "loss": 0.0754, "step": 25630 }, { "epoch": 2.328897770107634, "grad_norm": 0.0017473762854933739, "learning_rate": 1.4480625485960347e-05, "loss": 0.0768, "step": 25640 }, { "epoch": 2.3298060765702346, "grad_norm": 2.782609462738037, "learning_rate": 1.4443451704293016e-05, "loss": 0.1374, "step": 25650 }, { "epoch": 2.330714383032835, "grad_norm": 2.965702772140503, "learning_rate": 1.4406317642219103e-05, "loss": 0.127, "step": 25660 }, { "epoch": 2.3316226894954357, "grad_norm": 1.508428692817688, "learning_rate": 1.4369223341220417e-05, "loss": 0.1108, "step": 25670 }, { "epoch": 2.3325309959580363, "grad_norm": 0.41260117292404175, "learning_rate": 1.4332168842734339e-05, "loss": 0.144, "step": 25680 }, { "epoch": 2.333439302420637, "grad_norm": 2.2052454948425293, "learning_rate": 1.4295154188153803e-05, "loss": 0.1163, "step": 25690 }, { "epoch": 2.3343476088832373, "grad_norm": 0.8424702286720276, "learning_rate": 1.4258179418827222e-05, "loss": 0.099, "step": 25700 }, { "epoch": 2.335255915345838, "grad_norm": 2.2527174949645996, "learning_rate": 1.4221244576058451e-05, "loss": 0.1303, "step": 25710 }, { "epoch": 2.3361642218084384, "grad_norm": 0.5920016169548035, "learning_rate": 1.418434970110678e-05, "loss": 0.1172, "step": 25720 }, { "epoch": 2.3370725282710385, "grad_norm": 1.7412511110305786, "learning_rate": 1.4147494835186803e-05, "loss": 0.0814, "step": 25730 }, { "epoch": 2.337980834733639, "grad_norm": 2.736176013946533, "learning_rate": 1.4110680019468448e-05, "loss": 0.0859, "step": 25740 }, { "epoch": 2.3388891411962396, "grad_norm": 3.0514347553253174, "learning_rate": 1.4073905295076873e-05, "loss": 0.1926, "step": 25750 }, { "epoch": 2.33979744765884, "grad_norm": 0.8338022232055664, "learning_rate": 1.4037170703092517e-05, "loss": 0.0993, "step": 25760 }, { "epoch": 2.3407057541214407, "grad_norm": 2.6268322467803955, "learning_rate": 1.4000476284550924e-05, "loss": 0.086, "step": 25770 }, { "epoch": 2.341614060584041, "grad_norm": 4.137705326080322, "learning_rate": 1.3963822080442773e-05, "loss": 0.1115, "step": 25780 }, { "epoch": 2.3425223670466417, "grad_norm": 3.2681162357330322, "learning_rate": 1.3927208131713848e-05, "loss": 0.0938, "step": 25790 }, { "epoch": 2.343430673509242, "grad_norm": 1.5754629373550415, "learning_rate": 1.389063447926493e-05, "loss": 0.1188, "step": 25800 }, { "epoch": 2.3443389799718424, "grad_norm": 1.024586796760559, "learning_rate": 1.385410116395181e-05, "loss": 0.0682, "step": 25810 }, { "epoch": 2.345247286434443, "grad_norm": 2.522644281387329, "learning_rate": 1.381760822658521e-05, "loss": 0.1036, "step": 25820 }, { "epoch": 2.3461555928970435, "grad_norm": 0.4203888177871704, "learning_rate": 1.3781155707930743e-05, "loss": 0.0905, "step": 25830 }, { "epoch": 2.347063899359644, "grad_norm": 2.0465009212493896, "learning_rate": 1.3744743648708875e-05, "loss": 0.0522, "step": 25840 }, { "epoch": 2.3479722058222445, "grad_norm": 0.4610568881034851, "learning_rate": 1.3708372089594862e-05, "loss": 0.1222, "step": 25850 }, { "epoch": 2.348880512284845, "grad_norm": 2.4422056674957275, "learning_rate": 1.3672041071218767e-05, "loss": 0.051, "step": 25860 }, { "epoch": 2.349788818747445, "grad_norm": 2.8435614109039307, "learning_rate": 1.3635750634165311e-05, "loss": 0.1389, "step": 25870 }, { "epoch": 2.3506971252100457, "grad_norm": 1.0706357955932617, "learning_rate": 1.3599500818973898e-05, "loss": 0.067, "step": 25880 }, { "epoch": 2.3516054316726462, "grad_norm": 1.9931360483169556, "learning_rate": 1.3563291666138566e-05, "loss": 0.159, "step": 25890 }, { "epoch": 2.352513738135247, "grad_norm": 1.6539133787155151, "learning_rate": 1.3527123216107896e-05, "loss": 0.0777, "step": 25900 }, { "epoch": 2.3534220445978473, "grad_norm": 2.0319983959198, "learning_rate": 1.3490995509285064e-05, "loss": 0.0843, "step": 25910 }, { "epoch": 2.354330351060448, "grad_norm": 0.7722983956336975, "learning_rate": 1.3454908586027682e-05, "loss": 0.1332, "step": 25920 }, { "epoch": 2.3552386575230484, "grad_norm": 0.8040679693222046, "learning_rate": 1.3418862486647805e-05, "loss": 0.043, "step": 25930 }, { "epoch": 2.356146963985649, "grad_norm": 1.933949589729309, "learning_rate": 1.33828572514119e-05, "loss": 0.1102, "step": 25940 }, { "epoch": 2.3570552704482495, "grad_norm": 1.0446518659591675, "learning_rate": 1.3346892920540788e-05, "loss": 0.1051, "step": 25950 }, { "epoch": 2.3579635769108496, "grad_norm": 0.5067903995513916, "learning_rate": 1.3310969534209584e-05, "loss": 0.0812, "step": 25960 }, { "epoch": 2.35887188337345, "grad_norm": 1.5828933715820312, "learning_rate": 1.3275087132547665e-05, "loss": 0.1165, "step": 25970 }, { "epoch": 2.3597801898360506, "grad_norm": 2.3814938068389893, "learning_rate": 1.323924575563863e-05, "loss": 0.1345, "step": 25980 }, { "epoch": 2.360688496298651, "grad_norm": 1.9596654176712036, "learning_rate": 1.3203445443520257e-05, "loss": 0.0656, "step": 25990 }, { "epoch": 2.3615968027612517, "grad_norm": 0.8726671934127808, "learning_rate": 1.3167686236184423e-05, "loss": 0.2398, "step": 26000 }, { "epoch": 2.3615968027612517, "eval_loss": 0.2032337337732315, "eval_runtime": 1113.3125, "eval_samples_per_second": 8.791, "eval_steps_per_second": 8.791, "step": 26000 }, { "epoch": 2.3625051092238523, "grad_norm": 3.7932193279266357, "learning_rate": 1.3131968173577142e-05, "loss": 0.1828, "step": 26010 }, { "epoch": 2.363413415686453, "grad_norm": 3.034214496612549, "learning_rate": 1.3096291295598418e-05, "loss": 0.0935, "step": 26020 }, { "epoch": 2.364321722149053, "grad_norm": 1.3939217329025269, "learning_rate": 1.3060655642102254e-05, "loss": 0.1048, "step": 26030 }, { "epoch": 2.3652300286116534, "grad_norm": 0.6660311818122864, "learning_rate": 1.302506125289662e-05, "loss": 0.0865, "step": 26040 }, { "epoch": 2.366138335074254, "grad_norm": 0.944110095500946, "learning_rate": 1.2989508167743359e-05, "loss": 0.0814, "step": 26050 }, { "epoch": 2.3670466415368545, "grad_norm": 0.6009684801101685, "learning_rate": 1.2953996426358228e-05, "loss": 0.1194, "step": 26060 }, { "epoch": 2.367954947999455, "grad_norm": 1.0664825439453125, "learning_rate": 1.2918526068410758e-05, "loss": 0.1339, "step": 26070 }, { "epoch": 2.3688632544620556, "grad_norm": 1.8905296325683594, "learning_rate": 1.2883097133524252e-05, "loss": 0.1058, "step": 26080 }, { "epoch": 2.369771560924656, "grad_norm": 1.2868424654006958, "learning_rate": 1.2847709661275752e-05, "loss": 0.0951, "step": 26090 }, { "epoch": 2.3706798673872562, "grad_norm": 1.2623231410980225, "learning_rate": 1.2812363691195983e-05, "loss": 0.0667, "step": 26100 }, { "epoch": 2.3715881738498568, "grad_norm": 1.7939809560775757, "learning_rate": 1.2777059262769298e-05, "loss": 0.0785, "step": 26110 }, { "epoch": 2.3724964803124573, "grad_norm": 1.955164909362793, "learning_rate": 1.274179641543366e-05, "loss": 0.1172, "step": 26120 }, { "epoch": 2.373404786775058, "grad_norm": 0.9528709650039673, "learning_rate": 1.2706575188580566e-05, "loss": 0.1027, "step": 26130 }, { "epoch": 2.3743130932376584, "grad_norm": 1.4561142921447754, "learning_rate": 1.2671395621555038e-05, "loss": 0.1639, "step": 26140 }, { "epoch": 2.375221399700259, "grad_norm": 2.6684377193450928, "learning_rate": 1.2636257753655529e-05, "loss": 0.1211, "step": 26150 }, { "epoch": 2.3761297061628595, "grad_norm": 0.6672934293746948, "learning_rate": 1.2601161624133956e-05, "loss": 0.1165, "step": 26160 }, { "epoch": 2.37703801262546, "grad_norm": 2.4272427558898926, "learning_rate": 1.2566107272195582e-05, "loss": 0.098, "step": 26170 }, { "epoch": 2.3779463190880605, "grad_norm": 1.0865168571472168, "learning_rate": 1.2531094736999e-05, "loss": 0.0844, "step": 26180 }, { "epoch": 2.3788546255506606, "grad_norm": 2.2987494468688965, "learning_rate": 1.24961240576561e-05, "loss": 0.1293, "step": 26190 }, { "epoch": 2.379762932013261, "grad_norm": 1.424992561340332, "learning_rate": 1.2461195273231995e-05, "loss": 0.0594, "step": 26200 }, { "epoch": 2.3806712384758617, "grad_norm": 0.4942430257797241, "learning_rate": 1.2426308422745042e-05, "loss": 0.0524, "step": 26210 }, { "epoch": 2.3815795449384622, "grad_norm": 1.6812337636947632, "learning_rate": 1.2391463545166709e-05, "loss": 0.0995, "step": 26220 }, { "epoch": 2.382487851401063, "grad_norm": 0.6841968894004822, "learning_rate": 1.2356660679421606e-05, "loss": 0.0975, "step": 26230 }, { "epoch": 2.3833961578636633, "grad_norm": 1.4924936294555664, "learning_rate": 1.2321899864387404e-05, "loss": 0.065, "step": 26240 }, { "epoch": 2.384304464326264, "grad_norm": 2.8087143898010254, "learning_rate": 1.228718113889476e-05, "loss": 0.1194, "step": 26250 }, { "epoch": 2.385212770788864, "grad_norm": 0.3209075629711151, "learning_rate": 1.2252504541727388e-05, "loss": 0.0813, "step": 26260 }, { "epoch": 2.3861210772514645, "grad_norm": 2.0721240043640137, "learning_rate": 1.2217870111621898e-05, "loss": 0.1011, "step": 26270 }, { "epoch": 2.387029383714065, "grad_norm": 0.6685589551925659, "learning_rate": 1.2183277887267796e-05, "loss": 0.0687, "step": 26280 }, { "epoch": 2.3879376901766656, "grad_norm": 0.9407830834388733, "learning_rate": 1.2148727907307444e-05, "loss": 0.1205, "step": 26290 }, { "epoch": 2.388845996639266, "grad_norm": 2.6776010990142822, "learning_rate": 1.2114220210336002e-05, "loss": 0.095, "step": 26300 }, { "epoch": 2.3897543031018667, "grad_norm": 2.4497838020324707, "learning_rate": 1.2079754834901441e-05, "loss": 0.1142, "step": 26310 }, { "epoch": 2.390662609564467, "grad_norm": 1.2253493070602417, "learning_rate": 1.2045331819504413e-05, "loss": 0.1394, "step": 26320 }, { "epoch": 2.3915709160270673, "grad_norm": 2.0399117469787598, "learning_rate": 1.2010951202598253e-05, "loss": 0.1218, "step": 26330 }, { "epoch": 2.392479222489668, "grad_norm": 1.7184092998504639, "learning_rate": 1.1976613022588951e-05, "loss": 0.2291, "step": 26340 }, { "epoch": 2.3933875289522684, "grad_norm": 0.13486792147159576, "learning_rate": 1.1942317317835062e-05, "loss": 0.1028, "step": 26350 }, { "epoch": 2.394295835414869, "grad_norm": 0.5013140439987183, "learning_rate": 1.1908064126647745e-05, "loss": 0.0638, "step": 26360 }, { "epoch": 2.3952041418774694, "grad_norm": 0.7024948596954346, "learning_rate": 1.1873853487290626e-05, "loss": 0.0831, "step": 26370 }, { "epoch": 2.39611244834007, "grad_norm": 0.6970208287239075, "learning_rate": 1.1839685437979814e-05, "loss": 0.0645, "step": 26380 }, { "epoch": 2.3970207548026705, "grad_norm": 1.342077612876892, "learning_rate": 1.180556001688381e-05, "loss": 0.1241, "step": 26390 }, { "epoch": 2.397929061265271, "grad_norm": 0.9264432191848755, "learning_rate": 1.1771477262123515e-05, "loss": 0.1166, "step": 26400 }, { "epoch": 2.3988373677278716, "grad_norm": 1.9716691970825195, "learning_rate": 1.1737437211772211e-05, "loss": 0.2448, "step": 26410 }, { "epoch": 2.3997456741904717, "grad_norm": 1.054269552230835, "learning_rate": 1.1703439903855423e-05, "loss": 0.1145, "step": 26420 }, { "epoch": 2.4006539806530722, "grad_norm": 0.8036478757858276, "learning_rate": 1.1669485376350936e-05, "loss": 0.064, "step": 26430 }, { "epoch": 2.4015622871156728, "grad_norm": 1.6499825716018677, "learning_rate": 1.1635573667188759e-05, "loss": 0.1264, "step": 26440 }, { "epoch": 2.4024705935782733, "grad_norm": 2.1713640689849854, "learning_rate": 1.1601704814251058e-05, "loss": 0.1765, "step": 26450 }, { "epoch": 2.403378900040874, "grad_norm": 1.1716407537460327, "learning_rate": 1.1567878855372144e-05, "loss": 0.055, "step": 26460 }, { "epoch": 2.4042872065034744, "grad_norm": 2.433725595474243, "learning_rate": 1.1534095828338403e-05, "loss": 0.103, "step": 26470 }, { "epoch": 2.405195512966075, "grad_norm": 0.3787498474121094, "learning_rate": 1.1500355770888244e-05, "loss": 0.0746, "step": 26480 }, { "epoch": 2.406103819428675, "grad_norm": 2.229719638824463, "learning_rate": 1.1466658720712098e-05, "loss": 0.1697, "step": 26490 }, { "epoch": 2.4070121258912756, "grad_norm": 0.43033280968666077, "learning_rate": 1.1433004715452333e-05, "loss": 0.0303, "step": 26500 }, { "epoch": 2.4070121258912756, "eval_loss": 0.2016032636165619, "eval_runtime": 1113.7473, "eval_samples_per_second": 8.787, "eval_steps_per_second": 8.787, "step": 26500 }, { "epoch": 2.407920432353876, "grad_norm": 0.39431998133659363, "learning_rate": 1.1399393792703272e-05, "loss": 0.09, "step": 26510 }, { "epoch": 2.4088287388164766, "grad_norm": 0.5760634541511536, "learning_rate": 1.1365825990011064e-05, "loss": 0.0773, "step": 26520 }, { "epoch": 2.409737045279077, "grad_norm": 1.7922223806381226, "learning_rate": 1.1332301344873736e-05, "loss": 0.0779, "step": 26530 }, { "epoch": 2.4106453517416777, "grad_norm": 0.11288601905107498, "learning_rate": 1.1298819894741036e-05, "loss": 0.124, "step": 26540 }, { "epoch": 2.4115536582042782, "grad_norm": 1.8146154880523682, "learning_rate": 1.1265381677014514e-05, "loss": 0.0665, "step": 26550 }, { "epoch": 2.412461964666879, "grad_norm": 0.7868393063545227, "learning_rate": 1.1231986729047433e-05, "loss": 0.0653, "step": 26560 }, { "epoch": 2.4133702711294793, "grad_norm": 1.0106362104415894, "learning_rate": 1.1198635088144694e-05, "loss": 0.0621, "step": 26570 }, { "epoch": 2.4142785775920794, "grad_norm": 2.4492006301879883, "learning_rate": 1.1165326791562825e-05, "loss": 0.0945, "step": 26580 }, { "epoch": 2.41518688405468, "grad_norm": 1.1163064241409302, "learning_rate": 1.113206187650993e-05, "loss": 0.0578, "step": 26590 }, { "epoch": 2.4160951905172805, "grad_norm": 1.7863094806671143, "learning_rate": 1.1098840380145653e-05, "loss": 0.11, "step": 26600 }, { "epoch": 2.417003496979881, "grad_norm": 1.6648139953613281, "learning_rate": 1.1065662339581173e-05, "loss": 0.095, "step": 26610 }, { "epoch": 2.4179118034424816, "grad_norm": 1.7744226455688477, "learning_rate": 1.1032527791879071e-05, "loss": 0.1009, "step": 26620 }, { "epoch": 2.418820109905082, "grad_norm": 0.8902453184127808, "learning_rate": 1.0999436774053373e-05, "loss": 0.1607, "step": 26630 }, { "epoch": 2.4197284163676827, "grad_norm": 0.8043012619018555, "learning_rate": 1.096638932306947e-05, "loss": 0.1105, "step": 26640 }, { "epoch": 2.4206367228302827, "grad_norm": 2.1253483295440674, "learning_rate": 1.0933385475844072e-05, "loss": 0.0684, "step": 26650 }, { "epoch": 2.4215450292928833, "grad_norm": 0.7793969511985779, "learning_rate": 1.0900425269245223e-05, "loss": 0.041, "step": 26660 }, { "epoch": 2.422453335755484, "grad_norm": 0.003352551255375147, "learning_rate": 1.0867508740092186e-05, "loss": 0.0629, "step": 26670 }, { "epoch": 2.4233616422180844, "grad_norm": 1.2452113628387451, "learning_rate": 1.0834635925155418e-05, "loss": 0.0785, "step": 26680 }, { "epoch": 2.424269948680685, "grad_norm": 1.3789457082748413, "learning_rate": 1.0801806861156561e-05, "loss": 0.1003, "step": 26690 }, { "epoch": 2.4251782551432854, "grad_norm": 0.34679970145225525, "learning_rate": 1.0769021584768384e-05, "loss": 0.0688, "step": 26700 }, { "epoch": 2.426086561605886, "grad_norm": 0.22185184061527252, "learning_rate": 1.0736280132614757e-05, "loss": 0.0837, "step": 26710 }, { "epoch": 2.426994868068486, "grad_norm": 0.015234981663525105, "learning_rate": 1.0703582541270573e-05, "loss": 0.149, "step": 26720 }, { "epoch": 2.4279031745310866, "grad_norm": 0.33736422657966614, "learning_rate": 1.0670928847261736e-05, "loss": 0.0715, "step": 26730 }, { "epoch": 2.428811480993687, "grad_norm": 0.8287287950515747, "learning_rate": 1.0638319087065102e-05, "loss": 0.0265, "step": 26740 }, { "epoch": 2.4297197874562877, "grad_norm": 0.2001328021287918, "learning_rate": 1.0605753297108455e-05, "loss": 0.0405, "step": 26750 }, { "epoch": 2.4306280939188882, "grad_norm": 2.994689702987671, "learning_rate": 1.0573231513770493e-05, "loss": 0.146, "step": 26760 }, { "epoch": 2.4315364003814888, "grad_norm": 3.3604562282562256, "learning_rate": 1.0540753773380707e-05, "loss": 0.1139, "step": 26770 }, { "epoch": 2.4324447068440893, "grad_norm": 0.20142370462417603, "learning_rate": 1.0508320112219411e-05, "loss": 0.1717, "step": 26780 }, { "epoch": 2.43335301330669, "grad_norm": 2.0832529067993164, "learning_rate": 1.0475930566517678e-05, "loss": 0.1098, "step": 26790 }, { "epoch": 2.4342613197692904, "grad_norm": 1.1982722282409668, "learning_rate": 1.0443585172457282e-05, "loss": 0.0829, "step": 26800 }, { "epoch": 2.4351696262318905, "grad_norm": 0.37016212940216064, "learning_rate": 1.0411283966170737e-05, "loss": 0.0863, "step": 26810 }, { "epoch": 2.436077932694491, "grad_norm": 5.042032241821289, "learning_rate": 1.0379026983741103e-05, "loss": 0.1337, "step": 26820 }, { "epoch": 2.4369862391570916, "grad_norm": 2.4578542709350586, "learning_rate": 1.0346814261202115e-05, "loss": 0.1172, "step": 26830 }, { "epoch": 2.437894545619692, "grad_norm": 0.5710972547531128, "learning_rate": 1.0314645834538023e-05, "loss": 0.1093, "step": 26840 }, { "epoch": 2.4388028520822926, "grad_norm": 1.1063145399093628, "learning_rate": 1.0282521739683598e-05, "loss": 0.1265, "step": 26850 }, { "epoch": 2.439711158544893, "grad_norm": 0.7670822143554688, "learning_rate": 1.0250442012524137e-05, "loss": 0.0908, "step": 26860 }, { "epoch": 2.4406194650074937, "grad_norm": 2.0779452323913574, "learning_rate": 1.021840668889531e-05, "loss": 0.0948, "step": 26870 }, { "epoch": 2.441527771470094, "grad_norm": 2.435546398162842, "learning_rate": 1.0186415804583228e-05, "loss": 0.1075, "step": 26880 }, { "epoch": 2.4424360779326943, "grad_norm": 0.44046902656555176, "learning_rate": 1.0154469395324318e-05, "loss": 0.1062, "step": 26890 }, { "epoch": 2.443344384395295, "grad_norm": 1.3125042915344238, "learning_rate": 1.012256749680538e-05, "loss": 0.066, "step": 26900 }, { "epoch": 2.4442526908578954, "grad_norm": 2.1012260913848877, "learning_rate": 1.0090710144663457e-05, "loss": 0.163, "step": 26910 }, { "epoch": 2.445160997320496, "grad_norm": 0.41931766271591187, "learning_rate": 1.0058897374485825e-05, "loss": 0.0643, "step": 26920 }, { "epoch": 2.4460693037830965, "grad_norm": 1.105959177017212, "learning_rate": 1.0027129221809977e-05, "loss": 0.0885, "step": 26930 }, { "epoch": 2.446977610245697, "grad_norm": 0.837913453578949, "learning_rate": 9.995405722123541e-06, "loss": 0.1348, "step": 26940 }, { "epoch": 2.447885916708297, "grad_norm": 0.1895887404680252, "learning_rate": 9.963726910864302e-06, "loss": 0.1368, "step": 26950 }, { "epoch": 2.4487942231708977, "grad_norm": 1.899917721748352, "learning_rate": 9.932092823420109e-06, "loss": 0.1268, "step": 26960 }, { "epoch": 2.449702529633498, "grad_norm": 1.1699455976486206, "learning_rate": 9.900503495128804e-06, "loss": 0.1609, "step": 26970 }, { "epoch": 2.4506108360960988, "grad_norm": 1.4201322793960571, "learning_rate": 9.868958961278296e-06, "loss": 0.1047, "step": 26980 }, { "epoch": 2.4515191425586993, "grad_norm": 2.5638911724090576, "learning_rate": 9.837459257106412e-06, "loss": 0.0488, "step": 26990 }, { "epoch": 2.4524274490213, "grad_norm": 1.2927517890930176, "learning_rate": 9.80600441780094e-06, "loss": 0.0766, "step": 27000 }, { "epoch": 2.4524274490213, "eval_loss": 0.204449862241745, "eval_runtime": 1106.9469, "eval_samples_per_second": 8.841, "eval_steps_per_second": 8.841, "step": 27000 }, { "epoch": 2.4533357554839004, "grad_norm": 1.742397665977478, "learning_rate": 9.77459447849951e-06, "loss": 0.098, "step": 27010 }, { "epoch": 2.454244061946501, "grad_norm": 0.017489587888121605, "learning_rate": 9.743229474289617e-06, "loss": 0.1152, "step": 27020 }, { "epoch": 2.4551523684091014, "grad_norm": 0.6116973161697388, "learning_rate": 9.71190944020855e-06, "loss": 0.0825, "step": 27030 }, { "epoch": 2.4560606748717015, "grad_norm": 0.3350277841091156, "learning_rate": 9.68063441124336e-06, "loss": 0.0651, "step": 27040 }, { "epoch": 2.456968981334302, "grad_norm": 2.9598093032836914, "learning_rate": 9.649404422330848e-06, "loss": 0.1672, "step": 27050 }, { "epoch": 2.4578772877969026, "grad_norm": 1.7602821588516235, "learning_rate": 9.618219508357485e-06, "loss": 0.0949, "step": 27060 }, { "epoch": 2.458785594259503, "grad_norm": 2.246508836746216, "learning_rate": 9.587079704159374e-06, "loss": 0.1344, "step": 27070 }, { "epoch": 2.4596939007221037, "grad_norm": 0.9709048271179199, "learning_rate": 9.555985044522248e-06, "loss": 0.1296, "step": 27080 }, { "epoch": 2.4606022071847042, "grad_norm": 1.7165356874465942, "learning_rate": 9.524935564181387e-06, "loss": 0.0784, "step": 27090 }, { "epoch": 2.4615105136473048, "grad_norm": 3.0283167362213135, "learning_rate": 9.493931297821667e-06, "loss": 0.1197, "step": 27100 }, { "epoch": 2.462418820109905, "grad_norm": 1.5717583894729614, "learning_rate": 9.462972280077354e-06, "loss": 0.1064, "step": 27110 }, { "epoch": 2.4633271265725054, "grad_norm": 2.09834361076355, "learning_rate": 9.432058545532246e-06, "loss": 0.1232, "step": 27120 }, { "epoch": 2.464235433035106, "grad_norm": 1.3690845966339111, "learning_rate": 9.401190128719528e-06, "loss": 0.0964, "step": 27130 }, { "epoch": 2.4651437394977065, "grad_norm": 1.6346335411071777, "learning_rate": 9.370367064121748e-06, "loss": 0.1156, "step": 27140 }, { "epoch": 2.466052045960307, "grad_norm": 0.5180050730705261, "learning_rate": 9.339589386170838e-06, "loss": 0.0915, "step": 27150 }, { "epoch": 2.4669603524229076, "grad_norm": 1.3035880327224731, "learning_rate": 9.308857129247984e-06, "loss": 0.1224, "step": 27160 }, { "epoch": 2.467868658885508, "grad_norm": 1.967984676361084, "learning_rate": 9.278170327683644e-06, "loss": 0.0568, "step": 27170 }, { "epoch": 2.4687769653481086, "grad_norm": 4.590625286102295, "learning_rate": 9.247529015757517e-06, "loss": 0.0999, "step": 27180 }, { "epoch": 2.469685271810709, "grad_norm": 2.8153553009033203, "learning_rate": 9.216933227698443e-06, "loss": 0.1206, "step": 27190 }, { "epoch": 2.4705935782733093, "grad_norm": 0.2843480110168457, "learning_rate": 9.186382997684479e-06, "loss": 0.1449, "step": 27200 }, { "epoch": 2.47150188473591, "grad_norm": 1.4871662855148315, "learning_rate": 9.155878359842723e-06, "loss": 0.1372, "step": 27210 }, { "epoch": 2.4724101911985104, "grad_norm": 1.629217267036438, "learning_rate": 9.125419348249376e-06, "loss": 0.1192, "step": 27220 }, { "epoch": 2.473318497661111, "grad_norm": 2.004460334777832, "learning_rate": 9.095005996929656e-06, "loss": 0.1052, "step": 27230 }, { "epoch": 2.4742268041237114, "grad_norm": 0.0011962150456383824, "learning_rate": 9.06463833985779e-06, "loss": 0.0793, "step": 27240 }, { "epoch": 2.475135110586312, "grad_norm": 2.251664876937866, "learning_rate": 9.03431641095695e-06, "loss": 0.0875, "step": 27250 }, { "epoch": 2.4760434170489125, "grad_norm": 1.1812680959701538, "learning_rate": 9.004040244099222e-06, "loss": 0.1694, "step": 27260 }, { "epoch": 2.4769517235115126, "grad_norm": 2.090235710144043, "learning_rate": 8.973809873105587e-06, "loss": 0.1317, "step": 27270 }, { "epoch": 2.477860029974113, "grad_norm": 0.23697219789028168, "learning_rate": 8.943625331745858e-06, "loss": 0.0931, "step": 27280 }, { "epoch": 2.4787683364367137, "grad_norm": 2.6405141353607178, "learning_rate": 8.913486653738635e-06, "loss": 0.1219, "step": 27290 }, { "epoch": 2.479676642899314, "grad_norm": 0.5027287602424622, "learning_rate": 8.883393872751334e-06, "loss": 0.1281, "step": 27300 }, { "epoch": 2.4805849493619148, "grad_norm": 2.511854410171509, "learning_rate": 8.85334702240006e-06, "loss": 0.1404, "step": 27310 }, { "epoch": 2.4814932558245153, "grad_norm": 0.5693429708480835, "learning_rate": 8.823346136249617e-06, "loss": 0.0689, "step": 27320 }, { "epoch": 2.482401562287116, "grad_norm": 2.607881784439087, "learning_rate": 8.793391247813465e-06, "loss": 0.1081, "step": 27330 }, { "epoch": 2.483309868749716, "grad_norm": 1.4274866580963135, "learning_rate": 8.763482390553667e-06, "loss": 0.0642, "step": 27340 }, { "epoch": 2.4842181752123165, "grad_norm": 3.8737223148345947, "learning_rate": 8.733619597880915e-06, "loss": 0.0726, "step": 27350 }, { "epoch": 2.485126481674917, "grad_norm": 0.6219329237937927, "learning_rate": 8.70380290315439e-06, "loss": 0.0584, "step": 27360 }, { "epoch": 2.4860347881375175, "grad_norm": 1.372384786605835, "learning_rate": 8.674032339681797e-06, "loss": 0.0785, "step": 27370 }, { "epoch": 2.486943094600118, "grad_norm": 2.021320343017578, "learning_rate": 8.644307940719305e-06, "loss": 0.1337, "step": 27380 }, { "epoch": 2.4878514010627186, "grad_norm": 0.8007686734199524, "learning_rate": 8.614629739471513e-06, "loss": 0.1663, "step": 27390 }, { "epoch": 2.488759707525319, "grad_norm": 0.2667044699192047, "learning_rate": 8.58499776909143e-06, "loss": 0.1241, "step": 27400 }, { "epoch": 2.4896680139879197, "grad_norm": 1.211715817451477, "learning_rate": 8.555412062680396e-06, "loss": 0.1003, "step": 27410 }, { "epoch": 2.4905763204505202, "grad_norm": 0.992534339427948, "learning_rate": 8.525872653288087e-06, "loss": 0.0989, "step": 27420 }, { "epoch": 2.4914846269131203, "grad_norm": 2.2205634117126465, "learning_rate": 8.496379573912455e-06, "loss": 0.0966, "step": 27430 }, { "epoch": 2.492392933375721, "grad_norm": 2.392956018447876, "learning_rate": 8.466932857499682e-06, "loss": 0.1074, "step": 27440 }, { "epoch": 2.4933012398383214, "grad_norm": 2.9962446689605713, "learning_rate": 8.43753253694421e-06, "loss": 0.1158, "step": 27450 }, { "epoch": 2.494209546300922, "grad_norm": 1.1651109457015991, "learning_rate": 8.408178645088605e-06, "loss": 0.0511, "step": 27460 }, { "epoch": 2.4951178527635225, "grad_norm": 1.7571938037872314, "learning_rate": 8.378871214723577e-06, "loss": 0.1375, "step": 27470 }, { "epoch": 2.496026159226123, "grad_norm": 0.08424451947212219, "learning_rate": 8.349610278587938e-06, "loss": 0.0756, "step": 27480 }, { "epoch": 2.4969344656887236, "grad_norm": 0.11937100440263748, "learning_rate": 8.320395869368563e-06, "loss": 0.1006, "step": 27490 }, { "epoch": 2.4978427721513237, "grad_norm": 0.5030651688575745, "learning_rate": 8.29122801970037e-06, "loss": 0.0822, "step": 27500 }, { "epoch": 2.4978427721513237, "eval_loss": 0.20289765298366547, "eval_runtime": 1111.971, "eval_samples_per_second": 8.801, "eval_steps_per_second": 8.801, "step": 27500 }, { "epoch": 2.498751078613924, "grad_norm": 3.838109254837036, "learning_rate": 8.262106762166238e-06, "loss": 0.073, "step": 27510 }, { "epoch": 2.4996593850765247, "grad_norm": 0.958215057849884, "learning_rate": 8.233032129297008e-06, "loss": 0.099, "step": 27520 }, { "epoch": 2.5005676915391253, "grad_norm": 0.36660656332969666, "learning_rate": 8.204004153571448e-06, "loss": 0.1365, "step": 27530 }, { "epoch": 2.501475998001726, "grad_norm": 1.4926785230636597, "learning_rate": 8.175022867416193e-06, "loss": 0.1035, "step": 27540 }, { "epoch": 2.5023843044643264, "grad_norm": 1.305115818977356, "learning_rate": 8.146088303205723e-06, "loss": 0.0583, "step": 27550 }, { "epoch": 2.503292610926927, "grad_norm": 2.413069248199463, "learning_rate": 8.11720049326234e-06, "loss": 0.0955, "step": 27560 }, { "epoch": 2.504200917389527, "grad_norm": 1.5041923522949219, "learning_rate": 8.0883594698561e-06, "loss": 0.1289, "step": 27570 }, { "epoch": 2.505109223852128, "grad_norm": 2.186065196990967, "learning_rate": 8.059565265204799e-06, "loss": 0.1764, "step": 27580 }, { "epoch": 2.506017530314728, "grad_norm": 1.17872154712677, "learning_rate": 8.03081791147393e-06, "loss": 0.1138, "step": 27590 }, { "epoch": 2.5069258367773286, "grad_norm": 0.007466814946383238, "learning_rate": 8.00211744077668e-06, "loss": 0.1055, "step": 27600 }, { "epoch": 2.507834143239929, "grad_norm": 2.4516642093658447, "learning_rate": 7.97346388517382e-06, "loss": 0.1094, "step": 27610 }, { "epoch": 2.5087424497025297, "grad_norm": 0.5182468891143799, "learning_rate": 7.944857276673739e-06, "loss": 0.1232, "step": 27620 }, { "epoch": 2.5096507561651302, "grad_norm": 1.3044612407684326, "learning_rate": 7.916297647232374e-06, "loss": 0.1454, "step": 27630 }, { "epoch": 2.5105590626277303, "grad_norm": 1.6682698726654053, "learning_rate": 7.887785028753163e-06, "loss": 0.0568, "step": 27640 }, { "epoch": 2.5114673690903313, "grad_norm": 2.497687339782715, "learning_rate": 7.859319453087083e-06, "loss": 0.0744, "step": 27650 }, { "epoch": 2.5123756755529314, "grad_norm": 0.32343927025794983, "learning_rate": 7.830900952032517e-06, "loss": 0.1072, "step": 27660 }, { "epoch": 2.513283982015532, "grad_norm": 0.4784739017486572, "learning_rate": 7.802529557335265e-06, "loss": 0.0987, "step": 27670 }, { "epoch": 2.5141922884781325, "grad_norm": 2.1008360385894775, "learning_rate": 7.774205300688508e-06, "loss": 0.1858, "step": 27680 }, { "epoch": 2.515100594940733, "grad_norm": 0.5433931946754456, "learning_rate": 7.745928213732784e-06, "loss": 0.1, "step": 27690 }, { "epoch": 2.5160089014033336, "grad_norm": 0.7346005439758301, "learning_rate": 7.717698328055922e-06, "loss": 0.0946, "step": 27700 }, { "epoch": 2.516917207865934, "grad_norm": 1.048302173614502, "learning_rate": 7.689515675193038e-06, "loss": 0.0504, "step": 27710 }, { "epoch": 2.5178255143285346, "grad_norm": 3.0844874382019043, "learning_rate": 7.661380286626469e-06, "loss": 0.0487, "step": 27720 }, { "epoch": 2.5187338207911347, "grad_norm": 0.04404810816049576, "learning_rate": 7.633292193785758e-06, "loss": 0.0551, "step": 27730 }, { "epoch": 2.5196421272537353, "grad_norm": 2.6178696155548096, "learning_rate": 7.605251428047616e-06, "loss": 0.1036, "step": 27740 }, { "epoch": 2.520550433716336, "grad_norm": 1.066431999206543, "learning_rate": 7.577258020735906e-06, "loss": 0.077, "step": 27750 }, { "epoch": 2.5214587401789363, "grad_norm": 3.030526876449585, "learning_rate": 7.549312003121561e-06, "loss": 0.1284, "step": 27760 }, { "epoch": 2.522367046641537, "grad_norm": 3.055703639984131, "learning_rate": 7.521413406422584e-06, "loss": 0.0851, "step": 27770 }, { "epoch": 2.5232753531041374, "grad_norm": 1.474138855934143, "learning_rate": 7.4935622618040044e-06, "loss": 0.0885, "step": 27780 }, { "epoch": 2.524183659566738, "grad_norm": 0.46433618664741516, "learning_rate": 7.465758600377837e-06, "loss": 0.1002, "step": 27790 }, { "epoch": 2.525091966029338, "grad_norm": 0.16236822307109833, "learning_rate": 7.4380024532030805e-06, "loss": 0.0778, "step": 27800 }, { "epoch": 2.526000272491939, "grad_norm": 2.0387675762176514, "learning_rate": 7.410293851285627e-06, "loss": 0.0797, "step": 27810 }, { "epoch": 2.526908578954539, "grad_norm": 0.8980036973953247, "learning_rate": 7.382632825578273e-06, "loss": 0.0858, "step": 27820 }, { "epoch": 2.5278168854171397, "grad_norm": 0.7167443037033081, "learning_rate": 7.355019406980651e-06, "loss": 0.0871, "step": 27830 }, { "epoch": 2.52872519187974, "grad_norm": 3.1929051876068115, "learning_rate": 7.3274536263392365e-06, "loss": 0.1046, "step": 27840 }, { "epoch": 2.5296334983423407, "grad_norm": 1.3811477422714233, "learning_rate": 7.299935514447271e-06, "loss": 0.0528, "step": 27850 }, { "epoch": 2.5305418048049413, "grad_norm": 1.3244705200195312, "learning_rate": 7.2724651020447565e-06, "loss": 0.0835, "step": 27860 }, { "epoch": 2.531450111267542, "grad_norm": 2.295027494430542, "learning_rate": 7.245042419818399e-06, "loss": 0.0661, "step": 27870 }, { "epoch": 2.5323584177301424, "grad_norm": 0.9220595955848694, "learning_rate": 7.217667498401598e-06, "loss": 0.0644, "step": 27880 }, { "epoch": 2.5332667241927425, "grad_norm": 0.7274656295776367, "learning_rate": 7.190340368374387e-06, "loss": 0.0606, "step": 27890 }, { "epoch": 2.534175030655343, "grad_norm": 2.7286479473114014, "learning_rate": 7.163061060263443e-06, "loss": 0.1358, "step": 27900 }, { "epoch": 2.5350833371179435, "grad_norm": 0.5606793761253357, "learning_rate": 7.135829604541982e-06, "loss": 0.0705, "step": 27910 }, { "epoch": 2.535991643580544, "grad_norm": 2.269077777862549, "learning_rate": 7.10864603162979e-06, "loss": 0.0644, "step": 27920 }, { "epoch": 2.5368999500431446, "grad_norm": 2.703366279602051, "learning_rate": 7.081510371893158e-06, "loss": 0.2819, "step": 27930 }, { "epoch": 2.537808256505745, "grad_norm": 1.1045697927474976, "learning_rate": 7.054422655644838e-06, "loss": 0.0785, "step": 27940 }, { "epoch": 2.5387165629683457, "grad_norm": 3.0004451274871826, "learning_rate": 7.027382913144065e-06, "loss": 0.1167, "step": 27950 }, { "epoch": 2.539624869430946, "grad_norm": 1.1710636615753174, "learning_rate": 7.000391174596449e-06, "loss": 0.0931, "step": 27960 }, { "epoch": 2.5405331758935463, "grad_norm": 1.8321985006332397, "learning_rate": 6.973447470153982e-06, "loss": 0.1307, "step": 27970 }, { "epoch": 2.541441482356147, "grad_norm": 0.3588727116584778, "learning_rate": 6.946551829915004e-06, "loss": 0.0531, "step": 27980 }, { "epoch": 2.5423497888187474, "grad_norm": 1.0895037651062012, "learning_rate": 6.919704283924156e-06, "loss": 0.0906, "step": 27990 }, { "epoch": 2.543258095281348, "grad_norm": 17.334468841552734, "learning_rate": 6.892904862172367e-06, "loss": 0.1465, "step": 28000 }, { "epoch": 2.543258095281348, "eval_loss": 0.20570336282253265, "eval_runtime": 1112.9582, "eval_samples_per_second": 8.794, "eval_steps_per_second": 8.794, "step": 28000 }, { "epoch": 2.5441664017439485, "grad_norm": 3.0946710109710693, "learning_rate": 6.866153594596791e-06, "loss": 0.0852, "step": 28010 }, { "epoch": 2.545074708206549, "grad_norm": 0.007453919854015112, "learning_rate": 6.839450511080797e-06, "loss": 0.1119, "step": 28020 }, { "epoch": 2.545983014669149, "grad_norm": 1.530529499053955, "learning_rate": 6.812795641453939e-06, "loss": 0.0791, "step": 28030 }, { "epoch": 2.54689132113175, "grad_norm": 0.012950288131833076, "learning_rate": 6.786189015491878e-06, "loss": 0.0318, "step": 28040 }, { "epoch": 2.54779962759435, "grad_norm": 0.089525505900383, "learning_rate": 6.759630662916439e-06, "loss": 0.0888, "step": 28050 }, { "epoch": 2.5487079340569507, "grad_norm": 1.5975747108459473, "learning_rate": 6.733120613395471e-06, "loss": 0.141, "step": 28060 }, { "epoch": 2.5496162405195513, "grad_norm": 1.1116536855697632, "learning_rate": 6.706658896542894e-06, "loss": 0.1267, "step": 28070 }, { "epoch": 2.550524546982152, "grad_norm": 1.905626654624939, "learning_rate": 6.6802455419186e-06, "loss": 0.0899, "step": 28080 }, { "epoch": 2.5514328534447523, "grad_norm": 2.450395107269287, "learning_rate": 6.653880579028515e-06, "loss": 0.0727, "step": 28090 }, { "epoch": 2.552341159907353, "grad_norm": 0.4655876159667969, "learning_rate": 6.627564037324457e-06, "loss": 0.1409, "step": 28100 }, { "epoch": 2.5532494663699534, "grad_norm": 0.6269615292549133, "learning_rate": 6.601295946204178e-06, "loss": 0.0952, "step": 28110 }, { "epoch": 2.5541577728325535, "grad_norm": 2.5221359729766846, "learning_rate": 6.575076335011293e-06, "loss": 0.0789, "step": 28120 }, { "epoch": 2.555066079295154, "grad_norm": 2.5499472618103027, "learning_rate": 6.548905233035269e-06, "loss": 0.1459, "step": 28130 }, { "epoch": 2.5559743857577546, "grad_norm": 0.9899567365646362, "learning_rate": 6.522782669511379e-06, "loss": 0.1551, "step": 28140 }, { "epoch": 2.556882692220355, "grad_norm": 3.3933355808258057, "learning_rate": 6.496708673620688e-06, "loss": 0.0937, "step": 28150 }, { "epoch": 2.5577909986829557, "grad_norm": 2.3017115592956543, "learning_rate": 6.4706832744899914e-06, "loss": 0.1618, "step": 28160 }, { "epoch": 2.558699305145556, "grad_norm": 0.003403515089303255, "learning_rate": 6.444706501191805e-06, "loss": 0.0718, "step": 28170 }, { "epoch": 2.5596076116081568, "grad_norm": 1.5001386404037476, "learning_rate": 6.4187783827443116e-06, "loss": 0.1272, "step": 28180 }, { "epoch": 2.560515918070757, "grad_norm": 1.497300624847412, "learning_rate": 6.392898948111376e-06, "loss": 0.1084, "step": 28190 }, { "epoch": 2.561424224533358, "grad_norm": 6.053680419921875, "learning_rate": 6.367068226202455e-06, "loss": 0.0618, "step": 28200 }, { "epoch": 2.562332530995958, "grad_norm": 1.5334755182266235, "learning_rate": 6.341286245872585e-06, "loss": 0.1053, "step": 28210 }, { "epoch": 2.5632408374585585, "grad_norm": 0.009855972602963448, "learning_rate": 6.3155530359223665e-06, "loss": 0.1289, "step": 28220 }, { "epoch": 2.564149143921159, "grad_norm": 2.639949083328247, "learning_rate": 6.2898686250979e-06, "loss": 0.1633, "step": 28230 }, { "epoch": 2.5650574503837595, "grad_norm": 0.501032829284668, "learning_rate": 6.2642330420908116e-06, "loss": 0.1068, "step": 28240 }, { "epoch": 2.56596575684636, "grad_norm": 0.7856234312057495, "learning_rate": 6.238646315538155e-06, "loss": 0.0874, "step": 28250 }, { "epoch": 2.56687406330896, "grad_norm": 2.1558101177215576, "learning_rate": 6.2131084740224e-06, "loss": 0.1025, "step": 28260 }, { "epoch": 2.567782369771561, "grad_norm": 0.12067554146051407, "learning_rate": 6.187619546071421e-06, "loss": 0.1097, "step": 28270 }, { "epoch": 2.5686906762341613, "grad_norm": 0.825342059135437, "learning_rate": 6.16217956015846e-06, "loss": 0.1384, "step": 28280 }, { "epoch": 2.569598982696762, "grad_norm": 0.36013853549957275, "learning_rate": 6.136788544702066e-06, "loss": 0.0645, "step": 28290 }, { "epoch": 2.5705072891593623, "grad_norm": 0.6107112765312195, "learning_rate": 6.111446528066101e-06, "loss": 0.1144, "step": 28300 }, { "epoch": 2.571415595621963, "grad_norm": 0.2887214124202728, "learning_rate": 6.0861535385596856e-06, "loss": 0.0394, "step": 28310 }, { "epoch": 2.5723239020845634, "grad_norm": 1.5467320680618286, "learning_rate": 6.06090960443717e-06, "loss": 0.0601, "step": 28320 }, { "epoch": 2.573232208547164, "grad_norm": 0.7531377673149109, "learning_rate": 6.035714753898097e-06, "loss": 0.0961, "step": 28330 }, { "epoch": 2.5741405150097645, "grad_norm": 1.954485535621643, "learning_rate": 6.010569015087214e-06, "loss": 0.1536, "step": 28340 }, { "epoch": 2.5750488214723646, "grad_norm": 3.049821615219116, "learning_rate": 5.9854724160943675e-06, "loss": 0.0966, "step": 28350 }, { "epoch": 2.575957127934965, "grad_norm": 1.4616351127624512, "learning_rate": 5.960424984954527e-06, "loss": 0.1433, "step": 28360 }, { "epoch": 2.5768654343975657, "grad_norm": 2.0658764839172363, "learning_rate": 5.935426749647743e-06, "loss": 0.1017, "step": 28370 }, { "epoch": 2.577773740860166, "grad_norm": 1.4275517463684082, "learning_rate": 5.910477738099085e-06, "loss": 0.0901, "step": 28380 }, { "epoch": 2.5786820473227667, "grad_norm": 0.5202794671058655, "learning_rate": 5.88557797817868e-06, "loss": 0.0938, "step": 28390 }, { "epoch": 2.5795903537853673, "grad_norm": 2.6823065280914307, "learning_rate": 5.8607274977015954e-06, "loss": 0.139, "step": 28400 }, { "epoch": 2.580498660247968, "grad_norm": 0.39046069979667664, "learning_rate": 5.835926324427871e-06, "loss": 0.1252, "step": 28410 }, { "epoch": 2.581406966710568, "grad_norm": 1.1960668563842773, "learning_rate": 5.81117448606246e-06, "loss": 0.0835, "step": 28420 }, { "epoch": 2.582315273173169, "grad_norm": 1.122786521911621, "learning_rate": 5.786472010255195e-06, "loss": 0.1161, "step": 28430 }, { "epoch": 2.583223579635769, "grad_norm": 1.101189374923706, "learning_rate": 5.761818924600793e-06, "loss": 0.0754, "step": 28440 }, { "epoch": 2.5841318860983695, "grad_norm": 1.6679588556289673, "learning_rate": 5.737215256638767e-06, "loss": 0.0688, "step": 28450 }, { "epoch": 2.58504019256097, "grad_norm": 1.3174406290054321, "learning_rate": 5.712661033853445e-06, "loss": 0.1349, "step": 28460 }, { "epoch": 2.5859484990235706, "grad_norm": 2.267176866531372, "learning_rate": 5.688156283673918e-06, "loss": 0.138, "step": 28470 }, { "epoch": 2.586856805486171, "grad_norm": 0.8204176425933838, "learning_rate": 5.663701033474e-06, "loss": 0.0939, "step": 28480 }, { "epoch": 2.5877651119487717, "grad_norm": 2.5150461196899414, "learning_rate": 5.63929531057224e-06, "loss": 0.0619, "step": 28490 }, { "epoch": 2.588673418411372, "grad_norm": 1.7597970962524414, "learning_rate": 5.614939142231834e-06, "loss": 0.094, "step": 28500 }, { "epoch": 2.588673418411372, "eval_loss": 0.2005964070558548, "eval_runtime": 1106.777, "eval_samples_per_second": 8.843, "eval_steps_per_second": 8.843, "step": 28500 }, { "epoch": 2.5895817248739723, "grad_norm": 1.1949470043182373, "learning_rate": 5.590632555660625e-06, "loss": 0.0955, "step": 28510 }, { "epoch": 2.590490031336573, "grad_norm": 1.5125807523727417, "learning_rate": 5.566375578011068e-06, "loss": 0.1296, "step": 28520 }, { "epoch": 2.5913983377991734, "grad_norm": 1.5749651193618774, "learning_rate": 5.542168236380202e-06, "loss": 0.0982, "step": 28530 }, { "epoch": 2.592306644261774, "grad_norm": 0.7008296847343445, "learning_rate": 5.5180105578096445e-06, "loss": 0.0554, "step": 28540 }, { "epoch": 2.5932149507243745, "grad_norm": 0.5390353798866272, "learning_rate": 5.493902569285492e-06, "loss": 0.0548, "step": 28550 }, { "epoch": 2.594123257186975, "grad_norm": 0.5282658934593201, "learning_rate": 5.4698442977383575e-06, "loss": 0.0879, "step": 28560 }, { "epoch": 2.5950315636495755, "grad_norm": 0.671007513999939, "learning_rate": 5.445835770043317e-06, "loss": 0.1042, "step": 28570 }, { "epoch": 2.5959398701121756, "grad_norm": 0.3196677565574646, "learning_rate": 5.4218770130198715e-06, "loss": 0.0851, "step": 28580 }, { "epoch": 2.596848176574776, "grad_norm": 2.2235167026519775, "learning_rate": 5.397968053431923e-06, "loss": 0.1021, "step": 28590 }, { "epoch": 2.5977564830373767, "grad_norm": 1.214285969734192, "learning_rate": 5.374108917987752e-06, "loss": 0.1153, "step": 28600 }, { "epoch": 2.5986647894999773, "grad_norm": 1.729266881942749, "learning_rate": 5.350299633339978e-06, "loss": 0.1087, "step": 28610 }, { "epoch": 2.599573095962578, "grad_norm": 1.6388075351715088, "learning_rate": 5.32654022608553e-06, "loss": 0.1175, "step": 28620 }, { "epoch": 2.6004814024251783, "grad_norm": 1.2293556928634644, "learning_rate": 5.302830722765623e-06, "loss": 0.1261, "step": 28630 }, { "epoch": 2.601389708887779, "grad_norm": 3.485309362411499, "learning_rate": 5.27917114986573e-06, "loss": 0.1795, "step": 28640 }, { "epoch": 2.602298015350379, "grad_norm": 2.12432861328125, "learning_rate": 5.25556153381554e-06, "loss": 0.1073, "step": 28650 }, { "epoch": 2.60320632181298, "grad_norm": 3.244194507598877, "learning_rate": 5.232001900988942e-06, "loss": 0.0593, "step": 28660 }, { "epoch": 2.60411462827558, "grad_norm": 0.0013577858917415142, "learning_rate": 5.208492277703986e-06, "loss": 0.0436, "step": 28670 }, { "epoch": 2.6050229347381806, "grad_norm": 1.127408742904663, "learning_rate": 5.185032690222846e-06, "loss": 0.0646, "step": 28680 }, { "epoch": 2.605931241200781, "grad_norm": 1.7183469533920288, "learning_rate": 5.161623164751828e-06, "loss": 0.1501, "step": 28690 }, { "epoch": 2.6068395476633817, "grad_norm": 1.6404086351394653, "learning_rate": 5.138263727441301e-06, "loss": 0.1537, "step": 28700 }, { "epoch": 2.607747854125982, "grad_norm": 1.1747888326644897, "learning_rate": 5.114954404385675e-06, "loss": 0.0653, "step": 28710 }, { "epoch": 2.6086561605885827, "grad_norm": 3.6159298419952393, "learning_rate": 5.0916952216233885e-06, "loss": 0.1086, "step": 28720 }, { "epoch": 2.6095644670511833, "grad_norm": 1.5741169452667236, "learning_rate": 5.068486205136852e-06, "loss": 0.0997, "step": 28730 }, { "epoch": 2.6104727735137834, "grad_norm": 1.638576626777649, "learning_rate": 5.045327380852466e-06, "loss": 0.0606, "step": 28740 }, { "epoch": 2.611381079976384, "grad_norm": 0.29194092750549316, "learning_rate": 5.02221877464053e-06, "loss": 0.1483, "step": 28750 }, { "epoch": 2.6122893864389845, "grad_norm": 1.0335850715637207, "learning_rate": 4.999160412315274e-06, "loss": 0.1209, "step": 28760 }, { "epoch": 2.613197692901585, "grad_norm": 1.7188044786453247, "learning_rate": 4.976152319634775e-06, "loss": 0.1038, "step": 28770 }, { "epoch": 2.6141059993641855, "grad_norm": 2.8763232231140137, "learning_rate": 4.953194522300969e-06, "loss": 0.1018, "step": 28780 }, { "epoch": 2.615014305826786, "grad_norm": 2.0517826080322266, "learning_rate": 4.930287045959619e-06, "loss": 0.1041, "step": 28790 }, { "epoch": 2.6159226122893866, "grad_norm": 0.8920056819915771, "learning_rate": 4.907429916200257e-06, "loss": 0.1738, "step": 28800 }, { "epoch": 2.6168309187519867, "grad_norm": 0.18275237083435059, "learning_rate": 4.884623158556179e-06, "loss": 0.0659, "step": 28810 }, { "epoch": 2.6177392252145872, "grad_norm": 1.659462332725525, "learning_rate": 4.861866798504411e-06, "loss": 0.0476, "step": 28820 }, { "epoch": 2.618647531677188, "grad_norm": 1.9730417728424072, "learning_rate": 4.8391608614656746e-06, "loss": 0.1311, "step": 28830 }, { "epoch": 2.6195558381397883, "grad_norm": 2.1103951930999756, "learning_rate": 4.8165053728043865e-06, "loss": 0.1238, "step": 28840 }, { "epoch": 2.620464144602389, "grad_norm": 2.3336870670318604, "learning_rate": 4.79390035782859e-06, "loss": 0.0843, "step": 28850 }, { "epoch": 2.6213724510649894, "grad_norm": 2.8668391704559326, "learning_rate": 4.771345841789942e-06, "loss": 0.1216, "step": 28860 }, { "epoch": 2.62228075752759, "grad_norm": 1.5358473062515259, "learning_rate": 4.748841849883701e-06, "loss": 0.0844, "step": 28870 }, { "epoch": 2.62318906399019, "grad_norm": 0.5765732526779175, "learning_rate": 4.726388407248677e-06, "loss": 0.0633, "step": 28880 }, { "epoch": 2.624097370452791, "grad_norm": 1.0130410194396973, "learning_rate": 4.70398553896722e-06, "loss": 0.1214, "step": 28890 }, { "epoch": 2.625005676915391, "grad_norm": 1.2017775774002075, "learning_rate": 4.681633270065172e-06, "loss": 0.1017, "step": 28900 }, { "epoch": 2.6259139833779916, "grad_norm": 1.6888262033462524, "learning_rate": 4.659331625511859e-06, "loss": 0.1184, "step": 28910 }, { "epoch": 2.626822289840592, "grad_norm": 2.0918190479278564, "learning_rate": 4.6370806302200645e-06, "loss": 0.0926, "step": 28920 }, { "epoch": 2.6277305963031927, "grad_norm": 1.3769179582595825, "learning_rate": 4.61488030904596e-06, "loss": 0.057, "step": 28930 }, { "epoch": 2.6286389027657933, "grad_norm": 1.01719331741333, "learning_rate": 4.592730686789159e-06, "loss": 0.0818, "step": 28940 }, { "epoch": 2.629547209228394, "grad_norm": 1.78557550907135, "learning_rate": 4.570631788192609e-06, "loss": 0.0871, "step": 28950 }, { "epoch": 2.6304555156909943, "grad_norm": 1.086766004562378, "learning_rate": 4.548583637942594e-06, "loss": 0.1042, "step": 28960 }, { "epoch": 2.6313638221535944, "grad_norm": 2.000225067138672, "learning_rate": 4.526586260668714e-06, "loss": 0.1535, "step": 28970 }, { "epoch": 2.632272128616195, "grad_norm": 0.9845068454742432, "learning_rate": 4.504639680943845e-06, "loss": 0.0891, "step": 28980 }, { "epoch": 2.6331804350787955, "grad_norm": 0.06618889421224594, "learning_rate": 4.482743923284144e-06, "loss": 0.0714, "step": 28990 }, { "epoch": 2.634088741541396, "grad_norm": 3.6077568531036377, "learning_rate": 4.460899012148961e-06, "loss": 0.1033, "step": 29000 }, { "epoch": 2.634088741541396, "eval_loss": 0.20123445987701416, "eval_runtime": 1110.9769, "eval_samples_per_second": 8.809, "eval_steps_per_second": 8.809, "step": 29000 }, { "epoch": 2.6349970480039966, "grad_norm": 0.5582040548324585, "learning_rate": 4.439104971940866e-06, "loss": 0.0801, "step": 29010 }, { "epoch": 2.635905354466597, "grad_norm": 1.2901995182037354, "learning_rate": 4.417361827005595e-06, "loss": 0.1061, "step": 29020 }, { "epoch": 2.6368136609291977, "grad_norm": 1.6436207294464111, "learning_rate": 4.39566960163203e-06, "loss": 0.1033, "step": 29030 }, { "epoch": 2.6377219673917978, "grad_norm": 1.6988452672958374, "learning_rate": 4.374028320052182e-06, "loss": 0.1267, "step": 29040 }, { "epoch": 2.6386302738543987, "grad_norm": 0.45825663208961487, "learning_rate": 4.352438006441134e-06, "loss": 0.0996, "step": 29050 }, { "epoch": 2.639538580316999, "grad_norm": 0.48270633816719055, "learning_rate": 4.3308986849170565e-06, "loss": 0.0952, "step": 29060 }, { "epoch": 2.6404468867795994, "grad_norm": 0.7905918955802917, "learning_rate": 4.309410379541135e-06, "loss": 0.0644, "step": 29070 }, { "epoch": 2.6413551932422, "grad_norm": 2.7809743881225586, "learning_rate": 4.287973114317578e-06, "loss": 0.0877, "step": 29080 }, { "epoch": 2.6422634997048005, "grad_norm": 1.5405194759368896, "learning_rate": 4.266586913193582e-06, "loss": 0.0702, "step": 29090 }, { "epoch": 2.643171806167401, "grad_norm": 0.9896026849746704, "learning_rate": 4.245251800059302e-06, "loss": 0.1351, "step": 29100 }, { "epoch": 2.644080112630001, "grad_norm": 2.0052490234375, "learning_rate": 4.223967798747802e-06, "loss": 0.135, "step": 29110 }, { "epoch": 2.644988419092602, "grad_norm": 0.18172350525856018, "learning_rate": 4.202734933035074e-06, "loss": 0.0816, "step": 29120 }, { "epoch": 2.645896725555202, "grad_norm": 1.9559983015060425, "learning_rate": 4.181553226639967e-06, "loss": 0.0853, "step": 29130 }, { "epoch": 2.6468050320178027, "grad_norm": 1.9342668056488037, "learning_rate": 4.160422703224204e-06, "loss": 0.0703, "step": 29140 }, { "epoch": 2.6477133384804032, "grad_norm": 2.8364086151123047, "learning_rate": 4.1393433863923244e-06, "loss": 0.0919, "step": 29150 }, { "epoch": 2.648621644943004, "grad_norm": 0.8093467950820923, "learning_rate": 4.1183152996916496e-06, "loss": 0.1276, "step": 29160 }, { "epoch": 2.6495299514056043, "grad_norm": 1.4702037572860718, "learning_rate": 4.097338466612288e-06, "loss": 0.0994, "step": 29170 }, { "epoch": 2.650438257868205, "grad_norm": 0.9965925216674805, "learning_rate": 4.0764129105870905e-06, "loss": 0.0631, "step": 29180 }, { "epoch": 2.6513465643308054, "grad_norm": 2.7410595417022705, "learning_rate": 4.055538654991625e-06, "loss": 0.0931, "step": 29190 }, { "epoch": 2.6522548707934055, "grad_norm": 1.7234203815460205, "learning_rate": 4.034715723144161e-06, "loss": 0.1041, "step": 29200 }, { "epoch": 2.653163177256006, "grad_norm": 1.5100781917572021, "learning_rate": 4.013944138305625e-06, "loss": 0.0793, "step": 29210 }, { "epoch": 2.6540714837186066, "grad_norm": 1.945876955986023, "learning_rate": 3.9932239236795735e-06, "loss": 0.0765, "step": 29220 }, { "epoch": 2.654979790181207, "grad_norm": 2.229311943054199, "learning_rate": 3.972555102412223e-06, "loss": 0.1352, "step": 29230 }, { "epoch": 2.6558880966438076, "grad_norm": 1.1433398723602295, "learning_rate": 3.9519376975923285e-06, "loss": 0.1291, "step": 29240 }, { "epoch": 2.656796403106408, "grad_norm": 1.8518577814102173, "learning_rate": 3.931371732251238e-06, "loss": 0.108, "step": 29250 }, { "epoch": 2.6577047095690087, "grad_norm": 0.25171276926994324, "learning_rate": 3.910857229362819e-06, "loss": 0.1148, "step": 29260 }, { "epoch": 2.658613016031609, "grad_norm": 1.8992358446121216, "learning_rate": 3.890394211843468e-06, "loss": 0.1009, "step": 29270 }, { "epoch": 2.65952132249421, "grad_norm": 4.703619003295898, "learning_rate": 3.869982702552061e-06, "loss": 0.1716, "step": 29280 }, { "epoch": 2.66042962895681, "grad_norm": 0.04593246430158615, "learning_rate": 3.849622724289942e-06, "loss": 0.108, "step": 29290 }, { "epoch": 2.6613379354194104, "grad_norm": 2.1835103034973145, "learning_rate": 3.829314299800873e-06, "loss": 0.0986, "step": 29300 }, { "epoch": 2.662246241882011, "grad_norm": 0.9923092722892761, "learning_rate": 3.809057451771042e-06, "loss": 0.0626, "step": 29310 }, { "epoch": 2.6631545483446115, "grad_norm": 0.08765143156051636, "learning_rate": 3.7888522028290186e-06, "loss": 0.0957, "step": 29320 }, { "epoch": 2.664062854807212, "grad_norm": 1.025355339050293, "learning_rate": 3.768698575545726e-06, "loss": 0.1004, "step": 29330 }, { "epoch": 2.6649711612698126, "grad_norm": 1.4432151317596436, "learning_rate": 3.748596592434428e-06, "loss": 0.1093, "step": 29340 }, { "epoch": 2.665879467732413, "grad_norm": 0.4621047079563141, "learning_rate": 3.728546275950695e-06, "loss": 0.0967, "step": 29350 }, { "epoch": 2.6667877741950132, "grad_norm": 1.8613866567611694, "learning_rate": 3.7085476484923775e-06, "loss": 0.1222, "step": 29360 }, { "epoch": 2.6676960806576138, "grad_norm": 1.224278211593628, "learning_rate": 3.688600732399594e-06, "loss": 0.111, "step": 29370 }, { "epoch": 2.6686043871202143, "grad_norm": 1.9078925848007202, "learning_rate": 3.6687055499546987e-06, "loss": 0.089, "step": 29380 }, { "epoch": 2.669512693582815, "grad_norm": 1.6613056659698486, "learning_rate": 3.6488621233822463e-06, "loss": 0.0935, "step": 29390 }, { "epoch": 2.6704210000454154, "grad_norm": 1.299709439277649, "learning_rate": 3.629070474848972e-06, "loss": 0.1148, "step": 29400 }, { "epoch": 2.671329306508016, "grad_norm": 0.06136166676878929, "learning_rate": 3.609330626463786e-06, "loss": 0.0854, "step": 29410 }, { "epoch": 2.6722376129706165, "grad_norm": 1.259183406829834, "learning_rate": 3.589642600277715e-06, "loss": 0.0981, "step": 29420 }, { "epoch": 2.6731459194332166, "grad_norm": 0.8577300310134888, "learning_rate": 3.5700064182839165e-06, "loss": 0.1032, "step": 29430 }, { "epoch": 2.674054225895817, "grad_norm": 2.0929455757141113, "learning_rate": 3.5504221024176222e-06, "loss": 0.1102, "step": 29440 }, { "epoch": 2.6749625323584176, "grad_norm": 0.5819238424301147, "learning_rate": 3.530889674556126e-06, "loss": 0.1103, "step": 29450 }, { "epoch": 2.675870838821018, "grad_norm": 0.3556223213672638, "learning_rate": 3.5114091565187523e-06, "loss": 0.0385, "step": 29460 }, { "epoch": 2.6767791452836187, "grad_norm": 2.0016536712646484, "learning_rate": 3.491980570066844e-06, "loss": 0.0802, "step": 29470 }, { "epoch": 2.6776874517462192, "grad_norm": 0.7011592984199524, "learning_rate": 3.4726039369037455e-06, "loss": 0.0673, "step": 29480 }, { "epoch": 2.67859575820882, "grad_norm": 1.1301778554916382, "learning_rate": 3.453279278674737e-06, "loss": 0.0925, "step": 29490 }, { "epoch": 2.67950406467142, "grad_norm": 2.2361655235290527, "learning_rate": 3.4340066169670617e-06, "loss": 0.128, "step": 29500 }, { "epoch": 2.67950406467142, "eval_loss": 0.20267841219902039, "eval_runtime": 1110.7102, "eval_samples_per_second": 8.811, "eval_steps_per_second": 8.811, "step": 29500 }, { "epoch": 2.680412371134021, "grad_norm": 4.02304220199585, "learning_rate": 3.41478597330987e-06, "loss": 0.1116, "step": 29510 }, { "epoch": 2.681320677596621, "grad_norm": 1.6116136312484741, "learning_rate": 3.3956173691741923e-06, "loss": 0.0836, "step": 29520 }, { "epoch": 2.6822289840592215, "grad_norm": 1.64669668674469, "learning_rate": 3.3765008259729615e-06, "loss": 0.1416, "step": 29530 }, { "epoch": 2.683137290521822, "grad_norm": 0.9126670956611633, "learning_rate": 3.357436365060912e-06, "loss": 0.136, "step": 29540 }, { "epoch": 2.6840455969844226, "grad_norm": 0.015529902651906013, "learning_rate": 3.338424007734631e-06, "loss": 0.0861, "step": 29550 }, { "epoch": 2.684953903447023, "grad_norm": 1.2996901273727417, "learning_rate": 3.3194637752324785e-06, "loss": 0.1098, "step": 29560 }, { "epoch": 2.6858622099096237, "grad_norm": 2.004168748855591, "learning_rate": 3.3005556887345855e-06, "loss": 0.1051, "step": 29570 }, { "epoch": 2.686770516372224, "grad_norm": 0.7057539820671082, "learning_rate": 3.281699769362867e-06, "loss": 0.075, "step": 29580 }, { "epoch": 2.6876788228348243, "grad_norm": 1.2585818767547607, "learning_rate": 3.2628960381809236e-06, "loss": 0.063, "step": 29590 }, { "epoch": 2.688587129297425, "grad_norm": 2.021228313446045, "learning_rate": 3.2441445161940753e-06, "loss": 0.0835, "step": 29600 }, { "epoch": 2.6894954357600254, "grad_norm": 3.4618189334869385, "learning_rate": 3.2254452243493217e-06, "loss": 0.1011, "step": 29610 }, { "epoch": 2.690403742222626, "grad_norm": 1.3506410121917725, "learning_rate": 3.2067981835352924e-06, "loss": 0.0628, "step": 29620 }, { "epoch": 2.6913120486852264, "grad_norm": 1.4905776977539062, "learning_rate": 3.1882034145822912e-06, "loss": 0.0728, "step": 29630 }, { "epoch": 2.692220355147827, "grad_norm": 1.3281582593917847, "learning_rate": 3.1696609382621977e-06, "loss": 0.1027, "step": 29640 }, { "epoch": 2.6931286616104275, "grad_norm": 0.6949412822723389, "learning_rate": 3.151170775288481e-06, "loss": 0.1019, "step": 29650 }, { "epoch": 2.6940369680730276, "grad_norm": 1.2917380332946777, "learning_rate": 3.132732946316186e-06, "loss": 0.1149, "step": 29660 }, { "epoch": 2.694945274535628, "grad_norm": 0.45774102210998535, "learning_rate": 3.1143474719418696e-06, "loss": 0.0915, "step": 29670 }, { "epoch": 2.6958535809982287, "grad_norm": 0.8361861109733582, "learning_rate": 3.096014372703637e-06, "loss": 0.1289, "step": 29680 }, { "epoch": 2.6967618874608292, "grad_norm": 1.1123442649841309, "learning_rate": 3.077733669081073e-06, "loss": 0.0424, "step": 29690 }, { "epoch": 2.6976701939234298, "grad_norm": 1.2813533544540405, "learning_rate": 3.0595053814952145e-06, "loss": 0.0795, "step": 29700 }, { "epoch": 2.6985785003860303, "grad_norm": 1.8606863021850586, "learning_rate": 3.041329530308573e-06, "loss": 0.0693, "step": 29710 }, { "epoch": 2.699486806848631, "grad_norm": 1.9069486856460571, "learning_rate": 3.0232061358250576e-06, "loss": 0.1482, "step": 29720 }, { "epoch": 2.700395113311231, "grad_norm": 0.8299840688705444, "learning_rate": 3.005135218290006e-06, "loss": 0.0817, "step": 29730 }, { "epoch": 2.701303419773832, "grad_norm": 0.29457345604896545, "learning_rate": 2.9871167978901215e-06, "loss": 0.1094, "step": 29740 }, { "epoch": 2.702211726236432, "grad_norm": 0.9012564420700073, "learning_rate": 2.9691508947534585e-06, "loss": 0.0914, "step": 29750 }, { "epoch": 2.7031200326990326, "grad_norm": 0.1397467851638794, "learning_rate": 2.951237528949419e-06, "loss": 0.0781, "step": 29760 }, { "epoch": 2.704028339161633, "grad_norm": 0.9729006290435791, "learning_rate": 2.9333767204886897e-06, "loss": 0.083, "step": 29770 }, { "epoch": 2.7049366456242336, "grad_norm": 0.01455288752913475, "learning_rate": 2.915568489323278e-06, "loss": 0.0975, "step": 29780 }, { "epoch": 2.705844952086834, "grad_norm": 6.274304389953613, "learning_rate": 2.897812855346449e-06, "loss": 0.1163, "step": 29790 }, { "epoch": 2.7067532585494347, "grad_norm": 0.984542191028595, "learning_rate": 2.8801098383927028e-06, "loss": 0.1445, "step": 29800 }, { "epoch": 2.7076615650120353, "grad_norm": 0.07083532959222794, "learning_rate": 2.862459458237765e-06, "loss": 0.0507, "step": 29810 }, { "epoch": 2.7085698714746353, "grad_norm": 0.016134286299347878, "learning_rate": 2.8448617345985583e-06, "loss": 0.0992, "step": 29820 }, { "epoch": 2.709478177937236, "grad_norm": 1.3869261741638184, "learning_rate": 2.827316687133208e-06, "loss": 0.0771, "step": 29830 }, { "epoch": 2.7103864843998364, "grad_norm": 0.5554212331771851, "learning_rate": 2.8098243354409692e-06, "loss": 0.1168, "step": 29840 }, { "epoch": 2.711294790862437, "grad_norm": 1.3337239027023315, "learning_rate": 2.792384699062234e-06, "loss": 0.1023, "step": 29850 }, { "epoch": 2.7122030973250375, "grad_norm": 2.4193384647369385, "learning_rate": 2.7749977974785178e-06, "loss": 0.0862, "step": 29860 }, { "epoch": 2.713111403787638, "grad_norm": 2.6488823890686035, "learning_rate": 2.7576636501124186e-06, "loss": 0.0807, "step": 29870 }, { "epoch": 2.7140197102502386, "grad_norm": 3.0374271869659424, "learning_rate": 2.7403822763276134e-06, "loss": 0.1022, "step": 29880 }, { "epoch": 2.7149280167128387, "grad_norm": 1.4971052408218384, "learning_rate": 2.723153695428815e-06, "loss": 0.1223, "step": 29890 }, { "epoch": 2.7158363231754397, "grad_norm": 0.04178939014673233, "learning_rate": 2.705977926661779e-06, "loss": 0.0883, "step": 29900 }, { "epoch": 2.7167446296380398, "grad_norm": 1.6718358993530273, "learning_rate": 2.6888549892132353e-06, "loss": 0.0901, "step": 29910 }, { "epoch": 2.7176529361006403, "grad_norm": 0.7872090339660645, "learning_rate": 2.6717849022109163e-06, "loss": 0.0839, "step": 29920 }, { "epoch": 2.718561242563241, "grad_norm": 2.133857011795044, "learning_rate": 2.6547676847235305e-06, "loss": 0.1741, "step": 29930 }, { "epoch": 2.7194695490258414, "grad_norm": 3.1657443046569824, "learning_rate": 2.637803355760704e-06, "loss": 0.0994, "step": 29940 }, { "epoch": 2.720377855488442, "grad_norm": 2.685434341430664, "learning_rate": 2.6208919342729898e-06, "loss": 0.1716, "step": 29950 }, { "epoch": 2.721286161951042, "grad_norm": 0.5759431719779968, "learning_rate": 2.6040334391518373e-06, "loss": 0.1387, "step": 29960 }, { "epoch": 2.722194468413643, "grad_norm": 1.9779216051101685, "learning_rate": 2.5872278892295707e-06, "loss": 0.1274, "step": 29970 }, { "epoch": 2.723102774876243, "grad_norm": 1.0363761186599731, "learning_rate": 2.570475303279385e-06, "loss": 0.1475, "step": 29980 }, { "epoch": 2.7240110813388436, "grad_norm": 1.4684959650039673, "learning_rate": 2.5537757000152927e-06, "loss": 0.1144, "step": 29990 }, { "epoch": 2.724919387801444, "grad_norm": 0.18157246708869934, "learning_rate": 2.537129098092128e-06, "loss": 0.0784, "step": 30000 }, { "epoch": 2.724919387801444, "eval_loss": 0.20348073542118073, "eval_runtime": 1110.0194, "eval_samples_per_second": 8.817, "eval_steps_per_second": 8.817, "step": 30000 }, { "epoch": 2.7258276942640447, "grad_norm": 0.8609075546264648, "learning_rate": 2.52053551610551e-06, "loss": 0.075, "step": 30010 }, { "epoch": 2.7267360007266452, "grad_norm": 3.1423516273498535, "learning_rate": 2.503994972591839e-06, "loss": 0.11, "step": 30020 }, { "epoch": 2.7276443071892458, "grad_norm": 1.6816142797470093, "learning_rate": 2.4875074860282733e-06, "loss": 0.1094, "step": 30030 }, { "epoch": 2.7285526136518463, "grad_norm": 1.7875168323516846, "learning_rate": 2.4710730748326917e-06, "loss": 0.1091, "step": 30040 }, { "epoch": 2.7294609201144464, "grad_norm": 4.0865654945373535, "learning_rate": 2.45469175736367e-06, "loss": 0.1384, "step": 30050 }, { "epoch": 2.730369226577047, "grad_norm": 0.38264355063438416, "learning_rate": 2.4383635519205082e-06, "loss": 0.0655, "step": 30060 }, { "epoch": 2.7312775330396475, "grad_norm": 1.2987134456634521, "learning_rate": 2.422088476743134e-06, "loss": 0.1582, "step": 30070 }, { "epoch": 2.732185839502248, "grad_norm": 1.9318394660949707, "learning_rate": 2.4058665500121703e-06, "loss": 0.1169, "step": 30080 }, { "epoch": 2.7330941459648486, "grad_norm": 1.5406855344772339, "learning_rate": 2.389697789848838e-06, "loss": 0.094, "step": 30090 }, { "epoch": 2.734002452427449, "grad_norm": 0.6665658354759216, "learning_rate": 2.373582214314973e-06, "loss": 0.0778, "step": 30100 }, { "epoch": 2.7349107588900496, "grad_norm": 0.9046145677566528, "learning_rate": 2.357519841413003e-06, "loss": 0.0713, "step": 30110 }, { "epoch": 2.7358190653526497, "grad_norm": 1.443549633026123, "learning_rate": 2.3415106890859074e-06, "loss": 0.069, "step": 30120 }, { "epoch": 2.7367273718152507, "grad_norm": 2.2652199268341064, "learning_rate": 2.3255547752172535e-06, "loss": 0.1446, "step": 30130 }, { "epoch": 2.737635678277851, "grad_norm": 2.532176971435547, "learning_rate": 2.309652117631095e-06, "loss": 0.0966, "step": 30140 }, { "epoch": 2.7385439847404514, "grad_norm": 1.0138896703720093, "learning_rate": 2.2938027340920097e-06, "loss": 0.0475, "step": 30150 }, { "epoch": 2.739452291203052, "grad_norm": 1.7285960912704468, "learning_rate": 2.2780066423050693e-06, "loss": 0.0762, "step": 30160 }, { "epoch": 2.7403605976656524, "grad_norm": 1.4132277965545654, "learning_rate": 2.262263859915803e-06, "loss": 0.0838, "step": 30170 }, { "epoch": 2.741268904128253, "grad_norm": 4.052127361297607, "learning_rate": 2.2465744045102155e-06, "loss": 0.0698, "step": 30180 }, { "epoch": 2.7421772105908535, "grad_norm": 1.9558606147766113, "learning_rate": 2.230938293614698e-06, "loss": 0.0617, "step": 30190 }, { "epoch": 2.743085517053454, "grad_norm": 2.1610593795776367, "learning_rate": 2.2153555446960795e-06, "loss": 0.0862, "step": 30200 }, { "epoch": 2.743993823516054, "grad_norm": 0.025102665647864342, "learning_rate": 2.1998261751615845e-06, "loss": 0.1136, "step": 30210 }, { "epoch": 2.7449021299786547, "grad_norm": 0.9080238938331604, "learning_rate": 2.1843502023587813e-06, "loss": 0.0778, "step": 30220 }, { "epoch": 2.745810436441255, "grad_norm": 1.784608006477356, "learning_rate": 2.168927643575619e-06, "loss": 0.0927, "step": 30230 }, { "epoch": 2.7467187429038558, "grad_norm": 0.5622226595878601, "learning_rate": 2.153558516040366e-06, "loss": 0.1243, "step": 30240 }, { "epoch": 2.7476270493664563, "grad_norm": 0.5653716921806335, "learning_rate": 2.1382428369216003e-06, "loss": 0.0434, "step": 30250 }, { "epoch": 2.748535355829057, "grad_norm": 0.7719136476516724, "learning_rate": 2.1229806233282023e-06, "loss": 0.1409, "step": 30260 }, { "epoch": 2.7494436622916574, "grad_norm": 1.4255152940750122, "learning_rate": 2.107771892309307e-06, "loss": 0.0687, "step": 30270 }, { "epoch": 2.7503519687542575, "grad_norm": 2.313025712966919, "learning_rate": 2.092616660854335e-06, "loss": 0.1284, "step": 30280 }, { "epoch": 2.751260275216858, "grad_norm": 3.492917060852051, "learning_rate": 2.0775149458929276e-06, "loss": 0.1133, "step": 30290 }, { "epoch": 2.7521685816794585, "grad_norm": 2.207146167755127, "learning_rate": 2.0624667642949346e-06, "loss": 0.078, "step": 30300 }, { "epoch": 2.753076888142059, "grad_norm": 0.5633704662322998, "learning_rate": 2.047472132870426e-06, "loss": 0.0886, "step": 30310 }, { "epoch": 2.7539851946046596, "grad_norm": 1.8043510913848877, "learning_rate": 2.03253106836962e-06, "loss": 0.0786, "step": 30320 }, { "epoch": 2.75489350106726, "grad_norm": 0.24553436040878296, "learning_rate": 2.017643587482948e-06, "loss": 0.1087, "step": 30330 }, { "epoch": 2.7558018075298607, "grad_norm": 1.0307281017303467, "learning_rate": 2.00280970684093e-06, "loss": 0.1254, "step": 30340 }, { "epoch": 2.756710113992461, "grad_norm": 1.3364232778549194, "learning_rate": 1.9880294430142387e-06, "loss": 0.1008, "step": 30350 }, { "epoch": 2.757618420455062, "grad_norm": 1.9610775709152222, "learning_rate": 1.9733028125136334e-06, "loss": 0.1336, "step": 30360 }, { "epoch": 2.758526726917662, "grad_norm": 2.2546441555023193, "learning_rate": 1.958629831789999e-06, "loss": 0.1152, "step": 30370 }, { "epoch": 2.7594350333802624, "grad_norm": 0.5929815173149109, "learning_rate": 1.944010517234246e-06, "loss": 0.0707, "step": 30380 }, { "epoch": 2.760343339842863, "grad_norm": 2.0541112422943115, "learning_rate": 1.929444885177356e-06, "loss": 0.0626, "step": 30390 }, { "epoch": 2.7612516463054635, "grad_norm": 0.9449825882911682, "learning_rate": 1.914932951890341e-06, "loss": 0.0813, "step": 30400 }, { "epoch": 2.762159952768064, "grad_norm": 0.9868356585502625, "learning_rate": 1.9004747335842232e-06, "loss": 0.1046, "step": 30410 }, { "epoch": 2.7630682592306646, "grad_norm": 5.533127784729004, "learning_rate": 1.8860702464100322e-06, "loss": 0.1457, "step": 30420 }, { "epoch": 2.763976565693265, "grad_norm": 4.116486072540283, "learning_rate": 1.8717195064587577e-06, "loss": 0.089, "step": 30430 }, { "epoch": 2.764884872155865, "grad_norm": 1.8417174816131592, "learning_rate": 1.85742252976136e-06, "loss": 0.1162, "step": 30440 }, { "epoch": 2.7657931786184657, "grad_norm": 1.0501673221588135, "learning_rate": 1.843179332288747e-06, "loss": 0.0389, "step": 30450 }, { "epoch": 2.7667014850810663, "grad_norm": 0.8686567544937134, "learning_rate": 1.82898992995173e-06, "loss": 0.0549, "step": 30460 }, { "epoch": 2.767609791543667, "grad_norm": 0.14410114288330078, "learning_rate": 1.8148543386010575e-06, "loss": 0.0583, "step": 30470 }, { "epoch": 2.7685180980062674, "grad_norm": 0.5643256306648254, "learning_rate": 1.8007725740273319e-06, "loss": 0.1277, "step": 30480 }, { "epoch": 2.769426404468868, "grad_norm": 0.5637433528900146, "learning_rate": 1.7867446519610587e-06, "loss": 0.0732, "step": 30490 }, { "epoch": 2.7703347109314684, "grad_norm": 2.361829996109009, "learning_rate": 1.7727705880725698e-06, "loss": 0.1244, "step": 30500 }, { "epoch": 2.7703347109314684, "eval_loss": 0.20449918508529663, "eval_runtime": 1101.7074, "eval_samples_per_second": 8.883, "eval_steps_per_second": 8.883, "step": 30500 }, { "epoch": 2.7712430173940685, "grad_norm": 0.2538432478904724, "learning_rate": 1.7588503979720505e-06, "loss": 0.1226, "step": 30510 }, { "epoch": 2.7721513238566695, "grad_norm": 0.1074981689453125, "learning_rate": 1.7449840972095011e-06, "loss": 0.1092, "step": 30520 }, { "epoch": 2.7730596303192696, "grad_norm": 0.9750372767448425, "learning_rate": 1.7311717012747254e-06, "loss": 0.0982, "step": 30530 }, { "epoch": 2.77396793678187, "grad_norm": 0.8007983565330505, "learning_rate": 1.717413225597303e-06, "loss": 0.069, "step": 30540 }, { "epoch": 2.7748762432444707, "grad_norm": 1.9904080629348755, "learning_rate": 1.70370868554659e-06, "loss": 0.1279, "step": 30550 }, { "epoch": 2.7757845497070712, "grad_norm": 0.5165224671363831, "learning_rate": 1.6900580964316738e-06, "loss": 0.0746, "step": 30560 }, { "epoch": 2.7766928561696718, "grad_norm": 1.1952545642852783, "learning_rate": 1.6764614735014006e-06, "loss": 0.096, "step": 30570 }, { "epoch": 2.777601162632272, "grad_norm": 1.0591791868209839, "learning_rate": 1.66291883194431e-06, "loss": 0.0693, "step": 30580 }, { "epoch": 2.778509469094873, "grad_norm": 2.400491714477539, "learning_rate": 1.649430186888651e-06, "loss": 0.1029, "step": 30590 }, { "epoch": 2.779417775557473, "grad_norm": 1.3571070432662964, "learning_rate": 1.635995553402353e-06, "loss": 0.081, "step": 30600 }, { "epoch": 2.7803260820200735, "grad_norm": 0.12053988873958588, "learning_rate": 1.6226149464930064e-06, "loss": 0.1053, "step": 30610 }, { "epoch": 2.781234388482674, "grad_norm": 0.3063257932662964, "learning_rate": 1.609288381107854e-06, "loss": 0.0704, "step": 30620 }, { "epoch": 2.7821426949452746, "grad_norm": 1.4262027740478516, "learning_rate": 1.5960158721337603e-06, "loss": 0.0811, "step": 30630 }, { "epoch": 2.783051001407875, "grad_norm": 0.20860958099365234, "learning_rate": 1.5827974343972152e-06, "loss": 0.1378, "step": 30640 }, { "epoch": 2.7839593078704756, "grad_norm": 3.70756196975708, "learning_rate": 1.5696330826643069e-06, "loss": 0.1326, "step": 30650 }, { "epoch": 2.784867614333076, "grad_norm": 2.0462145805358887, "learning_rate": 1.5565228316406887e-06, "loss": 0.1078, "step": 30660 }, { "epoch": 2.7857759207956763, "grad_norm": 1.87882399559021, "learning_rate": 1.543466695971607e-06, "loss": 0.1015, "step": 30670 }, { "epoch": 2.786684227258277, "grad_norm": 0.8759644627571106, "learning_rate": 1.5304646902418286e-06, "loss": 0.1282, "step": 30680 }, { "epoch": 2.7875925337208773, "grad_norm": 1.3993092775344849, "learning_rate": 1.517516828975668e-06, "loss": 0.0858, "step": 30690 }, { "epoch": 2.788500840183478, "grad_norm": 0.4694734215736389, "learning_rate": 1.5046231266369559e-06, "loss": 0.0607, "step": 30700 }, { "epoch": 2.7894091466460784, "grad_norm": 0.7196660041809082, "learning_rate": 1.491783597629015e-06, "loss": 0.1372, "step": 30710 }, { "epoch": 2.790317453108679, "grad_norm": 0.9152572154998779, "learning_rate": 1.4789982562946613e-06, "loss": 0.0913, "step": 30720 }, { "epoch": 2.7912257595712795, "grad_norm": 0.31966933608055115, "learning_rate": 1.4662671169161756e-06, "loss": 0.1129, "step": 30730 }, { "epoch": 2.7921340660338796, "grad_norm": 2.063244104385376, "learning_rate": 1.4535901937152819e-06, "loss": 0.0802, "step": 30740 }, { "epoch": 2.7930423724964806, "grad_norm": 2.3474984169006348, "learning_rate": 1.4409675008531575e-06, "loss": 0.1127, "step": 30750 }, { "epoch": 2.7939506789590807, "grad_norm": 2.515760660171509, "learning_rate": 1.4283990524303792e-06, "loss": 0.0785, "step": 30760 }, { "epoch": 2.794858985421681, "grad_norm": 0.5796874761581421, "learning_rate": 1.415884862486949e-06, "loss": 0.1095, "step": 30770 }, { "epoch": 2.7957672918842817, "grad_norm": 2.4787299633026123, "learning_rate": 1.4034249450022408e-06, "loss": 0.1117, "step": 30780 }, { "epoch": 2.7966755983468823, "grad_norm": 1.271517276763916, "learning_rate": 1.3910193138950146e-06, "loss": 0.1773, "step": 30790 }, { "epoch": 2.797583904809483, "grad_norm": 0.49770912528038025, "learning_rate": 1.378667983023374e-06, "loss": 0.1391, "step": 30800 }, { "epoch": 2.7984922112720834, "grad_norm": 1.8315696716308594, "learning_rate": 1.3663709661847767e-06, "loss": 0.0582, "step": 30810 }, { "epoch": 2.799400517734684, "grad_norm": 0.1443779468536377, "learning_rate": 1.3541282771160125e-06, "loss": 0.1388, "step": 30820 }, { "epoch": 2.800308824197284, "grad_norm": 2.0017361640930176, "learning_rate": 1.3419399294931635e-06, "loss": 0.0655, "step": 30830 }, { "epoch": 2.8012171306598845, "grad_norm": 1.079185128211975, "learning_rate": 1.3298059369316218e-06, "loss": 0.0591, "step": 30840 }, { "epoch": 2.802125437122485, "grad_norm": 3.6804072856903076, "learning_rate": 1.3177263129860563e-06, "loss": 0.1412, "step": 30850 }, { "epoch": 2.8030337435850856, "grad_norm": 0.10497110337018967, "learning_rate": 1.3057010711503947e-06, "loss": 0.1259, "step": 30860 }, { "epoch": 2.803942050047686, "grad_norm": 0.17345204949378967, "learning_rate": 1.2937302248578309e-06, "loss": 0.1057, "step": 30870 }, { "epoch": 2.8048503565102867, "grad_norm": 1.5044065713882446, "learning_rate": 1.28181378748079e-06, "loss": 0.0768, "step": 30880 }, { "epoch": 2.8057586629728872, "grad_norm": 0.26722395420074463, "learning_rate": 1.2699517723309073e-06, "loss": 0.0397, "step": 30890 }, { "epoch": 2.8066669694354873, "grad_norm": 2.8113045692443848, "learning_rate": 1.2581441926590332e-06, "loss": 0.1005, "step": 30900 }, { "epoch": 2.807575275898088, "grad_norm": 1.6045891046524048, "learning_rate": 1.2463910616552055e-06, "loss": 0.0764, "step": 30910 }, { "epoch": 2.8084835823606884, "grad_norm": 2.440965414047241, "learning_rate": 1.2346923924486386e-06, "loss": 0.1281, "step": 30920 }, { "epoch": 2.809391888823289, "grad_norm": 1.5399550199508667, "learning_rate": 1.2230481981077124e-06, "loss": 0.0974, "step": 30930 }, { "epoch": 2.8103001952858895, "grad_norm": 0.6333988308906555, "learning_rate": 1.2114584916399495e-06, "loss": 0.1277, "step": 30940 }, { "epoch": 2.81120850174849, "grad_norm": 0.0006258583161979914, "learning_rate": 1.1999232859920052e-06, "loss": 0.1234, "step": 30950 }, { "epoch": 2.8121168082110906, "grad_norm": 4.226573467254639, "learning_rate": 1.1884425940496546e-06, "loss": 0.1423, "step": 30960 }, { "epoch": 2.8130251146736907, "grad_norm": 1.4303691387176514, "learning_rate": 1.1770164286377838e-06, "loss": 0.0956, "step": 30970 }, { "epoch": 2.8139334211362916, "grad_norm": 0.25808560848236084, "learning_rate": 1.1656448025203548e-06, "loss": 0.1315, "step": 30980 }, { "epoch": 2.8148417275988917, "grad_norm": 0.33973413705825806, "learning_rate": 1.1543277284004116e-06, "loss": 0.0867, "step": 30990 }, { "epoch": 2.8157500340614923, "grad_norm": 0.04776841029524803, "learning_rate": 1.1430652189200586e-06, "loss": 0.1106, "step": 31000 }, { "epoch": 2.8157500340614923, "eval_loss": 0.20416565239429474, "eval_runtime": 1130.4632, "eval_samples_per_second": 8.658, "eval_steps_per_second": 8.658, "step": 31000 }, { "epoch": 2.816658340524093, "grad_norm": 2.25657057762146, "learning_rate": 1.131857286660437e-06, "loss": 0.0996, "step": 31010 }, { "epoch": 2.8175666469866933, "grad_norm": 0.009514719247817993, "learning_rate": 1.1207039441417434e-06, "loss": 0.0652, "step": 31020 }, { "epoch": 2.818474953449294, "grad_norm": 2.8367223739624023, "learning_rate": 1.109605203823183e-06, "loss": 0.1018, "step": 31030 }, { "epoch": 2.8193832599118944, "grad_norm": 2.0242984294891357, "learning_rate": 1.0985610781029499e-06, "loss": 0.0411, "step": 31040 }, { "epoch": 2.820291566374495, "grad_norm": 0.8381386995315552, "learning_rate": 1.0875715793182528e-06, "loss": 0.073, "step": 31050 }, { "epoch": 2.821199872837095, "grad_norm": 1.9689520597457886, "learning_rate": 1.0766367197452555e-06, "loss": 0.0872, "step": 31060 }, { "epoch": 2.8221081792996956, "grad_norm": 1.7594802379608154, "learning_rate": 1.0657565115991087e-06, "loss": 0.1035, "step": 31070 }, { "epoch": 2.823016485762296, "grad_norm": 0.4777784049510956, "learning_rate": 1.0549309670338958e-06, "loss": 0.1129, "step": 31080 }, { "epoch": 2.8239247922248967, "grad_norm": 1.9736729860305786, "learning_rate": 1.0441600981426435e-06, "loss": 0.1474, "step": 31090 }, { "epoch": 2.824833098687497, "grad_norm": 2.3072125911712646, "learning_rate": 1.0334439169572997e-06, "loss": 0.0779, "step": 31100 }, { "epoch": 2.8257414051500978, "grad_norm": 0.7711701989173889, "learning_rate": 1.0227824354487158e-06, "loss": 0.0986, "step": 31110 }, { "epoch": 2.8266497116126983, "grad_norm": 2.017849922180176, "learning_rate": 1.0121756655266545e-06, "loss": 0.0834, "step": 31120 }, { "epoch": 2.8275580180752984, "grad_norm": 2.115281105041504, "learning_rate": 1.0016236190397433e-06, "loss": 0.0809, "step": 31130 }, { "epoch": 2.828466324537899, "grad_norm": 0.2116859406232834, "learning_rate": 9.911263077754919e-07, "loss": 0.0726, "step": 31140 }, { "epoch": 2.8293746310004995, "grad_norm": 1.0318259000778198, "learning_rate": 9.806837434602588e-07, "loss": 0.1315, "step": 31150 }, { "epoch": 2.8302829374631, "grad_norm": 1.0194873809814453, "learning_rate": 9.70295937759247e-07, "loss": 0.084, "step": 31160 }, { "epoch": 2.8311912439257005, "grad_norm": 0.8613615036010742, "learning_rate": 9.59962902276501e-07, "loss": 0.098, "step": 31170 }, { "epoch": 2.832099550388301, "grad_norm": 0.9376434683799744, "learning_rate": 9.496846485548661e-07, "loss": 0.062, "step": 31180 }, { "epoch": 2.8330078568509016, "grad_norm": 0.16518542170524597, "learning_rate": 9.394611880759962e-07, "loss": 0.0569, "step": 31190 }, { "epoch": 2.8339161633135017, "grad_norm": 1.160170078277588, "learning_rate": 9.292925322603397e-07, "loss": 0.0643, "step": 31200 }, { "epoch": 2.8348244697761027, "grad_norm": 3.7351269721984863, "learning_rate": 9.191786924671264e-07, "loss": 0.084, "step": 31210 }, { "epoch": 2.835732776238703, "grad_norm": 1.593578577041626, "learning_rate": 9.091196799943469e-07, "loss": 0.0882, "step": 31220 }, { "epoch": 2.8366410827013033, "grad_norm": 0.5781943798065186, "learning_rate": 8.991155060787404e-07, "loss": 0.0787, "step": 31230 }, { "epoch": 2.837549389163904, "grad_norm": 0.514787495136261, "learning_rate": 8.891661818958064e-07, "loss": 0.1507, "step": 31240 }, { "epoch": 2.8384576956265044, "grad_norm": 3.7931227684020996, "learning_rate": 8.792717185597432e-07, "loss": 0.1623, "step": 31250 }, { "epoch": 2.839366002089105, "grad_norm": 0.9614262580871582, "learning_rate": 8.694321271234929e-07, "loss": 0.0831, "step": 31260 }, { "epoch": 2.8402743085517055, "grad_norm": 0.4819766581058502, "learning_rate": 8.596474185786907e-07, "loss": 0.1355, "step": 31270 }, { "epoch": 2.841182615014306, "grad_norm": 4.551284313201904, "learning_rate": 8.499176038556545e-07, "loss": 0.0973, "step": 31280 }, { "epoch": 2.842090921476906, "grad_norm": 0.5566439628601074, "learning_rate": 8.402426938233954e-07, "loss": 0.0876, "step": 31290 }, { "epoch": 2.8429992279395067, "grad_norm": 0.5351948142051697, "learning_rate": 8.306226992895794e-07, "loss": 0.1133, "step": 31300 }, { "epoch": 2.843907534402107, "grad_norm": 1.1115832328796387, "learning_rate": 8.210576310005325e-07, "loss": 0.1293, "step": 31310 }, { "epoch": 2.8448158408647077, "grad_norm": 1.5235204696655273, "learning_rate": 8.115474996412353e-07, "loss": 0.0924, "step": 31320 }, { "epoch": 2.8457241473273083, "grad_norm": 3.645326852798462, "learning_rate": 8.020923158352734e-07, "loss": 0.1559, "step": 31330 }, { "epoch": 2.846632453789909, "grad_norm": 1.7104331254959106, "learning_rate": 7.926920901448753e-07, "loss": 0.1055, "step": 31340 }, { "epoch": 2.8475407602525094, "grad_norm": 2.0236005783081055, "learning_rate": 7.833468330708638e-07, "loss": 0.0772, "step": 31350 }, { "epoch": 2.8484490667151094, "grad_norm": 1.7526692152023315, "learning_rate": 7.740565550526601e-07, "loss": 0.1025, "step": 31360 }, { "epoch": 2.8493573731777104, "grad_norm": 1.8240044116973877, "learning_rate": 7.648212664682741e-07, "loss": 0.0704, "step": 31370 }, { "epoch": 2.8502656796403105, "grad_norm": 2.5941638946533203, "learning_rate": 7.556409776342754e-07, "loss": 0.1078, "step": 31380 }, { "epoch": 2.851173986102911, "grad_norm": 5.90081787109375, "learning_rate": 7.465156988058109e-07, "loss": 0.1619, "step": 31390 }, { "epoch": 2.8520822925655116, "grad_norm": 1.5260767936706543, "learning_rate": 7.374454401765707e-07, "loss": 0.0816, "step": 31400 }, { "epoch": 2.852990599028112, "grad_norm": 0.3467237055301666, "learning_rate": 7.284302118787667e-07, "loss": 0.1033, "step": 31410 }, { "epoch": 2.8538989054907127, "grad_norm": 0.33683422207832336, "learning_rate": 7.194700239831709e-07, "loss": 0.1353, "step": 31420 }, { "epoch": 2.8548072119533128, "grad_norm": 0.002373561728745699, "learning_rate": 7.105648864990377e-07, "loss": 0.1007, "step": 31430 }, { "epoch": 2.8557155184159138, "grad_norm": 1.6099987030029297, "learning_rate": 7.017148093741433e-07, "loss": 0.0623, "step": 31440 }, { "epoch": 2.856623824878514, "grad_norm": 1.8806498050689697, "learning_rate": 6.929198024947569e-07, "loss": 0.1556, "step": 31450 }, { "epoch": 2.8575321313411144, "grad_norm": 1.8772755861282349, "learning_rate": 6.841798756856144e-07, "loss": 0.0949, "step": 31460 }, { "epoch": 2.858440437803715, "grad_norm": 1.9597253799438477, "learning_rate": 6.754950387099446e-07, "loss": 0.1283, "step": 31470 }, { "epoch": 2.8593487442663155, "grad_norm": 3.1700048446655273, "learning_rate": 6.66865301269426e-07, "loss": 0.0748, "step": 31480 }, { "epoch": 2.860257050728916, "grad_norm": 2.2321362495422363, "learning_rate": 6.582906730041749e-07, "loss": 0.0799, "step": 31490 }, { "epoch": 2.8611653571915165, "grad_norm": 1.141028881072998, "learning_rate": 6.497711634927739e-07, "loss": 0.0845, "step": 31500 }, { "epoch": 2.8611653571915165, "eval_loss": 0.20419014990329742, "eval_runtime": 1118.0051, "eval_samples_per_second": 8.754, "eval_steps_per_second": 8.754, "step": 31500 }, { "epoch": 2.862073663654117, "grad_norm": 0.5663442611694336, "learning_rate": 6.413067822522045e-07, "loss": 0.0553, "step": 31510 }, { "epoch": 2.862981970116717, "grad_norm": 0.8703701496124268, "learning_rate": 6.328975387378811e-07, "loss": 0.1139, "step": 31520 }, { "epoch": 2.8638902765793177, "grad_norm": 1.359121322631836, "learning_rate": 6.245434423436225e-07, "loss": 0.0575, "step": 31530 }, { "epoch": 2.8647985830419183, "grad_norm": 1.1363805532455444, "learning_rate": 6.162445024016416e-07, "loss": 0.0705, "step": 31540 }, { "epoch": 2.865706889504519, "grad_norm": 0.8428524136543274, "learning_rate": 6.080007281825339e-07, "loss": 0.1286, "step": 31550 }, { "epoch": 2.8666151959671193, "grad_norm": 0.3168107867240906, "learning_rate": 5.998121288952829e-07, "loss": 0.0475, "step": 31560 }, { "epoch": 2.86752350242972, "grad_norm": 0.2509475350379944, "learning_rate": 5.916787136872215e-07, "loss": 0.0791, "step": 31570 }, { "epoch": 2.8684318088923204, "grad_norm": 0.29175010323524475, "learning_rate": 5.836004916440485e-07, "loss": 0.0871, "step": 31580 }, { "epoch": 2.8693401153549205, "grad_norm": 1.438096523284912, "learning_rate": 5.75577471789801e-07, "loss": 0.09, "step": 31590 }, { "epoch": 2.8702484218175215, "grad_norm": 1.1897201538085938, "learning_rate": 5.676096630868544e-07, "loss": 0.0708, "step": 31600 }, { "epoch": 2.8711567282801216, "grad_norm": 1.191619634628296, "learning_rate": 5.596970744359054e-07, "loss": 0.1038, "step": 31610 }, { "epoch": 2.872065034742722, "grad_norm": 1.8757117986679077, "learning_rate": 5.518397146759724e-07, "loss": 0.1033, "step": 31620 }, { "epoch": 2.8729733412053227, "grad_norm": 2.5794360637664795, "learning_rate": 5.440375925843732e-07, "loss": 0.0917, "step": 31630 }, { "epoch": 2.873881647667923, "grad_norm": 0.6564699411392212, "learning_rate": 5.36290716876714e-07, "loss": 0.0773, "step": 31640 }, { "epoch": 2.8747899541305237, "grad_norm": 1.90837824344635, "learning_rate": 5.285990962068943e-07, "loss": 0.0693, "step": 31650 }, { "epoch": 2.8756982605931243, "grad_norm": 0.36312225461006165, "learning_rate": 5.209627391670968e-07, "loss": 0.1036, "step": 31660 }, { "epoch": 2.876606567055725, "grad_norm": 1.6918108463287354, "learning_rate": 5.133816542877479e-07, "loss": 0.0877, "step": 31670 }, { "epoch": 2.877514873518325, "grad_norm": 1.5968433618545532, "learning_rate": 5.058558500375565e-07, "loss": 0.0793, "step": 31680 }, { "epoch": 2.8784231799809255, "grad_norm": 2.656822681427002, "learning_rate": 4.983853348234591e-07, "loss": 0.1273, "step": 31690 }, { "epoch": 2.879331486443526, "grad_norm": 1.9260691404342651, "learning_rate": 4.909701169906356e-07, "loss": 0.1383, "step": 31700 }, { "epoch": 2.8802397929061265, "grad_norm": 1.4756124019622803, "learning_rate": 4.836102048224988e-07, "loss": 0.1717, "step": 31710 }, { "epoch": 2.881148099368727, "grad_norm": 3.394392967224121, "learning_rate": 4.7630560654067236e-07, "loss": 0.084, "step": 31720 }, { "epoch": 2.8820564058313276, "grad_norm": 1.1866072416305542, "learning_rate": 4.6905633030500663e-07, "loss": 0.0648, "step": 31730 }, { "epoch": 2.882964712293928, "grad_norm": 0.24523867666721344, "learning_rate": 4.618623842135239e-07, "loss": 0.1561, "step": 31740 }, { "epoch": 2.8838730187565282, "grad_norm": 1.271323800086975, "learning_rate": 4.547237763024625e-07, "loss": 0.0522, "step": 31750 }, { "epoch": 2.884781325219129, "grad_norm": 0.4248110353946686, "learning_rate": 4.4764051454623256e-07, "loss": 0.0828, "step": 31760 }, { "epoch": 2.8856896316817293, "grad_norm": 4.28304386138916, "learning_rate": 4.406126068574268e-07, "loss": 0.0958, "step": 31770 }, { "epoch": 2.88659793814433, "grad_norm": 2.1593997478485107, "learning_rate": 4.33640061086793e-07, "loss": 0.0733, "step": 31780 }, { "epoch": 2.8875062446069304, "grad_norm": 1.7059824466705322, "learning_rate": 4.2672288502323965e-07, "loss": 0.1074, "step": 31790 }, { "epoch": 2.888414551069531, "grad_norm": 2.231419324874878, "learning_rate": 4.198610863938246e-07, "loss": 0.1033, "step": 31800 }, { "epoch": 2.8893228575321315, "grad_norm": 2.3111822605133057, "learning_rate": 4.1305467286373857e-07, "loss": 0.1081, "step": 31810 }, { "epoch": 2.8902311639947316, "grad_norm": 1.1901158094406128, "learning_rate": 4.0630365203630504e-07, "loss": 0.0993, "step": 31820 }, { "epoch": 2.8911394704573325, "grad_norm": 2.609874963760376, "learning_rate": 3.9960803145297487e-07, "loss": 0.078, "step": 31830 }, { "epoch": 2.8920477769199326, "grad_norm": 1.4698419570922852, "learning_rate": 3.929678185933039e-07, "loss": 0.1794, "step": 31840 }, { "epoch": 2.892956083382533, "grad_norm": 0.0011369034182280302, "learning_rate": 3.863830208749586e-07, "loss": 0.0586, "step": 31850 }, { "epoch": 2.8938643898451337, "grad_norm": 0.827709436416626, "learning_rate": 3.798536456536994e-07, "loss": 0.1299, "step": 31860 }, { "epoch": 2.8947726963077343, "grad_norm": 1.9895398616790771, "learning_rate": 3.7337970022338074e-07, "loss": 0.0822, "step": 31870 }, { "epoch": 2.895681002770335, "grad_norm": 2.1408133506774902, "learning_rate": 3.669611918159288e-07, "loss": 0.0556, "step": 31880 }, { "epoch": 2.8965893092329353, "grad_norm": 2.0974488258361816, "learning_rate": 3.6059812760134704e-07, "loss": 0.1317, "step": 31890 }, { "epoch": 2.897497615695536, "grad_norm": 0.6474989056587219, "learning_rate": 3.542905146876996e-07, "loss": 0.0954, "step": 31900 }, { "epoch": 2.898405922158136, "grad_norm": 0.2275838702917099, "learning_rate": 3.480383601211112e-07, "loss": 0.08, "step": 31910 }, { "epoch": 2.8993142286207365, "grad_norm": 2.4491374492645264, "learning_rate": 3.418416708857619e-07, "loss": 0.098, "step": 31920 }, { "epoch": 2.900222535083337, "grad_norm": 5.024981498718262, "learning_rate": 3.3570045390384776e-07, "loss": 0.1134, "step": 31930 }, { "epoch": 2.9011308415459376, "grad_norm": 1.2483018636703491, "learning_rate": 3.296147160356311e-07, "loss": 0.0834, "step": 31940 }, { "epoch": 2.902039148008538, "grad_norm": 0.6549615263938904, "learning_rate": 3.235844640793684e-07, "loss": 0.0434, "step": 31950 }, { "epoch": 2.9029474544711387, "grad_norm": 0.2605512738227844, "learning_rate": 3.1760970477135996e-07, "loss": 0.0573, "step": 31960 }, { "epoch": 2.903855760933739, "grad_norm": 0.41946208477020264, "learning_rate": 3.1169044478589483e-07, "loss": 0.0627, "step": 31970 }, { "epoch": 2.9047640673963393, "grad_norm": 1.9639627933502197, "learning_rate": 3.058266907352725e-07, "loss": 0.0831, "step": 31980 }, { "epoch": 2.9056723738589403, "grad_norm": 0.12303037196397781, "learning_rate": 3.000184491697977e-07, "loss": 0.0879, "step": 31990 }, { "epoch": 2.9065806803215404, "grad_norm": 3.1818344593048096, "learning_rate": 2.9426572657775266e-07, "loss": 0.1129, "step": 32000 }, { "epoch": 2.9065806803215404, "eval_loss": 0.2040770947933197, "eval_runtime": 1112.7991, "eval_samples_per_second": 8.795, "eval_steps_per_second": 8.795, "step": 32000 }, { "epoch": 2.907488986784141, "grad_norm": 0.015612751245498657, "learning_rate": 2.885685293853968e-07, "loss": 0.107, "step": 32010 }, { "epoch": 2.9083972932467415, "grad_norm": 4.805224418640137, "learning_rate": 2.829268639569782e-07, "loss": 0.1159, "step": 32020 }, { "epoch": 2.909305599709342, "grad_norm": 4.025540351867676, "learning_rate": 2.7734073659468896e-07, "loss": 0.0833, "step": 32030 }, { "epoch": 2.9102139061719425, "grad_norm": 2.178417205810547, "learning_rate": 2.718101535387041e-07, "loss": 0.1497, "step": 32040 }, { "epoch": 2.9111222126345426, "grad_norm": 2.536613941192627, "learning_rate": 2.663351209671372e-07, "loss": 0.103, "step": 32050 }, { "epoch": 2.9120305190971436, "grad_norm": 0.31859758496284485, "learning_rate": 2.6091564499605146e-07, "loss": 0.0816, "step": 32060 }, { "epoch": 2.9129388255597437, "grad_norm": 1.4173710346221924, "learning_rate": 2.5555173167944866e-07, "loss": 0.0901, "step": 32070 }, { "epoch": 2.9138471320223442, "grad_norm": 3.2968523502349854, "learning_rate": 2.5024338700925796e-07, "loss": 0.1121, "step": 32080 }, { "epoch": 2.914755438484945, "grad_norm": 2.617961883544922, "learning_rate": 2.4499061691534153e-07, "loss": 0.1389, "step": 32090 }, { "epoch": 2.9156637449475453, "grad_norm": 0.49702343344688416, "learning_rate": 2.3979342726547224e-07, "loss": 0.1478, "step": 32100 }, { "epoch": 2.916572051410146, "grad_norm": 1.4125447273254395, "learning_rate": 2.3465182386533947e-07, "loss": 0.0761, "step": 32110 }, { "epoch": 2.9174803578727464, "grad_norm": 0.3417629599571228, "learning_rate": 2.2956581245854315e-07, "loss": 0.0944, "step": 32120 }, { "epoch": 2.918388664335347, "grad_norm": 1.4259366989135742, "learning_rate": 2.2453539872657748e-07, "loss": 0.0423, "step": 32130 }, { "epoch": 2.919296970797947, "grad_norm": 1.6359031200408936, "learning_rate": 2.1956058828881966e-07, "loss": 0.1306, "step": 32140 }, { "epoch": 2.9202052772605476, "grad_norm": 0.8521758913993835, "learning_rate": 2.146413867025465e-07, "loss": 0.0797, "step": 32150 }, { "epoch": 2.921113583723148, "grad_norm": 2.546018600463867, "learning_rate": 2.0977779946291798e-07, "loss": 0.0826, "step": 32160 }, { "epoch": 2.9220218901857486, "grad_norm": 1.3692561388015747, "learning_rate": 2.0496983200295473e-07, "loss": 0.0682, "step": 32170 }, { "epoch": 2.922930196648349, "grad_norm": 2.8861382007598877, "learning_rate": 2.0021748969355492e-07, "loss": 0.0618, "step": 32180 }, { "epoch": 2.9238385031109497, "grad_norm": 0.8075135946273804, "learning_rate": 1.95520777843472e-07, "loss": 0.0832, "step": 32190 }, { "epoch": 2.9247468095735503, "grad_norm": 0.1811692863702774, "learning_rate": 1.9087970169932578e-07, "loss": 0.1209, "step": 32200 }, { "epoch": 2.9256551160361504, "grad_norm": 2.73325514793396, "learning_rate": 1.8629426644558578e-07, "loss": 0.0693, "step": 32210 }, { "epoch": 2.9265634224987513, "grad_norm": 2.129448652267456, "learning_rate": 1.8176447720454347e-07, "loss": 0.1655, "step": 32220 }, { "epoch": 2.9274717289613514, "grad_norm": 0.5807188749313354, "learning_rate": 1.7729033903636228e-07, "loss": 0.0821, "step": 32230 }, { "epoch": 2.928380035423952, "grad_norm": 1.1434310674667358, "learning_rate": 1.7287185693902196e-07, "loss": 0.09, "step": 32240 }, { "epoch": 2.9292883418865525, "grad_norm": 1.8073419332504272, "learning_rate": 1.6850903584831878e-07, "loss": 0.1283, "step": 32250 }, { "epoch": 2.930196648349153, "grad_norm": 2.501613140106201, "learning_rate": 1.6420188063789865e-07, "loss": 0.0709, "step": 32260 }, { "epoch": 2.9311049548117536, "grad_norm": 1.1138010025024414, "learning_rate": 1.5995039611919615e-07, "loss": 0.0759, "step": 32270 }, { "epoch": 2.932013261274354, "grad_norm": 0.0020240331068634987, "learning_rate": 1.5575458704147893e-07, "loss": 0.0679, "step": 32280 }, { "epoch": 2.9329215677369547, "grad_norm": 1.6587823629379272, "learning_rate": 1.516144580918033e-07, "loss": 0.0842, "step": 32290 }, { "epoch": 2.9338298741995548, "grad_norm": 0.6247867345809937, "learning_rate": 1.4753001389504195e-07, "loss": 0.0933, "step": 32300 }, { "epoch": 2.9347381806621553, "grad_norm": 4.964267730712891, "learning_rate": 1.435012590138507e-07, "loss": 0.1499, "step": 32310 }, { "epoch": 2.935646487124756, "grad_norm": 0.00412380788475275, "learning_rate": 1.3952819794867955e-07, "loss": 0.114, "step": 32320 }, { "epoch": 2.9365547935873564, "grad_norm": 1.3245699405670166, "learning_rate": 1.3561083513777273e-07, "loss": 0.0981, "step": 32330 }, { "epoch": 2.937463100049957, "grad_norm": 2.4375483989715576, "learning_rate": 1.317491749571409e-07, "loss": 0.129, "step": 32340 }, { "epoch": 2.9383714065125575, "grad_norm": 1.1928890943527222, "learning_rate": 1.2794322172057228e-07, "loss": 0.101, "step": 32350 }, { "epoch": 2.939279712975158, "grad_norm": 1.2640317678451538, "learning_rate": 1.2419297967963818e-07, "loss": 0.0698, "step": 32360 }, { "epoch": 2.940188019437758, "grad_norm": 2.4425642490386963, "learning_rate": 1.2049845302366525e-07, "loss": 0.0858, "step": 32370 }, { "epoch": 2.9410963259003586, "grad_norm": 1.3205333948135376, "learning_rate": 1.1685964587974663e-07, "loss": 0.1114, "step": 32380 }, { "epoch": 2.942004632362959, "grad_norm": 0.5034480094909668, "learning_rate": 1.1327656231272521e-07, "loss": 0.0512, "step": 32390 }, { "epoch": 2.9429129388255597, "grad_norm": 1.4817718267440796, "learning_rate": 1.0974920632519925e-07, "loss": 0.1327, "step": 32400 }, { "epoch": 2.9438212452881602, "grad_norm": 1.1007354259490967, "learning_rate": 1.0627758185752235e-07, "loss": 0.0787, "step": 32410 }, { "epoch": 2.944729551750761, "grad_norm": 0.8271766901016235, "learning_rate": 1.0286169278778124e-07, "loss": 0.0942, "step": 32420 }, { "epoch": 2.9456378582133613, "grad_norm": 0.06407704204320908, "learning_rate": 9.950154293180691e-08, "loss": 0.0954, "step": 32430 }, { "epoch": 2.9465461646759614, "grad_norm": 0.718769907951355, "learning_rate": 9.619713604315794e-08, "loss": 0.0779, "step": 32440 }, { "epoch": 2.9474544711385624, "grad_norm": 0.5276159644126892, "learning_rate": 9.294847581313715e-08, "loss": 0.1392, "step": 32450 }, { "epoch": 2.9483627776011625, "grad_norm": 2.1346428394317627, "learning_rate": 8.975556587076383e-08, "loss": 0.0682, "step": 32460 }, { "epoch": 2.949271084063763, "grad_norm": 1.7059907913208008, "learning_rate": 8.66184097827738e-08, "loss": 0.1509, "step": 32470 }, { "epoch": 2.9501793905263636, "grad_norm": 0.8955178260803223, "learning_rate": 8.353701105364154e-08, "loss": 0.1072, "step": 32480 }, { "epoch": 2.951087696988964, "grad_norm": 3.704460382461548, "learning_rate": 8.051137312553026e-08, "loss": 0.1393, "step": 32490 }, { "epoch": 2.9519960034515647, "grad_norm": 1.6427141427993774, "learning_rate": 7.754149937833632e-08, "loss": 0.1064, "step": 32500 }, { "epoch": 2.9519960034515647, "eval_loss": 0.20409540832042694, "eval_runtime": 1132.1667, "eval_samples_per_second": 8.644, "eval_steps_per_second": 8.644, "step": 32500 }, { "epoch": 2.952904309914165, "grad_norm": 1.8221585750579834, "learning_rate": 7.462739312965039e-08, "loss": 0.1263, "step": 32510 }, { "epoch": 2.9538126163767657, "grad_norm": 0.19530564546585083, "learning_rate": 7.176905763476294e-08, "loss": 0.0789, "step": 32520 }, { "epoch": 2.954720922839366, "grad_norm": 0.971616268157959, "learning_rate": 6.896649608668093e-08, "loss": 0.0893, "step": 32530 }, { "epoch": 2.9556292293019664, "grad_norm": 1.6435067653656006, "learning_rate": 6.621971161608898e-08, "loss": 0.0719, "step": 32540 }, { "epoch": 2.956537535764567, "grad_norm": 1.1344499588012695, "learning_rate": 6.352870729137705e-08, "loss": 0.0835, "step": 32550 }, { "epoch": 2.9574458422271674, "grad_norm": 1.604345440864563, "learning_rate": 6.089348611861834e-08, "loss": 0.0541, "step": 32560 }, { "epoch": 2.958354148689768, "grad_norm": 1.6731191873550415, "learning_rate": 5.8314051041563624e-08, "loss": 0.1063, "step": 32570 }, { "epoch": 2.9592624551523685, "grad_norm": 2.872709274291992, "learning_rate": 5.579040494166354e-08, "loss": 0.1034, "step": 32580 }, { "epoch": 2.960170761614969, "grad_norm": 1.564459204673767, "learning_rate": 5.3322550638029714e-08, "loss": 0.059, "step": 32590 }, { "epoch": 2.961079068077569, "grad_norm": 0.007099229376763105, "learning_rate": 5.091049088746802e-08, "loss": 0.0804, "step": 32600 }, { "epoch": 2.9619873745401697, "grad_norm": 1.9615105390548706, "learning_rate": 4.8554228384445344e-08, "loss": 0.0733, "step": 32610 }, { "epoch": 2.9628956810027702, "grad_norm": 2.1215689182281494, "learning_rate": 4.625376576109508e-08, "loss": 0.0583, "step": 32620 }, { "epoch": 2.9638039874653708, "grad_norm": 1.5527448654174805, "learning_rate": 4.400910558723381e-08, "loss": 0.0658, "step": 32630 }, { "epoch": 2.9647122939279713, "grad_norm": 0.7116771936416626, "learning_rate": 4.182025037032245e-08, "loss": 0.1414, "step": 32640 }, { "epoch": 2.965620600390572, "grad_norm": 0.9874725937843323, "learning_rate": 3.968720255549396e-08, "loss": 0.1107, "step": 32650 }, { "epoch": 2.9665289068531724, "grad_norm": 0.4019642770290375, "learning_rate": 3.760996452554233e-08, "loss": 0.0506, "step": 32660 }, { "epoch": 2.9674372133157725, "grad_norm": 1.2600650787353516, "learning_rate": 3.558853860091693e-08, "loss": 0.0675, "step": 32670 }, { "epoch": 2.9683455197783735, "grad_norm": 1.322258472442627, "learning_rate": 3.362292703970593e-08, "loss": 0.103, "step": 32680 }, { "epoch": 2.9692538262409736, "grad_norm": 1.9879436492919922, "learning_rate": 3.171313203766957e-08, "loss": 0.0892, "step": 32690 }, { "epoch": 2.970162132703574, "grad_norm": 0.8417840003967285, "learning_rate": 2.98591557282013e-08, "loss": 0.0792, "step": 32700 }, { "epoch": 2.9710704391661746, "grad_norm": 3.637547254562378, "learning_rate": 2.8061000182344477e-08, "loss": 0.1817, "step": 32710 }, { "epoch": 2.971978745628775, "grad_norm": 0.2219175398349762, "learning_rate": 2.6318667408792297e-08, "loss": 0.1279, "step": 32720 }, { "epoch": 2.9728870520913757, "grad_norm": 1.159891963005066, "learning_rate": 2.4632159353865645e-08, "loss": 0.0803, "step": 32730 }, { "epoch": 2.9737953585539763, "grad_norm": 1.1402428150177002, "learning_rate": 2.3001477901540836e-08, "loss": 0.0814, "step": 32740 }, { "epoch": 2.974703665016577, "grad_norm": 3.1366357803344727, "learning_rate": 2.142662487341629e-08, "loss": 0.1004, "step": 32750 }, { "epoch": 2.975611971479177, "grad_norm": 0.18767398595809937, "learning_rate": 1.9907602028740312e-08, "loss": 0.0961, "step": 32760 }, { "epoch": 2.9765202779417774, "grad_norm": 1.011823058128357, "learning_rate": 1.8444411064383327e-08, "loss": 0.144, "step": 32770 }, { "epoch": 2.977428584404378, "grad_norm": 0.8356990814208984, "learning_rate": 1.7037053614848973e-08, "loss": 0.0627, "step": 32780 }, { "epoch": 2.9783368908669785, "grad_norm": 2.2634830474853516, "learning_rate": 1.5685531252268572e-08, "loss": 0.1648, "step": 32790 }, { "epoch": 2.979245197329579, "grad_norm": 3.3767435550689697, "learning_rate": 1.43898454864122e-08, "loss": 0.1231, "step": 32800 }, { "epoch": 2.9801535037921796, "grad_norm": 1.4489741325378418, "learning_rate": 1.3149997764660971e-08, "loss": 0.0844, "step": 32810 }, { "epoch": 2.98106181025478, "grad_norm": 0.5138169527053833, "learning_rate": 1.1965989472023653e-08, "loss": 0.0623, "step": 32820 }, { "epoch": 2.98197011671738, "grad_norm": 1.2638658285140991, "learning_rate": 1.0837821931147796e-08, "loss": 0.0601, "step": 32830 }, { "epoch": 2.982878423179981, "grad_norm": 3.252366781234741, "learning_rate": 9.765496402275309e-09, "loss": 0.0878, "step": 32840 }, { "epoch": 2.9837867296425813, "grad_norm": 0.32205402851104736, "learning_rate": 8.749014083292428e-09, "loss": 0.0744, "step": 32850 }, { "epoch": 2.984695036105182, "grad_norm": 3.4594597816467285, "learning_rate": 7.788376109690853e-09, "loss": 0.1634, "step": 32860 }, { "epoch": 2.9856033425677824, "grad_norm": 1.5202643871307373, "learning_rate": 6.883583554578854e-09, "loss": 0.0916, "step": 32870 }, { "epoch": 2.986511649030383, "grad_norm": 3.2158656120300293, "learning_rate": 6.034637428692369e-09, "loss": 0.1219, "step": 32880 }, { "epoch": 2.9874199554929834, "grad_norm": 1.6530450582504272, "learning_rate": 5.241538680361702e-09, "loss": 0.1645, "step": 32890 }, { "epoch": 2.9883282619555835, "grad_norm": 2.066208839416504, "learning_rate": 4.504288195555928e-09, "loss": 0.1024, "step": 32900 }, { "epoch": 2.9892365684181845, "grad_norm": 0.3930835425853729, "learning_rate": 3.8228867978329365e-09, "loss": 0.1016, "step": 32910 }, { "epoch": 2.9901448748807846, "grad_norm": 0.3429376780986786, "learning_rate": 3.197335248383837e-09, "loss": 0.0695, "step": 32920 }, { "epoch": 2.991053181343385, "grad_norm": 1.05301034450531, "learning_rate": 2.6276342459941037e-09, "loss": 0.1562, "step": 32930 }, { "epoch": 2.9919614878059857, "grad_norm": 1.074446439743042, "learning_rate": 2.1137844270768812e-09, "loss": 0.1407, "step": 32940 }, { "epoch": 2.9928697942685862, "grad_norm": 1.1826083660125732, "learning_rate": 1.6557863656341265e-09, "loss": 0.0692, "step": 32950 }, { "epoch": 2.9937781007311868, "grad_norm": 1.2543842792510986, "learning_rate": 1.253640573289916e-09, "loss": 0.0956, "step": 32960 }, { "epoch": 2.9946864071937873, "grad_norm": 0.1264507919549942, "learning_rate": 9.073474992793429e-10, "loss": 0.1153, "step": 32970 }, { "epoch": 2.995594713656388, "grad_norm": 1.136021614074707, "learning_rate": 6.16907530431865e-10, "loss": 0.1222, "step": 32980 }, { "epoch": 2.996503020118988, "grad_norm": 3.222353935241699, "learning_rate": 3.8232099119905883e-10, "loss": 0.0937, "step": 32990 }, { "epoch": 2.9974113265815885, "grad_norm": 2.388324499130249, "learning_rate": 2.0358814363241606e-10, "loss": 0.1087, "step": 33000 }, { "epoch": 2.9974113265815885, "eval_loss": 0.20413753390312195, "eval_runtime": 1114.9096, "eval_samples_per_second": 8.778, "eval_steps_per_second": 8.778, "step": 33000 }, { "epoch": 2.998319633044189, "grad_norm": 1.2996996641159058, "learning_rate": 8.070918739444544e-11, "loss": 0.0615, "step": 33010 }, { "epoch": 2.9992279395067896, "grad_norm": 0.7635434865951538, "learning_rate": 1.3684259742019478e-11, "loss": 0.1121, "step": 33020 }, { "epoch": 2.99986375403061, "step": 33027, "total_flos": 3.203574657765851e+18, "train_loss": 0.19387660140121624, "train_runtime": 119078.0598, "train_samples_per_second": 2.219, "train_steps_per_second": 0.277 } ], "logging_steps": 10, "max_steps": 33027, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.203574657765851e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }