{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 8576, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005830223880597014, "grad_norm": 2.550195533598444, "learning_rate": 5.827505827505828e-07, "loss": 0.8852, "step": 5 }, { "epoch": 0.0011660447761194029, "grad_norm": 2.2188855019798273, "learning_rate": 1.1655011655011657e-06, "loss": 0.8311, "step": 10 }, { "epoch": 0.0017490671641791045, "grad_norm": 1.7967346771825998, "learning_rate": 1.7482517482517485e-06, "loss": 0.8113, "step": 15 }, { "epoch": 0.0023320895522388058, "grad_norm": 1.618619190869765, "learning_rate": 2.3310023310023313e-06, "loss": 0.8333, "step": 20 }, { "epoch": 0.0029151119402985076, "grad_norm": 1.137673984585037, "learning_rate": 2.9137529137529138e-06, "loss": 0.7745, "step": 25 }, { "epoch": 0.003498134328358209, "grad_norm": 1.1933766134733839, "learning_rate": 3.496503496503497e-06, "loss": 0.7904, "step": 30 }, { "epoch": 0.00408115671641791, "grad_norm": 1.0157304336095063, "learning_rate": 4.079254079254079e-06, "loss": 0.7624, "step": 35 }, { "epoch": 0.0046641791044776115, "grad_norm": 0.9429820002012593, "learning_rate": 4.662004662004663e-06, "loss": 0.7285, "step": 40 }, { "epoch": 0.005247201492537314, "grad_norm": 0.915430165066572, "learning_rate": 5.244755244755245e-06, "loss": 0.7469, "step": 45 }, { "epoch": 0.005830223880597015, "grad_norm": 0.8161946795665328, "learning_rate": 5.8275058275058275e-06, "loss": 0.707, "step": 50 }, { "epoch": 0.006413246268656717, "grad_norm": 0.8339668281290502, "learning_rate": 6.41025641025641e-06, "loss": 0.6905, "step": 55 }, { "epoch": 0.006996268656716418, "grad_norm": 0.8574171717816127, "learning_rate": 6.993006993006994e-06, "loss": 0.7698, "step": 60 }, { "epoch": 0.00757929104477612, "grad_norm": 0.8619956369841163, "learning_rate": 7.5757575757575764e-06, "loss": 0.7082, "step": 65 }, { "epoch": 0.00816231343283582, "grad_norm": 0.8045557883502801, "learning_rate": 8.158508158508159e-06, "loss": 0.6587, "step": 70 }, { "epoch": 0.008745335820895522, "grad_norm": 0.7946638318702278, "learning_rate": 8.741258741258741e-06, "loss": 0.6539, "step": 75 }, { "epoch": 0.009328358208955223, "grad_norm": 0.8503979707980048, "learning_rate": 9.324009324009325e-06, "loss": 0.6879, "step": 80 }, { "epoch": 0.009911380597014926, "grad_norm": 0.804711208532189, "learning_rate": 9.906759906759908e-06, "loss": 0.6743, "step": 85 }, { "epoch": 0.010494402985074628, "grad_norm": 0.7951085316698083, "learning_rate": 1.048951048951049e-05, "loss": 0.6586, "step": 90 }, { "epoch": 0.011077425373134329, "grad_norm": 0.793128900429868, "learning_rate": 1.1072261072261073e-05, "loss": 0.6842, "step": 95 }, { "epoch": 0.01166044776119403, "grad_norm": 0.8075245421009535, "learning_rate": 1.1655011655011655e-05, "loss": 0.6684, "step": 100 }, { "epoch": 0.012243470149253732, "grad_norm": 0.8581518728101114, "learning_rate": 1.2237762237762239e-05, "loss": 0.6207, "step": 105 }, { "epoch": 0.012826492537313433, "grad_norm": 0.9082387952747121, "learning_rate": 1.282051282051282e-05, "loss": 0.6678, "step": 110 }, { "epoch": 0.013409514925373135, "grad_norm": 0.9029958898223162, "learning_rate": 1.3403263403263406e-05, "loss": 0.6394, "step": 115 }, { "epoch": 0.013992537313432836, "grad_norm": 0.8377367698700525, "learning_rate": 1.3986013986013988e-05, "loss": 0.6707, "step": 120 }, { "epoch": 0.014575559701492538, "grad_norm": 0.7587966655835441, "learning_rate": 1.456876456876457e-05, "loss": 0.6258, "step": 125 }, { "epoch": 0.01515858208955224, "grad_norm": 0.8173306054331542, "learning_rate": 1.5151515151515153e-05, "loss": 0.6619, "step": 130 }, { "epoch": 0.01574160447761194, "grad_norm": 0.9376337343046882, "learning_rate": 1.5734265734265734e-05, "loss": 0.6673, "step": 135 }, { "epoch": 0.01632462686567164, "grad_norm": 0.9363652268784762, "learning_rate": 1.6317016317016318e-05, "loss": 0.668, "step": 140 }, { "epoch": 0.016907649253731342, "grad_norm": 0.8794426221093276, "learning_rate": 1.68997668997669e-05, "loss": 0.6569, "step": 145 }, { "epoch": 0.017490671641791043, "grad_norm": 0.9407491307884387, "learning_rate": 1.7482517482517483e-05, "loss": 0.6778, "step": 150 }, { "epoch": 0.018073694029850745, "grad_norm": 0.8720667910368793, "learning_rate": 1.8065268065268067e-05, "loss": 0.6291, "step": 155 }, { "epoch": 0.018656716417910446, "grad_norm": 0.8603987252532928, "learning_rate": 1.864801864801865e-05, "loss": 0.6372, "step": 160 }, { "epoch": 0.019239738805970148, "grad_norm": 0.8449689793110765, "learning_rate": 1.923076923076923e-05, "loss": 0.6586, "step": 165 }, { "epoch": 0.019822761194029852, "grad_norm": 1.0325515822144362, "learning_rate": 1.9813519813519816e-05, "loss": 0.6386, "step": 170 }, { "epoch": 0.020405783582089554, "grad_norm": 0.8513517345308381, "learning_rate": 2.0396270396270396e-05, "loss": 0.6311, "step": 175 }, { "epoch": 0.020988805970149255, "grad_norm": 0.8847582203013, "learning_rate": 2.097902097902098e-05, "loss": 0.6834, "step": 180 }, { "epoch": 0.021571828358208957, "grad_norm": 0.9006033563159529, "learning_rate": 2.156177156177156e-05, "loss": 0.6381, "step": 185 }, { "epoch": 0.022154850746268658, "grad_norm": 0.8147114381410464, "learning_rate": 2.2144522144522145e-05, "loss": 0.6432, "step": 190 }, { "epoch": 0.02273787313432836, "grad_norm": 0.9560611394473829, "learning_rate": 2.272727272727273e-05, "loss": 0.6261, "step": 195 }, { "epoch": 0.02332089552238806, "grad_norm": 0.7951106600912393, "learning_rate": 2.331002331002331e-05, "loss": 0.6307, "step": 200 }, { "epoch": 0.023903917910447763, "grad_norm": 0.855140755847949, "learning_rate": 2.3892773892773894e-05, "loss": 0.6213, "step": 205 }, { "epoch": 0.024486940298507464, "grad_norm": 1.0420866065102492, "learning_rate": 2.4475524475524478e-05, "loss": 0.6275, "step": 210 }, { "epoch": 0.025069962686567165, "grad_norm": 0.89019303742937, "learning_rate": 2.505827505827506e-05, "loss": 0.6275, "step": 215 }, { "epoch": 0.025652985074626867, "grad_norm": 0.9758673014251983, "learning_rate": 2.564102564102564e-05, "loss": 0.6631, "step": 220 }, { "epoch": 0.02623600746268657, "grad_norm": 0.8596860100346014, "learning_rate": 2.6223776223776224e-05, "loss": 0.6216, "step": 225 }, { "epoch": 0.02681902985074627, "grad_norm": 0.8908792386022747, "learning_rate": 2.680652680652681e-05, "loss": 0.6406, "step": 230 }, { "epoch": 0.02740205223880597, "grad_norm": 0.8721610635945679, "learning_rate": 2.738927738927739e-05, "loss": 0.6216, "step": 235 }, { "epoch": 0.027985074626865673, "grad_norm": 0.8671325117829104, "learning_rate": 2.7972027972027976e-05, "loss": 0.6713, "step": 240 }, { "epoch": 0.028568097014925374, "grad_norm": 0.8803714665797042, "learning_rate": 2.8554778554778557e-05, "loss": 0.629, "step": 245 }, { "epoch": 0.029151119402985076, "grad_norm": 0.8308844754653052, "learning_rate": 2.913752913752914e-05, "loss": 0.6373, "step": 250 }, { "epoch": 0.029734141791044777, "grad_norm": 0.829842731040146, "learning_rate": 2.972027972027972e-05, "loss": 0.6163, "step": 255 }, { "epoch": 0.03031716417910448, "grad_norm": 0.9311493282737088, "learning_rate": 3.0303030303030306e-05, "loss": 0.6448, "step": 260 }, { "epoch": 0.03090018656716418, "grad_norm": 0.8380478242539974, "learning_rate": 3.088578088578088e-05, "loss": 0.6123, "step": 265 }, { "epoch": 0.03148320895522388, "grad_norm": 0.8612483599648676, "learning_rate": 3.146853146853147e-05, "loss": 0.6206, "step": 270 }, { "epoch": 0.03206623134328358, "grad_norm": 0.7504191034824335, "learning_rate": 3.205128205128206e-05, "loss": 0.5956, "step": 275 }, { "epoch": 0.03264925373134328, "grad_norm": 0.8435455726913331, "learning_rate": 3.2634032634032635e-05, "loss": 0.6147, "step": 280 }, { "epoch": 0.033232276119402986, "grad_norm": 0.8228330420668449, "learning_rate": 3.321678321678322e-05, "loss": 0.6285, "step": 285 }, { "epoch": 0.033815298507462684, "grad_norm": 0.7466739529712338, "learning_rate": 3.37995337995338e-05, "loss": 0.62, "step": 290 }, { "epoch": 0.03439832089552239, "grad_norm": 0.8056760014927022, "learning_rate": 3.438228438228439e-05, "loss": 0.6216, "step": 295 }, { "epoch": 0.034981343283582086, "grad_norm": 0.8113797852368729, "learning_rate": 3.4965034965034965e-05, "loss": 0.6105, "step": 300 }, { "epoch": 0.03556436567164179, "grad_norm": 0.8890502750300378, "learning_rate": 3.554778554778555e-05, "loss": 0.6139, "step": 305 }, { "epoch": 0.03614738805970149, "grad_norm": 0.7687545593968855, "learning_rate": 3.613053613053613e-05, "loss": 0.614, "step": 310 }, { "epoch": 0.036730410447761194, "grad_norm": 0.8336903362213683, "learning_rate": 3.671328671328672e-05, "loss": 0.6278, "step": 315 }, { "epoch": 0.03731343283582089, "grad_norm": 0.869875611794232, "learning_rate": 3.72960372960373e-05, "loss": 0.6645, "step": 320 }, { "epoch": 0.0378964552238806, "grad_norm": 0.789926244606821, "learning_rate": 3.787878787878788e-05, "loss": 0.6027, "step": 325 }, { "epoch": 0.038479477611940295, "grad_norm": 0.8392459694319648, "learning_rate": 3.846153846153846e-05, "loss": 0.6567, "step": 330 }, { "epoch": 0.0390625, "grad_norm": 0.7918852769546142, "learning_rate": 3.904428904428905e-05, "loss": 0.6453, "step": 335 }, { "epoch": 0.039645522388059705, "grad_norm": 0.7819447011686889, "learning_rate": 3.962703962703963e-05, "loss": 0.6, "step": 340 }, { "epoch": 0.0402285447761194, "grad_norm": 0.7599671478703276, "learning_rate": 4.020979020979021e-05, "loss": 0.6097, "step": 345 }, { "epoch": 0.04081156716417911, "grad_norm": 0.8770061789317196, "learning_rate": 4.079254079254079e-05, "loss": 0.6199, "step": 350 }, { "epoch": 0.041394589552238806, "grad_norm": 0.8228590505028837, "learning_rate": 4.1375291375291377e-05, "loss": 0.63, "step": 355 }, { "epoch": 0.04197761194029851, "grad_norm": 0.9266679768403295, "learning_rate": 4.195804195804196e-05, "loss": 0.6611, "step": 360 }, { "epoch": 0.04256063432835821, "grad_norm": 0.776925533918814, "learning_rate": 4.254079254079254e-05, "loss": 0.6383, "step": 365 }, { "epoch": 0.043143656716417914, "grad_norm": 0.7652459941000134, "learning_rate": 4.312354312354312e-05, "loss": 0.5967, "step": 370 }, { "epoch": 0.04372667910447761, "grad_norm": 0.8438476797737248, "learning_rate": 4.370629370629371e-05, "loss": 0.6146, "step": 375 }, { "epoch": 0.044309701492537316, "grad_norm": 0.8158888620815322, "learning_rate": 4.428904428904429e-05, "loss": 0.6237, "step": 380 }, { "epoch": 0.044892723880597014, "grad_norm": 0.9144427364330033, "learning_rate": 4.4871794871794874e-05, "loss": 0.6098, "step": 385 }, { "epoch": 0.04547574626865672, "grad_norm": 0.7260277122998825, "learning_rate": 4.545454545454546e-05, "loss": 0.6105, "step": 390 }, { "epoch": 0.04605876865671642, "grad_norm": 0.774195742981333, "learning_rate": 4.603729603729604e-05, "loss": 0.6182, "step": 395 }, { "epoch": 0.04664179104477612, "grad_norm": 0.8931158013280471, "learning_rate": 4.662004662004662e-05, "loss": 0.6269, "step": 400 }, { "epoch": 0.04722481343283582, "grad_norm": 0.7219414153894123, "learning_rate": 4.7202797202797204e-05, "loss": 0.6159, "step": 405 }, { "epoch": 0.047807835820895525, "grad_norm": 0.7392180125184434, "learning_rate": 4.778554778554779e-05, "loss": 0.6597, "step": 410 }, { "epoch": 0.04839085820895522, "grad_norm": 0.7724562759848209, "learning_rate": 4.836829836829837e-05, "loss": 0.6588, "step": 415 }, { "epoch": 0.04897388059701493, "grad_norm": 0.7698999004589875, "learning_rate": 4.8951048951048956e-05, "loss": 0.6431, "step": 420 }, { "epoch": 0.049556902985074626, "grad_norm": 0.756825275252759, "learning_rate": 4.9533799533799534e-05, "loss": 0.6475, "step": 425 }, { "epoch": 0.05013992537313433, "grad_norm": 0.7182404605677212, "learning_rate": 4.9999998327150664e-05, "loss": 0.6519, "step": 430 }, { "epoch": 0.05072294776119403, "grad_norm": 0.7352734921536612, "learning_rate": 4.999993977744981e-05, "loss": 0.6367, "step": 435 }, { "epoch": 0.051305970149253734, "grad_norm": 0.8663133271054537, "learning_rate": 4.9999797585530614e-05, "loss": 0.6367, "step": 440 }, { "epoch": 0.05188899253731343, "grad_norm": 0.8007074508867144, "learning_rate": 4.9999571751921666e-05, "loss": 0.649, "step": 445 }, { "epoch": 0.05247201492537314, "grad_norm": 0.8808725057777399, "learning_rate": 4.999926227746247e-05, "loss": 0.6407, "step": 450 }, { "epoch": 0.053055037313432835, "grad_norm": 0.8207754194931053, "learning_rate": 4.999886916330351e-05, "loss": 0.6664, "step": 455 }, { "epoch": 0.05363805970149254, "grad_norm": 0.7200644017362087, "learning_rate": 4.9998392410906135e-05, "loss": 0.6264, "step": 460 }, { "epoch": 0.05422108208955224, "grad_norm": 0.7655394577960307, "learning_rate": 4.9997832022042676e-05, "loss": 0.6854, "step": 465 }, { "epoch": 0.05480410447761194, "grad_norm": 1.5845328151194846, "learning_rate": 4.9997187998796316e-05, "loss": 0.6092, "step": 470 }, { "epoch": 0.05538712686567164, "grad_norm": 0.763948541759687, "learning_rate": 4.9996460343561184e-05, "loss": 0.6601, "step": 475 }, { "epoch": 0.055970149253731345, "grad_norm": 0.7370601652088519, "learning_rate": 4.99956490590423e-05, "loss": 0.6062, "step": 480 }, { "epoch": 0.05655317164179104, "grad_norm": 0.6503724111090272, "learning_rate": 4.9994754148255566e-05, "loss": 0.597, "step": 485 }, { "epoch": 0.05713619402985075, "grad_norm": 0.642633522142368, "learning_rate": 4.999377561452776e-05, "loss": 0.633, "step": 490 }, { "epoch": 0.057719216417910446, "grad_norm": 0.6866312480811294, "learning_rate": 4.999271346149652e-05, "loss": 0.6421, "step": 495 }, { "epoch": 0.05830223880597015, "grad_norm": 0.6472205796291965, "learning_rate": 4.999156769311035e-05, "loss": 0.615, "step": 500 }, { "epoch": 0.05888526119402985, "grad_norm": 0.6977115669022897, "learning_rate": 4.999033831362857e-05, "loss": 0.6214, "step": 505 }, { "epoch": 0.059468283582089554, "grad_norm": 0.6986043336376292, "learning_rate": 4.998902532762132e-05, "loss": 0.6193, "step": 510 }, { "epoch": 0.06005130597014925, "grad_norm": 0.7475999210726344, "learning_rate": 4.9987628739969554e-05, "loss": 0.6224, "step": 515 }, { "epoch": 0.06063432835820896, "grad_norm": 0.6813745517845872, "learning_rate": 4.9986148555865016e-05, "loss": 0.6177, "step": 520 }, { "epoch": 0.061217350746268655, "grad_norm": 0.7015168996041015, "learning_rate": 4.9984584780810196e-05, "loss": 0.6768, "step": 525 }, { "epoch": 0.06180037313432836, "grad_norm": 0.7037997780013406, "learning_rate": 4.998293742061833e-05, "loss": 0.6742, "step": 530 }, { "epoch": 0.06238339552238806, "grad_norm": 0.7156012560309609, "learning_rate": 4.998120648141338e-05, "loss": 0.6304, "step": 535 }, { "epoch": 0.06296641791044776, "grad_norm": 0.711127994617591, "learning_rate": 4.997939196963004e-05, "loss": 0.6765, "step": 540 }, { "epoch": 0.06354944029850747, "grad_norm": 0.7367809530722819, "learning_rate": 4.997749389201363e-05, "loss": 0.6443, "step": 545 }, { "epoch": 0.06413246268656717, "grad_norm": 0.7023254028972605, "learning_rate": 4.997551225562014e-05, "loss": 0.614, "step": 550 }, { "epoch": 0.06471548507462686, "grad_norm": 0.7934049324407979, "learning_rate": 4.99734470678162e-05, "loss": 0.6804, "step": 555 }, { "epoch": 0.06529850746268656, "grad_norm": 0.7211790595438915, "learning_rate": 4.997129833627902e-05, "loss": 0.6022, "step": 560 }, { "epoch": 0.06588152985074627, "grad_norm": 0.7803510446535082, "learning_rate": 4.996906606899639e-05, "loss": 0.6324, "step": 565 }, { "epoch": 0.06646455223880597, "grad_norm": 0.7268141035030203, "learning_rate": 4.996675027426662e-05, "loss": 0.6244, "step": 570 }, { "epoch": 0.06704757462686567, "grad_norm": 1.937711458766374, "learning_rate": 4.9964350960698564e-05, "loss": 0.637, "step": 575 }, { "epoch": 0.06763059701492537, "grad_norm": 0.6807880062940403, "learning_rate": 4.996186813721152e-05, "loss": 0.6009, "step": 580 }, { "epoch": 0.06821361940298508, "grad_norm": 0.7031632269645188, "learning_rate": 4.995930181303522e-05, "loss": 0.6312, "step": 585 }, { "epoch": 0.06879664179104478, "grad_norm": 0.6771751142969727, "learning_rate": 4.995665199770986e-05, "loss": 0.6604, "step": 590 }, { "epoch": 0.06937966417910447, "grad_norm": 0.711794153251972, "learning_rate": 4.995391870108595e-05, "loss": 0.6527, "step": 595 }, { "epoch": 0.06996268656716417, "grad_norm": 0.7336499566418994, "learning_rate": 4.9951101933324374e-05, "loss": 0.6056, "step": 600 }, { "epoch": 0.07054570895522388, "grad_norm": 0.9123524114339076, "learning_rate": 4.994820170489629e-05, "loss": 0.6351, "step": 605 }, { "epoch": 0.07112873134328358, "grad_norm": 0.6937351959939653, "learning_rate": 4.9945218026583147e-05, "loss": 0.6415, "step": 610 }, { "epoch": 0.07171175373134328, "grad_norm": 0.6701402337273787, "learning_rate": 4.9942150909476576e-05, "loss": 0.616, "step": 615 }, { "epoch": 0.07229477611940298, "grad_norm": 0.6737823946741938, "learning_rate": 4.9939000364978424e-05, "loss": 0.5916, "step": 620 }, { "epoch": 0.07287779850746269, "grad_norm": 0.6443035255679863, "learning_rate": 4.993576640480064e-05, "loss": 0.593, "step": 625 }, { "epoch": 0.07346082089552239, "grad_norm": 0.7194715850493604, "learning_rate": 4.9932449040965296e-05, "loss": 0.6537, "step": 630 }, { "epoch": 0.07404384328358209, "grad_norm": 0.6638929998798874, "learning_rate": 4.992904828580449e-05, "loss": 0.651, "step": 635 }, { "epoch": 0.07462686567164178, "grad_norm": 0.6403792391822621, "learning_rate": 4.992556415196035e-05, "loss": 0.5995, "step": 640 }, { "epoch": 0.0752098880597015, "grad_norm": 0.7646937059991467, "learning_rate": 4.9921996652384915e-05, "loss": 0.6254, "step": 645 }, { "epoch": 0.0757929104477612, "grad_norm": 0.708046339189841, "learning_rate": 4.991834580034017e-05, "loss": 0.6285, "step": 650 }, { "epoch": 0.07637593283582089, "grad_norm": 0.6787038237176444, "learning_rate": 4.991461160939795e-05, "loss": 0.6089, "step": 655 }, { "epoch": 0.07695895522388059, "grad_norm": 0.637822387437937, "learning_rate": 4.991079409343989e-05, "loss": 0.6339, "step": 660 }, { "epoch": 0.0775419776119403, "grad_norm": 0.7266823799603278, "learning_rate": 4.990689326665738e-05, "loss": 0.601, "step": 665 }, { "epoch": 0.078125, "grad_norm": 0.6501949889104538, "learning_rate": 4.9902909143551516e-05, "loss": 0.6016, "step": 670 }, { "epoch": 0.0787080223880597, "grad_norm": 0.6820009767011616, "learning_rate": 4.989884173893305e-05, "loss": 0.6007, "step": 675 }, { "epoch": 0.07929104477611941, "grad_norm": 0.6877686038902945, "learning_rate": 4.989469106792231e-05, "loss": 0.6136, "step": 680 }, { "epoch": 0.07987406716417911, "grad_norm": 0.6085151630088098, "learning_rate": 4.9890457145949186e-05, "loss": 0.5881, "step": 685 }, { "epoch": 0.0804570895522388, "grad_norm": 0.6214607197858061, "learning_rate": 4.988613998875304e-05, "loss": 0.6053, "step": 690 }, { "epoch": 0.0810401119402985, "grad_norm": 0.6744013331308584, "learning_rate": 4.988173961238264e-05, "loss": 0.5802, "step": 695 }, { "epoch": 0.08162313432835822, "grad_norm": 0.6808032691033227, "learning_rate": 4.987725603319615e-05, "loss": 0.6094, "step": 700 }, { "epoch": 0.08220615671641791, "grad_norm": 0.6979985310436119, "learning_rate": 4.987268926786098e-05, "loss": 0.6323, "step": 705 }, { "epoch": 0.08278917910447761, "grad_norm": 0.6226469149492844, "learning_rate": 4.986803933335385e-05, "loss": 0.6459, "step": 710 }, { "epoch": 0.08337220149253731, "grad_norm": 0.676363656496179, "learning_rate": 4.9863306246960605e-05, "loss": 0.5761, "step": 715 }, { "epoch": 0.08395522388059702, "grad_norm": 0.6685016854342901, "learning_rate": 4.9858490026276226e-05, "loss": 0.5988, "step": 720 }, { "epoch": 0.08453824626865672, "grad_norm": 0.6396794014235626, "learning_rate": 4.9853590689204715e-05, "loss": 0.5854, "step": 725 }, { "epoch": 0.08512126865671642, "grad_norm": 0.720334288248887, "learning_rate": 4.9848608253959096e-05, "loss": 0.6207, "step": 730 }, { "epoch": 0.08570429104477612, "grad_norm": 0.6079861012766015, "learning_rate": 4.984354273906127e-05, "loss": 0.5953, "step": 735 }, { "epoch": 0.08628731343283583, "grad_norm": 0.6782045394701461, "learning_rate": 4.9838394163341993e-05, "loss": 0.63, "step": 740 }, { "epoch": 0.08687033582089553, "grad_norm": 0.6465534111571817, "learning_rate": 4.983316254594081e-05, "loss": 0.6006, "step": 745 }, { "epoch": 0.08745335820895522, "grad_norm": 0.6048787945996146, "learning_rate": 4.9827847906305934e-05, "loss": 0.5937, "step": 750 }, { "epoch": 0.08803638059701492, "grad_norm": 0.6704335077914664, "learning_rate": 4.982245026419424e-05, "loss": 0.6433, "step": 755 }, { "epoch": 0.08861940298507463, "grad_norm": 0.6420162328599116, "learning_rate": 4.981696963967116e-05, "loss": 0.5806, "step": 760 }, { "epoch": 0.08920242537313433, "grad_norm": 0.589278560052739, "learning_rate": 4.981140605311057e-05, "loss": 0.6049, "step": 765 }, { "epoch": 0.08978544776119403, "grad_norm": 0.6681567808799939, "learning_rate": 4.98057595251948e-05, "loss": 0.6369, "step": 770 }, { "epoch": 0.09036847014925373, "grad_norm": 0.5822554638380099, "learning_rate": 4.980003007691449e-05, "loss": 0.633, "step": 775 }, { "epoch": 0.09095149253731344, "grad_norm": 0.6014893132349064, "learning_rate": 4.979421772956852e-05, "loss": 0.637, "step": 780 }, { "epoch": 0.09153451492537314, "grad_norm": 0.6190152455325075, "learning_rate": 4.9788322504763954e-05, "loss": 0.626, "step": 785 }, { "epoch": 0.09211753731343283, "grad_norm": 0.6043190595480243, "learning_rate": 4.978234442441596e-05, "loss": 0.5958, "step": 790 }, { "epoch": 0.09270055970149253, "grad_norm": 0.7067979054927527, "learning_rate": 4.977628351074769e-05, "loss": 0.6391, "step": 795 }, { "epoch": 0.09328358208955224, "grad_norm": 1.0900391421932065, "learning_rate": 4.977013978629025e-05, "loss": 0.5869, "step": 800 }, { "epoch": 0.09386660447761194, "grad_norm": 0.7289575339056653, "learning_rate": 4.976391327388257e-05, "loss": 0.5817, "step": 805 }, { "epoch": 0.09444962686567164, "grad_norm": 0.6772024103975534, "learning_rate": 4.9757603996671354e-05, "loss": 0.644, "step": 810 }, { "epoch": 0.09503264925373134, "grad_norm": 0.6273836766329598, "learning_rate": 4.975121197811096e-05, "loss": 0.6343, "step": 815 }, { "epoch": 0.09561567164179105, "grad_norm": 0.6925933519325139, "learning_rate": 4.974473724196338e-05, "loss": 0.6527, "step": 820 }, { "epoch": 0.09619869402985075, "grad_norm": 0.7918262789810132, "learning_rate": 4.973817981229802e-05, "loss": 0.6043, "step": 825 }, { "epoch": 0.09678171641791045, "grad_norm": 0.6915704244723462, "learning_rate": 4.9731539713491776e-05, "loss": 0.6101, "step": 830 }, { "epoch": 0.09736473880597014, "grad_norm": 0.7026540645380082, "learning_rate": 4.972481697022883e-05, "loss": 0.6321, "step": 835 }, { "epoch": 0.09794776119402986, "grad_norm": 0.6870682595723567, "learning_rate": 4.971801160750057e-05, "loss": 0.6431, "step": 840 }, { "epoch": 0.09853078358208955, "grad_norm": 0.6762758634999102, "learning_rate": 4.971112365060555e-05, "loss": 0.6212, "step": 845 }, { "epoch": 0.09911380597014925, "grad_norm": 1.0041969348534898, "learning_rate": 4.970415312514936e-05, "loss": 0.5934, "step": 850 }, { "epoch": 0.09969682835820895, "grad_norm": 0.6273960276780831, "learning_rate": 4.969710005704449e-05, "loss": 0.6499, "step": 855 }, { "epoch": 0.10027985074626866, "grad_norm": 0.6488940069009905, "learning_rate": 4.9689964472510345e-05, "loss": 0.5949, "step": 860 }, { "epoch": 0.10086287313432836, "grad_norm": 0.763231378289182, "learning_rate": 4.968274639807304e-05, "loss": 0.6146, "step": 865 }, { "epoch": 0.10144589552238806, "grad_norm": 0.6658920295533095, "learning_rate": 4.967544586056532e-05, "loss": 0.6049, "step": 870 }, { "epoch": 0.10202891791044776, "grad_norm": 0.7074390644632157, "learning_rate": 4.966806288712654e-05, "loss": 0.6175, "step": 875 }, { "epoch": 0.10261194029850747, "grad_norm": 0.695343426907801, "learning_rate": 4.966059750520246e-05, "loss": 0.591, "step": 880 }, { "epoch": 0.10319496268656717, "grad_norm": 0.6418320757008029, "learning_rate": 4.965304974254521e-05, "loss": 0.6176, "step": 885 }, { "epoch": 0.10377798507462686, "grad_norm": 0.7727923536862525, "learning_rate": 4.9645419627213155e-05, "loss": 0.6134, "step": 890 }, { "epoch": 0.10436100746268656, "grad_norm": 0.6029656422873543, "learning_rate": 4.96377071875708e-05, "loss": 0.5654, "step": 895 }, { "epoch": 0.10494402985074627, "grad_norm": 0.6358118150755105, "learning_rate": 4.9629912452288696e-05, "loss": 0.6252, "step": 900 }, { "epoch": 0.10552705223880597, "grad_norm": 0.61344386800639, "learning_rate": 4.962203545034332e-05, "loss": 0.6059, "step": 905 }, { "epoch": 0.10611007462686567, "grad_norm": 0.6135409466759565, "learning_rate": 4.961407621101697e-05, "loss": 0.5614, "step": 910 }, { "epoch": 0.10669309701492537, "grad_norm": 1.0821214504665195, "learning_rate": 4.960603476389765e-05, "loss": 0.6162, "step": 915 }, { "epoch": 0.10727611940298508, "grad_norm": 0.5759584165021776, "learning_rate": 4.959791113887898e-05, "loss": 0.6055, "step": 920 }, { "epoch": 0.10785914179104478, "grad_norm": 0.5799139948464339, "learning_rate": 4.958970536616006e-05, "loss": 0.6067, "step": 925 }, { "epoch": 0.10844216417910447, "grad_norm": 0.6049902178680088, "learning_rate": 4.9581417476245365e-05, "loss": 0.5808, "step": 930 }, { "epoch": 0.10902518656716417, "grad_norm": 0.6216812883236675, "learning_rate": 4.957304749994465e-05, "loss": 0.5644, "step": 935 }, { "epoch": 0.10960820895522388, "grad_norm": 0.6264774945250813, "learning_rate": 4.956459546837283e-05, "loss": 0.5889, "step": 940 }, { "epoch": 0.11019123134328358, "grad_norm": 0.7584079121183279, "learning_rate": 4.955606141294982e-05, "loss": 0.6662, "step": 945 }, { "epoch": 0.11077425373134328, "grad_norm": 0.6021040722392922, "learning_rate": 4.954744536540048e-05, "loss": 0.6075, "step": 950 }, { "epoch": 0.11135727611940298, "grad_norm": 0.6516825177813871, "learning_rate": 4.953874735775448e-05, "loss": 0.6163, "step": 955 }, { "epoch": 0.11194029850746269, "grad_norm": 0.6483316850868462, "learning_rate": 4.9529967422346137e-05, "loss": 0.6388, "step": 960 }, { "epoch": 0.11252332089552239, "grad_norm": 1.0296359523919179, "learning_rate": 4.952110559181437e-05, "loss": 0.6187, "step": 965 }, { "epoch": 0.11310634328358209, "grad_norm": 0.5963874485076113, "learning_rate": 4.95121618991025e-05, "loss": 0.653, "step": 970 }, { "epoch": 0.11368936567164178, "grad_norm": 0.5827112927403142, "learning_rate": 4.950313637745819e-05, "loss": 0.6013, "step": 975 }, { "epoch": 0.1142723880597015, "grad_norm": 0.5978368012555301, "learning_rate": 4.9494029060433304e-05, "loss": 0.5854, "step": 980 }, { "epoch": 0.1148554104477612, "grad_norm": 0.6061921826097367, "learning_rate": 4.9484839981883755e-05, "loss": 0.6042, "step": 985 }, { "epoch": 0.11543843283582089, "grad_norm": 0.577591628346454, "learning_rate": 4.9475569175969414e-05, "loss": 0.6033, "step": 990 }, { "epoch": 0.11602145522388059, "grad_norm": 0.578876456470585, "learning_rate": 4.9466216677153945e-05, "loss": 0.5762, "step": 995 }, { "epoch": 0.1166044776119403, "grad_norm": 0.6111711530967577, "learning_rate": 4.945678252020475e-05, "loss": 0.6078, "step": 1000 }, { "epoch": 0.1171875, "grad_norm": 0.5389571001747652, "learning_rate": 4.9447266740192735e-05, "loss": 0.586, "step": 1005 }, { "epoch": 0.1177705223880597, "grad_norm": 0.5665361736437953, "learning_rate": 4.943766937249226e-05, "loss": 0.5896, "step": 1010 }, { "epoch": 0.11835354477611941, "grad_norm": 0.6004663875659557, "learning_rate": 4.942799045278099e-05, "loss": 0.5931, "step": 1015 }, { "epoch": 0.11893656716417911, "grad_norm": 0.5877689595746617, "learning_rate": 4.941823001703974e-05, "loss": 0.5985, "step": 1020 }, { "epoch": 0.1195195895522388, "grad_norm": 0.5935807095087586, "learning_rate": 4.940838810155237e-05, "loss": 0.5991, "step": 1025 }, { "epoch": 0.1201026119402985, "grad_norm": 0.6656506474951656, "learning_rate": 4.939846474290563e-05, "loss": 0.6148, "step": 1030 }, { "epoch": 0.12068563432835822, "grad_norm": 0.6400226054588848, "learning_rate": 4.9388459977989016e-05, "loss": 0.6035, "step": 1035 }, { "epoch": 0.12126865671641791, "grad_norm": 0.5682054111741724, "learning_rate": 4.937837384399467e-05, "loss": 0.5942, "step": 1040 }, { "epoch": 0.12185167910447761, "grad_norm": 0.5722092103161287, "learning_rate": 4.936820637841721e-05, "loss": 0.6297, "step": 1045 }, { "epoch": 0.12243470149253731, "grad_norm": 0.5744583767360888, "learning_rate": 4.935795761905359e-05, "loss": 0.5799, "step": 1050 }, { "epoch": 0.12301772388059702, "grad_norm": 0.5955258224599197, "learning_rate": 4.934762760400299e-05, "loss": 0.5901, "step": 1055 }, { "epoch": 0.12360074626865672, "grad_norm": 0.5941450778138045, "learning_rate": 4.933721637166662e-05, "loss": 0.5932, "step": 1060 }, { "epoch": 0.12418376865671642, "grad_norm": 0.6984953623677035, "learning_rate": 4.9326723960747655e-05, "loss": 0.5928, "step": 1065 }, { "epoch": 0.12476679104477612, "grad_norm": 0.5556738969991728, "learning_rate": 4.931615041025101e-05, "loss": 0.5449, "step": 1070 }, { "epoch": 0.12534981343283583, "grad_norm": 0.6202875202064847, "learning_rate": 4.9305495759483246e-05, "loss": 0.6245, "step": 1075 }, { "epoch": 0.1259328358208955, "grad_norm": 0.6360648582792375, "learning_rate": 4.929476004805241e-05, "loss": 0.622, "step": 1080 }, { "epoch": 0.12651585820895522, "grad_norm": 0.548961800240671, "learning_rate": 4.928394331586788e-05, "loss": 0.5878, "step": 1085 }, { "epoch": 0.12709888059701493, "grad_norm": 0.6534887664001585, "learning_rate": 4.927304560314023e-05, "loss": 0.5985, "step": 1090 }, { "epoch": 0.12768190298507462, "grad_norm": 0.6051258985285789, "learning_rate": 4.9262066950381074e-05, "loss": 0.5815, "step": 1095 }, { "epoch": 0.12826492537313433, "grad_norm": 0.5940919877769313, "learning_rate": 4.925100739840293e-05, "loss": 0.6056, "step": 1100 }, { "epoch": 0.12884794776119404, "grad_norm": 0.5795416953387396, "learning_rate": 4.923986698831902e-05, "loss": 0.593, "step": 1105 }, { "epoch": 0.12943097014925373, "grad_norm": 0.6060003300391443, "learning_rate": 4.922864576154318e-05, "loss": 0.6293, "step": 1110 }, { "epoch": 0.13001399253731344, "grad_norm": 0.5926697361708111, "learning_rate": 4.921734375978966e-05, "loss": 0.5863, "step": 1115 }, { "epoch": 0.13059701492537312, "grad_norm": 0.5666399578950052, "learning_rate": 4.9205961025073005e-05, "loss": 0.6093, "step": 1120 }, { "epoch": 0.13118003731343283, "grad_norm": 0.6290080779373148, "learning_rate": 4.919449759970787e-05, "loss": 0.6149, "step": 1125 }, { "epoch": 0.13176305970149255, "grad_norm": 0.6030059289618742, "learning_rate": 4.9182953526308866e-05, "loss": 0.5983, "step": 1130 }, { "epoch": 0.13234608208955223, "grad_norm": 0.6967548151886129, "learning_rate": 4.9171328847790416e-05, "loss": 0.5979, "step": 1135 }, { "epoch": 0.13292910447761194, "grad_norm": 0.6568770405720038, "learning_rate": 4.9159623607366587e-05, "loss": 0.5857, "step": 1140 }, { "epoch": 0.13351212686567165, "grad_norm": 0.5840444295366739, "learning_rate": 4.914783784855093e-05, "loss": 0.6438, "step": 1145 }, { "epoch": 0.13409514925373134, "grad_norm": 0.7004511399791347, "learning_rate": 4.913597161515633e-05, "loss": 0.6184, "step": 1150 }, { "epoch": 0.13467817164179105, "grad_norm": 0.550373143569304, "learning_rate": 4.91240249512948e-05, "loss": 0.567, "step": 1155 }, { "epoch": 0.13526119402985073, "grad_norm": 0.5313433759195957, "learning_rate": 4.9111997901377373e-05, "loss": 0.5855, "step": 1160 }, { "epoch": 0.13584421641791045, "grad_norm": 0.5655705667845206, "learning_rate": 4.9099890510113924e-05, "loss": 0.5661, "step": 1165 }, { "epoch": 0.13642723880597016, "grad_norm": 0.6195584641586247, "learning_rate": 4.908770282251296e-05, "loss": 0.5905, "step": 1170 }, { "epoch": 0.13701026119402984, "grad_norm": 0.5583698659457814, "learning_rate": 4.9075434883881504e-05, "loss": 0.6002, "step": 1175 }, { "epoch": 0.13759328358208955, "grad_norm": 0.6389070121403235, "learning_rate": 4.906308673982491e-05, "loss": 0.5607, "step": 1180 }, { "epoch": 0.13817630597014927, "grad_norm": 0.59774495236772, "learning_rate": 4.905065843624668e-05, "loss": 0.5678, "step": 1185 }, { "epoch": 0.13875932835820895, "grad_norm": 0.6173804758323792, "learning_rate": 4.903815001934832e-05, "loss": 0.6002, "step": 1190 }, { "epoch": 0.13934235074626866, "grad_norm": 0.5469869287414887, "learning_rate": 4.9025561535629125e-05, "loss": 0.5977, "step": 1195 }, { "epoch": 0.13992537313432835, "grad_norm": 0.5624438700425504, "learning_rate": 4.9012893031886075e-05, "loss": 0.582, "step": 1200 }, { "epoch": 0.14050839552238806, "grad_norm": 0.5985379982120503, "learning_rate": 4.9000144555213575e-05, "loss": 0.5825, "step": 1205 }, { "epoch": 0.14109141791044777, "grad_norm": 0.5800377004502044, "learning_rate": 4.898731615300336e-05, "loss": 0.5706, "step": 1210 }, { "epoch": 0.14167444029850745, "grad_norm": 0.5901666286652832, "learning_rate": 4.8974407872944263e-05, "loss": 0.5937, "step": 1215 }, { "epoch": 0.14225746268656717, "grad_norm": 0.644009126842579, "learning_rate": 4.8961419763022065e-05, "loss": 0.5612, "step": 1220 }, { "epoch": 0.14284048507462688, "grad_norm": 0.5922807627602008, "learning_rate": 4.894835187151931e-05, "loss": 0.6067, "step": 1225 }, { "epoch": 0.14342350746268656, "grad_norm": 0.5722199265482402, "learning_rate": 4.893520424701513e-05, "loss": 0.6082, "step": 1230 }, { "epoch": 0.14400652985074627, "grad_norm": 0.6126570623385963, "learning_rate": 4.892197693838504e-05, "loss": 0.5964, "step": 1235 }, { "epoch": 0.14458955223880596, "grad_norm": 0.5973655677327272, "learning_rate": 4.890866999480082e-05, "loss": 0.5918, "step": 1240 }, { "epoch": 0.14517257462686567, "grad_norm": 0.54393899856667, "learning_rate": 4.889528346573023e-05, "loss": 0.5827, "step": 1245 }, { "epoch": 0.14575559701492538, "grad_norm": 0.5508599028743483, "learning_rate": 4.888181740093693e-05, "loss": 0.5966, "step": 1250 }, { "epoch": 0.14633861940298507, "grad_norm": 0.5748810947596694, "learning_rate": 4.886827185048023e-05, "loss": 0.6069, "step": 1255 }, { "epoch": 0.14692164179104478, "grad_norm": 0.5843720987519714, "learning_rate": 4.8854646864714906e-05, "loss": 0.626, "step": 1260 }, { "epoch": 0.1475046641791045, "grad_norm": 0.5574624014831214, "learning_rate": 4.884094249429109e-05, "loss": 0.5513, "step": 1265 }, { "epoch": 0.14808768656716417, "grad_norm": 0.6341906909347658, "learning_rate": 4.882715879015396e-05, "loss": 0.597, "step": 1270 }, { "epoch": 0.14867070895522388, "grad_norm": 0.575813430423846, "learning_rate": 4.881329580354363e-05, "loss": 0.6081, "step": 1275 }, { "epoch": 0.14925373134328357, "grad_norm": 0.6445668774873804, "learning_rate": 4.8799353585994954e-05, "loss": 0.6087, "step": 1280 }, { "epoch": 0.14983675373134328, "grad_norm": 0.5859378252774211, "learning_rate": 4.8785332189337325e-05, "loss": 0.5602, "step": 1285 }, { "epoch": 0.150419776119403, "grad_norm": 0.6268377383651598, "learning_rate": 4.877123166569445e-05, "loss": 0.621, "step": 1290 }, { "epoch": 0.15100279850746268, "grad_norm": 0.539111443847871, "learning_rate": 4.8757052067484216e-05, "loss": 0.5778, "step": 1295 }, { "epoch": 0.1515858208955224, "grad_norm": 0.6230886477061637, "learning_rate": 4.874279344741846e-05, "loss": 0.6052, "step": 1300 }, { "epoch": 0.1521688432835821, "grad_norm": 0.5804714929817124, "learning_rate": 4.872845585850277e-05, "loss": 0.5704, "step": 1305 }, { "epoch": 0.15275186567164178, "grad_norm": 0.5644557466436395, "learning_rate": 4.87140393540363e-05, "loss": 0.6135, "step": 1310 }, { "epoch": 0.1533348880597015, "grad_norm": 0.5979238448583827, "learning_rate": 4.869954398761156e-05, "loss": 0.5891, "step": 1315 }, { "epoch": 0.15391791044776118, "grad_norm": 0.5684870674237956, "learning_rate": 4.868496981311424e-05, "loss": 0.6057, "step": 1320 }, { "epoch": 0.1545009328358209, "grad_norm": 0.6084433820062218, "learning_rate": 4.8670316884722984e-05, "loss": 0.6183, "step": 1325 }, { "epoch": 0.1550839552238806, "grad_norm": 0.5168917985718466, "learning_rate": 4.86555852569092e-05, "loss": 0.558, "step": 1330 }, { "epoch": 0.1556669776119403, "grad_norm": 0.6169882009143579, "learning_rate": 4.864077498443687e-05, "loss": 0.5894, "step": 1335 }, { "epoch": 0.15625, "grad_norm": 0.5573154647203309, "learning_rate": 4.8625886122362305e-05, "loss": 0.6047, "step": 1340 }, { "epoch": 0.1568330223880597, "grad_norm": 0.5430765223123376, "learning_rate": 4.861091872603399e-05, "loss": 0.5936, "step": 1345 }, { "epoch": 0.1574160447761194, "grad_norm": 0.5952512138917507, "learning_rate": 4.859587285109235e-05, "loss": 0.6323, "step": 1350 }, { "epoch": 0.1579990671641791, "grad_norm": 0.6257733920419736, "learning_rate": 4.8580748553469554e-05, "loss": 0.621, "step": 1355 }, { "epoch": 0.15858208955223882, "grad_norm": 0.6484544777362733, "learning_rate": 4.8565545889389295e-05, "loss": 0.6225, "step": 1360 }, { "epoch": 0.1591651119402985, "grad_norm": 0.6001757890053516, "learning_rate": 4.85502649153666e-05, "loss": 0.7485, "step": 1365 }, { "epoch": 0.15974813432835822, "grad_norm": 0.661835508508043, "learning_rate": 4.853490568820759e-05, "loss": 0.8953, "step": 1370 }, { "epoch": 0.1603311567164179, "grad_norm": 0.5424000700147391, "learning_rate": 4.851946826500932e-05, "loss": 0.6088, "step": 1375 }, { "epoch": 0.1609141791044776, "grad_norm": 0.5642506132746299, "learning_rate": 4.8503952703159485e-05, "loss": 0.5788, "step": 1380 }, { "epoch": 0.16149720149253732, "grad_norm": 1.5770888453223382, "learning_rate": 4.8488359060336314e-05, "loss": 0.6464, "step": 1385 }, { "epoch": 0.162080223880597, "grad_norm": 0.5798567582108715, "learning_rate": 4.847268739450825e-05, "loss": 0.5962, "step": 1390 }, { "epoch": 0.16266324626865672, "grad_norm": 0.6247134338348835, "learning_rate": 4.84569377639338e-05, "loss": 0.5762, "step": 1395 }, { "epoch": 0.16324626865671643, "grad_norm": 0.6403693981471119, "learning_rate": 4.84411102271613e-05, "loss": 0.6033, "step": 1400 }, { "epoch": 0.16382929104477612, "grad_norm": 0.6328461284086258, "learning_rate": 4.842520484302871e-05, "loss": 0.6124, "step": 1405 }, { "epoch": 0.16441231343283583, "grad_norm": 0.5690467644511553, "learning_rate": 4.840922167066335e-05, "loss": 0.6058, "step": 1410 }, { "epoch": 0.1649953358208955, "grad_norm": 0.5991271256192877, "learning_rate": 4.8393160769481755e-05, "loss": 0.5759, "step": 1415 }, { "epoch": 0.16557835820895522, "grad_norm": 0.5518688454925015, "learning_rate": 4.8377022199189374e-05, "loss": 0.5758, "step": 1420 }, { "epoch": 0.16616138059701493, "grad_norm": 0.8358279547212623, "learning_rate": 4.836080601978043e-05, "loss": 0.614, "step": 1425 }, { "epoch": 0.16674440298507462, "grad_norm": 0.5736501078455153, "learning_rate": 4.83445122915376e-05, "loss": 0.6268, "step": 1430 }, { "epoch": 0.16732742537313433, "grad_norm": 0.501139191324822, "learning_rate": 4.832814107503188e-05, "loss": 0.5522, "step": 1435 }, { "epoch": 0.16791044776119404, "grad_norm": 0.5414270989894118, "learning_rate": 4.8311692431122326e-05, "loss": 0.5985, "step": 1440 }, { "epoch": 0.16849347014925373, "grad_norm": 0.5503417116648404, "learning_rate": 4.82951664209558e-05, "loss": 0.5883, "step": 1445 }, { "epoch": 0.16907649253731344, "grad_norm": 0.5828254158920338, "learning_rate": 4.82785631059668e-05, "loss": 0.5973, "step": 1450 }, { "epoch": 0.16965951492537312, "grad_norm": 0.5750003990997431, "learning_rate": 4.826188254787717e-05, "loss": 0.5929, "step": 1455 }, { "epoch": 0.17024253731343283, "grad_norm": 0.5931754407423996, "learning_rate": 4.824512480869593e-05, "loss": 0.584, "step": 1460 }, { "epoch": 0.17082555970149255, "grad_norm": 0.5457054499115934, "learning_rate": 4.822828995071899e-05, "loss": 0.5711, "step": 1465 }, { "epoch": 0.17140858208955223, "grad_norm": 0.5229569450197525, "learning_rate": 4.821137803652896e-05, "loss": 0.5459, "step": 1470 }, { "epoch": 0.17199160447761194, "grad_norm": 0.6027439958371525, "learning_rate": 4.819438912899489e-05, "loss": 0.5814, "step": 1475 }, { "epoch": 0.17257462686567165, "grad_norm": 0.5702069062273913, "learning_rate": 4.8177323291272066e-05, "loss": 0.6299, "step": 1480 }, { "epoch": 0.17315764925373134, "grad_norm": 0.5780484151492952, "learning_rate": 4.8160180586801744e-05, "loss": 0.5777, "step": 1485 }, { "epoch": 0.17374067164179105, "grad_norm": 0.575917162342149, "learning_rate": 4.814296107931093e-05, "loss": 0.5547, "step": 1490 }, { "epoch": 0.17432369402985073, "grad_norm": 0.5192602462313004, "learning_rate": 4.812566483281216e-05, "loss": 0.5669, "step": 1495 }, { "epoch": 0.17490671641791045, "grad_norm": 0.5345650171129616, "learning_rate": 4.81082919116032e-05, "loss": 0.5917, "step": 1500 }, { "epoch": 0.17548973880597016, "grad_norm": 0.5475736070312467, "learning_rate": 4.809084238026689e-05, "loss": 0.602, "step": 1505 }, { "epoch": 0.17607276119402984, "grad_norm": 0.5786621619994619, "learning_rate": 4.8073316303670835e-05, "loss": 0.5861, "step": 1510 }, { "epoch": 0.17665578358208955, "grad_norm": 0.5448850237456587, "learning_rate": 4.8055713746967216e-05, "loss": 0.5857, "step": 1515 }, { "epoch": 0.17723880597014927, "grad_norm": 0.5779873112523268, "learning_rate": 4.803803477559252e-05, "loss": 0.5665, "step": 1520 }, { "epoch": 0.17782182835820895, "grad_norm": 0.5652601574196091, "learning_rate": 4.8020279455267274e-05, "loss": 0.5794, "step": 1525 }, { "epoch": 0.17840485074626866, "grad_norm": 0.5437626429311837, "learning_rate": 4.800244785199588e-05, "loss": 0.547, "step": 1530 }, { "epoch": 0.17898787313432835, "grad_norm": 0.5657663832577134, "learning_rate": 4.7984540032066266e-05, "loss": 0.5645, "step": 1535 }, { "epoch": 0.17957089552238806, "grad_norm": 0.5396084565585528, "learning_rate": 4.796655606204971e-05, "loss": 0.5917, "step": 1540 }, { "epoch": 0.18015391791044777, "grad_norm": 0.5659614973871961, "learning_rate": 4.794849600880059e-05, "loss": 0.5888, "step": 1545 }, { "epoch": 0.18073694029850745, "grad_norm": 0.610344463972923, "learning_rate": 4.793035993945609e-05, "loss": 0.5973, "step": 1550 }, { "epoch": 0.18131996268656717, "grad_norm": 0.5315364790581317, "learning_rate": 4.7912147921436e-05, "loss": 0.5793, "step": 1555 }, { "epoch": 0.18190298507462688, "grad_norm": 0.5215816759572497, "learning_rate": 4.789386002244244e-05, "loss": 0.5773, "step": 1560 }, { "epoch": 0.18248600746268656, "grad_norm": 0.48268753752530635, "learning_rate": 4.7875496310459607e-05, "loss": 0.5439, "step": 1565 }, { "epoch": 0.18306902985074627, "grad_norm": 0.5928345158273391, "learning_rate": 4.7857056853753536e-05, "loss": 0.5946, "step": 1570 }, { "epoch": 0.18365205223880596, "grad_norm": 0.5217322463974674, "learning_rate": 4.783854172087183e-05, "loss": 0.5633, "step": 1575 }, { "epoch": 0.18423507462686567, "grad_norm": 0.5361438235075061, "learning_rate": 4.781995098064343e-05, "loss": 0.5616, "step": 1580 }, { "epoch": 0.18481809701492538, "grad_norm": 0.5319409918375809, "learning_rate": 4.780128470217833e-05, "loss": 0.5959, "step": 1585 }, { "epoch": 0.18540111940298507, "grad_norm": 0.5656220383824416, "learning_rate": 4.778254295486732e-05, "loss": 0.5941, "step": 1590 }, { "epoch": 0.18598414179104478, "grad_norm": 0.664107972745298, "learning_rate": 4.7763725808381777e-05, "loss": 0.5932, "step": 1595 }, { "epoch": 0.1865671641791045, "grad_norm": 0.5594876020852717, "learning_rate": 4.7744833332673336e-05, "loss": 0.5754, "step": 1600 }, { "epoch": 0.18715018656716417, "grad_norm": 0.5748188546122887, "learning_rate": 4.7725865597973684e-05, "loss": 0.5929, "step": 1605 }, { "epoch": 0.18773320895522388, "grad_norm": 0.5608090588962066, "learning_rate": 4.770682267479427e-05, "loss": 0.6, "step": 1610 }, { "epoch": 0.18831623134328357, "grad_norm": 0.5614561936493092, "learning_rate": 4.7687704633926056e-05, "loss": 0.5885, "step": 1615 }, { "epoch": 0.18889925373134328, "grad_norm": 0.5020575112535327, "learning_rate": 4.766851154643924e-05, "loss": 0.5766, "step": 1620 }, { "epoch": 0.189482276119403, "grad_norm": 0.5350491170633971, "learning_rate": 4.7649243483683015e-05, "loss": 0.6016, "step": 1625 }, { "epoch": 0.19006529850746268, "grad_norm": 0.5723463466212365, "learning_rate": 4.762990051728529e-05, "loss": 0.5938, "step": 1630 }, { "epoch": 0.1906483208955224, "grad_norm": 0.528038155312228, "learning_rate": 4.7610482719152404e-05, "loss": 0.5919, "step": 1635 }, { "epoch": 0.1912313432835821, "grad_norm": 0.5195292947087375, "learning_rate": 4.7590990161468906e-05, "loss": 0.587, "step": 1640 }, { "epoch": 0.19181436567164178, "grad_norm": 0.5309411178378524, "learning_rate": 4.757142291669724e-05, "loss": 0.5584, "step": 1645 }, { "epoch": 0.1923973880597015, "grad_norm": 0.5235366705843203, "learning_rate": 4.755178105757751e-05, "loss": 0.5332, "step": 1650 }, { "epoch": 0.19298041044776118, "grad_norm": 0.5662388478996405, "learning_rate": 4.753206465712717e-05, "loss": 0.6003, "step": 1655 }, { "epoch": 0.1935634328358209, "grad_norm": 0.541515776380167, "learning_rate": 4.751227378864081e-05, "loss": 0.6167, "step": 1660 }, { "epoch": 0.1941464552238806, "grad_norm": 0.5451263614433529, "learning_rate": 4.749240852568981e-05, "loss": 0.5795, "step": 1665 }, { "epoch": 0.1947294776119403, "grad_norm": 0.5539737195153389, "learning_rate": 4.747246894212216e-05, "loss": 0.6156, "step": 1670 }, { "epoch": 0.1953125, "grad_norm": 0.4838135257147904, "learning_rate": 4.7452455112062076e-05, "loss": 0.586, "step": 1675 }, { "epoch": 0.1958955223880597, "grad_norm": 0.6206912586835625, "learning_rate": 4.743236710990982e-05, "loss": 0.5835, "step": 1680 }, { "epoch": 0.1964785447761194, "grad_norm": 0.5575344781989854, "learning_rate": 4.7412205010341385e-05, "loss": 0.5615, "step": 1685 }, { "epoch": 0.1970615671641791, "grad_norm": 0.5465760234872273, "learning_rate": 4.739196888830818e-05, "loss": 0.5614, "step": 1690 }, { "epoch": 0.19764458955223882, "grad_norm": 0.6116107229280592, "learning_rate": 4.737165881903683e-05, "loss": 0.5777, "step": 1695 }, { "epoch": 0.1982276119402985, "grad_norm": 0.526352835776055, "learning_rate": 4.735127487802882e-05, "loss": 0.5499, "step": 1700 }, { "epoch": 0.19881063432835822, "grad_norm": 0.5790768202270913, "learning_rate": 4.7330817141060284e-05, "loss": 0.6062, "step": 1705 }, { "epoch": 0.1993936567164179, "grad_norm": 0.5786125838532169, "learning_rate": 4.731028568418167e-05, "loss": 0.5853, "step": 1710 }, { "epoch": 0.1999766791044776, "grad_norm": 0.5220981929969758, "learning_rate": 4.728968058371746e-05, "loss": 0.5917, "step": 1715 }, { "epoch": 0.20055970149253732, "grad_norm": 0.5962771524705726, "learning_rate": 4.726900191626592e-05, "loss": 0.615, "step": 1720 }, { "epoch": 0.201142723880597, "grad_norm": 0.614179304629615, "learning_rate": 4.724824975869881e-05, "loss": 0.6088, "step": 1725 }, { "epoch": 0.20172574626865672, "grad_norm": 0.5339809717905718, "learning_rate": 4.722742418816106e-05, "loss": 0.5856, "step": 1730 }, { "epoch": 0.20230876865671643, "grad_norm": 0.5775559838523747, "learning_rate": 4.7206525282070514e-05, "loss": 0.5525, "step": 1735 }, { "epoch": 0.20289179104477612, "grad_norm": 0.5787127160424809, "learning_rate": 4.718555311811764e-05, "loss": 0.5889, "step": 1740 }, { "epoch": 0.20347481343283583, "grad_norm": 0.5507748006986873, "learning_rate": 4.716450777426525e-05, "loss": 0.5811, "step": 1745 }, { "epoch": 0.2040578358208955, "grad_norm": 0.5386322774377927, "learning_rate": 4.7143389328748174e-05, "loss": 0.5681, "step": 1750 }, { "epoch": 0.20464085820895522, "grad_norm": 0.5253847435139612, "learning_rate": 4.712219786007302e-05, "loss": 0.5352, "step": 1755 }, { "epoch": 0.20522388059701493, "grad_norm": 0.5229542272956548, "learning_rate": 4.710093344701782e-05, "loss": 0.5878, "step": 1760 }, { "epoch": 0.20580690298507462, "grad_norm": 0.5347624824538455, "learning_rate": 4.707959616863181e-05, "loss": 0.5622, "step": 1765 }, { "epoch": 0.20638992537313433, "grad_norm": 0.5561607185138324, "learning_rate": 4.7058186104235086e-05, "loss": 0.5797, "step": 1770 }, { "epoch": 0.20697294776119404, "grad_norm": 0.4892165988190017, "learning_rate": 4.70367033334183e-05, "loss": 0.5403, "step": 1775 }, { "epoch": 0.20755597014925373, "grad_norm": 0.5510594553982379, "learning_rate": 4.701514793604242e-05, "loss": 0.5559, "step": 1780 }, { "epoch": 0.20813899253731344, "grad_norm": 0.5473696320978395, "learning_rate": 4.699351999223838e-05, "loss": 0.5753, "step": 1785 }, { "epoch": 0.20872201492537312, "grad_norm": 0.4873353162223706, "learning_rate": 4.697181958240679e-05, "loss": 0.5492, "step": 1790 }, { "epoch": 0.20930503731343283, "grad_norm": 0.5064007191967134, "learning_rate": 4.695004678721768e-05, "loss": 0.5858, "step": 1795 }, { "epoch": 0.20988805970149255, "grad_norm": 0.5667430033926578, "learning_rate": 4.692820168761014e-05, "loss": 0.6062, "step": 1800 }, { "epoch": 0.21047108208955223, "grad_norm": 0.5212510033253821, "learning_rate": 4.690628436479206e-05, "loss": 0.5598, "step": 1805 }, { "epoch": 0.21105410447761194, "grad_norm": 0.5588622002334643, "learning_rate": 4.688429490023982e-05, "loss": 0.5763, "step": 1810 }, { "epoch": 0.21163712686567165, "grad_norm": 0.6512372849264118, "learning_rate": 4.6862233375697964e-05, "loss": 0.5808, "step": 1815 }, { "epoch": 0.21222014925373134, "grad_norm": 0.600056817158455, "learning_rate": 4.684009987317894e-05, "loss": 0.5929, "step": 1820 }, { "epoch": 0.21280317164179105, "grad_norm": 0.5525047925178632, "learning_rate": 4.6817894474962756e-05, "loss": 0.5598, "step": 1825 }, { "epoch": 0.21338619402985073, "grad_norm": 0.49741821795163615, "learning_rate": 4.679561726359668e-05, "loss": 0.5689, "step": 1830 }, { "epoch": 0.21396921641791045, "grad_norm": 0.5369544968462594, "learning_rate": 4.677326832189496e-05, "loss": 0.5846, "step": 1835 }, { "epoch": 0.21455223880597016, "grad_norm": 0.578124984252202, "learning_rate": 4.675084773293848e-05, "loss": 0.6068, "step": 1840 }, { "epoch": 0.21513526119402984, "grad_norm": 0.5489389574421355, "learning_rate": 4.6728355580074476e-05, "loss": 0.5799, "step": 1845 }, { "epoch": 0.21571828358208955, "grad_norm": 0.5492579167575462, "learning_rate": 4.6705791946916236e-05, "loss": 0.5969, "step": 1850 }, { "epoch": 0.21630130597014927, "grad_norm": 0.4976854096096666, "learning_rate": 4.6683156917342726e-05, "loss": 0.6034, "step": 1855 }, { "epoch": 0.21688432835820895, "grad_norm": 0.600940601658767, "learning_rate": 4.666045057549838e-05, "loss": 0.5946, "step": 1860 }, { "epoch": 0.21746735074626866, "grad_norm": 0.5506775932621145, "learning_rate": 4.663767300579268e-05, "loss": 0.5847, "step": 1865 }, { "epoch": 0.21805037313432835, "grad_norm": 0.5271315191050433, "learning_rate": 4.661482429289994e-05, "loss": 0.5662, "step": 1870 }, { "epoch": 0.21863339552238806, "grad_norm": 0.5806047125500812, "learning_rate": 4.659190452175891e-05, "loss": 0.5717, "step": 1875 }, { "epoch": 0.21921641791044777, "grad_norm": 0.5548551191672546, "learning_rate": 4.65689137775725e-05, "loss": 0.5918, "step": 1880 }, { "epoch": 0.21979944029850745, "grad_norm": 0.5224077396834346, "learning_rate": 4.654585214580749e-05, "loss": 0.5764, "step": 1885 }, { "epoch": 0.22038246268656717, "grad_norm": 0.595212892553765, "learning_rate": 4.652271971219412e-05, "loss": 0.5718, "step": 1890 }, { "epoch": 0.22096548507462688, "grad_norm": 0.5205351842902096, "learning_rate": 4.6499516562725906e-05, "loss": 0.5843, "step": 1895 }, { "epoch": 0.22154850746268656, "grad_norm": 0.5193771435539847, "learning_rate": 4.647624278365917e-05, "loss": 0.6024, "step": 1900 }, { "epoch": 0.22213152985074627, "grad_norm": 0.5762529929483823, "learning_rate": 4.6452898461512866e-05, "loss": 0.5841, "step": 1905 }, { "epoch": 0.22271455223880596, "grad_norm": 0.5613099807801801, "learning_rate": 4.642948368306814e-05, "loss": 0.5909, "step": 1910 }, { "epoch": 0.22329757462686567, "grad_norm": 0.5857098538355904, "learning_rate": 4.640599853536806e-05, "loss": 0.5986, "step": 1915 }, { "epoch": 0.22388059701492538, "grad_norm": 0.5346551057187519, "learning_rate": 4.6382443105717324e-05, "loss": 0.5651, "step": 1920 }, { "epoch": 0.22446361940298507, "grad_norm": 0.6133229780421945, "learning_rate": 4.635881748168184e-05, "loss": 0.6051, "step": 1925 }, { "epoch": 0.22504664179104478, "grad_norm": 0.564856310131765, "learning_rate": 4.633512175108851e-05, "loss": 0.5645, "step": 1930 }, { "epoch": 0.2256296641791045, "grad_norm": 0.6086776045173963, "learning_rate": 4.631135600202485e-05, "loss": 0.611, "step": 1935 }, { "epoch": 0.22621268656716417, "grad_norm": 0.5053374707380298, "learning_rate": 4.628752032283862e-05, "loss": 0.6359, "step": 1940 }, { "epoch": 0.22679570895522388, "grad_norm": 0.5177309796242289, "learning_rate": 4.626361480213759e-05, "loss": 0.5161, "step": 1945 }, { "epoch": 0.22737873134328357, "grad_norm": 0.5655297929571091, "learning_rate": 4.623963952878914e-05, "loss": 0.5215, "step": 1950 }, { "epoch": 0.22796175373134328, "grad_norm": 0.5007101086568317, "learning_rate": 4.621559459191996e-05, "loss": 0.5672, "step": 1955 }, { "epoch": 0.228544776119403, "grad_norm": 0.5604337094818254, "learning_rate": 4.619148008091569e-05, "loss": 0.5495, "step": 1960 }, { "epoch": 0.22912779850746268, "grad_norm": 0.5600231646159735, "learning_rate": 4.616729608542064e-05, "loss": 0.5914, "step": 1965 }, { "epoch": 0.2297108208955224, "grad_norm": 0.5261560439698072, "learning_rate": 4.61430426953374e-05, "loss": 0.5761, "step": 1970 }, { "epoch": 0.2302938432835821, "grad_norm": 0.5177160101117486, "learning_rate": 4.611872000082654e-05, "loss": 0.6105, "step": 1975 }, { "epoch": 0.23087686567164178, "grad_norm": 0.4991176586608243, "learning_rate": 4.609432809230627e-05, "loss": 0.5576, "step": 1980 }, { "epoch": 0.2314598880597015, "grad_norm": 0.49011496350766276, "learning_rate": 4.606986706045207e-05, "loss": 0.5217, "step": 1985 }, { "epoch": 0.23204291044776118, "grad_norm": 0.49863205095192953, "learning_rate": 4.604533699619643e-05, "loss": 0.5606, "step": 1990 }, { "epoch": 0.2326259328358209, "grad_norm": 0.5202522274733646, "learning_rate": 4.602073799072841e-05, "loss": 0.582, "step": 1995 }, { "epoch": 0.2332089552238806, "grad_norm": 0.5778738097193321, "learning_rate": 4.5996070135493426e-05, "loss": 0.5608, "step": 2000 }, { "epoch": 0.2337919776119403, "grad_norm": 0.5383730742466348, "learning_rate": 4.597133352219275e-05, "loss": 0.5833, "step": 2005 }, { "epoch": 0.234375, "grad_norm": 0.504768235114523, "learning_rate": 4.594652824278333e-05, "loss": 0.5428, "step": 2010 }, { "epoch": 0.2349580223880597, "grad_norm": 0.5424141377382832, "learning_rate": 4.592165438947734e-05, "loss": 0.5234, "step": 2015 }, { "epoch": 0.2355410447761194, "grad_norm": 0.560600079358499, "learning_rate": 4.589671205474189e-05, "loss": 0.5949, "step": 2020 }, { "epoch": 0.2361240671641791, "grad_norm": 0.5710842213852398, "learning_rate": 4.587170133129867e-05, "loss": 0.5675, "step": 2025 }, { "epoch": 0.23670708955223882, "grad_norm": 0.6954740937313674, "learning_rate": 4.5846622312123566e-05, "loss": 0.5593, "step": 2030 }, { "epoch": 0.2372901119402985, "grad_norm": 0.4936861843329207, "learning_rate": 4.582147509044639e-05, "loss": 0.544, "step": 2035 }, { "epoch": 0.23787313432835822, "grad_norm": 0.5990080756647657, "learning_rate": 4.579625975975047e-05, "loss": 0.6093, "step": 2040 }, { "epoch": 0.2384561567164179, "grad_norm": 0.46710498554704555, "learning_rate": 4.577097641377234e-05, "loss": 0.5421, "step": 2045 }, { "epoch": 0.2390391791044776, "grad_norm": 0.6220614730173019, "learning_rate": 4.574562514650137e-05, "loss": 0.5959, "step": 2050 }, { "epoch": 0.23962220149253732, "grad_norm": 0.5736687871170686, "learning_rate": 4.572020605217941e-05, "loss": 0.5705, "step": 2055 }, { "epoch": 0.240205223880597, "grad_norm": 0.5700891932767567, "learning_rate": 4.569471922530048e-05, "loss": 0.5812, "step": 2060 }, { "epoch": 0.24078824626865672, "grad_norm": 0.5076933159103915, "learning_rate": 4.566916476061036e-05, "loss": 0.565, "step": 2065 }, { "epoch": 0.24137126865671643, "grad_norm": 0.5485662358157128, "learning_rate": 4.56435427531063e-05, "loss": 0.576, "step": 2070 }, { "epoch": 0.24195429104477612, "grad_norm": 0.5699225728909032, "learning_rate": 4.5617853298036634e-05, "loss": 0.5984, "step": 2075 }, { "epoch": 0.24253731343283583, "grad_norm": 0.5970990880266768, "learning_rate": 4.559209649090039e-05, "loss": 0.5648, "step": 2080 }, { "epoch": 0.2431203358208955, "grad_norm": 0.5310811562750187, "learning_rate": 4.556627242744703e-05, "loss": 0.5616, "step": 2085 }, { "epoch": 0.24370335820895522, "grad_norm": 0.4892958829299151, "learning_rate": 4.5540381203675994e-05, "loss": 0.5867, "step": 2090 }, { "epoch": 0.24428638059701493, "grad_norm": 0.5227922779707119, "learning_rate": 4.55144229158364e-05, "loss": 0.5534, "step": 2095 }, { "epoch": 0.24486940298507462, "grad_norm": 0.5041942400636352, "learning_rate": 4.548839766042668e-05, "loss": 0.5371, "step": 2100 }, { "epoch": 0.24545242537313433, "grad_norm": 0.49660405961707166, "learning_rate": 4.5462305534194204e-05, "loss": 0.572, "step": 2105 }, { "epoch": 0.24603544776119404, "grad_norm": 0.5499042134726534, "learning_rate": 4.543614663413493e-05, "loss": 0.5611, "step": 2110 }, { "epoch": 0.24661847014925373, "grad_norm": 0.5496526828713953, "learning_rate": 4.5409921057493064e-05, "loss": 0.57, "step": 2115 }, { "epoch": 0.24720149253731344, "grad_norm": 0.5599675685862884, "learning_rate": 4.538362890176066e-05, "loss": 0.5618, "step": 2120 }, { "epoch": 0.24778451492537312, "grad_norm": 0.5579889614380124, "learning_rate": 4.535727026467727e-05, "loss": 0.5682, "step": 2125 }, { "epoch": 0.24836753731343283, "grad_norm": 0.5300253135611827, "learning_rate": 4.533084524422959e-05, "loss": 0.5828, "step": 2130 }, { "epoch": 0.24895055970149255, "grad_norm": 0.498730689430679, "learning_rate": 4.530435393865111e-05, "loss": 0.5535, "step": 2135 }, { "epoch": 0.24953358208955223, "grad_norm": 0.5177126292248756, "learning_rate": 4.527779644642172e-05, "loss": 0.5661, "step": 2140 }, { "epoch": 0.2501166044776119, "grad_norm": 0.5708740823647661, "learning_rate": 4.525117286626734e-05, "loss": 0.5764, "step": 2145 }, { "epoch": 0.25069962686567165, "grad_norm": 0.5326668265886716, "learning_rate": 4.522448329715959e-05, "loss": 0.562, "step": 2150 }, { "epoch": 0.25128264925373134, "grad_norm": 0.4878189090302149, "learning_rate": 4.51977278383154e-05, "loss": 0.5389, "step": 2155 }, { "epoch": 0.251865671641791, "grad_norm": 0.5459043435498905, "learning_rate": 4.517090658919662e-05, "loss": 0.5456, "step": 2160 }, { "epoch": 0.25244869402985076, "grad_norm": 0.5451507549234296, "learning_rate": 4.5144019649509694e-05, "loss": 0.5619, "step": 2165 }, { "epoch": 0.25303171641791045, "grad_norm": 0.48817546876328066, "learning_rate": 4.5117067119205256e-05, "loss": 0.5328, "step": 2170 }, { "epoch": 0.25361473880597013, "grad_norm": 0.5573481611505946, "learning_rate": 4.5090049098477756e-05, "loss": 0.6166, "step": 2175 }, { "epoch": 0.25419776119402987, "grad_norm": 0.46166230865153285, "learning_rate": 4.506296568776513e-05, "loss": 0.5603, "step": 2180 }, { "epoch": 0.25478078358208955, "grad_norm": 0.5255525351801149, "learning_rate": 4.503581698774838e-05, "loss": 0.5597, "step": 2185 }, { "epoch": 0.25536380597014924, "grad_norm": 0.5036713071359231, "learning_rate": 4.5008603099351235e-05, "loss": 0.5572, "step": 2190 }, { "epoch": 0.255946828358209, "grad_norm": 0.5629197395168302, "learning_rate": 4.498132412373972e-05, "loss": 0.5549, "step": 2195 }, { "epoch": 0.25652985074626866, "grad_norm": 0.5281653518122478, "learning_rate": 4.4953980162321845e-05, "loss": 0.5783, "step": 2200 }, { "epoch": 0.25711287313432835, "grad_norm": 0.52357818056316, "learning_rate": 4.492657131674722e-05, "loss": 0.5365, "step": 2205 }, { "epoch": 0.2576958955223881, "grad_norm": 0.5581473844002001, "learning_rate": 4.48990976889066e-05, "loss": 0.5314, "step": 2210 }, { "epoch": 0.25827891791044777, "grad_norm": 0.5463641738254288, "learning_rate": 4.487155938093163e-05, "loss": 0.5678, "step": 2215 }, { "epoch": 0.25886194029850745, "grad_norm": 0.580427123805623, "learning_rate": 4.484395649519435e-05, "loss": 0.5831, "step": 2220 }, { "epoch": 0.25944496268656714, "grad_norm": 0.5515157134010146, "learning_rate": 4.48162891343069e-05, "loss": 0.5916, "step": 2225 }, { "epoch": 0.2600279850746269, "grad_norm": 0.4614774802102389, "learning_rate": 4.478855740112107e-05, "loss": 0.5747, "step": 2230 }, { "epoch": 0.26061100746268656, "grad_norm": 0.5263544516058696, "learning_rate": 4.476076139872797e-05, "loss": 0.5873, "step": 2235 }, { "epoch": 0.26119402985074625, "grad_norm": 0.5279147571827948, "learning_rate": 4.473290123045764e-05, "loss": 0.5694, "step": 2240 }, { "epoch": 0.261777052238806, "grad_norm": 0.9742556257776407, "learning_rate": 4.470497699987861e-05, "loss": 0.5686, "step": 2245 }, { "epoch": 0.26236007462686567, "grad_norm": 0.5328912557304052, "learning_rate": 4.4676988810797596e-05, "loss": 0.5591, "step": 2250 }, { "epoch": 0.26294309701492535, "grad_norm": 0.5633803039126256, "learning_rate": 4.464893676725906e-05, "loss": 0.5728, "step": 2255 }, { "epoch": 0.2635261194029851, "grad_norm": 0.5134466833234131, "learning_rate": 4.4620820973544866e-05, "loss": 0.588, "step": 2260 }, { "epoch": 0.2641091417910448, "grad_norm": 0.5049188829803447, "learning_rate": 4.459264153417381e-05, "loss": 0.5973, "step": 2265 }, { "epoch": 0.26469216417910446, "grad_norm": 0.5404106159275316, "learning_rate": 4.4564398553901344e-05, "loss": 0.5788, "step": 2270 }, { "epoch": 0.2652751865671642, "grad_norm": 0.6037489755180414, "learning_rate": 4.4536092137719094e-05, "loss": 0.5935, "step": 2275 }, { "epoch": 0.2658582089552239, "grad_norm": 0.5161487013459114, "learning_rate": 4.450772239085452e-05, "loss": 0.5371, "step": 2280 }, { "epoch": 0.26644123134328357, "grad_norm": 0.5278496691940492, "learning_rate": 4.44792894187705e-05, "loss": 0.5234, "step": 2285 }, { "epoch": 0.2670242537313433, "grad_norm": 0.5092042449789793, "learning_rate": 4.445079332716497e-05, "loss": 0.5925, "step": 2290 }, { "epoch": 0.267607276119403, "grad_norm": 0.5223498221449892, "learning_rate": 4.4422234221970475e-05, "loss": 0.5629, "step": 2295 }, { "epoch": 0.2681902985074627, "grad_norm": 0.5456470351851613, "learning_rate": 4.439361220935385e-05, "loss": 0.5416, "step": 2300 }, { "epoch": 0.26877332089552236, "grad_norm": 0.5476526134142059, "learning_rate": 4.436492739571575e-05, "loss": 0.5499, "step": 2305 }, { "epoch": 0.2693563432835821, "grad_norm": 0.49505688936501147, "learning_rate": 4.433617988769031e-05, "loss": 0.5431, "step": 2310 }, { "epoch": 0.2699393656716418, "grad_norm": 0.5342065164923998, "learning_rate": 4.43073697921447e-05, "loss": 0.5693, "step": 2315 }, { "epoch": 0.27052238805970147, "grad_norm": 0.5145761250584094, "learning_rate": 4.4278497216178805e-05, "loss": 0.5795, "step": 2320 }, { "epoch": 0.2711054104477612, "grad_norm": 0.5273626768859015, "learning_rate": 4.4249562267124735e-05, "loss": 0.5549, "step": 2325 }, { "epoch": 0.2716884328358209, "grad_norm": 0.5079624559473638, "learning_rate": 4.422056505254648e-05, "loss": 0.5746, "step": 2330 }, { "epoch": 0.2722714552238806, "grad_norm": 0.505058134321932, "learning_rate": 4.4191505680239494e-05, "loss": 0.5565, "step": 2335 }, { "epoch": 0.2728544776119403, "grad_norm": 0.5668879894960839, "learning_rate": 4.416238425823031e-05, "loss": 0.5578, "step": 2340 }, { "epoch": 0.2734375, "grad_norm": 0.5141499926926323, "learning_rate": 4.413320089477612e-05, "loss": 0.5489, "step": 2345 }, { "epoch": 0.2740205223880597, "grad_norm": 0.5109670063026779, "learning_rate": 4.4103955698364394e-05, "loss": 0.5592, "step": 2350 }, { "epoch": 0.2746035447761194, "grad_norm": 0.5082096864806888, "learning_rate": 4.407464877771243e-05, "loss": 0.5213, "step": 2355 }, { "epoch": 0.2751865671641791, "grad_norm": 0.5083698199510284, "learning_rate": 4.4045280241767024e-05, "loss": 0.5493, "step": 2360 }, { "epoch": 0.2757695895522388, "grad_norm": 0.5053920381989545, "learning_rate": 4.401585019970397e-05, "loss": 0.5686, "step": 2365 }, { "epoch": 0.27635261194029853, "grad_norm": 0.4963501376761793, "learning_rate": 4.3986358760927774e-05, "loss": 0.5545, "step": 2370 }, { "epoch": 0.2769356343283582, "grad_norm": 0.5288831258581409, "learning_rate": 4.3956806035071123e-05, "loss": 0.5249, "step": 2375 }, { "epoch": 0.2775186567164179, "grad_norm": 0.62308836025263, "learning_rate": 4.392719213199457e-05, "loss": 0.6042, "step": 2380 }, { "epoch": 0.27810167910447764, "grad_norm": 0.4832296950151894, "learning_rate": 4.389751716178606e-05, "loss": 0.5633, "step": 2385 }, { "epoch": 0.2786847014925373, "grad_norm": 0.5237890773447569, "learning_rate": 4.386778123476059e-05, "loss": 0.5271, "step": 2390 }, { "epoch": 0.279267723880597, "grad_norm": 0.5595739424570599, "learning_rate": 4.383798446145973e-05, "loss": 0.6001, "step": 2395 }, { "epoch": 0.2798507462686567, "grad_norm": 0.5332931182325285, "learning_rate": 4.380812695265126e-05, "loss": 0.6024, "step": 2400 }, { "epoch": 0.28043376865671643, "grad_norm": 0.4918731480131771, "learning_rate": 4.3778208819328724e-05, "loss": 0.56, "step": 2405 }, { "epoch": 0.2810167910447761, "grad_norm": 0.4625310842353903, "learning_rate": 4.374823017271105e-05, "loss": 0.5161, "step": 2410 }, { "epoch": 0.2815998134328358, "grad_norm": 0.5070557648263391, "learning_rate": 4.371819112424212e-05, "loss": 0.518, "step": 2415 }, { "epoch": 0.28218283582089554, "grad_norm": 0.56213077270057, "learning_rate": 4.368809178559034e-05, "loss": 0.5645, "step": 2420 }, { "epoch": 0.2827658582089552, "grad_norm": 0.5027850751505164, "learning_rate": 4.365793226864825e-05, "loss": 0.5516, "step": 2425 }, { "epoch": 0.2833488805970149, "grad_norm": 0.496969778411097, "learning_rate": 4.3627712685532104e-05, "loss": 0.5661, "step": 2430 }, { "epoch": 0.28393190298507465, "grad_norm": 0.5203901533643535, "learning_rate": 4.3597433148581465e-05, "loss": 0.5564, "step": 2435 }, { "epoch": 0.28451492537313433, "grad_norm": 0.5761459191507305, "learning_rate": 4.3567093770358724e-05, "loss": 0.584, "step": 2440 }, { "epoch": 0.285097947761194, "grad_norm": 0.5440010534822834, "learning_rate": 4.353669466364877e-05, "loss": 0.5833, "step": 2445 }, { "epoch": 0.28568097014925375, "grad_norm": 0.5575628208489772, "learning_rate": 4.3506235941458516e-05, "loss": 0.5926, "step": 2450 }, { "epoch": 0.28626399253731344, "grad_norm": 0.500211988994469, "learning_rate": 4.347571771701648e-05, "loss": 0.5651, "step": 2455 }, { "epoch": 0.2868470149253731, "grad_norm": 0.46825863619458763, "learning_rate": 4.34451401037724e-05, "loss": 0.5461, "step": 2460 }, { "epoch": 0.28743003731343286, "grad_norm": 0.5521309621293441, "learning_rate": 4.3414503215396776e-05, "loss": 0.5659, "step": 2465 }, { "epoch": 0.28801305970149255, "grad_norm": 0.4609786323278, "learning_rate": 4.338380716578046e-05, "loss": 0.5487, "step": 2470 }, { "epoch": 0.28859608208955223, "grad_norm": 0.499738534899754, "learning_rate": 4.3353052069034214e-05, "loss": 0.5381, "step": 2475 }, { "epoch": 0.2891791044776119, "grad_norm": 0.5917418512742165, "learning_rate": 4.332223803948834e-05, "loss": 0.5434, "step": 2480 }, { "epoch": 0.28976212686567165, "grad_norm": 0.48180474136483686, "learning_rate": 4.3291365191692204e-05, "loss": 0.5734, "step": 2485 }, { "epoch": 0.29034514925373134, "grad_norm": 0.5007564237127459, "learning_rate": 4.326043364041381e-05, "loss": 0.5352, "step": 2490 }, { "epoch": 0.290928171641791, "grad_norm": 0.47349703509695523, "learning_rate": 4.3229443500639414e-05, "loss": 0.5553, "step": 2495 }, { "epoch": 0.29151119402985076, "grad_norm": 0.5192204895800268, "learning_rate": 4.319839488757305e-05, "loss": 0.5398, "step": 2500 }, { "epoch": 0.29209421641791045, "grad_norm": 0.5757316112230765, "learning_rate": 4.3167287916636145e-05, "loss": 0.5795, "step": 2505 }, { "epoch": 0.29267723880597013, "grad_norm": 0.5046004476804008, "learning_rate": 4.3136122703467045e-05, "loss": 0.5657, "step": 2510 }, { "epoch": 0.29326026119402987, "grad_norm": 0.6116298626095651, "learning_rate": 4.3104899363920616e-05, "loss": 0.5892, "step": 2515 }, { "epoch": 0.29384328358208955, "grad_norm": 0.5028293067243745, "learning_rate": 4.3073618014067824e-05, "loss": 0.5614, "step": 2520 }, { "epoch": 0.29442630597014924, "grad_norm": 0.5284740774719737, "learning_rate": 4.304227877019525e-05, "loss": 0.5509, "step": 2525 }, { "epoch": 0.295009328358209, "grad_norm": 0.5043452273361897, "learning_rate": 4.301088174880472e-05, "loss": 0.5734, "step": 2530 }, { "epoch": 0.29559235074626866, "grad_norm": 0.5491908726146885, "learning_rate": 4.297942706661283e-05, "loss": 0.5994, "step": 2535 }, { "epoch": 0.29617537313432835, "grad_norm": 0.5967635993887994, "learning_rate": 4.2947914840550544e-05, "loss": 0.5895, "step": 2540 }, { "epoch": 0.2967583955223881, "grad_norm": 0.49957976391073805, "learning_rate": 4.291634518776273e-05, "loss": 0.5559, "step": 2545 }, { "epoch": 0.29734141791044777, "grad_norm": 0.586452369328343, "learning_rate": 4.2884718225607736e-05, "loss": 0.5987, "step": 2550 }, { "epoch": 0.29792444029850745, "grad_norm": 0.5079777653661454, "learning_rate": 4.285303407165694e-05, "loss": 0.5892, "step": 2555 }, { "epoch": 0.29850746268656714, "grad_norm": 0.45945289033221137, "learning_rate": 4.282129284369436e-05, "loss": 0.5353, "step": 2560 }, { "epoch": 0.2990904850746269, "grad_norm": 0.47293762371540526, "learning_rate": 4.278949465971616e-05, "loss": 0.5451, "step": 2565 }, { "epoch": 0.29967350746268656, "grad_norm": 0.5125425687772994, "learning_rate": 4.2757639637930246e-05, "loss": 0.588, "step": 2570 }, { "epoch": 0.30025652985074625, "grad_norm": 0.560469657657478, "learning_rate": 4.2725727896755804e-05, "loss": 0.5353, "step": 2575 }, { "epoch": 0.300839552238806, "grad_norm": 0.5254788469907365, "learning_rate": 4.269375955482287e-05, "loss": 0.547, "step": 2580 }, { "epoch": 0.30142257462686567, "grad_norm": 0.48987507837663824, "learning_rate": 4.266173473097192e-05, "loss": 0.5175, "step": 2585 }, { "epoch": 0.30200559701492535, "grad_norm": 0.5484732726515896, "learning_rate": 4.262965354425335e-05, "loss": 0.6067, "step": 2590 }, { "epoch": 0.3025886194029851, "grad_norm": 0.5064490667857149, "learning_rate": 4.259751611392712e-05, "loss": 0.533, "step": 2595 }, { "epoch": 0.3031716417910448, "grad_norm": 0.5014575792637805, "learning_rate": 4.256532255946226e-05, "loss": 0.5913, "step": 2600 }, { "epoch": 0.30375466417910446, "grad_norm": 0.48901348419554475, "learning_rate": 4.253307300053643e-05, "loss": 0.5543, "step": 2605 }, { "epoch": 0.3043376865671642, "grad_norm": 0.5232458296317694, "learning_rate": 4.25007675570355e-05, "loss": 0.5759, "step": 2610 }, { "epoch": 0.3049207089552239, "grad_norm": 0.5070730983893377, "learning_rate": 4.246840634905307e-05, "loss": 0.5515, "step": 2615 }, { "epoch": 0.30550373134328357, "grad_norm": 0.4828637362156461, "learning_rate": 4.2435989496890054e-05, "loss": 0.5817, "step": 2620 }, { "epoch": 0.3060867537313433, "grad_norm": 0.45681007096402865, "learning_rate": 4.240351712105422e-05, "loss": 0.5538, "step": 2625 }, { "epoch": 0.306669776119403, "grad_norm": 0.5633919156991598, "learning_rate": 4.237098934225973e-05, "loss": 0.5619, "step": 2630 }, { "epoch": 0.3072527985074627, "grad_norm": 0.4744742163372956, "learning_rate": 4.233840628142672e-05, "loss": 0.5749, "step": 2635 }, { "epoch": 0.30783582089552236, "grad_norm": 0.47221088669237776, "learning_rate": 4.2305768059680806e-05, "loss": 0.5689, "step": 2640 }, { "epoch": 0.3084188432835821, "grad_norm": 0.4762515982934417, "learning_rate": 4.2273074798352706e-05, "loss": 0.5507, "step": 2645 }, { "epoch": 0.3090018656716418, "grad_norm": 0.4718217399644616, "learning_rate": 4.22403266189777e-05, "loss": 0.5483, "step": 2650 }, { "epoch": 0.30958488805970147, "grad_norm": 0.4671780211152603, "learning_rate": 4.2207523643295253e-05, "loss": 0.5463, "step": 2655 }, { "epoch": 0.3101679104477612, "grad_norm": 0.49004196187956206, "learning_rate": 4.2174665993248505e-05, "loss": 0.5474, "step": 2660 }, { "epoch": 0.3107509328358209, "grad_norm": 0.5092130863744895, "learning_rate": 4.214175379098388e-05, "loss": 0.5512, "step": 2665 }, { "epoch": 0.3113339552238806, "grad_norm": 0.4925159889445034, "learning_rate": 4.210878715885056e-05, "loss": 0.5609, "step": 2670 }, { "epoch": 0.3119169776119403, "grad_norm": 0.5438536521932689, "learning_rate": 4.2075766219400095e-05, "loss": 0.5949, "step": 2675 }, { "epoch": 0.3125, "grad_norm": 0.4985765329826545, "learning_rate": 4.20426910953859e-05, "loss": 0.5726, "step": 2680 }, { "epoch": 0.3130830223880597, "grad_norm": 0.5036063584187528, "learning_rate": 4.200956190976284e-05, "loss": 0.5255, "step": 2685 }, { "epoch": 0.3136660447761194, "grad_norm": 0.5634159268083127, "learning_rate": 4.1976378785686715e-05, "loss": 0.5254, "step": 2690 }, { "epoch": 0.3142490671641791, "grad_norm": 0.483875275656377, "learning_rate": 4.1943141846513886e-05, "loss": 0.5518, "step": 2695 }, { "epoch": 0.3148320895522388, "grad_norm": 0.5510683619678165, "learning_rate": 4.190985121580071e-05, "loss": 0.557, "step": 2700 }, { "epoch": 0.31541511194029853, "grad_norm": 0.45786711381238016, "learning_rate": 4.18765070173032e-05, "loss": 0.521, "step": 2705 }, { "epoch": 0.3159981343283582, "grad_norm": 0.47954794442661985, "learning_rate": 4.184310937497647e-05, "loss": 0.55, "step": 2710 }, { "epoch": 0.3165811567164179, "grad_norm": 0.5348827941441646, "learning_rate": 4.1809658412974314e-05, "loss": 0.5668, "step": 2715 }, { "epoch": 0.31716417910447764, "grad_norm": 0.5035567889348412, "learning_rate": 4.177615425564872e-05, "loss": 0.5584, "step": 2720 }, { "epoch": 0.3177472014925373, "grad_norm": 0.5014049010285576, "learning_rate": 4.174259702754947e-05, "loss": 0.5538, "step": 2725 }, { "epoch": 0.318330223880597, "grad_norm": 0.7108080618041146, "learning_rate": 4.17089868534236e-05, "loss": 0.593, "step": 2730 }, { "epoch": 0.3189132462686567, "grad_norm": 0.4901344322857948, "learning_rate": 4.1675323858214975e-05, "loss": 0.5504, "step": 2735 }, { "epoch": 0.31949626865671643, "grad_norm": 0.5455638212871261, "learning_rate": 4.164160816706383e-05, "loss": 0.5781, "step": 2740 }, { "epoch": 0.3200792910447761, "grad_norm": 0.4691865337288382, "learning_rate": 4.160783990530629e-05, "loss": 0.5349, "step": 2745 }, { "epoch": 0.3206623134328358, "grad_norm": 0.5397164117756076, "learning_rate": 4.157401919847389e-05, "loss": 0.5635, "step": 2750 }, { "epoch": 0.32124533582089554, "grad_norm": 0.49203133885978545, "learning_rate": 4.1540146172293154e-05, "loss": 0.553, "step": 2755 }, { "epoch": 0.3218283582089552, "grad_norm": 0.6956383718262041, "learning_rate": 4.150622095268508e-05, "loss": 0.5454, "step": 2760 }, { "epoch": 0.3224113805970149, "grad_norm": 0.5236479255481173, "learning_rate": 4.1472243665764715e-05, "loss": 0.546, "step": 2765 }, { "epoch": 0.32299440298507465, "grad_norm": 0.5226139094665005, "learning_rate": 4.1438214437840625e-05, "loss": 0.5685, "step": 2770 }, { "epoch": 0.32357742537313433, "grad_norm": 0.519532395995944, "learning_rate": 4.140413339541451e-05, "loss": 0.5621, "step": 2775 }, { "epoch": 0.324160447761194, "grad_norm": 0.5221117984856967, "learning_rate": 4.137000066518065e-05, "loss": 0.5945, "step": 2780 }, { "epoch": 0.32474347014925375, "grad_norm": 0.4662764439552845, "learning_rate": 4.13358163740255e-05, "loss": 0.5439, "step": 2785 }, { "epoch": 0.32532649253731344, "grad_norm": 0.4825709779338081, "learning_rate": 4.1301580649027154e-05, "loss": 0.5487, "step": 2790 }, { "epoch": 0.3259095149253731, "grad_norm": 0.49595525241615773, "learning_rate": 4.126729361745495e-05, "loss": 0.5456, "step": 2795 }, { "epoch": 0.32649253731343286, "grad_norm": 0.46259611525800753, "learning_rate": 4.1232955406768925e-05, "loss": 0.5385, "step": 2800 }, { "epoch": 0.32707555970149255, "grad_norm": 0.48140441921381905, "learning_rate": 4.119856614461938e-05, "loss": 0.5289, "step": 2805 }, { "epoch": 0.32765858208955223, "grad_norm": 0.4878844018028054, "learning_rate": 4.11641259588464e-05, "loss": 0.5698, "step": 2810 }, { "epoch": 0.3282416044776119, "grad_norm": 0.521050501093044, "learning_rate": 4.1129634977479375e-05, "loss": 0.5609, "step": 2815 }, { "epoch": 0.32882462686567165, "grad_norm": 0.49828928878222534, "learning_rate": 4.109509332873653e-05, "loss": 0.5421, "step": 2820 }, { "epoch": 0.32940764925373134, "grad_norm": 0.5526407883676059, "learning_rate": 4.106050114102443e-05, "loss": 0.5789, "step": 2825 }, { "epoch": 0.329990671641791, "grad_norm": 0.5516248967750637, "learning_rate": 4.102585854293751e-05, "loss": 0.526, "step": 2830 }, { "epoch": 0.33057369402985076, "grad_norm": 0.5461716849773546, "learning_rate": 4.0991165663257636e-05, "loss": 0.542, "step": 2835 }, { "epoch": 0.33115671641791045, "grad_norm": 0.4524230892035543, "learning_rate": 4.095642263095356e-05, "loss": 0.5429, "step": 2840 }, { "epoch": 0.33173973880597013, "grad_norm": 0.4907608846635581, "learning_rate": 4.0921629575180485e-05, "loss": 0.5536, "step": 2845 }, { "epoch": 0.33232276119402987, "grad_norm": 0.5284875026264199, "learning_rate": 4.088678662527959e-05, "loss": 0.6164, "step": 2850 }, { "epoch": 0.33290578358208955, "grad_norm": 0.48638662847344827, "learning_rate": 4.085189391077749e-05, "loss": 0.576, "step": 2855 }, { "epoch": 0.33348880597014924, "grad_norm": 0.5695349098133777, "learning_rate": 4.0816951561385836e-05, "loss": 0.5521, "step": 2860 }, { "epoch": 0.334071828358209, "grad_norm": 0.44821729017419637, "learning_rate": 4.078195970700079e-05, "loss": 0.526, "step": 2865 }, { "epoch": 0.33465485074626866, "grad_norm": 0.5289515772805433, "learning_rate": 4.074691847770251e-05, "loss": 0.5505, "step": 2870 }, { "epoch": 0.33523787313432835, "grad_norm": 0.5491964687999824, "learning_rate": 4.0711828003754764e-05, "loss": 0.5795, "step": 2875 }, { "epoch": 0.3358208955223881, "grad_norm": 0.5184535389920294, "learning_rate": 4.067668841560432e-05, "loss": 0.5864, "step": 2880 }, { "epoch": 0.33640391791044777, "grad_norm": 0.4640726695845034, "learning_rate": 4.064149984388057e-05, "loss": 0.5377, "step": 2885 }, { "epoch": 0.33698694029850745, "grad_norm": 0.5223555480526277, "learning_rate": 4.060626241939499e-05, "loss": 0.5528, "step": 2890 }, { "epoch": 0.33756996268656714, "grad_norm": 0.47444935336532806, "learning_rate": 4.057097627314063e-05, "loss": 0.5366, "step": 2895 }, { "epoch": 0.3381529850746269, "grad_norm": 0.49936360299242055, "learning_rate": 4.0535641536291725e-05, "loss": 0.5981, "step": 2900 }, { "epoch": 0.33873600746268656, "grad_norm": 0.49567180693590535, "learning_rate": 4.050025834020307e-05, "loss": 0.564, "step": 2905 }, { "epoch": 0.33931902985074625, "grad_norm": 0.48231305889253284, "learning_rate": 4.046482681640967e-05, "loss": 0.5559, "step": 2910 }, { "epoch": 0.339902052238806, "grad_norm": 0.5713231807843953, "learning_rate": 4.042934709662613e-05, "loss": 0.6046, "step": 2915 }, { "epoch": 0.34048507462686567, "grad_norm": 0.5599418550405636, "learning_rate": 4.039381931274626e-05, "loss": 0.5459, "step": 2920 }, { "epoch": 0.34106809701492535, "grad_norm": 0.5244470856538797, "learning_rate": 4.035824359684253e-05, "loss": 0.5559, "step": 2925 }, { "epoch": 0.3416511194029851, "grad_norm": 0.46218246493450177, "learning_rate": 4.032262008116559e-05, "loss": 0.5496, "step": 2930 }, { "epoch": 0.3422341417910448, "grad_norm": 0.5460308556475504, "learning_rate": 4.02869488981438e-05, "loss": 0.5726, "step": 2935 }, { "epoch": 0.34281716417910446, "grad_norm": 0.4623737344709334, "learning_rate": 4.025123018038271e-05, "loss": 0.5422, "step": 2940 }, { "epoch": 0.3434001865671642, "grad_norm": 0.5813565724181794, "learning_rate": 4.0215464060664564e-05, "loss": 0.5822, "step": 2945 }, { "epoch": 0.3439832089552239, "grad_norm": 0.49376149527368923, "learning_rate": 4.017965067194783e-05, "loss": 0.5821, "step": 2950 }, { "epoch": 0.34456623134328357, "grad_norm": 0.5169325234827248, "learning_rate": 4.0143790147366724e-05, "loss": 0.5482, "step": 2955 }, { "epoch": 0.3451492537313433, "grad_norm": 0.515737079500457, "learning_rate": 4.010788262023064e-05, "loss": 0.5709, "step": 2960 }, { "epoch": 0.345732276119403, "grad_norm": 0.5087208722494978, "learning_rate": 4.007192822402372e-05, "loss": 0.5565, "step": 2965 }, { "epoch": 0.3463152985074627, "grad_norm": 0.4796963642209258, "learning_rate": 4.003592709240438e-05, "loss": 0.5494, "step": 2970 }, { "epoch": 0.34689832089552236, "grad_norm": 0.49755643832004437, "learning_rate": 3.9999879359204676e-05, "loss": 0.5382, "step": 2975 }, { "epoch": 0.3474813432835821, "grad_norm": 0.4690304314754959, "learning_rate": 3.996378515843001e-05, "loss": 0.5334, "step": 2980 }, { "epoch": 0.3480643656716418, "grad_norm": 0.4772189221465707, "learning_rate": 3.9927644624258445e-05, "loss": 0.5902, "step": 2985 }, { "epoch": 0.34864738805970147, "grad_norm": 0.46404845790530064, "learning_rate": 3.989145789104033e-05, "loss": 0.5431, "step": 2990 }, { "epoch": 0.3492304104477612, "grad_norm": 0.5340362988246943, "learning_rate": 3.985522509329775e-05, "loss": 0.5486, "step": 2995 }, { "epoch": 0.3498134328358209, "grad_norm": 0.510771920504794, "learning_rate": 3.9818946365724004e-05, "loss": 0.5401, "step": 3000 }, { "epoch": 0.3503964552238806, "grad_norm": 0.49718158294490394, "learning_rate": 3.978262184318317e-05, "loss": 0.5626, "step": 3005 }, { "epoch": 0.3509794776119403, "grad_norm": 0.5158633729934479, "learning_rate": 3.974625166070953e-05, "loss": 0.5364, "step": 3010 }, { "epoch": 0.3515625, "grad_norm": 0.5204824836614376, "learning_rate": 3.970983595350714e-05, "loss": 0.5545, "step": 3015 }, { "epoch": 0.3521455223880597, "grad_norm": 0.506085761814899, "learning_rate": 3.967337485694929e-05, "loss": 0.5492, "step": 3020 }, { "epoch": 0.3527285447761194, "grad_norm": 0.5167191975821872, "learning_rate": 3.963686850657795e-05, "loss": 0.5326, "step": 3025 }, { "epoch": 0.3533115671641791, "grad_norm": 0.49528479007831655, "learning_rate": 3.9600317038103385e-05, "loss": 0.548, "step": 3030 }, { "epoch": 0.3538945895522388, "grad_norm": 0.514405428467145, "learning_rate": 3.956372058740354e-05, "loss": 0.5708, "step": 3035 }, { "epoch": 0.35447761194029853, "grad_norm": 0.4760653718744451, "learning_rate": 3.952707929052359e-05, "loss": 0.5385, "step": 3040 }, { "epoch": 0.3550606343283582, "grad_norm": 0.5086645138695323, "learning_rate": 3.9490393283675445e-05, "loss": 0.5425, "step": 3045 }, { "epoch": 0.3556436567164179, "grad_norm": 0.48138382467030305, "learning_rate": 3.9453662703237186e-05, "loss": 0.5599, "step": 3050 }, { "epoch": 0.35622667910447764, "grad_norm": 0.594129469541421, "learning_rate": 3.941688768575261e-05, "loss": 0.558, "step": 3055 }, { "epoch": 0.3568097014925373, "grad_norm": 0.4896479074572315, "learning_rate": 3.938006836793073e-05, "loss": 0.5399, "step": 3060 }, { "epoch": 0.357392723880597, "grad_norm": 0.5430860896504877, "learning_rate": 3.934320488664519e-05, "loss": 0.5614, "step": 3065 }, { "epoch": 0.3579757462686567, "grad_norm": 0.5171549846963465, "learning_rate": 3.9306297378933855e-05, "loss": 0.5338, "step": 3070 }, { "epoch": 0.35855876865671643, "grad_norm": 0.5030297563055794, "learning_rate": 3.926934598199824e-05, "loss": 0.5671, "step": 3075 }, { "epoch": 0.3591417910447761, "grad_norm": 0.4738604088361965, "learning_rate": 3.923235083320301e-05, "loss": 0.5204, "step": 3080 }, { "epoch": 0.3597248134328358, "grad_norm": 0.5081725848955316, "learning_rate": 3.919531207007548e-05, "loss": 0.5382, "step": 3085 }, { "epoch": 0.36030783582089554, "grad_norm": 0.5336505974453601, "learning_rate": 3.915822983030512e-05, "loss": 0.5498, "step": 3090 }, { "epoch": 0.3608908582089552, "grad_norm": 0.45862890034907244, "learning_rate": 3.912110425174296e-05, "loss": 0.5478, "step": 3095 }, { "epoch": 0.3614738805970149, "grad_norm": 0.5692817255847623, "learning_rate": 3.9083935472401214e-05, "loss": 0.5511, "step": 3100 }, { "epoch": 0.36205690298507465, "grad_norm": 0.5588630515105258, "learning_rate": 3.904672363045265e-05, "loss": 0.5713, "step": 3105 }, { "epoch": 0.36263992537313433, "grad_norm": 0.7882164929530883, "learning_rate": 3.900946886423012e-05, "loss": 0.5967, "step": 3110 }, { "epoch": 0.363222947761194, "grad_norm": 0.44828952211943274, "learning_rate": 3.897217131222606e-05, "loss": 0.5117, "step": 3115 }, { "epoch": 0.36380597014925375, "grad_norm": 0.492899081711114, "learning_rate": 3.893483111309196e-05, "loss": 0.5381, "step": 3120 }, { "epoch": 0.36438899253731344, "grad_norm": 0.46780156460670047, "learning_rate": 3.889744840563781e-05, "loss": 0.5561, "step": 3125 }, { "epoch": 0.3649720149253731, "grad_norm": 0.5062858013949219, "learning_rate": 3.886002332883169e-05, "loss": 0.5501, "step": 3130 }, { "epoch": 0.36555503731343286, "grad_norm": 0.4693101482730097, "learning_rate": 3.8822556021799114e-05, "loss": 0.5193, "step": 3135 }, { "epoch": 0.36613805970149255, "grad_norm": 0.5060844807583814, "learning_rate": 3.878504662382264e-05, "loss": 0.5532, "step": 3140 }, { "epoch": 0.36672108208955223, "grad_norm": 0.5201363699624885, "learning_rate": 3.8747495274341274e-05, "loss": 0.5845, "step": 3145 }, { "epoch": 0.3673041044776119, "grad_norm": 0.5073064897043211, "learning_rate": 3.870990211294997e-05, "loss": 0.5444, "step": 3150 }, { "epoch": 0.36788712686567165, "grad_norm": 0.5344016844623599, "learning_rate": 3.867226727939912e-05, "loss": 0.5606, "step": 3155 }, { "epoch": 0.36847014925373134, "grad_norm": 0.5139307938716214, "learning_rate": 3.863459091359401e-05, "loss": 0.5882, "step": 3160 }, { "epoch": 0.369053171641791, "grad_norm": 0.4425759816838131, "learning_rate": 3.8596873155594385e-05, "loss": 0.5202, "step": 3165 }, { "epoch": 0.36963619402985076, "grad_norm": 0.5742393732533151, "learning_rate": 3.855911414561378e-05, "loss": 0.5662, "step": 3170 }, { "epoch": 0.37021921641791045, "grad_norm": 0.44175421568947043, "learning_rate": 3.852131402401914e-05, "loss": 0.4854, "step": 3175 }, { "epoch": 0.37080223880597013, "grad_norm": 0.4984055964239209, "learning_rate": 3.848347293133021e-05, "loss": 0.573, "step": 3180 }, { "epoch": 0.37138526119402987, "grad_norm": 0.49757126890693293, "learning_rate": 3.844559100821906e-05, "loss": 0.5549, "step": 3185 }, { "epoch": 0.37196828358208955, "grad_norm": 0.57357388001981, "learning_rate": 3.8407668395509526e-05, "loss": 0.5574, "step": 3190 }, { "epoch": 0.37255130597014924, "grad_norm": 0.5019623696079913, "learning_rate": 3.8369705234176726e-05, "loss": 0.5894, "step": 3195 }, { "epoch": 0.373134328358209, "grad_norm": 0.4858155386150297, "learning_rate": 3.8331701665346495e-05, "loss": 0.5383, "step": 3200 }, { "epoch": 0.37371735074626866, "grad_norm": 0.5624113035875654, "learning_rate": 3.829365783029492e-05, "loss": 0.5585, "step": 3205 }, { "epoch": 0.37430037313432835, "grad_norm": 0.5052807550374429, "learning_rate": 3.8255573870447716e-05, "loss": 0.5439, "step": 3210 }, { "epoch": 0.3748833955223881, "grad_norm": 0.5085388458884429, "learning_rate": 3.82174499273798e-05, "loss": 0.565, "step": 3215 }, { "epoch": 0.37546641791044777, "grad_norm": 0.4989534046655282, "learning_rate": 3.817928614281471e-05, "loss": 0.5324, "step": 3220 }, { "epoch": 0.37604944029850745, "grad_norm": 0.8403204163889687, "learning_rate": 3.8141082658624106e-05, "loss": 0.5626, "step": 3225 }, { "epoch": 0.37663246268656714, "grad_norm": 0.5559098643501764, "learning_rate": 3.810283961682722e-05, "loss": 0.5583, "step": 3230 }, { "epoch": 0.3772154850746269, "grad_norm": 0.47644518540184416, "learning_rate": 3.806455715959032e-05, "loss": 0.5233, "step": 3235 }, { "epoch": 0.37779850746268656, "grad_norm": 0.4930068420607561, "learning_rate": 3.8026235429226236e-05, "loss": 0.5642, "step": 3240 }, { "epoch": 0.37838152985074625, "grad_norm": 0.5290426787630793, "learning_rate": 3.798787456819377e-05, "loss": 0.5234, "step": 3245 }, { "epoch": 0.378964552238806, "grad_norm": 0.4868258441116788, "learning_rate": 3.794947471909719e-05, "loss": 0.5395, "step": 3250 }, { "epoch": 0.37954757462686567, "grad_norm": 0.47174968576411624, "learning_rate": 3.791103602468569e-05, "loss": 0.5395, "step": 3255 }, { "epoch": 0.38013059701492535, "grad_norm": 0.5363107908185231, "learning_rate": 3.7872558627852905e-05, "loss": 0.5557, "step": 3260 }, { "epoch": 0.3807136194029851, "grad_norm": 0.49188998699814623, "learning_rate": 3.78340426716363e-05, "loss": 0.5268, "step": 3265 }, { "epoch": 0.3812966417910448, "grad_norm": 0.48927529818476645, "learning_rate": 3.779548829921673e-05, "loss": 0.5387, "step": 3270 }, { "epoch": 0.38187966417910446, "grad_norm": 0.47364382567752494, "learning_rate": 3.775689565391781e-05, "loss": 0.5129, "step": 3275 }, { "epoch": 0.3824626865671642, "grad_norm": 0.5126016691334612, "learning_rate": 3.771826487920546e-05, "loss": 0.5467, "step": 3280 }, { "epoch": 0.3830457089552239, "grad_norm": 0.5430085481644645, "learning_rate": 3.767959611868734e-05, "loss": 0.5536, "step": 3285 }, { "epoch": 0.38362873134328357, "grad_norm": 0.4763687695806715, "learning_rate": 3.764088951611233e-05, "loss": 0.5071, "step": 3290 }, { "epoch": 0.3842117537313433, "grad_norm": 0.47538986078328227, "learning_rate": 3.7602145215369965e-05, "loss": 0.5431, "step": 3295 }, { "epoch": 0.384794776119403, "grad_norm": 0.44993211618209894, "learning_rate": 3.756336336048994e-05, "loss": 0.5608, "step": 3300 }, { "epoch": 0.3853777985074627, "grad_norm": 0.4857510707261703, "learning_rate": 3.752454409564152e-05, "loss": 0.5307, "step": 3305 }, { "epoch": 0.38596082089552236, "grad_norm": 0.49584291918089757, "learning_rate": 3.74856875651331e-05, "loss": 0.6047, "step": 3310 }, { "epoch": 0.3865438432835821, "grad_norm": 0.5136523336548113, "learning_rate": 3.744679391341157e-05, "loss": 0.5516, "step": 3315 }, { "epoch": 0.3871268656716418, "grad_norm": 0.5181136974711912, "learning_rate": 3.740786328506179e-05, "loss": 0.5439, "step": 3320 }, { "epoch": 0.38770988805970147, "grad_norm": 0.5089785637081085, "learning_rate": 3.7368895824806146e-05, "loss": 0.5585, "step": 3325 }, { "epoch": 0.3882929104477612, "grad_norm": 0.49682305539963434, "learning_rate": 3.732989167750388e-05, "loss": 0.562, "step": 3330 }, { "epoch": 0.3888759328358209, "grad_norm": 0.5243718121493276, "learning_rate": 3.7290850988150644e-05, "loss": 0.5851, "step": 3335 }, { "epoch": 0.3894589552238806, "grad_norm": 0.5098618228761496, "learning_rate": 3.7251773901877945e-05, "loss": 0.5191, "step": 3340 }, { "epoch": 0.3900419776119403, "grad_norm": 0.47672882853039394, "learning_rate": 3.721266056395257e-05, "loss": 0.5284, "step": 3345 }, { "epoch": 0.390625, "grad_norm": 0.4822727732005445, "learning_rate": 3.7173511119776086e-05, "loss": 0.5794, "step": 3350 }, { "epoch": 0.3912080223880597, "grad_norm": 0.5222854783235708, "learning_rate": 3.713432571488427e-05, "loss": 0.5391, "step": 3355 }, { "epoch": 0.3917910447761194, "grad_norm": 7.9962975681807595, "learning_rate": 3.70951044949466e-05, "loss": 0.5534, "step": 3360 }, { "epoch": 0.3923740671641791, "grad_norm": 0.4764348433947033, "learning_rate": 3.705584760576566e-05, "loss": 0.5452, "step": 3365 }, { "epoch": 0.3929570895522388, "grad_norm": 0.49080831286781906, "learning_rate": 3.7016555193276667e-05, "loss": 0.5746, "step": 3370 }, { "epoch": 0.39354011194029853, "grad_norm": 0.5101400424865905, "learning_rate": 3.697722740354688e-05, "loss": 0.5729, "step": 3375 }, { "epoch": 0.3941231343283582, "grad_norm": 0.4479587001588852, "learning_rate": 3.6937864382775076e-05, "loss": 0.5447, "step": 3380 }, { "epoch": 0.3947061567164179, "grad_norm": 0.48466656620240023, "learning_rate": 3.689846627729098e-05, "loss": 0.5327, "step": 3385 }, { "epoch": 0.39528917910447764, "grad_norm": 0.5157423888893855, "learning_rate": 3.685903323355477e-05, "loss": 0.5381, "step": 3390 }, { "epoch": 0.3958722014925373, "grad_norm": 0.4820557574738787, "learning_rate": 3.68195653981565e-05, "loss": 0.5183, "step": 3395 }, { "epoch": 0.396455223880597, "grad_norm": 0.48067690178778916, "learning_rate": 3.678006291781555e-05, "loss": 0.6121, "step": 3400 }, { "epoch": 0.3970382462686567, "grad_norm": 0.48140274545401546, "learning_rate": 3.6740525939380084e-05, "loss": 0.5763, "step": 3405 }, { "epoch": 0.39762126865671643, "grad_norm": 0.4821460133421974, "learning_rate": 3.6700954609826535e-05, "loss": 0.5498, "step": 3410 }, { "epoch": 0.3982042910447761, "grad_norm": 0.5062442615898354, "learning_rate": 3.6661349076259015e-05, "loss": 0.5301, "step": 3415 }, { "epoch": 0.3987873134328358, "grad_norm": 0.4894842886999361, "learning_rate": 3.662170948590879e-05, "loss": 0.5647, "step": 3420 }, { "epoch": 0.39937033582089554, "grad_norm": 0.4635809485827499, "learning_rate": 3.658203598613375e-05, "loss": 0.5658, "step": 3425 }, { "epoch": 0.3999533582089552, "grad_norm": 0.4756142318658808, "learning_rate": 3.65423287244178e-05, "loss": 0.5434, "step": 3430 }, { "epoch": 0.4005363805970149, "grad_norm": 0.4768793799786962, "learning_rate": 3.6502587848370395e-05, "loss": 0.5129, "step": 3435 }, { "epoch": 0.40111940298507465, "grad_norm": 0.49735047427469553, "learning_rate": 3.6462813505725915e-05, "loss": 0.5693, "step": 3440 }, { "epoch": 0.40170242537313433, "grad_norm": 0.46768242382881264, "learning_rate": 3.642300584434319e-05, "loss": 0.5424, "step": 3445 }, { "epoch": 0.402285447761194, "grad_norm": 0.48929227885921284, "learning_rate": 3.638316501220487e-05, "loss": 0.5613, "step": 3450 }, { "epoch": 0.40286847014925375, "grad_norm": 0.49035092965063865, "learning_rate": 3.6343291157416937e-05, "loss": 0.5747, "step": 3455 }, { "epoch": 0.40345149253731344, "grad_norm": 0.4920873718095396, "learning_rate": 3.630338442820814e-05, "loss": 0.5621, "step": 3460 }, { "epoch": 0.4040345149253731, "grad_norm": 0.5721821448291733, "learning_rate": 3.6263444972929395e-05, "loss": 0.5642, "step": 3465 }, { "epoch": 0.40461753731343286, "grad_norm": 0.4900664650078509, "learning_rate": 3.622347294005334e-05, "loss": 0.5637, "step": 3470 }, { "epoch": 0.40520055970149255, "grad_norm": 0.4784540540109224, "learning_rate": 3.618346847817366e-05, "loss": 0.5166, "step": 3475 }, { "epoch": 0.40578358208955223, "grad_norm": 0.5033729998350225, "learning_rate": 3.6143431736004636e-05, "loss": 0.5604, "step": 3480 }, { "epoch": 0.4063666044776119, "grad_norm": 0.47667240088779256, "learning_rate": 3.610336286238051e-05, "loss": 0.5492, "step": 3485 }, { "epoch": 0.40694962686567165, "grad_norm": 0.4874513651352348, "learning_rate": 3.6063262006255006e-05, "loss": 0.5563, "step": 3490 }, { "epoch": 0.40753264925373134, "grad_norm": 0.523091113003475, "learning_rate": 3.602312931670073e-05, "loss": 0.5488, "step": 3495 }, { "epoch": 0.408115671641791, "grad_norm": 0.5531785395812051, "learning_rate": 3.59829649429086e-05, "loss": 0.6013, "step": 3500 }, { "epoch": 0.40869869402985076, "grad_norm": 0.47185275980717517, "learning_rate": 3.5942769034187354e-05, "loss": 0.536, "step": 3505 }, { "epoch": 0.40928171641791045, "grad_norm": 0.46288526087001597, "learning_rate": 3.590254173996295e-05, "loss": 0.5169, "step": 3510 }, { "epoch": 0.40986473880597013, "grad_norm": 0.48524872738357405, "learning_rate": 3.586228320977801e-05, "loss": 0.5271, "step": 3515 }, { "epoch": 0.41044776119402987, "grad_norm": 0.5450240726382458, "learning_rate": 3.582199359329129e-05, "loss": 0.5427, "step": 3520 }, { "epoch": 0.41103078358208955, "grad_norm": 0.48693873450848696, "learning_rate": 3.5781673040277084e-05, "loss": 0.5088, "step": 3525 }, { "epoch": 0.41161380597014924, "grad_norm": 0.46864955154362703, "learning_rate": 3.5741321700624726e-05, "loss": 0.5174, "step": 3530 }, { "epoch": 0.412196828358209, "grad_norm": 0.46728302060083887, "learning_rate": 3.570093972433794e-05, "loss": 0.5687, "step": 3535 }, { "epoch": 0.41277985074626866, "grad_norm": 0.4634797945645771, "learning_rate": 3.56605272615344e-05, "loss": 0.528, "step": 3540 }, { "epoch": 0.41336287313432835, "grad_norm": 0.5238785746405048, "learning_rate": 3.562008446244509e-05, "loss": 0.5375, "step": 3545 }, { "epoch": 0.4139458955223881, "grad_norm": 0.5187794803590449, "learning_rate": 3.557961147741376e-05, "loss": 0.5985, "step": 3550 }, { "epoch": 0.41452891791044777, "grad_norm": 0.4379680762591855, "learning_rate": 3.553910845689638e-05, "loss": 0.5554, "step": 3555 }, { "epoch": 0.41511194029850745, "grad_norm": 0.46577740202461215, "learning_rate": 3.549857555146056e-05, "loss": 0.5472, "step": 3560 }, { "epoch": 0.41569496268656714, "grad_norm": 0.4789932179858049, "learning_rate": 3.5458012911785036e-05, "loss": 0.5451, "step": 3565 }, { "epoch": 0.4162779850746269, "grad_norm": 0.49654652817675987, "learning_rate": 3.541742068865907e-05, "loss": 0.5513, "step": 3570 }, { "epoch": 0.41686100746268656, "grad_norm": 0.4692894243690009, "learning_rate": 3.537679903298187e-05, "loss": 0.5185, "step": 3575 }, { "epoch": 0.41744402985074625, "grad_norm": 0.47941339153241536, "learning_rate": 3.53361480957621e-05, "loss": 0.5692, "step": 3580 }, { "epoch": 0.418027052238806, "grad_norm": 0.46544865766435817, "learning_rate": 3.529546802811725e-05, "loss": 0.5405, "step": 3585 }, { "epoch": 0.41861007462686567, "grad_norm": 0.46403588392065775, "learning_rate": 3.5254758981273106e-05, "loss": 0.5437, "step": 3590 }, { "epoch": 0.41919309701492535, "grad_norm": 0.5179906749611947, "learning_rate": 3.521402110656318e-05, "loss": 0.5593, "step": 3595 }, { "epoch": 0.4197761194029851, "grad_norm": 0.4781025455029735, "learning_rate": 3.517325455542815e-05, "loss": 0.5498, "step": 3600 }, { "epoch": 0.4203591417910448, "grad_norm": 0.4934656687592932, "learning_rate": 3.513245947941531e-05, "loss": 0.5215, "step": 3605 }, { "epoch": 0.42094216417910446, "grad_norm": 0.5326144345367541, "learning_rate": 3.5091636030177995e-05, "loss": 0.535, "step": 3610 }, { "epoch": 0.4215251865671642, "grad_norm": 0.47801066829681754, "learning_rate": 3.505078435947498e-05, "loss": 0.5137, "step": 3615 }, { "epoch": 0.4221082089552239, "grad_norm": 0.5286215614880156, "learning_rate": 3.500990461916998e-05, "loss": 0.5733, "step": 3620 }, { "epoch": 0.42269123134328357, "grad_norm": 0.5073258841460547, "learning_rate": 3.496899696123107e-05, "loss": 0.5746, "step": 3625 }, { "epoch": 0.4232742537313433, "grad_norm": 0.48126213143188057, "learning_rate": 3.492806153773007e-05, "loss": 0.559, "step": 3630 }, { "epoch": 0.423857276119403, "grad_norm": 0.43727021104667335, "learning_rate": 3.488709850084206e-05, "loss": 0.4998, "step": 3635 }, { "epoch": 0.4244402985074627, "grad_norm": 0.4897298712066961, "learning_rate": 3.484610800284473e-05, "loss": 0.5463, "step": 3640 }, { "epoch": 0.42502332089552236, "grad_norm": 0.523356089677995, "learning_rate": 3.480509019611788e-05, "loss": 0.5659, "step": 3645 }, { "epoch": 0.4256063432835821, "grad_norm": 0.5277720573146358, "learning_rate": 3.476404523314282e-05, "loss": 0.5241, "step": 3650 }, { "epoch": 0.4261893656716418, "grad_norm": 0.526635621277189, "learning_rate": 3.472297326650183e-05, "loss": 0.5543, "step": 3655 }, { "epoch": 0.42677238805970147, "grad_norm": 0.45989723897065593, "learning_rate": 3.468187444887754e-05, "loss": 0.4939, "step": 3660 }, { "epoch": 0.4273554104477612, "grad_norm": 0.4622944623408051, "learning_rate": 3.464074893305242e-05, "loss": 0.5297, "step": 3665 }, { "epoch": 0.4279384328358209, "grad_norm": 0.4783006627278798, "learning_rate": 3.45995968719082e-05, "loss": 0.5255, "step": 3670 }, { "epoch": 0.4285214552238806, "grad_norm": 0.5059594374936409, "learning_rate": 3.455841841842524e-05, "loss": 0.5773, "step": 3675 }, { "epoch": 0.4291044776119403, "grad_norm": 0.44346639279558187, "learning_rate": 3.4517213725682085e-05, "loss": 0.5196, "step": 3680 }, { "epoch": 0.4296875, "grad_norm": 0.4742923141132099, "learning_rate": 3.447598294685476e-05, "loss": 0.5211, "step": 3685 }, { "epoch": 0.4302705223880597, "grad_norm": 0.5096994312042691, "learning_rate": 3.443472623521631e-05, "loss": 0.5576, "step": 3690 }, { "epoch": 0.4308535447761194, "grad_norm": 0.4757579513441239, "learning_rate": 3.4393443744136136e-05, "loss": 0.5342, "step": 3695 }, { "epoch": 0.4314365671641791, "grad_norm": 0.4525723387319913, "learning_rate": 3.435213562707953e-05, "loss": 0.521, "step": 3700 }, { "epoch": 0.4320195895522388, "grad_norm": 0.4655324458390628, "learning_rate": 3.431080203760699e-05, "loss": 0.5143, "step": 3705 }, { "epoch": 0.43260261194029853, "grad_norm": 0.46493164741995624, "learning_rate": 3.426944312937376e-05, "loss": 0.5448, "step": 3710 }, { "epoch": 0.4331856343283582, "grad_norm": 0.4705834455942846, "learning_rate": 3.422805905612914e-05, "loss": 0.5132, "step": 3715 }, { "epoch": 0.4337686567164179, "grad_norm": 0.44735510242287085, "learning_rate": 3.4186649971716044e-05, "loss": 0.5078, "step": 3720 }, { "epoch": 0.43435167910447764, "grad_norm": 0.4248977052828472, "learning_rate": 3.4145216030070344e-05, "loss": 0.5224, "step": 3725 }, { "epoch": 0.4349347014925373, "grad_norm": 0.48361810889787654, "learning_rate": 3.410375738522028e-05, "loss": 0.5696, "step": 3730 }, { "epoch": 0.435517723880597, "grad_norm": 0.5169029233248057, "learning_rate": 3.406227419128596e-05, "loss": 0.548, "step": 3735 }, { "epoch": 0.4361007462686567, "grad_norm": 0.47352743387761187, "learning_rate": 3.402076660247878e-05, "loss": 0.5735, "step": 3740 }, { "epoch": 0.43668376865671643, "grad_norm": 0.4598793206254637, "learning_rate": 3.397923477310074e-05, "loss": 0.5286, "step": 3745 }, { "epoch": 0.4372667910447761, "grad_norm": 0.7329855818387587, "learning_rate": 3.393767885754405e-05, "loss": 0.5274, "step": 3750 }, { "epoch": 0.4378498134328358, "grad_norm": 0.5374277209323949, "learning_rate": 3.389609901029038e-05, "loss": 0.569, "step": 3755 }, { "epoch": 0.43843283582089554, "grad_norm": 0.4851132196571775, "learning_rate": 3.38544953859104e-05, "loss": 0.5658, "step": 3760 }, { "epoch": 0.4390158582089552, "grad_norm": 0.4588569006808473, "learning_rate": 3.381286813906317e-05, "loss": 0.5463, "step": 3765 }, { "epoch": 0.4395988805970149, "grad_norm": 0.49195337073823053, "learning_rate": 3.3771217424495555e-05, "loss": 0.5855, "step": 3770 }, { "epoch": 0.44018190298507465, "grad_norm": 0.47991606249122304, "learning_rate": 3.372954339704167e-05, "loss": 0.5496, "step": 3775 }, { "epoch": 0.44076492537313433, "grad_norm": 0.4690325622228386, "learning_rate": 3.368784621162229e-05, "loss": 0.5647, "step": 3780 }, { "epoch": 0.441347947761194, "grad_norm": 0.5399827946397292, "learning_rate": 3.364612602324429e-05, "loss": 0.554, "step": 3785 }, { "epoch": 0.44193097014925375, "grad_norm": 0.44956816879532197, "learning_rate": 3.3604382987000016e-05, "loss": 0.5033, "step": 3790 }, { "epoch": 0.44251399253731344, "grad_norm": 0.49304141117786027, "learning_rate": 3.356261725806681e-05, "loss": 0.5651, "step": 3795 }, { "epoch": 0.4430970149253731, "grad_norm": 0.4984710533841555, "learning_rate": 3.352082899170631e-05, "loss": 0.5238, "step": 3800 }, { "epoch": 0.44368003731343286, "grad_norm": 0.5007565543332921, "learning_rate": 3.3479018343264e-05, "loss": 0.5519, "step": 3805 }, { "epoch": 0.44426305970149255, "grad_norm": 0.5011462730265147, "learning_rate": 3.343718546816852e-05, "loss": 0.5523, "step": 3810 }, { "epoch": 0.44484608208955223, "grad_norm": 0.49568146210324576, "learning_rate": 3.339533052193114e-05, "loss": 0.5132, "step": 3815 }, { "epoch": 0.4454291044776119, "grad_norm": 0.45124156599655735, "learning_rate": 3.335345366014522e-05, "loss": 0.55, "step": 3820 }, { "epoch": 0.44601212686567165, "grad_norm": 0.47223468640192207, "learning_rate": 3.331155503848553e-05, "loss": 0.5438, "step": 3825 }, { "epoch": 0.44659514925373134, "grad_norm": 0.5287373716087295, "learning_rate": 3.326963481270778e-05, "loss": 0.5411, "step": 3830 }, { "epoch": 0.447178171641791, "grad_norm": 0.44674599573765417, "learning_rate": 3.322769313864796e-05, "loss": 0.5363, "step": 3835 }, { "epoch": 0.44776119402985076, "grad_norm": 0.4434254783508246, "learning_rate": 3.3185730172221814e-05, "loss": 0.537, "step": 3840 }, { "epoch": 0.44834421641791045, "grad_norm": 0.5046794647102569, "learning_rate": 3.3143746069424215e-05, "loss": 0.5582, "step": 3845 }, { "epoch": 0.44892723880597013, "grad_norm": 0.489005919425733, "learning_rate": 3.310174098632865e-05, "loss": 0.5549, "step": 3850 }, { "epoch": 0.44951026119402987, "grad_norm": 0.49266732159431287, "learning_rate": 3.305971507908655e-05, "loss": 0.5313, "step": 3855 }, { "epoch": 0.45009328358208955, "grad_norm": 0.5315237555379242, "learning_rate": 3.301766850392681e-05, "loss": 0.5581, "step": 3860 }, { "epoch": 0.45067630597014924, "grad_norm": 0.46700902921771087, "learning_rate": 3.29756014171551e-05, "loss": 0.5371, "step": 3865 }, { "epoch": 0.451259328358209, "grad_norm": 0.4395133061533244, "learning_rate": 3.2933513975153384e-05, "loss": 0.5399, "step": 3870 }, { "epoch": 0.45184235074626866, "grad_norm": 0.485910728377908, "learning_rate": 3.2891406334379285e-05, "loss": 0.525, "step": 3875 }, { "epoch": 0.45242537313432835, "grad_norm": 0.46551066433925664, "learning_rate": 3.284927865136551e-05, "loss": 0.4913, "step": 3880 }, { "epoch": 0.4530083955223881, "grad_norm": 0.4792517888689515, "learning_rate": 3.280713108271926e-05, "loss": 0.5237, "step": 3885 }, { "epoch": 0.45359141791044777, "grad_norm": 0.44580552095045006, "learning_rate": 3.276496378512168e-05, "loss": 0.5716, "step": 3890 }, { "epoch": 0.45417444029850745, "grad_norm": 0.46092489328912967, "learning_rate": 3.272277691532725e-05, "loss": 0.5402, "step": 3895 }, { "epoch": 0.45475746268656714, "grad_norm": 0.4413897932753195, "learning_rate": 3.268057063016319e-05, "loss": 0.5305, "step": 3900 }, { "epoch": 0.4553404850746269, "grad_norm": 0.4782676785091174, "learning_rate": 3.263834508652894e-05, "loss": 0.5946, "step": 3905 }, { "epoch": 0.45592350746268656, "grad_norm": 0.5238873675452039, "learning_rate": 3.259610044139548e-05, "loss": 0.5478, "step": 3910 }, { "epoch": 0.45650652985074625, "grad_norm": 0.5182499166196407, "learning_rate": 3.255383685180484e-05, "loss": 0.5454, "step": 3915 }, { "epoch": 0.457089552238806, "grad_norm": 0.47638810180727925, "learning_rate": 3.251155447486945e-05, "loss": 0.5446, "step": 3920 }, { "epoch": 0.45767257462686567, "grad_norm": 0.4523229716036492, "learning_rate": 3.246925346777158e-05, "loss": 0.522, "step": 3925 }, { "epoch": 0.45825559701492535, "grad_norm": 0.46909930899902835, "learning_rate": 3.2426933987762785e-05, "loss": 0.5393, "step": 3930 }, { "epoch": 0.4588386194029851, "grad_norm": 0.43989917162010367, "learning_rate": 3.238459619216326e-05, "loss": 0.5211, "step": 3935 }, { "epoch": 0.4594216417910448, "grad_norm": 0.48026394452289217, "learning_rate": 3.23422402383613e-05, "loss": 0.5547, "step": 3940 }, { "epoch": 0.46000466417910446, "grad_norm": 0.48185947326085543, "learning_rate": 3.22998662838127e-05, "loss": 0.5041, "step": 3945 }, { "epoch": 0.4605876865671642, "grad_norm": 0.4734018707747314, "learning_rate": 3.2257474486040166e-05, "loss": 0.5038, "step": 3950 }, { "epoch": 0.4611707089552239, "grad_norm": 0.5075617095429634, "learning_rate": 3.221506500263276e-05, "loss": 0.5447, "step": 3955 }, { "epoch": 0.46175373134328357, "grad_norm": 0.5046083432543891, "learning_rate": 3.217263799124527e-05, "loss": 0.5772, "step": 3960 }, { "epoch": 0.4623367537313433, "grad_norm": 0.47069172866304554, "learning_rate": 3.213019360959762e-05, "loss": 0.5341, "step": 3965 }, { "epoch": 0.462919776119403, "grad_norm": 0.47866635232292687, "learning_rate": 3.2087732015474366e-05, "loss": 0.5208, "step": 3970 }, { "epoch": 0.4635027985074627, "grad_norm": 0.4516545061013375, "learning_rate": 3.204525336672399e-05, "loss": 0.5382, "step": 3975 }, { "epoch": 0.46408582089552236, "grad_norm": 0.4987443516763455, "learning_rate": 3.200275782125842e-05, "loss": 0.5319, "step": 3980 }, { "epoch": 0.4646688432835821, "grad_norm": 0.5260827157599703, "learning_rate": 3.196024553705235e-05, "loss": 0.5355, "step": 3985 }, { "epoch": 0.4652518656716418, "grad_norm": 0.4668030990105745, "learning_rate": 3.1917716672142746e-05, "loss": 0.5057, "step": 3990 }, { "epoch": 0.46583488805970147, "grad_norm": 0.4426983151393079, "learning_rate": 3.187517138462819e-05, "loss": 0.5254, "step": 3995 }, { "epoch": 0.4664179104477612, "grad_norm": 0.4925988353031197, "learning_rate": 3.1832609832668314e-05, "loss": 0.5422, "step": 4000 }, { "epoch": 0.4670009328358209, "grad_norm": 0.4560283471593519, "learning_rate": 3.179003217448321e-05, "loss": 0.5013, "step": 4005 }, { "epoch": 0.4675839552238806, "grad_norm": 0.5244233872415475, "learning_rate": 3.1747438568352844e-05, "loss": 0.5736, "step": 4010 }, { "epoch": 0.4681669776119403, "grad_norm": 0.5512150192012447, "learning_rate": 3.170482917261648e-05, "loss": 0.5688, "step": 4015 }, { "epoch": 0.46875, "grad_norm": 0.5009381195241619, "learning_rate": 3.166220414567206e-05, "loss": 0.5468, "step": 4020 }, { "epoch": 0.4693330223880597, "grad_norm": 0.47235216856933904, "learning_rate": 3.161956364597566e-05, "loss": 0.5318, "step": 4025 }, { "epoch": 0.4699160447761194, "grad_norm": 0.5634765857129355, "learning_rate": 3.1576907832040855e-05, "loss": 0.5188, "step": 4030 }, { "epoch": 0.4704990671641791, "grad_norm": 0.48072091370811076, "learning_rate": 3.153423686243813e-05, "loss": 0.5313, "step": 4035 }, { "epoch": 0.4710820895522388, "grad_norm": 0.5120155067954245, "learning_rate": 3.149155089579437e-05, "loss": 0.5572, "step": 4040 }, { "epoch": 0.47166511194029853, "grad_norm": 0.4812745114629162, "learning_rate": 3.144885009079215e-05, "loss": 0.5578, "step": 4045 }, { "epoch": 0.4722481343283582, "grad_norm": 0.5058486137109877, "learning_rate": 3.140613460616924e-05, "loss": 0.5199, "step": 4050 }, { "epoch": 0.4728311567164179, "grad_norm": 0.48002106325579913, "learning_rate": 3.1363404600717965e-05, "loss": 0.5659, "step": 4055 }, { "epoch": 0.47341417910447764, "grad_norm": 0.48245091505853244, "learning_rate": 3.132066023328465e-05, "loss": 0.533, "step": 4060 }, { "epoch": 0.4739972014925373, "grad_norm": 0.47536248109948226, "learning_rate": 3.1277901662768983e-05, "loss": 0.5433, "step": 4065 }, { "epoch": 0.474580223880597, "grad_norm": 0.47200232932636466, "learning_rate": 3.123512904812347e-05, "loss": 0.5322, "step": 4070 }, { "epoch": 0.4751632462686567, "grad_norm": 0.44740442637527467, "learning_rate": 3.119234254835282e-05, "loss": 0.5107, "step": 4075 }, { "epoch": 0.47574626865671643, "grad_norm": 0.46154208469718466, "learning_rate": 3.114954232251336e-05, "loss": 0.527, "step": 4080 }, { "epoch": 0.4763292910447761, "grad_norm": 0.5341929895695878, "learning_rate": 3.110672852971243e-05, "loss": 0.5689, "step": 4085 }, { "epoch": 0.4769123134328358, "grad_norm": 0.4652305557977976, "learning_rate": 3.1063901329107843e-05, "loss": 0.5167, "step": 4090 }, { "epoch": 0.47749533582089554, "grad_norm": 0.44417381469876005, "learning_rate": 3.10210608799072e-05, "loss": 0.5413, "step": 4095 }, { "epoch": 0.4780783582089552, "grad_norm": 0.4507749342993046, "learning_rate": 3.097820734136739e-05, "loss": 0.5282, "step": 4100 }, { "epoch": 0.4786613805970149, "grad_norm": 0.48297525105833106, "learning_rate": 3.093534087279397e-05, "loss": 0.5347, "step": 4105 }, { "epoch": 0.47924440298507465, "grad_norm": 0.5100316904667305, "learning_rate": 3.089246163354051e-05, "loss": 0.5395, "step": 4110 }, { "epoch": 0.47982742537313433, "grad_norm": 0.4995786265506999, "learning_rate": 3.084956978300812e-05, "loss": 0.5311, "step": 4115 }, { "epoch": 0.480410447761194, "grad_norm": 0.4471832602242875, "learning_rate": 3.080666548064475e-05, "loss": 0.5193, "step": 4120 }, { "epoch": 0.48099347014925375, "grad_norm": 0.46140397088213214, "learning_rate": 3.076374888594464e-05, "loss": 0.5345, "step": 4125 }, { "epoch": 0.48157649253731344, "grad_norm": 0.44371711691404125, "learning_rate": 3.0720820158447766e-05, "loss": 0.5072, "step": 4130 }, { "epoch": 0.4821595149253731, "grad_norm": 0.4561270995882789, "learning_rate": 3.067787945773915e-05, "loss": 0.5181, "step": 4135 }, { "epoch": 0.48274253731343286, "grad_norm": 0.4468599316613869, "learning_rate": 3.063492694344835e-05, "loss": 0.5286, "step": 4140 }, { "epoch": 0.48332555970149255, "grad_norm": 0.4921853335929309, "learning_rate": 3.059196277524886e-05, "loss": 0.5075, "step": 4145 }, { "epoch": 0.48390858208955223, "grad_norm": 0.47049020704612843, "learning_rate": 3.054898711285747e-05, "loss": 0.5475, "step": 4150 }, { "epoch": 0.4844916044776119, "grad_norm": 1.0060177708740357, "learning_rate": 3.05060001160337e-05, "loss": 0.5293, "step": 4155 }, { "epoch": 0.48507462686567165, "grad_norm": 0.48784465637028546, "learning_rate": 3.046300194457923e-05, "loss": 0.534, "step": 4160 }, { "epoch": 0.48565764925373134, "grad_norm": 0.4518879881598059, "learning_rate": 3.0419992758337235e-05, "loss": 0.5209, "step": 4165 }, { "epoch": 0.486240671641791, "grad_norm": 0.4452974474644216, "learning_rate": 3.0376972717191894e-05, "loss": 0.5245, "step": 4170 }, { "epoch": 0.48682369402985076, "grad_norm": 0.44833371464722693, "learning_rate": 3.0333941981067688e-05, "loss": 0.5156, "step": 4175 }, { "epoch": 0.48740671641791045, "grad_norm": 0.44934997583495073, "learning_rate": 3.029090070992889e-05, "loss": 0.5144, "step": 4180 }, { "epoch": 0.48798973880597013, "grad_norm": 0.47274610781733095, "learning_rate": 3.0247849063778917e-05, "loss": 0.5317, "step": 4185 }, { "epoch": 0.48857276119402987, "grad_norm": 0.47590787280797764, "learning_rate": 3.020478720265977e-05, "loss": 0.51, "step": 4190 }, { "epoch": 0.48915578358208955, "grad_norm": 0.44563102351746864, "learning_rate": 3.01617152866514e-05, "loss": 0.5301, "step": 4195 }, { "epoch": 0.48973880597014924, "grad_norm": 0.4916861483726837, "learning_rate": 3.0118633475871167e-05, "loss": 0.5772, "step": 4200 }, { "epoch": 0.490321828358209, "grad_norm": 0.4684037067244426, "learning_rate": 3.0075541930473183e-05, "loss": 0.4969, "step": 4205 }, { "epoch": 0.49090485074626866, "grad_norm": 0.4797812232043466, "learning_rate": 3.0032440810647783e-05, "loss": 0.5038, "step": 4210 }, { "epoch": 0.49148787313432835, "grad_norm": 0.4525162593921102, "learning_rate": 2.998933027662086e-05, "loss": 0.5266, "step": 4215 }, { "epoch": 0.4920708955223881, "grad_norm": 0.4822829175013609, "learning_rate": 2.9946210488653316e-05, "loss": 0.5243, "step": 4220 }, { "epoch": 0.49265391791044777, "grad_norm": 0.45841709338227005, "learning_rate": 2.9903081607040473e-05, "loss": 0.4948, "step": 4225 }, { "epoch": 0.49323694029850745, "grad_norm": 0.4465507453121582, "learning_rate": 2.9859943792111422e-05, "loss": 0.5228, "step": 4230 }, { "epoch": 0.49381996268656714, "grad_norm": 0.613185904602815, "learning_rate": 2.9816797204228497e-05, "loss": 0.5345, "step": 4235 }, { "epoch": 0.4944029850746269, "grad_norm": 0.49396745715444307, "learning_rate": 2.9773642003786627e-05, "loss": 0.5412, "step": 4240 }, { "epoch": 0.49498600746268656, "grad_norm": 0.5239548031082005, "learning_rate": 2.9730478351212754e-05, "loss": 0.5821, "step": 4245 }, { "epoch": 0.49556902985074625, "grad_norm": 0.4221114790983556, "learning_rate": 2.968730640696526e-05, "loss": 0.5199, "step": 4250 }, { "epoch": 0.496152052238806, "grad_norm": 0.480540611050451, "learning_rate": 2.9644126331533328e-05, "loss": 0.5149, "step": 4255 }, { "epoch": 0.49673507462686567, "grad_norm": 0.4972848047633413, "learning_rate": 2.9600938285436385e-05, "loss": 0.5155, "step": 4260 }, { "epoch": 0.49731809701492535, "grad_norm": 0.5100758433643319, "learning_rate": 2.9557742429223478e-05, "loss": 0.5495, "step": 4265 }, { "epoch": 0.4979011194029851, "grad_norm": 0.46169645444430685, "learning_rate": 2.95145389234727e-05, "loss": 0.5432, "step": 4270 }, { "epoch": 0.4984841417910448, "grad_norm": 0.478683421695544, "learning_rate": 2.947132792879056e-05, "loss": 0.5622, "step": 4275 }, { "epoch": 0.49906716417910446, "grad_norm": 0.4937574145856353, "learning_rate": 2.9428109605811427e-05, "loss": 0.5292, "step": 4280 }, { "epoch": 0.4996501865671642, "grad_norm": 0.4482984531417574, "learning_rate": 2.9384884115196898e-05, "loss": 0.5115, "step": 4285 }, { "epoch": 0.5002332089552238, "grad_norm": 0.4581506836353169, "learning_rate": 2.9341651617635236e-05, "loss": 0.5369, "step": 4290 }, { "epoch": 0.5008162313432836, "grad_norm": 0.4636121196389006, "learning_rate": 2.929841227384072e-05, "loss": 0.563, "step": 4295 }, { "epoch": 0.5013992537313433, "grad_norm": 0.49303476266346524, "learning_rate": 2.925516624455311e-05, "loss": 0.5323, "step": 4300 }, { "epoch": 0.5019822761194029, "grad_norm": 0.4792765198500162, "learning_rate": 2.9211913690537003e-05, "loss": 0.5238, "step": 4305 }, { "epoch": 0.5025652985074627, "grad_norm": 0.47595384636734767, "learning_rate": 2.9168654772581257e-05, "loss": 0.5487, "step": 4310 }, { "epoch": 0.5031483208955224, "grad_norm": 0.45475337905445856, "learning_rate": 2.9125389651498374e-05, "loss": 0.5551, "step": 4315 }, { "epoch": 0.503731343283582, "grad_norm": 0.45368396432436275, "learning_rate": 2.908211848812394e-05, "loss": 0.5096, "step": 4320 }, { "epoch": 0.5043143656716418, "grad_norm": 0.4720732964199861, "learning_rate": 2.903884144331598e-05, "loss": 0.5375, "step": 4325 }, { "epoch": 0.5048973880597015, "grad_norm": 0.4609507788137596, "learning_rate": 2.89955586779544e-05, "loss": 0.5193, "step": 4330 }, { "epoch": 0.5054804104477612, "grad_norm": 0.4437193884044816, "learning_rate": 2.8952270352940362e-05, "loss": 0.538, "step": 4335 }, { "epoch": 0.5060634328358209, "grad_norm": 0.4686721143893824, "learning_rate": 2.89089766291957e-05, "loss": 0.5359, "step": 4340 }, { "epoch": 0.5066464552238806, "grad_norm": 0.48522442554794026, "learning_rate": 2.886567766766231e-05, "loss": 0.5367, "step": 4345 }, { "epoch": 0.5072294776119403, "grad_norm": 0.4682334708596764, "learning_rate": 2.8822373629301573e-05, "loss": 0.4979, "step": 4350 }, { "epoch": 0.5078125, "grad_norm": 0.4461441009977717, "learning_rate": 2.8779064675093724e-05, "loss": 0.5473, "step": 4355 }, { "epoch": 0.5083955223880597, "grad_norm": 0.523425795341376, "learning_rate": 2.8735750966037295e-05, "loss": 0.592, "step": 4360 }, { "epoch": 0.5089785447761194, "grad_norm": 0.41208414865400855, "learning_rate": 2.869243266314847e-05, "loss": 0.5146, "step": 4365 }, { "epoch": 0.5095615671641791, "grad_norm": 0.45512638703063274, "learning_rate": 2.8649109927460533e-05, "loss": 0.517, "step": 4370 }, { "epoch": 0.5101445895522388, "grad_norm": 0.449015485615699, "learning_rate": 2.8605782920023227e-05, "loss": 0.5235, "step": 4375 }, { "epoch": 0.5107276119402985, "grad_norm": 0.49156178005481366, "learning_rate": 2.8562451801902197e-05, "loss": 0.5308, "step": 4380 }, { "epoch": 0.5113106343283582, "grad_norm": 0.4348710846671963, "learning_rate": 2.8519116734178336e-05, "loss": 0.5387, "step": 4385 }, { "epoch": 0.511893656716418, "grad_norm": 0.4902492973827986, "learning_rate": 2.8475777877947264e-05, "loss": 0.5417, "step": 4390 }, { "epoch": 0.5124766791044776, "grad_norm": 0.6547931036800456, "learning_rate": 2.843243539431863e-05, "loss": 0.5444, "step": 4395 }, { "epoch": 0.5130597014925373, "grad_norm": 0.4876654982418252, "learning_rate": 2.838908944441562e-05, "loss": 0.5558, "step": 4400 }, { "epoch": 0.5136427238805971, "grad_norm": 0.46099868761271035, "learning_rate": 2.834574018937428e-05, "loss": 0.5605, "step": 4405 }, { "epoch": 0.5142257462686567, "grad_norm": 0.4545024437386275, "learning_rate": 2.8302387790342943e-05, "loss": 0.5175, "step": 4410 }, { "epoch": 0.5148087686567164, "grad_norm": 0.4336617180340361, "learning_rate": 2.8259032408481635e-05, "loss": 0.5038, "step": 4415 }, { "epoch": 0.5153917910447762, "grad_norm": 0.47838755970386043, "learning_rate": 2.8215674204961462e-05, "loss": 0.5594, "step": 4420 }, { "epoch": 0.5159748134328358, "grad_norm": 0.5142813582002242, "learning_rate": 2.817231334096403e-05, "loss": 0.5177, "step": 4425 }, { "epoch": 0.5165578358208955, "grad_norm": 0.44541441218084665, "learning_rate": 2.812894997768083e-05, "loss": 0.5465, "step": 4430 }, { "epoch": 0.5171408582089553, "grad_norm": 0.4560004171104499, "learning_rate": 2.8085584276312644e-05, "loss": 0.5201, "step": 4435 }, { "epoch": 0.5177238805970149, "grad_norm": 0.47964290891517664, "learning_rate": 2.8042216398068942e-05, "loss": 0.5247, "step": 4440 }, { "epoch": 0.5183069029850746, "grad_norm": 0.8168752970003079, "learning_rate": 2.7998846504167308e-05, "loss": 0.5277, "step": 4445 }, { "epoch": 0.5188899253731343, "grad_norm": 2.544656980904975, "learning_rate": 2.7955474755832784e-05, "loss": 0.505, "step": 4450 }, { "epoch": 0.519472947761194, "grad_norm": 0.4659395065613913, "learning_rate": 2.7912101314297327e-05, "loss": 0.5269, "step": 4455 }, { "epoch": 0.5200559701492538, "grad_norm": 2.6969407750363943, "learning_rate": 2.7868726340799184e-05, "loss": 0.5306, "step": 4460 }, { "epoch": 0.5206389925373134, "grad_norm": 0.5270843633040919, "learning_rate": 2.7825349996582313e-05, "loss": 0.5711, "step": 4465 }, { "epoch": 0.5212220149253731, "grad_norm": 0.5091777799722457, "learning_rate": 2.7781972442895726e-05, "loss": 0.5365, "step": 4470 }, { "epoch": 0.5218050373134329, "grad_norm": 1.3533639882343287, "learning_rate": 2.7738593840992975e-05, "loss": 0.5539, "step": 4475 }, { "epoch": 0.5223880597014925, "grad_norm": 0.4547594298673975, "learning_rate": 2.769521435213149e-05, "loss": 0.544, "step": 4480 }, { "epoch": 0.5229710820895522, "grad_norm": 0.5510873748275549, "learning_rate": 2.7651834137572003e-05, "loss": 0.5353, "step": 4485 }, { "epoch": 0.523554104477612, "grad_norm": 0.4328373857127542, "learning_rate": 2.760845335857793e-05, "loss": 0.5132, "step": 4490 }, { "epoch": 0.5241371268656716, "grad_norm": 0.45053079578268007, "learning_rate": 2.7565072176414803e-05, "loss": 0.5231, "step": 4495 }, { "epoch": 0.5247201492537313, "grad_norm": 0.49741815715434656, "learning_rate": 2.7521690752349643e-05, "loss": 0.5281, "step": 4500 }, { "epoch": 0.5253031716417911, "grad_norm": 0.4607291755452952, "learning_rate": 2.7478309247650362e-05, "loss": 0.5558, "step": 4505 }, { "epoch": 0.5258861940298507, "grad_norm": 0.5194917261722904, "learning_rate": 2.7434927823585206e-05, "loss": 0.539, "step": 4510 }, { "epoch": 0.5264692164179104, "grad_norm": 0.45463126850370955, "learning_rate": 2.739154664142208e-05, "loss": 0.5215, "step": 4515 }, { "epoch": 0.5270522388059702, "grad_norm": 0.4892107161079775, "learning_rate": 2.7348165862428e-05, "loss": 0.4891, "step": 4520 }, { "epoch": 0.5276352611940298, "grad_norm": 0.5045457990411999, "learning_rate": 2.7304785647868507e-05, "loss": 0.5416, "step": 4525 }, { "epoch": 0.5282182835820896, "grad_norm": 0.44346437150982526, "learning_rate": 2.726140615900703e-05, "loss": 0.5048, "step": 4530 }, { "epoch": 0.5288013059701493, "grad_norm": 0.48666851854991494, "learning_rate": 2.7218027557104286e-05, "loss": 0.5034, "step": 4535 }, { "epoch": 0.5293843283582089, "grad_norm": 0.4611162809005146, "learning_rate": 2.7174650003417696e-05, "loss": 0.5142, "step": 4540 }, { "epoch": 0.5299673507462687, "grad_norm": 0.45514376590541833, "learning_rate": 2.7131273659200818e-05, "loss": 0.5162, "step": 4545 }, { "epoch": 0.5305503731343284, "grad_norm": 0.48960501435122616, "learning_rate": 2.7087898685702685e-05, "loss": 0.5069, "step": 4550 }, { "epoch": 0.531133395522388, "grad_norm": 0.4797862989191752, "learning_rate": 2.704452524416722e-05, "loss": 0.531, "step": 4555 }, { "epoch": 0.5317164179104478, "grad_norm": 0.5893011284468495, "learning_rate": 2.7001153495832697e-05, "loss": 0.5061, "step": 4560 }, { "epoch": 0.5322994402985075, "grad_norm": 0.4461810976300674, "learning_rate": 2.6957783601931063e-05, "loss": 0.5203, "step": 4565 }, { "epoch": 0.5328824626865671, "grad_norm": 0.49982880523004874, "learning_rate": 2.691441572368737e-05, "loss": 0.5284, "step": 4570 }, { "epoch": 0.5334654850746269, "grad_norm": 0.8194747676964631, "learning_rate": 2.6871050022319177e-05, "loss": 0.5642, "step": 4575 }, { "epoch": 0.5340485074626866, "grad_norm": 0.45941020004877126, "learning_rate": 2.6827686659035983e-05, "loss": 0.5444, "step": 4580 }, { "epoch": 0.5346315298507462, "grad_norm": 0.7401627959971693, "learning_rate": 2.678432579503855e-05, "loss": 0.5138, "step": 4585 }, { "epoch": 0.535214552238806, "grad_norm": 0.49796442033400656, "learning_rate": 2.6740967591518374e-05, "loss": 0.5264, "step": 4590 }, { "epoch": 0.5357975746268657, "grad_norm": 1.3853714172153995, "learning_rate": 2.6697612209657063e-05, "loss": 0.5375, "step": 4595 }, { "epoch": 0.5363805970149254, "grad_norm": 0.4543560797621967, "learning_rate": 2.665425981062573e-05, "loss": 0.5155, "step": 4600 }, { "epoch": 0.5369636194029851, "grad_norm": 0.484187959090983, "learning_rate": 2.6610910555584384e-05, "loss": 0.5478, "step": 4605 }, { "epoch": 0.5375466417910447, "grad_norm": 0.49626488411866204, "learning_rate": 2.6567564605681376e-05, "loss": 0.5352, "step": 4610 }, { "epoch": 0.5381296641791045, "grad_norm": 0.514181721385795, "learning_rate": 2.652422212205275e-05, "loss": 0.5387, "step": 4615 }, { "epoch": 0.5387126865671642, "grad_norm": 0.5832049097179766, "learning_rate": 2.6480883265821673e-05, "loss": 0.5355, "step": 4620 }, { "epoch": 0.5392957089552238, "grad_norm": 1.5120035406279293, "learning_rate": 2.643754819809781e-05, "loss": 0.5144, "step": 4625 }, { "epoch": 0.5398787313432836, "grad_norm": 0.49157920203901545, "learning_rate": 2.639421707997678e-05, "loss": 0.5176, "step": 4630 }, { "epoch": 0.5404617537313433, "grad_norm": 0.52902066701415, "learning_rate": 2.6350890072539476e-05, "loss": 0.5635, "step": 4635 }, { "epoch": 0.5410447761194029, "grad_norm": 0.4637572082859708, "learning_rate": 2.630756733685153e-05, "loss": 0.5346, "step": 4640 }, { "epoch": 0.5416277985074627, "grad_norm": 0.47595145214824397, "learning_rate": 2.6264249033962713e-05, "loss": 0.5249, "step": 4645 }, { "epoch": 0.5422108208955224, "grad_norm": 1.6764787888943493, "learning_rate": 2.622093532490628e-05, "loss": 0.5064, "step": 4650 }, { "epoch": 0.542793843283582, "grad_norm": 0.5788528459186878, "learning_rate": 2.6177626370698443e-05, "loss": 0.5051, "step": 4655 }, { "epoch": 0.5433768656716418, "grad_norm": 0.48162500018331245, "learning_rate": 2.6134322332337695e-05, "loss": 0.5043, "step": 4660 }, { "epoch": 0.5439598880597015, "grad_norm": 0.4737951941511267, "learning_rate": 2.6091023370804307e-05, "loss": 0.5142, "step": 4665 }, { "epoch": 0.5445429104477612, "grad_norm": 0.5575644816288676, "learning_rate": 2.604772964705965e-05, "loss": 0.5505, "step": 4670 }, { "epoch": 0.5451259328358209, "grad_norm": 0.4758372366326835, "learning_rate": 2.6004441322045603e-05, "loss": 0.5373, "step": 4675 }, { "epoch": 0.5457089552238806, "grad_norm": 0.508958909313539, "learning_rate": 2.596115855668403e-05, "loss": 0.5353, "step": 4680 }, { "epoch": 0.5462919776119403, "grad_norm": 0.4683786155826753, "learning_rate": 2.5917881511876073e-05, "loss": 0.4946, "step": 4685 }, { "epoch": 0.546875, "grad_norm": 0.5456678942382134, "learning_rate": 2.5874610348501632e-05, "loss": 0.5371, "step": 4690 }, { "epoch": 0.5474580223880597, "grad_norm": 0.4552682312653168, "learning_rate": 2.5831345227418752e-05, "loss": 0.5605, "step": 4695 }, { "epoch": 0.5480410447761194, "grad_norm": 0.4776001785979188, "learning_rate": 2.5788086309463006e-05, "loss": 0.5005, "step": 4700 }, { "epoch": 0.5486240671641791, "grad_norm": 0.5453229696717531, "learning_rate": 2.57448337554469e-05, "loss": 0.5194, "step": 4705 }, { "epoch": 0.5492070895522388, "grad_norm": 0.5387817919914101, "learning_rate": 2.570158772615928e-05, "loss": 0.5333, "step": 4710 }, { "epoch": 0.5497901119402985, "grad_norm": 0.48603015940285904, "learning_rate": 2.5658348382364773e-05, "loss": 0.5117, "step": 4715 }, { "epoch": 0.5503731343283582, "grad_norm": 0.4789950555392135, "learning_rate": 2.5615115884803108e-05, "loss": 0.5168, "step": 4720 }, { "epoch": 0.550956156716418, "grad_norm": 0.4593000342095894, "learning_rate": 2.557189039418858e-05, "loss": 0.5238, "step": 4725 }, { "epoch": 0.5515391791044776, "grad_norm": 0.4671404581672369, "learning_rate": 2.552867207120945e-05, "loss": 0.5336, "step": 4730 }, { "epoch": 0.5521222014925373, "grad_norm": 0.4867006683126969, "learning_rate": 2.5485461076527308e-05, "loss": 0.5432, "step": 4735 }, { "epoch": 0.5527052238805971, "grad_norm": 0.4603754841175435, "learning_rate": 2.5442257570776527e-05, "loss": 0.5464, "step": 4740 }, { "epoch": 0.5532882462686567, "grad_norm": 0.5322095390809611, "learning_rate": 2.539906171456362e-05, "loss": 0.5102, "step": 4745 }, { "epoch": 0.5538712686567164, "grad_norm": 0.4501063054653902, "learning_rate": 2.5355873668466677e-05, "loss": 0.526, "step": 4750 }, { "epoch": 0.5544542910447762, "grad_norm": 0.4645529985985766, "learning_rate": 2.5312693593034746e-05, "loss": 0.5035, "step": 4755 }, { "epoch": 0.5550373134328358, "grad_norm": 0.4764710401107973, "learning_rate": 2.5269521648787247e-05, "loss": 0.5458, "step": 4760 }, { "epoch": 0.5556203358208955, "grad_norm": 0.4534954950893274, "learning_rate": 2.5226357996213378e-05, "loss": 0.5199, "step": 4765 }, { "epoch": 0.5562033582089553, "grad_norm": 0.4810706685920549, "learning_rate": 2.518320279577151e-05, "loss": 0.5263, "step": 4770 }, { "epoch": 0.5567863805970149, "grad_norm": 0.4690472358261824, "learning_rate": 2.514005620788858e-05, "loss": 0.5245, "step": 4775 }, { "epoch": 0.5573694029850746, "grad_norm": 0.46880508544260313, "learning_rate": 2.5096918392959532e-05, "loss": 0.5232, "step": 4780 }, { "epoch": 0.5579524253731343, "grad_norm": 0.513925549118589, "learning_rate": 2.5053789511346693e-05, "loss": 0.5157, "step": 4785 }, { "epoch": 0.558535447761194, "grad_norm": 0.5043455996761089, "learning_rate": 2.5010669723379154e-05, "loss": 0.5128, "step": 4790 }, { "epoch": 0.5591184701492538, "grad_norm": 0.5924109641064016, "learning_rate": 2.4967559189352226e-05, "loss": 0.4921, "step": 4795 }, { "epoch": 0.5597014925373134, "grad_norm": 0.47567528373039897, "learning_rate": 2.492445806952682e-05, "loss": 0.5406, "step": 4800 }, { "epoch": 0.5602845149253731, "grad_norm": 0.4707447199114358, "learning_rate": 2.4881366524128845e-05, "loss": 0.5325, "step": 4805 }, { "epoch": 0.5608675373134329, "grad_norm": 0.444463840534227, "learning_rate": 2.4838284713348602e-05, "loss": 0.4945, "step": 4810 }, { "epoch": 0.5614505597014925, "grad_norm": 0.51359075031383, "learning_rate": 2.479521279734024e-05, "loss": 0.4986, "step": 4815 }, { "epoch": 0.5620335820895522, "grad_norm": 0.508318043421021, "learning_rate": 2.475215093622109e-05, "loss": 0.5362, "step": 4820 }, { "epoch": 0.562616604477612, "grad_norm": 0.4798125508084004, "learning_rate": 2.4709099290071126e-05, "loss": 0.498, "step": 4825 }, { "epoch": 0.5631996268656716, "grad_norm": 0.46383793437769893, "learning_rate": 2.4666058018932314e-05, "loss": 0.5254, "step": 4830 }, { "epoch": 0.5637826492537313, "grad_norm": 0.4825804953446005, "learning_rate": 2.4623027282808114e-05, "loss": 0.515, "step": 4835 }, { "epoch": 0.5643656716417911, "grad_norm": 0.49356516542621953, "learning_rate": 2.4580007241662773e-05, "loss": 0.4913, "step": 4840 }, { "epoch": 0.5649486940298507, "grad_norm": 0.46504399968658483, "learning_rate": 2.4536998055420783e-05, "loss": 0.5055, "step": 4845 }, { "epoch": 0.5655317164179104, "grad_norm": 0.5178266867135491, "learning_rate": 2.4493999883966308e-05, "loss": 0.5338, "step": 4850 }, { "epoch": 0.5661147388059702, "grad_norm": 0.443773396289914, "learning_rate": 2.445101288714254e-05, "loss": 0.527, "step": 4855 }, { "epoch": 0.5666977611940298, "grad_norm": 0.4361179409054666, "learning_rate": 2.440803722475114e-05, "loss": 0.5115, "step": 4860 }, { "epoch": 0.5672807835820896, "grad_norm": 0.4957031944579591, "learning_rate": 2.436507305655165e-05, "loss": 0.5224, "step": 4865 }, { "epoch": 0.5678638059701493, "grad_norm": 0.5179909459677171, "learning_rate": 2.4322120542260864e-05, "loss": 0.5026, "step": 4870 }, { "epoch": 0.5684468283582089, "grad_norm": 0.45931411628609564, "learning_rate": 2.4279179841552246e-05, "loss": 0.5492, "step": 4875 }, { "epoch": 0.5690298507462687, "grad_norm": 0.4718292947061309, "learning_rate": 2.4236251114055358e-05, "loss": 0.5485, "step": 4880 }, { "epoch": 0.5696128731343284, "grad_norm": 0.4509426009796579, "learning_rate": 2.419333451935526e-05, "loss": 0.5262, "step": 4885 }, { "epoch": 0.570195895522388, "grad_norm": 0.520970250021266, "learning_rate": 2.4150430216991888e-05, "loss": 0.54, "step": 4890 }, { "epoch": 0.5707789179104478, "grad_norm": 0.48125165415242044, "learning_rate": 2.4107538366459494e-05, "loss": 0.5535, "step": 4895 }, { "epoch": 0.5713619402985075, "grad_norm": 0.49924612989919914, "learning_rate": 2.406465912720604e-05, "loss": 0.5378, "step": 4900 }, { "epoch": 0.5719449626865671, "grad_norm": 0.4618022420362027, "learning_rate": 2.4021792658632612e-05, "loss": 0.5385, "step": 4905 }, { "epoch": 0.5725279850746269, "grad_norm": 0.43524447366727087, "learning_rate": 2.3978939120092814e-05, "loss": 0.5275, "step": 4910 }, { "epoch": 0.5731110074626866, "grad_norm": 0.5235831961960491, "learning_rate": 2.3936098670892165e-05, "loss": 0.5171, "step": 4915 }, { "epoch": 0.5736940298507462, "grad_norm": 0.47253982117527504, "learning_rate": 2.389327147028757e-05, "loss": 0.5149, "step": 4920 }, { "epoch": 0.574277052238806, "grad_norm": 0.536733235694404, "learning_rate": 2.3850457677486655e-05, "loss": 0.5385, "step": 4925 }, { "epoch": 0.5748600746268657, "grad_norm": 0.49487264913586265, "learning_rate": 2.380765745164718e-05, "loss": 0.5453, "step": 4930 }, { "epoch": 0.5754430970149254, "grad_norm": 0.4782990502193716, "learning_rate": 2.376487095187654e-05, "loss": 0.496, "step": 4935 }, { "epoch": 0.5760261194029851, "grad_norm": 1.5309625007086856, "learning_rate": 2.3722098337231025e-05, "loss": 0.507, "step": 4940 }, { "epoch": 0.5766091417910447, "grad_norm": 0.4155961423348815, "learning_rate": 2.3679339766715358e-05, "loss": 0.5219, "step": 4945 }, { "epoch": 0.5771921641791045, "grad_norm": 0.4914382394629777, "learning_rate": 2.363659539928204e-05, "loss": 0.5107, "step": 4950 }, { "epoch": 0.5777751865671642, "grad_norm": 0.4984691122251259, "learning_rate": 2.3593865393830766e-05, "loss": 0.5007, "step": 4955 }, { "epoch": 0.5783582089552238, "grad_norm": 0.504442137117383, "learning_rate": 2.355114990920786e-05, "loss": 0.5103, "step": 4960 }, { "epoch": 0.5789412313432836, "grad_norm": 0.774789963927665, "learning_rate": 2.3508449104205636e-05, "loss": 0.516, "step": 4965 }, { "epoch": 0.5795242537313433, "grad_norm": 0.49669070623006295, "learning_rate": 2.3465763137561875e-05, "loss": 0.4984, "step": 4970 }, { "epoch": 0.5801072761194029, "grad_norm": 0.4915699683156642, "learning_rate": 2.342309216795916e-05, "loss": 0.5237, "step": 4975 }, { "epoch": 0.5806902985074627, "grad_norm": 0.7919278813140391, "learning_rate": 2.3380436354024338e-05, "loss": 0.5327, "step": 4980 }, { "epoch": 0.5812733208955224, "grad_norm": 0.513367233603117, "learning_rate": 2.333779585432794e-05, "loss": 0.545, "step": 4985 }, { "epoch": 0.581856343283582, "grad_norm": 0.5185055850159029, "learning_rate": 2.329517082738353e-05, "loss": 0.5671, "step": 4990 }, { "epoch": 0.5824393656716418, "grad_norm": 0.4535336329617379, "learning_rate": 2.3252561431647158e-05, "loss": 0.5072, "step": 4995 }, { "epoch": 0.5830223880597015, "grad_norm": 0.4214639260752155, "learning_rate": 2.32099678255168e-05, "loss": 0.4765, "step": 5000 }, { "epoch": 0.5836054104477612, "grad_norm": 0.4292230083522436, "learning_rate": 2.316739016733169e-05, "loss": 0.535, "step": 5005 }, { "epoch": 0.5841884328358209, "grad_norm": 0.4500847118246923, "learning_rate": 2.3124828615371817e-05, "loss": 0.5579, "step": 5010 }, { "epoch": 0.5847714552238806, "grad_norm": 0.4441260617873408, "learning_rate": 2.3082283327857253e-05, "loss": 0.502, "step": 5015 }, { "epoch": 0.5853544776119403, "grad_norm": 0.4929193442519856, "learning_rate": 2.3039754462947653e-05, "loss": 0.5154, "step": 5020 }, { "epoch": 0.5859375, "grad_norm": 0.4604596804207028, "learning_rate": 2.2997242178741596e-05, "loss": 0.5173, "step": 5025 }, { "epoch": 0.5865205223880597, "grad_norm": 0.4671301724369313, "learning_rate": 2.2954746633276016e-05, "loss": 0.5022, "step": 5030 }, { "epoch": 0.5871035447761194, "grad_norm": 0.4666266550085784, "learning_rate": 2.2912267984525643e-05, "loss": 0.5263, "step": 5035 }, { "epoch": 0.5876865671641791, "grad_norm": 0.5663717691966537, "learning_rate": 2.2869806390402384e-05, "loss": 0.5126, "step": 5040 }, { "epoch": 0.5882695895522388, "grad_norm": 0.4709782228008594, "learning_rate": 2.2827362008754743e-05, "loss": 0.5374, "step": 5045 }, { "epoch": 0.5888526119402985, "grad_norm": 0.523861503593284, "learning_rate": 2.278493499736724e-05, "loss": 0.5313, "step": 5050 }, { "epoch": 0.5894356343283582, "grad_norm": 0.4797723190933745, "learning_rate": 2.2742525513959832e-05, "loss": 0.5382, "step": 5055 }, { "epoch": 0.590018656716418, "grad_norm": 0.43860453059040094, "learning_rate": 2.2700133716187316e-05, "loss": 0.5265, "step": 5060 }, { "epoch": 0.5906016791044776, "grad_norm": 0.4740764262851301, "learning_rate": 2.2657759761638707e-05, "loss": 0.5148, "step": 5065 }, { "epoch": 0.5911847014925373, "grad_norm": 0.48957608183212625, "learning_rate": 2.261540380783675e-05, "loss": 0.4952, "step": 5070 }, { "epoch": 0.5917677238805971, "grad_norm": 0.5162045189958194, "learning_rate": 2.257306601223722e-05, "loss": 0.5331, "step": 5075 }, { "epoch": 0.5923507462686567, "grad_norm": 0.5728273214706431, "learning_rate": 2.2530746532228413e-05, "loss": 0.5143, "step": 5080 }, { "epoch": 0.5929337686567164, "grad_norm": 0.44294658485783917, "learning_rate": 2.2488445525130557e-05, "loss": 0.5385, "step": 5085 }, { "epoch": 0.5935167910447762, "grad_norm": 0.4728473251887733, "learning_rate": 2.2446163148195164e-05, "loss": 0.5249, "step": 5090 }, { "epoch": 0.5940998134328358, "grad_norm": 0.42056687179230184, "learning_rate": 2.2403899558604525e-05, "loss": 0.4896, "step": 5095 }, { "epoch": 0.5946828358208955, "grad_norm": 0.42312832455359684, "learning_rate": 2.2361654913471065e-05, "loss": 0.4831, "step": 5100 }, { "epoch": 0.5952658582089553, "grad_norm": 0.4310382361089622, "learning_rate": 2.2319429369836815e-05, "loss": 0.5038, "step": 5105 }, { "epoch": 0.5958488805970149, "grad_norm": 0.43349085099461926, "learning_rate": 2.2277223084672765e-05, "loss": 0.5332, "step": 5110 }, { "epoch": 0.5964319029850746, "grad_norm": 0.4358686772263049, "learning_rate": 2.2235036214878325e-05, "loss": 0.4716, "step": 5115 }, { "epoch": 0.5970149253731343, "grad_norm": 0.4998061706788218, "learning_rate": 2.2192868917280745e-05, "loss": 0.5277, "step": 5120 }, { "epoch": 0.597597947761194, "grad_norm": 0.4569209342643407, "learning_rate": 2.21507213486345e-05, "loss": 0.4995, "step": 5125 }, { "epoch": 0.5981809701492538, "grad_norm": 0.45297207736047584, "learning_rate": 2.2108593665620724e-05, "loss": 0.5254, "step": 5130 }, { "epoch": 0.5987639925373134, "grad_norm": 0.49176995249756655, "learning_rate": 2.2066486024846615e-05, "loss": 0.5312, "step": 5135 }, { "epoch": 0.5993470149253731, "grad_norm": 0.45352488174846867, "learning_rate": 2.2024398582844906e-05, "loss": 0.5183, "step": 5140 }, { "epoch": 0.5999300373134329, "grad_norm": 0.4686989486366428, "learning_rate": 2.19823314960732e-05, "loss": 0.5337, "step": 5145 }, { "epoch": 0.6005130597014925, "grad_norm": 0.4989471659468385, "learning_rate": 2.1940284920913445e-05, "loss": 0.513, "step": 5150 }, { "epoch": 0.6010960820895522, "grad_norm": 0.7737032679878307, "learning_rate": 2.1898259013671357e-05, "loss": 0.4948, "step": 5155 }, { "epoch": 0.601679104477612, "grad_norm": 0.6456584381963478, "learning_rate": 2.1856253930575787e-05, "loss": 0.5506, "step": 5160 }, { "epoch": 0.6022621268656716, "grad_norm": 0.49420633212169496, "learning_rate": 2.18142698277782e-05, "loss": 0.534, "step": 5165 }, { "epoch": 0.6028451492537313, "grad_norm": 0.43079700140392413, "learning_rate": 2.1772306861352044e-05, "loss": 0.4994, "step": 5170 }, { "epoch": 0.6034281716417911, "grad_norm": 0.4477340131354866, "learning_rate": 2.1730365187292228e-05, "loss": 0.4895, "step": 5175 }, { "epoch": 0.6040111940298507, "grad_norm": 0.45974056791460993, "learning_rate": 2.1688444961514476e-05, "loss": 0.4963, "step": 5180 }, { "epoch": 0.6045942164179104, "grad_norm": 0.4463763393224647, "learning_rate": 2.1646546339854788e-05, "loss": 0.5139, "step": 5185 }, { "epoch": 0.6051772388059702, "grad_norm": 0.5136717833408538, "learning_rate": 2.1604669478068863e-05, "loss": 0.5195, "step": 5190 }, { "epoch": 0.6057602611940298, "grad_norm": 0.4554979479736964, "learning_rate": 2.1562814531831487e-05, "loss": 0.5062, "step": 5195 }, { "epoch": 0.6063432835820896, "grad_norm": 0.45967404846816146, "learning_rate": 2.1520981656736e-05, "loss": 0.4984, "step": 5200 }, { "epoch": 0.6069263059701493, "grad_norm": 0.44666913701763594, "learning_rate": 2.1479171008293686e-05, "loss": 0.4939, "step": 5205 }, { "epoch": 0.6075093283582089, "grad_norm": 1.738573008880425, "learning_rate": 2.1437382741933204e-05, "loss": 0.542, "step": 5210 }, { "epoch": 0.6080923507462687, "grad_norm": 0.45746766067753797, "learning_rate": 2.1395617012999993e-05, "loss": 0.5014, "step": 5215 }, { "epoch": 0.6086753731343284, "grad_norm": 0.4542305089948559, "learning_rate": 2.1353873976755716e-05, "loss": 0.4852, "step": 5220 }, { "epoch": 0.609258395522388, "grad_norm": 0.4734086638694241, "learning_rate": 2.131215378837771e-05, "loss": 0.5218, "step": 5225 }, { "epoch": 0.6098414179104478, "grad_norm": 0.4738565708182124, "learning_rate": 2.1270456602958332e-05, "loss": 0.5322, "step": 5230 }, { "epoch": 0.6104244402985075, "grad_norm": 0.49359869399689404, "learning_rate": 2.1228782575504447e-05, "loss": 0.537, "step": 5235 }, { "epoch": 0.6110074626865671, "grad_norm": 0.47548423763939335, "learning_rate": 2.1187131860936845e-05, "loss": 0.5183, "step": 5240 }, { "epoch": 0.6115904850746269, "grad_norm": 0.6813270131879138, "learning_rate": 2.114550461408961e-05, "loss": 0.5097, "step": 5245 }, { "epoch": 0.6121735074626866, "grad_norm": 0.4858514966177657, "learning_rate": 2.1103900989709623e-05, "loss": 0.5267, "step": 5250 }, { "epoch": 0.6127565298507462, "grad_norm": 0.46679943200730223, "learning_rate": 2.1062321142455953e-05, "loss": 0.5187, "step": 5255 }, { "epoch": 0.613339552238806, "grad_norm": 0.43680321948998807, "learning_rate": 2.1020765226899257e-05, "loss": 0.5072, "step": 5260 }, { "epoch": 0.6139225746268657, "grad_norm": 0.48765723203673156, "learning_rate": 2.0979233397521237e-05, "loss": 0.5172, "step": 5265 }, { "epoch": 0.6145055970149254, "grad_norm": 0.5421523270806327, "learning_rate": 2.0937725808714037e-05, "loss": 0.5173, "step": 5270 }, { "epoch": 0.6150886194029851, "grad_norm": 0.4821842656660482, "learning_rate": 2.089624261477973e-05, "loss": 0.522, "step": 5275 }, { "epoch": 0.6156716417910447, "grad_norm": 0.473993764265411, "learning_rate": 2.0854783969929668e-05, "loss": 0.5394, "step": 5280 }, { "epoch": 0.6162546641791045, "grad_norm": 0.4419301919043876, "learning_rate": 2.0813350028283958e-05, "loss": 0.5152, "step": 5285 }, { "epoch": 0.6168376865671642, "grad_norm": 0.4672525694912507, "learning_rate": 2.0771940943870866e-05, "loss": 0.5195, "step": 5290 }, { "epoch": 0.6174207089552238, "grad_norm": 0.4420478828415689, "learning_rate": 2.073055687062625e-05, "loss": 0.5131, "step": 5295 }, { "epoch": 0.6180037313432836, "grad_norm": 0.4626362183961214, "learning_rate": 2.0689197962393007e-05, "loss": 0.5162, "step": 5300 }, { "epoch": 0.6185867537313433, "grad_norm": 0.49784203270276733, "learning_rate": 2.0647864372920472e-05, "loss": 0.5279, "step": 5305 }, { "epoch": 0.6191697761194029, "grad_norm": 0.454384681268438, "learning_rate": 2.0606556255863862e-05, "loss": 0.5193, "step": 5310 }, { "epoch": 0.6197527985074627, "grad_norm": 0.549168830900785, "learning_rate": 2.05652737647837e-05, "loss": 0.508, "step": 5315 }, { "epoch": 0.6203358208955224, "grad_norm": 0.4790027663801494, "learning_rate": 2.0524017053145238e-05, "loss": 0.5009, "step": 5320 }, { "epoch": 0.620918843283582, "grad_norm": 0.4858133125717372, "learning_rate": 2.0482786274317923e-05, "loss": 0.5257, "step": 5325 }, { "epoch": 0.6215018656716418, "grad_norm": 0.4686160974815818, "learning_rate": 2.0441581581574765e-05, "loss": 0.5006, "step": 5330 }, { "epoch": 0.6220848880597015, "grad_norm": 0.4623666153609126, "learning_rate": 2.0400403128091812e-05, "loss": 0.5169, "step": 5335 }, { "epoch": 0.6226679104477612, "grad_norm": 0.45877847406013955, "learning_rate": 2.0359251066947583e-05, "loss": 0.5334, "step": 5340 }, { "epoch": 0.6232509328358209, "grad_norm": 0.4835869467134031, "learning_rate": 2.0318125551122468e-05, "loss": 0.4976, "step": 5345 }, { "epoch": 0.6238339552238806, "grad_norm": 0.4575233875191603, "learning_rate": 2.027702673349818e-05, "loss": 0.5078, "step": 5350 }, { "epoch": 0.6244169776119403, "grad_norm": 0.42998314282841227, "learning_rate": 2.023595476685718e-05, "loss": 0.5217, "step": 5355 }, { "epoch": 0.625, "grad_norm": 0.40746461264097383, "learning_rate": 2.0194909803882128e-05, "loss": 0.4901, "step": 5360 }, { "epoch": 0.6255830223880597, "grad_norm": 0.4597436174722755, "learning_rate": 2.0153891997155282e-05, "loss": 0.5494, "step": 5365 }, { "epoch": 0.6261660447761194, "grad_norm": 0.47770020364059385, "learning_rate": 2.011290149915795e-05, "loss": 0.5137, "step": 5370 }, { "epoch": 0.6267490671641791, "grad_norm": 0.4282249414588399, "learning_rate": 2.0071938462269936e-05, "loss": 0.4793, "step": 5375 }, { "epoch": 0.6273320895522388, "grad_norm": 0.4509595314631354, "learning_rate": 2.0031003038768942e-05, "loss": 0.498, "step": 5380 }, { "epoch": 0.6279151119402985, "grad_norm": 0.47714728709347026, "learning_rate": 1.999009538083003e-05, "loss": 0.545, "step": 5385 }, { "epoch": 0.6284981343283582, "grad_norm": 0.47392502530376346, "learning_rate": 1.994921564052503e-05, "loss": 0.5133, "step": 5390 }, { "epoch": 0.629081156716418, "grad_norm": 0.44759229419435465, "learning_rate": 1.990836396982202e-05, "loss": 0.5262, "step": 5395 }, { "epoch": 0.6296641791044776, "grad_norm": 0.4912367043364836, "learning_rate": 1.9867540520584693e-05, "loss": 0.5346, "step": 5400 }, { "epoch": 0.6302472014925373, "grad_norm": 0.4501000804982757, "learning_rate": 1.9826745444571853e-05, "loss": 0.5019, "step": 5405 }, { "epoch": 0.6308302238805971, "grad_norm": 0.48836277098183645, "learning_rate": 1.978597889343683e-05, "loss": 0.5289, "step": 5410 }, { "epoch": 0.6314132462686567, "grad_norm": 0.4625223996087579, "learning_rate": 1.97452410187269e-05, "loss": 0.4958, "step": 5415 }, { "epoch": 0.6319962686567164, "grad_norm": 0.48092056548731926, "learning_rate": 1.970453197188275e-05, "loss": 0.5015, "step": 5420 }, { "epoch": 0.6325792910447762, "grad_norm": 0.41113077299689604, "learning_rate": 1.9663851904237903e-05, "loss": 0.4989, "step": 5425 }, { "epoch": 0.6331623134328358, "grad_norm": 0.43567184997194336, "learning_rate": 1.9623200967018134e-05, "loss": 0.5066, "step": 5430 }, { "epoch": 0.6337453358208955, "grad_norm": 0.480598507617135, "learning_rate": 1.9582579311340943e-05, "loss": 0.5476, "step": 5435 }, { "epoch": 0.6343283582089553, "grad_norm": 0.4632420890807915, "learning_rate": 1.9541987088214963e-05, "loss": 0.514, "step": 5440 }, { "epoch": 0.6349113805970149, "grad_norm": 0.461886176881567, "learning_rate": 1.9501424448539445e-05, "loss": 0.4993, "step": 5445 }, { "epoch": 0.6354944029850746, "grad_norm": 0.4863359836422005, "learning_rate": 1.946089154310364e-05, "loss": 0.5349, "step": 5450 }, { "epoch": 0.6360774253731343, "grad_norm": 0.44233037285547316, "learning_rate": 1.9420388522586242e-05, "loss": 0.5036, "step": 5455 }, { "epoch": 0.636660447761194, "grad_norm": 0.4399036507814764, "learning_rate": 1.937991553755491e-05, "loss": 0.5198, "step": 5460 }, { "epoch": 0.6372434701492538, "grad_norm": 0.4644842288483705, "learning_rate": 1.9339472738465604e-05, "loss": 0.468, "step": 5465 }, { "epoch": 0.6378264925373134, "grad_norm": 0.5021801630911482, "learning_rate": 1.929906027566207e-05, "loss": 0.5046, "step": 5470 }, { "epoch": 0.6384095149253731, "grad_norm": 0.4906231561063412, "learning_rate": 1.9258678299375287e-05, "loss": 0.5637, "step": 5475 }, { "epoch": 0.6389925373134329, "grad_norm": 0.4411840007547938, "learning_rate": 1.9218326959722915e-05, "loss": 0.4941, "step": 5480 }, { "epoch": 0.6395755597014925, "grad_norm": 0.44492116681266264, "learning_rate": 1.9178006406708716e-05, "loss": 0.4999, "step": 5485 }, { "epoch": 0.6401585820895522, "grad_norm": 0.45899402350391216, "learning_rate": 1.913771679022199e-05, "loss": 0.5143, "step": 5490 }, { "epoch": 0.640741604477612, "grad_norm": 0.5002130360013761, "learning_rate": 1.9097458260037055e-05, "loss": 0.5223, "step": 5495 }, { "epoch": 0.6413246268656716, "grad_norm": 0.4445252627269499, "learning_rate": 1.9057230965812652e-05, "loss": 0.5277, "step": 5500 }, { "epoch": 0.6419076492537313, "grad_norm": 0.4327961966702548, "learning_rate": 1.901703505709141e-05, "loss": 0.4689, "step": 5505 }, { "epoch": 0.6424906716417911, "grad_norm": 0.47514425543756855, "learning_rate": 1.897687068329928e-05, "loss": 0.499, "step": 5510 }, { "epoch": 0.6430736940298507, "grad_norm": 0.4544747659785093, "learning_rate": 1.8936737993744996e-05, "loss": 0.5026, "step": 5515 }, { "epoch": 0.6436567164179104, "grad_norm": 0.45829680386820026, "learning_rate": 1.8896637137619495e-05, "loss": 0.5066, "step": 5520 }, { "epoch": 0.6442397388059702, "grad_norm": 0.42928587523633865, "learning_rate": 1.8856568263995373e-05, "loss": 0.4651, "step": 5525 }, { "epoch": 0.6448227611940298, "grad_norm": 0.4785710795086254, "learning_rate": 1.8816531521826346e-05, "loss": 0.5118, "step": 5530 }, { "epoch": 0.6454057835820896, "grad_norm": 0.44984484126810564, "learning_rate": 1.8776527059946676e-05, "loss": 0.4979, "step": 5535 }, { "epoch": 0.6459888059701493, "grad_norm": 0.5084605177342773, "learning_rate": 1.8736555027070607e-05, "loss": 0.513, "step": 5540 }, { "epoch": 0.6465718283582089, "grad_norm": 0.4587926020250396, "learning_rate": 1.8696615571791876e-05, "loss": 0.5056, "step": 5545 }, { "epoch": 0.6471548507462687, "grad_norm": 0.5265935639558714, "learning_rate": 1.865670884258307e-05, "loss": 0.5328, "step": 5550 }, { "epoch": 0.6477378731343284, "grad_norm": 0.48393501644987547, "learning_rate": 1.861683498779514e-05, "loss": 0.5409, "step": 5555 }, { "epoch": 0.648320895522388, "grad_norm": 0.42845129776042723, "learning_rate": 1.8576994155656814e-05, "loss": 0.5036, "step": 5560 }, { "epoch": 0.6489039179104478, "grad_norm": 0.45750371585274724, "learning_rate": 1.853718649427409e-05, "loss": 0.5458, "step": 5565 }, { "epoch": 0.6494869402985075, "grad_norm": 0.5071290594881412, "learning_rate": 1.8497412151629617e-05, "loss": 0.5574, "step": 5570 }, { "epoch": 0.6500699626865671, "grad_norm": 0.4193141376087725, "learning_rate": 1.8457671275582202e-05, "loss": 0.501, "step": 5575 }, { "epoch": 0.6506529850746269, "grad_norm": 0.5097091316108154, "learning_rate": 1.841796401386626e-05, "loss": 0.5424, "step": 5580 }, { "epoch": 0.6512360074626866, "grad_norm": 0.4574079642887348, "learning_rate": 1.8378290514091214e-05, "loss": 0.5143, "step": 5585 }, { "epoch": 0.6518190298507462, "grad_norm": 0.48232820582938274, "learning_rate": 1.8338650923740984e-05, "loss": 0.5068, "step": 5590 }, { "epoch": 0.652402052238806, "grad_norm": 0.4295823610973414, "learning_rate": 1.829904539017347e-05, "loss": 0.5076, "step": 5595 }, { "epoch": 0.6529850746268657, "grad_norm": 0.4893118320191642, "learning_rate": 1.8259474060619925e-05, "loss": 0.5219, "step": 5600 }, { "epoch": 0.6535680970149254, "grad_norm": 0.44539776731173597, "learning_rate": 1.8219937082184462e-05, "loss": 0.4935, "step": 5605 }, { "epoch": 0.6541511194029851, "grad_norm": 0.5129311723129513, "learning_rate": 1.8180434601843505e-05, "loss": 0.5179, "step": 5610 }, { "epoch": 0.6547341417910447, "grad_norm": 0.4864225115581459, "learning_rate": 1.8140966766445235e-05, "loss": 0.4969, "step": 5615 }, { "epoch": 0.6553171641791045, "grad_norm": 0.47007224857589525, "learning_rate": 1.8101533722709036e-05, "loss": 0.5005, "step": 5620 }, { "epoch": 0.6559001865671642, "grad_norm": 0.4597974424746557, "learning_rate": 1.8062135617224933e-05, "loss": 0.5294, "step": 5625 }, { "epoch": 0.6564832089552238, "grad_norm": 0.49917347832714665, "learning_rate": 1.802277259645313e-05, "loss": 0.5261, "step": 5630 }, { "epoch": 0.6570662313432836, "grad_norm": 0.46525260879023533, "learning_rate": 1.798344480672334e-05, "loss": 0.5119, "step": 5635 }, { "epoch": 0.6576492537313433, "grad_norm": 0.49319911982873976, "learning_rate": 1.7944152394234354e-05, "loss": 0.4884, "step": 5640 }, { "epoch": 0.6582322761194029, "grad_norm": 0.5132851806324479, "learning_rate": 1.7904895505053405e-05, "loss": 0.5158, "step": 5645 }, { "epoch": 0.6588152985074627, "grad_norm": 0.47569949694794605, "learning_rate": 1.7865674285115735e-05, "loss": 0.5121, "step": 5650 }, { "epoch": 0.6593983208955224, "grad_norm": 0.44552199263042797, "learning_rate": 1.7826488880223913e-05, "loss": 0.4761, "step": 5655 }, { "epoch": 0.659981343283582, "grad_norm": 0.47720149730201705, "learning_rate": 1.778733943604743e-05, "loss": 0.5124, "step": 5660 }, { "epoch": 0.6605643656716418, "grad_norm": 0.4432509370944976, "learning_rate": 1.774822609812205e-05, "loss": 0.4711, "step": 5665 }, { "epoch": 0.6611473880597015, "grad_norm": 0.4450343141290362, "learning_rate": 1.7709149011849364e-05, "loss": 0.4761, "step": 5670 }, { "epoch": 0.6617304104477612, "grad_norm": 0.4797782997827445, "learning_rate": 1.767010832249613e-05, "loss": 0.5175, "step": 5675 }, { "epoch": 0.6623134328358209, "grad_norm": 0.47463931278850086, "learning_rate": 1.7631104175193863e-05, "loss": 0.5208, "step": 5680 }, { "epoch": 0.6628964552238806, "grad_norm": 0.49271476148028914, "learning_rate": 1.7592136714938206e-05, "loss": 0.522, "step": 5685 }, { "epoch": 0.6634794776119403, "grad_norm": 0.4366587132398326, "learning_rate": 1.755320608658844e-05, "loss": 0.4709, "step": 5690 }, { "epoch": 0.6640625, "grad_norm": 0.44579614307494786, "learning_rate": 1.7514312434866904e-05, "loss": 0.5267, "step": 5695 }, { "epoch": 0.6646455223880597, "grad_norm": 0.4487339036982587, "learning_rate": 1.747545590435848e-05, "loss": 0.4991, "step": 5700 }, { "epoch": 0.6652285447761194, "grad_norm": 0.4383027709136693, "learning_rate": 1.7436636639510082e-05, "loss": 0.5141, "step": 5705 }, { "epoch": 0.6658115671641791, "grad_norm": 0.4458649709251083, "learning_rate": 1.739785478463004e-05, "loss": 0.4921, "step": 5710 }, { "epoch": 0.6663945895522388, "grad_norm": 0.49612990301681104, "learning_rate": 1.735911048388768e-05, "loss": 0.5081, "step": 5715 }, { "epoch": 0.6669776119402985, "grad_norm": 0.4717384137501139, "learning_rate": 1.7320403881312665e-05, "loss": 0.4909, "step": 5720 }, { "epoch": 0.6675606343283582, "grad_norm": 0.5029540121644192, "learning_rate": 1.7281735120794555e-05, "loss": 0.5439, "step": 5725 }, { "epoch": 0.668143656716418, "grad_norm": 0.4643667414061285, "learning_rate": 1.7243104346082194e-05, "loss": 0.4794, "step": 5730 }, { "epoch": 0.6687266791044776, "grad_norm": 0.47417180490760813, "learning_rate": 1.720451170078328e-05, "loss": 0.4996, "step": 5735 }, { "epoch": 0.6693097014925373, "grad_norm": 0.4814261614829366, "learning_rate": 1.7165957328363703e-05, "loss": 0.5027, "step": 5740 }, { "epoch": 0.6698927238805971, "grad_norm": 0.45627638279169563, "learning_rate": 1.71274413721471e-05, "loss": 0.513, "step": 5745 }, { "epoch": 0.6704757462686567, "grad_norm": 0.4334633223796316, "learning_rate": 1.708896397531431e-05, "loss": 0.4659, "step": 5750 }, { "epoch": 0.6710587686567164, "grad_norm": 0.4579206868255898, "learning_rate": 1.7050525280902824e-05, "loss": 0.5158, "step": 5755 }, { "epoch": 0.6716417910447762, "grad_norm": 0.4386284661436938, "learning_rate": 1.701212543180623e-05, "loss": 0.5206, "step": 5760 }, { "epoch": 0.6722248134328358, "grad_norm": 0.45478097582209165, "learning_rate": 1.6973764570773766e-05, "loss": 0.4958, "step": 5765 }, { "epoch": 0.6728078358208955, "grad_norm": 0.49448818817184126, "learning_rate": 1.693544284040968e-05, "loss": 0.532, "step": 5770 }, { "epoch": 0.6733908582089553, "grad_norm": 0.44392640275278356, "learning_rate": 1.6897160383172794e-05, "loss": 0.5102, "step": 5775 }, { "epoch": 0.6739738805970149, "grad_norm": 0.4681803651785721, "learning_rate": 1.6858917341375893e-05, "loss": 0.5033, "step": 5780 }, { "epoch": 0.6745569029850746, "grad_norm": 0.5012783536207814, "learning_rate": 1.6820713857185296e-05, "loss": 0.5261, "step": 5785 }, { "epoch": 0.6751399253731343, "grad_norm": 0.5021117339189587, "learning_rate": 1.6782550072620208e-05, "loss": 0.4987, "step": 5790 }, { "epoch": 0.675722947761194, "grad_norm": 0.48367815094839645, "learning_rate": 1.674442612955229e-05, "loss": 0.5231, "step": 5795 }, { "epoch": 0.6763059701492538, "grad_norm": 0.42292867586088734, "learning_rate": 1.6706342169705092e-05, "loss": 0.4833, "step": 5800 }, { "epoch": 0.6768889925373134, "grad_norm": 0.5498079373194198, "learning_rate": 1.6668298334653504e-05, "loss": 0.5303, "step": 5805 }, { "epoch": 0.6774720149253731, "grad_norm": 0.45613249161600566, "learning_rate": 1.663029476582328e-05, "loss": 0.4946, "step": 5810 }, { "epoch": 0.6780550373134329, "grad_norm": 0.4300584687753966, "learning_rate": 1.659233160449048e-05, "loss": 0.4835, "step": 5815 }, { "epoch": 0.6786380597014925, "grad_norm": 0.47672064051122565, "learning_rate": 1.6554408991780958e-05, "loss": 0.5239, "step": 5820 }, { "epoch": 0.6792210820895522, "grad_norm": 0.4363678330399839, "learning_rate": 1.65165270686698e-05, "loss": 0.5095, "step": 5825 }, { "epoch": 0.679804104477612, "grad_norm": 0.4386976710048956, "learning_rate": 1.6478685975980867e-05, "loss": 0.5037, "step": 5830 }, { "epoch": 0.6803871268656716, "grad_norm": 0.4713692579068845, "learning_rate": 1.6440885854386223e-05, "loss": 0.4919, "step": 5835 }, { "epoch": 0.6809701492537313, "grad_norm": 0.4590294577914819, "learning_rate": 1.6403126844405627e-05, "loss": 0.5001, "step": 5840 }, { "epoch": 0.6815531716417911, "grad_norm": 0.48010469655748705, "learning_rate": 1.6365409086405982e-05, "loss": 0.523, "step": 5845 }, { "epoch": 0.6821361940298507, "grad_norm": 0.4572102553531434, "learning_rate": 1.6327732720600893e-05, "loss": 0.4981, "step": 5850 }, { "epoch": 0.6827192164179104, "grad_norm": 0.42279317577956194, "learning_rate": 1.6290097887050037e-05, "loss": 0.4824, "step": 5855 }, { "epoch": 0.6833022388059702, "grad_norm": 0.4672338547985673, "learning_rate": 1.6252504725658738e-05, "loss": 0.5276, "step": 5860 }, { "epoch": 0.6838852611940298, "grad_norm": 0.5057683989918992, "learning_rate": 1.6214953376177355e-05, "loss": 0.5108, "step": 5865 }, { "epoch": 0.6844682835820896, "grad_norm": 0.4285897123749069, "learning_rate": 1.617744397820089e-05, "loss": 0.4943, "step": 5870 }, { "epoch": 0.6850513059701493, "grad_norm": 0.47280662245843785, "learning_rate": 1.613997667116832e-05, "loss": 0.5127, "step": 5875 }, { "epoch": 0.6856343283582089, "grad_norm": 0.467214604689162, "learning_rate": 1.610255159436219e-05, "loss": 0.4953, "step": 5880 }, { "epoch": 0.6862173507462687, "grad_norm": 0.4452219391184069, "learning_rate": 1.6065168886908046e-05, "loss": 0.4957, "step": 5885 }, { "epoch": 0.6868003731343284, "grad_norm": 0.44301870809154087, "learning_rate": 1.6027828687773947e-05, "loss": 0.4975, "step": 5890 }, { "epoch": 0.687383395522388, "grad_norm": 0.4401277833271277, "learning_rate": 1.5990531135769885e-05, "loss": 0.5195, "step": 5895 }, { "epoch": 0.6879664179104478, "grad_norm": 2.0394489198103662, "learning_rate": 1.5953276369547356e-05, "loss": 0.4956, "step": 5900 }, { "epoch": 0.6885494402985075, "grad_norm": 0.46836588645884025, "learning_rate": 1.591606452759879e-05, "loss": 0.5327, "step": 5905 }, { "epoch": 0.6891324626865671, "grad_norm": 0.44598757398103644, "learning_rate": 1.587889574825705e-05, "loss": 0.4764, "step": 5910 }, { "epoch": 0.6897154850746269, "grad_norm": 0.49742861746687556, "learning_rate": 1.5841770169694895e-05, "loss": 0.5111, "step": 5915 }, { "epoch": 0.6902985074626866, "grad_norm": 0.47331933429323847, "learning_rate": 1.5804687929924522e-05, "loss": 0.5047, "step": 5920 }, { "epoch": 0.6908815298507462, "grad_norm": 0.4652913554765252, "learning_rate": 1.5767649166796995e-05, "loss": 0.5038, "step": 5925 }, { "epoch": 0.691464552238806, "grad_norm": 0.453898586144349, "learning_rate": 1.573065401800176e-05, "loss": 0.4755, "step": 5930 }, { "epoch": 0.6920475746268657, "grad_norm": 0.43523535343423403, "learning_rate": 1.569370262106615e-05, "loss": 0.553, "step": 5935 }, { "epoch": 0.6926305970149254, "grad_norm": 0.46238104794971113, "learning_rate": 1.5656795113354816e-05, "loss": 0.4496, "step": 5940 }, { "epoch": 0.6932136194029851, "grad_norm": 0.4612926915904917, "learning_rate": 1.5619931632069284e-05, "loss": 0.5039, "step": 5945 }, { "epoch": 0.6937966417910447, "grad_norm": 0.46826494703709465, "learning_rate": 1.5583112314247386e-05, "loss": 0.5022, "step": 5950 }, { "epoch": 0.6943796641791045, "grad_norm": 0.47987543937588256, "learning_rate": 1.5546337296762826e-05, "loss": 0.5097, "step": 5955 }, { "epoch": 0.6949626865671642, "grad_norm": 0.43334758409327906, "learning_rate": 1.5509606716324563e-05, "loss": 0.4943, "step": 5960 }, { "epoch": 0.6955457089552238, "grad_norm": 0.4681763383562353, "learning_rate": 1.547292070947641e-05, "loss": 0.5063, "step": 5965 }, { "epoch": 0.6961287313432836, "grad_norm": 0.4924802920273831, "learning_rate": 1.5436279412596466e-05, "loss": 0.5047, "step": 5970 }, { "epoch": 0.6967117537313433, "grad_norm": 0.5347314807147506, "learning_rate": 1.5399682961896627e-05, "loss": 0.5254, "step": 5975 }, { "epoch": 0.6972947761194029, "grad_norm": 0.46161765208167127, "learning_rate": 1.5363131493422045e-05, "loss": 0.5527, "step": 5980 }, { "epoch": 0.6978777985074627, "grad_norm": 0.481592995924184, "learning_rate": 1.5326625143050717e-05, "loss": 0.5093, "step": 5985 }, { "epoch": 0.6984608208955224, "grad_norm": 0.43698502229598946, "learning_rate": 1.5290164046492855e-05, "loss": 0.5113, "step": 5990 }, { "epoch": 0.699043843283582, "grad_norm": 0.45737599000837936, "learning_rate": 1.5253748339290478e-05, "loss": 0.5072, "step": 5995 }, { "epoch": 0.6996268656716418, "grad_norm": 0.4632305738785457, "learning_rate": 1.5217378156816836e-05, "loss": 0.5092, "step": 6000 }, { "epoch": 0.7002098880597015, "grad_norm": 0.5658113543744521, "learning_rate": 1.5181053634276005e-05, "loss": 0.4714, "step": 6005 }, { "epoch": 0.7007929104477612, "grad_norm": 0.5024081912461719, "learning_rate": 1.5144774906702261e-05, "loss": 0.5587, "step": 6010 }, { "epoch": 0.7013759328358209, "grad_norm": 0.4346747521893763, "learning_rate": 1.5108542108959666e-05, "loss": 0.4874, "step": 6015 }, { "epoch": 0.7019589552238806, "grad_norm": 0.48751850242768197, "learning_rate": 1.5072355375741564e-05, "loss": 0.5152, "step": 6020 }, { "epoch": 0.7025419776119403, "grad_norm": 0.4679030016933106, "learning_rate": 1.5036214841570002e-05, "loss": 0.5177, "step": 6025 }, { "epoch": 0.703125, "grad_norm": 0.43192813973658906, "learning_rate": 1.500012064079533e-05, "loss": 0.5381, "step": 6030 }, { "epoch": 0.7037080223880597, "grad_norm": 0.47334729350720534, "learning_rate": 1.4964072907595633e-05, "loss": 0.5428, "step": 6035 }, { "epoch": 0.7042910447761194, "grad_norm": 0.4625335667175621, "learning_rate": 1.4928071775976283e-05, "loss": 0.4965, "step": 6040 }, { "epoch": 0.7048740671641791, "grad_norm": 0.4837863609737646, "learning_rate": 1.489211737976937e-05, "loss": 0.5201, "step": 6045 }, { "epoch": 0.7054570895522388, "grad_norm": 0.45448831055589556, "learning_rate": 1.4856209852633282e-05, "loss": 0.4988, "step": 6050 }, { "epoch": 0.7060401119402985, "grad_norm": 0.4470633313561567, "learning_rate": 1.482034932805217e-05, "loss": 0.4923, "step": 6055 }, { "epoch": 0.7066231343283582, "grad_norm": 0.46146957236249214, "learning_rate": 1.478453593933545e-05, "loss": 0.4966, "step": 6060 }, { "epoch": 0.707206156716418, "grad_norm": 0.44846726296477046, "learning_rate": 1.4748769819617291e-05, "loss": 0.5121, "step": 6065 }, { "epoch": 0.7077891791044776, "grad_norm": 0.445769696660708, "learning_rate": 1.47130511018562e-05, "loss": 0.49, "step": 6070 }, { "epoch": 0.7083722014925373, "grad_norm": 0.44424913913430536, "learning_rate": 1.4677379918834408e-05, "loss": 0.5154, "step": 6075 }, { "epoch": 0.7089552238805971, "grad_norm": 0.5025893980492081, "learning_rate": 1.464175640315748e-05, "loss": 0.4886, "step": 6080 }, { "epoch": 0.7095382462686567, "grad_norm": 0.4287175183865977, "learning_rate": 1.460618068725374e-05, "loss": 0.4943, "step": 6085 }, { "epoch": 0.7101212686567164, "grad_norm": 0.46657717095673207, "learning_rate": 1.4570652903373877e-05, "loss": 0.4866, "step": 6090 }, { "epoch": 0.7107042910447762, "grad_norm": 0.44658175711235154, "learning_rate": 1.453517318359034e-05, "loss": 0.4952, "step": 6095 }, { "epoch": 0.7112873134328358, "grad_norm": 0.5109217950300224, "learning_rate": 1.4499741659796927e-05, "loss": 0.5495, "step": 6100 }, { "epoch": 0.7118703358208955, "grad_norm": 0.439976204898742, "learning_rate": 1.4464358463708277e-05, "loss": 0.4947, "step": 6105 }, { "epoch": 0.7124533582089553, "grad_norm": 0.442606078185802, "learning_rate": 1.442902372685937e-05, "loss": 0.5125, "step": 6110 }, { "epoch": 0.7130363805970149, "grad_norm": 0.5113814435199087, "learning_rate": 1.4393737580605019e-05, "loss": 0.5397, "step": 6115 }, { "epoch": 0.7136194029850746, "grad_norm": 0.4292248926969336, "learning_rate": 1.435850015611943e-05, "loss": 0.5038, "step": 6120 }, { "epoch": 0.7142024253731343, "grad_norm": 0.4436015934650438, "learning_rate": 1.432331158439568e-05, "loss": 0.5015, "step": 6125 }, { "epoch": 0.714785447761194, "grad_norm": 0.4679393946330053, "learning_rate": 1.4288171996245247e-05, "loss": 0.4961, "step": 6130 }, { "epoch": 0.7153684701492538, "grad_norm": 0.46822261278058663, "learning_rate": 1.425308152229749e-05, "loss": 0.5565, "step": 6135 }, { "epoch": 0.7159514925373134, "grad_norm": 0.48866887191385927, "learning_rate": 1.4218040292999221e-05, "loss": 0.531, "step": 6140 }, { "epoch": 0.7165345149253731, "grad_norm": 0.4487435192453521, "learning_rate": 1.4183048438614166e-05, "loss": 0.4994, "step": 6145 }, { "epoch": 0.7171175373134329, "grad_norm": 0.5036947554976395, "learning_rate": 1.4148106089222513e-05, "loss": 0.519, "step": 6150 }, { "epoch": 0.7177005597014925, "grad_norm": 0.49765078956448927, "learning_rate": 1.4113213374720425e-05, "loss": 0.4825, "step": 6155 }, { "epoch": 0.7182835820895522, "grad_norm": 0.49796549587751004, "learning_rate": 1.4078370424819515e-05, "loss": 0.5142, "step": 6160 }, { "epoch": 0.718866604477612, "grad_norm": 0.46661776742591504, "learning_rate": 1.404357736904645e-05, "loss": 0.5121, "step": 6165 }, { "epoch": 0.7194496268656716, "grad_norm": 0.4206361496368435, "learning_rate": 1.4008834336742366e-05, "loss": 0.5043, "step": 6170 }, { "epoch": 0.7200326492537313, "grad_norm": 0.49140533816343374, "learning_rate": 1.3974141457062498e-05, "loss": 0.5465, "step": 6175 }, { "epoch": 0.7206156716417911, "grad_norm": 0.4730841055221398, "learning_rate": 1.3939498858975584e-05, "loss": 0.496, "step": 6180 }, { "epoch": 0.7211986940298507, "grad_norm": 0.5165044754977881, "learning_rate": 1.390490667126348e-05, "loss": 0.5699, "step": 6185 }, { "epoch": 0.7217817164179104, "grad_norm": 0.4574833927556042, "learning_rate": 1.3870365022520627e-05, "loss": 0.494, "step": 6190 }, { "epoch": 0.7223647388059702, "grad_norm": 0.4797480787975331, "learning_rate": 1.3835874041153607e-05, "loss": 0.5, "step": 6195 }, { "epoch": 0.7229477611940298, "grad_norm": 0.5266171413115511, "learning_rate": 1.380143385538063e-05, "loss": 0.5248, "step": 6200 }, { "epoch": 0.7235307835820896, "grad_norm": 0.4787651159553734, "learning_rate": 1.3767044593231082e-05, "loss": 0.4999, "step": 6205 }, { "epoch": 0.7241138059701493, "grad_norm": 0.42275543868072146, "learning_rate": 1.3732706382545054e-05, "loss": 0.4731, "step": 6210 }, { "epoch": 0.7246968283582089, "grad_norm": 0.48666619671292066, "learning_rate": 1.3698419350972851e-05, "loss": 0.5242, "step": 6215 }, { "epoch": 0.7252798507462687, "grad_norm": 0.41898376072456794, "learning_rate": 1.3664183625974503e-05, "loss": 0.5104, "step": 6220 }, { "epoch": 0.7258628731343284, "grad_norm": 0.4929104768550004, "learning_rate": 1.362999933481935e-05, "loss": 0.5206, "step": 6225 }, { "epoch": 0.726445895522388, "grad_norm": 0.4308013069303933, "learning_rate": 1.3595866604585492e-05, "loss": 0.5074, "step": 6230 }, { "epoch": 0.7270289179104478, "grad_norm": 0.4421936343319682, "learning_rate": 1.3561785562159374e-05, "loss": 0.4808, "step": 6235 }, { "epoch": 0.7276119402985075, "grad_norm": 0.4980669606106666, "learning_rate": 1.3527756334235288e-05, "loss": 0.4746, "step": 6240 }, { "epoch": 0.7281949626865671, "grad_norm": 0.4364859749653744, "learning_rate": 1.3493779047314925e-05, "loss": 0.4967, "step": 6245 }, { "epoch": 0.7287779850746269, "grad_norm": 0.4424531855538569, "learning_rate": 1.3459853827706853e-05, "loss": 0.4962, "step": 6250 }, { "epoch": 0.7293610074626866, "grad_norm": 0.5043638790699465, "learning_rate": 1.3425980801526118e-05, "loss": 0.5095, "step": 6255 }, { "epoch": 0.7299440298507462, "grad_norm": 0.464384451619953, "learning_rate": 1.3392160094693724e-05, "loss": 0.5008, "step": 6260 }, { "epoch": 0.730527052238806, "grad_norm": 0.44362217621259836, "learning_rate": 1.3358391832936174e-05, "loss": 0.4965, "step": 6265 }, { "epoch": 0.7311100746268657, "grad_norm": 0.4092013778366071, "learning_rate": 1.3324676141785029e-05, "loss": 0.5133, "step": 6270 }, { "epoch": 0.7316930970149254, "grad_norm": 0.4350575485273892, "learning_rate": 1.3291013146576403e-05, "loss": 0.5128, "step": 6275 }, { "epoch": 0.7322761194029851, "grad_norm": 0.4744197561421012, "learning_rate": 1.3257402972450539e-05, "loss": 0.4784, "step": 6280 }, { "epoch": 0.7328591417910447, "grad_norm": 0.4633376651924879, "learning_rate": 1.3223845744351287e-05, "loss": 0.475, "step": 6285 }, { "epoch": 0.7334421641791045, "grad_norm": 0.47064248305550177, "learning_rate": 1.3190341587025698e-05, "loss": 0.5147, "step": 6290 }, { "epoch": 0.7340251865671642, "grad_norm": 0.4745007087563231, "learning_rate": 1.3156890625023532e-05, "loss": 0.5131, "step": 6295 }, { "epoch": 0.7346082089552238, "grad_norm": 0.47910451161364465, "learning_rate": 1.3123492982696806e-05, "loss": 0.5125, "step": 6300 }, { "epoch": 0.7351912313432836, "grad_norm": 0.45150153399392984, "learning_rate": 1.3090148784199288e-05, "loss": 0.5195, "step": 6305 }, { "epoch": 0.7357742537313433, "grad_norm": 0.4666852788990984, "learning_rate": 1.305685815348613e-05, "loss": 0.5348, "step": 6310 }, { "epoch": 0.7363572761194029, "grad_norm": 0.43969109543694684, "learning_rate": 1.3023621214313289e-05, "loss": 0.4887, "step": 6315 }, { "epoch": 0.7369402985074627, "grad_norm": 0.4520730263820748, "learning_rate": 1.2990438090237167e-05, "loss": 0.5174, "step": 6320 }, { "epoch": 0.7375233208955224, "grad_norm": 0.4531016913171499, "learning_rate": 1.2957308904614099e-05, "loss": 0.513, "step": 6325 }, { "epoch": 0.738106343283582, "grad_norm": 0.43990578576591227, "learning_rate": 1.2924233780599915e-05, "loss": 0.469, "step": 6330 }, { "epoch": 0.7386893656716418, "grad_norm": 0.44084700165618407, "learning_rate": 1.2891212841149447e-05, "loss": 0.4997, "step": 6335 }, { "epoch": 0.7392723880597015, "grad_norm": 0.49100928132633287, "learning_rate": 1.2858246209016128e-05, "loss": 0.5187, "step": 6340 }, { "epoch": 0.7398554104477612, "grad_norm": 1.5928599989218044, "learning_rate": 1.2825334006751493e-05, "loss": 0.4954, "step": 6345 }, { "epoch": 0.7404384328358209, "grad_norm": 0.4278160829593741, "learning_rate": 1.2792476356704759e-05, "loss": 0.462, "step": 6350 }, { "epoch": 0.7410214552238806, "grad_norm": 0.45173765932520604, "learning_rate": 1.2759673381022305e-05, "loss": 0.5198, "step": 6355 }, { "epoch": 0.7416044776119403, "grad_norm": 0.46060933457902786, "learning_rate": 1.27269252016473e-05, "loss": 0.501, "step": 6360 }, { "epoch": 0.7421875, "grad_norm": 0.5194412989177508, "learning_rate": 1.2694231940319192e-05, "loss": 0.4862, "step": 6365 }, { "epoch": 0.7427705223880597, "grad_norm": 0.46637710292865725, "learning_rate": 1.2661593718573294e-05, "loss": 0.4913, "step": 6370 }, { "epoch": 0.7433535447761194, "grad_norm": 0.49083348251147924, "learning_rate": 1.2629010657740275e-05, "loss": 0.5073, "step": 6375 }, { "epoch": 0.7439365671641791, "grad_norm": 0.476732025190355, "learning_rate": 1.2596482878945787e-05, "loss": 0.5343, "step": 6380 }, { "epoch": 0.7445195895522388, "grad_norm": 0.43400279583829604, "learning_rate": 1.2564010503109952e-05, "loss": 0.4952, "step": 6385 }, { "epoch": 0.7451026119402985, "grad_norm": 0.4474931990549133, "learning_rate": 1.2531593650946932e-05, "loss": 0.4966, "step": 6390 }, { "epoch": 0.7456856343283582, "grad_norm": 0.4746293119307018, "learning_rate": 1.2499232442964506e-05, "loss": 0.5241, "step": 6395 }, { "epoch": 0.746268656716418, "grad_norm": 0.4596957874880727, "learning_rate": 1.2466926999463575e-05, "loss": 0.4931, "step": 6400 }, { "epoch": 0.7468516791044776, "grad_norm": 0.48590988974018173, "learning_rate": 1.2434677440537745e-05, "loss": 0.5498, "step": 6405 }, { "epoch": 0.7474347014925373, "grad_norm": 0.43144811701119584, "learning_rate": 1.2402483886072883e-05, "loss": 0.4673, "step": 6410 }, { "epoch": 0.7480177238805971, "grad_norm": 0.4694370814956435, "learning_rate": 1.237034645574666e-05, "loss": 0.5173, "step": 6415 }, { "epoch": 0.7486007462686567, "grad_norm": 0.5215126950510659, "learning_rate": 1.233826526902809e-05, "loss": 0.5413, "step": 6420 }, { "epoch": 0.7491837686567164, "grad_norm": 0.4667853644040054, "learning_rate": 1.230624044517713e-05, "loss": 0.5206, "step": 6425 }, { "epoch": 0.7497667910447762, "grad_norm": 0.49822013134556736, "learning_rate": 1.2274272103244201e-05, "loss": 0.5264, "step": 6430 }, { "epoch": 0.7503498134328358, "grad_norm": 0.4924144531446238, "learning_rate": 1.2242360362069763e-05, "loss": 0.4843, "step": 6435 }, { "epoch": 0.7509328358208955, "grad_norm": 0.47214741828129897, "learning_rate": 1.2210505340283838e-05, "loss": 0.529, "step": 6440 }, { "epoch": 0.7515158582089553, "grad_norm": 0.4726759175626485, "learning_rate": 1.2178707156305644e-05, "loss": 0.4993, "step": 6445 }, { "epoch": 0.7520988805970149, "grad_norm": 0.47497735759010773, "learning_rate": 1.2146965928343062e-05, "loss": 0.4923, "step": 6450 }, { "epoch": 0.7526819029850746, "grad_norm": 0.5282112383115118, "learning_rate": 1.2115281774392278e-05, "loss": 0.5043, "step": 6455 }, { "epoch": 0.7532649253731343, "grad_norm": 0.4289024052469388, "learning_rate": 1.208365481223727e-05, "loss": 0.4915, "step": 6460 }, { "epoch": 0.753847947761194, "grad_norm": 0.4527587796867484, "learning_rate": 1.2052085159449455e-05, "loss": 0.491, "step": 6465 }, { "epoch": 0.7544309701492538, "grad_norm": 0.46413997296204734, "learning_rate": 1.202057293338717e-05, "loss": 0.5207, "step": 6470 }, { "epoch": 0.7550139925373134, "grad_norm": 0.47431690969030404, "learning_rate": 1.1989118251195284e-05, "loss": 0.4807, "step": 6475 }, { "epoch": 0.7555970149253731, "grad_norm": 0.6124168305417785, "learning_rate": 1.1957721229804761e-05, "loss": 0.4909, "step": 6480 }, { "epoch": 0.7561800373134329, "grad_norm": 0.4612146149691418, "learning_rate": 1.1926381985932186e-05, "loss": 0.4912, "step": 6485 }, { "epoch": 0.7567630597014925, "grad_norm": 0.4602032665320846, "learning_rate": 1.1895100636079387e-05, "loss": 0.5287, "step": 6490 }, { "epoch": 0.7573460820895522, "grad_norm": 0.44850719377837156, "learning_rate": 1.186387729653296e-05, "loss": 0.5147, "step": 6495 }, { "epoch": 0.757929104477612, "grad_norm": 0.43547527328367247, "learning_rate": 1.1832712083363865e-05, "loss": 0.4774, "step": 6500 }, { "epoch": 0.7585121268656716, "grad_norm": 0.48648908094016996, "learning_rate": 1.1801605112426953e-05, "loss": 0.4948, "step": 6505 }, { "epoch": 0.7590951492537313, "grad_norm": 0.4362070385676467, "learning_rate": 1.1770556499360593e-05, "loss": 0.4768, "step": 6510 }, { "epoch": 0.7596781716417911, "grad_norm": 0.4594448702342764, "learning_rate": 1.1739566359586195e-05, "loss": 0.5192, "step": 6515 }, { "epoch": 0.7602611940298507, "grad_norm": 0.4895001977535846, "learning_rate": 1.170863480830781e-05, "loss": 0.5029, "step": 6520 }, { "epoch": 0.7608442164179104, "grad_norm": 0.4811285447973769, "learning_rate": 1.167776196051166e-05, "loss": 0.5089, "step": 6525 }, { "epoch": 0.7614272388059702, "grad_norm": 0.4169504765893255, "learning_rate": 1.1646947930965795e-05, "loss": 0.4477, "step": 6530 }, { "epoch": 0.7620102611940298, "grad_norm": 0.4342791005744433, "learning_rate": 1.1616192834219553e-05, "loss": 0.4957, "step": 6535 }, { "epoch": 0.7625932835820896, "grad_norm": 0.500766952555154, "learning_rate": 1.1585496784603234e-05, "loss": 0.474, "step": 6540 }, { "epoch": 0.7631763059701493, "grad_norm": 0.459834987252376, "learning_rate": 1.15548598962276e-05, "loss": 0.4653, "step": 6545 }, { "epoch": 0.7637593283582089, "grad_norm": 0.4634667523018127, "learning_rate": 1.1524282282983526e-05, "loss": 0.4952, "step": 6550 }, { "epoch": 0.7643423507462687, "grad_norm": 0.48306231366385766, "learning_rate": 1.1493764058541493e-05, "loss": 0.5092, "step": 6555 }, { "epoch": 0.7649253731343284, "grad_norm": 0.4412886363099704, "learning_rate": 1.1463305336351233e-05, "loss": 0.4836, "step": 6560 }, { "epoch": 0.765508395522388, "grad_norm": 0.4594326939876668, "learning_rate": 1.143290622964128e-05, "loss": 0.4858, "step": 6565 }, { "epoch": 0.7660914179104478, "grad_norm": 0.495560886941702, "learning_rate": 1.1402566851418545e-05, "loss": 0.484, "step": 6570 }, { "epoch": 0.7666744402985075, "grad_norm": 0.44698231009276274, "learning_rate": 1.1372287314467896e-05, "loss": 0.4938, "step": 6575 }, { "epoch": 0.7672574626865671, "grad_norm": 0.48489845254460334, "learning_rate": 1.1342067731351754e-05, "loss": 0.5349, "step": 6580 }, { "epoch": 0.7678404850746269, "grad_norm": 0.3983706507650223, "learning_rate": 1.1311908214409666e-05, "loss": 0.4916, "step": 6585 }, { "epoch": 0.7684235074626866, "grad_norm": 0.4829203676168014, "learning_rate": 1.128180887575789e-05, "loss": 0.5538, "step": 6590 }, { "epoch": 0.7690065298507462, "grad_norm": 0.5181252743781259, "learning_rate": 1.1251769827288953e-05, "loss": 0.5103, "step": 6595 }, { "epoch": 0.769589552238806, "grad_norm": 0.45245249797253495, "learning_rate": 1.122179118067128e-05, "loss": 0.4877, "step": 6600 }, { "epoch": 0.7701725746268657, "grad_norm": 0.45213952635934074, "learning_rate": 1.1191873047348743e-05, "loss": 0.4699, "step": 6605 }, { "epoch": 0.7707555970149254, "grad_norm": 0.4384946771009236, "learning_rate": 1.1162015538540268e-05, "loss": 0.5199, "step": 6610 }, { "epoch": 0.7713386194029851, "grad_norm": 1.4634272550071448, "learning_rate": 1.1132218765239417e-05, "loss": 0.5111, "step": 6615 }, { "epoch": 0.7719216417910447, "grad_norm": 0.4528179441810264, "learning_rate": 1.1102482838213945e-05, "loss": 0.5095, "step": 6620 }, { "epoch": 0.7725046641791045, "grad_norm": 0.48790948502665343, "learning_rate": 1.1072807868005438e-05, "loss": 0.5375, "step": 6625 }, { "epoch": 0.7730876865671642, "grad_norm": 0.43356020787990573, "learning_rate": 1.104319396492888e-05, "loss": 0.494, "step": 6630 }, { "epoch": 0.7736707089552238, "grad_norm": 0.46568426433428856, "learning_rate": 1.1013641239072233e-05, "loss": 0.5089, "step": 6635 }, { "epoch": 0.7742537313432836, "grad_norm": 0.5058210347479575, "learning_rate": 1.098414980029603e-05, "loss": 0.5278, "step": 6640 }, { "epoch": 0.7748367537313433, "grad_norm": 0.46834473900137596, "learning_rate": 1.0954719758232983e-05, "loss": 0.5183, "step": 6645 }, { "epoch": 0.7754197761194029, "grad_norm": 0.4485355699073728, "learning_rate": 1.092535122228757e-05, "loss": 0.5067, "step": 6650 }, { "epoch": 0.7760027985074627, "grad_norm": 0.44029418480334825, "learning_rate": 1.0896044301635616e-05, "loss": 0.4902, "step": 6655 }, { "epoch": 0.7765858208955224, "grad_norm": 0.46043398790080686, "learning_rate": 1.0866799105223877e-05, "loss": 0.4752, "step": 6660 }, { "epoch": 0.777168843283582, "grad_norm": 0.45411148412679725, "learning_rate": 1.0837615741769695e-05, "loss": 0.5027, "step": 6665 }, { "epoch": 0.7777518656716418, "grad_norm": 0.46002535963451235, "learning_rate": 1.0808494319760511e-05, "loss": 0.4818, "step": 6670 }, { "epoch": 0.7783348880597015, "grad_norm": 0.4613160618534238, "learning_rate": 1.0779434947453531e-05, "loss": 0.5305, "step": 6675 }, { "epoch": 0.7789179104477612, "grad_norm": 0.5192961001346327, "learning_rate": 1.0750437732875265e-05, "loss": 0.4909, "step": 6680 }, { "epoch": 0.7795009328358209, "grad_norm": 0.47703840671844866, "learning_rate": 1.0721502783821194e-05, "loss": 0.5433, "step": 6685 }, { "epoch": 0.7800839552238806, "grad_norm": 0.4027202848278968, "learning_rate": 1.0692630207855296e-05, "loss": 0.4795, "step": 6690 }, { "epoch": 0.7806669776119403, "grad_norm": 0.5254895186437003, "learning_rate": 1.0663820112309695e-05, "loss": 0.5234, "step": 6695 }, { "epoch": 0.78125, "grad_norm": 0.4714359353636831, "learning_rate": 1.0635072604284254e-05, "loss": 0.4837, "step": 6700 }, { "epoch": 0.7818330223880597, "grad_norm": 0.4409933416168129, "learning_rate": 1.0606387790646154e-05, "loss": 0.5124, "step": 6705 }, { "epoch": 0.7824160447761194, "grad_norm": 0.47666045358791953, "learning_rate": 1.0577765778029525e-05, "loss": 0.4762, "step": 6710 }, { "epoch": 0.7829990671641791, "grad_norm": 0.48287842316913426, "learning_rate": 1.0549206672835033e-05, "loss": 0.4879, "step": 6715 }, { "epoch": 0.7835820895522388, "grad_norm": 0.4494356289008418, "learning_rate": 1.0520710581229507e-05, "loss": 0.4816, "step": 6720 }, { "epoch": 0.7841651119402985, "grad_norm": 0.46052908698109446, "learning_rate": 1.049227760914549e-05, "loss": 0.4516, "step": 6725 }, { "epoch": 0.7847481343283582, "grad_norm": 0.4567314964584747, "learning_rate": 1.0463907862280913e-05, "loss": 0.4871, "step": 6730 }, { "epoch": 0.785331156716418, "grad_norm": 0.44868088537045625, "learning_rate": 1.043560144609866e-05, "loss": 0.4955, "step": 6735 }, { "epoch": 0.7859141791044776, "grad_norm": 0.4530134493607295, "learning_rate": 1.0407358465826198e-05, "loss": 0.5199, "step": 6740 }, { "epoch": 0.7864972014925373, "grad_norm": 0.4463551169201282, "learning_rate": 1.0379179026455136e-05, "loss": 0.4913, "step": 6745 }, { "epoch": 0.7870802238805971, "grad_norm": 0.47099610185569996, "learning_rate": 1.0351063232740937e-05, "loss": 0.5332, "step": 6750 }, { "epoch": 0.7876632462686567, "grad_norm": 0.4402447824147342, "learning_rate": 1.0323011189202408e-05, "loss": 0.5053, "step": 6755 }, { "epoch": 0.7882462686567164, "grad_norm": 0.45593057954899197, "learning_rate": 1.0295023000121404e-05, "loss": 0.474, "step": 6760 }, { "epoch": 0.7888292910447762, "grad_norm": 0.4374300903516831, "learning_rate": 1.0267098769542368e-05, "loss": 0.5427, "step": 6765 }, { "epoch": 0.7894123134328358, "grad_norm": 0.4216054919323253, "learning_rate": 1.0239238601272036e-05, "loss": 0.4862, "step": 6770 }, { "epoch": 0.7899953358208955, "grad_norm": 0.43838979602724326, "learning_rate": 1.0211442598878936e-05, "loss": 0.4697, "step": 6775 }, { "epoch": 0.7905783582089553, "grad_norm": 0.41927389582266983, "learning_rate": 1.0183710865693105e-05, "loss": 0.4731, "step": 6780 }, { "epoch": 0.7911613805970149, "grad_norm": 0.4618407627786652, "learning_rate": 1.0156043504805648e-05, "loss": 0.4946, "step": 6785 }, { "epoch": 0.7917444029850746, "grad_norm": 0.49476862297527385, "learning_rate": 1.0128440619068379e-05, "loss": 0.5218, "step": 6790 }, { "epoch": 0.7923274253731343, "grad_norm": 0.4878473893991779, "learning_rate": 1.0100902311093405e-05, "loss": 0.5127, "step": 6795 }, { "epoch": 0.792910447761194, "grad_norm": 0.44555429187543133, "learning_rate": 1.0073428683252788e-05, "loss": 0.4893, "step": 6800 }, { "epoch": 0.7934934701492538, "grad_norm": 0.44977845000528854, "learning_rate": 1.0046019837678153e-05, "loss": 0.4687, "step": 6805 }, { "epoch": 0.7940764925373134, "grad_norm": 0.447310487987635, "learning_rate": 1.001867587626029e-05, "loss": 0.484, "step": 6810 }, { "epoch": 0.7946595149253731, "grad_norm": 0.48144807868357464, "learning_rate": 9.991396900648774e-06, "loss": 0.5395, "step": 6815 }, { "epoch": 0.7952425373134329, "grad_norm": 0.46182249334458747, "learning_rate": 9.964183012251619e-06, "loss": 0.4914, "step": 6820 }, { "epoch": 0.7958255597014925, "grad_norm": 0.4389736266429193, "learning_rate": 9.937034312234872e-06, "loss": 0.4966, "step": 6825 }, { "epoch": 0.7964085820895522, "grad_norm": 0.4683635632125285, "learning_rate": 9.90995090152225e-06, "loss": 0.4872, "step": 6830 }, { "epoch": 0.796991604477612, "grad_norm": 0.449577983637824, "learning_rate": 9.88293288079476e-06, "loss": 0.4775, "step": 6835 }, { "epoch": 0.7975746268656716, "grad_norm": 0.4679257203680657, "learning_rate": 9.855980350490315e-06, "loss": 0.4628, "step": 6840 }, { "epoch": 0.7981576492537313, "grad_norm": 0.4910686168202571, "learning_rate": 9.82909341080339e-06, "loss": 0.516, "step": 6845 }, { "epoch": 0.7987406716417911, "grad_norm": 0.4328561795827866, "learning_rate": 9.802272161684601e-06, "loss": 0.5062, "step": 6850 }, { "epoch": 0.7993236940298507, "grad_norm": 0.48810830150277185, "learning_rate": 9.775516702840411e-06, "loss": 0.542, "step": 6855 }, { "epoch": 0.7999067164179104, "grad_norm": 0.4326359263896414, "learning_rate": 9.748827133732665e-06, "loss": 0.491, "step": 6860 }, { "epoch": 0.8004897388059702, "grad_norm": 0.45895357662154007, "learning_rate": 9.722203553578288e-06, "loss": 0.5017, "step": 6865 }, { "epoch": 0.8010727611940298, "grad_norm": 0.4201551976753478, "learning_rate": 9.695646061348892e-06, "loss": 0.5233, "step": 6870 }, { "epoch": 0.8016557835820896, "grad_norm": 0.4261865295483104, "learning_rate": 9.669154755770415e-06, "loss": 0.4615, "step": 6875 }, { "epoch": 0.8022388059701493, "grad_norm": 0.7140092170328386, "learning_rate": 9.642729735322733e-06, "loss": 0.5097, "step": 6880 }, { "epoch": 0.8028218283582089, "grad_norm": 0.4990283440949626, "learning_rate": 9.616371098239346e-06, "loss": 0.5716, "step": 6885 }, { "epoch": 0.8034048507462687, "grad_norm": 0.4481559993333328, "learning_rate": 9.590078942506933e-06, "loss": 0.4856, "step": 6890 }, { "epoch": 0.8039878731343284, "grad_norm": 0.4607158596942868, "learning_rate": 9.56385336586507e-06, "loss": 0.5107, "step": 6895 }, { "epoch": 0.804570895522388, "grad_norm": 0.476387734456041, "learning_rate": 9.537694465805797e-06, "loss": 0.4998, "step": 6900 }, { "epoch": 0.8051539179104478, "grad_norm": 0.46770623541362477, "learning_rate": 9.511602339573324e-06, "loss": 0.4998, "step": 6905 }, { "epoch": 0.8057369402985075, "grad_norm": 0.5086514392993742, "learning_rate": 9.485577084163604e-06, "loss": 0.5105, "step": 6910 }, { "epoch": 0.8063199626865671, "grad_norm": 0.5077908007862963, "learning_rate": 9.45961879632401e-06, "loss": 0.5116, "step": 6915 }, { "epoch": 0.8069029850746269, "grad_norm": 0.5463748621976762, "learning_rate": 9.43372757255297e-06, "loss": 0.4687, "step": 6920 }, { "epoch": 0.8074860074626866, "grad_norm": 0.4191317799403366, "learning_rate": 9.40790350909961e-06, "loss": 0.498, "step": 6925 }, { "epoch": 0.8080690298507462, "grad_norm": 0.4865303702117012, "learning_rate": 9.382146701963373e-06, "loss": 0.456, "step": 6930 }, { "epoch": 0.808652052238806, "grad_norm": 0.42674988798148156, "learning_rate": 9.356457246893695e-06, "loss": 0.5227, "step": 6935 }, { "epoch": 0.8092350746268657, "grad_norm": 0.46859664111243005, "learning_rate": 9.330835239389645e-06, "loss": 0.5018, "step": 6940 }, { "epoch": 0.8098180970149254, "grad_norm": 0.5581605519629456, "learning_rate": 9.305280774699531e-06, "loss": 0.4893, "step": 6945 }, { "epoch": 0.8104011194029851, "grad_norm": 0.4648844812652044, "learning_rate": 9.279793947820596e-06, "loss": 0.5034, "step": 6950 }, { "epoch": 0.8109841417910447, "grad_norm": 0.4348511386801257, "learning_rate": 9.254374853498636e-06, "loss": 0.476, "step": 6955 }, { "epoch": 0.8115671641791045, "grad_norm": 0.4471748979975722, "learning_rate": 9.229023586227666e-06, "loss": 0.4746, "step": 6960 }, { "epoch": 0.8121501865671642, "grad_norm": 0.49028467746657334, "learning_rate": 9.203740240249527e-06, "loss": 0.5075, "step": 6965 }, { "epoch": 0.8127332089552238, "grad_norm": 0.45156139381243476, "learning_rate": 9.178524909553617e-06, "loss": 0.4997, "step": 6970 }, { "epoch": 0.8133162313432836, "grad_norm": 0.4580984480401016, "learning_rate": 9.153377687876439e-06, "loss": 0.5098, "step": 6975 }, { "epoch": 0.8138992537313433, "grad_norm": 0.4417064208682211, "learning_rate": 9.128298668701341e-06, "loss": 0.4866, "step": 6980 }, { "epoch": 0.8144822761194029, "grad_norm": 0.4396784431419953, "learning_rate": 9.103287945258104e-06, "loss": 0.4843, "step": 6985 }, { "epoch": 0.8150652985074627, "grad_norm": 0.42781740173572586, "learning_rate": 9.078345610522662e-06, "loss": 0.4864, "step": 6990 }, { "epoch": 0.8156483208955224, "grad_norm": 0.4706194313899779, "learning_rate": 9.053471757216675e-06, "loss": 0.4829, "step": 6995 }, { "epoch": 0.816231343283582, "grad_norm": 0.4373005186110652, "learning_rate": 9.028666477807253e-06, "loss": 0.4946, "step": 7000 }, { "epoch": 0.8168143656716418, "grad_norm": 0.45822848787961307, "learning_rate": 9.003929864506583e-06, "loss": 0.4747, "step": 7005 }, { "epoch": 0.8173973880597015, "grad_norm": 0.4376578697732612, "learning_rate": 8.979262009271589e-06, "loss": 0.4982, "step": 7010 }, { "epoch": 0.8179804104477612, "grad_norm": 0.50115571632827, "learning_rate": 8.954663003803579e-06, "loss": 0.4934, "step": 7015 }, { "epoch": 0.8185634328358209, "grad_norm": 0.44219752315988364, "learning_rate": 8.930132939547932e-06, "loss": 0.4663, "step": 7020 }, { "epoch": 0.8191464552238806, "grad_norm": 0.440108685069607, "learning_rate": 8.905671907693738e-06, "loss": 0.4856, "step": 7025 }, { "epoch": 0.8197294776119403, "grad_norm": 0.5246570100006753, "learning_rate": 8.881279999173466e-06, "loss": 0.5021, "step": 7030 }, { "epoch": 0.8203125, "grad_norm": 0.4319239074767847, "learning_rate": 8.856957304662602e-06, "loss": 0.5123, "step": 7035 }, { "epoch": 0.8208955223880597, "grad_norm": 0.4326829341676435, "learning_rate": 8.832703914579363e-06, "loss": 0.5021, "step": 7040 }, { "epoch": 0.8214785447761194, "grad_norm": 0.4380817083829351, "learning_rate": 8.80851991908431e-06, "loss": 0.5044, "step": 7045 }, { "epoch": 0.8220615671641791, "grad_norm": 0.4637369898587162, "learning_rate": 8.784405408080046e-06, "loss": 0.4852, "step": 7050 }, { "epoch": 0.8226445895522388, "grad_norm": 0.4796910879347511, "learning_rate": 8.760360471210865e-06, "loss": 0.5338, "step": 7055 }, { "epoch": 0.8232276119402985, "grad_norm": 0.4135280946993472, "learning_rate": 8.736385197862415e-06, "loss": 0.4824, "step": 7060 }, { "epoch": 0.8238106343283582, "grad_norm": 0.4287858690307368, "learning_rate": 8.712479677161388e-06, "loss": 0.4869, "step": 7065 }, { "epoch": 0.824393656716418, "grad_norm": 0.4714198385972294, "learning_rate": 8.688643997975156e-06, "loss": 0.5143, "step": 7070 }, { "epoch": 0.8249766791044776, "grad_norm": 0.44395978208148723, "learning_rate": 8.66487824891149e-06, "loss": 0.4922, "step": 7075 }, { "epoch": 0.8255597014925373, "grad_norm": 0.508754740613158, "learning_rate": 8.641182518318162e-06, "loss": 0.5184, "step": 7080 }, { "epoch": 0.8261427238805971, "grad_norm": 0.4026422778012762, "learning_rate": 8.617556894282683e-06, "loss": 0.4886, "step": 7085 }, { "epoch": 0.8267257462686567, "grad_norm": 0.47446388322469524, "learning_rate": 8.594001464631938e-06, "loss": 0.5189, "step": 7090 }, { "epoch": 0.8273087686567164, "grad_norm": 0.47024774444454565, "learning_rate": 8.570516316931869e-06, "loss": 0.5266, "step": 7095 }, { "epoch": 0.8278917910447762, "grad_norm": 0.44748825048810753, "learning_rate": 8.547101538487136e-06, "loss": 0.4845, "step": 7100 }, { "epoch": 0.8284748134328358, "grad_norm": 0.4806024012846129, "learning_rate": 8.52375721634083e-06, "loss": 0.5031, "step": 7105 }, { "epoch": 0.8290578358208955, "grad_norm": 0.5413375895981546, "learning_rate": 8.5004834372741e-06, "loss": 0.5106, "step": 7110 }, { "epoch": 0.8296408582089553, "grad_norm": 0.4337024823163042, "learning_rate": 8.477280287805883e-06, "loss": 0.4954, "step": 7115 }, { "epoch": 0.8302238805970149, "grad_norm": 0.4344528368728774, "learning_rate": 8.454147854192515e-06, "loss": 0.5021, "step": 7120 }, { "epoch": 0.8308069029850746, "grad_norm": 0.45520925847409455, "learning_rate": 8.4310862224275e-06, "loss": 0.4828, "step": 7125 }, { "epoch": 0.8313899253731343, "grad_norm": 0.4734386762743593, "learning_rate": 8.408095478241099e-06, "loss": 0.5243, "step": 7130 }, { "epoch": 0.831972947761194, "grad_norm": 0.4717328221940539, "learning_rate": 8.385175707100064e-06, "loss": 0.4907, "step": 7135 }, { "epoch": 0.8325559701492538, "grad_norm": 0.41240208092851716, "learning_rate": 8.36232699420732e-06, "loss": 0.5031, "step": 7140 }, { "epoch": 0.8331389925373134, "grad_norm": 0.41514081930390884, "learning_rate": 8.33954942450163e-06, "loss": 0.4896, "step": 7145 }, { "epoch": 0.8337220149253731, "grad_norm": 0.4675941109997365, "learning_rate": 8.316843082657277e-06, "loss": 0.5009, "step": 7150 }, { "epoch": 0.8343050373134329, "grad_norm": 0.4427930397053809, "learning_rate": 8.294208053083771e-06, "loss": 0.511, "step": 7155 }, { "epoch": 0.8348880597014925, "grad_norm": 0.45949248005156934, "learning_rate": 8.271644419925526e-06, "loss": 0.4719, "step": 7160 }, { "epoch": 0.8354710820895522, "grad_norm": 0.471413986453529, "learning_rate": 8.249152267061524e-06, "loss": 0.4994, "step": 7165 }, { "epoch": 0.836054104477612, "grad_norm": 0.44813478705837095, "learning_rate": 8.226731678105045e-06, "loss": 0.4804, "step": 7170 }, { "epoch": 0.8366371268656716, "grad_norm": 0.4426301509227603, "learning_rate": 8.20438273640332e-06, "loss": 0.5143, "step": 7175 }, { "epoch": 0.8372201492537313, "grad_norm": 0.4465652598526728, "learning_rate": 8.18210552503725e-06, "loss": 0.4871, "step": 7180 }, { "epoch": 0.8378031716417911, "grad_norm": 0.47244655194156704, "learning_rate": 8.159900126821062e-06, "loss": 0.5003, "step": 7185 }, { "epoch": 0.8383861940298507, "grad_norm": 0.4471968512251493, "learning_rate": 8.137766624302036e-06, "loss": 0.5142, "step": 7190 }, { "epoch": 0.8389692164179104, "grad_norm": 0.4709917176706898, "learning_rate": 8.115705099760184e-06, "loss": 0.5195, "step": 7195 }, { "epoch": 0.8395522388059702, "grad_norm": 0.4380879394783618, "learning_rate": 8.093715635207948e-06, "loss": 0.4885, "step": 7200 }, { "epoch": 0.8401352611940298, "grad_norm": 0.41712410093981933, "learning_rate": 8.071798312389863e-06, "loss": 0.5116, "step": 7205 }, { "epoch": 0.8407182835820896, "grad_norm": 0.4325869434630372, "learning_rate": 8.049953212782329e-06, "loss": 0.4478, "step": 7210 }, { "epoch": 0.8413013059701493, "grad_norm": 0.4166574600617677, "learning_rate": 8.028180417593215e-06, "loss": 0.4824, "step": 7215 }, { "epoch": 0.8418843283582089, "grad_norm": 0.43830478937394324, "learning_rate": 8.006480007761628e-06, "loss": 0.4674, "step": 7220 }, { "epoch": 0.8424673507462687, "grad_norm": 0.4817669338294806, "learning_rate": 7.98485206395758e-06, "loss": 0.5077, "step": 7225 }, { "epoch": 0.8430503731343284, "grad_norm": 0.4647045998674549, "learning_rate": 7.963296666581703e-06, "loss": 0.4702, "step": 7230 }, { "epoch": 0.843633395522388, "grad_norm": 0.49174994550827694, "learning_rate": 7.941813895764919e-06, "loss": 0.5212, "step": 7235 }, { "epoch": 0.8442164179104478, "grad_norm": 0.41757383756848515, "learning_rate": 7.920403831368189e-06, "loss": 0.5016, "step": 7240 }, { "epoch": 0.8447994402985075, "grad_norm": 0.44263407474750815, "learning_rate": 7.899066552982179e-06, "loss": 0.4994, "step": 7245 }, { "epoch": 0.8453824626865671, "grad_norm": 0.46533860910272895, "learning_rate": 7.87780213992699e-06, "loss": 0.4973, "step": 7250 }, { "epoch": 0.8459654850746269, "grad_norm": 0.4396469203604777, "learning_rate": 7.856610671251826e-06, "loss": 0.4962, "step": 7255 }, { "epoch": 0.8465485074626866, "grad_norm": 0.461412303324744, "learning_rate": 7.835492225734753e-06, "loss": 0.4848, "step": 7260 }, { "epoch": 0.8471315298507462, "grad_norm": 0.5184880522572166, "learning_rate": 7.81444688188236e-06, "loss": 0.5133, "step": 7265 }, { "epoch": 0.847714552238806, "grad_norm": 0.47001007740037093, "learning_rate": 7.793474717929495e-06, "loss": 0.4852, "step": 7270 }, { "epoch": 0.8482975746268657, "grad_norm": 0.47109511656808234, "learning_rate": 7.772575811838948e-06, "loss": 0.4961, "step": 7275 }, { "epoch": 0.8488805970149254, "grad_norm": 0.44116994424682127, "learning_rate": 7.751750241301192e-06, "loss": 0.4972, "step": 7280 }, { "epoch": 0.8494636194029851, "grad_norm": 0.4501932519308333, "learning_rate": 7.730998083734083e-06, "loss": 0.4859, "step": 7285 }, { "epoch": 0.8500466417910447, "grad_norm": 0.48144234778210115, "learning_rate": 7.710319416282543e-06, "loss": 0.4984, "step": 7290 }, { "epoch": 0.8506296641791045, "grad_norm": 0.456988520365441, "learning_rate": 7.689714315818339e-06, "loss": 0.5232, "step": 7295 }, { "epoch": 0.8512126865671642, "grad_norm": 0.5275485527755961, "learning_rate": 7.669182858939715e-06, "loss": 0.494, "step": 7300 }, { "epoch": 0.8517957089552238, "grad_norm": 0.4358686155004703, "learning_rate": 7.648725121971178e-06, "loss": 0.4652, "step": 7305 }, { "epoch": 0.8523787313432836, "grad_norm": 0.5241681742015615, "learning_rate": 7.628341180963175e-06, "loss": 0.5107, "step": 7310 }, { "epoch": 0.8529617537313433, "grad_norm": 0.4446315316066114, "learning_rate": 7.608031111691826e-06, "loss": 0.4736, "step": 7315 }, { "epoch": 0.8535447761194029, "grad_norm": 0.4586742881823037, "learning_rate": 7.587794989658621e-06, "loss": 0.4789, "step": 7320 }, { "epoch": 0.8541277985074627, "grad_norm": 0.43393817483796693, "learning_rate": 7.567632890090176e-06, "loss": 0.4517, "step": 7325 }, { "epoch": 0.8547108208955224, "grad_norm": 0.4530263088196386, "learning_rate": 7.5475448879379255e-06, "loss": 0.5204, "step": 7330 }, { "epoch": 0.855293843283582, "grad_norm": 0.47232925084661864, "learning_rate": 7.527531057877849e-06, "loss": 0.5212, "step": 7335 }, { "epoch": 0.8558768656716418, "grad_norm": 0.4413601041808957, "learning_rate": 7.507591474310185e-06, "loss": 0.4907, "step": 7340 }, { "epoch": 0.8564598880597015, "grad_norm": 0.5105056148052758, "learning_rate": 7.487726211359198e-06, "loss": 0.5465, "step": 7345 }, { "epoch": 0.8570429104477612, "grad_norm": 0.6715647626166893, "learning_rate": 7.46793534287283e-06, "loss": 0.5069, "step": 7350 }, { "epoch": 0.8576259328358209, "grad_norm": 0.4588809773198785, "learning_rate": 7.448218942422498e-06, "loss": 0.5474, "step": 7355 }, { "epoch": 0.8582089552238806, "grad_norm": 0.4831576147260091, "learning_rate": 7.428577083302757e-06, "loss": 0.5093, "step": 7360 }, { "epoch": 0.8587919776119403, "grad_norm": 0.5042721886700091, "learning_rate": 7.409009838531095e-06, "loss": 0.5375, "step": 7365 }, { "epoch": 0.859375, "grad_norm": 0.46284073380005986, "learning_rate": 7.389517280847598e-06, "loss": 0.5159, "step": 7370 }, { "epoch": 0.8599580223880597, "grad_norm": 0.4661662360141043, "learning_rate": 7.370099482714715e-06, "loss": 0.5315, "step": 7375 }, { "epoch": 0.8605410447761194, "grad_norm": 0.4476959939432871, "learning_rate": 7.35075651631699e-06, "loss": 0.4901, "step": 7380 }, { "epoch": 0.8611240671641791, "grad_norm": 0.44717566109177675, "learning_rate": 7.331488453560767e-06, "loss": 0.4921, "step": 7385 }, { "epoch": 0.8617070895522388, "grad_norm": 0.4476851149766431, "learning_rate": 7.312295366073952e-06, "loss": 0.4839, "step": 7390 }, { "epoch": 0.8622901119402985, "grad_norm": 0.4656768131692163, "learning_rate": 7.293177325205734e-06, "loss": 0.5089, "step": 7395 }, { "epoch": 0.8628731343283582, "grad_norm": 0.47867044879346754, "learning_rate": 7.274134402026321e-06, "loss": 0.5204, "step": 7400 }, { "epoch": 0.863456156716418, "grad_norm": 0.4495376624711596, "learning_rate": 7.255166667326668e-06, "loss": 0.4864, "step": 7405 }, { "epoch": 0.8640391791044776, "grad_norm": 0.45540868280070557, "learning_rate": 7.236274191618228e-06, "loss": 0.4969, "step": 7410 }, { "epoch": 0.8646222014925373, "grad_norm": 0.5275767191951953, "learning_rate": 7.217457045132682e-06, "loss": 0.5086, "step": 7415 }, { "epoch": 0.8652052238805971, "grad_norm": 0.42863000174753835, "learning_rate": 7.198715297821681e-06, "loss": 0.5033, "step": 7420 }, { "epoch": 0.8657882462686567, "grad_norm": 0.4834849241582333, "learning_rate": 7.18004901935657e-06, "loss": 0.5057, "step": 7425 }, { "epoch": 0.8663712686567164, "grad_norm": 0.46483441625060457, "learning_rate": 7.161458279128172e-06, "loss": 0.5058, "step": 7430 }, { "epoch": 0.8669542910447762, "grad_norm": 0.4699147577232594, "learning_rate": 7.142943146246471e-06, "loss": 0.5052, "step": 7435 }, { "epoch": 0.8675373134328358, "grad_norm": 0.4728336401428503, "learning_rate": 7.124503689540403e-06, "loss": 0.4945, "step": 7440 }, { "epoch": 0.8681203358208955, "grad_norm": 0.42756247469245834, "learning_rate": 7.106139977557563e-06, "loss": 0.4868, "step": 7445 }, { "epoch": 0.8687033582089553, "grad_norm": 0.5508454557117077, "learning_rate": 7.087852078564006e-06, "loss": 0.5078, "step": 7450 }, { "epoch": 0.8692863805970149, "grad_norm": 0.42365611783227247, "learning_rate": 7.069640060543914e-06, "loss": 0.4795, "step": 7455 }, { "epoch": 0.8698694029850746, "grad_norm": 0.4619373613002068, "learning_rate": 7.051503991199415e-06, "loss": 0.5093, "step": 7460 }, { "epoch": 0.8704524253731343, "grad_norm": 0.46430333389633666, "learning_rate": 7.03344393795029e-06, "loss": 0.4628, "step": 7465 }, { "epoch": 0.871035447761194, "grad_norm": 0.46350593595683315, "learning_rate": 7.0154599679337405e-06, "loss": 0.4966, "step": 7470 }, { "epoch": 0.8716184701492538, "grad_norm": 0.46997441642678967, "learning_rate": 6.997552148004124e-06, "loss": 0.4619, "step": 7475 }, { "epoch": 0.8722014925373134, "grad_norm": 0.4940809056382957, "learning_rate": 6.9797205447327236e-06, "loss": 0.5207, "step": 7480 }, { "epoch": 0.8727845149253731, "grad_norm": 0.42737497657553214, "learning_rate": 6.961965224407487e-06, "loss": 0.4879, "step": 7485 }, { "epoch": 0.8733675373134329, "grad_norm": 0.4420504460833519, "learning_rate": 6.944286253032789e-06, "loss": 0.4519, "step": 7490 }, { "epoch": 0.8739505597014925, "grad_norm": 0.4482189949894458, "learning_rate": 6.9266836963291725e-06, "loss": 0.5216, "step": 7495 }, { "epoch": 0.8745335820895522, "grad_norm": 0.48507550585616305, "learning_rate": 6.90915761973312e-06, "loss": 0.4904, "step": 7500 }, { "epoch": 0.875116604477612, "grad_norm": 0.4845741351343996, "learning_rate": 6.891708088396803e-06, "loss": 0.5031, "step": 7505 }, { "epoch": 0.8756996268656716, "grad_norm": 0.45661614078718604, "learning_rate": 6.874335167187844e-06, "loss": 0.4694, "step": 7510 }, { "epoch": 0.8762826492537313, "grad_norm": 0.4378046368918111, "learning_rate": 6.857038920689068e-06, "loss": 0.473, "step": 7515 }, { "epoch": 0.8768656716417911, "grad_norm": 0.4351278013857574, "learning_rate": 6.839819413198259e-06, "loss": 0.4686, "step": 7520 }, { "epoch": 0.8774486940298507, "grad_norm": 0.4598040793979348, "learning_rate": 6.822676708727941e-06, "loss": 0.5058, "step": 7525 }, { "epoch": 0.8780317164179104, "grad_norm": 0.46734999477037775, "learning_rate": 6.805610871005115e-06, "loss": 0.5142, "step": 7530 }, { "epoch": 0.8786147388059702, "grad_norm": 0.42358187968622485, "learning_rate": 6.788621963471055e-06, "loss": 0.4656, "step": 7535 }, { "epoch": 0.8791977611940298, "grad_norm": 0.42943564345062557, "learning_rate": 6.771710049281019e-06, "loss": 0.4866, "step": 7540 }, { "epoch": 0.8797807835820896, "grad_norm": 0.46090527920896157, "learning_rate": 6.754875191304076e-06, "loss": 0.5283, "step": 7545 }, { "epoch": 0.8803638059701493, "grad_norm": 0.46574541087337956, "learning_rate": 6.73811745212283e-06, "loss": 0.5072, "step": 7550 }, { "epoch": 0.8809468283582089, "grad_norm": 0.5402513744522806, "learning_rate": 6.721436894033206e-06, "loss": 0.4474, "step": 7555 }, { "epoch": 0.8815298507462687, "grad_norm": 0.4887426580865765, "learning_rate": 6.704833579044198e-06, "loss": 0.4945, "step": 7560 }, { "epoch": 0.8821128731343284, "grad_norm": 0.4720599639216187, "learning_rate": 6.688307568877681e-06, "loss": 0.4757, "step": 7565 }, { "epoch": 0.882695895522388, "grad_norm": 0.44242282186933635, "learning_rate": 6.6718589249681215e-06, "loss": 0.5141, "step": 7570 }, { "epoch": 0.8832789179104478, "grad_norm": 0.47443100760410045, "learning_rate": 6.655487708462407e-06, "loss": 0.479, "step": 7575 }, { "epoch": 0.8838619402985075, "grad_norm": 0.4484432447267727, "learning_rate": 6.639193980219574e-06, "loss": 0.503, "step": 7580 }, { "epoch": 0.8844449626865671, "grad_norm": 0.4544364445156426, "learning_rate": 6.622977800810626e-06, "loss": 0.4757, "step": 7585 }, { "epoch": 0.8850279850746269, "grad_norm": 0.470997939385338, "learning_rate": 6.60683923051825e-06, "loss": 0.4791, "step": 7590 }, { "epoch": 0.8856110074626866, "grad_norm": 0.4213706332803066, "learning_rate": 6.5907783293366525e-06, "loss": 0.468, "step": 7595 }, { "epoch": 0.8861940298507462, "grad_norm": 0.4352892615766016, "learning_rate": 6.574795156971298e-06, "loss": 0.4843, "step": 7600 }, { "epoch": 0.886777052238806, "grad_norm": 0.45036633475522897, "learning_rate": 6.5588897728387055e-06, "loss": 0.4705, "step": 7605 }, { "epoch": 0.8873600746268657, "grad_norm": 0.5077288646140908, "learning_rate": 6.543062236066208e-06, "loss": 0.4791, "step": 7610 }, { "epoch": 0.8879430970149254, "grad_norm": 0.4513500980150284, "learning_rate": 6.527312605491758e-06, "loss": 0.5178, "step": 7615 }, { "epoch": 0.8885261194029851, "grad_norm": 0.4274182074103898, "learning_rate": 6.5116409396636935e-06, "loss": 0.4626, "step": 7620 }, { "epoch": 0.8891091417910447, "grad_norm": 0.4667801241554799, "learning_rate": 6.496047296840513e-06, "loss": 0.5071, "step": 7625 }, { "epoch": 0.8896921641791045, "grad_norm": 0.4123902058374272, "learning_rate": 6.480531734990686e-06, "loss": 0.4992, "step": 7630 }, { "epoch": 0.8902751865671642, "grad_norm": 0.47897565830736566, "learning_rate": 6.4650943117924065e-06, "loss": 0.5153, "step": 7635 }, { "epoch": 0.8908582089552238, "grad_norm": 0.47530139972165525, "learning_rate": 6.449735084633407e-06, "loss": 0.4857, "step": 7640 }, { "epoch": 0.8914412313432836, "grad_norm": 0.4311055331542708, "learning_rate": 6.4344541106107046e-06, "loss": 0.4877, "step": 7645 }, { "epoch": 0.8920242537313433, "grad_norm": 0.4668273679980216, "learning_rate": 6.419251446530451e-06, "loss": 0.5164, "step": 7650 }, { "epoch": 0.8926072761194029, "grad_norm": 0.44140094368731336, "learning_rate": 6.404127148907656e-06, "loss": 0.4784, "step": 7655 }, { "epoch": 0.8931902985074627, "grad_norm": 0.4544488440695652, "learning_rate": 6.38908127396602e-06, "loss": 0.4988, "step": 7660 }, { "epoch": 0.8937733208955224, "grad_norm": 0.4557410652922852, "learning_rate": 6.374113877637701e-06, "loss": 0.4987, "step": 7665 }, { "epoch": 0.894356343283582, "grad_norm": 0.4444789436043567, "learning_rate": 6.359225015563138e-06, "loss": 0.5243, "step": 7670 }, { "epoch": 0.8949393656716418, "grad_norm": 0.4698237483504685, "learning_rate": 6.3444147430908015e-06, "loss": 0.4691, "step": 7675 }, { "epoch": 0.8955223880597015, "grad_norm": 0.4859998755530479, "learning_rate": 6.329683115277018e-06, "loss": 0.4916, "step": 7680 }, { "epoch": 0.8961054104477612, "grad_norm": 0.515441325427795, "learning_rate": 6.315030186885763e-06, "loss": 0.5116, "step": 7685 }, { "epoch": 0.8966884328358209, "grad_norm": 0.5283029015600534, "learning_rate": 6.300456012388446e-06, "loss": 0.4934, "step": 7690 }, { "epoch": 0.8972714552238806, "grad_norm": 0.4417646004083893, "learning_rate": 6.285960645963708e-06, "loss": 0.5026, "step": 7695 }, { "epoch": 0.8978544776119403, "grad_norm": 0.4605167879082956, "learning_rate": 6.271544141497232e-06, "loss": 0.4901, "step": 7700 }, { "epoch": 0.8984375, "grad_norm": 0.4792028373581772, "learning_rate": 6.257206552581541e-06, "loss": 0.5118, "step": 7705 }, { "epoch": 0.8990205223880597, "grad_norm": 0.4644526939230792, "learning_rate": 6.242947932515786e-06, "loss": 0.5282, "step": 7710 }, { "epoch": 0.8996035447761194, "grad_norm": 0.5046663803144333, "learning_rate": 6.228768334305555e-06, "loss": 0.4946, "step": 7715 }, { "epoch": 0.9001865671641791, "grad_norm": 0.43155084823278395, "learning_rate": 6.214667810662682e-06, "loss": 0.4794, "step": 7720 }, { "epoch": 0.9007695895522388, "grad_norm": 0.4640789279302754, "learning_rate": 6.200646414005046e-06, "loss": 0.5239, "step": 7725 }, { "epoch": 0.9013526119402985, "grad_norm": 0.45218047321306765, "learning_rate": 6.18670419645637e-06, "loss": 0.5015, "step": 7730 }, { "epoch": 0.9019356343283582, "grad_norm": 0.4588478078182171, "learning_rate": 6.172841209846046e-06, "loss": 0.4913, "step": 7735 }, { "epoch": 0.902518656716418, "grad_norm": 0.4433138530597733, "learning_rate": 6.159057505708912e-06, "loss": 0.4594, "step": 7740 }, { "epoch": 0.9031016791044776, "grad_norm": 0.42245796632484117, "learning_rate": 6.145353135285091e-06, "loss": 0.4945, "step": 7745 }, { "epoch": 0.9036847014925373, "grad_norm": 0.4756088628902291, "learning_rate": 6.131728149519778e-06, "loss": 0.4932, "step": 7750 }, { "epoch": 0.9042677238805971, "grad_norm": 0.4597129158786207, "learning_rate": 6.118182599063075e-06, "loss": 0.5354, "step": 7755 }, { "epoch": 0.9048507462686567, "grad_norm": 0.503389587802853, "learning_rate": 6.104716534269772e-06, "loss": 0.5077, "step": 7760 }, { "epoch": 0.9054337686567164, "grad_norm": 0.4249365343444884, "learning_rate": 6.091330005199183e-06, "loss": 0.4868, "step": 7765 }, { "epoch": 0.9060167910447762, "grad_norm": 0.43851878310292347, "learning_rate": 6.078023061614953e-06, "loss": 0.5015, "step": 7770 }, { "epoch": 0.9065998134328358, "grad_norm": 0.43139132012318054, "learning_rate": 6.064795752984875e-06, "loss": 0.4832, "step": 7775 }, { "epoch": 0.9071828358208955, "grad_norm": 0.45409545061847195, "learning_rate": 6.0516481284806885e-06, "loss": 0.4794, "step": 7780 }, { "epoch": 0.9077658582089553, "grad_norm": 0.4411595137434583, "learning_rate": 6.0385802369779385e-06, "loss": 0.5183, "step": 7785 }, { "epoch": 0.9083488805970149, "grad_norm": 0.45410062982495236, "learning_rate": 6.025592127055741e-06, "loss": 0.4736, "step": 7790 }, { "epoch": 0.9089319029850746, "grad_norm": 0.42835954681482424, "learning_rate": 6.012683846996645e-06, "loss": 0.4541, "step": 7795 }, { "epoch": 0.9095149253731343, "grad_norm": 0.42047696751886715, "learning_rate": 5.999855444786425e-06, "loss": 0.4973, "step": 7800 }, { "epoch": 0.910097947761194, "grad_norm": 0.44914336580430964, "learning_rate": 5.987106968113928e-06, "loss": 0.5061, "step": 7805 }, { "epoch": 0.9106809701492538, "grad_norm": 0.460206776855078, "learning_rate": 5.974438464370872e-06, "loss": 0.4888, "step": 7810 }, { "epoch": 0.9112639925373134, "grad_norm": 0.4470862201000093, "learning_rate": 5.961849980651684e-06, "loss": 0.4659, "step": 7815 }, { "epoch": 0.9118470149253731, "grad_norm": 0.47720200193566115, "learning_rate": 5.949341563753319e-06, "loss": 0.4992, "step": 7820 }, { "epoch": 0.9124300373134329, "grad_norm": 0.4676169159668866, "learning_rate": 5.936913260175094e-06, "loss": 0.5179, "step": 7825 }, { "epoch": 0.9130130597014925, "grad_norm": 0.4584704293118159, "learning_rate": 5.924565116118499e-06, "loss": 0.4797, "step": 7830 }, { "epoch": 0.9135960820895522, "grad_norm": 0.4776754416693919, "learning_rate": 5.9122971774870435e-06, "loss": 0.4678, "step": 7835 }, { "epoch": 0.914179104477612, "grad_norm": 0.42982811564868484, "learning_rate": 5.900109489886081e-06, "loss": 0.4804, "step": 7840 }, { "epoch": 0.9147621268656716, "grad_norm": 0.41729830382315986, "learning_rate": 5.8880020986226285e-06, "loss": 0.468, "step": 7845 }, { "epoch": 0.9153451492537313, "grad_norm": 0.4582133802824904, "learning_rate": 5.875975048705206e-06, "loss": 0.4939, "step": 7850 }, { "epoch": 0.9159281716417911, "grad_norm": 0.4934433745443256, "learning_rate": 5.864028384843678e-06, "loss": 0.512, "step": 7855 }, { "epoch": 0.9165111940298507, "grad_norm": 0.4461718822871272, "learning_rate": 5.8521621514490715e-06, "loss": 0.5226, "step": 7860 }, { "epoch": 0.9170942164179104, "grad_norm": 0.43580261924917685, "learning_rate": 5.8403763926334146e-06, "loss": 0.4712, "step": 7865 }, { "epoch": 0.9176772388059702, "grad_norm": 0.4289025060075553, "learning_rate": 5.82867115220959e-06, "loss": 0.4857, "step": 7870 }, { "epoch": 0.9182602611940298, "grad_norm": 0.4471506171646922, "learning_rate": 5.81704647369114e-06, "loss": 0.5177, "step": 7875 }, { "epoch": 0.9188432835820896, "grad_norm": 0.4349776664841292, "learning_rate": 5.805502400292137e-06, "loss": 0.4925, "step": 7880 }, { "epoch": 0.9194263059701493, "grad_norm": 0.4524645377436028, "learning_rate": 5.794038974926995e-06, "loss": 0.4807, "step": 7885 }, { "epoch": 0.9200093283582089, "grad_norm": 0.45618145888853556, "learning_rate": 5.782656240210343e-06, "loss": 0.5078, "step": 7890 }, { "epoch": 0.9205923507462687, "grad_norm": 0.4316155039975515, "learning_rate": 5.771354238456828e-06, "loss": 0.4795, "step": 7895 }, { "epoch": 0.9211753731343284, "grad_norm": 0.46008183460748864, "learning_rate": 5.760133011680985e-06, "loss": 0.4788, "step": 7900 }, { "epoch": 0.921758395522388, "grad_norm": 0.44423836815839907, "learning_rate": 5.748992601597076e-06, "loss": 0.4949, "step": 7905 }, { "epoch": 0.9223414179104478, "grad_norm": 0.45890602021310334, "learning_rate": 5.737933049618925e-06, "loss": 0.4811, "step": 7910 }, { "epoch": 0.9229244402985075, "grad_norm": 0.424972332822205, "learning_rate": 5.726954396859773e-06, "loss": 0.4854, "step": 7915 }, { "epoch": 0.9235074626865671, "grad_norm": 0.42910016132492174, "learning_rate": 5.7160566841321255e-06, "loss": 0.4512, "step": 7920 }, { "epoch": 0.9240904850746269, "grad_norm": 0.46334797979450737, "learning_rate": 5.705239951947597e-06, "loss": 0.4834, "step": 7925 }, { "epoch": 0.9246735074626866, "grad_norm": 0.4603535009443525, "learning_rate": 5.694504240516759e-06, "loss": 0.4837, "step": 7930 }, { "epoch": 0.9252565298507462, "grad_norm": 0.4662682403880509, "learning_rate": 5.683849589748994e-06, "loss": 0.4823, "step": 7935 }, { "epoch": 0.925839552238806, "grad_norm": 0.44330683887009237, "learning_rate": 5.673276039252347e-06, "loss": 0.5095, "step": 7940 }, { "epoch": 0.9264225746268657, "grad_norm": 0.43601918566724357, "learning_rate": 5.662783628333379e-06, "loss": 0.5165, "step": 7945 }, { "epoch": 0.9270055970149254, "grad_norm": 0.41717748530531923, "learning_rate": 5.652372395997015e-06, "loss": 0.4911, "step": 7950 }, { "epoch": 0.9275886194029851, "grad_norm": 2.401919501282761, "learning_rate": 5.642042380946412e-06, "loss": 0.4687, "step": 7955 }, { "epoch": 0.9281716417910447, "grad_norm": 0.41624865614532475, "learning_rate": 5.631793621582793e-06, "loss": 0.4945, "step": 7960 }, { "epoch": 0.9287546641791045, "grad_norm": 0.5068445026903259, "learning_rate": 5.621626156005335e-06, "loss": 0.4786, "step": 7965 }, { "epoch": 0.9293376865671642, "grad_norm": 0.45630828056750805, "learning_rate": 5.611540022010985e-06, "loss": 0.4951, "step": 7970 }, { "epoch": 0.9299207089552238, "grad_norm": 0.4956233980595794, "learning_rate": 5.6015352570943755e-06, "loss": 0.4843, "step": 7975 }, { "epoch": 0.9305037313432836, "grad_norm": 0.4329791152614355, "learning_rate": 5.591611898447632e-06, "loss": 0.4634, "step": 7980 }, { "epoch": 0.9310867537313433, "grad_norm": 0.4663064517760106, "learning_rate": 5.581769982960261e-06, "loss": 0.5264, "step": 7985 }, { "epoch": 0.9316697761194029, "grad_norm": 0.4836205261061597, "learning_rate": 5.572009547219013e-06, "loss": 0.5156, "step": 7990 }, { "epoch": 0.9322527985074627, "grad_norm": 0.4277290525866144, "learning_rate": 5.5623306275077475e-06, "loss": 0.481, "step": 7995 }, { "epoch": 0.9328358208955224, "grad_norm": 0.43341519970296755, "learning_rate": 5.552733259807276e-06, "loss": 0.498, "step": 8000 }, { "epoch": 0.933418843283582, "grad_norm": 0.43071532419905234, "learning_rate": 5.543217479795256e-06, "loss": 0.4958, "step": 8005 }, { "epoch": 0.9340018656716418, "grad_norm": 0.46622528251802137, "learning_rate": 5.533783322846053e-06, "loss": 0.4814, "step": 8010 }, { "epoch": 0.9345848880597015, "grad_norm": 0.5249920187972582, "learning_rate": 5.524430824030594e-06, "loss": 0.4907, "step": 8015 }, { "epoch": 0.9351679104477612, "grad_norm": 0.45135919668531826, "learning_rate": 5.515160018116247e-06, "loss": 0.492, "step": 8020 }, { "epoch": 0.9357509328358209, "grad_norm": 0.4928051727444886, "learning_rate": 5.505970939566699e-06, "loss": 0.5035, "step": 8025 }, { "epoch": 0.9363339552238806, "grad_norm": 0.44235862805439663, "learning_rate": 5.4968636225418125e-06, "loss": 0.4783, "step": 8030 }, { "epoch": 0.9369169776119403, "grad_norm": 0.41958300409071425, "learning_rate": 5.487838100897508e-06, "loss": 0.4789, "step": 8035 }, { "epoch": 0.9375, "grad_norm": 0.4570459794031607, "learning_rate": 5.478894408185641e-06, "loss": 0.4661, "step": 8040 }, { "epoch": 0.9380830223880597, "grad_norm": 0.4783472459507973, "learning_rate": 5.470032577653869e-06, "loss": 0.4758, "step": 8045 }, { "epoch": 0.9386660447761194, "grad_norm": 0.4549507857184879, "learning_rate": 5.4612526422455265e-06, "loss": 0.4961, "step": 8050 }, { "epoch": 0.9392490671641791, "grad_norm": 0.4650558485006105, "learning_rate": 5.452554634599519e-06, "loss": 0.4891, "step": 8055 }, { "epoch": 0.9398320895522388, "grad_norm": 0.580533723917234, "learning_rate": 5.443938587050186e-06, "loss": 0.5172, "step": 8060 }, { "epoch": 0.9404151119402985, "grad_norm": 0.42296595646423346, "learning_rate": 5.435404531627176e-06, "loss": 0.4853, "step": 8065 }, { "epoch": 0.9409981343283582, "grad_norm": 0.4573887758625825, "learning_rate": 5.426952500055348e-06, "loss": 0.4614, "step": 8070 }, { "epoch": 0.941581156716418, "grad_norm": 0.4745039842823071, "learning_rate": 5.41858252375464e-06, "loss": 0.5061, "step": 8075 }, { "epoch": 0.9421641791044776, "grad_norm": 0.47805447519039157, "learning_rate": 5.410294633839949e-06, "loss": 0.4735, "step": 8080 }, { "epoch": 0.9427472014925373, "grad_norm": 0.43867449428861033, "learning_rate": 5.402088861121025e-06, "loss": 0.4718, "step": 8085 }, { "epoch": 0.9433302238805971, "grad_norm": 0.4467252451572471, "learning_rate": 5.393965236102353e-06, "loss": 0.4798, "step": 8090 }, { "epoch": 0.9439132462686567, "grad_norm": 0.5133360727759508, "learning_rate": 5.385923788983034e-06, "loss": 0.4894, "step": 8095 }, { "epoch": 0.9444962686567164, "grad_norm": 0.4747210863179483, "learning_rate": 5.377964549656685e-06, "loss": 0.5098, "step": 8100 }, { "epoch": 0.9450792910447762, "grad_norm": 0.44347953398302503, "learning_rate": 5.370087547711307e-06, "loss": 0.5105, "step": 8105 }, { "epoch": 0.9456623134328358, "grad_norm": 0.4387472897639345, "learning_rate": 5.362292812429207e-06, "loss": 0.4815, "step": 8110 }, { "epoch": 0.9462453358208955, "grad_norm": 0.4722026626305582, "learning_rate": 5.354580372786854e-06, "loss": 0.4776, "step": 8115 }, { "epoch": 0.9468283582089553, "grad_norm": 0.4566856777345421, "learning_rate": 5.346950257454792e-06, "loss": 0.5002, "step": 8120 }, { "epoch": 0.9474113805970149, "grad_norm": 0.4601768931007766, "learning_rate": 5.339402494797539e-06, "loss": 0.4725, "step": 8125 }, { "epoch": 0.9479944029850746, "grad_norm": 0.4422563608916944, "learning_rate": 5.331937112873462e-06, "loss": 0.4411, "step": 8130 }, { "epoch": 0.9485774253731343, "grad_norm": 0.44855925098469346, "learning_rate": 5.324554139434679e-06, "loss": 0.4941, "step": 8135 }, { "epoch": 0.949160447761194, "grad_norm": 0.4437725309777671, "learning_rate": 5.317253601926967e-06, "loss": 0.4836, "step": 8140 }, { "epoch": 0.9497434701492538, "grad_norm": 0.4488409563199269, "learning_rate": 5.310035527489651e-06, "loss": 0.4958, "step": 8145 }, { "epoch": 0.9503264925373134, "grad_norm": 0.46309779837786297, "learning_rate": 5.3028999429555045e-06, "loss": 0.493, "step": 8150 }, { "epoch": 0.9509095149253731, "grad_norm": 0.45673001540720787, "learning_rate": 5.295846874850646e-06, "loss": 0.4945, "step": 8155 }, { "epoch": 0.9514925373134329, "grad_norm": 0.4481344375463158, "learning_rate": 5.288876349394448e-06, "loss": 0.4957, "step": 8160 }, { "epoch": 0.9520755597014925, "grad_norm": 0.45392704978409076, "learning_rate": 5.281988392499431e-06, "loss": 0.4721, "step": 8165 }, { "epoch": 0.9526585820895522, "grad_norm": 0.45349195761765704, "learning_rate": 5.275183029771177e-06, "loss": 0.4741, "step": 8170 }, { "epoch": 0.953241604477612, "grad_norm": 0.4411901617730967, "learning_rate": 5.2684602865082255e-06, "loss": 0.4966, "step": 8175 }, { "epoch": 0.9538246268656716, "grad_norm": 0.4585915951637137, "learning_rate": 5.261820187701984e-06, "loss": 0.5453, "step": 8180 }, { "epoch": 0.9544076492537313, "grad_norm": 0.4593205732104069, "learning_rate": 5.2552627580366334e-06, "loss": 0.5013, "step": 8185 }, { "epoch": 0.9549906716417911, "grad_norm": 0.42592095115888917, "learning_rate": 5.248788021889036e-06, "loss": 0.4797, "step": 8190 }, { "epoch": 0.9555736940298507, "grad_norm": 0.46962819902389563, "learning_rate": 5.2423960033286505e-06, "loss": 0.4763, "step": 8195 }, { "epoch": 0.9561567164179104, "grad_norm": 0.40855238448922176, "learning_rate": 5.236086726117433e-06, "loss": 0.4743, "step": 8200 }, { "epoch": 0.9567397388059702, "grad_norm": 0.43575271699567064, "learning_rate": 5.229860213709753e-06, "loss": 0.4773, "step": 8205 }, { "epoch": 0.9573227611940298, "grad_norm": 0.4688749371758774, "learning_rate": 5.223716489252311e-06, "loss": 0.5166, "step": 8210 }, { "epoch": 0.9579057835820896, "grad_norm": 0.7670542201127664, "learning_rate": 5.217655575584045e-06, "loss": 0.493, "step": 8215 }, { "epoch": 0.9584888059701493, "grad_norm": 0.46421302891934385, "learning_rate": 5.211677495236046e-06, "loss": 0.5096, "step": 8220 }, { "epoch": 0.9590718283582089, "grad_norm": 0.4771862479599736, "learning_rate": 5.205782270431484e-06, "loss": 0.4693, "step": 8225 }, { "epoch": 0.9596548507462687, "grad_norm": 0.44226860049129957, "learning_rate": 5.199969923085515e-06, "loss": 0.4459, "step": 8230 }, { "epoch": 0.9602378731343284, "grad_norm": 0.4294703525768145, "learning_rate": 5.194240474805201e-06, "loss": 0.481, "step": 8235 }, { "epoch": 0.960820895522388, "grad_norm": 0.46086183427389743, "learning_rate": 5.188593946889429e-06, "loss": 0.4973, "step": 8240 }, { "epoch": 0.9614039179104478, "grad_norm": 0.4280169626175964, "learning_rate": 5.183030360328846e-06, "loss": 0.4698, "step": 8245 }, { "epoch": 0.9619869402985075, "grad_norm": 0.4753252130786709, "learning_rate": 5.177549735805758e-06, "loss": 0.5015, "step": 8250 }, { "epoch": 0.9625699626865671, "grad_norm": 0.4620963248632002, "learning_rate": 5.172152093694067e-06, "loss": 0.4802, "step": 8255 }, { "epoch": 0.9631529850746269, "grad_norm": 0.5332881434749811, "learning_rate": 5.166837454059193e-06, "loss": 0.5106, "step": 8260 }, { "epoch": 0.9637360074626866, "grad_norm": 0.5025697757795641, "learning_rate": 5.161605836658004e-06, "loss": 0.4986, "step": 8265 }, { "epoch": 0.9643190298507462, "grad_norm": 0.46525870817249937, "learning_rate": 5.156457260938732e-06, "loss": 0.4925, "step": 8270 }, { "epoch": 0.964902052238806, "grad_norm": 0.4466358224458004, "learning_rate": 5.151391746040905e-06, "loss": 0.4896, "step": 8275 }, { "epoch": 0.9654850746268657, "grad_norm": 0.4722232462608345, "learning_rate": 5.146409310795282e-06, "loss": 0.5116, "step": 8280 }, { "epoch": 0.9660680970149254, "grad_norm": 0.4839383338646517, "learning_rate": 5.14150997372378e-06, "loss": 0.4704, "step": 8285 }, { "epoch": 0.9666511194029851, "grad_norm": 0.4598082477182381, "learning_rate": 5.1366937530393955e-06, "loss": 0.4675, "step": 8290 }, { "epoch": 0.9672341417910447, "grad_norm": 0.4571533268091363, "learning_rate": 5.131960666646149e-06, "loss": 0.4886, "step": 8295 }, { "epoch": 0.9678171641791045, "grad_norm": 0.4826350517333677, "learning_rate": 5.127310732139018e-06, "loss": 0.5239, "step": 8300 }, { "epoch": 0.9684001865671642, "grad_norm": 0.501035586719562, "learning_rate": 5.122743966803858e-06, "loss": 0.4845, "step": 8305 }, { "epoch": 0.9689832089552238, "grad_norm": 0.45689428681207833, "learning_rate": 5.118260387617359e-06, "loss": 0.4802, "step": 8310 }, { "epoch": 0.9695662313432836, "grad_norm": 0.41839841626319924, "learning_rate": 5.113860011246964e-06, "loss": 0.4759, "step": 8315 }, { "epoch": 0.9701492537313433, "grad_norm": 0.4623224498637685, "learning_rate": 5.109542854050814e-06, "loss": 0.5191, "step": 8320 }, { "epoch": 0.9707322761194029, "grad_norm": 0.4400774444581771, "learning_rate": 5.105308932077693e-06, "loss": 0.4751, "step": 8325 }, { "epoch": 0.9713152985074627, "grad_norm": 0.5458461815433695, "learning_rate": 5.101158261066959e-06, "loss": 0.4934, "step": 8330 }, { "epoch": 0.9718983208955224, "grad_norm": 0.45228504839890776, "learning_rate": 5.097090856448492e-06, "loss": 0.5007, "step": 8335 }, { "epoch": 0.972481343283582, "grad_norm": 0.467798541042544, "learning_rate": 5.0931067333426275e-06, "loss": 0.5082, "step": 8340 }, { "epoch": 0.9730643656716418, "grad_norm": 0.4266615826249801, "learning_rate": 5.0892059065601145e-06, "loss": 0.4974, "step": 8345 }, { "epoch": 0.9736473880597015, "grad_norm": 0.4621554435159516, "learning_rate": 5.0853883906020525e-06, "loss": 0.4965, "step": 8350 }, { "epoch": 0.9742304104477612, "grad_norm": 0.46073201489628657, "learning_rate": 5.081654199659831e-06, "loss": 0.5071, "step": 8355 }, { "epoch": 0.9748134328358209, "grad_norm": 0.5462060706132782, "learning_rate": 5.07800334761509e-06, "loss": 0.4938, "step": 8360 }, { "epoch": 0.9753964552238806, "grad_norm": 0.48250851908785264, "learning_rate": 5.074435848039658e-06, "loss": 0.5091, "step": 8365 }, { "epoch": 0.9759794776119403, "grad_norm": 0.44847740844622386, "learning_rate": 5.070951714195508e-06, "loss": 0.4957, "step": 8370 }, { "epoch": 0.9765625, "grad_norm": 0.4467178102848771, "learning_rate": 5.067550959034707e-06, "loss": 0.49, "step": 8375 }, { "epoch": 0.9771455223880597, "grad_norm": 0.46636700776310513, "learning_rate": 5.064233595199362e-06, "loss": 0.4884, "step": 8380 }, { "epoch": 0.9777285447761194, "grad_norm": 0.4615425374049883, "learning_rate": 5.060999635021583e-06, "loss": 0.4771, "step": 8385 }, { "epoch": 0.9783115671641791, "grad_norm": 0.45286825679857706, "learning_rate": 5.057849090523426e-06, "loss": 0.4959, "step": 8390 }, { "epoch": 0.9788945895522388, "grad_norm": 0.4569055516084358, "learning_rate": 5.054781973416858e-06, "loss": 0.4955, "step": 8395 }, { "epoch": 0.9794776119402985, "grad_norm": 0.4455395860822749, "learning_rate": 5.051798295103711e-06, "loss": 0.4752, "step": 8400 }, { "epoch": 0.9800606343283582, "grad_norm": 0.4183019666197293, "learning_rate": 5.048898066675631e-06, "loss": 0.4552, "step": 8405 }, { "epoch": 0.980643656716418, "grad_norm": 0.43323200908430703, "learning_rate": 5.046081298914053e-06, "loss": 0.5092, "step": 8410 }, { "epoch": 0.9812266791044776, "grad_norm": 0.46606214824527303, "learning_rate": 5.043348002290145e-06, "loss": 0.4639, "step": 8415 }, { "epoch": 0.9818097014925373, "grad_norm": 0.4620950107319596, "learning_rate": 5.0406981869647805e-06, "loss": 0.5072, "step": 8420 }, { "epoch": 0.9823927238805971, "grad_norm": 0.44313034547356717, "learning_rate": 5.038131862788491e-06, "loss": 0.4765, "step": 8425 }, { "epoch": 0.9829757462686567, "grad_norm": 0.44702546793927656, "learning_rate": 5.035649039301438e-06, "loss": 0.4612, "step": 8430 }, { "epoch": 0.9835587686567164, "grad_norm": 0.45947101038805505, "learning_rate": 5.033249725733377e-06, "loss": 0.4967, "step": 8435 }, { "epoch": 0.9841417910447762, "grad_norm": 0.42282849418109447, "learning_rate": 5.0309339310036125e-06, "loss": 0.507, "step": 8440 }, { "epoch": 0.9847248134328358, "grad_norm": 0.44692770292599415, "learning_rate": 5.02870166372098e-06, "loss": 0.4808, "step": 8445 }, { "epoch": 0.9853078358208955, "grad_norm": 0.5582818670824954, "learning_rate": 5.0265529321838004e-06, "loss": 0.5405, "step": 8450 }, { "epoch": 0.9858908582089553, "grad_norm": 0.4665732954055579, "learning_rate": 5.02448774437986e-06, "loss": 0.5157, "step": 8455 }, { "epoch": 0.9864738805970149, "grad_norm": 0.5202669052061701, "learning_rate": 5.022506107986374e-06, "loss": 0.5146, "step": 8460 }, { "epoch": 0.9870569029850746, "grad_norm": 0.4563878919577009, "learning_rate": 5.020608030369962e-06, "loss": 0.4614, "step": 8465 }, { "epoch": 0.9876399253731343, "grad_norm": 0.45761657701588687, "learning_rate": 5.018793518586616e-06, "loss": 0.5007, "step": 8470 }, { "epoch": 0.988222947761194, "grad_norm": 0.46136936074859125, "learning_rate": 5.017062579381676e-06, "loss": 0.5068, "step": 8475 }, { "epoch": 0.9888059701492538, "grad_norm": 0.48777910519871126, "learning_rate": 5.015415219189812e-06, "loss": 0.4889, "step": 8480 }, { "epoch": 0.9893889925373134, "grad_norm": 0.47084471147391416, "learning_rate": 5.013851444134987e-06, "loss": 0.5022, "step": 8485 }, { "epoch": 0.9899720149253731, "grad_norm": 0.432844624719787, "learning_rate": 5.012371260030445e-06, "loss": 0.491, "step": 8490 }, { "epoch": 0.9905550373134329, "grad_norm": 0.4649632477223049, "learning_rate": 5.010974672378682e-06, "loss": 0.4741, "step": 8495 }, { "epoch": 0.9911380597014925, "grad_norm": 0.4764372199483869, "learning_rate": 5.009661686371434e-06, "loss": 0.4772, "step": 8500 }, { "epoch": 0.9917210820895522, "grad_norm": 0.4868588188840104, "learning_rate": 5.008432306889652e-06, "loss": 0.5214, "step": 8505 }, { "epoch": 0.992304104477612, "grad_norm": 0.438360158275501, "learning_rate": 5.0072865385034785e-06, "loss": 0.4905, "step": 8510 }, { "epoch": 0.9928871268656716, "grad_norm": 0.48340344564902316, "learning_rate": 5.006224385472242e-06, "loss": 0.4927, "step": 8515 }, { "epoch": 0.9934701492537313, "grad_norm": 0.45531955111325645, "learning_rate": 5.0052458517444364e-06, "loss": 0.4888, "step": 8520 }, { "epoch": 0.9940531716417911, "grad_norm": 0.4362698593831642, "learning_rate": 5.004350940957703e-06, "loss": 0.4818, "step": 8525 }, { "epoch": 0.9946361940298507, "grad_norm": 0.46810128935885154, "learning_rate": 5.0035396564388184e-06, "loss": 0.5101, "step": 8530 }, { "epoch": 0.9952192164179104, "grad_norm": 0.8386528644620809, "learning_rate": 5.00281200120369e-06, "loss": 0.4791, "step": 8535 }, { "epoch": 0.9958022388059702, "grad_norm": 0.4834359055420526, "learning_rate": 5.00216797795733e-06, "loss": 0.4913, "step": 8540 }, { "epoch": 0.9963852611940298, "grad_norm": 0.4925810326491755, "learning_rate": 5.001607589093861e-06, "loss": 0.5076, "step": 8545 }, { "epoch": 0.9969682835820896, "grad_norm": 0.44173971825287517, "learning_rate": 5.001130836696491e-06, "loss": 0.4809, "step": 8550 }, { "epoch": 0.9975513059701493, "grad_norm": 0.48077397748529965, "learning_rate": 5.000737722537526e-06, "loss": 0.4974, "step": 8555 }, { "epoch": 0.9981343283582089, "grad_norm": 0.6138381620178532, "learning_rate": 5.00042824807834e-06, "loss": 0.4925, "step": 8560 }, { "epoch": 0.9987173507462687, "grad_norm": 0.43738249534157836, "learning_rate": 5.000202414469386e-06, "loss": 0.4573, "step": 8565 }, { "epoch": 0.9993003731343284, "grad_norm": 0.4562639975448534, "learning_rate": 5.0000602225501925e-06, "loss": 0.4941, "step": 8570 }, { "epoch": 0.999883395522388, "grad_norm": 0.4592908023435152, "learning_rate": 5.0000016728493425e-06, "loss": 0.4962, "step": 8575 }, { "epoch": 1.0, "step": 8576, "total_flos": 488578517827584.0, "train_loss": 0.5423156990836472, "train_runtime": 27783.4456, "train_samples_per_second": 1.235, "train_steps_per_second": 0.309 } ], "logging_steps": 5, "max_steps": 8576, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 488578517827584.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }