GENOME-gemma-2b-it / open_orca /trainer_state.json
Estwld's picture
Upload 15 files
60ee796 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.997163925127623,
"eval_steps": 1000,
"global_step": 4405,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011344299489506523,
"grad_norm": 2.3206300735473633,
"learning_rate": 4.535147392290249e-06,
"loss": 1.5929,
"step": 10
},
{
"epoch": 0.022688598979013045,
"grad_norm": 1.2386493682861328,
"learning_rate": 9.070294784580499e-06,
"loss": 1.6159,
"step": 20
},
{
"epoch": 0.03403289846851957,
"grad_norm": 1.1790252923965454,
"learning_rate": 1.360544217687075e-05,
"loss": 1.538,
"step": 30
},
{
"epoch": 0.04537719795802609,
"grad_norm": 1.021796703338623,
"learning_rate": 1.8140589569160997e-05,
"loss": 1.4132,
"step": 40
},
{
"epoch": 0.05672149744753262,
"grad_norm": 1.3392266035079956,
"learning_rate": 2.267573696145125e-05,
"loss": 1.2604,
"step": 50
},
{
"epoch": 0.06806579693703914,
"grad_norm": 0.9446895122528076,
"learning_rate": 2.72108843537415e-05,
"loss": 1.1644,
"step": 60
},
{
"epoch": 0.07941009642654566,
"grad_norm": 1.3290923833847046,
"learning_rate": 3.1746031746031745e-05,
"loss": 1.1082,
"step": 70
},
{
"epoch": 0.09075439591605218,
"grad_norm": 1.5161434412002563,
"learning_rate": 3.6281179138321995e-05,
"loss": 1.0389,
"step": 80
},
{
"epoch": 0.1020986954055587,
"grad_norm": 0.6483525633811951,
"learning_rate": 4.0816326530612245e-05,
"loss": 1.0542,
"step": 90
},
{
"epoch": 0.11344299489506524,
"grad_norm": 0.8814989924430847,
"learning_rate": 4.53514739229025e-05,
"loss": 0.9847,
"step": 100
},
{
"epoch": 0.12478729438457176,
"grad_norm": 0.7316718101501465,
"learning_rate": 4.9886621315192745e-05,
"loss": 1.0585,
"step": 110
},
{
"epoch": 0.13613159387407828,
"grad_norm": 0.7645348310470581,
"learning_rate": 5.4421768707483e-05,
"loss": 0.9713,
"step": 120
},
{
"epoch": 0.1474758933635848,
"grad_norm": 0.6830883622169495,
"learning_rate": 5.895691609977324e-05,
"loss": 0.9823,
"step": 130
},
{
"epoch": 0.15882019285309132,
"grad_norm": 1.3199207782745361,
"learning_rate": 6.349206349206349e-05,
"loss": 0.9992,
"step": 140
},
{
"epoch": 0.17016449234259784,
"grad_norm": 0.7770159840583801,
"learning_rate": 6.802721088435374e-05,
"loss": 1.0085,
"step": 150
},
{
"epoch": 0.18150879183210436,
"grad_norm": 1.623410940170288,
"learning_rate": 7.256235827664399e-05,
"loss": 1.0491,
"step": 160
},
{
"epoch": 0.19285309132161088,
"grad_norm": 2.8830106258392334,
"learning_rate": 7.709750566893424e-05,
"loss": 1.0686,
"step": 170
},
{
"epoch": 0.2041973908111174,
"grad_norm": 1.3428577184677124,
"learning_rate": 8.163265306122449e-05,
"loss": 1.0359,
"step": 180
},
{
"epoch": 0.21554169030062392,
"grad_norm": 0.8043076395988464,
"learning_rate": 8.616780045351474e-05,
"loss": 1.0496,
"step": 190
},
{
"epoch": 0.22688598979013047,
"grad_norm": 1.8799352645874023,
"learning_rate": 9.0702947845805e-05,
"loss": 1.0284,
"step": 200
},
{
"epoch": 0.238230289279637,
"grad_norm": 0.6667978167533875,
"learning_rate": 9.523809523809524e-05,
"loss": 1.0162,
"step": 210
},
{
"epoch": 0.2495745887691435,
"grad_norm": 0.815127968788147,
"learning_rate": 9.977324263038549e-05,
"loss": 1.0009,
"step": 220
},
{
"epoch": 0.26091888825865,
"grad_norm": 0.6558067798614502,
"learning_rate": 0.00010430839002267574,
"loss": 1.004,
"step": 230
},
{
"epoch": 0.27226318774815655,
"grad_norm": 0.6002511382102966,
"learning_rate": 0.000108843537414966,
"loss": 0.9702,
"step": 240
},
{
"epoch": 0.28360748723766305,
"grad_norm": 0.7007895708084106,
"learning_rate": 0.00011337868480725624,
"loss": 1.0266,
"step": 250
},
{
"epoch": 0.2949517867271696,
"grad_norm": 0.7985921502113342,
"learning_rate": 0.00011791383219954648,
"loss": 0.9753,
"step": 260
},
{
"epoch": 0.30629608621667614,
"grad_norm": 0.5343239903450012,
"learning_rate": 0.00012244897959183676,
"loss": 1.036,
"step": 270
},
{
"epoch": 0.31764038570618264,
"grad_norm": 0.7095124125480652,
"learning_rate": 0.00012698412698412698,
"loss": 1.0061,
"step": 280
},
{
"epoch": 0.3289846851956892,
"grad_norm": 0.8570685386657715,
"learning_rate": 0.00013151927437641726,
"loss": 0.9458,
"step": 290
},
{
"epoch": 0.3403289846851957,
"grad_norm": 0.6379779577255249,
"learning_rate": 0.00013605442176870748,
"loss": 0.9965,
"step": 300
},
{
"epoch": 0.3516732841747022,
"grad_norm": 0.9263567328453064,
"learning_rate": 0.00014058956916099776,
"loss": 0.9601,
"step": 310
},
{
"epoch": 0.3630175836642087,
"grad_norm": 0.7343761920928955,
"learning_rate": 0.00014512471655328798,
"loss": 1.0182,
"step": 320
},
{
"epoch": 0.37436188315371527,
"grad_norm": 0.588762640953064,
"learning_rate": 0.00014965986394557826,
"loss": 0.9762,
"step": 330
},
{
"epoch": 0.38570618264322176,
"grad_norm": 0.6719630360603333,
"learning_rate": 0.00015419501133786848,
"loss": 0.989,
"step": 340
},
{
"epoch": 0.3970504821327283,
"grad_norm": 1.641836166381836,
"learning_rate": 0.00015873015873015873,
"loss": 0.9611,
"step": 350
},
{
"epoch": 0.4083947816222348,
"grad_norm": 0.9340532422065735,
"learning_rate": 0.00016326530612244898,
"loss": 0.9861,
"step": 360
},
{
"epoch": 0.41973908111174135,
"grad_norm": 0.737554669380188,
"learning_rate": 0.00016780045351473923,
"loss": 1.0,
"step": 370
},
{
"epoch": 0.43108338060124785,
"grad_norm": 1.1190237998962402,
"learning_rate": 0.00017233560090702948,
"loss": 1.016,
"step": 380
},
{
"epoch": 0.4424276800907544,
"grad_norm": 0.7501509785652161,
"learning_rate": 0.00017687074829931973,
"loss": 0.9743,
"step": 390
},
{
"epoch": 0.45377197958026094,
"grad_norm": 0.5105754733085632,
"learning_rate": 0.00018140589569161,
"loss": 1.0182,
"step": 400
},
{
"epoch": 0.46511627906976744,
"grad_norm": 0.7148075699806213,
"learning_rate": 0.00018594104308390023,
"loss": 0.9673,
"step": 410
},
{
"epoch": 0.476460578559274,
"grad_norm": 0.49944302439689636,
"learning_rate": 0.00019047619047619048,
"loss": 1.0083,
"step": 420
},
{
"epoch": 0.4878048780487805,
"grad_norm": 0.5624661445617676,
"learning_rate": 0.00019501133786848073,
"loss": 1.0201,
"step": 430
},
{
"epoch": 0.499149177538287,
"grad_norm": 0.5779452919960022,
"learning_rate": 0.00019954648526077098,
"loss": 1.0165,
"step": 440
},
{
"epoch": 0.5104934770277936,
"grad_norm": 0.8505494594573975,
"learning_rate": 0.0001999974561843451,
"loss": 0.9527,
"step": 450
},
{
"epoch": 0.5218377765173,
"grad_norm": 0.7141993641853333,
"learning_rate": 0.00019998866291366877,
"loss": 0.9927,
"step": 460
},
{
"epoch": 0.5331820760068066,
"grad_norm": 0.5913094282150269,
"learning_rate": 0.0001999735893350151,
"loss": 1.0054,
"step": 470
},
{
"epoch": 0.5445263754963131,
"grad_norm": 0.5813531279563904,
"learning_rate": 0.00019995223639515864,
"loss": 0.9511,
"step": 480
},
{
"epoch": 0.5558706749858197,
"grad_norm": 0.9083317518234253,
"learning_rate": 0.0001999246054352818,
"loss": 0.9596,
"step": 490
},
{
"epoch": 0.5672149744753261,
"grad_norm": 0.8444753885269165,
"learning_rate": 0.00019989069819089067,
"loss": 1.0163,
"step": 500
},
{
"epoch": 0.5785592739648326,
"grad_norm": 0.6896610856056213,
"learning_rate": 0.0001998505167917061,
"loss": 0.9606,
"step": 510
},
{
"epoch": 0.5899035734543392,
"grad_norm": 0.7446523308753967,
"learning_rate": 0.00019980406376152984,
"loss": 0.9748,
"step": 520
},
{
"epoch": 0.6012478729438457,
"grad_norm": 0.5111407041549683,
"learning_rate": 0.00019975134201808605,
"loss": 0.9364,
"step": 530
},
{
"epoch": 0.6125921724333523,
"grad_norm": 0.6797256469726562,
"learning_rate": 0.000199692354872838,
"loss": 0.9766,
"step": 540
},
{
"epoch": 0.6239364719228587,
"grad_norm": 0.9774245619773865,
"learning_rate": 0.00019962710603078007,
"loss": 0.9669,
"step": 550
},
{
"epoch": 0.6352807714123653,
"grad_norm": 0.7039481997489929,
"learning_rate": 0.0001995555995902052,
"loss": 0.9371,
"step": 560
},
{
"epoch": 0.6466250709018718,
"grad_norm": 0.7363829016685486,
"learning_rate": 0.0001994778400424472,
"loss": 0.9809,
"step": 570
},
{
"epoch": 0.6579693703913784,
"grad_norm": 0.7072857022285461,
"learning_rate": 0.0001993938322715989,
"loss": 0.9825,
"step": 580
},
{
"epoch": 0.6693136698808848,
"grad_norm": 0.5628974437713623,
"learning_rate": 0.00019930358155420525,
"loss": 0.9101,
"step": 590
},
{
"epoch": 0.6806579693703914,
"grad_norm": 0.6564317345619202,
"learning_rate": 0.0001992070935589319,
"loss": 1.0374,
"step": 600
},
{
"epoch": 0.6920022688598979,
"grad_norm": 0.5805884599685669,
"learning_rate": 0.0001991043743462092,
"loss": 0.9695,
"step": 610
},
{
"epoch": 0.7033465683494045,
"grad_norm": 0.5671830773353577,
"learning_rate": 0.00019899543036785145,
"loss": 0.9598,
"step": 620
},
{
"epoch": 0.7146908678389109,
"grad_norm": 0.54367595911026,
"learning_rate": 0.0001988802684666519,
"loss": 0.962,
"step": 630
},
{
"epoch": 0.7260351673284174,
"grad_norm": 0.6982467770576477,
"learning_rate": 0.00019875889587595252,
"loss": 0.9633,
"step": 640
},
{
"epoch": 0.737379466817924,
"grad_norm": 0.6268488764762878,
"learning_rate": 0.00019863132021919025,
"loss": 0.9684,
"step": 650
},
{
"epoch": 0.7487237663074305,
"grad_norm": 1.2111632823944092,
"learning_rate": 0.00019849754950941758,
"loss": 1.0044,
"step": 660
},
{
"epoch": 0.7600680657969371,
"grad_norm": 0.6442829370498657,
"learning_rate": 0.00019835759214879964,
"loss": 0.9533,
"step": 670
},
{
"epoch": 0.7714123652864435,
"grad_norm": 0.5263229608535767,
"learning_rate": 0.00019821145692808633,
"loss": 0.959,
"step": 680
},
{
"epoch": 0.7827566647759501,
"grad_norm": 0.572928786277771,
"learning_rate": 0.00019805915302606016,
"loss": 0.9473,
"step": 690
},
{
"epoch": 0.7941009642654566,
"grad_norm": 0.6176092624664307,
"learning_rate": 0.00019790069000895987,
"loss": 0.9164,
"step": 700
},
{
"epoch": 0.8054452637549632,
"grad_norm": 0.5628384351730347,
"learning_rate": 0.00019773607782987924,
"loss": 0.9705,
"step": 710
},
{
"epoch": 0.8167895632444696,
"grad_norm": 0.8331648111343384,
"learning_rate": 0.00019756532682814232,
"loss": 0.9497,
"step": 720
},
{
"epoch": 0.8281338627339762,
"grad_norm": 0.5843848586082458,
"learning_rate": 0.00019738844772865377,
"loss": 0.9828,
"step": 730
},
{
"epoch": 0.8394781622234827,
"grad_norm": 0.6603434681892395,
"learning_rate": 0.0001972054516412253,
"loss": 0.9717,
"step": 740
},
{
"epoch": 0.8508224617129893,
"grad_norm": 0.5622076988220215,
"learning_rate": 0.00019701635005987792,
"loss": 0.9392,
"step": 750
},
{
"epoch": 0.8621667612024957,
"grad_norm": 0.8947564959526062,
"learning_rate": 0.00019682115486211984,
"loss": 0.9917,
"step": 760
},
{
"epoch": 0.8735110606920022,
"grad_norm": 0.5935038328170776,
"learning_rate": 0.00019661987830820065,
"loss": 0.9749,
"step": 770
},
{
"epoch": 0.8848553601815088,
"grad_norm": 0.8751797676086426,
"learning_rate": 0.000196412533040341,
"loss": 0.9828,
"step": 780
},
{
"epoch": 0.8961996596710153,
"grad_norm": 0.5279515981674194,
"learning_rate": 0.00019619913208193882,
"loss": 0.9685,
"step": 790
},
{
"epoch": 0.9075439591605219,
"grad_norm": 0.643695056438446,
"learning_rate": 0.00019597968883675116,
"loss": 0.9547,
"step": 800
},
{
"epoch": 0.9188882586500283,
"grad_norm": 0.7370747923851013,
"learning_rate": 0.00019575421708805215,
"loss": 0.9129,
"step": 810
},
{
"epoch": 0.9302325581395349,
"grad_norm": 0.7514728307723999,
"learning_rate": 0.0001955227309977677,
"loss": 0.9929,
"step": 820
},
{
"epoch": 0.9415768576290414,
"grad_norm": 0.6589088439941406,
"learning_rate": 0.00019528524510558547,
"loss": 0.9627,
"step": 830
},
{
"epoch": 0.952921157118548,
"grad_norm": 0.548102617263794,
"learning_rate": 0.00019504177432804203,
"loss": 0.9307,
"step": 840
},
{
"epoch": 0.9642654566080544,
"grad_norm": 0.458879679441452,
"learning_rate": 0.00019479233395758576,
"loss": 0.9838,
"step": 850
},
{
"epoch": 0.975609756097561,
"grad_norm": 0.9955594539642334,
"learning_rate": 0.0001945369396616164,
"loss": 0.9246,
"step": 860
},
{
"epoch": 0.9869540555870675,
"grad_norm": 0.5781052708625793,
"learning_rate": 0.0001942756074815009,
"loss": 1.0076,
"step": 870
},
{
"epoch": 0.998298355076574,
"grad_norm": 0.7370733022689819,
"learning_rate": 0.00019400835383156592,
"loss": 0.9618,
"step": 880
},
{
"epoch": 1.0096426545660806,
"grad_norm": 0.6173350214958191,
"learning_rate": 0.00019373519549806682,
"loss": 0.872,
"step": 890
},
{
"epoch": 1.0209869540555871,
"grad_norm": 0.6110262274742126,
"learning_rate": 0.00019345614963813334,
"loss": 0.8953,
"step": 900
},
{
"epoch": 1.0323312535450937,
"grad_norm": 0.8880902528762817,
"learning_rate": 0.00019317123377869192,
"loss": 0.8847,
"step": 910
},
{
"epoch": 1.0436755530346,
"grad_norm": 0.6907595992088318,
"learning_rate": 0.00019288046581536486,
"loss": 0.8878,
"step": 920
},
{
"epoch": 1.0550198525241066,
"grad_norm": 0.7469139695167542,
"learning_rate": 0.00019258386401134624,
"loss": 0.9018,
"step": 930
},
{
"epoch": 1.0663641520136131,
"grad_norm": 0.8650104403495789,
"learning_rate": 0.0001922814469962549,
"loss": 0.8825,
"step": 940
},
{
"epoch": 1.0777084515031197,
"grad_norm": 1.1437135934829712,
"learning_rate": 0.00019197323376496427,
"loss": 0.8977,
"step": 950
},
{
"epoch": 1.0890527509926262,
"grad_norm": 0.6191611289978027,
"learning_rate": 0.00019165924367640916,
"loss": 0.9059,
"step": 960
},
{
"epoch": 1.1003970504821328,
"grad_norm": 0.7402692437171936,
"learning_rate": 0.00019133949645237005,
"loss": 0.8778,
"step": 970
},
{
"epoch": 1.1117413499716393,
"grad_norm": 0.7002813220024109,
"learning_rate": 0.00019101401217623426,
"loss": 0.9281,
"step": 980
},
{
"epoch": 1.1230856494611459,
"grad_norm": 0.9000174403190613,
"learning_rate": 0.00019068281129173444,
"loss": 0.8795,
"step": 990
},
{
"epoch": 1.1344299489506522,
"grad_norm": 0.6749204993247986,
"learning_rate": 0.00019034591460166463,
"loss": 0.9091,
"step": 1000
},
{
"epoch": 1.1344299489506522,
"eval_loss": 0.8940885663032532,
"eval_runtime": 15.7869,
"eval_samples_per_second": 94.065,
"eval_steps_per_second": 11.782,
"step": 1000
},
{
"epoch": 1.1457742484401587,
"grad_norm": 0.7294667959213257,
"learning_rate": 0.00019000334326657345,
"loss": 0.879,
"step": 1010
},
{
"epoch": 1.1571185479296653,
"grad_norm": 0.9591787457466125,
"learning_rate": 0.00018965511880343527,
"loss": 0.9264,
"step": 1020
},
{
"epoch": 1.1684628474191718,
"grad_norm": 0.9575808644294739,
"learning_rate": 0.00018930126308429844,
"loss": 0.8825,
"step": 1030
},
{
"epoch": 1.1798071469086784,
"grad_norm": 0.49267736077308655,
"learning_rate": 0.00018894179833491164,
"loss": 0.9321,
"step": 1040
},
{
"epoch": 1.191151446398185,
"grad_norm": 0.848102867603302,
"learning_rate": 0.00018857674713332795,
"loss": 0.8543,
"step": 1050
},
{
"epoch": 1.2024957458876915,
"grad_norm": 0.7710912227630615,
"learning_rate": 0.00018820613240848655,
"loss": 0.9468,
"step": 1060
},
{
"epoch": 1.213840045377198,
"grad_norm": 0.6399308443069458,
"learning_rate": 0.00018782997743877264,
"loss": 0.9081,
"step": 1070
},
{
"epoch": 1.2251843448667046,
"grad_norm": 0.9124737977981567,
"learning_rate": 0.00018744830585055538,
"loss": 0.9288,
"step": 1080
},
{
"epoch": 1.236528644356211,
"grad_norm": 0.6313666105270386,
"learning_rate": 0.00018706114161670377,
"loss": 0.8197,
"step": 1090
},
{
"epoch": 1.2478729438457175,
"grad_norm": 0.7220073938369751,
"learning_rate": 0.000186668509055081,
"loss": 0.8576,
"step": 1100
},
{
"epoch": 1.259217243335224,
"grad_norm": 1.1808422803878784,
"learning_rate": 0.00018627043282701703,
"loss": 0.9044,
"step": 1110
},
{
"epoch": 1.2705615428247305,
"grad_norm": 0.6578934788703918,
"learning_rate": 0.00018586693793575966,
"loss": 0.9015,
"step": 1120
},
{
"epoch": 1.281905842314237,
"grad_norm": 0.9080325961112976,
"learning_rate": 0.0001854580497249039,
"loss": 0.8919,
"step": 1130
},
{
"epoch": 1.2932501418037436,
"grad_norm": 0.6446923017501831,
"learning_rate": 0.00018504379387680034,
"loss": 0.9033,
"step": 1140
},
{
"epoch": 1.3045944412932502,
"grad_norm": 0.6877492070198059,
"learning_rate": 0.00018462419641094189,
"loss": 0.8843,
"step": 1150
},
{
"epoch": 1.3159387407827567,
"grad_norm": 0.6565636396408081,
"learning_rate": 0.00018419928368232957,
"loss": 0.8925,
"step": 1160
},
{
"epoch": 1.3272830402722633,
"grad_norm": 0.8198230862617493,
"learning_rate": 0.0001837690823798171,
"loss": 0.8495,
"step": 1170
},
{
"epoch": 1.3386273397617696,
"grad_norm": 0.7579399347305298,
"learning_rate": 0.00018333361952443462,
"loss": 0.9051,
"step": 1180
},
{
"epoch": 1.3499716392512762,
"grad_norm": 0.8067922592163086,
"learning_rate": 0.0001828929224676914,
"loss": 0.8677,
"step": 1190
},
{
"epoch": 1.3613159387407827,
"grad_norm": 0.7077610492706299,
"learning_rate": 0.00018244701888985802,
"loss": 0.942,
"step": 1200
},
{
"epoch": 1.3726602382302893,
"grad_norm": 1.2009291648864746,
"learning_rate": 0.00018199593679822765,
"loss": 0.9034,
"step": 1210
},
{
"epoch": 1.3840045377197958,
"grad_norm": 0.8162534832954407,
"learning_rate": 0.00018153970452535698,
"loss": 0.8904,
"step": 1220
},
{
"epoch": 1.3953488372093024,
"grad_norm": 0.6332406401634216,
"learning_rate": 0.00018107835072728656,
"loss": 0.8637,
"step": 1230
},
{
"epoch": 1.406693136698809,
"grad_norm": 0.6449089050292969,
"learning_rate": 0.00018061190438174105,
"loss": 0.9463,
"step": 1240
},
{
"epoch": 1.4180374361883152,
"grad_norm": 0.6543394327163696,
"learning_rate": 0.00018014039478630894,
"loss": 0.8497,
"step": 1250
},
{
"epoch": 1.429381735677822,
"grad_norm": 0.7993437647819519,
"learning_rate": 0.0001796638515566025,
"loss": 0.9415,
"step": 1260
},
{
"epoch": 1.4407260351673283,
"grad_norm": 0.878514289855957,
"learning_rate": 0.0001791823046243977,
"loss": 0.9143,
"step": 1270
},
{
"epoch": 1.4520703346568349,
"grad_norm": 0.6794580817222595,
"learning_rate": 0.00017869578423575387,
"loss": 0.9041,
"step": 1280
},
{
"epoch": 1.4634146341463414,
"grad_norm": 0.9009565711021423,
"learning_rate": 0.00017820432094911427,
"loss": 0.8773,
"step": 1290
},
{
"epoch": 1.474758933635848,
"grad_norm": 0.6419825553894043,
"learning_rate": 0.00017770794563338647,
"loss": 0.9027,
"step": 1300
},
{
"epoch": 1.4861032331253545,
"grad_norm": 0.7277469635009766,
"learning_rate": 0.0001772066894660037,
"loss": 0.9123,
"step": 1310
},
{
"epoch": 1.497447532614861,
"grad_norm": 0.7514845132827759,
"learning_rate": 0.00017670058393096634,
"loss": 0.9095,
"step": 1320
},
{
"epoch": 1.5087918321043676,
"grad_norm": 0.5530194044113159,
"learning_rate": 0.0001761896608168646,
"loss": 0.855,
"step": 1330
},
{
"epoch": 1.520136131593874,
"grad_norm": 0.6379088759422302,
"learning_rate": 0.0001756739522148818,
"loss": 0.9485,
"step": 1340
},
{
"epoch": 1.5314804310833807,
"grad_norm": 0.5411556959152222,
"learning_rate": 0.0001751534905167787,
"loss": 0.951,
"step": 1350
},
{
"epoch": 1.542824730572887,
"grad_norm": 0.9241764545440674,
"learning_rate": 0.00017462830841285894,
"loss": 0.8459,
"step": 1360
},
{
"epoch": 1.5541690300623936,
"grad_norm": 0.9029989242553711,
"learning_rate": 0.00017409843888991584,
"loss": 0.9045,
"step": 1370
},
{
"epoch": 1.5655133295519001,
"grad_norm": 0.9002951979637146,
"learning_rate": 0.00017356391522916042,
"loss": 0.8388,
"step": 1380
},
{
"epoch": 1.5768576290414067,
"grad_norm": 0.6322818994522095,
"learning_rate": 0.0001730247710041311,
"loss": 0.8937,
"step": 1390
},
{
"epoch": 1.5882019285309132,
"grad_norm": 0.9197801351547241,
"learning_rate": 0.00017248104007858476,
"loss": 0.8656,
"step": 1400
},
{
"epoch": 1.5995462280204198,
"grad_norm": 0.7498595714569092,
"learning_rate": 0.00017193275660436997,
"loss": 0.8848,
"step": 1410
},
{
"epoch": 1.6108905275099263,
"grad_norm": 1.0003221035003662,
"learning_rate": 0.00017137995501928166,
"loss": 0.8494,
"step": 1420
},
{
"epoch": 1.6222348269994327,
"grad_norm": 0.6622512340545654,
"learning_rate": 0.00017082267004489842,
"loss": 0.9158,
"step": 1430
},
{
"epoch": 1.6335791264889394,
"grad_norm": 1.2562657594680786,
"learning_rate": 0.00017026093668440114,
"loss": 0.8899,
"step": 1440
},
{
"epoch": 1.6449234259784458,
"grad_norm": 0.5380372405052185,
"learning_rate": 0.00016969479022037502,
"loss": 0.9082,
"step": 1450
},
{
"epoch": 1.6562677254679523,
"grad_norm": 0.7120011448860168,
"learning_rate": 0.00016912426621259297,
"loss": 0.8456,
"step": 1460
},
{
"epoch": 1.6676120249574589,
"grad_norm": 0.580111026763916,
"learning_rate": 0.0001685494004957824,
"loss": 0.9272,
"step": 1470
},
{
"epoch": 1.6789563244469654,
"grad_norm": 0.9516561627388,
"learning_rate": 0.0001679702291773743,
"loss": 0.906,
"step": 1480
},
{
"epoch": 1.690300623936472,
"grad_norm": 0.5973901152610779,
"learning_rate": 0.0001673867886352354,
"loss": 0.931,
"step": 1490
},
{
"epoch": 1.7016449234259783,
"grad_norm": 0.7292883992195129,
"learning_rate": 0.00016679911551538317,
"loss": 0.8848,
"step": 1500
},
{
"epoch": 1.712989222915485,
"grad_norm": 0.6363751888275146,
"learning_rate": 0.0001662072467296842,
"loss": 0.9059,
"step": 1510
},
{
"epoch": 1.7243335224049914,
"grad_norm": 0.9236806631088257,
"learning_rate": 0.00016561121945353566,
"loss": 0.8557,
"step": 1520
},
{
"epoch": 1.7356778218944982,
"grad_norm": 0.6865366697311401,
"learning_rate": 0.00016501107112353028,
"loss": 0.9264,
"step": 1530
},
{
"epoch": 1.7470221213840045,
"grad_norm": 0.6749486923217773,
"learning_rate": 0.00016440683943510516,
"loss": 0.9224,
"step": 1540
},
{
"epoch": 1.758366420873511,
"grad_norm": 0.7539329528808594,
"learning_rate": 0.00016379856234017382,
"loss": 0.8594,
"step": 1550
},
{
"epoch": 1.7697107203630176,
"grad_norm": 0.6702885031700134,
"learning_rate": 0.0001631862780447426,
"loss": 0.8896,
"step": 1560
},
{
"epoch": 1.7810550198525241,
"grad_norm": 0.6152791976928711,
"learning_rate": 0.00016257002500651098,
"loss": 0.8738,
"step": 1570
},
{
"epoch": 1.7923993193420307,
"grad_norm": 0.5736550688743591,
"learning_rate": 0.00016194984193245587,
"loss": 0.9018,
"step": 1580
},
{
"epoch": 1.803743618831537,
"grad_norm": 0.751157820224762,
"learning_rate": 0.00016132576777640067,
"loss": 0.8605,
"step": 1590
},
{
"epoch": 1.8150879183210438,
"grad_norm": 0.6626732349395752,
"learning_rate": 0.0001606978417365682,
"loss": 0.8857,
"step": 1600
},
{
"epoch": 1.82643221781055,
"grad_norm": 0.584065318107605,
"learning_rate": 0.00016006610325311908,
"loss": 0.9104,
"step": 1610
},
{
"epoch": 1.8377765173000569,
"grad_norm": 0.5933496356010437,
"learning_rate": 0.0001594305920056742,
"loss": 0.8167,
"step": 1620
},
{
"epoch": 1.8491208167895632,
"grad_norm": 0.5618401765823364,
"learning_rate": 0.00015879134791082247,
"loss": 0.8907,
"step": 1630
},
{
"epoch": 1.8604651162790697,
"grad_norm": 0.9804329872131348,
"learning_rate": 0.00015814841111961374,
"loss": 0.9494,
"step": 1640
},
{
"epoch": 1.8718094157685763,
"grad_norm": 0.937347412109375,
"learning_rate": 0.00015750182201503682,
"loss": 0.9045,
"step": 1650
},
{
"epoch": 1.8831537152580828,
"grad_norm": 0.8898664712905884,
"learning_rate": 0.00015685162120948317,
"loss": 0.9346,
"step": 1660
},
{
"epoch": 1.8944980147475894,
"grad_norm": 0.8580901622772217,
"learning_rate": 0.00015619784954219577,
"loss": 0.9412,
"step": 1670
},
{
"epoch": 1.9058423142370957,
"grad_norm": 0.6913225054740906,
"learning_rate": 0.00015554054807670418,
"loss": 0.9006,
"step": 1680
},
{
"epoch": 1.9171866137266025,
"grad_norm": 0.7101637125015259,
"learning_rate": 0.00015487975809824539,
"loss": 0.8857,
"step": 1690
},
{
"epoch": 1.9285309132161088,
"grad_norm": 0.8228437900543213,
"learning_rate": 0.00015421552111117044,
"loss": 0.8607,
"step": 1700
},
{
"epoch": 1.9398752127056156,
"grad_norm": 0.5591906905174255,
"learning_rate": 0.00015354787883633782,
"loss": 0.8674,
"step": 1710
},
{
"epoch": 1.951219512195122,
"grad_norm": 0.6841379404067993,
"learning_rate": 0.00015287687320849271,
"loss": 0.8387,
"step": 1720
},
{
"epoch": 1.9625638116846285,
"grad_norm": 0.8344857096672058,
"learning_rate": 0.00015220254637363318,
"loss": 0.9227,
"step": 1730
},
{
"epoch": 1.973908111174135,
"grad_norm": 0.8986241221427917,
"learning_rate": 0.00015152494068636308,
"loss": 0.8917,
"step": 1740
},
{
"epoch": 1.9852524106636416,
"grad_norm": 0.5783970952033997,
"learning_rate": 0.00015084409870723154,
"loss": 0.872,
"step": 1750
},
{
"epoch": 1.996596710153148,
"grad_norm": 0.6369901895523071,
"learning_rate": 0.00015016006320005986,
"loss": 0.9132,
"step": 1760
},
{
"epoch": 2.0079410096426544,
"grad_norm": 0.5906355381011963,
"learning_rate": 0.00014947287712925545,
"loss": 0.8074,
"step": 1770
},
{
"epoch": 2.019285309132161,
"grad_norm": 0.6774492263793945,
"learning_rate": 0.00014878258365711334,
"loss": 0.759,
"step": 1780
},
{
"epoch": 2.0306296086216675,
"grad_norm": 0.8353272676467896,
"learning_rate": 0.00014808922614110493,
"loss": 0.8028,
"step": 1790
},
{
"epoch": 2.0419739081111743,
"grad_norm": 0.8876771926879883,
"learning_rate": 0.00014739284813115498,
"loss": 0.7302,
"step": 1800
},
{
"epoch": 2.0533182076006806,
"grad_norm": 0.6215524673461914,
"learning_rate": 0.00014669349336690594,
"loss": 0.7759,
"step": 1810
},
{
"epoch": 2.0646625070901874,
"grad_norm": 0.5663015246391296,
"learning_rate": 0.00014599120577497087,
"loss": 0.7834,
"step": 1820
},
{
"epoch": 2.0760068065796937,
"grad_norm": 0.6096060872077942,
"learning_rate": 0.00014528602946617432,
"loss": 0.8364,
"step": 1830
},
{
"epoch": 2.0873511060692,
"grad_norm": 0.7625316977500916,
"learning_rate": 0.00014457800873278172,
"loss": 0.7558,
"step": 1840
},
{
"epoch": 2.098695405558707,
"grad_norm": 0.6301640272140503,
"learning_rate": 0.0001438671880457174,
"loss": 0.8297,
"step": 1850
},
{
"epoch": 2.110039705048213,
"grad_norm": 0.6493074297904968,
"learning_rate": 0.00014315361205177127,
"loss": 0.7764,
"step": 1860
},
{
"epoch": 2.12138400453772,
"grad_norm": 0.8326807618141174,
"learning_rate": 0.0001424373255707947,
"loss": 0.7895,
"step": 1870
},
{
"epoch": 2.1327283040272262,
"grad_norm": 1.0578484535217285,
"learning_rate": 0.00014171837359288524,
"loss": 0.7889,
"step": 1880
},
{
"epoch": 2.144072603516733,
"grad_norm": 0.6812543272972107,
"learning_rate": 0.0001409968012755609,
"loss": 0.7643,
"step": 1890
},
{
"epoch": 2.1554169030062393,
"grad_norm": 0.8412303924560547,
"learning_rate": 0.00014027265394092364,
"loss": 0.7402,
"step": 1900
},
{
"epoch": 2.1667612024957457,
"grad_norm": 0.947846531867981,
"learning_rate": 0.00013954597707281288,
"loss": 0.7763,
"step": 1910
},
{
"epoch": 2.1781055019852524,
"grad_norm": 0.7577157616615295,
"learning_rate": 0.00013881681631394842,
"loss": 0.8334,
"step": 1920
},
{
"epoch": 2.1894498014747588,
"grad_norm": 0.6362768411636353,
"learning_rate": 0.0001380852174630639,
"loss": 0.7484,
"step": 1930
},
{
"epoch": 2.2007941009642655,
"grad_norm": 0.7967275381088257,
"learning_rate": 0.00013735122647202984,
"loss": 0.7302,
"step": 1940
},
{
"epoch": 2.212138400453772,
"grad_norm": 0.7726805210113525,
"learning_rate": 0.0001366148894429677,
"loss": 0.7836,
"step": 1950
},
{
"epoch": 2.2234826999432786,
"grad_norm": 0.7741623520851135,
"learning_rate": 0.00013587625262535396,
"loss": 0.7925,
"step": 1960
},
{
"epoch": 2.234826999432785,
"grad_norm": 0.7582458257675171,
"learning_rate": 0.0001351353624131153,
"loss": 0.7765,
"step": 1970
},
{
"epoch": 2.2461712989222917,
"grad_norm": 0.8276723027229309,
"learning_rate": 0.00013439226534171463,
"loss": 0.81,
"step": 1980
},
{
"epoch": 2.257515598411798,
"grad_norm": 0.8419069051742554,
"learning_rate": 0.00013364700808522807,
"loss": 0.7464,
"step": 1990
},
{
"epoch": 2.2688598979013044,
"grad_norm": 0.7446946501731873,
"learning_rate": 0.00013289963745341345,
"loss": 0.7524,
"step": 2000
},
{
"epoch": 2.2688598979013044,
"eval_loss": 0.9066722989082336,
"eval_runtime": 15.6396,
"eval_samples_per_second": 94.951,
"eval_steps_per_second": 11.893,
"step": 2000
},
{
"epoch": 2.280204197390811,
"grad_norm": 0.7091513872146606,
"learning_rate": 0.00013215020038877002,
"loss": 0.7806,
"step": 2010
},
{
"epoch": 2.2915484968803175,
"grad_norm": 0.5853792428970337,
"learning_rate": 0.0001313987439635902,
"loss": 0.7625,
"step": 2020
},
{
"epoch": 2.3028927963698242,
"grad_norm": 0.7464004158973694,
"learning_rate": 0.00013064531537700284,
"loss": 0.7313,
"step": 2030
},
{
"epoch": 2.3142370958593306,
"grad_norm": 0.6370956301689148,
"learning_rate": 0.00012988996195200858,
"loss": 0.7903,
"step": 2040
},
{
"epoch": 2.3255813953488373,
"grad_norm": 0.8973234295845032,
"learning_rate": 0.0001291327311325076,
"loss": 0.7537,
"step": 2050
},
{
"epoch": 2.3369256948383437,
"grad_norm": 1.206678032875061,
"learning_rate": 0.00012837367048031955,
"loss": 0.8081,
"step": 2060
},
{
"epoch": 2.3482699943278504,
"grad_norm": 0.9258993864059448,
"learning_rate": 0.0001276128276721963,
"loss": 0.7754,
"step": 2070
},
{
"epoch": 2.3596142938173568,
"grad_norm": 0.8008835315704346,
"learning_rate": 0.00012685025049682732,
"loss": 0.8119,
"step": 2080
},
{
"epoch": 2.370958593306863,
"grad_norm": 0.8094901442527771,
"learning_rate": 0.0001260859868518379,
"loss": 0.7889,
"step": 2090
},
{
"epoch": 2.38230289279637,
"grad_norm": 0.7824433445930481,
"learning_rate": 0.00012532008474078093,
"loss": 0.8443,
"step": 2100
},
{
"epoch": 2.393647192285876,
"grad_norm": 0.8314623236656189,
"learning_rate": 0.00012455259227012172,
"loss": 0.8009,
"step": 2110
},
{
"epoch": 2.404991491775383,
"grad_norm": 0.993483304977417,
"learning_rate": 0.0001237835576462163,
"loss": 0.803,
"step": 2120
},
{
"epoch": 2.4163357912648893,
"grad_norm": 0.7922090291976929,
"learning_rate": 0.00012301302917228364,
"loss": 0.7785,
"step": 2130
},
{
"epoch": 2.427680090754396,
"grad_norm": 0.8681336045265198,
"learning_rate": 0.00012224105524537176,
"loss": 0.7427,
"step": 2140
},
{
"epoch": 2.4390243902439024,
"grad_norm": 0.868011474609375,
"learning_rate": 0.00012146768435331797,
"loss": 0.7841,
"step": 2150
},
{
"epoch": 2.450368689733409,
"grad_norm": 0.8300703763961792,
"learning_rate": 0.00012069296507170307,
"loss": 0.7113,
"step": 2160
},
{
"epoch": 2.4617129892229155,
"grad_norm": 1.0211178064346313,
"learning_rate": 0.00011991694606080062,
"loss": 0.7927,
"step": 2170
},
{
"epoch": 2.473057288712422,
"grad_norm": 1.1126124858856201,
"learning_rate": 0.00011913967606252035,
"loss": 0.798,
"step": 2180
},
{
"epoch": 2.4844015882019286,
"grad_norm": 1.331468939781189,
"learning_rate": 0.00011836120389734677,
"loss": 0.7868,
"step": 2190
},
{
"epoch": 2.495745887691435,
"grad_norm": 0.7289639115333557,
"learning_rate": 0.00011758157846127278,
"loss": 0.7501,
"step": 2200
},
{
"epoch": 2.5070901871809417,
"grad_norm": 0.6862948536872864,
"learning_rate": 0.00011680084872272843,
"loss": 0.8113,
"step": 2210
},
{
"epoch": 2.518434486670448,
"grad_norm": 0.6838523745536804,
"learning_rate": 0.00011601906371950523,
"loss": 0.7794,
"step": 2220
},
{
"epoch": 2.5297787861599548,
"grad_norm": 0.8923412561416626,
"learning_rate": 0.00011523627255567606,
"loss": 0.7532,
"step": 2230
},
{
"epoch": 2.541123085649461,
"grad_norm": 0.7864569425582886,
"learning_rate": 0.00011445252439851092,
"loss": 0.8044,
"step": 2240
},
{
"epoch": 2.552467385138968,
"grad_norm": 0.9186776280403137,
"learning_rate": 0.0001136678684753889,
"loss": 0.7861,
"step": 2250
},
{
"epoch": 2.563811684628474,
"grad_norm": 0.9502933025360107,
"learning_rate": 0.00011288235407070588,
"loss": 0.7441,
"step": 2260
},
{
"epoch": 2.5751559841179805,
"grad_norm": 0.9764688014984131,
"learning_rate": 0.00011209603052277924,
"loss": 0.7519,
"step": 2270
},
{
"epoch": 2.5865002836074873,
"grad_norm": 0.8480959534645081,
"learning_rate": 0.00011130894722074874,
"loss": 0.7743,
"step": 2280
},
{
"epoch": 2.5978445830969936,
"grad_norm": 0.8660979866981506,
"learning_rate": 0.00011052115360147448,
"loss": 0.7989,
"step": 2290
},
{
"epoch": 2.6091888825865004,
"grad_norm": 0.6586043238639832,
"learning_rate": 0.0001097326991464318,
"loss": 0.7676,
"step": 2300
},
{
"epoch": 2.6205331820760067,
"grad_norm": 0.7315343618392944,
"learning_rate": 0.00010894363337860314,
"loss": 0.7699,
"step": 2310
},
{
"epoch": 2.6318774815655135,
"grad_norm": 0.7257770895957947,
"learning_rate": 0.0001081540058593677,
"loss": 0.7773,
"step": 2320
},
{
"epoch": 2.64322178105502,
"grad_norm": 0.6760928630828857,
"learning_rate": 0.00010736386618538838,
"loss": 0.7902,
"step": 2330
},
{
"epoch": 2.6545660805445266,
"grad_norm": 0.6824659705162048,
"learning_rate": 0.00010657326398549661,
"loss": 0.7759,
"step": 2340
},
{
"epoch": 2.665910380034033,
"grad_norm": 0.972321629524231,
"learning_rate": 0.0001057822489175752,
"loss": 0.7926,
"step": 2350
},
{
"epoch": 2.6772546795235392,
"grad_norm": 0.9526649713516235,
"learning_rate": 0.00010499087066543922,
"loss": 0.7648,
"step": 2360
},
{
"epoch": 2.688598979013046,
"grad_norm": 0.7266947031021118,
"learning_rate": 0.0001041991789357155,
"loss": 0.776,
"step": 2370
},
{
"epoch": 2.6999432785025523,
"grad_norm": 0.808121383190155,
"learning_rate": 0.00010340722345472037,
"loss": 0.7852,
"step": 2380
},
{
"epoch": 2.711287577992059,
"grad_norm": 1.1124972105026245,
"learning_rate": 0.00010261505396533648,
"loss": 0.717,
"step": 2390
},
{
"epoch": 2.7226318774815654,
"grad_norm": 0.7241740226745605,
"learning_rate": 0.00010182272022388841,
"loss": 0.8335,
"step": 2400
},
{
"epoch": 2.733976176971072,
"grad_norm": 1.0944820642471313,
"learning_rate": 0.0001010302719970174,
"loss": 0.7874,
"step": 2410
},
{
"epoch": 2.7453204764605785,
"grad_norm": 0.735615611076355,
"learning_rate": 0.00010023775905855559,
"loss": 0.7198,
"step": 2420
},
{
"epoch": 2.7566647759500853,
"grad_norm": 0.8080368041992188,
"learning_rate": 9.944523118639958e-05,
"loss": 0.8275,
"step": 2430
},
{
"epoch": 2.7680090754395916,
"grad_norm": 1.0709086656570435,
"learning_rate": 9.865273815938403e-05,
"loss": 0.841,
"step": 2440
},
{
"epoch": 2.779353374929098,
"grad_norm": 0.8561082482337952,
"learning_rate": 9.786032975415503e-05,
"loss": 0.7393,
"step": 2450
},
{
"epoch": 2.7906976744186047,
"grad_norm": 0.6831649541854858,
"learning_rate": 9.706805574204341e-05,
"loss": 0.7904,
"step": 2460
},
{
"epoch": 2.802041973908111,
"grad_norm": 0.9404779672622681,
"learning_rate": 9.627596588593884e-05,
"loss": 0.7651,
"step": 2470
},
{
"epoch": 2.813386273397618,
"grad_norm": 1.1059134006500244,
"learning_rate": 9.54841099371641e-05,
"loss": 0.7792,
"step": 2480
},
{
"epoch": 2.824730572887124,
"grad_norm": 0.8339388966560364,
"learning_rate": 9.469253763235015e-05,
"loss": 0.8037,
"step": 2490
},
{
"epoch": 2.8360748723766305,
"grad_norm": 0.691879153251648,
"learning_rate": 9.390129869031232e-05,
"loss": 0.7882,
"step": 2500
},
{
"epoch": 2.8474191718661372,
"grad_norm": 0.8173119425773621,
"learning_rate": 9.311044280892728e-05,
"loss": 0.7723,
"step": 2510
},
{
"epoch": 2.858763471355644,
"grad_norm": 1.2163662910461426,
"learning_rate": 9.232001966201159e-05,
"loss": 0.8332,
"step": 2520
},
{
"epoch": 2.8701077708451503,
"grad_norm": 0.7762579917907715,
"learning_rate": 9.153007889620169e-05,
"loss": 0.8017,
"step": 2530
},
{
"epoch": 2.8814520703346567,
"grad_norm": 0.7560020089149475,
"learning_rate": 9.074067012783551e-05,
"loss": 0.7645,
"step": 2540
},
{
"epoch": 2.8927963698241634,
"grad_norm": 0.7039526104927063,
"learning_rate": 8.995184293983627e-05,
"loss": 0.7496,
"step": 2550
},
{
"epoch": 2.9041406693136698,
"grad_norm": 0.8188515305519104,
"learning_rate": 8.916364687859782e-05,
"loss": 0.7941,
"step": 2560
},
{
"epoch": 2.9154849688031765,
"grad_norm": 0.8847174048423767,
"learning_rate": 8.837613145087289e-05,
"loss": 0.7462,
"step": 2570
},
{
"epoch": 2.926829268292683,
"grad_norm": 1.4302834272384644,
"learning_rate": 8.758934612066353e-05,
"loss": 0.7659,
"step": 2580
},
{
"epoch": 2.938173567782189,
"grad_norm": 0.8293200135231018,
"learning_rate": 8.680334030611414e-05,
"loss": 0.7464,
"step": 2590
},
{
"epoch": 2.949517867271696,
"grad_norm": 0.9347418546676636,
"learning_rate": 8.601816337640767e-05,
"loss": 0.7907,
"step": 2600
},
{
"epoch": 2.9608621667612027,
"grad_norm": 0.8685625195503235,
"learning_rate": 8.523386464866452e-05,
"loss": 0.7881,
"step": 2610
},
{
"epoch": 2.972206466250709,
"grad_norm": 1.0375618934631348,
"learning_rate": 8.44504933848452e-05,
"loss": 0.7415,
"step": 2620
},
{
"epoch": 2.9835507657402154,
"grad_norm": 1.1286613941192627,
"learning_rate": 8.366809878865594e-05,
"loss": 0.759,
"step": 2630
},
{
"epoch": 2.994895065229722,
"grad_norm": 0.9496249556541443,
"learning_rate": 8.28867300024582e-05,
"loss": 0.8122,
"step": 2640
},
{
"epoch": 3.0062393647192285,
"grad_norm": 0.6161667108535767,
"learning_rate": 8.210643610418232e-05,
"loss": 0.7363,
"step": 2650
},
{
"epoch": 3.0175836642087353,
"grad_norm": 1.1362223625183105,
"learning_rate": 8.132726610424453e-05,
"loss": 0.6957,
"step": 2660
},
{
"epoch": 3.0289279636982416,
"grad_norm": 0.9549693465232849,
"learning_rate": 8.054926894246887e-05,
"loss": 0.6598,
"step": 2670
},
{
"epoch": 3.0402722631877483,
"grad_norm": 0.7844473719596863,
"learning_rate": 7.977249348501314e-05,
"loss": 0.7104,
"step": 2680
},
{
"epoch": 3.0516165626772547,
"grad_norm": 0.9754497408866882,
"learning_rate": 7.899698852129962e-05,
"loss": 0.7109,
"step": 2690
},
{
"epoch": 3.062960862166761,
"grad_norm": 0.8465747237205505,
"learning_rate": 7.822280276095073e-05,
"loss": 0.6208,
"step": 2700
},
{
"epoch": 3.0743051616562678,
"grad_norm": 0.7896714806556702,
"learning_rate": 7.744998483072936e-05,
"loss": 0.6417,
"step": 2710
},
{
"epoch": 3.085649461145774,
"grad_norm": 0.8668105006217957,
"learning_rate": 7.667858327148475e-05,
"loss": 0.6525,
"step": 2720
},
{
"epoch": 3.096993760635281,
"grad_norm": 1.0019567012786865,
"learning_rate": 7.590864653510359e-05,
"loss": 0.6604,
"step": 2730
},
{
"epoch": 3.108338060124787,
"grad_norm": 0.7561362981796265,
"learning_rate": 7.514022298146679e-05,
"loss": 0.6912,
"step": 2740
},
{
"epoch": 3.119682359614294,
"grad_norm": 0.9435575604438782,
"learning_rate": 7.437336087541187e-05,
"loss": 0.6993,
"step": 2750
},
{
"epoch": 3.1310266591038003,
"grad_norm": 1.041034460067749,
"learning_rate": 7.360810838370161e-05,
"loss": 0.6562,
"step": 2760
},
{
"epoch": 3.142370958593307,
"grad_norm": 0.8745769262313843,
"learning_rate": 7.284451357199851e-05,
"loss": 0.6035,
"step": 2770
},
{
"epoch": 3.1537152580828134,
"grad_norm": 0.9436658620834351,
"learning_rate": 7.208262440184584e-05,
"loss": 0.6591,
"step": 2780
},
{
"epoch": 3.1650595575723197,
"grad_norm": 0.9558268785476685,
"learning_rate": 7.13224887276553e-05,
"loss": 0.7548,
"step": 2790
},
{
"epoch": 3.1764038570618265,
"grad_norm": 1.3072495460510254,
"learning_rate": 7.056415429370106e-05,
"loss": 0.648,
"step": 2800
},
{
"epoch": 3.187748156551333,
"grad_norm": 1.0742169618606567,
"learning_rate": 6.980766873112106e-05,
"loss": 0.6646,
"step": 2810
},
{
"epoch": 3.1990924560408396,
"grad_norm": 0.8391577005386353,
"learning_rate": 6.905307955492523e-05,
"loss": 0.6844,
"step": 2820
},
{
"epoch": 3.210436755530346,
"grad_norm": 0.9172285795211792,
"learning_rate": 6.83004341610111e-05,
"loss": 0.6671,
"step": 2830
},
{
"epoch": 3.2217810550198527,
"grad_norm": 1.0791727304458618,
"learning_rate": 6.754977982318693e-05,
"loss": 0.6619,
"step": 2840
},
{
"epoch": 3.233125354509359,
"grad_norm": 0.8881738781929016,
"learning_rate": 6.68011636902022e-05,
"loss": 0.678,
"step": 2850
},
{
"epoch": 3.2444696539988658,
"grad_norm": 0.8353477120399475,
"learning_rate": 6.605463278278646e-05,
"loss": 0.7061,
"step": 2860
},
{
"epoch": 3.255813953488372,
"grad_norm": 0.9251864552497864,
"learning_rate": 6.531023399069574e-05,
"loss": 0.6658,
"step": 2870
},
{
"epoch": 3.2671582529778784,
"grad_norm": 0.7780378460884094,
"learning_rate": 6.45680140697675e-05,
"loss": 0.6327,
"step": 2880
},
{
"epoch": 3.278502552467385,
"grad_norm": 1.3496202230453491,
"learning_rate": 6.38280196389839e-05,
"loss": 0.6658,
"step": 2890
},
{
"epoch": 3.2898468519568915,
"grad_norm": 1.0429950952529907,
"learning_rate": 6.309029717754362e-05,
"loss": 0.7013,
"step": 2900
},
{
"epoch": 3.3011911514463983,
"grad_norm": 0.7141017317771912,
"learning_rate": 6.235489302194247e-05,
"loss": 0.6969,
"step": 2910
},
{
"epoch": 3.3125354509359046,
"grad_norm": 1.2669309377670288,
"learning_rate": 6.162185336306294e-05,
"loss": 0.6468,
"step": 2920
},
{
"epoch": 3.3238797504254114,
"grad_norm": 0.8476207852363586,
"learning_rate": 6.089122424327307e-05,
"loss": 0.6501,
"step": 2930
},
{
"epoch": 3.3352240499149177,
"grad_norm": 0.9521162509918213,
"learning_rate": 6.01630515535345e-05,
"loss": 0.6546,
"step": 2940
},
{
"epoch": 3.346568349404424,
"grad_norm": 0.7817677855491638,
"learning_rate": 5.943738103051997e-05,
"loss": 0.6919,
"step": 2950
},
{
"epoch": 3.357912648893931,
"grad_norm": 0.776945948600769,
"learning_rate": 5.8714258253740564e-05,
"loss": 0.6897,
"step": 2960
},
{
"epoch": 3.369256948383437,
"grad_norm": 0.9761963486671448,
"learning_rate": 5.7993728642683e-05,
"loss": 0.6299,
"step": 2970
},
{
"epoch": 3.380601247872944,
"grad_norm": 0.7887254953384399,
"learning_rate": 5.7275837453956614e-05,
"loss": 0.6773,
"step": 2980
},
{
"epoch": 3.3919455473624502,
"grad_norm": 0.860835611820221,
"learning_rate": 5.656062977845116e-05,
"loss": 0.6239,
"step": 2990
},
{
"epoch": 3.403289846851957,
"grad_norm": 0.9700385928153992,
"learning_rate": 5.584815053850407e-05,
"loss": 0.7148,
"step": 3000
},
{
"epoch": 3.403289846851957,
"eval_loss": 0.9692808389663696,
"eval_runtime": 15.7325,
"eval_samples_per_second": 94.39,
"eval_steps_per_second": 11.823,
"step": 3000
},
{
"epoch": 3.4146341463414633,
"grad_norm": 1.335462212562561,
"learning_rate": 5.51384444850794e-05,
"loss": 0.6387,
"step": 3010
},
{
"epoch": 3.42597844583097,
"grad_norm": 0.8788994550704956,
"learning_rate": 5.443155619495679e-05,
"loss": 0.6809,
"step": 3020
},
{
"epoch": 3.4373227453204764,
"grad_norm": 0.9188012480735779,
"learning_rate": 5.372753006793143e-05,
"loss": 0.6724,
"step": 3030
},
{
"epoch": 3.4486670448099828,
"grad_norm": 0.9619457125663757,
"learning_rate": 5.302641032402578e-05,
"loss": 0.6789,
"step": 3040
},
{
"epoch": 3.4600113442994895,
"grad_norm": 0.9403857588768005,
"learning_rate": 5.2328241000711464e-05,
"loss": 0.6274,
"step": 3050
},
{
"epoch": 3.471355643788996,
"grad_norm": 0.9259539246559143,
"learning_rate": 5.16330659501438e-05,
"loss": 0.6551,
"step": 3060
},
{
"epoch": 3.4826999432785026,
"grad_norm": 1.07770574092865,
"learning_rate": 5.094092883640718e-05,
"loss": 0.6593,
"step": 3070
},
{
"epoch": 3.494044242768009,
"grad_norm": 0.7347473502159119,
"learning_rate": 5.0251873132772576e-05,
"loss": 0.6847,
"step": 3080
},
{
"epoch": 3.5053885422575157,
"grad_norm": 0.9838495254516602,
"learning_rate": 4.956594211896701e-05,
"loss": 0.6667,
"step": 3090
},
{
"epoch": 3.516732841747022,
"grad_norm": 1.1671929359436035,
"learning_rate": 4.8883178878454996e-05,
"loss": 0.683,
"step": 3100
},
{
"epoch": 3.528077141236529,
"grad_norm": 0.6510323882102966,
"learning_rate": 4.8203626295732675e-05,
"loss": 0.6946,
"step": 3110
},
{
"epoch": 3.539421440726035,
"grad_norm": 0.7871556282043457,
"learning_rate": 4.7527327053634094e-05,
"loss": 0.6652,
"step": 3120
},
{
"epoch": 3.5507657402155415,
"grad_norm": 0.8053673505783081,
"learning_rate": 4.685432363065036e-05,
"loss": 0.6431,
"step": 3130
},
{
"epoch": 3.5621100397050482,
"grad_norm": 0.8162011504173279,
"learning_rate": 4.618465829826145e-05,
"loss": 0.6089,
"step": 3140
},
{
"epoch": 3.5734543391945546,
"grad_norm": 1.0298821926116943,
"learning_rate": 4.551837311828131e-05,
"loss": 0.6645,
"step": 3150
},
{
"epoch": 3.5847986386840613,
"grad_norm": 1.0996955633163452,
"learning_rate": 4.485550994021567e-05,
"loss": 0.6872,
"step": 3160
},
{
"epoch": 3.5961429381735677,
"grad_norm": 0.9979953765869141,
"learning_rate": 4.419611039863377e-05,
"loss": 0.628,
"step": 3170
},
{
"epoch": 3.6074872376630744,
"grad_norm": 1.0593342781066895,
"learning_rate": 4.354021591055311e-05,
"loss": 0.6864,
"step": 3180
},
{
"epoch": 3.6188315371525808,
"grad_norm": 1.6677913665771484,
"learning_rate": 4.2887867672838056e-05,
"loss": 0.6232,
"step": 3190
},
{
"epoch": 3.6301758366420875,
"grad_norm": 0.8164204359054565,
"learning_rate": 4.223910665961235e-05,
"loss": 0.6786,
"step": 3200
},
{
"epoch": 3.641520136131594,
"grad_norm": 0.8163765072822571,
"learning_rate": 4.15939736196853e-05,
"loss": 0.6763,
"step": 3210
},
{
"epoch": 3.6528644356211,
"grad_norm": 0.9765521883964539,
"learning_rate": 4.095250907399262e-05,
"loss": 0.6719,
"step": 3220
},
{
"epoch": 3.664208735110607,
"grad_norm": 0.9238688349723816,
"learning_rate": 4.03147533130511e-05,
"loss": 0.68,
"step": 3230
},
{
"epoch": 3.6755530346001133,
"grad_norm": 0.9760640859603882,
"learning_rate": 3.968074639442805e-05,
"loss": 0.6542,
"step": 3240
},
{
"epoch": 3.68689733408962,
"grad_norm": 0.9406284689903259,
"learning_rate": 3.905052814022523e-05,
"loss": 0.653,
"step": 3250
},
{
"epoch": 3.6982416335791264,
"grad_norm": 0.9423522353172302,
"learning_rate": 3.842413813457758e-05,
"loss": 0.706,
"step": 3260
},
{
"epoch": 3.709585933068633,
"grad_norm": 0.8088165521621704,
"learning_rate": 3.780161572116704e-05,
"loss": 0.7161,
"step": 3270
},
{
"epoch": 3.7209302325581395,
"grad_norm": 0.9071544408798218,
"learning_rate": 3.718300000075129e-05,
"loss": 0.7193,
"step": 3280
},
{
"epoch": 3.7322745320476463,
"grad_norm": 0.8792480230331421,
"learning_rate": 3.6568329828707836e-05,
"loss": 0.6381,
"step": 3290
},
{
"epoch": 3.7436188315371526,
"grad_norm": 1.0307759046554565,
"learning_rate": 3.5957643812593543e-05,
"loss": 0.6668,
"step": 3300
},
{
"epoch": 3.754963131026659,
"grad_norm": 1.0883175134658813,
"learning_rate": 3.5350980309719514e-05,
"loss": 0.6978,
"step": 3310
},
{
"epoch": 3.7663074305161657,
"grad_norm": 1.0448516607284546,
"learning_rate": 3.4748377424742115e-05,
"loss": 0.6756,
"step": 3320
},
{
"epoch": 3.777651730005672,
"grad_norm": 0.8772532939910889,
"learning_rate": 3.414987300726945e-05,
"loss": 0.6714,
"step": 3330
},
{
"epoch": 3.7889960294951788,
"grad_norm": 1.0115753412246704,
"learning_rate": 3.3555504649484046e-05,
"loss": 0.6773,
"step": 3340
},
{
"epoch": 3.800340328984685,
"grad_norm": 1.1093175411224365,
"learning_rate": 3.296530968378173e-05,
"loss": 0.6916,
"step": 3350
},
{
"epoch": 3.811684628474192,
"grad_norm": 0.8998281359672546,
"learning_rate": 3.237932518042664e-05,
"loss": 0.6801,
"step": 3360
},
{
"epoch": 3.823028927963698,
"grad_norm": 1.0179048776626587,
"learning_rate": 3.1797587945223026e-05,
"loss": 0.6702,
"step": 3370
},
{
"epoch": 3.834373227453205,
"grad_norm": 0.9240026473999023,
"learning_rate": 3.1220134517203335e-05,
"loss": 0.671,
"step": 3380
},
{
"epoch": 3.8457175269427113,
"grad_norm": 0.7641962766647339,
"learning_rate": 3.0647001166333245e-05,
"loss": 0.7147,
"step": 3390
},
{
"epoch": 3.8570618264322176,
"grad_norm": 0.9078419804573059,
"learning_rate": 3.0078223891233514e-05,
"loss": 0.7155,
"step": 3400
},
{
"epoch": 3.8684061259217244,
"grad_norm": 0.962393045425415,
"learning_rate": 2.9513838416918815e-05,
"loss": 0.6866,
"step": 3410
},
{
"epoch": 3.8797504254112307,
"grad_norm": 1.5198420286178589,
"learning_rate": 2.8953880192554105e-05,
"loss": 0.6741,
"step": 3420
},
{
"epoch": 3.8910947249007375,
"grad_norm": 1.1129947900772095,
"learning_rate": 2.8398384389227816e-05,
"loss": 0.6542,
"step": 3430
},
{
"epoch": 3.902439024390244,
"grad_norm": 0.8633179664611816,
"learning_rate": 2.7847385897742705e-05,
"loss": 0.6768,
"step": 3440
},
{
"epoch": 3.9137833238797506,
"grad_norm": 1.062277913093567,
"learning_rate": 2.7300919326424658e-05,
"loss": 0.6709,
"step": 3450
},
{
"epoch": 3.925127623369257,
"grad_norm": 0.7949813604354858,
"learning_rate": 2.675901899894854e-05,
"loss": 0.6166,
"step": 3460
},
{
"epoch": 3.9364719228587637,
"grad_norm": 0.9200356006622314,
"learning_rate": 2.622171895218273e-05,
"loss": 0.6718,
"step": 3470
},
{
"epoch": 3.94781622234827,
"grad_norm": 0.9637920260429382,
"learning_rate": 2.568905293405095e-05,
"loss": 0.619,
"step": 3480
},
{
"epoch": 3.9591605218377763,
"grad_norm": 1.157073974609375,
"learning_rate": 2.516105440141262e-05,
"loss": 0.6961,
"step": 3490
},
{
"epoch": 3.970504821327283,
"grad_norm": 0.8323079347610474,
"learning_rate": 2.4637756517961517e-05,
"loss": 0.677,
"step": 3500
},
{
"epoch": 3.9818491208167894,
"grad_norm": 0.9369989037513733,
"learning_rate": 2.41191921521427e-05,
"loss": 0.6619,
"step": 3510
},
{
"epoch": 3.993193420306296,
"grad_norm": 0.8290889263153076,
"learning_rate": 2.360539387508801e-05,
"loss": 0.6534,
"step": 3520
},
{
"epoch": 4.0045377197958025,
"grad_norm": 0.8619610071182251,
"learning_rate": 2.309639395857033e-05,
"loss": 0.6531,
"step": 3530
},
{
"epoch": 4.015882019285309,
"grad_norm": 0.7406215071678162,
"learning_rate": 2.259222437297649e-05,
"loss": 0.5811,
"step": 3540
},
{
"epoch": 4.027226318774816,
"grad_norm": 1.3408113718032837,
"learning_rate": 2.2092916785299323e-05,
"loss": 0.6163,
"step": 3550
},
{
"epoch": 4.038570618264322,
"grad_norm": 0.9652060866355896,
"learning_rate": 2.159850255714859e-05,
"loss": 0.6345,
"step": 3560
},
{
"epoch": 4.049914917753829,
"grad_norm": 1.2307026386260986,
"learning_rate": 2.1109012742781142e-05,
"loss": 0.5568,
"step": 3570
},
{
"epoch": 4.061259217243335,
"grad_norm": 1.101637363433838,
"learning_rate": 2.0624478087150456e-05,
"loss": 0.608,
"step": 3580
},
{
"epoch": 4.072603516732841,
"grad_norm": 2.5598561763763428,
"learning_rate": 2.0144929023975413e-05,
"loss": 0.5294,
"step": 3590
},
{
"epoch": 4.083947816222349,
"grad_norm": 0.9463273286819458,
"learning_rate": 1.967039567382888e-05,
"loss": 0.5482,
"step": 3600
},
{
"epoch": 4.095292115711855,
"grad_norm": 0.9838125109672546,
"learning_rate": 1.920090784224581e-05,
"loss": 0.6254,
"step": 3610
},
{
"epoch": 4.106636415201361,
"grad_norm": 0.85828697681427,
"learning_rate": 1.8736495017851062e-05,
"loss": 0.5443,
"step": 3620
},
{
"epoch": 4.117980714690868,
"grad_norm": 0.8922297954559326,
"learning_rate": 1.827718637050736e-05,
"loss": 0.6068,
"step": 3630
},
{
"epoch": 4.129325014180375,
"grad_norm": 0.7973962426185608,
"learning_rate": 1.7823010749482927e-05,
"loss": 0.6179,
"step": 3640
},
{
"epoch": 4.140669313669881,
"grad_norm": 0.8686882257461548,
"learning_rate": 1.737399668163966e-05,
"loss": 0.6186,
"step": 3650
},
{
"epoch": 4.152013613159387,
"grad_norm": 1.4338245391845703,
"learning_rate": 1.693017236964125e-05,
"loss": 0.5784,
"step": 3660
},
{
"epoch": 4.163357912648894,
"grad_norm": 0.9958694577217102,
"learning_rate": 1.6491565690181765e-05,
"loss": 0.6388,
"step": 3670
},
{
"epoch": 4.1747022121384,
"grad_norm": 0.9962863922119141,
"learning_rate": 1.605820419223476e-05,
"loss": 0.6541,
"step": 3680
},
{
"epoch": 4.186046511627907,
"grad_norm": 1.1754194498062134,
"learning_rate": 1.5630115095322827e-05,
"loss": 0.6037,
"step": 3690
},
{
"epoch": 4.197390811117414,
"grad_norm": 1.1034218072891235,
"learning_rate": 1.5207325287808027e-05,
"loss": 0.5844,
"step": 3700
},
{
"epoch": 4.20873511060692,
"grad_norm": 1.0171332359313965,
"learning_rate": 1.4789861325203013e-05,
"loss": 0.6724,
"step": 3710
},
{
"epoch": 4.220079410096426,
"grad_norm": 0.9791539907455444,
"learning_rate": 1.4377749428503006e-05,
"loss": 0.5989,
"step": 3720
},
{
"epoch": 4.231423709585933,
"grad_norm": 0.9501050710678101,
"learning_rate": 1.3971015482538963e-05,
"loss": 0.5911,
"step": 3730
},
{
"epoch": 4.24276800907544,
"grad_norm": 1.2614890336990356,
"learning_rate": 1.3569685034351554e-05,
"loss": 0.5849,
"step": 3740
},
{
"epoch": 4.254112308564946,
"grad_norm": 1.0194411277770996,
"learning_rate": 1.3173783291586772e-05,
"loss": 0.5976,
"step": 3750
},
{
"epoch": 4.2654566080544525,
"grad_norm": 1.0711522102355957,
"learning_rate": 1.2783335120912565e-05,
"loss": 0.5931,
"step": 3760
},
{
"epoch": 4.276800907543959,
"grad_norm": 0.8650385141372681,
"learning_rate": 1.2398365046456783e-05,
"loss": 0.6078,
"step": 3770
},
{
"epoch": 4.288145207033466,
"grad_norm": 0.823208749294281,
"learning_rate": 1.2018897248267103e-05,
"loss": 0.5961,
"step": 3780
},
{
"epoch": 4.299489506522972,
"grad_norm": 0.9447870850563049,
"learning_rate": 1.1644955560791993e-05,
"loss": 0.6468,
"step": 3790
},
{
"epoch": 4.310833806012479,
"grad_norm": 1.102318525314331,
"learning_rate": 1.1276563471383883e-05,
"loss": 0.588,
"step": 3800
},
{
"epoch": 4.322178105501985,
"grad_norm": 0.9916651248931885,
"learning_rate": 1.0913744118823866e-05,
"loss": 0.6188,
"step": 3810
},
{
"epoch": 4.333522404991491,
"grad_norm": 1.1987171173095703,
"learning_rate": 1.05565202918682e-05,
"loss": 0.5841,
"step": 3820
},
{
"epoch": 4.3448667044809985,
"grad_norm": 0.9708378911018372,
"learning_rate": 1.0204914427817158e-05,
"loss": 0.6023,
"step": 3830
},
{
"epoch": 4.356211003970505,
"grad_norm": 1.0048896074295044,
"learning_rate": 9.8589486111056e-06,
"loss": 0.5705,
"step": 3840
},
{
"epoch": 4.367555303460011,
"grad_norm": 0.8364105820655823,
"learning_rate": 9.518644571915847e-06,
"loss": 0.5872,
"step": 3850
},
{
"epoch": 4.3788996029495175,
"grad_norm": 1.5254448652267456,
"learning_rate": 9.184023684812926e-06,
"loss": 0.6063,
"step": 3860
},
{
"epoch": 4.390243902439025,
"grad_norm": 0.993635356426239,
"learning_rate": 8.855106967401839e-06,
"loss": 0.5311,
"step": 3870
},
{
"epoch": 4.401588201928531,
"grad_norm": 0.8678284883499146,
"learning_rate": 8.531915079007625e-06,
"loss": 0.5894,
"step": 3880
},
{
"epoch": 4.412932501418037,
"grad_norm": 1.081127643585205,
"learning_rate": 8.214468319377633e-06,
"loss": 0.5906,
"step": 3890
},
{
"epoch": 4.424276800907544,
"grad_norm": 0.9130728840827942,
"learning_rate": 7.902786627406477e-06,
"loss": 0.5764,
"step": 3900
},
{
"epoch": 4.43562110039705,
"grad_norm": 0.9263814091682434,
"learning_rate": 7.596889579883826e-06,
"loss": 0.5812,
"step": 3910
},
{
"epoch": 4.446965399886557,
"grad_norm": 1.095747947692871,
"learning_rate": 7.296796390264549e-06,
"loss": 0.5721,
"step": 3920
},
{
"epoch": 4.458309699376064,
"grad_norm": 0.8003553152084351,
"learning_rate": 7.002525907462121e-06,
"loss": 0.5882,
"step": 3930
},
{
"epoch": 4.46965399886557,
"grad_norm": 0.8841357231140137,
"learning_rate": 6.7140966146646e-06,
"loss": 0.5543,
"step": 3940
},
{
"epoch": 4.480998298355076,
"grad_norm": 0.8580918312072754,
"learning_rate": 6.431526628173701e-06,
"loss": 0.6549,
"step": 3950
},
{
"epoch": 4.4923425978445835,
"grad_norm": 0.9447335004806519,
"learning_rate": 6.154833696267015e-06,
"loss": 0.6516,
"step": 3960
},
{
"epoch": 4.50368689733409,
"grad_norm": 1.0485211610794067,
"learning_rate": 5.884035198083071e-06,
"loss": 0.579,
"step": 3970
},
{
"epoch": 4.515031196823596,
"grad_norm": 0.9394044876098633,
"learning_rate": 5.619148142529873e-06,
"loss": 0.6396,
"step": 3980
},
{
"epoch": 4.526375496313102,
"grad_norm": 0.93062824010849,
"learning_rate": 5.360189167216545e-06,
"loss": 0.6005,
"step": 3990
},
{
"epoch": 4.537719795802609,
"grad_norm": 0.9513915777206421,
"learning_rate": 5.107174537408233e-06,
"loss": 0.5743,
"step": 4000
},
{
"epoch": 4.537719795802609,
"eval_loss": 1.0443100929260254,
"eval_runtime": 15.6805,
"eval_samples_per_second": 94.704,
"eval_steps_per_second": 11.862,
"step": 4000
},
{
"epoch": 4.549064095292116,
"grad_norm": 0.9627020359039307,
"learning_rate": 4.8601201450046316e-06,
"loss": 0.6077,
"step": 4010
},
{
"epoch": 4.560408394781622,
"grad_norm": 0.8539467453956604,
"learning_rate": 4.619041507541688e-06,
"loss": 0.5812,
"step": 4020
},
{
"epoch": 4.571752694271129,
"grad_norm": 0.9446848630905151,
"learning_rate": 4.383953767216964e-06,
"loss": 0.624,
"step": 4030
},
{
"epoch": 4.583096993760635,
"grad_norm": 1.188366174697876,
"learning_rate": 4.154871689938633e-06,
"loss": 0.6437,
"step": 4040
},
{
"epoch": 4.594441293250142,
"grad_norm": 1.0908474922180176,
"learning_rate": 3.931809664397867e-06,
"loss": 0.6323,
"step": 4050
},
{
"epoch": 4.6057855927396485,
"grad_norm": 0.9742168188095093,
"learning_rate": 3.714781701165304e-06,
"loss": 0.6132,
"step": 4060
},
{
"epoch": 4.617129892229155,
"grad_norm": 0.8761405348777771,
"learning_rate": 3.503801431810816e-06,
"loss": 0.624,
"step": 4070
},
{
"epoch": 4.628474191718661,
"grad_norm": 0.996088445186615,
"learning_rate": 3.298882108047463e-06,
"loss": 0.6009,
"step": 4080
},
{
"epoch": 4.6398184912081675,
"grad_norm": 0.9667827486991882,
"learning_rate": 3.10003660089907e-06,
"loss": 0.5988,
"step": 4090
},
{
"epoch": 4.651162790697675,
"grad_norm": 0.9298661351203918,
"learning_rate": 2.9072773998918503e-06,
"loss": 0.6453,
"step": 4100
},
{
"epoch": 4.662507090187181,
"grad_norm": 0.9182038307189941,
"learning_rate": 2.7206166122698774e-06,
"loss": 0.5915,
"step": 4110
},
{
"epoch": 4.673851389676687,
"grad_norm": 0.835645318031311,
"learning_rate": 2.540065962234683e-06,
"loss": 0.6515,
"step": 4120
},
{
"epoch": 4.685195689166194,
"grad_norm": 0.8575255274772644,
"learning_rate": 2.3656367902088026e-06,
"loss": 0.6169,
"step": 4130
},
{
"epoch": 4.696539988655701,
"grad_norm": 0.9075832962989807,
"learning_rate": 2.19734005212352e-06,
"loss": 0.6166,
"step": 4140
},
{
"epoch": 4.707884288145207,
"grad_norm": 2.0740888118743896,
"learning_rate": 2.035186318730742e-06,
"loss": 0.5779,
"step": 4150
},
{
"epoch": 4.7192285876347135,
"grad_norm": 1.0293558835983276,
"learning_rate": 1.8791857749389741e-06,
"loss": 0.6414,
"step": 4160
},
{
"epoch": 4.73057288712422,
"grad_norm": 0.9525774121284485,
"learning_rate": 1.7293482191736877e-06,
"loss": 0.5802,
"step": 4170
},
{
"epoch": 4.741917186613726,
"grad_norm": 0.9085150957107544,
"learning_rate": 1.5856830627618001e-06,
"loss": 0.6331,
"step": 4180
},
{
"epoch": 4.753261486103233,
"grad_norm": 0.9908912777900696,
"learning_rate": 1.4481993293406048e-06,
"loss": 0.5844,
"step": 4190
},
{
"epoch": 4.76460578559274,
"grad_norm": 0.7421241998672485,
"learning_rate": 1.316905654291012e-06,
"loss": 0.6653,
"step": 4200
},
{
"epoch": 4.775950085082246,
"grad_norm": 0.857502281665802,
"learning_rate": 1.1918102841950607e-06,
"loss": 0.5693,
"step": 4210
},
{
"epoch": 4.787294384571752,
"grad_norm": 0.9300210475921631,
"learning_rate": 1.0729210763180564e-06,
"loss": 0.5755,
"step": 4220
},
{
"epoch": 4.79863868406126,
"grad_norm": 1.2351378202438354,
"learning_rate": 9.602454981149977e-07,
"loss": 0.618,
"step": 4230
},
{
"epoch": 4.809982983550766,
"grad_norm": 1.24778151512146,
"learning_rate": 8.537906267615415e-07,
"loss": 0.5896,
"step": 4240
},
{
"epoch": 4.821327283040272,
"grad_norm": 1.3560271263122559,
"learning_rate": 7.535631487095352e-07,
"loss": 0.5879,
"step": 4250
},
{
"epoch": 4.832671582529779,
"grad_norm": 1.8108911514282227,
"learning_rate": 6.59569359266976e-07,
"loss": 0.5943,
"step": 4260
},
{
"epoch": 4.844015882019285,
"grad_norm": 0.9743121862411499,
"learning_rate": 5.718151622026379e-07,
"loss": 0.6104,
"step": 4270
},
{
"epoch": 4.855360181508792,
"grad_norm": 1.2035831212997437,
"learning_rate": 4.903060693752348e-07,
"loss": 0.608,
"step": 4280
},
{
"epoch": 4.866704480998298,
"grad_norm": 0.9681785106658936,
"learning_rate": 4.1504720038724187e-07,
"loss": 0.5773,
"step": 4290
},
{
"epoch": 4.878048780487805,
"grad_norm": 1.0151753425598145,
"learning_rate": 3.4604328226333083e-07,
"loss": 0.5609,
"step": 4300
},
{
"epoch": 4.889393079977311,
"grad_norm": 1.0577515363693237,
"learning_rate": 2.832986491534295e-07,
"loss": 0.6435,
"step": 4310
},
{
"epoch": 4.900737379466818,
"grad_norm": 0.8938112854957581,
"learning_rate": 2.2681724206052857e-07,
"loss": 0.6398,
"step": 4320
},
{
"epoch": 4.912081678956325,
"grad_norm": 0.997191846370697,
"learning_rate": 1.7660260859315713e-07,
"loss": 0.628,
"step": 4330
},
{
"epoch": 4.923425978445831,
"grad_norm": 0.8382704257965088,
"learning_rate": 1.3265790274249456e-07,
"loss": 0.6105,
"step": 4340
},
{
"epoch": 4.934770277935337,
"grad_norm": 0.8330470323562622,
"learning_rate": 9.498588468433989e-08,
"loss": 0.5982,
"step": 4350
},
{
"epoch": 4.946114577424844,
"grad_norm": 1.2183622121810913,
"learning_rate": 6.35889206057172e-08,
"loss": 0.5876,
"step": 4360
},
{
"epoch": 4.957458876914351,
"grad_norm": 1.131373405456543,
"learning_rate": 3.846898255622788e-08,
"loss": 0.6113,
"step": 4370
},
{
"epoch": 4.968803176403857,
"grad_norm": 1.1781286001205444,
"learning_rate": 1.9627648324227476e-08,
"loss": 0.5522,
"step": 4380
},
{
"epoch": 4.9801474758933635,
"grad_norm": 1.2726503610610962,
"learning_rate": 7.066101337682707e-09,
"loss": 0.6312,
"step": 4390
},
{
"epoch": 4.99149177538287,
"grad_norm": 1.1971274614334106,
"learning_rate": 7.85130589897598e-10,
"loss": 0.6052,
"step": 4400
},
{
"epoch": 4.997163925127623,
"step": 4405,
"total_flos": 9.40234358432727e+17,
"train_loss": 0.7921485962039632,
"train_runtime": 4193.8899,
"train_samples_per_second": 33.618,
"train_steps_per_second": 1.05
}
],
"logging_steps": 10,
"max_steps": 4405,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.40234358432727e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}