GENOME-gemma-2b-it / open_orca /trainer_state.json

Upload 15 files

60ee796 verified 10 months ago

78.3 kB

	{
	"best_metric": null,
	"best_model_checkpoint": null,
	"epoch": 4.997163925127623,
	"eval_steps": 1000,
	"global_step": 4405,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"epoch": 0.011344299489506523,
	"grad_norm": 2.3206300735473633,
	"learning_rate": 4.535147392290249e-06,
	"loss": 1.5929,
	"step": 10
	},
	{
	"epoch": 0.022688598979013045,
	"grad_norm": 1.2386493682861328,
	"learning_rate": 9.070294784580499e-06,
	"loss": 1.6159,
	"step": 20
	},
	{
	"epoch": 0.03403289846851957,
	"grad_norm": 1.1790252923965454,
	"learning_rate": 1.360544217687075e-05,
	"loss": 1.538,
	"step": 30
	},
	{
	"epoch": 0.04537719795802609,
	"grad_norm": 1.021796703338623,
	"learning_rate": 1.8140589569160997e-05,
	"loss": 1.4132,
	"step": 40
	},
	{
	"epoch": 0.05672149744753262,
	"grad_norm": 1.3392266035079956,
	"learning_rate": 2.267573696145125e-05,
	"loss": 1.2604,
	"step": 50
	},
	{
	"epoch": 0.06806579693703914,
	"grad_norm": 0.9446895122528076,
	"learning_rate": 2.72108843537415e-05,
	"loss": 1.1644,
	"step": 60
	},
	{
	"epoch": 0.07941009642654566,
	"grad_norm": 1.3290923833847046,
	"learning_rate": 3.1746031746031745e-05,
	"loss": 1.1082,
	"step": 70
	},
	{
	"epoch": 0.09075439591605218,
	"grad_norm": 1.5161434412002563,
	"learning_rate": 3.6281179138321995e-05,
	"loss": 1.0389,
	"step": 80
	},
	{
	"epoch": 0.1020986954055587,
	"grad_norm": 0.6483525633811951,
	"learning_rate": 4.0816326530612245e-05,
	"loss": 1.0542,
	"step": 90
	},
	{
	"epoch": 0.11344299489506524,
	"grad_norm": 0.8814989924430847,
	"learning_rate": 4.53514739229025e-05,
	"loss": 0.9847,
	"step": 100
	},
	{
	"epoch": 0.12478729438457176,
	"grad_norm": 0.7316718101501465,
	"learning_rate": 4.9886621315192745e-05,
	"loss": 1.0585,
	"step": 110
	},
	{
	"epoch": 0.13613159387407828,
	"grad_norm": 0.7645348310470581,
	"learning_rate": 5.4421768707483e-05,
	"loss": 0.9713,
	"step": 120
	},
	{
	"epoch": 0.1474758933635848,
	"grad_norm": 0.6830883622169495,
	"learning_rate": 5.895691609977324e-05,
	"loss": 0.9823,
	"step": 130
	},
	{
	"epoch": 0.15882019285309132,
	"grad_norm": 1.3199207782745361,
	"learning_rate": 6.349206349206349e-05,
	"loss": 0.9992,
	"step": 140
	},
	{
	"epoch": 0.17016449234259784,
	"grad_norm": 0.7770159840583801,
	"learning_rate": 6.802721088435374e-05,
	"loss": 1.0085,
	"step": 150
	},
	{
	"epoch": 0.18150879183210436,
	"grad_norm": 1.623410940170288,
	"learning_rate": 7.256235827664399e-05,
	"loss": 1.0491,
	"step": 160
	},
	{
	"epoch": 0.19285309132161088,
	"grad_norm": 2.8830106258392334,
	"learning_rate": 7.709750566893424e-05,
	"loss": 1.0686,
	"step": 170
	},
	{
	"epoch": 0.2041973908111174,
	"grad_norm": 1.3428577184677124,
	"learning_rate": 8.163265306122449e-05,
	"loss": 1.0359,
	"step": 180
	},
	{
	"epoch": 0.21554169030062392,
	"grad_norm": 0.8043076395988464,
	"learning_rate": 8.616780045351474e-05,
	"loss": 1.0496,
	"step": 190
	},
	{
	"epoch": 0.22688598979013047,
	"grad_norm": 1.8799352645874023,
	"learning_rate": 9.0702947845805e-05,
	"loss": 1.0284,
	"step": 200
	},
	{
	"epoch": 0.238230289279637,
	"grad_norm": 0.6667978167533875,
	"learning_rate": 9.523809523809524e-05,
	"loss": 1.0162,
	"step": 210
	},
	{
	"epoch": 0.2495745887691435,
	"grad_norm": 0.815127968788147,
	"learning_rate": 9.977324263038549e-05,
	"loss": 1.0009,
	"step": 220
	},
	{
	"epoch": 0.26091888825865,
	"grad_norm": 0.6558067798614502,
	"learning_rate": 0.00010430839002267574,
	"loss": 1.004,
	"step": 230
	},
	{
	"epoch": 0.27226318774815655,
	"grad_norm": 0.6002511382102966,
	"learning_rate": 0.000108843537414966,
	"loss": 0.9702,
	"step": 240
	},
	{
	"epoch": 0.28360748723766305,
	"grad_norm": 0.7007895708084106,
	"learning_rate": 0.00011337868480725624,
	"loss": 1.0266,
	"step": 250
	},
	{
	"epoch": 0.2949517867271696,
	"grad_norm": 0.7985921502113342,
	"learning_rate": 0.00011791383219954648,
	"loss": 0.9753,
	"step": 260
	},
	{
	"epoch": 0.30629608621667614,
	"grad_norm": 0.5343239903450012,
	"learning_rate": 0.00012244897959183676,
	"loss": 1.036,
	"step": 270
	},
	{
	"epoch": 0.31764038570618264,
	"grad_norm": 0.7095124125480652,
	"learning_rate": 0.00012698412698412698,
	"loss": 1.0061,
	"step": 280
	},
	{
	"epoch": 0.3289846851956892,
	"grad_norm": 0.8570685386657715,
	"learning_rate": 0.00013151927437641726,
	"loss": 0.9458,
	"step": 290
	},
	{
	"epoch": 0.3403289846851957,
	"grad_norm": 0.6379779577255249,
	"learning_rate": 0.00013605442176870748,
	"loss": 0.9965,
	"step": 300
	},
	{
	"epoch": 0.3516732841747022,
	"grad_norm": 0.9263567328453064,
	"learning_rate": 0.00014058956916099776,
	"loss": 0.9601,
	"step": 310
	},
	{
	"epoch": 0.3630175836642087,
	"grad_norm": 0.7343761920928955,
	"learning_rate": 0.00014512471655328798,
	"loss": 1.0182,
	"step": 320
	},
	{
	"epoch": 0.37436188315371527,
	"grad_norm": 0.588762640953064,
	"learning_rate": 0.00014965986394557826,
	"loss": 0.9762,
	"step": 330
	},
	{
	"epoch": 0.38570618264322176,
	"grad_norm": 0.6719630360603333,
	"learning_rate": 0.00015419501133786848,
	"loss": 0.989,
	"step": 340
	},
	{
	"epoch": 0.3970504821327283,
	"grad_norm": 1.641836166381836,
	"learning_rate": 0.00015873015873015873,
	"loss": 0.9611,
	"step": 350
	},
	{
	"epoch": 0.4083947816222348,
	"grad_norm": 0.9340532422065735,
	"learning_rate": 0.00016326530612244898,
	"loss": 0.9861,
	"step": 360
	},
	{
	"epoch": 0.41973908111174135,
	"grad_norm": 0.737554669380188,
	"learning_rate": 0.00016780045351473923,
	"loss": 1.0,
	"step": 370
	},
	{
	"epoch": 0.43108338060124785,
	"grad_norm": 1.1190237998962402,
	"learning_rate": 0.00017233560090702948,
	"loss": 1.016,
	"step": 380
	},
	{
	"epoch": 0.4424276800907544,
	"grad_norm": 0.7501509785652161,
	"learning_rate": 0.00017687074829931973,
	"loss": 0.9743,
	"step": 390
	},
	{
	"epoch": 0.45377197958026094,
	"grad_norm": 0.5105754733085632,
	"learning_rate": 0.00018140589569161,
	"loss": 1.0182,
	"step": 400
	},
	{
	"epoch": 0.46511627906976744,
	"grad_norm": 0.7148075699806213,
	"learning_rate": 0.00018594104308390023,
	"loss": 0.9673,
	"step": 410
	},
	{
	"epoch": 0.476460578559274,
	"grad_norm": 0.49944302439689636,
	"learning_rate": 0.00019047619047619048,
	"loss": 1.0083,
	"step": 420
	},
	{
	"epoch": 0.4878048780487805,
	"grad_norm": 0.5624661445617676,
	"learning_rate": 0.00019501133786848073,
	"loss": 1.0201,
	"step": 430
	},
	{
	"epoch": 0.499149177538287,
	"grad_norm": 0.5779452919960022,
	"learning_rate": 0.00019954648526077098,
	"loss": 1.0165,
	"step": 440
	},
	{
	"epoch": 0.5104934770277936,
	"grad_norm": 0.8505494594573975,
	"learning_rate": 0.0001999974561843451,
	"loss": 0.9527,
	"step": 450
	},
	{
	"epoch": 0.5218377765173,
	"grad_norm": 0.7141993641853333,
	"learning_rate": 0.00019998866291366877,
	"loss": 0.9927,
	"step": 460
	},
	{
	"epoch": 0.5331820760068066,
	"grad_norm": 0.5913094282150269,
	"learning_rate": 0.0001999735893350151,
	"loss": 1.0054,
	"step": 470
	},
	{
	"epoch": 0.5445263754963131,
	"grad_norm": 0.5813531279563904,
	"learning_rate": 0.00019995223639515864,
	"loss": 0.9511,
	"step": 480
	},
	{
	"epoch": 0.5558706749858197,
	"grad_norm": 0.9083317518234253,
	"learning_rate": 0.0001999246054352818,
	"loss": 0.9596,
	"step": 490
	},
	{
	"epoch": 0.5672149744753261,
	"grad_norm": 0.8444753885269165,
	"learning_rate": 0.00019989069819089067,
	"loss": 1.0163,
	"step": 500
	},
	{
	"epoch": 0.5785592739648326,
	"grad_norm": 0.6896610856056213,
	"learning_rate": 0.0001998505167917061,
	"loss": 0.9606,
	"step": 510
	},
	{
	"epoch": 0.5899035734543392,
	"grad_norm": 0.7446523308753967,
	"learning_rate": 0.00019980406376152984,
	"loss": 0.9748,
	"step": 520
	},
	{
	"epoch": 0.6012478729438457,
	"grad_norm": 0.5111407041549683,
	"learning_rate": 0.00019975134201808605,
	"loss": 0.9364,
	"step": 530
	},
	{
	"epoch": 0.6125921724333523,
	"grad_norm": 0.6797256469726562,
	"learning_rate": 0.000199692354872838,
	"loss": 0.9766,
	"step": 540
	},
	{
	"epoch": 0.6239364719228587,
	"grad_norm": 0.9774245619773865,
	"learning_rate": 0.00019962710603078007,
	"loss": 0.9669,
	"step": 550
	},
	{
	"epoch": 0.6352807714123653,
	"grad_norm": 0.7039481997489929,
	"learning_rate": 0.0001995555995902052,
	"loss": 0.9371,
	"step": 560
	},
	{
	"epoch": 0.6466250709018718,
	"grad_norm": 0.7363829016685486,
	"learning_rate": 0.0001994778400424472,
	"loss": 0.9809,
	"step": 570
	},
	{
	"epoch": 0.6579693703913784,
	"grad_norm": 0.7072857022285461,
	"learning_rate": 0.0001993938322715989,
	"loss": 0.9825,
	"step": 580
	},
	{
	"epoch": 0.6693136698808848,
	"grad_norm": 0.5628974437713623,
	"learning_rate": 0.00019930358155420525,
	"loss": 0.9101,
	"step": 590
	},
	{
	"epoch": 0.6806579693703914,
	"grad_norm": 0.6564317345619202,
	"learning_rate": 0.0001992070935589319,
	"loss": 1.0374,
	"step": 600
	},
	{
	"epoch": 0.6920022688598979,
	"grad_norm": 0.5805884599685669,
	"learning_rate": 0.0001991043743462092,
	"loss": 0.9695,
	"step": 610
	},
	{
	"epoch": 0.7033465683494045,
	"grad_norm": 0.5671830773353577,
	"learning_rate": 0.00019899543036785145,
	"loss": 0.9598,
	"step": 620
	},
	{
	"epoch": 0.7146908678389109,
	"grad_norm": 0.54367595911026,
	"learning_rate": 0.0001988802684666519,
	"loss": 0.962,
	"step": 630
	},
	{
	"epoch": 0.7260351673284174,
	"grad_norm": 0.6982467770576477,
	"learning_rate": 0.00019875889587595252,
	"loss": 0.9633,
	"step": 640
	},
	{
	"epoch": 0.737379466817924,
	"grad_norm": 0.6268488764762878,
	"learning_rate": 0.00019863132021919025,
	"loss": 0.9684,
	"step": 650
	},
	{
	"epoch": 0.7487237663074305,
	"grad_norm": 1.2111632823944092,
	"learning_rate": 0.00019849754950941758,
	"loss": 1.0044,
	"step": 660
	},
	{
	"epoch": 0.7600680657969371,
	"grad_norm": 0.6442829370498657,
	"learning_rate": 0.00019835759214879964,
	"loss": 0.9533,
	"step": 670
	},
	{
	"epoch": 0.7714123652864435,
	"grad_norm": 0.5263229608535767,
	"learning_rate": 0.00019821145692808633,
	"loss": 0.959,
	"step": 680
	},
	{
	"epoch": 0.7827566647759501,
	"grad_norm": 0.572928786277771,
	"learning_rate": 0.00019805915302606016,
	"loss": 0.9473,
	"step": 690
	},
	{
	"epoch": 0.7941009642654566,
	"grad_norm": 0.6176092624664307,
	"learning_rate": 0.00019790069000895987,
	"loss": 0.9164,
	"step": 700
	},
	{
	"epoch": 0.8054452637549632,
	"grad_norm": 0.5628384351730347,
	"learning_rate": 0.00019773607782987924,
	"loss": 0.9705,
	"step": 710
	},
	{
	"epoch": 0.8167895632444696,
	"grad_norm": 0.8331648111343384,
	"learning_rate": 0.00019756532682814232,
	"loss": 0.9497,
	"step": 720
	},
	{
	"epoch": 0.8281338627339762,
	"grad_norm": 0.5843848586082458,
	"learning_rate": 0.00019738844772865377,
	"loss": 0.9828,
	"step": 730
	},
	{
	"epoch": 0.8394781622234827,
	"grad_norm": 0.6603434681892395,
	"learning_rate": 0.0001972054516412253,
	"loss": 0.9717,
	"step": 740
	},
	{
	"epoch": 0.8508224617129893,
	"grad_norm": 0.5622076988220215,
	"learning_rate": 0.00019701635005987792,
	"loss": 0.9392,
	"step": 750
	},
	{
	"epoch": 0.8621667612024957,
	"grad_norm": 0.8947564959526062,
	"learning_rate": 0.00019682115486211984,
	"loss": 0.9917,
	"step": 760
	},
	{
	"epoch": 0.8735110606920022,
	"grad_norm": 0.5935038328170776,
	"learning_rate": 0.00019661987830820065,
	"loss": 0.9749,
	"step": 770
	},
	{
	"epoch": 0.8848553601815088,
	"grad_norm": 0.8751797676086426,
	"learning_rate": 0.000196412533040341,
	"loss": 0.9828,
	"step": 780
	},
	{
	"epoch": 0.8961996596710153,
	"grad_norm": 0.5279515981674194,
	"learning_rate": 0.00019619913208193882,
	"loss": 0.9685,
	"step": 790
	},
	{
	"epoch": 0.9075439591605219,
	"grad_norm": 0.643695056438446,
	"learning_rate": 0.00019597968883675116,
	"loss": 0.9547,
	"step": 800
	},
	{
	"epoch": 0.9188882586500283,
	"grad_norm": 0.7370747923851013,
	"learning_rate": 0.00019575421708805215,
	"loss": 0.9129,
	"step": 810
	},
	{
	"epoch": 0.9302325581395349,
	"grad_norm": 0.7514728307723999,
	"learning_rate": 0.0001955227309977677,
	"loss": 0.9929,
	"step": 820
	},
	{
	"epoch": 0.9415768576290414,
	"grad_norm": 0.6589088439941406,
	"learning_rate": 0.00019528524510558547,
	"loss": 0.9627,
	"step": 830
	},
	{
	"epoch": 0.952921157118548,
	"grad_norm": 0.548102617263794,
	"learning_rate": 0.00019504177432804203,
	"loss": 0.9307,
	"step": 840
	},
	{
	"epoch": 0.9642654566080544,
	"grad_norm": 0.458879679441452,
	"learning_rate": 0.00019479233395758576,
	"loss": 0.9838,
	"step": 850
	},
	{
	"epoch": 0.975609756097561,
	"grad_norm": 0.9955594539642334,
	"learning_rate": 0.0001945369396616164,
	"loss": 0.9246,
	"step": 860
	},
	{
	"epoch": 0.9869540555870675,
	"grad_norm": 0.5781052708625793,
	"learning_rate": 0.0001942756074815009,
	"loss": 1.0076,
	"step": 870
	},
	{
	"epoch": 0.998298355076574,
	"grad_norm": 0.7370733022689819,
	"learning_rate": 0.00019400835383156592,
	"loss": 0.9618,
	"step": 880
	},
	{
	"epoch": 1.0096426545660806,
	"grad_norm": 0.6173350214958191,
	"learning_rate": 0.00019373519549806682,
	"loss": 0.872,
	"step": 890
	},
	{
	"epoch": 1.0209869540555871,
	"grad_norm": 0.6110262274742126,
	"learning_rate": 0.00019345614963813334,
	"loss": 0.8953,
	"step": 900
	},
	{
	"epoch": 1.0323312535450937,
	"grad_norm": 0.8880902528762817,
	"learning_rate": 0.00019317123377869192,
	"loss": 0.8847,
	"step": 910
	},
	{
	"epoch": 1.0436755530346,
	"grad_norm": 0.6907595992088318,
	"learning_rate": 0.00019288046581536486,
	"loss": 0.8878,
	"step": 920
	},
	{
	"epoch": 1.0550198525241066,
	"grad_norm": 0.7469139695167542,
	"learning_rate": 0.00019258386401134624,
	"loss": 0.9018,
	"step": 930
	},
	{
	"epoch": 1.0663641520136131,
	"grad_norm": 0.8650104403495789,
	"learning_rate": 0.0001922814469962549,
	"loss": 0.8825,
	"step": 940
	},
	{
	"epoch": 1.0777084515031197,
	"grad_norm": 1.1437135934829712,
	"learning_rate": 0.00019197323376496427,
	"loss": 0.8977,
	"step": 950
	},
	{
	"epoch": 1.0890527509926262,
	"grad_norm": 0.6191611289978027,
	"learning_rate": 0.00019165924367640916,
	"loss": 0.9059,
	"step": 960
	},
	{
	"epoch": 1.1003970504821328,
	"grad_norm": 0.7402692437171936,
	"learning_rate": 0.00019133949645237005,
	"loss": 0.8778,
	"step": 970
	},
	{
	"epoch": 1.1117413499716393,
	"grad_norm": 0.7002813220024109,
	"learning_rate": 0.00019101401217623426,
	"loss": 0.9281,
	"step": 980
	},
	{
	"epoch": 1.1230856494611459,
	"grad_norm": 0.9000174403190613,
	"learning_rate": 0.00019068281129173444,
	"loss": 0.8795,
	"step": 990
	},
	{
	"epoch": 1.1344299489506522,
	"grad_norm": 0.6749204993247986,
	"learning_rate": 0.00019034591460166463,
	"loss": 0.9091,
	"step": 1000
	},
	{
	"epoch": 1.1344299489506522,
	"eval_loss": 0.8940885663032532,
	"eval_runtime": 15.7869,
	"eval_samples_per_second": 94.065,
	"eval_steps_per_second": 11.782,
	"step": 1000
	},
	{
	"epoch": 1.1457742484401587,
	"grad_norm": 0.7294667959213257,
	"learning_rate": 0.00019000334326657345,
	"loss": 0.879,
	"step": 1010
	},
	{
	"epoch": 1.1571185479296653,
	"grad_norm": 0.9591787457466125,
	"learning_rate": 0.00018965511880343527,
	"loss": 0.9264,
	"step": 1020
	},
	{
	"epoch": 1.1684628474191718,
	"grad_norm": 0.9575808644294739,
	"learning_rate": 0.00018930126308429844,
	"loss": 0.8825,
	"step": 1030
	},
	{
	"epoch": 1.1798071469086784,
	"grad_norm": 0.49267736077308655,
	"learning_rate": 0.00018894179833491164,
	"loss": 0.9321,
	"step": 1040
	},
	{
	"epoch": 1.191151446398185,
	"grad_norm": 0.848102867603302,
	"learning_rate": 0.00018857674713332795,
	"loss": 0.8543,
	"step": 1050
	},
	{
	"epoch": 1.2024957458876915,
	"grad_norm": 0.7710912227630615,
	"learning_rate": 0.00018820613240848655,
	"loss": 0.9468,
	"step": 1060
	},
	{
	"epoch": 1.213840045377198,
	"grad_norm": 0.6399308443069458,
	"learning_rate": 0.00018782997743877264,
	"loss": 0.9081,
	"step": 1070
	},
	{
	"epoch": 1.2251843448667046,
	"grad_norm": 0.9124737977981567,
	"learning_rate": 0.00018744830585055538,
	"loss": 0.9288,
	"step": 1080
	},
	{
	"epoch": 1.236528644356211,
	"grad_norm": 0.6313666105270386,
	"learning_rate": 0.00018706114161670377,
	"loss": 0.8197,
	"step": 1090
	},
	{
	"epoch": 1.2478729438457175,
	"grad_norm": 0.7220073938369751,
	"learning_rate": 0.000186668509055081,
	"loss": 0.8576,
	"step": 1100
	},
	{
	"epoch": 1.259217243335224,
	"grad_norm": 1.1808422803878784,
	"learning_rate": 0.00018627043282701703,
	"loss": 0.9044,
	"step": 1110
	},
	{
	"epoch": 1.2705615428247305,
	"grad_norm": 0.6578934788703918,
	"learning_rate": 0.00018586693793575966,
	"loss": 0.9015,
	"step": 1120
	},
	{
	"epoch": 1.281905842314237,
	"grad_norm": 0.9080325961112976,
	"learning_rate": 0.0001854580497249039,
	"loss": 0.8919,
	"step": 1130
	},
	{
	"epoch": 1.2932501418037436,
	"grad_norm": 0.6446923017501831,
	"learning_rate": 0.00018504379387680034,
	"loss": 0.9033,
	"step": 1140
	},
	{
	"epoch": 1.3045944412932502,
	"grad_norm": 0.6877492070198059,
	"learning_rate": 0.00018462419641094189,
	"loss": 0.8843,
	"step": 1150
	},
	{
	"epoch": 1.3159387407827567,
	"grad_norm": 0.6565636396408081,
	"learning_rate": 0.00018419928368232957,
	"loss": 0.8925,
	"step": 1160
	},
	{
	"epoch": 1.3272830402722633,
	"grad_norm": 0.8198230862617493,
	"learning_rate": 0.0001837690823798171,
	"loss": 0.8495,
	"step": 1170
	},
	{
	"epoch": 1.3386273397617696,
	"grad_norm": 0.7579399347305298,
	"learning_rate": 0.00018333361952443462,
	"loss": 0.9051,
	"step": 1180
	},
	{
	"epoch": 1.3499716392512762,
	"grad_norm": 0.8067922592163086,
	"learning_rate": 0.0001828929224676914,
	"loss": 0.8677,
	"step": 1190
	},
	{
	"epoch": 1.3613159387407827,
	"grad_norm": 0.7077610492706299,
	"learning_rate": 0.00018244701888985802,
	"loss": 0.942,
	"step": 1200
	},
	{
	"epoch": 1.3726602382302893,
	"grad_norm": 1.2009291648864746,
	"learning_rate": 0.00018199593679822765,
	"loss": 0.9034,
	"step": 1210
	},
	{
	"epoch": 1.3840045377197958,
	"grad_norm": 0.8162534832954407,
	"learning_rate": 0.00018153970452535698,
	"loss": 0.8904,
	"step": 1220
	},
	{
	"epoch": 1.3953488372093024,
	"grad_norm": 0.6332406401634216,
	"learning_rate": 0.00018107835072728656,
	"loss": 0.8637,
	"step": 1230
	},
	{
	"epoch": 1.406693136698809,
	"grad_norm": 0.6449089050292969,
	"learning_rate": 0.00018061190438174105,
	"loss": 0.9463,
	"step": 1240
	},
	{
	"epoch": 1.4180374361883152,
	"grad_norm": 0.6543394327163696,
	"learning_rate": 0.00018014039478630894,
	"loss": 0.8497,
	"step": 1250
	},
	{
	"epoch": 1.429381735677822,
	"grad_norm": 0.7993437647819519,
	"learning_rate": 0.0001796638515566025,
	"loss": 0.9415,
	"step": 1260
	},
	{
	"epoch": 1.4407260351673283,
	"grad_norm": 0.878514289855957,
	"learning_rate": 0.0001791823046243977,
	"loss": 0.9143,
	"step": 1270
	},
	{
	"epoch": 1.4520703346568349,
	"grad_norm": 0.6794580817222595,
	"learning_rate": 0.00017869578423575387,
	"loss": 0.9041,
	"step": 1280
	},
	{
	"epoch": 1.4634146341463414,
	"grad_norm": 0.9009565711021423,
	"learning_rate": 0.00017820432094911427,
	"loss": 0.8773,
	"step": 1290
	},
	{
	"epoch": 1.474758933635848,
	"grad_norm": 0.6419825553894043,
	"learning_rate": 0.00017770794563338647,
	"loss": 0.9027,
	"step": 1300
	},
	{
	"epoch": 1.4861032331253545,
	"grad_norm": 0.7277469635009766,
	"learning_rate": 0.0001772066894660037,
	"loss": 0.9123,
	"step": 1310
	},
	{
	"epoch": 1.497447532614861,
	"grad_norm": 0.7514845132827759,
	"learning_rate": 0.00017670058393096634,
	"loss": 0.9095,
	"step": 1320
	},
	{
	"epoch": 1.5087918321043676,
	"grad_norm": 0.5530194044113159,
	"learning_rate": 0.0001761896608168646,
	"loss": 0.855,
	"step": 1330
	},
	{
	"epoch": 1.520136131593874,
	"grad_norm": 0.6379088759422302,
	"learning_rate": 0.0001756739522148818,
	"loss": 0.9485,
	"step": 1340
	},
	{
	"epoch": 1.5314804310833807,
	"grad_norm": 0.5411556959152222,
	"learning_rate": 0.0001751534905167787,
	"loss": 0.951,
	"step": 1350
	},
	{
	"epoch": 1.542824730572887,
	"grad_norm": 0.9241764545440674,
	"learning_rate": 0.00017462830841285894,
	"loss": 0.8459,
	"step": 1360
	},
	{
	"epoch": 1.5541690300623936,
	"grad_norm": 0.9029989242553711,
	"learning_rate": 0.00017409843888991584,
	"loss": 0.9045,
	"step": 1370
	},
	{
	"epoch": 1.5655133295519001,
	"grad_norm": 0.9002951979637146,
	"learning_rate": 0.00017356391522916042,
	"loss": 0.8388,
	"step": 1380
	},
	{
	"epoch": 1.5768576290414067,
	"grad_norm": 0.6322818994522095,
	"learning_rate": 0.0001730247710041311,
	"loss": 0.8937,
	"step": 1390
	},
	{
	"epoch": 1.5882019285309132,
	"grad_norm": 0.9197801351547241,
	"learning_rate": 0.00017248104007858476,
	"loss": 0.8656,
	"step": 1400
	},
	{
	"epoch": 1.5995462280204198,
	"grad_norm": 0.7498595714569092,
	"learning_rate": 0.00017193275660436997,
	"loss": 0.8848,
	"step": 1410
	},
	{
	"epoch": 1.6108905275099263,
	"grad_norm": 1.0003221035003662,
	"learning_rate": 0.00017137995501928166,
	"loss": 0.8494,
	"step": 1420
	},
	{
	"epoch": 1.6222348269994327,
	"grad_norm": 0.6622512340545654,
	"learning_rate": 0.00017082267004489842,
	"loss": 0.9158,
	"step": 1430
	},
	{
	"epoch": 1.6335791264889394,
	"grad_norm": 1.2562657594680786,
	"learning_rate": 0.00017026093668440114,
	"loss": 0.8899,
	"step": 1440
	},
	{
	"epoch": 1.6449234259784458,
	"grad_norm": 0.5380372405052185,
	"learning_rate": 0.00016969479022037502,
	"loss": 0.9082,
	"step": 1450
	},
	{
	"epoch": 1.6562677254679523,
	"grad_norm": 0.7120011448860168,
	"learning_rate": 0.00016912426621259297,
	"loss": 0.8456,
	"step": 1460
	},
	{
	"epoch": 1.6676120249574589,
	"grad_norm": 0.580111026763916,
	"learning_rate": 0.0001685494004957824,
	"loss": 0.9272,
	"step": 1470
	},
	{
	"epoch": 1.6789563244469654,
	"grad_norm": 0.9516561627388,
	"learning_rate": 0.0001679702291773743,
	"loss": 0.906,
	"step": 1480
	},
	{
	"epoch": 1.690300623936472,
	"grad_norm": 0.5973901152610779,
	"learning_rate": 0.0001673867886352354,
	"loss": 0.931,
	"step": 1490
	},
	{
	"epoch": 1.7016449234259783,
	"grad_norm": 0.7292883992195129,
	"learning_rate": 0.00016679911551538317,
	"loss": 0.8848,
	"step": 1500
	},
	{
	"epoch": 1.712989222915485,
	"grad_norm": 0.6363751888275146,
	"learning_rate": 0.0001662072467296842,
	"loss": 0.9059,
	"step": 1510
	},
	{
	"epoch": 1.7243335224049914,
	"grad_norm": 0.9236806631088257,
	"learning_rate": 0.00016561121945353566,
	"loss": 0.8557,
	"step": 1520
	},
	{
	"epoch": 1.7356778218944982,
	"grad_norm": 0.6865366697311401,
	"learning_rate": 0.00016501107112353028,
	"loss": 0.9264,
	"step": 1530
	},
	{
	"epoch": 1.7470221213840045,
	"grad_norm": 0.6749486923217773,
	"learning_rate": 0.00016440683943510516,
	"loss": 0.9224,
	"step": 1540
	},
	{
	"epoch": 1.758366420873511,
	"grad_norm": 0.7539329528808594,
	"learning_rate": 0.00016379856234017382,
	"loss": 0.8594,
	"step": 1550
	},
	{
	"epoch": 1.7697107203630176,
	"grad_norm": 0.6702885031700134,
	"learning_rate": 0.0001631862780447426,
	"loss": 0.8896,
	"step": 1560
	},
	{
	"epoch": 1.7810550198525241,
	"grad_norm": 0.6152791976928711,
	"learning_rate": 0.00016257002500651098,
	"loss": 0.8738,
	"step": 1570
	},
	{
	"epoch": 1.7923993193420307,
	"grad_norm": 0.5736550688743591,
	"learning_rate": 0.00016194984193245587,
	"loss": 0.9018,
	"step": 1580
	},
	{
	"epoch": 1.803743618831537,
	"grad_norm": 0.751157820224762,
	"learning_rate": 0.00016132576777640067,
	"loss": 0.8605,
	"step": 1590
	},
	{
	"epoch": 1.8150879183210438,
	"grad_norm": 0.6626732349395752,
	"learning_rate": 0.0001606978417365682,
	"loss": 0.8857,
	"step": 1600
	},
	{
	"epoch": 1.82643221781055,
	"grad_norm": 0.584065318107605,
	"learning_rate": 0.00016006610325311908,
	"loss": 0.9104,
	"step": 1610
	},
	{
	"epoch": 1.8377765173000569,
	"grad_norm": 0.5933496356010437,
	"learning_rate": 0.0001594305920056742,
	"loss": 0.8167,
	"step": 1620
	},
	{
	"epoch": 1.8491208167895632,
	"grad_norm": 0.5618401765823364,
	"learning_rate": 0.00015879134791082247,
	"loss": 0.8907,
	"step": 1630
	},
	{
	"epoch": 1.8604651162790697,
	"grad_norm": 0.9804329872131348,
	"learning_rate": 0.00015814841111961374,
	"loss": 0.9494,
	"step": 1640
	},
	{
	"epoch": 1.8718094157685763,
	"grad_norm": 0.937347412109375,
	"learning_rate": 0.00015750182201503682,
	"loss": 0.9045,
	"step": 1650
	},
	{
	"epoch": 1.8831537152580828,
	"grad_norm": 0.8898664712905884,
	"learning_rate": 0.00015685162120948317,
	"loss": 0.9346,
	"step": 1660
	},
	{
	"epoch": 1.8944980147475894,
	"grad_norm": 0.8580901622772217,
	"learning_rate": 0.00015619784954219577,
	"loss": 0.9412,
	"step": 1670
	},
	{
	"epoch": 1.9058423142370957,
	"grad_norm": 0.6913225054740906,
	"learning_rate": 0.00015554054807670418,
	"loss": 0.9006,
	"step": 1680
	},
	{
	"epoch": 1.9171866137266025,
	"grad_norm": 0.7101637125015259,
	"learning_rate": 0.00015487975809824539,
	"loss": 0.8857,
	"step": 1690
	},
	{
	"epoch": 1.9285309132161088,
	"grad_norm": 0.8228437900543213,
	"learning_rate": 0.00015421552111117044,
	"loss": 0.8607,
	"step": 1700
	},
	{
	"epoch": 1.9398752127056156,
	"grad_norm": 0.5591906905174255,
	"learning_rate": 0.00015354787883633782,
	"loss": 0.8674,
	"step": 1710
	},
	{
	"epoch": 1.951219512195122,
	"grad_norm": 0.6841379404067993,
	"learning_rate": 0.00015287687320849271,
	"loss": 0.8387,
	"step": 1720
	},
	{
	"epoch": 1.9625638116846285,
	"grad_norm": 0.8344857096672058,
	"learning_rate": 0.00015220254637363318,
	"loss": 0.9227,
	"step": 1730
	},
	{
	"epoch": 1.973908111174135,
	"grad_norm": 0.8986241221427917,
	"learning_rate": 0.00015152494068636308,
	"loss": 0.8917,
	"step": 1740
	},
	{
	"epoch": 1.9852524106636416,
	"grad_norm": 0.5783970952033997,
	"learning_rate": 0.00015084409870723154,
	"loss": 0.872,
	"step": 1750
	},
	{
	"epoch": 1.996596710153148,
	"grad_norm": 0.6369901895523071,
	"learning_rate": 0.00015016006320005986,
	"loss": 0.9132,
	"step": 1760
	},
	{
	"epoch": 2.0079410096426544,
	"grad_norm": 0.5906355381011963,
	"learning_rate": 0.00014947287712925545,
	"loss": 0.8074,
	"step": 1770
	},
	{
	"epoch": 2.019285309132161,
	"grad_norm": 0.6774492263793945,
	"learning_rate": 0.00014878258365711334,
	"loss": 0.759,
	"step": 1780
	},
	{
	"epoch": 2.0306296086216675,
	"grad_norm": 0.8353272676467896,
	"learning_rate": 0.00014808922614110493,
	"loss": 0.8028,
	"step": 1790
	},
	{
	"epoch": 2.0419739081111743,
	"grad_norm": 0.8876771926879883,
	"learning_rate": 0.00014739284813115498,
	"loss": 0.7302,
	"step": 1800
	},
	{
	"epoch": 2.0533182076006806,
	"grad_norm": 0.6215524673461914,
	"learning_rate": 0.00014669349336690594,
	"loss": 0.7759,
	"step": 1810
	},
	{
	"epoch": 2.0646625070901874,
	"grad_norm": 0.5663015246391296,
	"learning_rate": 0.00014599120577497087,
	"loss": 0.7834,
	"step": 1820
	},
	{
	"epoch": 2.0760068065796937,
	"grad_norm": 0.6096060872077942,
	"learning_rate": 0.00014528602946617432,
	"loss": 0.8364,
	"step": 1830
	},
	{
	"epoch": 2.0873511060692,
	"grad_norm": 0.7625316977500916,
	"learning_rate": 0.00014457800873278172,
	"loss": 0.7558,
	"step": 1840
	},
	{
	"epoch": 2.098695405558707,
	"grad_norm": 0.6301640272140503,
	"learning_rate": 0.0001438671880457174,
	"loss": 0.8297,
	"step": 1850
	},
	{
	"epoch": 2.110039705048213,
	"grad_norm": 0.6493074297904968,
	"learning_rate": 0.00014315361205177127,
	"loss": 0.7764,
	"step": 1860
	},
	{
	"epoch": 2.12138400453772,
	"grad_norm": 0.8326807618141174,
	"learning_rate": 0.0001424373255707947,
	"loss": 0.7895,
	"step": 1870
	},
	{
	"epoch": 2.1327283040272262,
	"grad_norm": 1.0578484535217285,
	"learning_rate": 0.00014171837359288524,
	"loss": 0.7889,
	"step": 1880
	},
	{
	"epoch": 2.144072603516733,
	"grad_norm": 0.6812543272972107,
	"learning_rate": 0.0001409968012755609,
	"loss": 0.7643,
	"step": 1890
	},
	{
	"epoch": 2.1554169030062393,
	"grad_norm": 0.8412303924560547,
	"learning_rate": 0.00014027265394092364,
	"loss": 0.7402,
	"step": 1900
	},
	{
	"epoch": 2.1667612024957457,
	"grad_norm": 0.947846531867981,
	"learning_rate": 0.00013954597707281288,
	"loss": 0.7763,
	"step": 1910
	},
	{
	"epoch": 2.1781055019852524,
	"grad_norm": 0.7577157616615295,
	"learning_rate": 0.00013881681631394842,
	"loss": 0.8334,
	"step": 1920
	},
	{
	"epoch": 2.1894498014747588,
	"grad_norm": 0.6362768411636353,
	"learning_rate": 0.0001380852174630639,
	"loss": 0.7484,
	"step": 1930
	},
	{
	"epoch": 2.2007941009642655,
	"grad_norm": 0.7967275381088257,
	"learning_rate": 0.00013735122647202984,
	"loss": 0.7302,
	"step": 1940
	},
	{
	"epoch": 2.212138400453772,
	"grad_norm": 0.7726805210113525,
	"learning_rate": 0.0001366148894429677,
	"loss": 0.7836,
	"step": 1950
	},
	{
	"epoch": 2.2234826999432786,
	"grad_norm": 0.7741623520851135,
	"learning_rate": 0.00013587625262535396,
	"loss": 0.7925,
	"step": 1960
	},
	{
	"epoch": 2.234826999432785,
	"grad_norm": 0.7582458257675171,
	"learning_rate": 0.0001351353624131153,
	"loss": 0.7765,
	"step": 1970
	},
	{
	"epoch": 2.2461712989222917,
	"grad_norm": 0.8276723027229309,
	"learning_rate": 0.00013439226534171463,
	"loss": 0.81,
	"step": 1980
	},
	{
	"epoch": 2.257515598411798,
	"grad_norm": 0.8419069051742554,
	"learning_rate": 0.00013364700808522807,
	"loss": 0.7464,
	"step": 1990
	},
	{
	"epoch": 2.2688598979013044,
	"grad_norm": 0.7446946501731873,
	"learning_rate": 0.00013289963745341345,
	"loss": 0.7524,
	"step": 2000
	},
	{
	"epoch": 2.2688598979013044,
	"eval_loss": 0.9066722989082336,
	"eval_runtime": 15.6396,
	"eval_samples_per_second": 94.951,
	"eval_steps_per_second": 11.893,
	"step": 2000
	},
	{
	"epoch": 2.280204197390811,
	"grad_norm": 0.7091513872146606,
	"learning_rate": 0.00013215020038877002,
	"loss": 0.7806,
	"step": 2010
	},
	{
	"epoch": 2.2915484968803175,
	"grad_norm": 0.5853792428970337,
	"learning_rate": 0.0001313987439635902,
	"loss": 0.7625,
	"step": 2020
	},
	{
	"epoch": 2.3028927963698242,
	"grad_norm": 0.7464004158973694,
	"learning_rate": 0.00013064531537700284,
	"loss": 0.7313,
	"step": 2030
	},
	{
	"epoch": 2.3142370958593306,
	"grad_norm": 0.6370956301689148,
	"learning_rate": 0.00012988996195200858,
	"loss": 0.7903,
	"step": 2040
	},
	{
	"epoch": 2.3255813953488373,
	"grad_norm": 0.8973234295845032,
	"learning_rate": 0.0001291327311325076,
	"loss": 0.7537,
	"step": 2050
	},
	{
	"epoch": 2.3369256948383437,
	"grad_norm": 1.206678032875061,
	"learning_rate": 0.00012837367048031955,
	"loss": 0.8081,
	"step": 2060
	},
	{
	"epoch": 2.3482699943278504,
	"grad_norm": 0.9258993864059448,
	"learning_rate": 0.0001276128276721963,
	"loss": 0.7754,
	"step": 2070
	},
	{
	"epoch": 2.3596142938173568,
	"grad_norm": 0.8008835315704346,
	"learning_rate": 0.00012685025049682732,
	"loss": 0.8119,
	"step": 2080
	},
	{
	"epoch": 2.370958593306863,
	"grad_norm": 0.8094901442527771,
	"learning_rate": 0.0001260859868518379,
	"loss": 0.7889,
	"step": 2090
	},
	{
	"epoch": 2.38230289279637,
	"grad_norm": 0.7824433445930481,
	"learning_rate": 0.00012532008474078093,
	"loss": 0.8443,
	"step": 2100
	},
	{
	"epoch": 2.393647192285876,
	"grad_norm": 0.8314623236656189,
	"learning_rate": 0.00012455259227012172,
	"loss": 0.8009,
	"step": 2110
	},
	{
	"epoch": 2.404991491775383,
	"grad_norm": 0.993483304977417,
	"learning_rate": 0.0001237835576462163,
	"loss": 0.803,
	"step": 2120
	},
	{
	"epoch": 2.4163357912648893,
	"grad_norm": 0.7922090291976929,
	"learning_rate": 0.00012301302917228364,
	"loss": 0.7785,
	"step": 2130
	},
	{
	"epoch": 2.427680090754396,
	"grad_norm": 0.8681336045265198,
	"learning_rate": 0.00012224105524537176,
	"loss": 0.7427,
	"step": 2140
	},
	{
	"epoch": 2.4390243902439024,
	"grad_norm": 0.868011474609375,
	"learning_rate": 0.00012146768435331797,
	"loss": 0.7841,
	"step": 2150
	},
	{
	"epoch": 2.450368689733409,
	"grad_norm": 0.8300703763961792,
	"learning_rate": 0.00012069296507170307,
	"loss": 0.7113,
	"step": 2160
	},
	{
	"epoch": 2.4617129892229155,
	"grad_norm": 1.0211178064346313,
	"learning_rate": 0.00011991694606080062,
	"loss": 0.7927,
	"step": 2170
	},
	{
	"epoch": 2.473057288712422,
	"grad_norm": 1.1126124858856201,
	"learning_rate": 0.00011913967606252035,
	"loss": 0.798,
	"step": 2180
	},
	{
	"epoch": 2.4844015882019286,
	"grad_norm": 1.331468939781189,
	"learning_rate": 0.00011836120389734677,
	"loss": 0.7868,
	"step": 2190
	},
	{
	"epoch": 2.495745887691435,
	"grad_norm": 0.7289639115333557,
	"learning_rate": 0.00011758157846127278,
	"loss": 0.7501,
	"step": 2200
	},
	{
	"epoch": 2.5070901871809417,
	"grad_norm": 0.6862948536872864,
	"learning_rate": 0.00011680084872272843,
	"loss": 0.8113,
	"step": 2210
	},
	{
	"epoch": 2.518434486670448,
	"grad_norm": 0.6838523745536804,
	"learning_rate": 0.00011601906371950523,
	"loss": 0.7794,
	"step": 2220
	},
	{
	"epoch": 2.5297787861599548,
	"grad_norm": 0.8923412561416626,
	"learning_rate": 0.00011523627255567606,
	"loss": 0.7532,
	"step": 2230
	},
	{
	"epoch": 2.541123085649461,
	"grad_norm": 0.7864569425582886,
	"learning_rate": 0.00011445252439851092,
	"loss": 0.8044,
	"step": 2240
	},
	{
	"epoch": 2.552467385138968,
	"grad_norm": 0.9186776280403137,
	"learning_rate": 0.0001136678684753889,
	"loss": 0.7861,
	"step": 2250
	},
	{
	"epoch": 2.563811684628474,
	"grad_norm": 0.9502933025360107,
	"learning_rate": 0.00011288235407070588,
	"loss": 0.7441,
	"step": 2260
	},
	{
	"epoch": 2.5751559841179805,
	"grad_norm": 0.9764688014984131,
	"learning_rate": 0.00011209603052277924,
	"loss": 0.7519,
	"step": 2270
	},
	{
	"epoch": 2.5865002836074873,
	"grad_norm": 0.8480959534645081,
	"learning_rate": 0.00011130894722074874,
	"loss": 0.7743,
	"step": 2280
	},
	{
	"epoch": 2.5978445830969936,
	"grad_norm": 0.8660979866981506,
	"learning_rate": 0.00011052115360147448,
	"loss": 0.7989,
	"step": 2290
	},
	{
	"epoch": 2.6091888825865004,
	"grad_norm": 0.6586043238639832,
	"learning_rate": 0.0001097326991464318,
	"loss": 0.7676,
	"step": 2300
	},
	{
	"epoch": 2.6205331820760067,
	"grad_norm": 0.7315343618392944,
	"learning_rate": 0.00010894363337860314,
	"loss": 0.7699,
	"step": 2310
	},
	{
	"epoch": 2.6318774815655135,
	"grad_norm": 0.7257770895957947,
	"learning_rate": 0.0001081540058593677,
	"loss": 0.7773,
	"step": 2320
	},
	{
	"epoch": 2.64322178105502,
	"grad_norm": 0.6760928630828857,
	"learning_rate": 0.00010736386618538838,
	"loss": 0.7902,
	"step": 2330
	},
	{
	"epoch": 2.6545660805445266,
	"grad_norm": 0.6824659705162048,
	"learning_rate": 0.00010657326398549661,
	"loss": 0.7759,
	"step": 2340
	},
	{
	"epoch": 2.665910380034033,
	"grad_norm": 0.972321629524231,
	"learning_rate": 0.0001057822489175752,
	"loss": 0.7926,
	"step": 2350
	},
	{
	"epoch": 2.6772546795235392,
	"grad_norm": 0.9526649713516235,
	"learning_rate": 0.00010499087066543922,
	"loss": 0.7648,
	"step": 2360
	},
	{
	"epoch": 2.688598979013046,
	"grad_norm": 0.7266947031021118,
	"learning_rate": 0.0001041991789357155,
	"loss": 0.776,
	"step": 2370
	},
	{
	"epoch": 2.6999432785025523,
	"grad_norm": 0.808121383190155,
	"learning_rate": 0.00010340722345472037,
	"loss": 0.7852,
	"step": 2380
	},
	{
	"epoch": 2.711287577992059,
	"grad_norm": 1.1124972105026245,
	"learning_rate": 0.00010261505396533648,
	"loss": 0.717,
	"step": 2390
	},
	{
	"epoch": 2.7226318774815654,
	"grad_norm": 0.7241740226745605,
	"learning_rate": 0.00010182272022388841,
	"loss": 0.8335,
	"step": 2400
	},
	{
	"epoch": 2.733976176971072,
	"grad_norm": 1.0944820642471313,
	"learning_rate": 0.0001010302719970174,
	"loss": 0.7874,
	"step": 2410
	},
	{
	"epoch": 2.7453204764605785,
	"grad_norm": 0.735615611076355,
	"learning_rate": 0.00010023775905855559,
	"loss": 0.7198,
	"step": 2420
	},
	{
	"epoch": 2.7566647759500853,
	"grad_norm": 0.8080368041992188,
	"learning_rate": 9.944523118639958e-05,
	"loss": 0.8275,
	"step": 2430
	},
	{
	"epoch": 2.7680090754395916,
	"grad_norm": 1.0709086656570435,
	"learning_rate": 9.865273815938403e-05,
	"loss": 0.841,
	"step": 2440
	},
	{
	"epoch": 2.779353374929098,
	"grad_norm": 0.8561082482337952,
	"learning_rate": 9.786032975415503e-05,
	"loss": 0.7393,
	"step": 2450
	},
	{
	"epoch": 2.7906976744186047,
	"grad_norm": 0.6831649541854858,
	"learning_rate": 9.706805574204341e-05,
	"loss": 0.7904,
	"step": 2460
	},
	{
	"epoch": 2.802041973908111,
	"grad_norm": 0.9404779672622681,
	"learning_rate": 9.627596588593884e-05,
	"loss": 0.7651,
	"step": 2470
	},
	{
	"epoch": 2.813386273397618,
	"grad_norm": 1.1059134006500244,
	"learning_rate": 9.54841099371641e-05,
	"loss": 0.7792,
	"step": 2480
	},
	{
	"epoch": 2.824730572887124,
	"grad_norm": 0.8339388966560364,
	"learning_rate": 9.469253763235015e-05,
	"loss": 0.8037,
	"step": 2490
	},
	{
	"epoch": 2.8360748723766305,
	"grad_norm": 0.691879153251648,
	"learning_rate": 9.390129869031232e-05,
	"loss": 0.7882,
	"step": 2500
	},
	{
	"epoch": 2.8474191718661372,
	"grad_norm": 0.8173119425773621,
	"learning_rate": 9.311044280892728e-05,
	"loss": 0.7723,
	"step": 2510
	},
	{
	"epoch": 2.858763471355644,
	"grad_norm": 1.2163662910461426,
	"learning_rate": 9.232001966201159e-05,
	"loss": 0.8332,
	"step": 2520
	},
	{
	"epoch": 2.8701077708451503,
	"grad_norm": 0.7762579917907715,
	"learning_rate": 9.153007889620169e-05,
	"loss": 0.8017,
	"step": 2530
	},
	{
	"epoch": 2.8814520703346567,
	"grad_norm": 0.7560020089149475,
	"learning_rate": 9.074067012783551e-05,
	"loss": 0.7645,
	"step": 2540
	},
	{
	"epoch": 2.8927963698241634,
	"grad_norm": 0.7039526104927063,
	"learning_rate": 8.995184293983627e-05,
	"loss": 0.7496,
	"step": 2550
	},
	{
	"epoch": 2.9041406693136698,
	"grad_norm": 0.8188515305519104,
	"learning_rate": 8.916364687859782e-05,
	"loss": 0.7941,
	"step": 2560
	},
	{
	"epoch": 2.9154849688031765,
	"grad_norm": 0.8847174048423767,
	"learning_rate": 8.837613145087289e-05,
	"loss": 0.7462,
	"step": 2570
	},
	{
	"epoch": 2.926829268292683,
	"grad_norm": 1.4302834272384644,
	"learning_rate": 8.758934612066353e-05,
	"loss": 0.7659,
	"step": 2580
	},
	{
	"epoch": 2.938173567782189,
	"grad_norm": 0.8293200135231018,
	"learning_rate": 8.680334030611414e-05,
	"loss": 0.7464,
	"step": 2590
	},
	{
	"epoch": 2.949517867271696,
	"grad_norm": 0.9347418546676636,
	"learning_rate": 8.601816337640767e-05,
	"loss": 0.7907,
	"step": 2600
	},
	{
	"epoch": 2.9608621667612027,
	"grad_norm": 0.8685625195503235,
	"learning_rate": 8.523386464866452e-05,
	"loss": 0.7881,
	"step": 2610
	},
	{
	"epoch": 2.972206466250709,
	"grad_norm": 1.0375618934631348,
	"learning_rate": 8.44504933848452e-05,
	"loss": 0.7415,
	"step": 2620
	},
	{
	"epoch": 2.9835507657402154,
	"grad_norm": 1.1286613941192627,
	"learning_rate": 8.366809878865594e-05,
	"loss": 0.759,
	"step": 2630
	},
	{
	"epoch": 2.994895065229722,
	"grad_norm": 0.9496249556541443,
	"learning_rate": 8.28867300024582e-05,
	"loss": 0.8122,
	"step": 2640
	},
	{
	"epoch": 3.0062393647192285,
	"grad_norm": 0.6161667108535767,
	"learning_rate": 8.210643610418232e-05,
	"loss": 0.7363,
	"step": 2650
	},
	{
	"epoch": 3.0175836642087353,
	"grad_norm": 1.1362223625183105,
	"learning_rate": 8.132726610424453e-05,
	"loss": 0.6957,
	"step": 2660
	},
	{
	"epoch": 3.0289279636982416,
	"grad_norm": 0.9549693465232849,
	"learning_rate": 8.054926894246887e-05,
	"loss": 0.6598,
	"step": 2670
	},
	{
	"epoch": 3.0402722631877483,
	"grad_norm": 0.7844473719596863,
	"learning_rate": 7.977249348501314e-05,
	"loss": 0.7104,
	"step": 2680
	},
	{
	"epoch": 3.0516165626772547,
	"grad_norm": 0.9754497408866882,
	"learning_rate": 7.899698852129962e-05,
	"loss": 0.7109,
	"step": 2690
	},
	{
	"epoch": 3.062960862166761,
	"grad_norm": 0.8465747237205505,
	"learning_rate": 7.822280276095073e-05,
	"loss": 0.6208,
	"step": 2700
	},
	{
	"epoch": 3.0743051616562678,
	"grad_norm": 0.7896714806556702,
	"learning_rate": 7.744998483072936e-05,
	"loss": 0.6417,
	"step": 2710
	},
	{
	"epoch": 3.085649461145774,
	"grad_norm": 0.8668105006217957,
	"learning_rate": 7.667858327148475e-05,
	"loss": 0.6525,
	"step": 2720
	},
	{
	"epoch": 3.096993760635281,
	"grad_norm": 1.0019567012786865,
	"learning_rate": 7.590864653510359e-05,
	"loss": 0.6604,
	"step": 2730
	},
	{
	"epoch": 3.108338060124787,
	"grad_norm": 0.7561362981796265,
	"learning_rate": 7.514022298146679e-05,
	"loss": 0.6912,
	"step": 2740
	},
	{
	"epoch": 3.119682359614294,
	"grad_norm": 0.9435575604438782,
	"learning_rate": 7.437336087541187e-05,
	"loss": 0.6993,
	"step": 2750
	},
	{
	"epoch": 3.1310266591038003,
	"grad_norm": 1.041034460067749,
	"learning_rate": 7.360810838370161e-05,
	"loss": 0.6562,
	"step": 2760
	},
	{
	"epoch": 3.142370958593307,
	"grad_norm": 0.8745769262313843,
	"learning_rate": 7.284451357199851e-05,
	"loss": 0.6035,
	"step": 2770
	},
	{
	"epoch": 3.1537152580828134,
	"grad_norm": 0.9436658620834351,
	"learning_rate": 7.208262440184584e-05,
	"loss": 0.6591,
	"step": 2780
	},
	{
	"epoch": 3.1650595575723197,
	"grad_norm": 0.9558268785476685,
	"learning_rate": 7.13224887276553e-05,
	"loss": 0.7548,
	"step": 2790
	},
	{
	"epoch": 3.1764038570618265,
	"grad_norm": 1.3072495460510254,
	"learning_rate": 7.056415429370106e-05,
	"loss": 0.648,
	"step": 2800
	},
	{
	"epoch": 3.187748156551333,
	"grad_norm": 1.0742169618606567,
	"learning_rate": 6.980766873112106e-05,
	"loss": 0.6646,
	"step": 2810
	},
	{
	"epoch": 3.1990924560408396,
	"grad_norm": 0.8391577005386353,
	"learning_rate": 6.905307955492523e-05,
	"loss": 0.6844,
	"step": 2820
	},
	{
	"epoch": 3.210436755530346,
	"grad_norm": 0.9172285795211792,
	"learning_rate": 6.83004341610111e-05,
	"loss": 0.6671,
	"step": 2830
	},
	{
	"epoch": 3.2217810550198527,
	"grad_norm": 1.0791727304458618,
	"learning_rate": 6.754977982318693e-05,
	"loss": 0.6619,
	"step": 2840
	},
	{
	"epoch": 3.233125354509359,
	"grad_norm": 0.8881738781929016,
	"learning_rate": 6.68011636902022e-05,
	"loss": 0.678,
	"step": 2850
	},
	{
	"epoch": 3.2444696539988658,
	"grad_norm": 0.8353477120399475,
	"learning_rate": 6.605463278278646e-05,
	"loss": 0.7061,
	"step": 2860
	},
	{
	"epoch": 3.255813953488372,
	"grad_norm": 0.9251864552497864,
	"learning_rate": 6.531023399069574e-05,
	"loss": 0.6658,
	"step": 2870
	},
	{
	"epoch": 3.2671582529778784,
	"grad_norm": 0.7780378460884094,
	"learning_rate": 6.45680140697675e-05,
	"loss": 0.6327,
	"step": 2880
	},
	{
	"epoch": 3.278502552467385,
	"grad_norm": 1.3496202230453491,
	"learning_rate": 6.38280196389839e-05,
	"loss": 0.6658,
	"step": 2890
	},
	{
	"epoch": 3.2898468519568915,
	"grad_norm": 1.0429950952529907,
	"learning_rate": 6.309029717754362e-05,
	"loss": 0.7013,
	"step": 2900
	},
	{
	"epoch": 3.3011911514463983,
	"grad_norm": 0.7141017317771912,
	"learning_rate": 6.235489302194247e-05,
	"loss": 0.6969,
	"step": 2910
	},
	{
	"epoch": 3.3125354509359046,
	"grad_norm": 1.2669309377670288,
	"learning_rate": 6.162185336306294e-05,
	"loss": 0.6468,
	"step": 2920
	},
	{
	"epoch": 3.3238797504254114,
	"grad_norm": 0.8476207852363586,
	"learning_rate": 6.089122424327307e-05,
	"loss": 0.6501,
	"step": 2930
	},
	{
	"epoch": 3.3352240499149177,
	"grad_norm": 0.9521162509918213,
	"learning_rate": 6.01630515535345e-05,
	"loss": 0.6546,
	"step": 2940
	},
	{
	"epoch": 3.346568349404424,
	"grad_norm": 0.7817677855491638,
	"learning_rate": 5.943738103051997e-05,
	"loss": 0.6919,
	"step": 2950
	},
	{
	"epoch": 3.357912648893931,
	"grad_norm": 0.776945948600769,
	"learning_rate": 5.8714258253740564e-05,
	"loss": 0.6897,
	"step": 2960
	},
	{
	"epoch": 3.369256948383437,
	"grad_norm": 0.9761963486671448,
	"learning_rate": 5.7993728642683e-05,
	"loss": 0.6299,
	"step": 2970
	},
	{
	"epoch": 3.380601247872944,
	"grad_norm": 0.7887254953384399,
	"learning_rate": 5.7275837453956614e-05,
	"loss": 0.6773,
	"step": 2980
	},
	{
	"epoch": 3.3919455473624502,
	"grad_norm": 0.860835611820221,
	"learning_rate": 5.656062977845116e-05,
	"loss": 0.6239,
	"step": 2990
	},
	{
	"epoch": 3.403289846851957,
	"grad_norm": 0.9700385928153992,
	"learning_rate": 5.584815053850407e-05,
	"loss": 0.7148,
	"step": 3000
	},
	{
	"epoch": 3.403289846851957,
	"eval_loss": 0.9692808389663696,
	"eval_runtime": 15.7325,
	"eval_samples_per_second": 94.39,
	"eval_steps_per_second": 11.823,
	"step": 3000
	},
	{
	"epoch": 3.4146341463414633,
	"grad_norm": 1.335462212562561,
	"learning_rate": 5.51384444850794e-05,
	"loss": 0.6387,
	"step": 3010
	},
	{
	"epoch": 3.42597844583097,
	"grad_norm": 0.8788994550704956,
	"learning_rate": 5.443155619495679e-05,
	"loss": 0.6809,
	"step": 3020
	},
	{
	"epoch": 3.4373227453204764,
	"grad_norm": 0.9188012480735779,
	"learning_rate": 5.372753006793143e-05,
	"loss": 0.6724,
	"step": 3030
	},
	{
	"epoch": 3.4486670448099828,
	"grad_norm": 0.9619457125663757,
	"learning_rate": 5.302641032402578e-05,
	"loss": 0.6789,
	"step": 3040
	},
	{
	"epoch": 3.4600113442994895,
	"grad_norm": 0.9403857588768005,
	"learning_rate": 5.2328241000711464e-05,
	"loss": 0.6274,
	"step": 3050
	},
	{
	"epoch": 3.471355643788996,
	"grad_norm": 0.9259539246559143,
	"learning_rate": 5.16330659501438e-05,
	"loss": 0.6551,
	"step": 3060
	},
	{
	"epoch": 3.4826999432785026,
	"grad_norm": 1.07770574092865,
	"learning_rate": 5.094092883640718e-05,
	"loss": 0.6593,
	"step": 3070
	},
	{
	"epoch": 3.494044242768009,
	"grad_norm": 0.7347473502159119,
	"learning_rate": 5.0251873132772576e-05,
	"loss": 0.6847,
	"step": 3080
	},
	{
	"epoch": 3.5053885422575157,
	"grad_norm": 0.9838495254516602,
	"learning_rate": 4.956594211896701e-05,
	"loss": 0.6667,
	"step": 3090
	},
	{
	"epoch": 3.516732841747022,
	"grad_norm": 1.1671929359436035,
	"learning_rate": 4.8883178878454996e-05,
	"loss": 0.683,
	"step": 3100
	},
	{
	"epoch": 3.528077141236529,
	"grad_norm": 0.6510323882102966,
	"learning_rate": 4.8203626295732675e-05,
	"loss": 0.6946,
	"step": 3110
	},
	{
	"epoch": 3.539421440726035,
	"grad_norm": 0.7871556282043457,
	"learning_rate": 4.7527327053634094e-05,
	"loss": 0.6652,
	"step": 3120
	},
	{
	"epoch": 3.5507657402155415,
	"grad_norm": 0.8053673505783081,
	"learning_rate": 4.685432363065036e-05,
	"loss": 0.6431,
	"step": 3130
	},
	{
	"epoch": 3.5621100397050482,
	"grad_norm": 0.8162011504173279,
	"learning_rate": 4.618465829826145e-05,
	"loss": 0.6089,
	"step": 3140
	},
	{
	"epoch": 3.5734543391945546,
	"grad_norm": 1.0298821926116943,
	"learning_rate": 4.551837311828131e-05,
	"loss": 0.6645,
	"step": 3150
	},
	{
	"epoch": 3.5847986386840613,
	"grad_norm": 1.0996955633163452,
	"learning_rate": 4.485550994021567e-05,
	"loss": 0.6872,
	"step": 3160
	},
	{
	"epoch": 3.5961429381735677,
	"grad_norm": 0.9979953765869141,
	"learning_rate": 4.419611039863377e-05,
	"loss": 0.628,
	"step": 3170
	},
	{
	"epoch": 3.6074872376630744,
	"grad_norm": 1.0593342781066895,
	"learning_rate": 4.354021591055311e-05,
	"loss": 0.6864,
	"step": 3180
	},
	{
	"epoch": 3.6188315371525808,
	"grad_norm": 1.6677913665771484,
	"learning_rate": 4.2887867672838056e-05,
	"loss": 0.6232,
	"step": 3190
	},
	{
	"epoch": 3.6301758366420875,
	"grad_norm": 0.8164204359054565,
	"learning_rate": 4.223910665961235e-05,
	"loss": 0.6786,
	"step": 3200
	},
	{
	"epoch": 3.641520136131594,
	"grad_norm": 0.8163765072822571,
	"learning_rate": 4.15939736196853e-05,
	"loss": 0.6763,
	"step": 3210
	},
	{
	"epoch": 3.6528644356211,
	"grad_norm": 0.9765521883964539,
	"learning_rate": 4.095250907399262e-05,
	"loss": 0.6719,
	"step": 3220
	},
	{
	"epoch": 3.664208735110607,
	"grad_norm": 0.9238688349723816,
	"learning_rate": 4.03147533130511e-05,
	"loss": 0.68,
	"step": 3230
	},
	{
	"epoch": 3.6755530346001133,
	"grad_norm": 0.9760640859603882,
	"learning_rate": 3.968074639442805e-05,
	"loss": 0.6542,
	"step": 3240
	},
	{
	"epoch": 3.68689733408962,
	"grad_norm": 0.9406284689903259,
	"learning_rate": 3.905052814022523e-05,
	"loss": 0.653,
	"step": 3250
	},
	{
	"epoch": 3.6982416335791264,
	"grad_norm": 0.9423522353172302,
	"learning_rate": 3.842413813457758e-05,
	"loss": 0.706,
	"step": 3260
	},
	{
	"epoch": 3.709585933068633,
	"grad_norm": 0.8088165521621704,
	"learning_rate": 3.780161572116704e-05,
	"loss": 0.7161,
	"step": 3270
	},
	{
	"epoch": 3.7209302325581395,
	"grad_norm": 0.9071544408798218,
	"learning_rate": 3.718300000075129e-05,
	"loss": 0.7193,
	"step": 3280
	},
	{
	"epoch": 3.7322745320476463,
	"grad_norm": 0.8792480230331421,
	"learning_rate": 3.6568329828707836e-05,
	"loss": 0.6381,
	"step": 3290
	},
	{
	"epoch": 3.7436188315371526,
	"grad_norm": 1.0307759046554565,
	"learning_rate": 3.5957643812593543e-05,
	"loss": 0.6668,
	"step": 3300
	},
	{
	"epoch": 3.754963131026659,
	"grad_norm": 1.0883175134658813,
	"learning_rate": 3.5350980309719514e-05,
	"loss": 0.6978,
	"step": 3310
	},
	{
	"epoch": 3.7663074305161657,
	"grad_norm": 1.0448516607284546,
	"learning_rate": 3.4748377424742115e-05,
	"loss": 0.6756,
	"step": 3320
	},
	{
	"epoch": 3.777651730005672,
	"grad_norm": 0.8772532939910889,
	"learning_rate": 3.414987300726945e-05,
	"loss": 0.6714,
	"step": 3330
	},
	{
	"epoch": 3.7889960294951788,
	"grad_norm": 1.0115753412246704,
	"learning_rate": 3.3555504649484046e-05,
	"loss": 0.6773,
	"step": 3340
	},
	{
	"epoch": 3.800340328984685,
	"grad_norm": 1.1093175411224365,
	"learning_rate": 3.296530968378173e-05,
	"loss": 0.6916,
	"step": 3350
	},
	{
	"epoch": 3.811684628474192,
	"grad_norm": 0.8998281359672546,
	"learning_rate": 3.237932518042664e-05,
	"loss": 0.6801,
	"step": 3360
	},
	{
	"epoch": 3.823028927963698,
	"grad_norm": 1.0179048776626587,
	"learning_rate": 3.1797587945223026e-05,
	"loss": 0.6702,
	"step": 3370
	},
	{
	"epoch": 3.834373227453205,
	"grad_norm": 0.9240026473999023,
	"learning_rate": 3.1220134517203335e-05,
	"loss": 0.671,
	"step": 3380
	},
	{
	"epoch": 3.8457175269427113,
	"grad_norm": 0.7641962766647339,
	"learning_rate": 3.0647001166333245e-05,
	"loss": 0.7147,
	"step": 3390
	},
	{
	"epoch": 3.8570618264322176,
	"grad_norm": 0.9078419804573059,
	"learning_rate": 3.0078223891233514e-05,
	"loss": 0.7155,
	"step": 3400
	},
	{
	"epoch": 3.8684061259217244,
	"grad_norm": 0.962393045425415,
	"learning_rate": 2.9513838416918815e-05,
	"loss": 0.6866,
	"step": 3410
	},
	{
	"epoch": 3.8797504254112307,
	"grad_norm": 1.5198420286178589,
	"learning_rate": 2.8953880192554105e-05,
	"loss": 0.6741,
	"step": 3420
	},
	{
	"epoch": 3.8910947249007375,
	"grad_norm": 1.1129947900772095,
	"learning_rate": 2.8398384389227816e-05,
	"loss": 0.6542,
	"step": 3430
	},
	{
	"epoch": 3.902439024390244,
	"grad_norm": 0.8633179664611816,
	"learning_rate": 2.7847385897742705e-05,
	"loss": 0.6768,
	"step": 3440
	},
	{
	"epoch": 3.9137833238797506,
	"grad_norm": 1.062277913093567,
	"learning_rate": 2.7300919326424658e-05,
	"loss": 0.6709,
	"step": 3450
	},
	{
	"epoch": 3.925127623369257,
	"grad_norm": 0.7949813604354858,
	"learning_rate": 2.675901899894854e-05,
	"loss": 0.6166,
	"step": 3460
	},
	{
	"epoch": 3.9364719228587637,
	"grad_norm": 0.9200356006622314,
	"learning_rate": 2.622171895218273e-05,
	"loss": 0.6718,
	"step": 3470
	},
	{
	"epoch": 3.94781622234827,
	"grad_norm": 0.9637920260429382,
	"learning_rate": 2.568905293405095e-05,
	"loss": 0.619,
	"step": 3480
	},
	{
	"epoch": 3.9591605218377763,
	"grad_norm": 1.157073974609375,
	"learning_rate": 2.516105440141262e-05,
	"loss": 0.6961,
	"step": 3490
	},
	{
	"epoch": 3.970504821327283,
	"grad_norm": 0.8323079347610474,
	"learning_rate": 2.4637756517961517e-05,
	"loss": 0.677,
	"step": 3500
	},
	{
	"epoch": 3.9818491208167894,
	"grad_norm": 0.9369989037513733,
	"learning_rate": 2.41191921521427e-05,
	"loss": 0.6619,
	"step": 3510
	},
	{
	"epoch": 3.993193420306296,
	"grad_norm": 0.8290889263153076,
	"learning_rate": 2.360539387508801e-05,
	"loss": 0.6534,
	"step": 3520
	},
	{
	"epoch": 4.0045377197958025,
	"grad_norm": 0.8619610071182251,
	"learning_rate": 2.309639395857033e-05,
	"loss": 0.6531,
	"step": 3530
	},
	{
	"epoch": 4.015882019285309,
	"grad_norm": 0.7406215071678162,
	"learning_rate": 2.259222437297649e-05,
	"loss": 0.5811,
	"step": 3540
	},
	{
	"epoch": 4.027226318774816,
	"grad_norm": 1.3408113718032837,
	"learning_rate": 2.2092916785299323e-05,
	"loss": 0.6163,
	"step": 3550
	},
	{
	"epoch": 4.038570618264322,
	"grad_norm": 0.9652060866355896,
	"learning_rate": 2.159850255714859e-05,
	"loss": 0.6345,
	"step": 3560
	},
	{
	"epoch": 4.049914917753829,
	"grad_norm": 1.2307026386260986,
	"learning_rate": 2.1109012742781142e-05,
	"loss": 0.5568,
	"step": 3570
	},
	{
	"epoch": 4.061259217243335,
	"grad_norm": 1.101637363433838,
	"learning_rate": 2.0624478087150456e-05,
	"loss": 0.608,
	"step": 3580
	},
	{
	"epoch": 4.072603516732841,
	"grad_norm": 2.5598561763763428,
	"learning_rate": 2.0144929023975413e-05,
	"loss": 0.5294,
	"step": 3590
	},
	{
	"epoch": 4.083947816222349,
	"grad_norm": 0.9463273286819458,
	"learning_rate": 1.967039567382888e-05,
	"loss": 0.5482,
	"step": 3600
	},
	{
	"epoch": 4.095292115711855,
	"grad_norm": 0.9838125109672546,
	"learning_rate": 1.920090784224581e-05,
	"loss": 0.6254,
	"step": 3610
	},
	{
	"epoch": 4.106636415201361,
	"grad_norm": 0.85828697681427,
	"learning_rate": 1.8736495017851062e-05,
	"loss": 0.5443,
	"step": 3620
	},
	{
	"epoch": 4.117980714690868,
	"grad_norm": 0.8922297954559326,
	"learning_rate": 1.827718637050736e-05,
	"loss": 0.6068,
	"step": 3630
	},
	{
	"epoch": 4.129325014180375,
	"grad_norm": 0.7973962426185608,
	"learning_rate": 1.7823010749482927e-05,
	"loss": 0.6179,
	"step": 3640
	},
	{
	"epoch": 4.140669313669881,
	"grad_norm": 0.8686882257461548,
	"learning_rate": 1.737399668163966e-05,
	"loss": 0.6186,
	"step": 3650
	},
	{
	"epoch": 4.152013613159387,
	"grad_norm": 1.4338245391845703,
	"learning_rate": 1.693017236964125e-05,
	"loss": 0.5784,
	"step": 3660
	},
	{
	"epoch": 4.163357912648894,
	"grad_norm": 0.9958694577217102,
	"learning_rate": 1.6491565690181765e-05,
	"loss": 0.6388,
	"step": 3670
	},
	{
	"epoch": 4.1747022121384,
	"grad_norm": 0.9962863922119141,
	"learning_rate": 1.605820419223476e-05,
	"loss": 0.6541,
	"step": 3680
	},
	{
	"epoch": 4.186046511627907,
	"grad_norm": 1.1754194498062134,
	"learning_rate": 1.5630115095322827e-05,
	"loss": 0.6037,
	"step": 3690
	},
	{
	"epoch": 4.197390811117414,
	"grad_norm": 1.1034218072891235,
	"learning_rate": 1.5207325287808027e-05,
	"loss": 0.5844,
	"step": 3700
	},
	{
	"epoch": 4.20873511060692,
	"grad_norm": 1.0171332359313965,
	"learning_rate": 1.4789861325203013e-05,
	"loss": 0.6724,
	"step": 3710
	},
	{
	"epoch": 4.220079410096426,
	"grad_norm": 0.9791539907455444,
	"learning_rate": 1.4377749428503006e-05,
	"loss": 0.5989,
	"step": 3720
	},
	{
	"epoch": 4.231423709585933,
	"grad_norm": 0.9501050710678101,
	"learning_rate": 1.3971015482538963e-05,
	"loss": 0.5911,
	"step": 3730
	},
	{
	"epoch": 4.24276800907544,
	"grad_norm": 1.2614890336990356,
	"learning_rate": 1.3569685034351554e-05,
	"loss": 0.5849,
	"step": 3740
	},
	{
	"epoch": 4.254112308564946,
	"grad_norm": 1.0194411277770996,
	"learning_rate": 1.3173783291586772e-05,
	"loss": 0.5976,
	"step": 3750
	},
	{
	"epoch": 4.2654566080544525,
	"grad_norm": 1.0711522102355957,
	"learning_rate": 1.2783335120912565e-05,
	"loss": 0.5931,
	"step": 3760
	},
	{
	"epoch": 4.276800907543959,
	"grad_norm": 0.8650385141372681,
	"learning_rate": 1.2398365046456783e-05,
	"loss": 0.6078,
	"step": 3770
	},
	{
	"epoch": 4.288145207033466,
	"grad_norm": 0.823208749294281,
	"learning_rate": 1.2018897248267103e-05,
	"loss": 0.5961,
	"step": 3780
	},
	{
	"epoch": 4.299489506522972,
	"grad_norm": 0.9447870850563049,
	"learning_rate": 1.1644955560791993e-05,
	"loss": 0.6468,
	"step": 3790
	},
	{
	"epoch": 4.310833806012479,
	"grad_norm": 1.102318525314331,
	"learning_rate": 1.1276563471383883e-05,
	"loss": 0.588,
	"step": 3800
	},
	{
	"epoch": 4.322178105501985,
	"grad_norm": 0.9916651248931885,
	"learning_rate": 1.0913744118823866e-05,
	"loss": 0.6188,
	"step": 3810
	},
	{
	"epoch": 4.333522404991491,
	"grad_norm": 1.1987171173095703,
	"learning_rate": 1.05565202918682e-05,
	"loss": 0.5841,
	"step": 3820
	},
	{
	"epoch": 4.3448667044809985,
	"grad_norm": 0.9708378911018372,
	"learning_rate": 1.0204914427817158e-05,
	"loss": 0.6023,
	"step": 3830
	},
	{
	"epoch": 4.356211003970505,
	"grad_norm": 1.0048896074295044,
	"learning_rate": 9.8589486111056e-06,
	"loss": 0.5705,
	"step": 3840
	},
	{
	"epoch": 4.367555303460011,
	"grad_norm": 0.8364105820655823,
	"learning_rate": 9.518644571915847e-06,
	"loss": 0.5872,
	"step": 3850
	},
	{
	"epoch": 4.3788996029495175,
	"grad_norm": 1.5254448652267456,
	"learning_rate": 9.184023684812926e-06,
	"loss": 0.6063,
	"step": 3860
	},
	{
	"epoch": 4.390243902439025,
	"grad_norm": 0.993635356426239,
	"learning_rate": 8.855106967401839e-06,
	"loss": 0.5311,
	"step": 3870
	},
	{
	"epoch": 4.401588201928531,
	"grad_norm": 0.8678284883499146,
	"learning_rate": 8.531915079007625e-06,
	"loss": 0.5894,
	"step": 3880
	},
	{
	"epoch": 4.412932501418037,
	"grad_norm": 1.081127643585205,
	"learning_rate": 8.214468319377633e-06,
	"loss": 0.5906,
	"step": 3890
	},
	{
	"epoch": 4.424276800907544,
	"grad_norm": 0.9130728840827942,
	"learning_rate": 7.902786627406477e-06,
	"loss": 0.5764,
	"step": 3900
	},
	{
	"epoch": 4.43562110039705,
	"grad_norm": 0.9263814091682434,
	"learning_rate": 7.596889579883826e-06,
	"loss": 0.5812,
	"step": 3910
	},
	{
	"epoch": 4.446965399886557,
	"grad_norm": 1.095747947692871,
	"learning_rate": 7.296796390264549e-06,
	"loss": 0.5721,
	"step": 3920
	},
	{
	"epoch": 4.458309699376064,
	"grad_norm": 0.8003553152084351,
	"learning_rate": 7.002525907462121e-06,
	"loss": 0.5882,
	"step": 3930
	},
	{
	"epoch": 4.46965399886557,
	"grad_norm": 0.8841357231140137,
	"learning_rate": 6.7140966146646e-06,
	"loss": 0.5543,
	"step": 3940
	},
	{
	"epoch": 4.480998298355076,
	"grad_norm": 0.8580918312072754,
	"learning_rate": 6.431526628173701e-06,
	"loss": 0.6549,
	"step": 3950
	},
	{
	"epoch": 4.4923425978445835,
	"grad_norm": 0.9447335004806519,
	"learning_rate": 6.154833696267015e-06,
	"loss": 0.6516,
	"step": 3960
	},
	{
	"epoch": 4.50368689733409,
	"grad_norm": 1.0485211610794067,
	"learning_rate": 5.884035198083071e-06,
	"loss": 0.579,
	"step": 3970
	},
	{
	"epoch": 4.515031196823596,
	"grad_norm": 0.9394044876098633,
	"learning_rate": 5.619148142529873e-06,
	"loss": 0.6396,
	"step": 3980
	},
	{
	"epoch": 4.526375496313102,
	"grad_norm": 0.93062824010849,
	"learning_rate": 5.360189167216545e-06,
	"loss": 0.6005,
	"step": 3990
	},
	{
	"epoch": 4.537719795802609,
	"grad_norm": 0.9513915777206421,
	"learning_rate": 5.107174537408233e-06,
	"loss": 0.5743,
	"step": 4000
	},
	{
	"epoch": 4.537719795802609,
	"eval_loss": 1.0443100929260254,
	"eval_runtime": 15.6805,
	"eval_samples_per_second": 94.704,
	"eval_steps_per_second": 11.862,
	"step": 4000
	},
	{
	"epoch": 4.549064095292116,
	"grad_norm": 0.9627020359039307,
	"learning_rate": 4.8601201450046316e-06,
	"loss": 0.6077,
	"step": 4010
	},
	{
	"epoch": 4.560408394781622,
	"grad_norm": 0.8539467453956604,
	"learning_rate": 4.619041507541688e-06,
	"loss": 0.5812,
	"step": 4020
	},
	{
	"epoch": 4.571752694271129,
	"grad_norm": 0.9446848630905151,
	"learning_rate": 4.383953767216964e-06,
	"loss": 0.624,
	"step": 4030
	},
	{
	"epoch": 4.583096993760635,
	"grad_norm": 1.188366174697876,
	"learning_rate": 4.154871689938633e-06,
	"loss": 0.6437,
	"step": 4040
	},
	{
	"epoch": 4.594441293250142,
	"grad_norm": 1.0908474922180176,
	"learning_rate": 3.931809664397867e-06,
	"loss": 0.6323,
	"step": 4050
	},
	{
	"epoch": 4.6057855927396485,
	"grad_norm": 0.9742168188095093,
	"learning_rate": 3.714781701165304e-06,
	"loss": 0.6132,
	"step": 4060
	},
	{
	"epoch": 4.617129892229155,
	"grad_norm": 0.8761405348777771,
	"learning_rate": 3.503801431810816e-06,
	"loss": 0.624,
	"step": 4070
	},
	{
	"epoch": 4.628474191718661,
	"grad_norm": 0.996088445186615,
	"learning_rate": 3.298882108047463e-06,
	"loss": 0.6009,
	"step": 4080
	},
	{
	"epoch": 4.6398184912081675,
	"grad_norm": 0.9667827486991882,
	"learning_rate": 3.10003660089907e-06,
	"loss": 0.5988,
	"step": 4090
	},
	{
	"epoch": 4.651162790697675,
	"grad_norm": 0.9298661351203918,
	"learning_rate": 2.9072773998918503e-06,
	"loss": 0.6453,
	"step": 4100
	},
	{
	"epoch": 4.662507090187181,
	"grad_norm": 0.9182038307189941,
	"learning_rate": 2.7206166122698774e-06,
	"loss": 0.5915,
	"step": 4110
	},
	{
	"epoch": 4.673851389676687,
	"grad_norm": 0.835645318031311,
	"learning_rate": 2.540065962234683e-06,
	"loss": 0.6515,
	"step": 4120
	},
	{
	"epoch": 4.685195689166194,
	"grad_norm": 0.8575255274772644,
	"learning_rate": 2.3656367902088026e-06,
	"loss": 0.6169,
	"step": 4130
	},
	{
	"epoch": 4.696539988655701,
	"grad_norm": 0.9075832962989807,
	"learning_rate": 2.19734005212352e-06,
	"loss": 0.6166,
	"step": 4140
	},
	{
	"epoch": 4.707884288145207,
	"grad_norm": 2.0740888118743896,
	"learning_rate": 2.035186318730742e-06,
	"loss": 0.5779,
	"step": 4150
	},
	{
	"epoch": 4.7192285876347135,
	"grad_norm": 1.0293558835983276,
	"learning_rate": 1.8791857749389741e-06,
	"loss": 0.6414,
	"step": 4160
	},
	{
	"epoch": 4.73057288712422,
	"grad_norm": 0.9525774121284485,
	"learning_rate": 1.7293482191736877e-06,
	"loss": 0.5802,
	"step": 4170
	},
	{
	"epoch": 4.741917186613726,
	"grad_norm": 0.9085150957107544,
	"learning_rate": 1.5856830627618001e-06,
	"loss": 0.6331,
	"step": 4180
	},
	{
	"epoch": 4.753261486103233,
	"grad_norm": 0.9908912777900696,
	"learning_rate": 1.4481993293406048e-06,
	"loss": 0.5844,
	"step": 4190
	},
	{
	"epoch": 4.76460578559274,
	"grad_norm": 0.7421241998672485,
	"learning_rate": 1.316905654291012e-06,
	"loss": 0.6653,
	"step": 4200
	},
	{
	"epoch": 4.775950085082246,
	"grad_norm": 0.857502281665802,
	"learning_rate": 1.1918102841950607e-06,
	"loss": 0.5693,
	"step": 4210
	},
	{
	"epoch": 4.787294384571752,
	"grad_norm": 0.9300210475921631,
	"learning_rate": 1.0729210763180564e-06,
	"loss": 0.5755,
	"step": 4220
	},
	{
	"epoch": 4.79863868406126,
	"grad_norm": 1.2351378202438354,
	"learning_rate": 9.602454981149977e-07,
	"loss": 0.618,
	"step": 4230
	},
	{
	"epoch": 4.809982983550766,
	"grad_norm": 1.24778151512146,
	"learning_rate": 8.537906267615415e-07,
	"loss": 0.5896,
	"step": 4240
	},
	{
	"epoch": 4.821327283040272,
	"grad_norm": 1.3560271263122559,
	"learning_rate": 7.535631487095352e-07,
	"loss": 0.5879,
	"step": 4250
	},
	{
	"epoch": 4.832671582529779,
	"grad_norm": 1.8108911514282227,
	"learning_rate": 6.59569359266976e-07,
	"loss": 0.5943,
	"step": 4260
	},
	{
	"epoch": 4.844015882019285,
	"grad_norm": 0.9743121862411499,
	"learning_rate": 5.718151622026379e-07,
	"loss": 0.6104,
	"step": 4270
	},
	{
	"epoch": 4.855360181508792,
	"grad_norm": 1.2035831212997437,
	"learning_rate": 4.903060693752348e-07,
	"loss": 0.608,
	"step": 4280
	},
	{
	"epoch": 4.866704480998298,
	"grad_norm": 0.9681785106658936,
	"learning_rate": 4.1504720038724187e-07,
	"loss": 0.5773,
	"step": 4290
	},
	{
	"epoch": 4.878048780487805,
	"grad_norm": 1.0151753425598145,
	"learning_rate": 3.4604328226333083e-07,
	"loss": 0.5609,
	"step": 4300
	},
	{
	"epoch": 4.889393079977311,
	"grad_norm": 1.0577515363693237,
	"learning_rate": 2.832986491534295e-07,
	"loss": 0.6435,
	"step": 4310
	},
	{
	"epoch": 4.900737379466818,
	"grad_norm": 0.8938112854957581,
	"learning_rate": 2.2681724206052857e-07,
	"loss": 0.6398,
	"step": 4320
	},
	{
	"epoch": 4.912081678956325,
	"grad_norm": 0.997191846370697,
	"learning_rate": 1.7660260859315713e-07,
	"loss": 0.628,
	"step": 4330
	},
	{
	"epoch": 4.923425978445831,
	"grad_norm": 0.8382704257965088,
	"learning_rate": 1.3265790274249456e-07,
	"loss": 0.6105,
	"step": 4340
	},
	{
	"epoch": 4.934770277935337,
	"grad_norm": 0.8330470323562622,
	"learning_rate": 9.498588468433989e-08,
	"loss": 0.5982,
	"step": 4350
	},
	{
	"epoch": 4.946114577424844,
	"grad_norm": 1.2183622121810913,
	"learning_rate": 6.35889206057172e-08,
	"loss": 0.5876,
	"step": 4360
	},
	{
	"epoch": 4.957458876914351,
	"grad_norm": 1.131373405456543,
	"learning_rate": 3.846898255622788e-08,
	"loss": 0.6113,
	"step": 4370
	},
	{
	"epoch": 4.968803176403857,
	"grad_norm": 1.1781286001205444,
	"learning_rate": 1.9627648324227476e-08,
	"loss": 0.5522,
	"step": 4380
	},
	{
	"epoch": 4.9801474758933635,
	"grad_norm": 1.2726503610610962,
	"learning_rate": 7.066101337682707e-09,
	"loss": 0.6312,
	"step": 4390
	},
	{
	"epoch": 4.99149177538287,
	"grad_norm": 1.1971274614334106,
	"learning_rate": 7.85130589897598e-10,
	"loss": 0.6052,
	"step": 4400
	},
	{
	"epoch": 4.997163925127623,
	"step": 4405,
	"total_flos": 9.40234358432727e+17,
	"train_loss": 0.7921485962039632,
	"train_runtime": 4193.8899,
	"train_samples_per_second": 33.618,
	"train_steps_per_second": 1.05
	}
	],
	"logging_steps": 10,
	"max_steps": 4405,
	"num_input_tokens_seen": 0,
	"num_train_epochs": 5,
	"save_steps": 1000,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": true
	},
	"attributes": {}
	}
	},
	"total_flos": 9.40234358432727e+17,
	"train_batch_size": 2,
	"trial_name": null,
	"trial_params": null
	}