{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.1584800741427248,
"eval_steps": 500,
"global_step": 2500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004633920296570899,
"grad_norm": 0.4352174997329712,
"learning_rate": 0.00029937442075996293,
"loss": 0.5216,
"step": 10
},
{
"epoch": 0.009267840593141797,
"grad_norm": 0.2795519530773163,
"learning_rate": 0.0002986793327154773,
"loss": 0.5275,
"step": 20
},
{
"epoch": 0.013901760889712697,
"grad_norm": 0.3515735864639282,
"learning_rate": 0.00029798424467099164,
"loss": 0.4768,
"step": 30
},
{
"epoch": 0.018535681186283594,
"grad_norm": 0.3210294842720032,
"learning_rate": 0.000297289156626506,
"loss": 0.4708,
"step": 40
},
{
"epoch": 0.023169601482854494,
"grad_norm": 0.3719145953655243,
"learning_rate": 0.0002965940685820204,
"loss": 0.4825,
"step": 50
},
{
"epoch": 0.027803521779425393,
"grad_norm": 0.41005653142929077,
"learning_rate": 0.0002958989805375347,
"loss": 0.5387,
"step": 60
},
{
"epoch": 0.03243744207599629,
"grad_norm": 0.44353237748146057,
"learning_rate": 0.0002952038924930491,
"loss": 0.517,
"step": 70
},
{
"epoch": 0.03707136237256719,
"grad_norm": 0.42356154322624207,
"learning_rate": 0.00029450880444856345,
"loss": 0.448,
"step": 80
},
{
"epoch": 0.04170528266913809,
"grad_norm": 0.37184569239616394,
"learning_rate": 0.00029381371640407785,
"loss": 0.5389,
"step": 90
},
{
"epoch": 0.04633920296570899,
"grad_norm": 0.34952738881111145,
"learning_rate": 0.0002931186283595922,
"loss": 0.5006,
"step": 100
},
{
"epoch": 0.05097312326227989,
"grad_norm": 0.425190806388855,
"learning_rate": 0.00029242354031510656,
"loss": 0.5391,
"step": 110
},
{
"epoch": 0.05560704355885079,
"grad_norm": 0.522118330001831,
"learning_rate": 0.0002917284522706209,
"loss": 0.5485,
"step": 120
},
{
"epoch": 0.060240963855421686,
"grad_norm": 0.42207586765289307,
"learning_rate": 0.0002910333642261353,
"loss": 0.558,
"step": 130
},
{
"epoch": 0.06487488415199258,
"grad_norm": 0.3233225643634796,
"learning_rate": 0.00029033827618164966,
"loss": 0.4983,
"step": 140
},
{
"epoch": 0.06950880444856349,
"grad_norm": 0.3595481216907501,
"learning_rate": 0.000289643188137164,
"loss": 0.4469,
"step": 150
},
{
"epoch": 0.07414272474513438,
"grad_norm": 0.4325884282588959,
"learning_rate": 0.00028894810009267837,
"loss": 0.5458,
"step": 160
},
{
"epoch": 0.07877664504170528,
"grad_norm": 0.352845162153244,
"learning_rate": 0.0002882530120481927,
"loss": 0.5034,
"step": 170
},
{
"epoch": 0.08341056533827618,
"grad_norm": 0.34022948145866394,
"learning_rate": 0.0002875579240037071,
"loss": 0.4987,
"step": 180
},
{
"epoch": 0.08804448563484708,
"grad_norm": 0.31629809737205505,
"learning_rate": 0.0002868628359592215,
"loss": 0.4883,
"step": 190
},
{
"epoch": 0.09267840593141798,
"grad_norm": 0.32983237504959106,
"learning_rate": 0.0002861677479147359,
"loss": 0.5043,
"step": 200
},
{
"epoch": 0.09731232622798888,
"grad_norm": 0.3827531337738037,
"learning_rate": 0.0002854726598702502,
"loss": 0.5171,
"step": 210
},
{
"epoch": 0.10194624652455977,
"grad_norm": 0.36385366320610046,
"learning_rate": 0.0002847775718257646,
"loss": 0.4355,
"step": 220
},
{
"epoch": 0.10658016682113068,
"grad_norm": 0.37122678756713867,
"learning_rate": 0.00028408248378127893,
"loss": 0.4366,
"step": 230
},
{
"epoch": 0.11121408711770157,
"grad_norm": 0.4853290021419525,
"learning_rate": 0.0002833873957367933,
"loss": 0.5453,
"step": 240
},
{
"epoch": 0.11584800741427248,
"grad_norm": 0.3827182948589325,
"learning_rate": 0.00028269230769230764,
"loss": 0.431,
"step": 250
},
{
"epoch": 0.12048192771084337,
"grad_norm": 0.39195120334625244,
"learning_rate": 0.00028199721964782204,
"loss": 0.4782,
"step": 260
},
{
"epoch": 0.12511584800741427,
"grad_norm": 0.35562747716903687,
"learning_rate": 0.0002813021316033364,
"loss": 0.4195,
"step": 270
},
{
"epoch": 0.12974976830398516,
"grad_norm": 0.4071958065032959,
"learning_rate": 0.00028060704355885075,
"loss": 0.5015,
"step": 280
},
{
"epoch": 0.13438368860055608,
"grad_norm": 0.43866515159606934,
"learning_rate": 0.00027991195551436515,
"loss": 0.4587,
"step": 290
},
{
"epoch": 0.13901760889712697,
"grad_norm": 0.4723324477672577,
"learning_rate": 0.0002792168674698795,
"loss": 0.4118,
"step": 300
},
{
"epoch": 0.14365152919369786,
"grad_norm": 0.35274723172187805,
"learning_rate": 0.00027852177942539385,
"loss": 0.5076,
"step": 310
},
{
"epoch": 0.14828544949026876,
"grad_norm": 0.4973192811012268,
"learning_rate": 0.0002778266913809082,
"loss": 0.5483,
"step": 320
},
{
"epoch": 0.15291936978683968,
"grad_norm": 0.32685619592666626,
"learning_rate": 0.0002771316033364226,
"loss": 0.4675,
"step": 330
},
{
"epoch": 0.15755329008341057,
"grad_norm": 0.485867977142334,
"learning_rate": 0.00027643651529193696,
"loss": 0.4491,
"step": 340
},
{
"epoch": 0.16218721037998146,
"grad_norm": 0.3535182774066925,
"learning_rate": 0.0002757414272474513,
"loss": 0.4544,
"step": 350
},
{
"epoch": 0.16682113067655235,
"grad_norm": 0.41201695799827576,
"learning_rate": 0.00027504633920296567,
"loss": 0.4577,
"step": 360
},
{
"epoch": 0.17145505097312327,
"grad_norm": 0.399549275636673,
"learning_rate": 0.00027435125115848007,
"loss": 0.447,
"step": 370
},
{
"epoch": 0.17608897126969417,
"grad_norm": 0.44300174713134766,
"learning_rate": 0.0002736561631139944,
"loss": 0.5051,
"step": 380
},
{
"epoch": 0.18072289156626506,
"grad_norm": 0.32776978611946106,
"learning_rate": 0.0002729610750695088,
"loss": 0.5166,
"step": 390
},
{
"epoch": 0.18535681186283595,
"grad_norm": 0.4692281186580658,
"learning_rate": 0.0002722659870250231,
"loss": 0.4838,
"step": 400
},
{
"epoch": 0.18999073215940684,
"grad_norm": 0.42574265599250793,
"learning_rate": 0.00027157089898053753,
"loss": 0.4456,
"step": 410
},
{
"epoch": 0.19462465245597776,
"grad_norm": 0.3274458050727844,
"learning_rate": 0.0002708758109360519,
"loss": 0.476,
"step": 420
},
{
"epoch": 0.19925857275254866,
"grad_norm": 0.38131827116012573,
"learning_rate": 0.00027018072289156623,
"loss": 0.4124,
"step": 430
},
{
"epoch": 0.20389249304911955,
"grad_norm": 0.3712831139564514,
"learning_rate": 0.0002694856348470806,
"loss": 0.4395,
"step": 440
},
{
"epoch": 0.20852641334569044,
"grad_norm": 0.43983837962150574,
"learning_rate": 0.000268790546802595,
"loss": 0.5287,
"step": 450
},
{
"epoch": 0.21316033364226136,
"grad_norm": 0.383878618478775,
"learning_rate": 0.00026809545875810934,
"loss": 0.4524,
"step": 460
},
{
"epoch": 0.21779425393883226,
"grad_norm": 0.39737293124198914,
"learning_rate": 0.0002674003707136237,
"loss": 0.5171,
"step": 470
},
{
"epoch": 0.22242817423540315,
"grad_norm": 0.3629433214664459,
"learning_rate": 0.0002667052826691381,
"loss": 0.4017,
"step": 480
},
{
"epoch": 0.22706209453197404,
"grad_norm": 0.44448867440223694,
"learning_rate": 0.0002660101946246524,
"loss": 0.4876,
"step": 490
},
{
"epoch": 0.23169601482854496,
"grad_norm": 0.4824674725532532,
"learning_rate": 0.0002653151065801668,
"loss": 0.442,
"step": 500
},
{
"epoch": 0.23632993512511585,
"grad_norm": 0.3264263868331909,
"learning_rate": 0.00026462001853568115,
"loss": 0.4849,
"step": 510
},
{
"epoch": 0.24096385542168675,
"grad_norm": 0.3806018531322479,
"learning_rate": 0.00026392493049119556,
"loss": 0.4198,
"step": 520
},
{
"epoch": 0.24559777571825764,
"grad_norm": 0.4619787931442261,
"learning_rate": 0.00026322984244670986,
"loss": 0.4348,
"step": 530
},
{
"epoch": 0.25023169601482853,
"grad_norm": 0.3893038034439087,
"learning_rate": 0.00026253475440222426,
"loss": 0.4269,
"step": 540
},
{
"epoch": 0.2548656163113994,
"grad_norm": 0.5061798095703125,
"learning_rate": 0.0002618396663577386,
"loss": 0.4367,
"step": 550
},
{
"epoch": 0.2594995366079703,
"grad_norm": 0.3888299763202667,
"learning_rate": 0.000261144578313253,
"loss": 0.4703,
"step": 560
},
{
"epoch": 0.26413345690454126,
"grad_norm": 0.5040469169616699,
"learning_rate": 0.00026044949026876737,
"loss": 0.4383,
"step": 570
},
{
"epoch": 0.26876737720111216,
"grad_norm": 0.33014264702796936,
"learning_rate": 0.0002597544022242817,
"loss": 0.4622,
"step": 580
},
{
"epoch": 0.27340129749768305,
"grad_norm": 0.4775944650173187,
"learning_rate": 0.00025905931417979607,
"loss": 0.5177,
"step": 590
},
{
"epoch": 0.27803521779425394,
"grad_norm": 0.3613554835319519,
"learning_rate": 0.0002583642261353104,
"loss": 0.4683,
"step": 600
},
{
"epoch": 0.28266913809082483,
"grad_norm": 0.4591723084449768,
"learning_rate": 0.00025766913809082483,
"loss": 0.5064,
"step": 610
},
{
"epoch": 0.2873030583873957,
"grad_norm": 0.42493635416030884,
"learning_rate": 0.0002569740500463392,
"loss": 0.5226,
"step": 620
},
{
"epoch": 0.2919369786839666,
"grad_norm": 0.3522678315639496,
"learning_rate": 0.0002562789620018536,
"loss": 0.4427,
"step": 630
},
{
"epoch": 0.2965708989805375,
"grad_norm": 0.3232039213180542,
"learning_rate": 0.0002555838739573679,
"loss": 0.4115,
"step": 640
},
{
"epoch": 0.30120481927710846,
"grad_norm": 0.32238250970840454,
"learning_rate": 0.0002548887859128823,
"loss": 0.3853,
"step": 650
},
{
"epoch": 0.30583873957367935,
"grad_norm": 0.4618857502937317,
"learning_rate": 0.00025419369786839664,
"loss": 0.5639,
"step": 660
},
{
"epoch": 0.31047265987025024,
"grad_norm": 0.3681054711341858,
"learning_rate": 0.000253498609823911,
"loss": 0.4897,
"step": 670
},
{
"epoch": 0.31510658016682114,
"grad_norm": 0.3761281371116638,
"learning_rate": 0.00025280352177942534,
"loss": 0.4275,
"step": 680
},
{
"epoch": 0.31974050046339203,
"grad_norm": 0.3062303066253662,
"learning_rate": 0.00025210843373493975,
"loss": 0.3967,
"step": 690
},
{
"epoch": 0.3243744207599629,
"grad_norm": 0.38078251481056213,
"learning_rate": 0.0002514133456904541,
"loss": 0.4405,
"step": 700
},
{
"epoch": 0.3290083410565338,
"grad_norm": 0.40751275420188904,
"learning_rate": 0.00025071825764596845,
"loss": 0.5463,
"step": 710
},
{
"epoch": 0.3336422613531047,
"grad_norm": 0.408681720495224,
"learning_rate": 0.00025002316960148286,
"loss": 0.4934,
"step": 720
},
{
"epoch": 0.3382761816496756,
"grad_norm": 0.46908262372016907,
"learning_rate": 0.0002493280815569972,
"loss": 0.445,
"step": 730
},
{
"epoch": 0.34291010194624655,
"grad_norm": 0.479021281003952,
"learning_rate": 0.00024863299351251156,
"loss": 0.4423,
"step": 740
},
{
"epoch": 0.34754402224281744,
"grad_norm": 0.37807953357696533,
"learning_rate": 0.0002479379054680259,
"loss": 0.4505,
"step": 750
},
{
"epoch": 0.35217794253938833,
"grad_norm": 0.4273180365562439,
"learning_rate": 0.0002472428174235403,
"loss": 0.4379,
"step": 760
},
{
"epoch": 0.3568118628359592,
"grad_norm": 0.4171128273010254,
"learning_rate": 0.00024654772937905467,
"loss": 0.4084,
"step": 770
},
{
"epoch": 0.3614457831325301,
"grad_norm": 0.4805515706539154,
"learning_rate": 0.000245852641334569,
"loss": 0.4373,
"step": 780
},
{
"epoch": 0.366079703429101,
"grad_norm": 0.3882713317871094,
"learning_rate": 0.00024515755329008337,
"loss": 0.4425,
"step": 790
},
{
"epoch": 0.3707136237256719,
"grad_norm": 0.3893239200115204,
"learning_rate": 0.0002444624652455978,
"loss": 0.4829,
"step": 800
},
{
"epoch": 0.3753475440222428,
"grad_norm": 0.3848889470100403,
"learning_rate": 0.00024376737720111213,
"loss": 0.4629,
"step": 810
},
{
"epoch": 0.3799814643188137,
"grad_norm": 0.3595952093601227,
"learning_rate": 0.00024307228915662648,
"loss": 0.4623,
"step": 820
},
{
"epoch": 0.38461538461538464,
"grad_norm": 0.4663516581058502,
"learning_rate": 0.00024237720111214086,
"loss": 0.3994,
"step": 830
},
{
"epoch": 0.38924930491195553,
"grad_norm": 0.4669645130634308,
"learning_rate": 0.00024168211306765524,
"loss": 0.4537,
"step": 840
},
{
"epoch": 0.3938832252085264,
"grad_norm": 0.43069687485694885,
"learning_rate": 0.00024098702502316956,
"loss": 0.4325,
"step": 850
},
{
"epoch": 0.3985171455050973,
"grad_norm": 0.5129668712615967,
"learning_rate": 0.00024029193697868394,
"loss": 0.4251,
"step": 860
},
{
"epoch": 0.4031510658016682,
"grad_norm": 0.47785133123397827,
"learning_rate": 0.00023959684893419832,
"loss": 0.4534,
"step": 870
},
{
"epoch": 0.4077849860982391,
"grad_norm": 0.34385788440704346,
"learning_rate": 0.0002389017608897127,
"loss": 0.4258,
"step": 880
},
{
"epoch": 0.41241890639481,
"grad_norm": 0.35733455419540405,
"learning_rate": 0.00023820667284522702,
"loss": 0.4888,
"step": 890
},
{
"epoch": 0.4170528266913809,
"grad_norm": 0.3299333155155182,
"learning_rate": 0.0002375115848007414,
"loss": 0.47,
"step": 900
},
{
"epoch": 0.42168674698795183,
"grad_norm": 0.34629347920417786,
"learning_rate": 0.00023681649675625578,
"loss": 0.4226,
"step": 910
},
{
"epoch": 0.4263206672845227,
"grad_norm": 0.4319722354412079,
"learning_rate": 0.00023612140871177013,
"loss": 0.4982,
"step": 920
},
{
"epoch": 0.4309545875810936,
"grad_norm": 0.3622225522994995,
"learning_rate": 0.0002354263206672845,
"loss": 0.4339,
"step": 930
},
{
"epoch": 0.4355885078776645,
"grad_norm": 0.4643559157848358,
"learning_rate": 0.00023473123262279886,
"loss": 0.4238,
"step": 940
},
{
"epoch": 0.4402224281742354,
"grad_norm": 0.3616037368774414,
"learning_rate": 0.00023403614457831324,
"loss": 0.4741,
"step": 950
},
{
"epoch": 0.4448563484708063,
"grad_norm": 0.40745407342910767,
"learning_rate": 0.0002333410565338276,
"loss": 0.4116,
"step": 960
},
{
"epoch": 0.4494902687673772,
"grad_norm": 0.4535151720046997,
"learning_rate": 0.00023264596848934197,
"loss": 0.4368,
"step": 970
},
{
"epoch": 0.4541241890639481,
"grad_norm": 0.41154956817626953,
"learning_rate": 0.00023195088044485634,
"loss": 0.4914,
"step": 980
},
{
"epoch": 0.458758109360519,
"grad_norm": 0.4555477201938629,
"learning_rate": 0.00023125579240037072,
"loss": 0.4229,
"step": 990
},
{
"epoch": 0.4633920296570899,
"grad_norm": 0.4146144390106201,
"learning_rate": 0.00023056070435588505,
"loss": 0.4097,
"step": 1000
},
{
"epoch": 0.4680259499536608,
"grad_norm": 0.36076611280441284,
"learning_rate": 0.00022986561631139943,
"loss": 0.4091,
"step": 1010
},
{
"epoch": 0.4726598702502317,
"grad_norm": 0.31528130173683167,
"learning_rate": 0.0002291705282669138,
"loss": 0.3686,
"step": 1020
},
{
"epoch": 0.4772937905468026,
"grad_norm": 0.3774864971637726,
"learning_rate": 0.00022847544022242813,
"loss": 0.4923,
"step": 1030
},
{
"epoch": 0.4819277108433735,
"grad_norm": 0.41823625564575195,
"learning_rate": 0.0002277803521779425,
"loss": 0.4546,
"step": 1040
},
{
"epoch": 0.4865616311399444,
"grad_norm": 0.3713241517543793,
"learning_rate": 0.00022708526413345689,
"loss": 0.4299,
"step": 1050
},
{
"epoch": 0.4911955514365153,
"grad_norm": 0.334870308637619,
"learning_rate": 0.00022639017608897126,
"loss": 0.4351,
"step": 1060
},
{
"epoch": 0.49582947173308617,
"grad_norm": 0.24805714190006256,
"learning_rate": 0.00022569508804448562,
"loss": 0.3929,
"step": 1070
},
{
"epoch": 0.5004633920296571,
"grad_norm": 0.529045581817627,
"learning_rate": 0.000225,
"loss": 0.4349,
"step": 1080
},
{
"epoch": 0.505097312326228,
"grad_norm": 0.5127238631248474,
"learning_rate": 0.00022430491195551435,
"loss": 0.444,
"step": 1090
},
{
"epoch": 0.5097312326227988,
"grad_norm": 0.4840947687625885,
"learning_rate": 0.0002236098239110287,
"loss": 0.4814,
"step": 1100
},
{
"epoch": 0.5143651529193698,
"grad_norm": 0.44880053400993347,
"learning_rate": 0.00022291473586654308,
"loss": 0.4466,
"step": 1110
},
{
"epoch": 0.5189990732159406,
"grad_norm": 0.2730713486671448,
"learning_rate": 0.00022221964782205745,
"loss": 0.3927,
"step": 1120
},
{
"epoch": 0.5236329935125116,
"grad_norm": 0.33978259563446045,
"learning_rate": 0.00022152455977757183,
"loss": 0.4763,
"step": 1130
},
{
"epoch": 0.5282669138090825,
"grad_norm": 0.33843424916267395,
"learning_rate": 0.00022082947173308616,
"loss": 0.3855,
"step": 1140
},
{
"epoch": 0.5329008341056534,
"grad_norm": 0.4196402430534363,
"learning_rate": 0.00022013438368860053,
"loss": 0.4703,
"step": 1150
},
{
"epoch": 0.5375347544022243,
"grad_norm": 0.39824753999710083,
"learning_rate": 0.0002194392956441149,
"loss": 0.4653,
"step": 1160
},
{
"epoch": 0.5421686746987951,
"grad_norm": 0.3243648409843445,
"learning_rate": 0.00021874420759962924,
"loss": 0.4504,
"step": 1170
},
{
"epoch": 0.5468025949953661,
"grad_norm": 0.3930327892303467,
"learning_rate": 0.00021804911955514362,
"loss": 0.5203,
"step": 1180
},
{
"epoch": 0.5514365152919369,
"grad_norm": 0.406800776720047,
"learning_rate": 0.000217354031510658,
"loss": 0.4388,
"step": 1190
},
{
"epoch": 0.5560704355885079,
"grad_norm": 0.32722190022468567,
"learning_rate": 0.00021665894346617237,
"loss": 0.4484,
"step": 1200
},
{
"epoch": 0.5607043558850788,
"grad_norm": 0.40086886286735535,
"learning_rate": 0.00021596385542168672,
"loss": 0.3964,
"step": 1210
},
{
"epoch": 0.5653382761816497,
"grad_norm": 0.31294673681259155,
"learning_rate": 0.0002152687673772011,
"loss": 0.4188,
"step": 1220
},
{
"epoch": 0.5699721964782206,
"grad_norm": 0.33032116293907166,
"learning_rate": 0.00021457367933271545,
"loss": 0.4595,
"step": 1230
},
{
"epoch": 0.5746061167747915,
"grad_norm": 0.3128484785556793,
"learning_rate": 0.00021387859128822983,
"loss": 0.4248,
"step": 1240
},
{
"epoch": 0.5792400370713624,
"grad_norm": 0.41308367252349854,
"learning_rate": 0.00021318350324374418,
"loss": 0.4842,
"step": 1250
},
{
"epoch": 0.5838739573679332,
"grad_norm": 0.32540541887283325,
"learning_rate": 0.00021248841519925856,
"loss": 0.392,
"step": 1260
},
{
"epoch": 0.5885078776645042,
"grad_norm": 0.37712159752845764,
"learning_rate": 0.00021179332715477294,
"loss": 0.4385,
"step": 1270
},
{
"epoch": 0.593141797961075,
"grad_norm": 0.39498409628868103,
"learning_rate": 0.00021109823911028727,
"loss": 0.4388,
"step": 1280
},
{
"epoch": 0.597775718257646,
"grad_norm": 0.3992222547531128,
"learning_rate": 0.00021040315106580164,
"loss": 0.504,
"step": 1290
},
{
"epoch": 0.6024096385542169,
"grad_norm": 0.36806175112724304,
"learning_rate": 0.00020970806302131602,
"loss": 0.4143,
"step": 1300
},
{
"epoch": 0.6070435588507878,
"grad_norm": 0.48738566040992737,
"learning_rate": 0.0002090129749768304,
"loss": 0.4917,
"step": 1310
},
{
"epoch": 0.6116774791473587,
"grad_norm": 0.3689660429954529,
"learning_rate": 0.00020831788693234472,
"loss": 0.4205,
"step": 1320
},
{
"epoch": 0.6163113994439295,
"grad_norm": 0.36636972427368164,
"learning_rate": 0.0002076227988878591,
"loss": 0.3859,
"step": 1330
},
{
"epoch": 0.6209453197405005,
"grad_norm": 0.355956494808197,
"learning_rate": 0.00020692771084337348,
"loss": 0.4223,
"step": 1340
},
{
"epoch": 0.6255792400370713,
"grad_norm": 0.355040967464447,
"learning_rate": 0.00020623262279888783,
"loss": 0.414,
"step": 1350
},
{
"epoch": 0.6302131603336423,
"grad_norm": 0.36592569947242737,
"learning_rate": 0.0002055375347544022,
"loss": 0.406,
"step": 1360
},
{
"epoch": 0.6348470806302131,
"grad_norm": 0.43309953808784485,
"learning_rate": 0.00020484244670991656,
"loss": 0.4529,
"step": 1370
},
{
"epoch": 0.6394810009267841,
"grad_norm": 0.30984750390052795,
"learning_rate": 0.00020414735866543094,
"loss": 0.3873,
"step": 1380
},
{
"epoch": 0.644114921223355,
"grad_norm": 0.33852508664131165,
"learning_rate": 0.0002034522706209453,
"loss": 0.4137,
"step": 1390
},
{
"epoch": 0.6487488415199258,
"grad_norm": 0.4319971799850464,
"learning_rate": 0.00020275718257645967,
"loss": 0.445,
"step": 1400
},
{
"epoch": 0.6533827618164968,
"grad_norm": 0.3274412453174591,
"learning_rate": 0.00020206209453197405,
"loss": 0.3896,
"step": 1410
},
{
"epoch": 0.6580166821130676,
"grad_norm": 0.44836536049842834,
"learning_rate": 0.0002013670064874884,
"loss": 0.4647,
"step": 1420
},
{
"epoch": 0.6626506024096386,
"grad_norm": 0.45571744441986084,
"learning_rate": 0.00020067191844300275,
"loss": 0.4575,
"step": 1430
},
{
"epoch": 0.6672845227062094,
"grad_norm": 0.38867196440696716,
"learning_rate": 0.00019997683039851713,
"loss": 0.4505,
"step": 1440
},
{
"epoch": 0.6719184430027804,
"grad_norm": 0.46751868724823,
"learning_rate": 0.0001992817423540315,
"loss": 0.4094,
"step": 1450
},
{
"epoch": 0.6765523632993512,
"grad_norm": 0.3465523421764374,
"learning_rate": 0.00019858665430954583,
"loss": 0.357,
"step": 1460
},
{
"epoch": 0.6811862835959221,
"grad_norm": 0.35008054971694946,
"learning_rate": 0.0001978915662650602,
"loss": 0.4235,
"step": 1470
},
{
"epoch": 0.6858202038924931,
"grad_norm": 0.44757360219955444,
"learning_rate": 0.0001971964782205746,
"loss": 0.4299,
"step": 1480
},
{
"epoch": 0.6904541241890639,
"grad_norm": 0.37812456488609314,
"learning_rate": 0.00019650139017608897,
"loss": 0.4861,
"step": 1490
},
{
"epoch": 0.6950880444856349,
"grad_norm": 0.4172975420951843,
"learning_rate": 0.00019580630213160332,
"loss": 0.411,
"step": 1500
},
{
"epoch": 0.6997219647822057,
"grad_norm": 0.3557645082473755,
"learning_rate": 0.00019511121408711767,
"loss": 0.413,
"step": 1510
},
{
"epoch": 0.7043558850787767,
"grad_norm": 0.475146621465683,
"learning_rate": 0.00019441612604263205,
"loss": 0.404,
"step": 1520
},
{
"epoch": 0.7089898053753475,
"grad_norm": 0.34932583570480347,
"learning_rate": 0.0001937210379981464,
"loss": 0.4339,
"step": 1530
},
{
"epoch": 0.7136237256719185,
"grad_norm": 0.3841742277145386,
"learning_rate": 0.00019302594995366078,
"loss": 0.443,
"step": 1540
},
{
"epoch": 0.7182576459684893,
"grad_norm": 0.40557074546813965,
"learning_rate": 0.00019233086190917516,
"loss": 0.4584,
"step": 1550
},
{
"epoch": 0.7228915662650602,
"grad_norm": 0.46951010823249817,
"learning_rate": 0.0001916357738646895,
"loss": 0.4237,
"step": 1560
},
{
"epoch": 0.7275254865616312,
"grad_norm": 0.38474270701408386,
"learning_rate": 0.00019094068582020386,
"loss": 0.4529,
"step": 1570
},
{
"epoch": 0.732159406858202,
"grad_norm": 0.3190203905105591,
"learning_rate": 0.00019024559777571824,
"loss": 0.4161,
"step": 1580
},
{
"epoch": 0.736793327154773,
"grad_norm": 0.3800548017024994,
"learning_rate": 0.00018955050973123262,
"loss": 0.3886,
"step": 1590
},
{
"epoch": 0.7414272474513438,
"grad_norm": 0.3625418543815613,
"learning_rate": 0.00018885542168674694,
"loss": 0.3909,
"step": 1600
},
{
"epoch": 0.7460611677479148,
"grad_norm": 0.5026484131813049,
"learning_rate": 0.00018816033364226132,
"loss": 0.5229,
"step": 1610
},
{
"epoch": 0.7506950880444856,
"grad_norm": 0.3223924934864044,
"learning_rate": 0.0001874652455977757,
"loss": 0.4273,
"step": 1620
},
{
"epoch": 0.7553290083410565,
"grad_norm": 0.27654021978378296,
"learning_rate": 0.00018677015755329008,
"loss": 0.3905,
"step": 1630
},
{
"epoch": 0.7599629286376274,
"grad_norm": 0.42651575803756714,
"learning_rate": 0.00018607506950880443,
"loss": 0.4427,
"step": 1640
},
{
"epoch": 0.7645968489341983,
"grad_norm": 0.4370901584625244,
"learning_rate": 0.00018537998146431878,
"loss": 0.4688,
"step": 1650
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.37819185853004456,
"learning_rate": 0.00018468489341983316,
"loss": 0.4231,
"step": 1660
},
{
"epoch": 0.7738646895273401,
"grad_norm": 0.3696691691875458,
"learning_rate": 0.00018398980537534754,
"loss": 0.4458,
"step": 1670
},
{
"epoch": 0.7784986098239111,
"grad_norm": 0.34384486079216003,
"learning_rate": 0.0001832947173308619,
"loss": 0.462,
"step": 1680
},
{
"epoch": 0.7831325301204819,
"grad_norm": 0.39266905188560486,
"learning_rate": 0.00018259962928637627,
"loss": 0.4118,
"step": 1690
},
{
"epoch": 0.7877664504170528,
"grad_norm": 0.43815886974334717,
"learning_rate": 0.00018190454124189065,
"loss": 0.3836,
"step": 1700
},
{
"epoch": 0.7924003707136237,
"grad_norm": 0.4963682293891907,
"learning_rate": 0.00018120945319740497,
"loss": 0.3929,
"step": 1710
},
{
"epoch": 0.7970342910101946,
"grad_norm": 0.4125339388847351,
"learning_rate": 0.00018051436515291935,
"loss": 0.4214,
"step": 1720
},
{
"epoch": 0.8016682113067656,
"grad_norm": 0.5118056535720825,
"learning_rate": 0.00017981927710843373,
"loss": 0.4582,
"step": 1730
},
{
"epoch": 0.8063021316033364,
"grad_norm": 0.44652143120765686,
"learning_rate": 0.0001791241890639481,
"loss": 0.4557,
"step": 1740
},
{
"epoch": 0.8109360518999074,
"grad_norm": 0.39972642064094543,
"learning_rate": 0.00017842910101946243,
"loss": 0.454,
"step": 1750
},
{
"epoch": 0.8155699721964782,
"grad_norm": 0.5146743059158325,
"learning_rate": 0.0001777340129749768,
"loss": 0.4064,
"step": 1760
},
{
"epoch": 0.8202038924930491,
"grad_norm": 0.3463651239871979,
"learning_rate": 0.0001770389249304912,
"loss": 0.3826,
"step": 1770
},
{
"epoch": 0.82483781278962,
"grad_norm": 0.3422740399837494,
"learning_rate": 0.00017634383688600554,
"loss": 0.3838,
"step": 1780
},
{
"epoch": 0.8294717330861909,
"grad_norm": 0.3831815719604492,
"learning_rate": 0.00017564874884151992,
"loss": 0.4442,
"step": 1790
},
{
"epoch": 0.8341056533827618,
"grad_norm": 0.3674461245536804,
"learning_rate": 0.00017495366079703427,
"loss": 0.3617,
"step": 1800
},
{
"epoch": 0.8387395736793327,
"grad_norm": 0.3935919404029846,
"learning_rate": 0.00017425857275254865,
"loss": 0.4578,
"step": 1810
},
{
"epoch": 0.8433734939759037,
"grad_norm": 0.4333125650882721,
"learning_rate": 0.000173563484708063,
"loss": 0.4431,
"step": 1820
},
{
"epoch": 0.8480074142724745,
"grad_norm": 0.42463764548301697,
"learning_rate": 0.00017286839666357738,
"loss": 0.4031,
"step": 1830
},
{
"epoch": 0.8526413345690455,
"grad_norm": 0.35087305307388306,
"learning_rate": 0.00017217330861909176,
"loss": 0.4582,
"step": 1840
},
{
"epoch": 0.8572752548656163,
"grad_norm": 0.33768782019615173,
"learning_rate": 0.0001714782205746061,
"loss": 0.3978,
"step": 1850
},
{
"epoch": 0.8619091751621872,
"grad_norm": 0.3734382390975952,
"learning_rate": 0.00017078313253012046,
"loss": 0.4296,
"step": 1860
},
{
"epoch": 0.8665430954587581,
"grad_norm": 0.3718160092830658,
"learning_rate": 0.00017008804448563484,
"loss": 0.417,
"step": 1870
},
{
"epoch": 0.871177015755329,
"grad_norm": 0.37972408533096313,
"learning_rate": 0.00016939295644114921,
"loss": 0.411,
"step": 1880
},
{
"epoch": 0.8758109360518999,
"grad_norm": 0.43368878960609436,
"learning_rate": 0.00016869786839666354,
"loss": 0.506,
"step": 1890
},
{
"epoch": 0.8804448563484708,
"grad_norm": 0.41099444031715393,
"learning_rate": 0.00016800278035217792,
"loss": 0.3928,
"step": 1900
},
{
"epoch": 0.8850787766450418,
"grad_norm": 0.3973136842250824,
"learning_rate": 0.0001673076923076923,
"loss": 0.4442,
"step": 1910
},
{
"epoch": 0.8897126969416126,
"grad_norm": 0.30687418580055237,
"learning_rate": 0.00016661260426320667,
"loss": 0.4209,
"step": 1920
},
{
"epoch": 0.8943466172381835,
"grad_norm": 0.3588371276855469,
"learning_rate": 0.00016591751621872103,
"loss": 0.3932,
"step": 1930
},
{
"epoch": 0.8989805375347544,
"grad_norm": 0.4107860028743744,
"learning_rate": 0.00016522242817423538,
"loss": 0.3747,
"step": 1940
},
{
"epoch": 0.9036144578313253,
"grad_norm": 0.3935255706310272,
"learning_rate": 0.00016452734012974976,
"loss": 0.401,
"step": 1950
},
{
"epoch": 0.9082483781278962,
"grad_norm": 0.37768107652664185,
"learning_rate": 0.0001638322520852641,
"loss": 0.4125,
"step": 1960
},
{
"epoch": 0.9128822984244671,
"grad_norm": 0.34660592675209045,
"learning_rate": 0.00016313716404077849,
"loss": 0.4238,
"step": 1970
},
{
"epoch": 0.917516218721038,
"grad_norm": 0.325065940618515,
"learning_rate": 0.00016244207599629286,
"loss": 0.4469,
"step": 1980
},
{
"epoch": 0.9221501390176089,
"grad_norm": 0.5264182686805725,
"learning_rate": 0.00016174698795180722,
"loss": 0.4067,
"step": 1990
},
{
"epoch": 0.9267840593141798,
"grad_norm": 0.39541614055633545,
"learning_rate": 0.00016105189990732157,
"loss": 0.4048,
"step": 2000
},
{
"epoch": 0.9314179796107507,
"grad_norm": 0.5899196863174438,
"learning_rate": 0.00016035681186283595,
"loss": 0.4598,
"step": 2010
},
{
"epoch": 0.9360518999073216,
"grad_norm": 0.4272339642047882,
"learning_rate": 0.00015966172381835032,
"loss": 0.5,
"step": 2020
},
{
"epoch": 0.9406858202038925,
"grad_norm": 0.5000485777854919,
"learning_rate": 0.00015896663577386465,
"loss": 0.4154,
"step": 2030
},
{
"epoch": 0.9453197405004634,
"grad_norm": 0.2850925922393799,
"learning_rate": 0.00015827154772937903,
"loss": 0.3751,
"step": 2040
},
{
"epoch": 0.9499536607970342,
"grad_norm": 0.37266668677330017,
"learning_rate": 0.0001575764596848934,
"loss": 0.462,
"step": 2050
},
{
"epoch": 0.9545875810936052,
"grad_norm": 0.35389843583106995,
"learning_rate": 0.00015688137164040778,
"loss": 0.4325,
"step": 2060
},
{
"epoch": 0.959221501390176,
"grad_norm": 0.4005086123943329,
"learning_rate": 0.00015618628359592213,
"loss": 0.4063,
"step": 2070
},
{
"epoch": 0.963855421686747,
"grad_norm": 0.3318014144897461,
"learning_rate": 0.00015549119555143649,
"loss": 0.4005,
"step": 2080
},
{
"epoch": 0.9684893419833179,
"grad_norm": 0.47355303168296814,
"learning_rate": 0.00015479610750695086,
"loss": 0.5058,
"step": 2090
},
{
"epoch": 0.9731232622798888,
"grad_norm": 0.3099556565284729,
"learning_rate": 0.00015410101946246524,
"loss": 0.358,
"step": 2100
},
{
"epoch": 0.9777571825764597,
"grad_norm": 0.5098932385444641,
"learning_rate": 0.0001534059314179796,
"loss": 0.4138,
"step": 2110
},
{
"epoch": 0.9823911028730306,
"grad_norm": 0.43745410442352295,
"learning_rate": 0.00015271084337349397,
"loss": 0.4537,
"step": 2120
},
{
"epoch": 0.9870250231696015,
"grad_norm": 0.4691702127456665,
"learning_rate": 0.00015201575532900832,
"loss": 0.4064,
"step": 2130
},
{
"epoch": 0.9916589434661723,
"grad_norm": 0.43095409870147705,
"learning_rate": 0.00015132066728452268,
"loss": 0.4345,
"step": 2140
},
{
"epoch": 0.9962928637627433,
"grad_norm": 0.3165135681629181,
"learning_rate": 0.00015062557924003705,
"loss": 0.3751,
"step": 2150
},
{
"epoch": 1.0009267840593141,
"grad_norm": 0.38903099298477173,
"learning_rate": 0.00014993049119555143,
"loss": 0.4483,
"step": 2160
},
{
"epoch": 1.0055607043558852,
"grad_norm": 0.3540801703929901,
"learning_rate": 0.00014923540315106578,
"loss": 0.3847,
"step": 2170
},
{
"epoch": 1.010194624652456,
"grad_norm": 0.4951782524585724,
"learning_rate": 0.00014854031510658016,
"loss": 0.3882,
"step": 2180
},
{
"epoch": 1.0148285449490269,
"grad_norm": 0.4157162010669708,
"learning_rate": 0.00014784522706209451,
"loss": 0.3674,
"step": 2190
},
{
"epoch": 1.0194624652455977,
"grad_norm": 0.3631850481033325,
"learning_rate": 0.0001471501390176089,
"loss": 0.341,
"step": 2200
},
{
"epoch": 1.0240963855421688,
"grad_norm": 0.35982194542884827,
"learning_rate": 0.00014645505097312324,
"loss": 0.381,
"step": 2210
},
{
"epoch": 1.0287303058387396,
"grad_norm": 0.3738069236278534,
"learning_rate": 0.0001457599629286376,
"loss": 0.395,
"step": 2220
},
{
"epoch": 1.0333642261353104,
"grad_norm": 0.4233757257461548,
"learning_rate": 0.00014506487488415197,
"loss": 0.4212,
"step": 2230
},
{
"epoch": 1.0379981464318813,
"grad_norm": 0.4391021430492401,
"learning_rate": 0.00014436978683966635,
"loss": 0.4172,
"step": 2240
},
{
"epoch": 1.0426320667284523,
"grad_norm": 0.49776721000671387,
"learning_rate": 0.0001436746987951807,
"loss": 0.384,
"step": 2250
},
{
"epoch": 1.0472659870250232,
"grad_norm": 0.4383520483970642,
"learning_rate": 0.00014297961075069508,
"loss": 0.3707,
"step": 2260
},
{
"epoch": 1.051899907321594,
"grad_norm": 0.47047173976898193,
"learning_rate": 0.00014228452270620946,
"loss": 0.4059,
"step": 2270
},
{
"epoch": 1.056533827618165,
"grad_norm": 0.4196452498435974,
"learning_rate": 0.0001415894346617238,
"loss": 0.441,
"step": 2280
},
{
"epoch": 1.061167747914736,
"grad_norm": 0.4553895592689514,
"learning_rate": 0.00014089434661723816,
"loss": 0.4195,
"step": 2290
},
{
"epoch": 1.0658016682113067,
"grad_norm": 0.5043196082115173,
"learning_rate": 0.00014019925857275254,
"loss": 0.4113,
"step": 2300
},
{
"epoch": 1.0704355885078776,
"grad_norm": 0.4687553942203522,
"learning_rate": 0.0001395041705282669,
"loss": 0.4284,
"step": 2310
},
{
"epoch": 1.0750695088044486,
"grad_norm": 0.42681458592414856,
"learning_rate": 0.00013880908248378127,
"loss": 0.3993,
"step": 2320
},
{
"epoch": 1.0797034291010195,
"grad_norm": 0.38211435079574585,
"learning_rate": 0.00013811399443929562,
"loss": 0.4264,
"step": 2330
},
{
"epoch": 1.0843373493975903,
"grad_norm": 0.4625399112701416,
"learning_rate": 0.00013741890639481,
"loss": 0.4464,
"step": 2340
},
{
"epoch": 1.0889712696941614,
"grad_norm": 0.4715350568294525,
"learning_rate": 0.00013672381835032435,
"loss": 0.3644,
"step": 2350
},
{
"epoch": 1.0936051899907322,
"grad_norm": 0.5162579417228699,
"learning_rate": 0.00013602873030583873,
"loss": 0.4064,
"step": 2360
},
{
"epoch": 1.098239110287303,
"grad_norm": 0.361208438873291,
"learning_rate": 0.00013533364226135308,
"loss": 0.3765,
"step": 2370
},
{
"epoch": 1.1028730305838739,
"grad_norm": 0.45088571310043335,
"learning_rate": 0.00013463855421686746,
"loss": 0.3975,
"step": 2380
},
{
"epoch": 1.107506950880445,
"grad_norm": 0.4460572302341461,
"learning_rate": 0.0001339434661723818,
"loss": 0.4443,
"step": 2390
},
{
"epoch": 1.1121408711770158,
"grad_norm": 0.3629909157752991,
"learning_rate": 0.0001332483781278962,
"loss": 0.401,
"step": 2400
},
{
"epoch": 1.1167747914735866,
"grad_norm": 0.37849023938179016,
"learning_rate": 0.00013255329008341057,
"loss": 0.3476,
"step": 2410
},
{
"epoch": 1.1214087117701577,
"grad_norm": 0.423921674489975,
"learning_rate": 0.00013185820203892492,
"loss": 0.402,
"step": 2420
},
{
"epoch": 1.1260426320667285,
"grad_norm": 0.33255496621131897,
"learning_rate": 0.0001311631139944393,
"loss": 0.3431,
"step": 2430
},
{
"epoch": 1.1306765523632993,
"grad_norm": 0.34243419766426086,
"learning_rate": 0.00013046802594995365,
"loss": 0.4254,
"step": 2440
},
{
"epoch": 1.1353104726598702,
"grad_norm": 0.32260575890541077,
"learning_rate": 0.00012977293790546803,
"loss": 0.3846,
"step": 2450
},
{
"epoch": 1.1399443929564412,
"grad_norm": 0.4910984933376312,
"learning_rate": 0.00012907784986098238,
"loss": 0.3692,
"step": 2460
},
{
"epoch": 1.144578313253012,
"grad_norm": 0.3949461877346039,
"learning_rate": 0.00012838276181649673,
"loss": 0.4011,
"step": 2470
},
{
"epoch": 1.149212233549583,
"grad_norm": 0.5482661128044128,
"learning_rate": 0.0001276876737720111,
"loss": 0.479,
"step": 2480
},
{
"epoch": 1.1538461538461537,
"grad_norm": 0.3622649013996124,
"learning_rate": 0.00012699258572752546,
"loss": 0.3742,
"step": 2490
},
{
"epoch": 1.1584800741427248,
"grad_norm": 0.45524731278419495,
"learning_rate": 0.00012629749768303984,
"loss": 0.4062,
"step": 2500
}
],
"logging_steps": 10,
"max_steps": 4316,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.799325381832704e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}