{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 100.0,
"global_step": 1100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0018203883495145632,
"grad_norm": 3.6015546321868896,
"learning_rate": 1.5151515151515152e-07,
"loss": 0.5787061452865601,
"memory(GiB)": 40.37,
"step": 1,
"token_acc": 0.8690476190476191,
"train_speed(iter/s)": 0.029162
},
{
"epoch": 0.009101941747572815,
"grad_norm": 3.435741424560547,
"learning_rate": 7.575757575757576e-07,
"loss": 0.5881168842315674,
"memory(GiB)": 40.37,
"step": 5,
"token_acc": 0.8701684836471755,
"train_speed(iter/s)": 0.06174
},
{
"epoch": 0.01820388349514563,
"grad_norm": 3.448568105697632,
"learning_rate": 1.5151515151515152e-06,
"loss": 0.5851926326751709,
"memory(GiB)": 40.39,
"step": 10,
"token_acc": 0.8622327790973872,
"train_speed(iter/s)": 0.067399
},
{
"epoch": 0.027305825242718445,
"grad_norm": 3.405535936355591,
"learning_rate": 2.2727272727272728e-06,
"loss": 0.6001698970794678,
"memory(GiB)": 40.39,
"step": 15,
"token_acc": 0.8716323296354992,
"train_speed(iter/s)": 0.068625
},
{
"epoch": 0.03640776699029126,
"grad_norm": 3.6892027854919434,
"learning_rate": 3.0303030303030305e-06,
"loss": 0.5676679611206055,
"memory(GiB)": 40.39,
"step": 20,
"token_acc": 0.8715305313243458,
"train_speed(iter/s)": 0.069219
},
{
"epoch": 0.04550970873786408,
"grad_norm": 3.9115183353424072,
"learning_rate": 3.7878787878787882e-06,
"loss": 0.5411659717559815,
"memory(GiB)": 40.39,
"step": 25,
"token_acc": 0.8685669041963578,
"train_speed(iter/s)": 0.071459
},
{
"epoch": 0.05461165048543689,
"grad_norm": 3.357640027999878,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.4885613441467285,
"memory(GiB)": 40.39,
"step": 30,
"token_acc": 0.8682539682539683,
"train_speed(iter/s)": 0.072567
},
{
"epoch": 0.06371359223300971,
"grad_norm": 3.3015964031219482,
"learning_rate": 4.999956654935265e-06,
"loss": 0.4215705871582031,
"memory(GiB)": 40.39,
"step": 35,
"token_acc": 0.8692551505546752,
"train_speed(iter/s)": 0.07282
},
{
"epoch": 0.07281553398058252,
"grad_norm": 2.447498321533203,
"learning_rate": 4.999469040218251e-06,
"loss": 0.2957149982452393,
"memory(GiB)": 40.39,
"step": 40,
"token_acc": 0.8690476190476191,
"train_speed(iter/s)": 0.074291
},
{
"epoch": 0.08191747572815535,
"grad_norm": 0.8801060914993286,
"learning_rate": 4.9984397354824345e-06,
"loss": 0.21340658664703369,
"memory(GiB)": 40.39,
"step": 45,
"token_acc": 0.9135606661379857,
"train_speed(iter/s)": 0.074508
},
{
"epoch": 0.09101941747572816,
"grad_norm": 0.7226303815841675,
"learning_rate": 4.996868963800831e-06,
"loss": 0.1777859926223755,
"memory(GiB)": 40.39,
"step": 50,
"token_acc": 0.9239904988123515,
"train_speed(iter/s)": 0.075304
},
{
"epoch": 0.10012135922330097,
"grad_norm": 0.7329442501068115,
"learning_rate": 4.99475706559428e-06,
"loss": 0.17081427574157715,
"memory(GiB)": 40.39,
"step": 55,
"token_acc": 0.9238699444885012,
"train_speed(iter/s)": 0.075664
},
{
"epoch": 0.10922330097087378,
"grad_norm": 0.48636820912361145,
"learning_rate": 4.992104498557657e-06,
"loss": 0.15634163618087768,
"memory(GiB)": 40.39,
"step": 60,
"token_acc": 0.9262490087232356,
"train_speed(iter/s)": 0.076641
},
{
"epoch": 0.1183252427184466,
"grad_norm": 0.44267499446868896,
"learning_rate": 4.988911837560691e-06,
"loss": 0.1444383144378662,
"memory(GiB)": 40.39,
"step": 65,
"token_acc": 0.9350237717908082,
"train_speed(iter/s)": 0.074923
},
{
"epoch": 0.12742718446601942,
"grad_norm": 0.4311356544494629,
"learning_rate": 4.985179774523375e-06,
"loss": 0.14677078723907472,
"memory(GiB)": 40.39,
"step": 70,
"token_acc": 0.9444444444444444,
"train_speed(iter/s)": 0.075087
},
{
"epoch": 0.13652912621359223,
"grad_norm": 0.3981742858886719,
"learning_rate": 4.980909118266006e-06,
"loss": 0.13511970043182372,
"memory(GiB)": 40.39,
"step": 75,
"token_acc": 0.9484944532488114,
"train_speed(iter/s)": 0.074414
},
{
"epoch": 0.14563106796116504,
"grad_norm": 0.4317033290863037,
"learning_rate": 4.976100794333903e-06,
"loss": 0.12185637950897217,
"memory(GiB)": 40.39,
"step": 80,
"token_acc": 0.9627279936558287,
"train_speed(iter/s)": 0.074464
},
{
"epoch": 0.15473300970873785,
"grad_norm": 0.3179706335067749,
"learning_rate": 4.970755844796817e-06,
"loss": 0.12840776443481444,
"memory(GiB)": 40.39,
"step": 85,
"token_acc": 0.9492063492063492,
"train_speed(iter/s)": 0.074996
},
{
"epoch": 0.1638349514563107,
"grad_norm": 0.3189823031425476,
"learning_rate": 4.964875428023093e-06,
"loss": 0.12376663684844971,
"memory(GiB)": 40.39,
"step": 90,
"token_acc": 0.957936507936508,
"train_speed(iter/s)": 0.075144
},
{
"epoch": 0.1729368932038835,
"grad_norm": 0.33377909660339355,
"learning_rate": 4.958460818428627e-06,
"loss": 0.11574116945266724,
"memory(GiB)": 40.39,
"step": 95,
"token_acc": 0.9563492063492064,
"train_speed(iter/s)": 0.075617
},
{
"epoch": 0.1820388349514563,
"grad_norm": 0.4928111433982849,
"learning_rate": 4.951513406200667e-06,
"loss": 0.1149444341659546,
"memory(GiB)": 40.39,
"step": 100,
"token_acc": 0.9508716323296355,
"train_speed(iter/s)": 0.075828
},
{
"epoch": 0.19114077669902912,
"grad_norm": 0.3134707808494568,
"learning_rate": 4.944034696996534e-06,
"loss": 0.11119295358657837,
"memory(GiB)": 40.39,
"step": 105,
"token_acc": 0.9595238095238096,
"train_speed(iter/s)": 0.075066
},
{
"epoch": 0.20024271844660194,
"grad_norm": 0.2365858554840088,
"learning_rate": 4.936026311617316e-06,
"loss": 0.11442217826843262,
"memory(GiB)": 40.39,
"step": 110,
"token_acc": 0.9588281868566905,
"train_speed(iter/s)": 0.075061
},
{
"epoch": 0.20934466019417475,
"grad_norm": 0.3145173490047455,
"learning_rate": 4.927489985656591e-06,
"loss": 0.10322239398956298,
"memory(GiB)": 40.39,
"step": 115,
"token_acc": 0.9658730158730159,
"train_speed(iter/s)": 0.074479
},
{
"epoch": 0.21844660194174756,
"grad_norm": 0.33202633261680603,
"learning_rate": 4.918427569124302e-06,
"loss": 0.10661822557449341,
"memory(GiB)": 40.39,
"step": 120,
"token_acc": 0.9556259904912837,
"train_speed(iter/s)": 0.074637
},
{
"epoch": 0.2275485436893204,
"grad_norm": 0.3093946874141693,
"learning_rate": 4.908841026045809e-06,
"loss": 0.10065805912017822,
"memory(GiB)": 40.39,
"step": 125,
"token_acc": 0.9540412044374009,
"train_speed(iter/s)": 0.074905
},
{
"epoch": 0.2366504854368932,
"grad_norm": 0.39363232254981995,
"learning_rate": 4.8987324340362445e-06,
"loss": 0.114447021484375,
"memory(GiB)": 40.39,
"step": 130,
"token_acc": 0.9571428571428572,
"train_speed(iter/s)": 0.075072
},
{
"epoch": 0.24575242718446602,
"grad_norm": 0.37065446376800537,
"learning_rate": 4.888103983850245e-06,
"loss": 0.10610785484313964,
"memory(GiB)": 40.39,
"step": 135,
"token_acc": 0.9565217391304348,
"train_speed(iter/s)": 0.075167
},
{
"epoch": 0.25485436893203883,
"grad_norm": 0.542117714881897,
"learning_rate": 4.876957978907176e-06,
"loss": 0.0954114019870758,
"memory(GiB)": 40.39,
"step": 140,
"token_acc": 0.9666666666666667,
"train_speed(iter/s)": 0.075346
},
{
"epoch": 0.26395631067961167,
"grad_norm": 0.3225058913230896,
"learning_rate": 4.865296834791918e-06,
"loss": 0.0959049105644226,
"memory(GiB)": 40.39,
"step": 145,
"token_acc": 0.9587955625990491,
"train_speed(iter/s)": 0.075467
},
{
"epoch": 0.27305825242718446,
"grad_norm": 0.3421016037464142,
"learning_rate": 4.853123078731363e-06,
"loss": 0.09874246120452881,
"memory(GiB)": 40.39,
"step": 150,
"token_acc": 0.9650793650793651,
"train_speed(iter/s)": 0.075618
},
{
"epoch": 0.2821601941747573,
"grad_norm": 0.3102968633174896,
"learning_rate": 4.8404393490467085e-06,
"loss": 0.09461469650268554,
"memory(GiB)": 40.39,
"step": 155,
"token_acc": 0.9547977795400476,
"train_speed(iter/s)": 0.075855
},
{
"epoch": 0.2912621359223301,
"grad_norm": 0.4729763865470886,
"learning_rate": 4.827248394581672e-06,
"loss": 0.10038878917694091,
"memory(GiB)": 40.39,
"step": 160,
"token_acc": 0.9650793650793651,
"train_speed(iter/s)": 0.075945
},
{
"epoch": 0.3003640776699029,
"grad_norm": 0.3695836365222931,
"learning_rate": 4.813553074106761e-06,
"loss": 0.09139147400856018,
"memory(GiB)": 40.39,
"step": 165,
"token_acc": 0.9627279936558287,
"train_speed(iter/s)": 0.075756
},
{
"epoch": 0.3094660194174757,
"grad_norm": 0.47110962867736816,
"learning_rate": 4.799356355699708e-06,
"loss": 0.09496045112609863,
"memory(GiB)": 40.39,
"step": 170,
"token_acc": 0.9698412698412698,
"train_speed(iter/s)": 0.075898
},
{
"epoch": 0.31856796116504854,
"grad_norm": 0.3773088753223419,
"learning_rate": 4.784661316102229e-06,
"loss": 0.09658662080764771,
"memory(GiB)": 40.4,
"step": 175,
"token_acc": 0.96513470681458,
"train_speed(iter/s)": 0.075914
},
{
"epoch": 0.3276699029126214,
"grad_norm": 0.3394829034805298,
"learning_rate": 4.769471140053221e-06,
"loss": 0.08639374971389771,
"memory(GiB)": 40.4,
"step": 180,
"token_acc": 0.969047619047619,
"train_speed(iter/s)": 0.076076
},
{
"epoch": 0.33677184466019416,
"grad_norm": 0.4525506794452667,
"learning_rate": 4.753789119598563e-06,
"loss": 0.09742268323898315,
"memory(GiB)": 40.4,
"step": 185,
"token_acc": 0.9587301587301588,
"train_speed(iter/s)": 0.076177
},
{
"epoch": 0.345873786407767,
"grad_norm": 0.3789404332637787,
"learning_rate": 4.737618653377651e-06,
"loss": 0.09391134977340698,
"memory(GiB)": 40.4,
"step": 190,
"token_acc": 0.9651070578905631,
"train_speed(iter/s)": 0.07649
},
{
"epoch": 0.3549757281553398,
"grad_norm": 0.5464370250701904,
"learning_rate": 4.720963245886846e-06,
"loss": 0.0969527006149292,
"memory(GiB)": 40.4,
"step": 195,
"token_acc": 0.9659270998415214,
"train_speed(iter/s)": 0.076513
},
{
"epoch": 0.3640776699029126,
"grad_norm": 0.3459813892841339,
"learning_rate": 4.703826506719964e-06,
"loss": 0.08732333183288574,
"memory(GiB)": 40.4,
"step": 200,
"token_acc": 0.96513470681458,
"train_speed(iter/s)": 0.076587
},
{
"epoch": 0.3731796116504854,
"grad_norm": 0.3549191653728485,
"learning_rate": 4.686212149786007e-06,
"loss": 0.08515737056732178,
"memory(GiB)": 40.4,
"step": 205,
"token_acc": 0.96513470681458,
"train_speed(iter/s)": 0.076344
},
{
"epoch": 0.38228155339805825,
"grad_norm": 0.7434160709381104,
"learning_rate": 4.668123992504267e-06,
"loss": 0.09526927471160888,
"memory(GiB)": 40.4,
"step": 210,
"token_acc": 0.9666666666666667,
"train_speed(iter/s)": 0.076513
},
{
"epoch": 0.3913834951456311,
"grad_norm": 0.464631587266922,
"learning_rate": 4.649565954977015e-06,
"loss": 0.09264343380928039,
"memory(GiB)": 40.4,
"step": 215,
"token_acc": 0.9620253164556962,
"train_speed(iter/s)": 0.076143
},
{
"epoch": 0.40048543689320387,
"grad_norm": 0.5145648121833801,
"learning_rate": 4.630542059139923e-06,
"loss": 0.09688866138458252,
"memory(GiB)": 40.4,
"step": 220,
"token_acc": 0.9667458432304038,
"train_speed(iter/s)": 0.076292
},
{
"epoch": 0.4095873786407767,
"grad_norm": 0.33657485246658325,
"learning_rate": 4.611056427890428e-06,
"loss": 0.09414277076721192,
"memory(GiB)": 40.4,
"step": 225,
"token_acc": 0.9587301587301588,
"train_speed(iter/s)": 0.076275
},
{
"epoch": 0.4186893203883495,
"grad_norm": 0.47585147619247437,
"learning_rate": 4.5911132841942e-06,
"loss": 0.08656486272811889,
"memory(GiB)": 40.4,
"step": 230,
"token_acc": 0.9698651863600317,
"train_speed(iter/s)": 0.076342
},
{
"epoch": 0.42779126213592233,
"grad_norm": 0.3516729176044464,
"learning_rate": 4.570716950169944e-06,
"loss": 0.08657894730567932,
"memory(GiB)": 40.4,
"step": 235,
"token_acc": 0.9642857142857143,
"train_speed(iter/s)": 0.076493
},
{
"epoch": 0.4368932038834951,
"grad_norm": 0.48757559061050415,
"learning_rate": 4.5498718461526895e-06,
"loss": 0.09453780055046082,
"memory(GiB)": 40.4,
"step": 240,
"token_acc": 0.9643705463182898,
"train_speed(iter/s)": 0.07656
},
{
"epoch": 0.44599514563106796,
"grad_norm": 0.5283713936805725,
"learning_rate": 4.528582489735818e-06,
"loss": 0.08740494847297668,
"memory(GiB)": 40.4,
"step": 245,
"token_acc": 0.9587628865979382,
"train_speed(iter/s)": 0.07663
},
{
"epoch": 0.4550970873786408,
"grad_norm": 0.3577844500541687,
"learning_rate": 4.506853494791992e-06,
"loss": 0.08014656901359558,
"memory(GiB)": 40.4,
"step": 250,
"token_acc": 0.971473851030111,
"train_speed(iter/s)": 0.076543
},
{
"epoch": 0.4641990291262136,
"grad_norm": 0.5026013851165771,
"learning_rate": 4.484689570473232e-06,
"loss": 0.08635783195495605,
"memory(GiB)": 40.4,
"step": 255,
"token_acc": 0.9682791435368755,
"train_speed(iter/s)": 0.076578
},
{
"epoch": 0.4733009708737864,
"grad_norm": 0.45232078433036804,
"learning_rate": 4.462095520190336e-06,
"loss": 0.08593440055847168,
"memory(GiB)": 40.4,
"step": 260,
"token_acc": 0.9699367088607594,
"train_speed(iter/s)": 0.076538
},
{
"epoch": 0.4824029126213592,
"grad_norm": 0.47390663623809814,
"learning_rate": 4.43907624057188e-06,
"loss": 0.08747667074203491,
"memory(GiB)": 40.4,
"step": 265,
"token_acc": 0.9619047619047619,
"train_speed(iter/s)": 0.076588
},
{
"epoch": 0.49150485436893204,
"grad_norm": 0.43587085604667664,
"learning_rate": 4.415636720403005e-06,
"loss": 0.08902972340583801,
"memory(GiB)": 40.4,
"step": 270,
"token_acc": 0.9619349722442506,
"train_speed(iter/s)": 0.076484
},
{
"epoch": 0.5006067961165048,
"grad_norm": 0.41671204566955566,
"learning_rate": 4.391782039544239e-06,
"loss": 0.08426393270492553,
"memory(GiB)": 40.4,
"step": 275,
"token_acc": 0.9603489294210944,
"train_speed(iter/s)": 0.076586
},
{
"epoch": 0.5097087378640777,
"grad_norm": 0.3852890133857727,
"learning_rate": 4.367517367830581e-06,
"loss": 0.08224607706069946,
"memory(GiB)": 40.4,
"step": 280,
"token_acc": 0.9730372720063442,
"train_speed(iter/s)": 0.0767
},
{
"epoch": 0.5188106796116505,
"grad_norm": 0.5980095863342285,
"learning_rate": 4.342847963951085e-06,
"loss": 0.09114923477172851,
"memory(GiB)": 40.4,
"step": 285,
"token_acc": 0.9642857142857143,
"train_speed(iter/s)": 0.076804
},
{
"epoch": 0.5279126213592233,
"grad_norm": 0.5370866656303406,
"learning_rate": 4.317779174309179e-06,
"loss": 0.09176770448684693,
"memory(GiB)": 40.4,
"step": 290,
"token_acc": 0.9595879556259905,
"train_speed(iter/s)": 0.076902
},
{
"epoch": 0.5370145631067961,
"grad_norm": 0.5857056975364685,
"learning_rate": 4.292316431863991e-06,
"loss": 0.08232347965240479,
"memory(GiB)": 40.4,
"step": 295,
"token_acc": 0.9635210150674068,
"train_speed(iter/s)": 0.076861
},
{
"epoch": 0.5461165048543689,
"grad_norm": 0.45398032665252686,
"learning_rate": 4.2664652549528995e-06,
"loss": 0.0860186755657196,
"memory(GiB)": 40.4,
"step": 300,
"token_acc": 0.9603174603174603,
"train_speed(iter/s)": 0.076918
},
{
"epoch": 0.5552184466019418,
"grad_norm": 0.4008013904094696,
"learning_rate": 4.240231246095593e-06,
"loss": 0.08663930892944335,
"memory(GiB)": 40.4,
"step": 305,
"token_acc": 0.9698651863600317,
"train_speed(iter/s)": 0.076723
},
{
"epoch": 0.5643203883495146,
"grad_norm": 0.6199547052383423,
"learning_rate": 4.213620090779877e-06,
"loss": 0.08223216533660889,
"memory(GiB)": 40.4,
"step": 310,
"token_acc": 0.9674861221252974,
"train_speed(iter/s)": 0.076805
},
{
"epoch": 0.5734223300970874,
"grad_norm": 0.37448298931121826,
"learning_rate": 4.186637556229508e-06,
"loss": 0.08296606540679932,
"memory(GiB)": 40.4,
"step": 315,
"token_acc": 0.9666931007137193,
"train_speed(iter/s)": 0.076708
},
{
"epoch": 0.5825242718446602,
"grad_norm": 0.4003507196903229,
"learning_rate": 4.159289490154305e-06,
"loss": 0.07931501269340516,
"memory(GiB)": 40.4,
"step": 320,
"token_acc": 0.9642857142857143,
"train_speed(iter/s)": 0.076845
},
{
"epoch": 0.591626213592233,
"grad_norm": 0.49439844489097595,
"learning_rate": 4.1315818194828196e-06,
"loss": 0.08067693710327148,
"memory(GiB)": 40.4,
"step": 325,
"token_acc": 0.9698412698412698,
"train_speed(iter/s)": 0.076875
},
{
"epoch": 0.6007281553398058,
"grad_norm": 0.584017813205719,
"learning_rate": 4.1035205490778505e-06,
"loss": 0.09277031421661378,
"memory(GiB)": 40.4,
"step": 330,
"token_acc": 0.9595879556259905,
"train_speed(iter/s)": 0.076692
},
{
"epoch": 0.6098300970873787,
"grad_norm": 0.47020280361175537,
"learning_rate": 4.075111760435045e-06,
"loss": 0.07749168276786804,
"memory(GiB)": 40.4,
"step": 335,
"token_acc": 0.96513470681458,
"train_speed(iter/s)": 0.076884
},
{
"epoch": 0.6189320388349514,
"grad_norm": 0.4876089096069336,
"learning_rate": 4.046361610364913e-06,
"loss": 0.07796428203582764,
"memory(GiB)": 40.4,
"step": 340,
"token_acc": 0.9691699604743083,
"train_speed(iter/s)": 0.076913
},
{
"epoch": 0.6280339805825242,
"grad_norm": 0.5511714220046997,
"learning_rate": 4.017276329658506e-06,
"loss": 0.08419817090034484,
"memory(GiB)": 40.4,
"step": 345,
"token_acc": 0.9707278481012658,
"train_speed(iter/s)": 0.07696
},
{
"epoch": 0.6371359223300971,
"grad_norm": 0.5659735798835754,
"learning_rate": 3.987862221737072e-06,
"loss": 0.0797402322292328,
"memory(GiB)": 40.4,
"step": 350,
"token_acc": 0.9659270998415214,
"train_speed(iter/s)": 0.076995
},
{
"epoch": 0.6462378640776699,
"grad_norm": 0.5157150030136108,
"learning_rate": 3.958125661285959e-06,
"loss": 0.0838176965713501,
"memory(GiB)": 40.4,
"step": 355,
"token_acc": 0.9690721649484536,
"train_speed(iter/s)": 0.076909
},
{
"epoch": 0.6553398058252428,
"grad_norm": 0.5069080591201782,
"learning_rate": 3.928073092873088e-06,
"loss": 0.07343612313270569,
"memory(GiB)": 40.4,
"step": 360,
"token_acc": 0.9746233148295004,
"train_speed(iter/s)": 0.076991
},
{
"epoch": 0.6644417475728155,
"grad_norm": 0.49923259019851685,
"learning_rate": 3.897711029552264e-06,
"loss": 0.07626074552536011,
"memory(GiB)": 40.4,
"step": 365,
"token_acc": 0.9683544303797469,
"train_speed(iter/s)": 0.076983
},
{
"epoch": 0.6735436893203883,
"grad_norm": 0.35883885622024536,
"learning_rate": 3.8670460514516615e-06,
"loss": 0.08405499458312989,
"memory(GiB)": 40.4,
"step": 370,
"token_acc": 0.9635499207606973,
"train_speed(iter/s)": 0.077013
},
{
"epoch": 0.6826456310679612,
"grad_norm": 0.4520786702632904,
"learning_rate": 3.836084804347763e-06,
"loss": 0.07998884916305542,
"memory(GiB)": 40.4,
"step": 375,
"token_acc": 0.9698412698412698,
"train_speed(iter/s)": 0.07694
},
{
"epoch": 0.691747572815534,
"grad_norm": 0.47654658555984497,
"learning_rate": 3.8048339982250705e-06,
"loss": 0.08119775056838989,
"memory(GiB)": 40.4,
"step": 380,
"token_acc": 0.9667194928684627,
"train_speed(iter/s)": 0.077002
},
{
"epoch": 0.7008495145631068,
"grad_norm": 0.5640057325363159,
"learning_rate": 3.773300405821908e-06,
"loss": 0.08841820359230042,
"memory(GiB)": 40.4,
"step": 385,
"token_acc": 0.9595559080095163,
"train_speed(iter/s)": 0.077061
},
{
"epoch": 0.7099514563106796,
"grad_norm": 0.42381900548934937,
"learning_rate": 3.7414908611626162e-06,
"loss": 0.08166542053222656,
"memory(GiB)": 40.4,
"step": 390,
"token_acc": 0.969047619047619,
"train_speed(iter/s)": 0.077092
},
{
"epoch": 0.7190533980582524,
"grad_norm": 0.510867714881897,
"learning_rate": 3.709412258076471e-06,
"loss": 0.08081957101821899,
"memory(GiB)": 40.4,
"step": 395,
"token_acc": 0.9699129057798892,
"train_speed(iter/s)": 0.077233
},
{
"epoch": 0.7281553398058253,
"grad_norm": 0.5211343169212341,
"learning_rate": 3.6770715487036413e-06,
"loss": 0.08312466740608215,
"memory(GiB)": 40.4,
"step": 400,
"token_acc": 0.9611419508326725,
"train_speed(iter/s)": 0.077264
},
{
"epoch": 0.7372572815533981,
"grad_norm": 0.46672672033309937,
"learning_rate": 3.644475741988499e-06,
"loss": 0.08163590431213379,
"memory(GiB)": 40.4,
"step": 405,
"token_acc": 0.9666666666666667,
"train_speed(iter/s)": 0.07706
},
{
"epoch": 0.7463592233009708,
"grad_norm": 0.4190872013568878,
"learning_rate": 3.6116319021606345e-06,
"loss": 0.08278034925460816,
"memory(GiB)": 40.4,
"step": 410,
"token_acc": 0.9603803486529319,
"train_speed(iter/s)": 0.077071
},
{
"epoch": 0.7554611650485437,
"grad_norm": 0.4177815318107605,
"learning_rate": 3.5785471472038784e-06,
"loss": 0.07709290385246277,
"memory(GiB)": 40.4,
"step": 415,
"token_acc": 0.9714512291831879,
"train_speed(iter/s)": 0.077076
},
{
"epoch": 0.7645631067961165,
"grad_norm": 0.7115554213523865,
"learning_rate": 3.545228647313679e-06,
"loss": 0.08126543164253235,
"memory(GiB)": 40.4,
"step": 420,
"token_acc": 0.9674861221252974,
"train_speed(iter/s)": 0.07706
},
{
"epoch": 0.7736650485436893,
"grad_norm": 0.43985486030578613,
"learning_rate": 3.5116836233431616e-06,
"loss": 0.08477982282638549,
"memory(GiB)": 40.4,
"step": 425,
"token_acc": 0.9628164556962026,
"train_speed(iter/s)": 0.077154
},
{
"epoch": 0.7827669902912622,
"grad_norm": 0.48275941610336304,
"learning_rate": 3.477919345238213e-06,
"loss": 0.07978797554969788,
"memory(GiB)": 40.4,
"step": 430,
"token_acc": 0.9627279936558287,
"train_speed(iter/s)": 0.077173
},
{
"epoch": 0.7918689320388349,
"grad_norm": 0.5005500912666321,
"learning_rate": 3.4439431304619207e-06,
"loss": 0.07624109983444213,
"memory(GiB)": 40.4,
"step": 435,
"token_acc": 0.9659270998415214,
"train_speed(iter/s)": 0.077238
},
{
"epoch": 0.8009708737864077,
"grad_norm": 0.5146210789680481,
"learning_rate": 3.4097623424087196e-06,
"loss": 0.080259507894516,
"memory(GiB)": 40.4,
"step": 440,
"token_acc": 0.9706582077716098,
"train_speed(iter/s)": 0.077241
},
{
"epoch": 0.8100728155339806,
"grad_norm": 0.558778703212738,
"learning_rate": 3.3753843888085806e-06,
"loss": 0.07813260555267335,
"memory(GiB)": 40.4,
"step": 445,
"token_acc": 0.9658730158730159,
"train_speed(iter/s)": 0.077226
},
{
"epoch": 0.8191747572815534,
"grad_norm": 0.574676513671875,
"learning_rate": 3.340816720121597e-06,
"loss": 0.0761204183101654,
"memory(GiB)": 40.4,
"step": 450,
"token_acc": 0.9691699604743083,
"train_speed(iter/s)": 0.077059
},
{
"epoch": 0.8282766990291263,
"grad_norm": 0.5359216332435608,
"learning_rate": 3.3060668279232964e-06,
"loss": 0.07063559293746949,
"memory(GiB)": 40.4,
"step": 455,
"token_acc": 0.9746233148295004,
"train_speed(iter/s)": 0.077103
},
{
"epoch": 0.837378640776699,
"grad_norm": 0.5926820635795593,
"learning_rate": 3.2711422432810624e-06,
"loss": 0.07327613830566407,
"memory(GiB)": 40.4,
"step": 460,
"token_acc": 0.9666666666666667,
"train_speed(iter/s)": 0.077136
},
{
"epoch": 0.8464805825242718,
"grad_norm": 0.4923359155654907,
"learning_rate": 3.236050535121976e-06,
"loss": 0.0849435031414032,
"memory(GiB)": 40.4,
"step": 465,
"token_acc": 0.9628164556962026,
"train_speed(iter/s)": 0.077175
},
{
"epoch": 0.8555825242718447,
"grad_norm": 0.5079782605171204,
"learning_rate": 3.2007993085924694e-06,
"loss": 0.07131590843200683,
"memory(GiB)": 40.4,
"step": 470,
"token_acc": 0.9603489294210944,
"train_speed(iter/s)": 0.077219
},
{
"epoch": 0.8646844660194175,
"grad_norm": 0.47359853982925415,
"learning_rate": 3.165396203410121e-06,
"loss": 0.08230514526367187,
"memory(GiB)": 40.4,
"step": 475,
"token_acc": 0.9603489294210944,
"train_speed(iter/s)": 0.077276
},
{
"epoch": 0.8737864077669902,
"grad_norm": 0.5094448328018188,
"learning_rate": 3.1298488922079597e-06,
"loss": 0.07572669386863709,
"memory(GiB)": 40.4,
"step": 480,
"token_acc": 0.9683042789223455,
"train_speed(iter/s)": 0.077301
},
{
"epoch": 0.8828883495145631,
"grad_norm": 0.6144260764122009,
"learning_rate": 3.094165078871634e-06,
"loss": 0.07770437002182007,
"memory(GiB)": 40.4,
"step": 485,
"token_acc": 0.9674603174603175,
"train_speed(iter/s)": 0.077291
},
{
"epoch": 0.8919902912621359,
"grad_norm": 0.7166838049888611,
"learning_rate": 3.0583524968698176e-06,
"loss": 0.07593016624450684,
"memory(GiB)": 40.4,
"step": 490,
"token_acc": 0.9706582077716098,
"train_speed(iter/s)": 0.077337
},
{
"epoch": 0.9010922330097088,
"grad_norm": 0.5843172073364258,
"learning_rate": 3.0224189075781886e-06,
"loss": 0.0753251850605011,
"memory(GiB)": 40.4,
"step": 495,
"token_acc": 0.9675889328063241,
"train_speed(iter/s)": 0.077398
},
{
"epoch": 0.9101941747572816,
"grad_norm": 0.4273771643638611,
"learning_rate": 2.9863720985973697e-06,
"loss": 0.07616569995880126,
"memory(GiB)": 40.4,
"step": 500,
"token_acc": 0.9746031746031746,
"train_speed(iter/s)": 0.077368
},
{
"epoch": 0.9192961165048543,
"grad_norm": 0.5440679788589478,
"learning_rate": 2.9502198820651903e-06,
"loss": 0.07991842031478882,
"memory(GiB)": 40.4,
"step": 505,
"token_acc": 0.9642857142857143,
"train_speed(iter/s)": 0.077195
},
{
"epoch": 0.9283980582524272,
"grad_norm": 0.6545736789703369,
"learning_rate": 2.9139700929636134e-06,
"loss": 0.07855194211006164,
"memory(GiB)": 40.4,
"step": 510,
"token_acc": 0.9587301587301588,
"train_speed(iter/s)": 0.077178
},
{
"epoch": 0.9375,
"grad_norm": 0.5470529794692993,
"learning_rate": 2.8776305874207305e-06,
"loss": 0.07507063150405884,
"memory(GiB)": 40.4,
"step": 515,
"token_acc": 0.9675376088677752,
"train_speed(iter/s)": 0.077176
},
{
"epoch": 0.9466019417475728,
"grad_norm": 0.5262081623077393,
"learning_rate": 2.8412092410081645e-06,
"loss": 0.08568469285964966,
"memory(GiB)": 40.4,
"step": 520,
"token_acc": 0.9659270998415214,
"train_speed(iter/s)": 0.077164
},
{
"epoch": 0.9557038834951457,
"grad_norm": 0.48101773858070374,
"learning_rate": 2.804713947034254e-06,
"loss": 0.07408897280693054,
"memory(GiB)": 40.4,
"step": 525,
"token_acc": 0.9715189873417721,
"train_speed(iter/s)": 0.077248
},
{
"epoch": 0.9648058252427184,
"grad_norm": 0.7088754773139954,
"learning_rate": 2.7681526148334074e-06,
"loss": 0.07859846353530883,
"memory(GiB)": 40.4,
"step": 530,
"token_acc": 0.9651070578905631,
"train_speed(iter/s)": 0.077348
},
{
"epoch": 0.9739077669902912,
"grad_norm": 0.5357980728149414,
"learning_rate": 2.73153316805197e-06,
"loss": 0.07618768811225891,
"memory(GiB)": 40.4,
"step": 535,
"token_acc": 0.9683042789223455,
"train_speed(iter/s)": 0.077388
},
{
"epoch": 0.9830097087378641,
"grad_norm": 0.4719216823577881,
"learning_rate": 2.6948635429309984e-06,
"loss": 0.08283294439315796,
"memory(GiB)": 40.4,
"step": 540,
"token_acc": 0.9666666666666667,
"train_speed(iter/s)": 0.077404
},
{
"epoch": 0.9921116504854369,
"grad_norm": 0.4105032980442047,
"learning_rate": 2.6581516865863006e-06,
"loss": 0.07635112404823304,
"memory(GiB)": 40.4,
"step": 545,
"token_acc": 0.9666666666666667,
"train_speed(iter/s)": 0.077461
},
{
"epoch": 1.0,
"grad_norm": 0.4639950096607208,
"learning_rate": 2.6214055552861213e-06,
"loss": 0.07352917194366455,
"memory(GiB)": 40.4,
"step": 550,
"token_acc": 0.9652014652014652,
"train_speed(iter/s)": 0.077567
},
{
"epoch": 1.0091019417475728,
"grad_norm": 0.5708960294723511,
"learning_rate": 2.5846331127268432e-06,
"loss": 0.06939817667007446,
"memory(GiB)": 40.4,
"step": 555,
"token_acc": 0.9746634996041171,
"train_speed(iter/s)": 0.077516
},
{
"epoch": 1.0182038834951457,
"grad_norm": 0.5500112771987915,
"learning_rate": 2.5478423283070797e-06,
"loss": 0.08004761338233948,
"memory(GiB)": 40.4,
"step": 560,
"token_acc": 0.9666666666666667,
"train_speed(iter/s)": 0.077461
},
{
"epoch": 1.0273058252427185,
"grad_norm": 0.6031087040901184,
"learning_rate": 2.5110411754005277e-06,
"loss": 0.07369757890701294,
"memory(GiB)": 40.4,
"step": 565,
"token_acc": 0.9675118858954042,
"train_speed(iter/s)": 0.077479
},
{
"epoch": 1.0364077669902914,
"grad_norm": 0.6123142242431641,
"learning_rate": 2.4742376296279656e-06,
"loss": 0.07673358917236328,
"memory(GiB)": 40.4,
"step": 570,
"token_acc": 0.96513470681458,
"train_speed(iter/s)": 0.077492
},
{
"epoch": 1.045509708737864,
"grad_norm": 0.4750412404537201,
"learning_rate": 2.437439667128757e-06,
"loss": 0.07482797503471375,
"memory(GiB)": 40.4,
"step": 575,
"token_acc": 0.9722222222222222,
"train_speed(iter/s)": 0.077462
},
{
"epoch": 1.0546116504854368,
"grad_norm": 0.6936323642730713,
"learning_rate": 2.4006552628322495e-06,
"loss": 0.07669172286987305,
"memory(GiB)": 40.4,
"step": 580,
"token_acc": 0.9698890649762282,
"train_speed(iter/s)": 0.077497
},
{
"epoch": 1.0637135922330097,
"grad_norm": 0.5415986180305481,
"learning_rate": 2.3638923887294252e-06,
"loss": 0.07764337062835694,
"memory(GiB)": 40.4,
"step": 585,
"token_acc": 0.9722662440570523,
"train_speed(iter/s)": 0.077534
},
{
"epoch": 1.0728155339805825,
"grad_norm": 0.5562268495559692,
"learning_rate": 2.3271590121452034e-06,
"loss": 0.07850711941719055,
"memory(GiB)": 40.4,
"step": 590,
"token_acc": 0.9627575277337559,
"train_speed(iter/s)": 0.077312
},
{
"epoch": 1.0819174757281553,
"grad_norm": 0.5438592433929443,
"learning_rate": 2.2904630940117383e-06,
"loss": 0.07206880450248718,
"memory(GiB)": 40.4,
"step": 595,
"token_acc": 0.9706582077716098,
"train_speed(iter/s)": 0.077329
},
{
"epoch": 1.0910194174757282,
"grad_norm": 0.7570096254348755,
"learning_rate": 2.253812587143113e-06,
"loss": 0.07922015190124512,
"memory(GiB)": 40.4,
"step": 600,
"token_acc": 0.9675632911392406,
"train_speed(iter/s)": 0.077373
},
{
"epoch": 1.100121359223301,
"grad_norm": 0.44248196482658386,
"learning_rate": 2.2172154345117896e-06,
"loss": 0.07421438097953796,
"memory(GiB)": 40.4,
"step": 605,
"token_acc": 0.969047619047619,
"train_speed(iter/s)": 0.077227
},
{
"epoch": 1.1092233009708738,
"grad_norm": 0.8693225383758545,
"learning_rate": 2.18067956752719e-06,
"loss": 0.07179425954818726,
"memory(GiB)": 40.4,
"step": 610,
"token_acc": 0.9738302934179223,
"train_speed(iter/s)": 0.077227
},
{
"epoch": 1.1183252427184467,
"grad_norm": 0.6093197464942932,
"learning_rate": 2.1442129043167877e-06,
"loss": 0.07261105179786682,
"memory(GiB)": 40.4,
"step": 615,
"token_acc": 0.972244250594766,
"train_speed(iter/s)": 0.077265
},
{
"epoch": 1.1274271844660193,
"grad_norm": 0.47732552886009216,
"learning_rate": 2.1078233480100708e-06,
"loss": 0.07763968706130982,
"memory(GiB)": 40.4,
"step": 620,
"token_acc": 0.9746233148295004,
"train_speed(iter/s)": 0.077083
},
{
"epoch": 1.1365291262135921,
"grad_norm": 0.6436070799827576,
"learning_rate": 2.0715187850257645e-06,
"loss": 0.07869491577148438,
"memory(GiB)": 40.4,
"step": 625,
"token_acc": 0.9675632911392406,
"train_speed(iter/s)": 0.077031
},
{
"epoch": 1.145631067961165,
"grad_norm": 0.6669154167175293,
"learning_rate": 2.0353070833626684e-06,
"loss": 0.07925596237182617,
"memory(GiB)": 40.4,
"step": 630,
"token_acc": 0.964314036478985,
"train_speed(iter/s)": 0.077048
},
{
"epoch": 1.1547330097087378,
"grad_norm": 0.6365996599197388,
"learning_rate": 1.999196090894485e-06,
"loss": 0.06456078886985779,
"memory(GiB)": 40.4,
"step": 635,
"token_acc": 0.9667194928684627,
"train_speed(iter/s)": 0.077101
},
{
"epoch": 1.1638349514563107,
"grad_norm": 0.5614244341850281,
"learning_rate": 1.963193633669018e-06,
"loss": 0.07243520021438599,
"memory(GiB)": 40.4,
"step": 640,
"token_acc": 0.9666931007137193,
"train_speed(iter/s)": 0.077155
},
{
"epoch": 1.1729368932038835,
"grad_norm": 0.8191459774971008,
"learning_rate": 1.927307514212089e-06,
"loss": 0.0762752890586853,
"memory(GiB)": 40.4,
"step": 645,
"token_acc": 0.9698412698412698,
"train_speed(iter/s)": 0.077153
},
{
"epoch": 1.1820388349514563,
"grad_norm": 0.523980438709259,
"learning_rate": 1.8915455098365651e-06,
"loss": 0.0773351550102234,
"memory(GiB)": 40.4,
"step": 650,
"token_acc": 0.9675118858954042,
"train_speed(iter/s)": 0.077211
},
{
"epoch": 1.1911407766990292,
"grad_norm": 0.5650423169136047,
"learning_rate": 1.8559153709568393e-06,
"loss": 0.07858687043190002,
"memory(GiB)": 40.4,
"step": 655,
"token_acc": 0.9635499207606973,
"train_speed(iter/s)": 0.077253
},
{
"epoch": 1.200242718446602,
"grad_norm": 0.3905327022075653,
"learning_rate": 1.8204248194091429e-06,
"loss": 0.07570682168006897,
"memory(GiB)": 40.4,
"step": 660,
"token_acc": 0.9674861221252974,
"train_speed(iter/s)": 0.077222
},
{
"epoch": 1.2093446601941746,
"grad_norm": 0.6456849575042725,
"learning_rate": 1.7850815467780616e-06,
"loss": 0.06978952884674072,
"memory(GiB)": 40.4,
"step": 665,
"token_acc": 0.976984126984127,
"train_speed(iter/s)": 0.077238
},
{
"epoch": 1.2184466019417475,
"grad_norm": 0.49169182777404785,
"learning_rate": 1.7498932127295892e-06,
"loss": 0.06932756900787354,
"memory(GiB)": 40.4,
"step": 670,
"token_acc": 0.9674603174603175,
"train_speed(iter/s)": 0.077305
},
{
"epoch": 1.2275485436893203,
"grad_norm": 0.8174545764923096,
"learning_rate": 1.7148674433511176e-06,
"loss": 0.07247714400291443,
"memory(GiB)": 40.4,
"step": 675,
"token_acc": 0.9785714285714285,
"train_speed(iter/s)": 0.077358
},
{
"epoch": 1.2366504854368932,
"grad_norm": 0.5874563455581665,
"learning_rate": 1.6800118294986936e-06,
"loss": 0.08156619668006897,
"memory(GiB)": 40.4,
"step": 680,
"token_acc": 0.9619952494061758,
"train_speed(iter/s)": 0.077379
},
{
"epoch": 1.245752427184466,
"grad_norm": 0.7023929357528687,
"learning_rate": 1.645333925151908e-06,
"loss": 0.0740778088569641,
"memory(GiB)": 40.4,
"step": 685,
"token_acc": 0.9643423137876387,
"train_speed(iter/s)": 0.077282
},
{
"epoch": 1.2548543689320388,
"grad_norm": 0.6284681558609009,
"learning_rate": 1.610841245776789e-06,
"loss": 0.07937963008880615,
"memory(GiB)": 40.4,
"step": 690,
"token_acc": 0.9682791435368755,
"train_speed(iter/s)": 0.077267
},
{
"epoch": 1.2639563106796117,
"grad_norm": 0.4900761544704437,
"learning_rate": 1.5765412666970302e-06,
"loss": 0.07481481432914734,
"memory(GiB)": 40.4,
"step": 695,
"token_acc": 0.9714512291831879,
"train_speed(iter/s)": 0.077241
},
{
"epoch": 1.2730582524271845,
"grad_norm": 0.7159978747367859,
"learning_rate": 1.5424414214739258e-06,
"loss": 0.07213735580444336,
"memory(GiB)": 40.4,
"step": 700,
"token_acc": 0.9738302934179223,
"train_speed(iter/s)": 0.077237
},
{
"epoch": 1.2821601941747574,
"grad_norm": 0.6261754631996155,
"learning_rate": 1.5085491002953535e-06,
"loss": 0.07179176211357116,
"memory(GiB)": 40.4,
"step": 705,
"token_acc": 0.969047619047619,
"train_speed(iter/s)": 0.077083
},
{
"epoch": 1.29126213592233,
"grad_norm": 0.9063695073127747,
"learning_rate": 1.4748716483741562e-06,
"loss": 0.07754602432250976,
"memory(GiB)": 40.4,
"step": 710,
"token_acc": 0.96513470681458,
"train_speed(iter/s)": 0.077061
},
{
"epoch": 1.300364077669903,
"grad_norm": 0.6574028134346008,
"learning_rate": 1.4414163643562755e-06,
"loss": 0.07884335517883301,
"memory(GiB)": 40.4,
"step": 715,
"token_acc": 0.9675376088677752,
"train_speed(iter/s)": 0.077069
},
{
"epoch": 1.3094660194174756,
"grad_norm": 0.5524230599403381,
"learning_rate": 1.4081904987389701e-06,
"loss": 0.07660083174705505,
"memory(GiB)": 40.4,
"step": 720,
"token_acc": 0.9635210150674068,
"train_speed(iter/s)": 0.077072
},
{
"epoch": 1.3185679611650485,
"grad_norm": 0.5381263494491577,
"learning_rate": 1.375201252299479e-06,
"loss": 0.07187164425849915,
"memory(GiB)": 40.4,
"step": 725,
"token_acc": 0.9690966719492868,
"train_speed(iter/s)": 0.077084
},
{
"epoch": 1.3276699029126213,
"grad_norm": 0.6094266176223755,
"learning_rate": 1.3424557745344508e-06,
"loss": 0.07152368426322937,
"memory(GiB)": 40.4,
"step": 730,
"token_acc": 0.9690966719492868,
"train_speed(iter/s)": 0.07712
},
{
"epoch": 1.3367718446601942,
"grad_norm": 0.37662273645401,
"learning_rate": 1.3099611621104875e-06,
"loss": 0.07852091193199158,
"memory(GiB)": 40.4,
"step": 735,
"token_acc": 0.9698412698412698,
"train_speed(iter/s)": 0.077111
},
{
"epoch": 1.345873786407767,
"grad_norm": 0.8660151958465576,
"learning_rate": 1.2777244573261479e-06,
"loss": 0.0761515736579895,
"memory(GiB)": 40.4,
"step": 740,
"token_acc": 0.9650793650793651,
"train_speed(iter/s)": 0.077083
},
{
"epoch": 1.3549757281553398,
"grad_norm": 0.8635317087173462,
"learning_rate": 1.245752646585719e-06,
"loss": 0.07429265975952148,
"memory(GiB)": 40.4,
"step": 745,
"token_acc": 0.9706582077716098,
"train_speed(iter/s)": 0.077017
},
{
"epoch": 1.3640776699029127,
"grad_norm": 0.6921953558921814,
"learning_rate": 1.214052658885113e-06,
"loss": 0.08055119514465332,
"memory(GiB)": 40.4,
"step": 750,
"token_acc": 0.9659000793021412,
"train_speed(iter/s)": 0.07705
},
{
"epoch": 1.3731796116504853,
"grad_norm": 0.512025773525238,
"learning_rate": 1.182631364310199e-06,
"loss": 0.07414981126785278,
"memory(GiB)": 40.4,
"step": 755,
"token_acc": 0.9738095238095238,
"train_speed(iter/s)": 0.077125
},
{
"epoch": 1.3822815533980584,
"grad_norm": 0.47374847531318665,
"learning_rate": 1.1514955725479057e-06,
"loss": 0.07829545140266418,
"memory(GiB)": 40.4,
"step": 760,
"token_acc": 0.9675118858954042,
"train_speed(iter/s)": 0.077061
},
{
"epoch": 1.391383495145631,
"grad_norm": 0.5193628072738647,
"learning_rate": 1.1206520314104083e-06,
"loss": 0.06979748606681824,
"memory(GiB)": 40.4,
"step": 765,
"token_acc": 0.9730799683293745,
"train_speed(iter/s)": 0.077097
},
{
"epoch": 1.4004854368932038,
"grad_norm": 0.5398116707801819,
"learning_rate": 1.0901074253727338e-06,
"loss": 0.07316485643386841,
"memory(GiB)": 40.4,
"step": 770,
"token_acc": 0.9674861221252974,
"train_speed(iter/s)": 0.077134
},
{
"epoch": 1.4095873786407767,
"grad_norm": 0.9198482036590576,
"learning_rate": 1.0598683741240861e-06,
"loss": 0.0778656005859375,
"memory(GiB)": 40.4,
"step": 775,
"token_acc": 0.9714512291831879,
"train_speed(iter/s)": 0.077187
},
{
"epoch": 1.4186893203883495,
"grad_norm": 0.5479600429534912,
"learning_rate": 1.0299414311332107e-06,
"loss": 0.0758398413658142,
"memory(GiB)": 40.4,
"step": 780,
"token_acc": 0.9706582077716098,
"train_speed(iter/s)": 0.077204
},
{
"epoch": 1.4277912621359223,
"grad_norm": 0.562239944934845,
"learning_rate": 1.0003330822281188e-06,
"loss": 0.08118345737457275,
"memory(GiB)": 40.4,
"step": 785,
"token_acc": 0.9658730158730159,
"train_speed(iter/s)": 0.077197
},
{
"epoch": 1.4368932038834952,
"grad_norm": 0.608139157295227,
"learning_rate": 9.710497441904614e-07,
"loss": 0.07277892231941223,
"memory(GiB)": 40.4,
"step": 790,
"token_acc": 0.9739130434782609,
"train_speed(iter/s)": 0.077169
},
{
"epoch": 1.445995145631068,
"grad_norm": 0.6108372807502747,
"learning_rate": 9.420977633648739e-07,
"loss": 0.0743071436882019,
"memory(GiB)": 40.4,
"step": 795,
"token_acc": 0.9651070578905631,
"train_speed(iter/s)": 0.077195
},
{
"epoch": 1.4550970873786409,
"grad_norm": 0.5900782346725464,
"learning_rate": 9.134834142835794e-07,
"loss": 0.07513993978500366,
"memory(GiB)": 40.4,
"step": 800,
"token_acc": 0.9738302934179223,
"train_speed(iter/s)": 0.07724
},
{
"epoch": 1.4641990291262137,
"grad_norm": 0.5346866846084595,
"learning_rate": 8.852128983065653e-07,
"loss": 0.07092651724815369,
"memory(GiB)": 40.4,
"step": 805,
"token_acc": 0.9722662440570523,
"train_speed(iter/s)": 0.077133
},
{
"epoch": 1.4733009708737863,
"grad_norm": 0.504199743270874,
"learning_rate": 8.572923422776055e-07,
"loss": 0.07900516986846924,
"memory(GiB)": 40.4,
"step": 810,
"token_acc": 0.9524564183835182,
"train_speed(iter/s)": 0.077129
},
{
"epoch": 1.4824029126213591,
"grad_norm": 0.5348660349845886,
"learning_rate": 8.297277971964443e-07,
"loss": 0.07192928791046142,
"memory(GiB)": 40.4,
"step": 815,
"token_acc": 0.9706349206349206,
"train_speed(iter/s)": 0.077153
},
{
"epoch": 1.491504854368932,
"grad_norm": 0.7142664194107056,
"learning_rate": 8.025252369074077e-07,
"loss": 0.07966341972351074,
"memory(GiB)": 40.4,
"step": 820,
"token_acc": 0.9714285714285714,
"train_speed(iter/s)": 0.077158
},
{
"epoch": 1.5006067961165048,
"grad_norm": 0.670011579990387,
"learning_rate": 7.756905568047393e-07,
"loss": 0.07460339069366455,
"memory(GiB)": 40.4,
"step": 825,
"token_acc": 0.9698412698412698,
"train_speed(iter/s)": 0.077072
},
{
"epoch": 1.5097087378640777,
"grad_norm": 0.9091220498085022,
"learning_rate": 7.492295725549423e-07,
"loss": 0.07916736602783203,
"memory(GiB)": 40.4,
"step": 830,
"token_acc": 0.9714512291831879,
"train_speed(iter/s)": 0.077125
},
{
"epoch": 1.5188106796116505,
"grad_norm": 0.5154448747634888,
"learning_rate": 7.231480188363906e-07,
"loss": 0.07609822750091552,
"memory(GiB)": 40.4,
"step": 835,
"token_acc": 0.9619047619047619,
"train_speed(iter/s)": 0.077151
},
{
"epoch": 1.5279126213592233,
"grad_norm": 0.5767259001731873,
"learning_rate": 6.974515480965038e-07,
"loss": 0.07642306089401245,
"memory(GiB)": 40.4,
"step": 840,
"token_acc": 0.9635499207606973,
"train_speed(iter/s)": 0.077169
},
{
"epoch": 1.537014563106796,
"grad_norm": 0.559921145439148,
"learning_rate": 6.721457293267344e-07,
"loss": 0.07739580273628235,
"memory(GiB)": 40.4,
"step": 845,
"token_acc": 0.9659540775930324,
"train_speed(iter/s)": 0.077202
},
{
"epoch": 1.546116504854369,
"grad_norm": 0.5525022745132446,
"learning_rate": 6.472360468556419e-07,
"loss": 0.07661284804344178,
"memory(GiB)": 40.4,
"step": 850,
"token_acc": 0.9690966719492868,
"train_speed(iter/s)": 0.077223
},
{
"epoch": 1.5552184466019416,
"grad_norm": 0.7156991958618164,
"learning_rate": 6.227278991603239e-07,
"loss": 0.07607601881027222,
"memory(GiB)": 40.4,
"step": 855,
"token_acc": 0.9738924050632911,
"train_speed(iter/s)": 0.077263
},
{
"epoch": 1.5643203883495147,
"grad_norm": 0.578790009021759,
"learning_rate": 5.986265976964412e-07,
"loss": 0.07703717947006225,
"memory(GiB)": 40.4,
"step": 860,
"token_acc": 0.9627575277337559,
"train_speed(iter/s)": 0.077321
},
{
"epoch": 1.5734223300970873,
"grad_norm": 0.41067153215408325,
"learning_rate": 5.749373657471127e-07,
"loss": 0.07262166738510131,
"memory(GiB)": 40.4,
"step": 865,
"token_acc": 0.9666931007137193,
"train_speed(iter/s)": 0.077313
},
{
"epoch": 1.5825242718446602,
"grad_norm": 0.6594594120979309,
"learning_rate": 5.516653372909142e-07,
"loss": 0.07546203732490539,
"memory(GiB)": 40.4,
"step": 870,
"token_acc": 0.9730799683293745,
"train_speed(iter/s)": 0.077321
},
{
"epoch": 1.591626213592233,
"grad_norm": 0.6693688035011292,
"learning_rate": 5.28815555889228e-07,
"loss": 0.07242462635040284,
"memory(GiB)": 40.4,
"step": 875,
"token_acc": 0.9714964370546318,
"train_speed(iter/s)": 0.077315
},
{
"epoch": 1.6007281553398058,
"grad_norm": 0.5314414501190186,
"learning_rate": 5.063929735931985e-07,
"loss": 0.07621661424636841,
"memory(GiB)": 40.4,
"step": 880,
"token_acc": 0.9746634996041171,
"train_speed(iter/s)": 0.077305
},
{
"epoch": 1.6098300970873787,
"grad_norm": 0.39022502303123474,
"learning_rate": 4.844024498705072e-07,
"loss": 0.07379111647605896,
"memory(GiB)": 40.4,
"step": 885,
"token_acc": 0.9770023790642347,
"train_speed(iter/s)": 0.077319
},
{
"epoch": 1.6189320388349513,
"grad_norm": 0.5611955523490906,
"learning_rate": 4.6284875055222415e-07,
"loss": 0.07641223073005676,
"memory(GiB)": 40.4,
"step": 890,
"token_acc": 0.969047619047619,
"train_speed(iter/s)": 0.07736
},
{
"epoch": 1.6280339805825244,
"grad_norm": 0.5914463996887207,
"learning_rate": 4.4173654679994543e-07,
"loss": 0.07118785977363587,
"memory(GiB)": 40.4,
"step": 895,
"token_acc": 0.9666931007137193,
"train_speed(iter/s)": 0.077387
},
{
"epoch": 1.637135922330097,
"grad_norm": 0.6131768226623535,
"learning_rate": 4.2107041409344686e-07,
"loss": 0.06656063199043274,
"memory(GiB)": 40.4,
"step": 900,
"token_acc": 0.9730586370839936,
"train_speed(iter/s)": 0.077393
},
{
"epoch": 1.64623786407767,
"grad_norm": 0.6083477139472961,
"learning_rate": 4.00854831239082e-07,
"loss": 0.07548041343688965,
"memory(GiB)": 40.4,
"step": 905,
"token_acc": 0.9706814580031695,
"train_speed(iter/s)": 0.07732
},
{
"epoch": 1.6553398058252426,
"grad_norm": 0.5123993158340454,
"learning_rate": 3.8109417939912044e-07,
"loss": 0.07632001638412475,
"memory(GiB)": 40.4,
"step": 910,
"token_acc": 0.9651070578905631,
"train_speed(iter/s)": 0.07734
},
{
"epoch": 1.6644417475728155,
"grad_norm": 0.6305170655250549,
"learning_rate": 3.617927411422584e-07,
"loss": 0.07312512397766113,
"memory(GiB)": 40.4,
"step": 915,
"token_acc": 0.9675376088677752,
"train_speed(iter/s)": 0.077345
},
{
"epoch": 1.6735436893203883,
"grad_norm": 0.5339434742927551,
"learning_rate": 3.4295469951548894e-07,
"loss": 0.06849889755249024,
"memory(GiB)": 40.4,
"step": 920,
"token_acc": 0.9674861221252974,
"train_speed(iter/s)": 0.077349
},
{
"epoch": 1.6826456310679612,
"grad_norm": 0.532629132270813,
"learning_rate": 3.24584137137543e-07,
"loss": 0.07681695818901062,
"memory(GiB)": 40.4,
"step": 925,
"token_acc": 0.9722222222222222,
"train_speed(iter/s)": 0.077356
},
{
"epoch": 1.691747572815534,
"grad_norm": 0.4466962516307831,
"learning_rate": 3.0668503531409876e-07,
"loss": 0.06994915008544922,
"memory(GiB)": 40.4,
"step": 930,
"token_acc": 0.9714964370546318,
"train_speed(iter/s)": 0.077371
},
{
"epoch": 1.7008495145631068,
"grad_norm": 0.586765706539154,
"learning_rate": 2.892612731749414e-07,
"loss": 0.07494070529937744,
"memory(GiB)": 40.4,
"step": 935,
"token_acc": 0.969047619047619,
"train_speed(iter/s)": 0.077342
},
{
"epoch": 1.7099514563106797,
"grad_norm": 0.5412377715110779,
"learning_rate": 2.723166268332733e-07,
"loss": 0.07770473957061767,
"memory(GiB)": 40.4,
"step": 940,
"token_acc": 0.9676145339652449,
"train_speed(iter/s)": 0.077329
},
{
"epoch": 1.7190533980582523,
"grad_norm": 0.911586582660675,
"learning_rate": 2.55854768567346e-07,
"loss": 0.07914371490478515,
"memory(GiB)": 40.4,
"step": 945,
"token_acc": 0.9674861221252974,
"train_speed(iter/s)": 0.077298
},
{
"epoch": 1.7281553398058254,
"grad_norm": 0.6137750148773193,
"learning_rate": 2.3987926602459465e-07,
"loss": 0.08327807188034057,
"memory(GiB)": 40.4,
"step": 950,
"token_acc": 0.9706349206349206,
"train_speed(iter/s)": 0.077305
},
{
"epoch": 1.737257281553398,
"grad_norm": 0.576627790927887,
"learning_rate": 2.2439358144845464e-07,
"loss": 0.08012324571609497,
"memory(GiB)": 40.4,
"step": 955,
"token_acc": 0.9643423137876387,
"train_speed(iter/s)": 0.077328
},
{
"epoch": 1.7463592233009708,
"grad_norm": 0.6456671953201294,
"learning_rate": 2.09401070928012e-07,
"loss": 0.06627861261367798,
"memory(GiB)": 40.4,
"step": 960,
"token_acc": 0.9714285714285714,
"train_speed(iter/s)": 0.077243
},
{
"epoch": 1.7554611650485437,
"grad_norm": 0.6002473831176758,
"learning_rate": 1.9490498367066817e-07,
"loss": 0.071403968334198,
"memory(GiB)": 40.4,
"step": 965,
"token_acc": 0.9682791435368755,
"train_speed(iter/s)": 0.077258
},
{
"epoch": 1.7645631067961165,
"grad_norm": 0.7518230080604553,
"learning_rate": 1.8090846129796586e-07,
"loss": 0.07573525905609131,
"memory(GiB)": 40.4,
"step": 970,
"token_acc": 0.9722222222222222,
"train_speed(iter/s)": 0.077252
},
{
"epoch": 1.7736650485436893,
"grad_norm": 0.41464531421661377,
"learning_rate": 1.6741453716472677e-07,
"loss": 0.07870721817016602,
"memory(GiB)": 40.4,
"step": 975,
"token_acc": 0.9627870150435471,
"train_speed(iter/s)": 0.077259
},
{
"epoch": 1.7827669902912622,
"grad_norm": 0.7254371643066406,
"learning_rate": 1.5442613570165993e-07,
"loss": 0.08646805882453919,
"memory(GiB)": 40.4,
"step": 980,
"token_acc": 0.9611419508326725,
"train_speed(iter/s)": 0.077274
},
{
"epoch": 1.791868932038835,
"grad_norm": 0.7164713740348816,
"learning_rate": 1.4194607178157237e-07,
"loss": 0.07055433988571166,
"memory(GiB)": 40.4,
"step": 985,
"token_acc": 0.9706349206349206,
"train_speed(iter/s)": 0.077341
},
{
"epoch": 1.8009708737864076,
"grad_norm": 0.5821430087089539,
"learning_rate": 1.2997705010932394e-07,
"loss": 0.07743188142776489,
"memory(GiB)": 40.4,
"step": 990,
"token_acc": 0.9674861221252974,
"train_speed(iter/s)": 0.077362
},
{
"epoch": 1.8100728155339807,
"grad_norm": 0.766345739364624,
"learning_rate": 1.1852166463565767e-07,
"loss": 0.07668507099151611,
"memory(GiB)": 40.4,
"step": 995,
"token_acc": 0.9770023790642347,
"train_speed(iter/s)": 0.077362
},
{
"epoch": 1.8191747572815533,
"grad_norm": 0.5379170179367065,
"learning_rate": 1.0758239799503412e-07,
"loss": 0.06778880357742309,
"memory(GiB)": 40.4,
"step": 1000,
"token_acc": 0.9746233148295004,
"train_speed(iter/s)": 0.077358
},
{
"epoch": 1.8282766990291264,
"grad_norm": 0.587326169013977,
"learning_rate": 9.716162096759019e-08,
"loss": 0.07784827947616577,
"memory(GiB)": 40.4,
"step": 1005,
"token_acc": 0.9770206022187005,
"train_speed(iter/s)": 0.077289
},
{
"epoch": 1.837378640776699,
"grad_norm": 0.5790999531745911,
"learning_rate": 8.726159196533718e-08,
"loss": 0.07364106178283691,
"memory(GiB)": 40.4,
"step": 1010,
"token_acc": 0.9730372720063442,
"train_speed(iter/s)": 0.077306
},
{
"epoch": 1.8464805825242718,
"grad_norm": 0.5765237808227539,
"learning_rate": 7.788445654271532e-08,
"loss": 0.07042239308357238,
"memory(GiB)": 40.4,
"step": 1015,
"token_acc": 0.9682539682539683,
"train_speed(iter/s)": 0.077338
},
{
"epoch": 1.8555825242718447,
"grad_norm": 0.4627252221107483,
"learning_rate": 6.903224693160348e-08,
"loss": 0.06837155222892762,
"memory(GiB)": 40.4,
"step": 1020,
"token_acc": 0.9754358161648178,
"train_speed(iter/s)": 0.077366
},
{
"epoch": 1.8646844660194175,
"grad_norm": 0.5963551998138428,
"learning_rate": 6.070688160088961e-08,
"loss": 0.0674078106880188,
"memory(GiB)": 40.4,
"step": 1025,
"token_acc": 0.9659270998415214,
"train_speed(iter/s)": 0.077385
},
{
"epoch": 1.8737864077669903,
"grad_norm": 0.6391610503196716,
"learning_rate": 5.291016484069683e-08,
"loss": 0.07277075052261353,
"memory(GiB)": 40.4,
"step": 1030,
"token_acc": 0.9659540775930324,
"train_speed(iter/s)": 0.077401
},
{
"epoch": 1.882888349514563,
"grad_norm": 0.5019727945327759,
"learning_rate": 4.564378637135408e-08,
"loss": 0.0752260446548462,
"memory(GiB)": 40.4,
"step": 1035,
"token_acc": 0.9682791435368755,
"train_speed(iter/s)": 0.077434
},
{
"epoch": 1.891990291262136,
"grad_norm": 0.4186345040798187,
"learning_rate": 3.890932097719624e-08,
"loss": 0.06725120544433594,
"memory(GiB)": 40.4,
"step": 1040,
"token_acc": 0.9730799683293745,
"train_speed(iter/s)": 0.077451
},
{
"epoch": 1.9010922330097086,
"grad_norm": 0.6359046697616577,
"learning_rate": 3.270822816527325e-08,
"loss": 0.07682465314865113,
"memory(GiB)": 40.4,
"step": 1045,
"token_acc": 0.969047619047619,
"train_speed(iter/s)": 0.077498
},
{
"epoch": 1.9101941747572817,
"grad_norm": 0.5813617706298828,
"learning_rate": 2.7041851849043678e-08,
"loss": 0.0773731827735901,
"memory(GiB)": 40.4,
"step": 1050,
"token_acc": 0.9674861221252974,
"train_speed(iter/s)": 0.077486
},
{
"epoch": 1.9192961165048543,
"grad_norm": 0.4645262062549591,
"learning_rate": 2.1911420057117994e-08,
"loss": 0.07277056574821472,
"memory(GiB)": 40.4,
"step": 1055,
"token_acc": 0.9690721649484536,
"train_speed(iter/s)": 0.077447
},
{
"epoch": 1.9283980582524272,
"grad_norm": 0.8828046917915344,
"learning_rate": 1.7318044667119226e-08,
"loss": 0.07312785387039185,
"memory(GiB)": 40.4,
"step": 1060,
"token_acc": 0.9675118858954042,
"train_speed(iter/s)": 0.077476
},
{
"epoch": 1.9375,
"grad_norm": 0.8438335657119751,
"learning_rate": 1.3262721164712667e-08,
"loss": 0.07410634756088257,
"memory(GiB)": 40.4,
"step": 1065,
"token_acc": 0.9698651863600317,
"train_speed(iter/s)": 0.077482
},
{
"epoch": 1.9466019417475728,
"grad_norm": 0.6822603344917297,
"learning_rate": 9.746328427863993e-09,
"loss": 0.0720213532447815,
"memory(GiB)": 40.4,
"step": 1070,
"token_acc": 0.9666666666666667,
"train_speed(iter/s)": 0.077488
},
{
"epoch": 1.9557038834951457,
"grad_norm": 0.5685479640960693,
"learning_rate": 6.769628536364981e-09,
"loss": 0.07333976030349731,
"memory(GiB)": 40.4,
"step": 1075,
"token_acc": 0.973015873015873,
"train_speed(iter/s)": 0.077502
},
{
"epoch": 1.9648058252427183,
"grad_norm": 0.5445531606674194,
"learning_rate": 4.333266606676711e-09,
"loss": 0.07253679037094116,
"memory(GiB)": 40.4,
"step": 1080,
"token_acc": 0.9730586370839936,
"train_speed(iter/s)": 0.077494
},
{
"epoch": 1.9739077669902914,
"grad_norm": 0.6113319993019104,
"learning_rate": 2.4377706521164224e-09,
"loss": 0.07309662699699401,
"memory(GiB)": 40.4,
"step": 1085,
"token_acc": 0.9722222222222222,
"train_speed(iter/s)": 0.077475
},
{
"epoch": 1.983009708737864,
"grad_norm": 0.5483999252319336,
"learning_rate": 1.0835514684262583e-09,
"loss": 0.07428893446922302,
"memory(GiB)": 40.4,
"step": 1090,
"token_acc": 0.9690966719492868,
"train_speed(iter/s)": 0.077464
},
{
"epoch": 1.992111650485437,
"grad_norm": 0.6084752082824707,
"learning_rate": 2.7090254474421154e-10,
"loss": 0.07023123502731324,
"memory(GiB)": 40.4,
"step": 1095,
"token_acc": 0.9786223277909739,
"train_speed(iter/s)": 0.077453
},
{
"epoch": 2.0,
"grad_norm": 0.5853410363197327,
"learning_rate": 0.0,
"loss": 0.0724187433719635,
"memory(GiB)": 40.4,
"step": 1100,
"token_acc": 0.9679780420860018,
"train_speed(iter/s)": 0.077495
}
],
"logging_steps": 5,
"max_steps": 1100,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.488531281539498e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}