{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 100.0, "global_step": 1100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0018203883495145632, "grad_norm": 3.6015546321868896, "learning_rate": 1.5151515151515152e-07, "loss": 0.5787061452865601, "memory(GiB)": 40.37, "step": 1, "token_acc": 0.8690476190476191, "train_speed(iter/s)": 0.029162 }, { "epoch": 0.009101941747572815, "grad_norm": 3.435741424560547, "learning_rate": 7.575757575757576e-07, "loss": 0.5881168842315674, "memory(GiB)": 40.37, "step": 5, "token_acc": 0.8701684836471755, "train_speed(iter/s)": 0.06174 }, { "epoch": 0.01820388349514563, "grad_norm": 3.448568105697632, "learning_rate": 1.5151515151515152e-06, "loss": 0.5851926326751709, "memory(GiB)": 40.39, "step": 10, "token_acc": 0.8622327790973872, "train_speed(iter/s)": 0.067399 }, { "epoch": 0.027305825242718445, "grad_norm": 3.405535936355591, "learning_rate": 2.2727272727272728e-06, "loss": 0.6001698970794678, "memory(GiB)": 40.39, "step": 15, "token_acc": 0.8716323296354992, "train_speed(iter/s)": 0.068625 }, { "epoch": 0.03640776699029126, "grad_norm": 3.6892027854919434, "learning_rate": 3.0303030303030305e-06, "loss": 0.5676679611206055, "memory(GiB)": 40.39, "step": 20, "token_acc": 0.8715305313243458, "train_speed(iter/s)": 0.069219 }, { "epoch": 0.04550970873786408, "grad_norm": 3.9115183353424072, "learning_rate": 3.7878787878787882e-06, "loss": 0.5411659717559815, "memory(GiB)": 40.39, "step": 25, "token_acc": 0.8685669041963578, "train_speed(iter/s)": 0.071459 }, { "epoch": 0.05461165048543689, "grad_norm": 3.357640027999878, "learning_rate": 4.5454545454545455e-06, "loss": 0.4885613441467285, "memory(GiB)": 40.39, "step": 30, "token_acc": 0.8682539682539683, "train_speed(iter/s)": 0.072567 }, { "epoch": 0.06371359223300971, "grad_norm": 3.3015964031219482, "learning_rate": 4.999956654935265e-06, "loss": 0.4215705871582031, "memory(GiB)": 40.39, "step": 35, "token_acc": 0.8692551505546752, "train_speed(iter/s)": 0.07282 }, { "epoch": 0.07281553398058252, "grad_norm": 2.447498321533203, "learning_rate": 4.999469040218251e-06, "loss": 0.2957149982452393, "memory(GiB)": 40.39, "step": 40, "token_acc": 0.8690476190476191, "train_speed(iter/s)": 0.074291 }, { "epoch": 0.08191747572815535, "grad_norm": 0.8801060914993286, "learning_rate": 4.9984397354824345e-06, "loss": 0.21340658664703369, "memory(GiB)": 40.39, "step": 45, "token_acc": 0.9135606661379857, "train_speed(iter/s)": 0.074508 }, { "epoch": 0.09101941747572816, "grad_norm": 0.7226303815841675, "learning_rate": 4.996868963800831e-06, "loss": 0.1777859926223755, "memory(GiB)": 40.39, "step": 50, "token_acc": 0.9239904988123515, "train_speed(iter/s)": 0.075304 }, { "epoch": 0.10012135922330097, "grad_norm": 0.7329442501068115, "learning_rate": 4.99475706559428e-06, "loss": 0.17081427574157715, "memory(GiB)": 40.39, "step": 55, "token_acc": 0.9238699444885012, "train_speed(iter/s)": 0.075664 }, { "epoch": 0.10922330097087378, "grad_norm": 0.48636820912361145, "learning_rate": 4.992104498557657e-06, "loss": 0.15634163618087768, "memory(GiB)": 40.39, "step": 60, "token_acc": 0.9262490087232356, "train_speed(iter/s)": 0.076641 }, { "epoch": 0.1183252427184466, "grad_norm": 0.44267499446868896, "learning_rate": 4.988911837560691e-06, "loss": 0.1444383144378662, "memory(GiB)": 40.39, "step": 65, "token_acc": 0.9350237717908082, "train_speed(iter/s)": 0.074923 }, { "epoch": 0.12742718446601942, "grad_norm": 0.4311356544494629, "learning_rate": 4.985179774523375e-06, "loss": 0.14677078723907472, "memory(GiB)": 40.39, "step": 70, "token_acc": 0.9444444444444444, "train_speed(iter/s)": 0.075087 }, { "epoch": 0.13652912621359223, "grad_norm": 0.3981742858886719, "learning_rate": 4.980909118266006e-06, "loss": 0.13511970043182372, "memory(GiB)": 40.39, "step": 75, "token_acc": 0.9484944532488114, "train_speed(iter/s)": 0.074414 }, { "epoch": 0.14563106796116504, "grad_norm": 0.4317033290863037, "learning_rate": 4.976100794333903e-06, "loss": 0.12185637950897217, "memory(GiB)": 40.39, "step": 80, "token_acc": 0.9627279936558287, "train_speed(iter/s)": 0.074464 }, { "epoch": 0.15473300970873785, "grad_norm": 0.3179706335067749, "learning_rate": 4.970755844796817e-06, "loss": 0.12840776443481444, "memory(GiB)": 40.39, "step": 85, "token_acc": 0.9492063492063492, "train_speed(iter/s)": 0.074996 }, { "epoch": 0.1638349514563107, "grad_norm": 0.3189823031425476, "learning_rate": 4.964875428023093e-06, "loss": 0.12376663684844971, "memory(GiB)": 40.39, "step": 90, "token_acc": 0.957936507936508, "train_speed(iter/s)": 0.075144 }, { "epoch": 0.1729368932038835, "grad_norm": 0.33377909660339355, "learning_rate": 4.958460818428627e-06, "loss": 0.11574116945266724, "memory(GiB)": 40.39, "step": 95, "token_acc": 0.9563492063492064, "train_speed(iter/s)": 0.075617 }, { "epoch": 0.1820388349514563, "grad_norm": 0.4928111433982849, "learning_rate": 4.951513406200667e-06, "loss": 0.1149444341659546, "memory(GiB)": 40.39, "step": 100, "token_acc": 0.9508716323296355, "train_speed(iter/s)": 0.075828 }, { "epoch": 0.19114077669902912, "grad_norm": 0.3134707808494568, "learning_rate": 4.944034696996534e-06, "loss": 0.11119295358657837, "memory(GiB)": 40.39, "step": 105, "token_acc": 0.9595238095238096, "train_speed(iter/s)": 0.075066 }, { "epoch": 0.20024271844660194, "grad_norm": 0.2365858554840088, "learning_rate": 4.936026311617316e-06, "loss": 0.11442217826843262, "memory(GiB)": 40.39, "step": 110, "token_acc": 0.9588281868566905, "train_speed(iter/s)": 0.075061 }, { "epoch": 0.20934466019417475, "grad_norm": 0.3145173490047455, "learning_rate": 4.927489985656591e-06, "loss": 0.10322239398956298, "memory(GiB)": 40.39, "step": 115, "token_acc": 0.9658730158730159, "train_speed(iter/s)": 0.074479 }, { "epoch": 0.21844660194174756, "grad_norm": 0.33202633261680603, "learning_rate": 4.918427569124302e-06, "loss": 0.10661822557449341, "memory(GiB)": 40.39, "step": 120, "token_acc": 0.9556259904912837, "train_speed(iter/s)": 0.074637 }, { "epoch": 0.2275485436893204, "grad_norm": 0.3093946874141693, "learning_rate": 4.908841026045809e-06, "loss": 0.10065805912017822, "memory(GiB)": 40.39, "step": 125, "token_acc": 0.9540412044374009, "train_speed(iter/s)": 0.074905 }, { "epoch": 0.2366504854368932, "grad_norm": 0.39363232254981995, "learning_rate": 4.8987324340362445e-06, "loss": 0.114447021484375, "memory(GiB)": 40.39, "step": 130, "token_acc": 0.9571428571428572, "train_speed(iter/s)": 0.075072 }, { "epoch": 0.24575242718446602, "grad_norm": 0.37065446376800537, "learning_rate": 4.888103983850245e-06, "loss": 0.10610785484313964, "memory(GiB)": 40.39, "step": 135, "token_acc": 0.9565217391304348, "train_speed(iter/s)": 0.075167 }, { "epoch": 0.25485436893203883, "grad_norm": 0.542117714881897, "learning_rate": 4.876957978907176e-06, "loss": 0.0954114019870758, "memory(GiB)": 40.39, "step": 140, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 0.075346 }, { "epoch": 0.26395631067961167, "grad_norm": 0.3225058913230896, "learning_rate": 4.865296834791918e-06, "loss": 0.0959049105644226, "memory(GiB)": 40.39, "step": 145, "token_acc": 0.9587955625990491, "train_speed(iter/s)": 0.075467 }, { "epoch": 0.27305825242718446, "grad_norm": 0.3421016037464142, "learning_rate": 4.853123078731363e-06, "loss": 0.09874246120452881, "memory(GiB)": 40.39, "step": 150, "token_acc": 0.9650793650793651, "train_speed(iter/s)": 0.075618 }, { "epoch": 0.2821601941747573, "grad_norm": 0.3102968633174896, "learning_rate": 4.8404393490467085e-06, "loss": 0.09461469650268554, "memory(GiB)": 40.39, "step": 155, "token_acc": 0.9547977795400476, "train_speed(iter/s)": 0.075855 }, { "epoch": 0.2912621359223301, "grad_norm": 0.4729763865470886, "learning_rate": 4.827248394581672e-06, "loss": 0.10038878917694091, "memory(GiB)": 40.39, "step": 160, "token_acc": 0.9650793650793651, "train_speed(iter/s)": 0.075945 }, { "epoch": 0.3003640776699029, "grad_norm": 0.3695836365222931, "learning_rate": 4.813553074106761e-06, "loss": 0.09139147400856018, "memory(GiB)": 40.39, "step": 165, "token_acc": 0.9627279936558287, "train_speed(iter/s)": 0.075756 }, { "epoch": 0.3094660194174757, "grad_norm": 0.47110962867736816, "learning_rate": 4.799356355699708e-06, "loss": 0.09496045112609863, "memory(GiB)": 40.39, "step": 170, "token_acc": 0.9698412698412698, "train_speed(iter/s)": 0.075898 }, { "epoch": 0.31856796116504854, "grad_norm": 0.3773088753223419, "learning_rate": 4.784661316102229e-06, "loss": 0.09658662080764771, "memory(GiB)": 40.4, "step": 175, "token_acc": 0.96513470681458, "train_speed(iter/s)": 0.075914 }, { "epoch": 0.3276699029126214, "grad_norm": 0.3394829034805298, "learning_rate": 4.769471140053221e-06, "loss": 0.08639374971389771, "memory(GiB)": 40.4, "step": 180, "token_acc": 0.969047619047619, "train_speed(iter/s)": 0.076076 }, { "epoch": 0.33677184466019416, "grad_norm": 0.4525506794452667, "learning_rate": 4.753789119598563e-06, "loss": 0.09742268323898315, "memory(GiB)": 40.4, "step": 185, "token_acc": 0.9587301587301588, "train_speed(iter/s)": 0.076177 }, { "epoch": 0.345873786407767, "grad_norm": 0.3789404332637787, "learning_rate": 4.737618653377651e-06, "loss": 0.09391134977340698, "memory(GiB)": 40.4, "step": 190, "token_acc": 0.9651070578905631, "train_speed(iter/s)": 0.07649 }, { "epoch": 0.3549757281553398, "grad_norm": 0.5464370250701904, "learning_rate": 4.720963245886846e-06, "loss": 0.0969527006149292, "memory(GiB)": 40.4, "step": 195, "token_acc": 0.9659270998415214, "train_speed(iter/s)": 0.076513 }, { "epoch": 0.3640776699029126, "grad_norm": 0.3459813892841339, "learning_rate": 4.703826506719964e-06, "loss": 0.08732333183288574, "memory(GiB)": 40.4, "step": 200, "token_acc": 0.96513470681458, "train_speed(iter/s)": 0.076587 }, { "epoch": 0.3731796116504854, "grad_norm": 0.3549191653728485, "learning_rate": 4.686212149786007e-06, "loss": 0.08515737056732178, "memory(GiB)": 40.4, "step": 205, "token_acc": 0.96513470681458, "train_speed(iter/s)": 0.076344 }, { "epoch": 0.38228155339805825, "grad_norm": 0.7434160709381104, "learning_rate": 4.668123992504267e-06, "loss": 0.09526927471160888, "memory(GiB)": 40.4, "step": 210, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 0.076513 }, { "epoch": 0.3913834951456311, "grad_norm": 0.464631587266922, "learning_rate": 4.649565954977015e-06, "loss": 0.09264343380928039, "memory(GiB)": 40.4, "step": 215, "token_acc": 0.9620253164556962, "train_speed(iter/s)": 0.076143 }, { "epoch": 0.40048543689320387, "grad_norm": 0.5145648121833801, "learning_rate": 4.630542059139923e-06, "loss": 0.09688866138458252, "memory(GiB)": 40.4, "step": 220, "token_acc": 0.9667458432304038, "train_speed(iter/s)": 0.076292 }, { "epoch": 0.4095873786407767, "grad_norm": 0.33657485246658325, "learning_rate": 4.611056427890428e-06, "loss": 0.09414277076721192, "memory(GiB)": 40.4, "step": 225, "token_acc": 0.9587301587301588, "train_speed(iter/s)": 0.076275 }, { "epoch": 0.4186893203883495, "grad_norm": 0.47585147619247437, "learning_rate": 4.5911132841942e-06, "loss": 0.08656486272811889, "memory(GiB)": 40.4, "step": 230, "token_acc": 0.9698651863600317, "train_speed(iter/s)": 0.076342 }, { "epoch": 0.42779126213592233, "grad_norm": 0.3516729176044464, "learning_rate": 4.570716950169944e-06, "loss": 0.08657894730567932, "memory(GiB)": 40.4, "step": 235, "token_acc": 0.9642857142857143, "train_speed(iter/s)": 0.076493 }, { "epoch": 0.4368932038834951, "grad_norm": 0.48757559061050415, "learning_rate": 4.5498718461526895e-06, "loss": 0.09453780055046082, "memory(GiB)": 40.4, "step": 240, "token_acc": 0.9643705463182898, "train_speed(iter/s)": 0.07656 }, { "epoch": 0.44599514563106796, "grad_norm": 0.5283713936805725, "learning_rate": 4.528582489735818e-06, "loss": 0.08740494847297668, "memory(GiB)": 40.4, "step": 245, "token_acc": 0.9587628865979382, "train_speed(iter/s)": 0.07663 }, { "epoch": 0.4550970873786408, "grad_norm": 0.3577844500541687, "learning_rate": 4.506853494791992e-06, "loss": 0.08014656901359558, "memory(GiB)": 40.4, "step": 250, "token_acc": 0.971473851030111, "train_speed(iter/s)": 0.076543 }, { "epoch": 0.4641990291262136, "grad_norm": 0.5026013851165771, "learning_rate": 4.484689570473232e-06, "loss": 0.08635783195495605, "memory(GiB)": 40.4, "step": 255, "token_acc": 0.9682791435368755, "train_speed(iter/s)": 0.076578 }, { "epoch": 0.4733009708737864, "grad_norm": 0.45232078433036804, "learning_rate": 4.462095520190336e-06, "loss": 0.08593440055847168, "memory(GiB)": 40.4, "step": 260, "token_acc": 0.9699367088607594, "train_speed(iter/s)": 0.076538 }, { "epoch": 0.4824029126213592, "grad_norm": 0.47390663623809814, "learning_rate": 4.43907624057188e-06, "loss": 0.08747667074203491, "memory(GiB)": 40.4, "step": 265, "token_acc": 0.9619047619047619, "train_speed(iter/s)": 0.076588 }, { "epoch": 0.49150485436893204, "grad_norm": 0.43587085604667664, "learning_rate": 4.415636720403005e-06, "loss": 0.08902972340583801, "memory(GiB)": 40.4, "step": 270, "token_acc": 0.9619349722442506, "train_speed(iter/s)": 0.076484 }, { "epoch": 0.5006067961165048, "grad_norm": 0.41671204566955566, "learning_rate": 4.391782039544239e-06, "loss": 0.08426393270492553, "memory(GiB)": 40.4, "step": 275, "token_acc": 0.9603489294210944, "train_speed(iter/s)": 0.076586 }, { "epoch": 0.5097087378640777, "grad_norm": 0.3852890133857727, "learning_rate": 4.367517367830581e-06, "loss": 0.08224607706069946, "memory(GiB)": 40.4, "step": 280, "token_acc": 0.9730372720063442, "train_speed(iter/s)": 0.0767 }, { "epoch": 0.5188106796116505, "grad_norm": 0.5980095863342285, "learning_rate": 4.342847963951085e-06, "loss": 0.09114923477172851, "memory(GiB)": 40.4, "step": 285, "token_acc": 0.9642857142857143, "train_speed(iter/s)": 0.076804 }, { "epoch": 0.5279126213592233, "grad_norm": 0.5370866656303406, "learning_rate": 4.317779174309179e-06, "loss": 0.09176770448684693, "memory(GiB)": 40.4, "step": 290, "token_acc": 0.9595879556259905, "train_speed(iter/s)": 0.076902 }, { "epoch": 0.5370145631067961, "grad_norm": 0.5857056975364685, "learning_rate": 4.292316431863991e-06, "loss": 0.08232347965240479, "memory(GiB)": 40.4, "step": 295, "token_acc": 0.9635210150674068, "train_speed(iter/s)": 0.076861 }, { "epoch": 0.5461165048543689, "grad_norm": 0.45398032665252686, "learning_rate": 4.2664652549528995e-06, "loss": 0.0860186755657196, "memory(GiB)": 40.4, "step": 300, "token_acc": 0.9603174603174603, "train_speed(iter/s)": 0.076918 }, { "epoch": 0.5552184466019418, "grad_norm": 0.4008013904094696, "learning_rate": 4.240231246095593e-06, "loss": 0.08663930892944335, "memory(GiB)": 40.4, "step": 305, "token_acc": 0.9698651863600317, "train_speed(iter/s)": 0.076723 }, { "epoch": 0.5643203883495146, "grad_norm": 0.6199547052383423, "learning_rate": 4.213620090779877e-06, "loss": 0.08223216533660889, "memory(GiB)": 40.4, "step": 310, "token_acc": 0.9674861221252974, "train_speed(iter/s)": 0.076805 }, { "epoch": 0.5734223300970874, "grad_norm": 0.37448298931121826, "learning_rate": 4.186637556229508e-06, "loss": 0.08296606540679932, "memory(GiB)": 40.4, "step": 315, "token_acc": 0.9666931007137193, "train_speed(iter/s)": 0.076708 }, { "epoch": 0.5825242718446602, "grad_norm": 0.4003507196903229, "learning_rate": 4.159289490154305e-06, "loss": 0.07931501269340516, "memory(GiB)": 40.4, "step": 320, "token_acc": 0.9642857142857143, "train_speed(iter/s)": 0.076845 }, { "epoch": 0.591626213592233, "grad_norm": 0.49439844489097595, "learning_rate": 4.1315818194828196e-06, "loss": 0.08067693710327148, "memory(GiB)": 40.4, "step": 325, "token_acc": 0.9698412698412698, "train_speed(iter/s)": 0.076875 }, { "epoch": 0.6007281553398058, "grad_norm": 0.584017813205719, "learning_rate": 4.1035205490778505e-06, "loss": 0.09277031421661378, "memory(GiB)": 40.4, "step": 330, "token_acc": 0.9595879556259905, "train_speed(iter/s)": 0.076692 }, { "epoch": 0.6098300970873787, "grad_norm": 0.47020280361175537, "learning_rate": 4.075111760435045e-06, "loss": 0.07749168276786804, "memory(GiB)": 40.4, "step": 335, "token_acc": 0.96513470681458, "train_speed(iter/s)": 0.076884 }, { "epoch": 0.6189320388349514, "grad_norm": 0.4876089096069336, "learning_rate": 4.046361610364913e-06, "loss": 0.07796428203582764, "memory(GiB)": 40.4, "step": 340, "token_acc": 0.9691699604743083, "train_speed(iter/s)": 0.076913 }, { "epoch": 0.6280339805825242, "grad_norm": 0.5511714220046997, "learning_rate": 4.017276329658506e-06, "loss": 0.08419817090034484, "memory(GiB)": 40.4, "step": 345, "token_acc": 0.9707278481012658, "train_speed(iter/s)": 0.07696 }, { "epoch": 0.6371359223300971, "grad_norm": 0.5659735798835754, "learning_rate": 3.987862221737072e-06, "loss": 0.0797402322292328, "memory(GiB)": 40.4, "step": 350, "token_acc": 0.9659270998415214, "train_speed(iter/s)": 0.076995 }, { "epoch": 0.6462378640776699, "grad_norm": 0.5157150030136108, "learning_rate": 3.958125661285959e-06, "loss": 0.0838176965713501, "memory(GiB)": 40.4, "step": 355, "token_acc": 0.9690721649484536, "train_speed(iter/s)": 0.076909 }, { "epoch": 0.6553398058252428, "grad_norm": 0.5069080591201782, "learning_rate": 3.928073092873088e-06, "loss": 0.07343612313270569, "memory(GiB)": 40.4, "step": 360, "token_acc": 0.9746233148295004, "train_speed(iter/s)": 0.076991 }, { "epoch": 0.6644417475728155, "grad_norm": 0.49923259019851685, "learning_rate": 3.897711029552264e-06, "loss": 0.07626074552536011, "memory(GiB)": 40.4, "step": 365, "token_acc": 0.9683544303797469, "train_speed(iter/s)": 0.076983 }, { "epoch": 0.6735436893203883, "grad_norm": 0.35883885622024536, "learning_rate": 3.8670460514516615e-06, "loss": 0.08405499458312989, "memory(GiB)": 40.4, "step": 370, "token_acc": 0.9635499207606973, "train_speed(iter/s)": 0.077013 }, { "epoch": 0.6826456310679612, "grad_norm": 0.4520786702632904, "learning_rate": 3.836084804347763e-06, "loss": 0.07998884916305542, "memory(GiB)": 40.4, "step": 375, "token_acc": 0.9698412698412698, "train_speed(iter/s)": 0.07694 }, { "epoch": 0.691747572815534, "grad_norm": 0.47654658555984497, "learning_rate": 3.8048339982250705e-06, "loss": 0.08119775056838989, "memory(GiB)": 40.4, "step": 380, "token_acc": 0.9667194928684627, "train_speed(iter/s)": 0.077002 }, { "epoch": 0.7008495145631068, "grad_norm": 0.5640057325363159, "learning_rate": 3.773300405821908e-06, "loss": 0.08841820359230042, "memory(GiB)": 40.4, "step": 385, "token_acc": 0.9595559080095163, "train_speed(iter/s)": 0.077061 }, { "epoch": 0.7099514563106796, "grad_norm": 0.42381900548934937, "learning_rate": 3.7414908611626162e-06, "loss": 0.08166542053222656, "memory(GiB)": 40.4, "step": 390, "token_acc": 0.969047619047619, "train_speed(iter/s)": 0.077092 }, { "epoch": 0.7190533980582524, "grad_norm": 0.510867714881897, "learning_rate": 3.709412258076471e-06, "loss": 0.08081957101821899, "memory(GiB)": 40.4, "step": 395, "token_acc": 0.9699129057798892, "train_speed(iter/s)": 0.077233 }, { "epoch": 0.7281553398058253, "grad_norm": 0.5211343169212341, "learning_rate": 3.6770715487036413e-06, "loss": 0.08312466740608215, "memory(GiB)": 40.4, "step": 400, "token_acc": 0.9611419508326725, "train_speed(iter/s)": 0.077264 }, { "epoch": 0.7372572815533981, "grad_norm": 0.46672672033309937, "learning_rate": 3.644475741988499e-06, "loss": 0.08163590431213379, "memory(GiB)": 40.4, "step": 405, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 0.07706 }, { "epoch": 0.7463592233009708, "grad_norm": 0.4190872013568878, "learning_rate": 3.6116319021606345e-06, "loss": 0.08278034925460816, "memory(GiB)": 40.4, "step": 410, "token_acc": 0.9603803486529319, "train_speed(iter/s)": 0.077071 }, { "epoch": 0.7554611650485437, "grad_norm": 0.4177815318107605, "learning_rate": 3.5785471472038784e-06, "loss": 0.07709290385246277, "memory(GiB)": 40.4, "step": 415, "token_acc": 0.9714512291831879, "train_speed(iter/s)": 0.077076 }, { "epoch": 0.7645631067961165, "grad_norm": 0.7115554213523865, "learning_rate": 3.545228647313679e-06, "loss": 0.08126543164253235, "memory(GiB)": 40.4, "step": 420, "token_acc": 0.9674861221252974, "train_speed(iter/s)": 0.07706 }, { "epoch": 0.7736650485436893, "grad_norm": 0.43985486030578613, "learning_rate": 3.5116836233431616e-06, "loss": 0.08477982282638549, "memory(GiB)": 40.4, "step": 425, "token_acc": 0.9628164556962026, "train_speed(iter/s)": 0.077154 }, { "epoch": 0.7827669902912622, "grad_norm": 0.48275941610336304, "learning_rate": 3.477919345238213e-06, "loss": 0.07978797554969788, "memory(GiB)": 40.4, "step": 430, "token_acc": 0.9627279936558287, "train_speed(iter/s)": 0.077173 }, { "epoch": 0.7918689320388349, "grad_norm": 0.5005500912666321, "learning_rate": 3.4439431304619207e-06, "loss": 0.07624109983444213, "memory(GiB)": 40.4, "step": 435, "token_acc": 0.9659270998415214, "train_speed(iter/s)": 0.077238 }, { "epoch": 0.8009708737864077, "grad_norm": 0.5146210789680481, "learning_rate": 3.4097623424087196e-06, "loss": 0.080259507894516, "memory(GiB)": 40.4, "step": 440, "token_acc": 0.9706582077716098, "train_speed(iter/s)": 0.077241 }, { "epoch": 0.8100728155339806, "grad_norm": 0.558778703212738, "learning_rate": 3.3753843888085806e-06, "loss": 0.07813260555267335, "memory(GiB)": 40.4, "step": 445, "token_acc": 0.9658730158730159, "train_speed(iter/s)": 0.077226 }, { "epoch": 0.8191747572815534, "grad_norm": 0.574676513671875, "learning_rate": 3.340816720121597e-06, "loss": 0.0761204183101654, "memory(GiB)": 40.4, "step": 450, "token_acc": 0.9691699604743083, "train_speed(iter/s)": 0.077059 }, { "epoch": 0.8282766990291263, "grad_norm": 0.5359216332435608, "learning_rate": 3.3060668279232964e-06, "loss": 0.07063559293746949, "memory(GiB)": 40.4, "step": 455, "token_acc": 0.9746233148295004, "train_speed(iter/s)": 0.077103 }, { "epoch": 0.837378640776699, "grad_norm": 0.5926820635795593, "learning_rate": 3.2711422432810624e-06, "loss": 0.07327613830566407, "memory(GiB)": 40.4, "step": 460, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 0.077136 }, { "epoch": 0.8464805825242718, "grad_norm": 0.4923359155654907, "learning_rate": 3.236050535121976e-06, "loss": 0.0849435031414032, "memory(GiB)": 40.4, "step": 465, "token_acc": 0.9628164556962026, "train_speed(iter/s)": 0.077175 }, { "epoch": 0.8555825242718447, "grad_norm": 0.5079782605171204, "learning_rate": 3.2007993085924694e-06, "loss": 0.07131590843200683, "memory(GiB)": 40.4, "step": 470, "token_acc": 0.9603489294210944, "train_speed(iter/s)": 0.077219 }, { "epoch": 0.8646844660194175, "grad_norm": 0.47359853982925415, "learning_rate": 3.165396203410121e-06, "loss": 0.08230514526367187, "memory(GiB)": 40.4, "step": 475, "token_acc": 0.9603489294210944, "train_speed(iter/s)": 0.077276 }, { "epoch": 0.8737864077669902, "grad_norm": 0.5094448328018188, "learning_rate": 3.1298488922079597e-06, "loss": 0.07572669386863709, "memory(GiB)": 40.4, "step": 480, "token_acc": 0.9683042789223455, "train_speed(iter/s)": 0.077301 }, { "epoch": 0.8828883495145631, "grad_norm": 0.6144260764122009, "learning_rate": 3.094165078871634e-06, "loss": 0.07770437002182007, "memory(GiB)": 40.4, "step": 485, "token_acc": 0.9674603174603175, "train_speed(iter/s)": 0.077291 }, { "epoch": 0.8919902912621359, "grad_norm": 0.7166838049888611, "learning_rate": 3.0583524968698176e-06, "loss": 0.07593016624450684, "memory(GiB)": 40.4, "step": 490, "token_acc": 0.9706582077716098, "train_speed(iter/s)": 0.077337 }, { "epoch": 0.9010922330097088, "grad_norm": 0.5843172073364258, "learning_rate": 3.0224189075781886e-06, "loss": 0.0753251850605011, "memory(GiB)": 40.4, "step": 495, "token_acc": 0.9675889328063241, "train_speed(iter/s)": 0.077398 }, { "epoch": 0.9101941747572816, "grad_norm": 0.4273771643638611, "learning_rate": 2.9863720985973697e-06, "loss": 0.07616569995880126, "memory(GiB)": 40.4, "step": 500, "token_acc": 0.9746031746031746, "train_speed(iter/s)": 0.077368 }, { "epoch": 0.9192961165048543, "grad_norm": 0.5440679788589478, "learning_rate": 2.9502198820651903e-06, "loss": 0.07991842031478882, "memory(GiB)": 40.4, "step": 505, "token_acc": 0.9642857142857143, "train_speed(iter/s)": 0.077195 }, { "epoch": 0.9283980582524272, "grad_norm": 0.6545736789703369, "learning_rate": 2.9139700929636134e-06, "loss": 0.07855194211006164, "memory(GiB)": 40.4, "step": 510, "token_acc": 0.9587301587301588, "train_speed(iter/s)": 0.077178 }, { "epoch": 0.9375, "grad_norm": 0.5470529794692993, "learning_rate": 2.8776305874207305e-06, "loss": 0.07507063150405884, "memory(GiB)": 40.4, "step": 515, "token_acc": 0.9675376088677752, "train_speed(iter/s)": 0.077176 }, { "epoch": 0.9466019417475728, "grad_norm": 0.5262081623077393, "learning_rate": 2.8412092410081645e-06, "loss": 0.08568469285964966, "memory(GiB)": 40.4, "step": 520, "token_acc": 0.9659270998415214, "train_speed(iter/s)": 0.077164 }, { "epoch": 0.9557038834951457, "grad_norm": 0.48101773858070374, "learning_rate": 2.804713947034254e-06, "loss": 0.07408897280693054, "memory(GiB)": 40.4, "step": 525, "token_acc": 0.9715189873417721, "train_speed(iter/s)": 0.077248 }, { "epoch": 0.9648058252427184, "grad_norm": 0.7088754773139954, "learning_rate": 2.7681526148334074e-06, "loss": 0.07859846353530883, "memory(GiB)": 40.4, "step": 530, "token_acc": 0.9651070578905631, "train_speed(iter/s)": 0.077348 }, { "epoch": 0.9739077669902912, "grad_norm": 0.5357980728149414, "learning_rate": 2.73153316805197e-06, "loss": 0.07618768811225891, "memory(GiB)": 40.4, "step": 535, "token_acc": 0.9683042789223455, "train_speed(iter/s)": 0.077388 }, { "epoch": 0.9830097087378641, "grad_norm": 0.4719216823577881, "learning_rate": 2.6948635429309984e-06, "loss": 0.08283294439315796, "memory(GiB)": 40.4, "step": 540, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 0.077404 }, { "epoch": 0.9921116504854369, "grad_norm": 0.4105032980442047, "learning_rate": 2.6581516865863006e-06, "loss": 0.07635112404823304, "memory(GiB)": 40.4, "step": 545, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 0.077461 }, { "epoch": 1.0, "grad_norm": 0.4639950096607208, "learning_rate": 2.6214055552861213e-06, "loss": 0.07352917194366455, "memory(GiB)": 40.4, "step": 550, "token_acc": 0.9652014652014652, "train_speed(iter/s)": 0.077567 }, { "epoch": 1.0091019417475728, "grad_norm": 0.5708960294723511, "learning_rate": 2.5846331127268432e-06, "loss": 0.06939817667007446, "memory(GiB)": 40.4, "step": 555, "token_acc": 0.9746634996041171, "train_speed(iter/s)": 0.077516 }, { "epoch": 1.0182038834951457, "grad_norm": 0.5500112771987915, "learning_rate": 2.5478423283070797e-06, "loss": 0.08004761338233948, "memory(GiB)": 40.4, "step": 560, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 0.077461 }, { "epoch": 1.0273058252427185, "grad_norm": 0.6031087040901184, "learning_rate": 2.5110411754005277e-06, "loss": 0.07369757890701294, "memory(GiB)": 40.4, "step": 565, "token_acc": 0.9675118858954042, "train_speed(iter/s)": 0.077479 }, { "epoch": 1.0364077669902914, "grad_norm": 0.6123142242431641, "learning_rate": 2.4742376296279656e-06, "loss": 0.07673358917236328, "memory(GiB)": 40.4, "step": 570, "token_acc": 0.96513470681458, "train_speed(iter/s)": 0.077492 }, { "epoch": 1.045509708737864, "grad_norm": 0.4750412404537201, "learning_rate": 2.437439667128757e-06, "loss": 0.07482797503471375, "memory(GiB)": 40.4, "step": 575, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.077462 }, { "epoch": 1.0546116504854368, "grad_norm": 0.6936323642730713, "learning_rate": 2.4006552628322495e-06, "loss": 0.07669172286987305, "memory(GiB)": 40.4, "step": 580, "token_acc": 0.9698890649762282, "train_speed(iter/s)": 0.077497 }, { "epoch": 1.0637135922330097, "grad_norm": 0.5415986180305481, "learning_rate": 2.3638923887294252e-06, "loss": 0.07764337062835694, "memory(GiB)": 40.4, "step": 585, "token_acc": 0.9722662440570523, "train_speed(iter/s)": 0.077534 }, { "epoch": 1.0728155339805825, "grad_norm": 0.5562268495559692, "learning_rate": 2.3271590121452034e-06, "loss": 0.07850711941719055, "memory(GiB)": 40.4, "step": 590, "token_acc": 0.9627575277337559, "train_speed(iter/s)": 0.077312 }, { "epoch": 1.0819174757281553, "grad_norm": 0.5438592433929443, "learning_rate": 2.2904630940117383e-06, "loss": 0.07206880450248718, "memory(GiB)": 40.4, "step": 595, "token_acc": 0.9706582077716098, "train_speed(iter/s)": 0.077329 }, { "epoch": 1.0910194174757282, "grad_norm": 0.7570096254348755, "learning_rate": 2.253812587143113e-06, "loss": 0.07922015190124512, "memory(GiB)": 40.4, "step": 600, "token_acc": 0.9675632911392406, "train_speed(iter/s)": 0.077373 }, { "epoch": 1.100121359223301, "grad_norm": 0.44248196482658386, "learning_rate": 2.2172154345117896e-06, "loss": 0.07421438097953796, "memory(GiB)": 40.4, "step": 605, "token_acc": 0.969047619047619, "train_speed(iter/s)": 0.077227 }, { "epoch": 1.1092233009708738, "grad_norm": 0.8693225383758545, "learning_rate": 2.18067956752719e-06, "loss": 0.07179425954818726, "memory(GiB)": 40.4, "step": 610, "token_acc": 0.9738302934179223, "train_speed(iter/s)": 0.077227 }, { "epoch": 1.1183252427184467, "grad_norm": 0.6093197464942932, "learning_rate": 2.1442129043167877e-06, "loss": 0.07261105179786682, "memory(GiB)": 40.4, "step": 615, "token_acc": 0.972244250594766, "train_speed(iter/s)": 0.077265 }, { "epoch": 1.1274271844660193, "grad_norm": 0.47732552886009216, "learning_rate": 2.1078233480100708e-06, "loss": 0.07763968706130982, "memory(GiB)": 40.4, "step": 620, "token_acc": 0.9746233148295004, "train_speed(iter/s)": 0.077083 }, { "epoch": 1.1365291262135921, "grad_norm": 0.6436070799827576, "learning_rate": 2.0715187850257645e-06, "loss": 0.07869491577148438, "memory(GiB)": 40.4, "step": 625, "token_acc": 0.9675632911392406, "train_speed(iter/s)": 0.077031 }, { "epoch": 1.145631067961165, "grad_norm": 0.6669154167175293, "learning_rate": 2.0353070833626684e-06, "loss": 0.07925596237182617, "memory(GiB)": 40.4, "step": 630, "token_acc": 0.964314036478985, "train_speed(iter/s)": 0.077048 }, { "epoch": 1.1547330097087378, "grad_norm": 0.6365996599197388, "learning_rate": 1.999196090894485e-06, "loss": 0.06456078886985779, "memory(GiB)": 40.4, "step": 635, "token_acc": 0.9667194928684627, "train_speed(iter/s)": 0.077101 }, { "epoch": 1.1638349514563107, "grad_norm": 0.5614244341850281, "learning_rate": 1.963193633669018e-06, "loss": 0.07243520021438599, "memory(GiB)": 40.4, "step": 640, "token_acc": 0.9666931007137193, "train_speed(iter/s)": 0.077155 }, { "epoch": 1.1729368932038835, "grad_norm": 0.8191459774971008, "learning_rate": 1.927307514212089e-06, "loss": 0.0762752890586853, "memory(GiB)": 40.4, "step": 645, "token_acc": 0.9698412698412698, "train_speed(iter/s)": 0.077153 }, { "epoch": 1.1820388349514563, "grad_norm": 0.523980438709259, "learning_rate": 1.8915455098365651e-06, "loss": 0.0773351550102234, "memory(GiB)": 40.4, "step": 650, "token_acc": 0.9675118858954042, "train_speed(iter/s)": 0.077211 }, { "epoch": 1.1911407766990292, "grad_norm": 0.5650423169136047, "learning_rate": 1.8559153709568393e-06, "loss": 0.07858687043190002, "memory(GiB)": 40.4, "step": 655, "token_acc": 0.9635499207606973, "train_speed(iter/s)": 0.077253 }, { "epoch": 1.200242718446602, "grad_norm": 0.3905327022075653, "learning_rate": 1.8204248194091429e-06, "loss": 0.07570682168006897, "memory(GiB)": 40.4, "step": 660, "token_acc": 0.9674861221252974, "train_speed(iter/s)": 0.077222 }, { "epoch": 1.2093446601941746, "grad_norm": 0.6456849575042725, "learning_rate": 1.7850815467780616e-06, "loss": 0.06978952884674072, "memory(GiB)": 40.4, "step": 665, "token_acc": 0.976984126984127, "train_speed(iter/s)": 0.077238 }, { "epoch": 1.2184466019417475, "grad_norm": 0.49169182777404785, "learning_rate": 1.7498932127295892e-06, "loss": 0.06932756900787354, "memory(GiB)": 40.4, "step": 670, "token_acc": 0.9674603174603175, "train_speed(iter/s)": 0.077305 }, { "epoch": 1.2275485436893203, "grad_norm": 0.8174545764923096, "learning_rate": 1.7148674433511176e-06, "loss": 0.07247714400291443, "memory(GiB)": 40.4, "step": 675, "token_acc": 0.9785714285714285, "train_speed(iter/s)": 0.077358 }, { "epoch": 1.2366504854368932, "grad_norm": 0.5874563455581665, "learning_rate": 1.6800118294986936e-06, "loss": 0.08156619668006897, "memory(GiB)": 40.4, "step": 680, "token_acc": 0.9619952494061758, "train_speed(iter/s)": 0.077379 }, { "epoch": 1.245752427184466, "grad_norm": 0.7023929357528687, "learning_rate": 1.645333925151908e-06, "loss": 0.0740778088569641, "memory(GiB)": 40.4, "step": 685, "token_acc": 0.9643423137876387, "train_speed(iter/s)": 0.077282 }, { "epoch": 1.2548543689320388, "grad_norm": 0.6284681558609009, "learning_rate": 1.610841245776789e-06, "loss": 0.07937963008880615, "memory(GiB)": 40.4, "step": 690, "token_acc": 0.9682791435368755, "train_speed(iter/s)": 0.077267 }, { "epoch": 1.2639563106796117, "grad_norm": 0.4900761544704437, "learning_rate": 1.5765412666970302e-06, "loss": 0.07481481432914734, "memory(GiB)": 40.4, "step": 695, "token_acc": 0.9714512291831879, "train_speed(iter/s)": 0.077241 }, { "epoch": 1.2730582524271845, "grad_norm": 0.7159978747367859, "learning_rate": 1.5424414214739258e-06, "loss": 0.07213735580444336, "memory(GiB)": 40.4, "step": 700, "token_acc": 0.9738302934179223, "train_speed(iter/s)": 0.077237 }, { "epoch": 1.2821601941747574, "grad_norm": 0.6261754631996155, "learning_rate": 1.5085491002953535e-06, "loss": 0.07179176211357116, "memory(GiB)": 40.4, "step": 705, "token_acc": 0.969047619047619, "train_speed(iter/s)": 0.077083 }, { "epoch": 1.29126213592233, "grad_norm": 0.9063695073127747, "learning_rate": 1.4748716483741562e-06, "loss": 0.07754602432250976, "memory(GiB)": 40.4, "step": 710, "token_acc": 0.96513470681458, "train_speed(iter/s)": 0.077061 }, { "epoch": 1.300364077669903, "grad_norm": 0.6574028134346008, "learning_rate": 1.4414163643562755e-06, "loss": 0.07884335517883301, "memory(GiB)": 40.4, "step": 715, "token_acc": 0.9675376088677752, "train_speed(iter/s)": 0.077069 }, { "epoch": 1.3094660194174756, "grad_norm": 0.5524230599403381, "learning_rate": 1.4081904987389701e-06, "loss": 0.07660083174705505, "memory(GiB)": 40.4, "step": 720, "token_acc": 0.9635210150674068, "train_speed(iter/s)": 0.077072 }, { "epoch": 1.3185679611650485, "grad_norm": 0.5381263494491577, "learning_rate": 1.375201252299479e-06, "loss": 0.07187164425849915, "memory(GiB)": 40.4, "step": 725, "token_acc": 0.9690966719492868, "train_speed(iter/s)": 0.077084 }, { "epoch": 1.3276699029126213, "grad_norm": 0.6094266176223755, "learning_rate": 1.3424557745344508e-06, "loss": 0.07152368426322937, "memory(GiB)": 40.4, "step": 730, "token_acc": 0.9690966719492868, "train_speed(iter/s)": 0.07712 }, { "epoch": 1.3367718446601942, "grad_norm": 0.37662273645401, "learning_rate": 1.3099611621104875e-06, "loss": 0.07852091193199158, "memory(GiB)": 40.4, "step": 735, "token_acc": 0.9698412698412698, "train_speed(iter/s)": 0.077111 }, { "epoch": 1.345873786407767, "grad_norm": 0.8660151958465576, "learning_rate": 1.2777244573261479e-06, "loss": 0.0761515736579895, "memory(GiB)": 40.4, "step": 740, "token_acc": 0.9650793650793651, "train_speed(iter/s)": 0.077083 }, { "epoch": 1.3549757281553398, "grad_norm": 0.8635317087173462, "learning_rate": 1.245752646585719e-06, "loss": 0.07429265975952148, "memory(GiB)": 40.4, "step": 745, "token_acc": 0.9706582077716098, "train_speed(iter/s)": 0.077017 }, { "epoch": 1.3640776699029127, "grad_norm": 0.6921953558921814, "learning_rate": 1.214052658885113e-06, "loss": 0.08055119514465332, "memory(GiB)": 40.4, "step": 750, "token_acc": 0.9659000793021412, "train_speed(iter/s)": 0.07705 }, { "epoch": 1.3731796116504853, "grad_norm": 0.512025773525238, "learning_rate": 1.182631364310199e-06, "loss": 0.07414981126785278, "memory(GiB)": 40.4, "step": 755, "token_acc": 0.9738095238095238, "train_speed(iter/s)": 0.077125 }, { "epoch": 1.3822815533980584, "grad_norm": 0.47374847531318665, "learning_rate": 1.1514955725479057e-06, "loss": 0.07829545140266418, "memory(GiB)": 40.4, "step": 760, "token_acc": 0.9675118858954042, "train_speed(iter/s)": 0.077061 }, { "epoch": 1.391383495145631, "grad_norm": 0.5193628072738647, "learning_rate": 1.1206520314104083e-06, "loss": 0.06979748606681824, "memory(GiB)": 40.4, "step": 765, "token_acc": 0.9730799683293745, "train_speed(iter/s)": 0.077097 }, { "epoch": 1.4004854368932038, "grad_norm": 0.5398116707801819, "learning_rate": 1.0901074253727338e-06, "loss": 0.07316485643386841, "memory(GiB)": 40.4, "step": 770, "token_acc": 0.9674861221252974, "train_speed(iter/s)": 0.077134 }, { "epoch": 1.4095873786407767, "grad_norm": 0.9198482036590576, "learning_rate": 1.0598683741240861e-06, "loss": 0.0778656005859375, "memory(GiB)": 40.4, "step": 775, "token_acc": 0.9714512291831879, "train_speed(iter/s)": 0.077187 }, { "epoch": 1.4186893203883495, "grad_norm": 0.5479600429534912, "learning_rate": 1.0299414311332107e-06, "loss": 0.0758398413658142, "memory(GiB)": 40.4, "step": 780, "token_acc": 0.9706582077716098, "train_speed(iter/s)": 0.077204 }, { "epoch": 1.4277912621359223, "grad_norm": 0.562239944934845, "learning_rate": 1.0003330822281188e-06, "loss": 0.08118345737457275, "memory(GiB)": 40.4, "step": 785, "token_acc": 0.9658730158730159, "train_speed(iter/s)": 0.077197 }, { "epoch": 1.4368932038834952, "grad_norm": 0.608139157295227, "learning_rate": 9.710497441904614e-07, "loss": 0.07277892231941223, "memory(GiB)": 40.4, "step": 790, "token_acc": 0.9739130434782609, "train_speed(iter/s)": 0.077169 }, { "epoch": 1.445995145631068, "grad_norm": 0.6108372807502747, "learning_rate": 9.420977633648739e-07, "loss": 0.0743071436882019, "memory(GiB)": 40.4, "step": 795, "token_acc": 0.9651070578905631, "train_speed(iter/s)": 0.077195 }, { "epoch": 1.4550970873786409, "grad_norm": 0.5900782346725464, "learning_rate": 9.134834142835794e-07, "loss": 0.07513993978500366, "memory(GiB)": 40.4, "step": 800, "token_acc": 0.9738302934179223, "train_speed(iter/s)": 0.07724 }, { "epoch": 1.4641990291262137, "grad_norm": 0.5346866846084595, "learning_rate": 8.852128983065653e-07, "loss": 0.07092651724815369, "memory(GiB)": 40.4, "step": 805, "token_acc": 0.9722662440570523, "train_speed(iter/s)": 0.077133 }, { "epoch": 1.4733009708737863, "grad_norm": 0.504199743270874, "learning_rate": 8.572923422776055e-07, "loss": 0.07900516986846924, "memory(GiB)": 40.4, "step": 810, "token_acc": 0.9524564183835182, "train_speed(iter/s)": 0.077129 }, { "epoch": 1.4824029126213591, "grad_norm": 0.5348660349845886, "learning_rate": 8.297277971964443e-07, "loss": 0.07192928791046142, "memory(GiB)": 40.4, "step": 815, "token_acc": 0.9706349206349206, "train_speed(iter/s)": 0.077153 }, { "epoch": 1.491504854368932, "grad_norm": 0.7142664194107056, "learning_rate": 8.025252369074077e-07, "loss": 0.07966341972351074, "memory(GiB)": 40.4, "step": 820, "token_acc": 0.9714285714285714, "train_speed(iter/s)": 0.077158 }, { "epoch": 1.5006067961165048, "grad_norm": 0.670011579990387, "learning_rate": 7.756905568047393e-07, "loss": 0.07460339069366455, "memory(GiB)": 40.4, "step": 825, "token_acc": 0.9698412698412698, "train_speed(iter/s)": 0.077072 }, { "epoch": 1.5097087378640777, "grad_norm": 0.9091220498085022, "learning_rate": 7.492295725549423e-07, "loss": 0.07916736602783203, "memory(GiB)": 40.4, "step": 830, "token_acc": 0.9714512291831879, "train_speed(iter/s)": 0.077125 }, { "epoch": 1.5188106796116505, "grad_norm": 0.5154448747634888, "learning_rate": 7.231480188363906e-07, "loss": 0.07609822750091552, "memory(GiB)": 40.4, "step": 835, "token_acc": 0.9619047619047619, "train_speed(iter/s)": 0.077151 }, { "epoch": 1.5279126213592233, "grad_norm": 0.5767259001731873, "learning_rate": 6.974515480965038e-07, "loss": 0.07642306089401245, "memory(GiB)": 40.4, "step": 840, "token_acc": 0.9635499207606973, "train_speed(iter/s)": 0.077169 }, { "epoch": 1.537014563106796, "grad_norm": 0.559921145439148, "learning_rate": 6.721457293267344e-07, "loss": 0.07739580273628235, "memory(GiB)": 40.4, "step": 845, "token_acc": 0.9659540775930324, "train_speed(iter/s)": 0.077202 }, { "epoch": 1.546116504854369, "grad_norm": 0.5525022745132446, "learning_rate": 6.472360468556419e-07, "loss": 0.07661284804344178, "memory(GiB)": 40.4, "step": 850, "token_acc": 0.9690966719492868, "train_speed(iter/s)": 0.077223 }, { "epoch": 1.5552184466019416, "grad_norm": 0.7156991958618164, "learning_rate": 6.227278991603239e-07, "loss": 0.07607601881027222, "memory(GiB)": 40.4, "step": 855, "token_acc": 0.9738924050632911, "train_speed(iter/s)": 0.077263 }, { "epoch": 1.5643203883495147, "grad_norm": 0.578790009021759, "learning_rate": 5.986265976964412e-07, "loss": 0.07703717947006225, "memory(GiB)": 40.4, "step": 860, "token_acc": 0.9627575277337559, "train_speed(iter/s)": 0.077321 }, { "epoch": 1.5734223300970873, "grad_norm": 0.41067153215408325, "learning_rate": 5.749373657471127e-07, "loss": 0.07262166738510131, "memory(GiB)": 40.4, "step": 865, "token_acc": 0.9666931007137193, "train_speed(iter/s)": 0.077313 }, { "epoch": 1.5825242718446602, "grad_norm": 0.6594594120979309, "learning_rate": 5.516653372909142e-07, "loss": 0.07546203732490539, "memory(GiB)": 40.4, "step": 870, "token_acc": 0.9730799683293745, "train_speed(iter/s)": 0.077321 }, { "epoch": 1.591626213592233, "grad_norm": 0.6693688035011292, "learning_rate": 5.28815555889228e-07, "loss": 0.07242462635040284, "memory(GiB)": 40.4, "step": 875, "token_acc": 0.9714964370546318, "train_speed(iter/s)": 0.077315 }, { "epoch": 1.6007281553398058, "grad_norm": 0.5314414501190186, "learning_rate": 5.063929735931985e-07, "loss": 0.07621661424636841, "memory(GiB)": 40.4, "step": 880, "token_acc": 0.9746634996041171, "train_speed(iter/s)": 0.077305 }, { "epoch": 1.6098300970873787, "grad_norm": 0.39022502303123474, "learning_rate": 4.844024498705072e-07, "loss": 0.07379111647605896, "memory(GiB)": 40.4, "step": 885, "token_acc": 0.9770023790642347, "train_speed(iter/s)": 0.077319 }, { "epoch": 1.6189320388349513, "grad_norm": 0.5611955523490906, "learning_rate": 4.6284875055222415e-07, "loss": 0.07641223073005676, "memory(GiB)": 40.4, "step": 890, "token_acc": 0.969047619047619, "train_speed(iter/s)": 0.07736 }, { "epoch": 1.6280339805825244, "grad_norm": 0.5914463996887207, "learning_rate": 4.4173654679994543e-07, "loss": 0.07118785977363587, "memory(GiB)": 40.4, "step": 895, "token_acc": 0.9666931007137193, "train_speed(iter/s)": 0.077387 }, { "epoch": 1.637135922330097, "grad_norm": 0.6131768226623535, "learning_rate": 4.2107041409344686e-07, "loss": 0.06656063199043274, "memory(GiB)": 40.4, "step": 900, "token_acc": 0.9730586370839936, "train_speed(iter/s)": 0.077393 }, { "epoch": 1.64623786407767, "grad_norm": 0.6083477139472961, "learning_rate": 4.00854831239082e-07, "loss": 0.07548041343688965, "memory(GiB)": 40.4, "step": 905, "token_acc": 0.9706814580031695, "train_speed(iter/s)": 0.07732 }, { "epoch": 1.6553398058252426, "grad_norm": 0.5123993158340454, "learning_rate": 3.8109417939912044e-07, "loss": 0.07632001638412475, "memory(GiB)": 40.4, "step": 910, "token_acc": 0.9651070578905631, "train_speed(iter/s)": 0.07734 }, { "epoch": 1.6644417475728155, "grad_norm": 0.6305170655250549, "learning_rate": 3.617927411422584e-07, "loss": 0.07312512397766113, "memory(GiB)": 40.4, "step": 915, "token_acc": 0.9675376088677752, "train_speed(iter/s)": 0.077345 }, { "epoch": 1.6735436893203883, "grad_norm": 0.5339434742927551, "learning_rate": 3.4295469951548894e-07, "loss": 0.06849889755249024, "memory(GiB)": 40.4, "step": 920, "token_acc": 0.9674861221252974, "train_speed(iter/s)": 0.077349 }, { "epoch": 1.6826456310679612, "grad_norm": 0.532629132270813, "learning_rate": 3.24584137137543e-07, "loss": 0.07681695818901062, "memory(GiB)": 40.4, "step": 925, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.077356 }, { "epoch": 1.691747572815534, "grad_norm": 0.4466962516307831, "learning_rate": 3.0668503531409876e-07, "loss": 0.06994915008544922, "memory(GiB)": 40.4, "step": 930, "token_acc": 0.9714964370546318, "train_speed(iter/s)": 0.077371 }, { "epoch": 1.7008495145631068, "grad_norm": 0.586765706539154, "learning_rate": 2.892612731749414e-07, "loss": 0.07494070529937744, "memory(GiB)": 40.4, "step": 935, "token_acc": 0.969047619047619, "train_speed(iter/s)": 0.077342 }, { "epoch": 1.7099514563106797, "grad_norm": 0.5412377715110779, "learning_rate": 2.723166268332733e-07, "loss": 0.07770473957061767, "memory(GiB)": 40.4, "step": 940, "token_acc": 0.9676145339652449, "train_speed(iter/s)": 0.077329 }, { "epoch": 1.7190533980582523, "grad_norm": 0.911586582660675, "learning_rate": 2.55854768567346e-07, "loss": 0.07914371490478515, "memory(GiB)": 40.4, "step": 945, "token_acc": 0.9674861221252974, "train_speed(iter/s)": 0.077298 }, { "epoch": 1.7281553398058254, "grad_norm": 0.6137750148773193, "learning_rate": 2.3987926602459465e-07, "loss": 0.08327807188034057, "memory(GiB)": 40.4, "step": 950, "token_acc": 0.9706349206349206, "train_speed(iter/s)": 0.077305 }, { "epoch": 1.737257281553398, "grad_norm": 0.576627790927887, "learning_rate": 2.2439358144845464e-07, "loss": 0.08012324571609497, "memory(GiB)": 40.4, "step": 955, "token_acc": 0.9643423137876387, "train_speed(iter/s)": 0.077328 }, { "epoch": 1.7463592233009708, "grad_norm": 0.6456671953201294, "learning_rate": 2.09401070928012e-07, "loss": 0.06627861261367798, "memory(GiB)": 40.4, "step": 960, "token_acc": 0.9714285714285714, "train_speed(iter/s)": 0.077243 }, { "epoch": 1.7554611650485437, "grad_norm": 0.6002473831176758, "learning_rate": 1.9490498367066817e-07, "loss": 0.071403968334198, "memory(GiB)": 40.4, "step": 965, "token_acc": 0.9682791435368755, "train_speed(iter/s)": 0.077258 }, { "epoch": 1.7645631067961165, "grad_norm": 0.7518230080604553, "learning_rate": 1.8090846129796586e-07, "loss": 0.07573525905609131, "memory(GiB)": 40.4, "step": 970, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.077252 }, { "epoch": 1.7736650485436893, "grad_norm": 0.41464531421661377, "learning_rate": 1.6741453716472677e-07, "loss": 0.07870721817016602, "memory(GiB)": 40.4, "step": 975, "token_acc": 0.9627870150435471, "train_speed(iter/s)": 0.077259 }, { "epoch": 1.7827669902912622, "grad_norm": 0.7254371643066406, "learning_rate": 1.5442613570165993e-07, "loss": 0.08646805882453919, "memory(GiB)": 40.4, "step": 980, "token_acc": 0.9611419508326725, "train_speed(iter/s)": 0.077274 }, { "epoch": 1.791868932038835, "grad_norm": 0.7164713740348816, "learning_rate": 1.4194607178157237e-07, "loss": 0.07055433988571166, "memory(GiB)": 40.4, "step": 985, "token_acc": 0.9706349206349206, "train_speed(iter/s)": 0.077341 }, { "epoch": 1.8009708737864076, "grad_norm": 0.5821430087089539, "learning_rate": 1.2997705010932394e-07, "loss": 0.07743188142776489, "memory(GiB)": 40.4, "step": 990, "token_acc": 0.9674861221252974, "train_speed(iter/s)": 0.077362 }, { "epoch": 1.8100728155339807, "grad_norm": 0.766345739364624, "learning_rate": 1.1852166463565767e-07, "loss": 0.07668507099151611, "memory(GiB)": 40.4, "step": 995, "token_acc": 0.9770023790642347, "train_speed(iter/s)": 0.077362 }, { "epoch": 1.8191747572815533, "grad_norm": 0.5379170179367065, "learning_rate": 1.0758239799503412e-07, "loss": 0.06778880357742309, "memory(GiB)": 40.4, "step": 1000, "token_acc": 0.9746233148295004, "train_speed(iter/s)": 0.077358 }, { "epoch": 1.8282766990291264, "grad_norm": 0.587326169013977, "learning_rate": 9.716162096759019e-08, "loss": 0.07784827947616577, "memory(GiB)": 40.4, "step": 1005, "token_acc": 0.9770206022187005, "train_speed(iter/s)": 0.077289 }, { "epoch": 1.837378640776699, "grad_norm": 0.5790999531745911, "learning_rate": 8.726159196533718e-08, "loss": 0.07364106178283691, "memory(GiB)": 40.4, "step": 1010, "token_acc": 0.9730372720063442, "train_speed(iter/s)": 0.077306 }, { "epoch": 1.8464805825242718, "grad_norm": 0.5765237808227539, "learning_rate": 7.788445654271532e-08, "loss": 0.07042239308357238, "memory(GiB)": 40.4, "step": 1015, "token_acc": 0.9682539682539683, "train_speed(iter/s)": 0.077338 }, { "epoch": 1.8555825242718447, "grad_norm": 0.4627252221107483, "learning_rate": 6.903224693160348e-08, "loss": 0.06837155222892762, "memory(GiB)": 40.4, "step": 1020, "token_acc": 0.9754358161648178, "train_speed(iter/s)": 0.077366 }, { "epoch": 1.8646844660194175, "grad_norm": 0.5963551998138428, "learning_rate": 6.070688160088961e-08, "loss": 0.0674078106880188, "memory(GiB)": 40.4, "step": 1025, "token_acc": 0.9659270998415214, "train_speed(iter/s)": 0.077385 }, { "epoch": 1.8737864077669903, "grad_norm": 0.6391610503196716, "learning_rate": 5.291016484069683e-08, "loss": 0.07277075052261353, "memory(GiB)": 40.4, "step": 1030, "token_acc": 0.9659540775930324, "train_speed(iter/s)": 0.077401 }, { "epoch": 1.882888349514563, "grad_norm": 0.5019727945327759, "learning_rate": 4.564378637135408e-08, "loss": 0.0752260446548462, "memory(GiB)": 40.4, "step": 1035, "token_acc": 0.9682791435368755, "train_speed(iter/s)": 0.077434 }, { "epoch": 1.891990291262136, "grad_norm": 0.4186345040798187, "learning_rate": 3.890932097719624e-08, "loss": 0.06725120544433594, "memory(GiB)": 40.4, "step": 1040, "token_acc": 0.9730799683293745, "train_speed(iter/s)": 0.077451 }, { "epoch": 1.9010922330097086, "grad_norm": 0.6359046697616577, "learning_rate": 3.270822816527325e-08, "loss": 0.07682465314865113, "memory(GiB)": 40.4, "step": 1045, "token_acc": 0.969047619047619, "train_speed(iter/s)": 0.077498 }, { "epoch": 1.9101941747572817, "grad_norm": 0.5813617706298828, "learning_rate": 2.7041851849043678e-08, "loss": 0.0773731827735901, "memory(GiB)": 40.4, "step": 1050, "token_acc": 0.9674861221252974, "train_speed(iter/s)": 0.077486 }, { "epoch": 1.9192961165048543, "grad_norm": 0.4645262062549591, "learning_rate": 2.1911420057117994e-08, "loss": 0.07277056574821472, "memory(GiB)": 40.4, "step": 1055, "token_acc": 0.9690721649484536, "train_speed(iter/s)": 0.077447 }, { "epoch": 1.9283980582524272, "grad_norm": 0.8828046917915344, "learning_rate": 1.7318044667119226e-08, "loss": 0.07312785387039185, "memory(GiB)": 40.4, "step": 1060, "token_acc": 0.9675118858954042, "train_speed(iter/s)": 0.077476 }, { "epoch": 1.9375, "grad_norm": 0.8438335657119751, "learning_rate": 1.3262721164712667e-08, "loss": 0.07410634756088257, "memory(GiB)": 40.4, "step": 1065, "token_acc": 0.9698651863600317, "train_speed(iter/s)": 0.077482 }, { "epoch": 1.9466019417475728, "grad_norm": 0.6822603344917297, "learning_rate": 9.746328427863993e-09, "loss": 0.0720213532447815, "memory(GiB)": 40.4, "step": 1070, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 0.077488 }, { "epoch": 1.9557038834951457, "grad_norm": 0.5685479640960693, "learning_rate": 6.769628536364981e-09, "loss": 0.07333976030349731, "memory(GiB)": 40.4, "step": 1075, "token_acc": 0.973015873015873, "train_speed(iter/s)": 0.077502 }, { "epoch": 1.9648058252427183, "grad_norm": 0.5445531606674194, "learning_rate": 4.333266606676711e-09, "loss": 0.07253679037094116, "memory(GiB)": 40.4, "step": 1080, "token_acc": 0.9730586370839936, "train_speed(iter/s)": 0.077494 }, { "epoch": 1.9739077669902914, "grad_norm": 0.6113319993019104, "learning_rate": 2.4377706521164224e-09, "loss": 0.07309662699699401, "memory(GiB)": 40.4, "step": 1085, "token_acc": 0.9722222222222222, "train_speed(iter/s)": 0.077475 }, { "epoch": 1.983009708737864, "grad_norm": 0.5483999252319336, "learning_rate": 1.0835514684262583e-09, "loss": 0.07428893446922302, "memory(GiB)": 40.4, "step": 1090, "token_acc": 0.9690966719492868, "train_speed(iter/s)": 0.077464 }, { "epoch": 1.992111650485437, "grad_norm": 0.6084752082824707, "learning_rate": 2.7090254474421154e-10, "loss": 0.07023123502731324, "memory(GiB)": 40.4, "step": 1095, "token_acc": 0.9786223277909739, "train_speed(iter/s)": 0.077453 }, { "epoch": 2.0, "grad_norm": 0.5853410363197327, "learning_rate": 0.0, "loss": 0.0724187433719635, "memory(GiB)": 40.4, "step": 1100, "token_acc": 0.9679780420860018, "train_speed(iter/s)": 0.077495 } ], "logging_steps": 5, "max_steps": 1100, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.488531281539498e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }