| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.0, | |
| "eval_steps": 100.0, | |
| "global_step": 1100, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0018203883495145632, | |
| "grad_norm": 3.6015546321868896, | |
| "learning_rate": 1.5151515151515152e-07, | |
| "loss": 0.5787061452865601, | |
| "memory(GiB)": 40.37, | |
| "step": 1, | |
| "token_acc": 0.8690476190476191, | |
| "train_speed(iter/s)": 0.029162 | |
| }, | |
| { | |
| "epoch": 0.009101941747572815, | |
| "grad_norm": 3.435741424560547, | |
| "learning_rate": 7.575757575757576e-07, | |
| "loss": 0.5881168842315674, | |
| "memory(GiB)": 40.37, | |
| "step": 5, | |
| "token_acc": 0.8701684836471755, | |
| "train_speed(iter/s)": 0.06174 | |
| }, | |
| { | |
| "epoch": 0.01820388349514563, | |
| "grad_norm": 3.448568105697632, | |
| "learning_rate": 1.5151515151515152e-06, | |
| "loss": 0.5851926326751709, | |
| "memory(GiB)": 40.39, | |
| "step": 10, | |
| "token_acc": 0.8622327790973872, | |
| "train_speed(iter/s)": 0.067399 | |
| }, | |
| { | |
| "epoch": 0.027305825242718445, | |
| "grad_norm": 3.405535936355591, | |
| "learning_rate": 2.2727272727272728e-06, | |
| "loss": 0.6001698970794678, | |
| "memory(GiB)": 40.39, | |
| "step": 15, | |
| "token_acc": 0.8716323296354992, | |
| "train_speed(iter/s)": 0.068625 | |
| }, | |
| { | |
| "epoch": 0.03640776699029126, | |
| "grad_norm": 3.6892027854919434, | |
| "learning_rate": 3.0303030303030305e-06, | |
| "loss": 0.5676679611206055, | |
| "memory(GiB)": 40.39, | |
| "step": 20, | |
| "token_acc": 0.8715305313243458, | |
| "train_speed(iter/s)": 0.069219 | |
| }, | |
| { | |
| "epoch": 0.04550970873786408, | |
| "grad_norm": 3.9115183353424072, | |
| "learning_rate": 3.7878787878787882e-06, | |
| "loss": 0.5411659717559815, | |
| "memory(GiB)": 40.39, | |
| "step": 25, | |
| "token_acc": 0.8685669041963578, | |
| "train_speed(iter/s)": 0.071459 | |
| }, | |
| { | |
| "epoch": 0.05461165048543689, | |
| "grad_norm": 3.357640027999878, | |
| "learning_rate": 4.5454545454545455e-06, | |
| "loss": 0.4885613441467285, | |
| "memory(GiB)": 40.39, | |
| "step": 30, | |
| "token_acc": 0.8682539682539683, | |
| "train_speed(iter/s)": 0.072567 | |
| }, | |
| { | |
| "epoch": 0.06371359223300971, | |
| "grad_norm": 3.3015964031219482, | |
| "learning_rate": 4.999956654935265e-06, | |
| "loss": 0.4215705871582031, | |
| "memory(GiB)": 40.39, | |
| "step": 35, | |
| "token_acc": 0.8692551505546752, | |
| "train_speed(iter/s)": 0.07282 | |
| }, | |
| { | |
| "epoch": 0.07281553398058252, | |
| "grad_norm": 2.447498321533203, | |
| "learning_rate": 4.999469040218251e-06, | |
| "loss": 0.2957149982452393, | |
| "memory(GiB)": 40.39, | |
| "step": 40, | |
| "token_acc": 0.8690476190476191, | |
| "train_speed(iter/s)": 0.074291 | |
| }, | |
| { | |
| "epoch": 0.08191747572815535, | |
| "grad_norm": 0.8801060914993286, | |
| "learning_rate": 4.9984397354824345e-06, | |
| "loss": 0.21340658664703369, | |
| "memory(GiB)": 40.39, | |
| "step": 45, | |
| "token_acc": 0.9135606661379857, | |
| "train_speed(iter/s)": 0.074508 | |
| }, | |
| { | |
| "epoch": 0.09101941747572816, | |
| "grad_norm": 0.7226303815841675, | |
| "learning_rate": 4.996868963800831e-06, | |
| "loss": 0.1777859926223755, | |
| "memory(GiB)": 40.39, | |
| "step": 50, | |
| "token_acc": 0.9239904988123515, | |
| "train_speed(iter/s)": 0.075304 | |
| }, | |
| { | |
| "epoch": 0.10012135922330097, | |
| "grad_norm": 0.7329442501068115, | |
| "learning_rate": 4.99475706559428e-06, | |
| "loss": 0.17081427574157715, | |
| "memory(GiB)": 40.39, | |
| "step": 55, | |
| "token_acc": 0.9238699444885012, | |
| "train_speed(iter/s)": 0.075664 | |
| }, | |
| { | |
| "epoch": 0.10922330097087378, | |
| "grad_norm": 0.48636820912361145, | |
| "learning_rate": 4.992104498557657e-06, | |
| "loss": 0.15634163618087768, | |
| "memory(GiB)": 40.39, | |
| "step": 60, | |
| "token_acc": 0.9262490087232356, | |
| "train_speed(iter/s)": 0.076641 | |
| }, | |
| { | |
| "epoch": 0.1183252427184466, | |
| "grad_norm": 0.44267499446868896, | |
| "learning_rate": 4.988911837560691e-06, | |
| "loss": 0.1444383144378662, | |
| "memory(GiB)": 40.39, | |
| "step": 65, | |
| "token_acc": 0.9350237717908082, | |
| "train_speed(iter/s)": 0.074923 | |
| }, | |
| { | |
| "epoch": 0.12742718446601942, | |
| "grad_norm": 0.4311356544494629, | |
| "learning_rate": 4.985179774523375e-06, | |
| "loss": 0.14677078723907472, | |
| "memory(GiB)": 40.39, | |
| "step": 70, | |
| "token_acc": 0.9444444444444444, | |
| "train_speed(iter/s)": 0.075087 | |
| }, | |
| { | |
| "epoch": 0.13652912621359223, | |
| "grad_norm": 0.3981742858886719, | |
| "learning_rate": 4.980909118266006e-06, | |
| "loss": 0.13511970043182372, | |
| "memory(GiB)": 40.39, | |
| "step": 75, | |
| "token_acc": 0.9484944532488114, | |
| "train_speed(iter/s)": 0.074414 | |
| }, | |
| { | |
| "epoch": 0.14563106796116504, | |
| "grad_norm": 0.4317033290863037, | |
| "learning_rate": 4.976100794333903e-06, | |
| "loss": 0.12185637950897217, | |
| "memory(GiB)": 40.39, | |
| "step": 80, | |
| "token_acc": 0.9627279936558287, | |
| "train_speed(iter/s)": 0.074464 | |
| }, | |
| { | |
| "epoch": 0.15473300970873785, | |
| "grad_norm": 0.3179706335067749, | |
| "learning_rate": 4.970755844796817e-06, | |
| "loss": 0.12840776443481444, | |
| "memory(GiB)": 40.39, | |
| "step": 85, | |
| "token_acc": 0.9492063492063492, | |
| "train_speed(iter/s)": 0.074996 | |
| }, | |
| { | |
| "epoch": 0.1638349514563107, | |
| "grad_norm": 0.3189823031425476, | |
| "learning_rate": 4.964875428023093e-06, | |
| "loss": 0.12376663684844971, | |
| "memory(GiB)": 40.39, | |
| "step": 90, | |
| "token_acc": 0.957936507936508, | |
| "train_speed(iter/s)": 0.075144 | |
| }, | |
| { | |
| "epoch": 0.1729368932038835, | |
| "grad_norm": 0.33377909660339355, | |
| "learning_rate": 4.958460818428627e-06, | |
| "loss": 0.11574116945266724, | |
| "memory(GiB)": 40.39, | |
| "step": 95, | |
| "token_acc": 0.9563492063492064, | |
| "train_speed(iter/s)": 0.075617 | |
| }, | |
| { | |
| "epoch": 0.1820388349514563, | |
| "grad_norm": 0.4928111433982849, | |
| "learning_rate": 4.951513406200667e-06, | |
| "loss": 0.1149444341659546, | |
| "memory(GiB)": 40.39, | |
| "step": 100, | |
| "token_acc": 0.9508716323296355, | |
| "train_speed(iter/s)": 0.075828 | |
| }, | |
| { | |
| "epoch": 0.19114077669902912, | |
| "grad_norm": 0.3134707808494568, | |
| "learning_rate": 4.944034696996534e-06, | |
| "loss": 0.11119295358657837, | |
| "memory(GiB)": 40.39, | |
| "step": 105, | |
| "token_acc": 0.9595238095238096, | |
| "train_speed(iter/s)": 0.075066 | |
| }, | |
| { | |
| "epoch": 0.20024271844660194, | |
| "grad_norm": 0.2365858554840088, | |
| "learning_rate": 4.936026311617316e-06, | |
| "loss": 0.11442217826843262, | |
| "memory(GiB)": 40.39, | |
| "step": 110, | |
| "token_acc": 0.9588281868566905, | |
| "train_speed(iter/s)": 0.075061 | |
| }, | |
| { | |
| "epoch": 0.20934466019417475, | |
| "grad_norm": 0.3145173490047455, | |
| "learning_rate": 4.927489985656591e-06, | |
| "loss": 0.10322239398956298, | |
| "memory(GiB)": 40.39, | |
| "step": 115, | |
| "token_acc": 0.9658730158730159, | |
| "train_speed(iter/s)": 0.074479 | |
| }, | |
| { | |
| "epoch": 0.21844660194174756, | |
| "grad_norm": 0.33202633261680603, | |
| "learning_rate": 4.918427569124302e-06, | |
| "loss": 0.10661822557449341, | |
| "memory(GiB)": 40.39, | |
| "step": 120, | |
| "token_acc": 0.9556259904912837, | |
| "train_speed(iter/s)": 0.074637 | |
| }, | |
| { | |
| "epoch": 0.2275485436893204, | |
| "grad_norm": 0.3093946874141693, | |
| "learning_rate": 4.908841026045809e-06, | |
| "loss": 0.10065805912017822, | |
| "memory(GiB)": 40.39, | |
| "step": 125, | |
| "token_acc": 0.9540412044374009, | |
| "train_speed(iter/s)": 0.074905 | |
| }, | |
| { | |
| "epoch": 0.2366504854368932, | |
| "grad_norm": 0.39363232254981995, | |
| "learning_rate": 4.8987324340362445e-06, | |
| "loss": 0.114447021484375, | |
| "memory(GiB)": 40.39, | |
| "step": 130, | |
| "token_acc": 0.9571428571428572, | |
| "train_speed(iter/s)": 0.075072 | |
| }, | |
| { | |
| "epoch": 0.24575242718446602, | |
| "grad_norm": 0.37065446376800537, | |
| "learning_rate": 4.888103983850245e-06, | |
| "loss": 0.10610785484313964, | |
| "memory(GiB)": 40.39, | |
| "step": 135, | |
| "token_acc": 0.9565217391304348, | |
| "train_speed(iter/s)": 0.075167 | |
| }, | |
| { | |
| "epoch": 0.25485436893203883, | |
| "grad_norm": 0.542117714881897, | |
| "learning_rate": 4.876957978907176e-06, | |
| "loss": 0.0954114019870758, | |
| "memory(GiB)": 40.39, | |
| "step": 140, | |
| "token_acc": 0.9666666666666667, | |
| "train_speed(iter/s)": 0.075346 | |
| }, | |
| { | |
| "epoch": 0.26395631067961167, | |
| "grad_norm": 0.3225058913230896, | |
| "learning_rate": 4.865296834791918e-06, | |
| "loss": 0.0959049105644226, | |
| "memory(GiB)": 40.39, | |
| "step": 145, | |
| "token_acc": 0.9587955625990491, | |
| "train_speed(iter/s)": 0.075467 | |
| }, | |
| { | |
| "epoch": 0.27305825242718446, | |
| "grad_norm": 0.3421016037464142, | |
| "learning_rate": 4.853123078731363e-06, | |
| "loss": 0.09874246120452881, | |
| "memory(GiB)": 40.39, | |
| "step": 150, | |
| "token_acc": 0.9650793650793651, | |
| "train_speed(iter/s)": 0.075618 | |
| }, | |
| { | |
| "epoch": 0.2821601941747573, | |
| "grad_norm": 0.3102968633174896, | |
| "learning_rate": 4.8404393490467085e-06, | |
| "loss": 0.09461469650268554, | |
| "memory(GiB)": 40.39, | |
| "step": 155, | |
| "token_acc": 0.9547977795400476, | |
| "train_speed(iter/s)": 0.075855 | |
| }, | |
| { | |
| "epoch": 0.2912621359223301, | |
| "grad_norm": 0.4729763865470886, | |
| "learning_rate": 4.827248394581672e-06, | |
| "loss": 0.10038878917694091, | |
| "memory(GiB)": 40.39, | |
| "step": 160, | |
| "token_acc": 0.9650793650793651, | |
| "train_speed(iter/s)": 0.075945 | |
| }, | |
| { | |
| "epoch": 0.3003640776699029, | |
| "grad_norm": 0.3695836365222931, | |
| "learning_rate": 4.813553074106761e-06, | |
| "loss": 0.09139147400856018, | |
| "memory(GiB)": 40.39, | |
| "step": 165, | |
| "token_acc": 0.9627279936558287, | |
| "train_speed(iter/s)": 0.075756 | |
| }, | |
| { | |
| "epoch": 0.3094660194174757, | |
| "grad_norm": 0.47110962867736816, | |
| "learning_rate": 4.799356355699708e-06, | |
| "loss": 0.09496045112609863, | |
| "memory(GiB)": 40.39, | |
| "step": 170, | |
| "token_acc": 0.9698412698412698, | |
| "train_speed(iter/s)": 0.075898 | |
| }, | |
| { | |
| "epoch": 0.31856796116504854, | |
| "grad_norm": 0.3773088753223419, | |
| "learning_rate": 4.784661316102229e-06, | |
| "loss": 0.09658662080764771, | |
| "memory(GiB)": 40.4, | |
| "step": 175, | |
| "token_acc": 0.96513470681458, | |
| "train_speed(iter/s)": 0.075914 | |
| }, | |
| { | |
| "epoch": 0.3276699029126214, | |
| "grad_norm": 0.3394829034805298, | |
| "learning_rate": 4.769471140053221e-06, | |
| "loss": 0.08639374971389771, | |
| "memory(GiB)": 40.4, | |
| "step": 180, | |
| "token_acc": 0.969047619047619, | |
| "train_speed(iter/s)": 0.076076 | |
| }, | |
| { | |
| "epoch": 0.33677184466019416, | |
| "grad_norm": 0.4525506794452667, | |
| "learning_rate": 4.753789119598563e-06, | |
| "loss": 0.09742268323898315, | |
| "memory(GiB)": 40.4, | |
| "step": 185, | |
| "token_acc": 0.9587301587301588, | |
| "train_speed(iter/s)": 0.076177 | |
| }, | |
| { | |
| "epoch": 0.345873786407767, | |
| "grad_norm": 0.3789404332637787, | |
| "learning_rate": 4.737618653377651e-06, | |
| "loss": 0.09391134977340698, | |
| "memory(GiB)": 40.4, | |
| "step": 190, | |
| "token_acc": 0.9651070578905631, | |
| "train_speed(iter/s)": 0.07649 | |
| }, | |
| { | |
| "epoch": 0.3549757281553398, | |
| "grad_norm": 0.5464370250701904, | |
| "learning_rate": 4.720963245886846e-06, | |
| "loss": 0.0969527006149292, | |
| "memory(GiB)": 40.4, | |
| "step": 195, | |
| "token_acc": 0.9659270998415214, | |
| "train_speed(iter/s)": 0.076513 | |
| }, | |
| { | |
| "epoch": 0.3640776699029126, | |
| "grad_norm": 0.3459813892841339, | |
| "learning_rate": 4.703826506719964e-06, | |
| "loss": 0.08732333183288574, | |
| "memory(GiB)": 40.4, | |
| "step": 200, | |
| "token_acc": 0.96513470681458, | |
| "train_speed(iter/s)": 0.076587 | |
| }, | |
| { | |
| "epoch": 0.3731796116504854, | |
| "grad_norm": 0.3549191653728485, | |
| "learning_rate": 4.686212149786007e-06, | |
| "loss": 0.08515737056732178, | |
| "memory(GiB)": 40.4, | |
| "step": 205, | |
| "token_acc": 0.96513470681458, | |
| "train_speed(iter/s)": 0.076344 | |
| }, | |
| { | |
| "epoch": 0.38228155339805825, | |
| "grad_norm": 0.7434160709381104, | |
| "learning_rate": 4.668123992504267e-06, | |
| "loss": 0.09526927471160888, | |
| "memory(GiB)": 40.4, | |
| "step": 210, | |
| "token_acc": 0.9666666666666667, | |
| "train_speed(iter/s)": 0.076513 | |
| }, | |
| { | |
| "epoch": 0.3913834951456311, | |
| "grad_norm": 0.464631587266922, | |
| "learning_rate": 4.649565954977015e-06, | |
| "loss": 0.09264343380928039, | |
| "memory(GiB)": 40.4, | |
| "step": 215, | |
| "token_acc": 0.9620253164556962, | |
| "train_speed(iter/s)": 0.076143 | |
| }, | |
| { | |
| "epoch": 0.40048543689320387, | |
| "grad_norm": 0.5145648121833801, | |
| "learning_rate": 4.630542059139923e-06, | |
| "loss": 0.09688866138458252, | |
| "memory(GiB)": 40.4, | |
| "step": 220, | |
| "token_acc": 0.9667458432304038, | |
| "train_speed(iter/s)": 0.076292 | |
| }, | |
| { | |
| "epoch": 0.4095873786407767, | |
| "grad_norm": 0.33657485246658325, | |
| "learning_rate": 4.611056427890428e-06, | |
| "loss": 0.09414277076721192, | |
| "memory(GiB)": 40.4, | |
| "step": 225, | |
| "token_acc": 0.9587301587301588, | |
| "train_speed(iter/s)": 0.076275 | |
| }, | |
| { | |
| "epoch": 0.4186893203883495, | |
| "grad_norm": 0.47585147619247437, | |
| "learning_rate": 4.5911132841942e-06, | |
| "loss": 0.08656486272811889, | |
| "memory(GiB)": 40.4, | |
| "step": 230, | |
| "token_acc": 0.9698651863600317, | |
| "train_speed(iter/s)": 0.076342 | |
| }, | |
| { | |
| "epoch": 0.42779126213592233, | |
| "grad_norm": 0.3516729176044464, | |
| "learning_rate": 4.570716950169944e-06, | |
| "loss": 0.08657894730567932, | |
| "memory(GiB)": 40.4, | |
| "step": 235, | |
| "token_acc": 0.9642857142857143, | |
| "train_speed(iter/s)": 0.076493 | |
| }, | |
| { | |
| "epoch": 0.4368932038834951, | |
| "grad_norm": 0.48757559061050415, | |
| "learning_rate": 4.5498718461526895e-06, | |
| "loss": 0.09453780055046082, | |
| "memory(GiB)": 40.4, | |
| "step": 240, | |
| "token_acc": 0.9643705463182898, | |
| "train_speed(iter/s)": 0.07656 | |
| }, | |
| { | |
| "epoch": 0.44599514563106796, | |
| "grad_norm": 0.5283713936805725, | |
| "learning_rate": 4.528582489735818e-06, | |
| "loss": 0.08740494847297668, | |
| "memory(GiB)": 40.4, | |
| "step": 245, | |
| "token_acc": 0.9587628865979382, | |
| "train_speed(iter/s)": 0.07663 | |
| }, | |
| { | |
| "epoch": 0.4550970873786408, | |
| "grad_norm": 0.3577844500541687, | |
| "learning_rate": 4.506853494791992e-06, | |
| "loss": 0.08014656901359558, | |
| "memory(GiB)": 40.4, | |
| "step": 250, | |
| "token_acc": 0.971473851030111, | |
| "train_speed(iter/s)": 0.076543 | |
| }, | |
| { | |
| "epoch": 0.4641990291262136, | |
| "grad_norm": 0.5026013851165771, | |
| "learning_rate": 4.484689570473232e-06, | |
| "loss": 0.08635783195495605, | |
| "memory(GiB)": 40.4, | |
| "step": 255, | |
| "token_acc": 0.9682791435368755, | |
| "train_speed(iter/s)": 0.076578 | |
| }, | |
| { | |
| "epoch": 0.4733009708737864, | |
| "grad_norm": 0.45232078433036804, | |
| "learning_rate": 4.462095520190336e-06, | |
| "loss": 0.08593440055847168, | |
| "memory(GiB)": 40.4, | |
| "step": 260, | |
| "token_acc": 0.9699367088607594, | |
| "train_speed(iter/s)": 0.076538 | |
| }, | |
| { | |
| "epoch": 0.4824029126213592, | |
| "grad_norm": 0.47390663623809814, | |
| "learning_rate": 4.43907624057188e-06, | |
| "loss": 0.08747667074203491, | |
| "memory(GiB)": 40.4, | |
| "step": 265, | |
| "token_acc": 0.9619047619047619, | |
| "train_speed(iter/s)": 0.076588 | |
| }, | |
| { | |
| "epoch": 0.49150485436893204, | |
| "grad_norm": 0.43587085604667664, | |
| "learning_rate": 4.415636720403005e-06, | |
| "loss": 0.08902972340583801, | |
| "memory(GiB)": 40.4, | |
| "step": 270, | |
| "token_acc": 0.9619349722442506, | |
| "train_speed(iter/s)": 0.076484 | |
| }, | |
| { | |
| "epoch": 0.5006067961165048, | |
| "grad_norm": 0.41671204566955566, | |
| "learning_rate": 4.391782039544239e-06, | |
| "loss": 0.08426393270492553, | |
| "memory(GiB)": 40.4, | |
| "step": 275, | |
| "token_acc": 0.9603489294210944, | |
| "train_speed(iter/s)": 0.076586 | |
| }, | |
| { | |
| "epoch": 0.5097087378640777, | |
| "grad_norm": 0.3852890133857727, | |
| "learning_rate": 4.367517367830581e-06, | |
| "loss": 0.08224607706069946, | |
| "memory(GiB)": 40.4, | |
| "step": 280, | |
| "token_acc": 0.9730372720063442, | |
| "train_speed(iter/s)": 0.0767 | |
| }, | |
| { | |
| "epoch": 0.5188106796116505, | |
| "grad_norm": 0.5980095863342285, | |
| "learning_rate": 4.342847963951085e-06, | |
| "loss": 0.09114923477172851, | |
| "memory(GiB)": 40.4, | |
| "step": 285, | |
| "token_acc": 0.9642857142857143, | |
| "train_speed(iter/s)": 0.076804 | |
| }, | |
| { | |
| "epoch": 0.5279126213592233, | |
| "grad_norm": 0.5370866656303406, | |
| "learning_rate": 4.317779174309179e-06, | |
| "loss": 0.09176770448684693, | |
| "memory(GiB)": 40.4, | |
| "step": 290, | |
| "token_acc": 0.9595879556259905, | |
| "train_speed(iter/s)": 0.076902 | |
| }, | |
| { | |
| "epoch": 0.5370145631067961, | |
| "grad_norm": 0.5857056975364685, | |
| "learning_rate": 4.292316431863991e-06, | |
| "loss": 0.08232347965240479, | |
| "memory(GiB)": 40.4, | |
| "step": 295, | |
| "token_acc": 0.9635210150674068, | |
| "train_speed(iter/s)": 0.076861 | |
| }, | |
| { | |
| "epoch": 0.5461165048543689, | |
| "grad_norm": 0.45398032665252686, | |
| "learning_rate": 4.2664652549528995e-06, | |
| "loss": 0.0860186755657196, | |
| "memory(GiB)": 40.4, | |
| "step": 300, | |
| "token_acc": 0.9603174603174603, | |
| "train_speed(iter/s)": 0.076918 | |
| }, | |
| { | |
| "epoch": 0.5552184466019418, | |
| "grad_norm": 0.4008013904094696, | |
| "learning_rate": 4.240231246095593e-06, | |
| "loss": 0.08663930892944335, | |
| "memory(GiB)": 40.4, | |
| "step": 305, | |
| "token_acc": 0.9698651863600317, | |
| "train_speed(iter/s)": 0.076723 | |
| }, | |
| { | |
| "epoch": 0.5643203883495146, | |
| "grad_norm": 0.6199547052383423, | |
| "learning_rate": 4.213620090779877e-06, | |
| "loss": 0.08223216533660889, | |
| "memory(GiB)": 40.4, | |
| "step": 310, | |
| "token_acc": 0.9674861221252974, | |
| "train_speed(iter/s)": 0.076805 | |
| }, | |
| { | |
| "epoch": 0.5734223300970874, | |
| "grad_norm": 0.37448298931121826, | |
| "learning_rate": 4.186637556229508e-06, | |
| "loss": 0.08296606540679932, | |
| "memory(GiB)": 40.4, | |
| "step": 315, | |
| "token_acc": 0.9666931007137193, | |
| "train_speed(iter/s)": 0.076708 | |
| }, | |
| { | |
| "epoch": 0.5825242718446602, | |
| "grad_norm": 0.4003507196903229, | |
| "learning_rate": 4.159289490154305e-06, | |
| "loss": 0.07931501269340516, | |
| "memory(GiB)": 40.4, | |
| "step": 320, | |
| "token_acc": 0.9642857142857143, | |
| "train_speed(iter/s)": 0.076845 | |
| }, | |
| { | |
| "epoch": 0.591626213592233, | |
| "grad_norm": 0.49439844489097595, | |
| "learning_rate": 4.1315818194828196e-06, | |
| "loss": 0.08067693710327148, | |
| "memory(GiB)": 40.4, | |
| "step": 325, | |
| "token_acc": 0.9698412698412698, | |
| "train_speed(iter/s)": 0.076875 | |
| }, | |
| { | |
| "epoch": 0.6007281553398058, | |
| "grad_norm": 0.584017813205719, | |
| "learning_rate": 4.1035205490778505e-06, | |
| "loss": 0.09277031421661378, | |
| "memory(GiB)": 40.4, | |
| "step": 330, | |
| "token_acc": 0.9595879556259905, | |
| "train_speed(iter/s)": 0.076692 | |
| }, | |
| { | |
| "epoch": 0.6098300970873787, | |
| "grad_norm": 0.47020280361175537, | |
| "learning_rate": 4.075111760435045e-06, | |
| "loss": 0.07749168276786804, | |
| "memory(GiB)": 40.4, | |
| "step": 335, | |
| "token_acc": 0.96513470681458, | |
| "train_speed(iter/s)": 0.076884 | |
| }, | |
| { | |
| "epoch": 0.6189320388349514, | |
| "grad_norm": 0.4876089096069336, | |
| "learning_rate": 4.046361610364913e-06, | |
| "loss": 0.07796428203582764, | |
| "memory(GiB)": 40.4, | |
| "step": 340, | |
| "token_acc": 0.9691699604743083, | |
| "train_speed(iter/s)": 0.076913 | |
| }, | |
| { | |
| "epoch": 0.6280339805825242, | |
| "grad_norm": 0.5511714220046997, | |
| "learning_rate": 4.017276329658506e-06, | |
| "loss": 0.08419817090034484, | |
| "memory(GiB)": 40.4, | |
| "step": 345, | |
| "token_acc": 0.9707278481012658, | |
| "train_speed(iter/s)": 0.07696 | |
| }, | |
| { | |
| "epoch": 0.6371359223300971, | |
| "grad_norm": 0.5659735798835754, | |
| "learning_rate": 3.987862221737072e-06, | |
| "loss": 0.0797402322292328, | |
| "memory(GiB)": 40.4, | |
| "step": 350, | |
| "token_acc": 0.9659270998415214, | |
| "train_speed(iter/s)": 0.076995 | |
| }, | |
| { | |
| "epoch": 0.6462378640776699, | |
| "grad_norm": 0.5157150030136108, | |
| "learning_rate": 3.958125661285959e-06, | |
| "loss": 0.0838176965713501, | |
| "memory(GiB)": 40.4, | |
| "step": 355, | |
| "token_acc": 0.9690721649484536, | |
| "train_speed(iter/s)": 0.076909 | |
| }, | |
| { | |
| "epoch": 0.6553398058252428, | |
| "grad_norm": 0.5069080591201782, | |
| "learning_rate": 3.928073092873088e-06, | |
| "loss": 0.07343612313270569, | |
| "memory(GiB)": 40.4, | |
| "step": 360, | |
| "token_acc": 0.9746233148295004, | |
| "train_speed(iter/s)": 0.076991 | |
| }, | |
| { | |
| "epoch": 0.6644417475728155, | |
| "grad_norm": 0.49923259019851685, | |
| "learning_rate": 3.897711029552264e-06, | |
| "loss": 0.07626074552536011, | |
| "memory(GiB)": 40.4, | |
| "step": 365, | |
| "token_acc": 0.9683544303797469, | |
| "train_speed(iter/s)": 0.076983 | |
| }, | |
| { | |
| "epoch": 0.6735436893203883, | |
| "grad_norm": 0.35883885622024536, | |
| "learning_rate": 3.8670460514516615e-06, | |
| "loss": 0.08405499458312989, | |
| "memory(GiB)": 40.4, | |
| "step": 370, | |
| "token_acc": 0.9635499207606973, | |
| "train_speed(iter/s)": 0.077013 | |
| }, | |
| { | |
| "epoch": 0.6826456310679612, | |
| "grad_norm": 0.4520786702632904, | |
| "learning_rate": 3.836084804347763e-06, | |
| "loss": 0.07998884916305542, | |
| "memory(GiB)": 40.4, | |
| "step": 375, | |
| "token_acc": 0.9698412698412698, | |
| "train_speed(iter/s)": 0.07694 | |
| }, | |
| { | |
| "epoch": 0.691747572815534, | |
| "grad_norm": 0.47654658555984497, | |
| "learning_rate": 3.8048339982250705e-06, | |
| "loss": 0.08119775056838989, | |
| "memory(GiB)": 40.4, | |
| "step": 380, | |
| "token_acc": 0.9667194928684627, | |
| "train_speed(iter/s)": 0.077002 | |
| }, | |
| { | |
| "epoch": 0.7008495145631068, | |
| "grad_norm": 0.5640057325363159, | |
| "learning_rate": 3.773300405821908e-06, | |
| "loss": 0.08841820359230042, | |
| "memory(GiB)": 40.4, | |
| "step": 385, | |
| "token_acc": 0.9595559080095163, | |
| "train_speed(iter/s)": 0.077061 | |
| }, | |
| { | |
| "epoch": 0.7099514563106796, | |
| "grad_norm": 0.42381900548934937, | |
| "learning_rate": 3.7414908611626162e-06, | |
| "loss": 0.08166542053222656, | |
| "memory(GiB)": 40.4, | |
| "step": 390, | |
| "token_acc": 0.969047619047619, | |
| "train_speed(iter/s)": 0.077092 | |
| }, | |
| { | |
| "epoch": 0.7190533980582524, | |
| "grad_norm": 0.510867714881897, | |
| "learning_rate": 3.709412258076471e-06, | |
| "loss": 0.08081957101821899, | |
| "memory(GiB)": 40.4, | |
| "step": 395, | |
| "token_acc": 0.9699129057798892, | |
| "train_speed(iter/s)": 0.077233 | |
| }, | |
| { | |
| "epoch": 0.7281553398058253, | |
| "grad_norm": 0.5211343169212341, | |
| "learning_rate": 3.6770715487036413e-06, | |
| "loss": 0.08312466740608215, | |
| "memory(GiB)": 40.4, | |
| "step": 400, | |
| "token_acc": 0.9611419508326725, | |
| "train_speed(iter/s)": 0.077264 | |
| }, | |
| { | |
| "epoch": 0.7372572815533981, | |
| "grad_norm": 0.46672672033309937, | |
| "learning_rate": 3.644475741988499e-06, | |
| "loss": 0.08163590431213379, | |
| "memory(GiB)": 40.4, | |
| "step": 405, | |
| "token_acc": 0.9666666666666667, | |
| "train_speed(iter/s)": 0.07706 | |
| }, | |
| { | |
| "epoch": 0.7463592233009708, | |
| "grad_norm": 0.4190872013568878, | |
| "learning_rate": 3.6116319021606345e-06, | |
| "loss": 0.08278034925460816, | |
| "memory(GiB)": 40.4, | |
| "step": 410, | |
| "token_acc": 0.9603803486529319, | |
| "train_speed(iter/s)": 0.077071 | |
| }, | |
| { | |
| "epoch": 0.7554611650485437, | |
| "grad_norm": 0.4177815318107605, | |
| "learning_rate": 3.5785471472038784e-06, | |
| "loss": 0.07709290385246277, | |
| "memory(GiB)": 40.4, | |
| "step": 415, | |
| "token_acc": 0.9714512291831879, | |
| "train_speed(iter/s)": 0.077076 | |
| }, | |
| { | |
| "epoch": 0.7645631067961165, | |
| "grad_norm": 0.7115554213523865, | |
| "learning_rate": 3.545228647313679e-06, | |
| "loss": 0.08126543164253235, | |
| "memory(GiB)": 40.4, | |
| "step": 420, | |
| "token_acc": 0.9674861221252974, | |
| "train_speed(iter/s)": 0.07706 | |
| }, | |
| { | |
| "epoch": 0.7736650485436893, | |
| "grad_norm": 0.43985486030578613, | |
| "learning_rate": 3.5116836233431616e-06, | |
| "loss": 0.08477982282638549, | |
| "memory(GiB)": 40.4, | |
| "step": 425, | |
| "token_acc": 0.9628164556962026, | |
| "train_speed(iter/s)": 0.077154 | |
| }, | |
| { | |
| "epoch": 0.7827669902912622, | |
| "grad_norm": 0.48275941610336304, | |
| "learning_rate": 3.477919345238213e-06, | |
| "loss": 0.07978797554969788, | |
| "memory(GiB)": 40.4, | |
| "step": 430, | |
| "token_acc": 0.9627279936558287, | |
| "train_speed(iter/s)": 0.077173 | |
| }, | |
| { | |
| "epoch": 0.7918689320388349, | |
| "grad_norm": 0.5005500912666321, | |
| "learning_rate": 3.4439431304619207e-06, | |
| "loss": 0.07624109983444213, | |
| "memory(GiB)": 40.4, | |
| "step": 435, | |
| "token_acc": 0.9659270998415214, | |
| "train_speed(iter/s)": 0.077238 | |
| }, | |
| { | |
| "epoch": 0.8009708737864077, | |
| "grad_norm": 0.5146210789680481, | |
| "learning_rate": 3.4097623424087196e-06, | |
| "loss": 0.080259507894516, | |
| "memory(GiB)": 40.4, | |
| "step": 440, | |
| "token_acc": 0.9706582077716098, | |
| "train_speed(iter/s)": 0.077241 | |
| }, | |
| { | |
| "epoch": 0.8100728155339806, | |
| "grad_norm": 0.558778703212738, | |
| "learning_rate": 3.3753843888085806e-06, | |
| "loss": 0.07813260555267335, | |
| "memory(GiB)": 40.4, | |
| "step": 445, | |
| "token_acc": 0.9658730158730159, | |
| "train_speed(iter/s)": 0.077226 | |
| }, | |
| { | |
| "epoch": 0.8191747572815534, | |
| "grad_norm": 0.574676513671875, | |
| "learning_rate": 3.340816720121597e-06, | |
| "loss": 0.0761204183101654, | |
| "memory(GiB)": 40.4, | |
| "step": 450, | |
| "token_acc": 0.9691699604743083, | |
| "train_speed(iter/s)": 0.077059 | |
| }, | |
| { | |
| "epoch": 0.8282766990291263, | |
| "grad_norm": 0.5359216332435608, | |
| "learning_rate": 3.3060668279232964e-06, | |
| "loss": 0.07063559293746949, | |
| "memory(GiB)": 40.4, | |
| "step": 455, | |
| "token_acc": 0.9746233148295004, | |
| "train_speed(iter/s)": 0.077103 | |
| }, | |
| { | |
| "epoch": 0.837378640776699, | |
| "grad_norm": 0.5926820635795593, | |
| "learning_rate": 3.2711422432810624e-06, | |
| "loss": 0.07327613830566407, | |
| "memory(GiB)": 40.4, | |
| "step": 460, | |
| "token_acc": 0.9666666666666667, | |
| "train_speed(iter/s)": 0.077136 | |
| }, | |
| { | |
| "epoch": 0.8464805825242718, | |
| "grad_norm": 0.4923359155654907, | |
| "learning_rate": 3.236050535121976e-06, | |
| "loss": 0.0849435031414032, | |
| "memory(GiB)": 40.4, | |
| "step": 465, | |
| "token_acc": 0.9628164556962026, | |
| "train_speed(iter/s)": 0.077175 | |
| }, | |
| { | |
| "epoch": 0.8555825242718447, | |
| "grad_norm": 0.5079782605171204, | |
| "learning_rate": 3.2007993085924694e-06, | |
| "loss": 0.07131590843200683, | |
| "memory(GiB)": 40.4, | |
| "step": 470, | |
| "token_acc": 0.9603489294210944, | |
| "train_speed(iter/s)": 0.077219 | |
| }, | |
| { | |
| "epoch": 0.8646844660194175, | |
| "grad_norm": 0.47359853982925415, | |
| "learning_rate": 3.165396203410121e-06, | |
| "loss": 0.08230514526367187, | |
| "memory(GiB)": 40.4, | |
| "step": 475, | |
| "token_acc": 0.9603489294210944, | |
| "train_speed(iter/s)": 0.077276 | |
| }, | |
| { | |
| "epoch": 0.8737864077669902, | |
| "grad_norm": 0.5094448328018188, | |
| "learning_rate": 3.1298488922079597e-06, | |
| "loss": 0.07572669386863709, | |
| "memory(GiB)": 40.4, | |
| "step": 480, | |
| "token_acc": 0.9683042789223455, | |
| "train_speed(iter/s)": 0.077301 | |
| }, | |
| { | |
| "epoch": 0.8828883495145631, | |
| "grad_norm": 0.6144260764122009, | |
| "learning_rate": 3.094165078871634e-06, | |
| "loss": 0.07770437002182007, | |
| "memory(GiB)": 40.4, | |
| "step": 485, | |
| "token_acc": 0.9674603174603175, | |
| "train_speed(iter/s)": 0.077291 | |
| }, | |
| { | |
| "epoch": 0.8919902912621359, | |
| "grad_norm": 0.7166838049888611, | |
| "learning_rate": 3.0583524968698176e-06, | |
| "loss": 0.07593016624450684, | |
| "memory(GiB)": 40.4, | |
| "step": 490, | |
| "token_acc": 0.9706582077716098, | |
| "train_speed(iter/s)": 0.077337 | |
| }, | |
| { | |
| "epoch": 0.9010922330097088, | |
| "grad_norm": 0.5843172073364258, | |
| "learning_rate": 3.0224189075781886e-06, | |
| "loss": 0.0753251850605011, | |
| "memory(GiB)": 40.4, | |
| "step": 495, | |
| "token_acc": 0.9675889328063241, | |
| "train_speed(iter/s)": 0.077398 | |
| }, | |
| { | |
| "epoch": 0.9101941747572816, | |
| "grad_norm": 0.4273771643638611, | |
| "learning_rate": 2.9863720985973697e-06, | |
| "loss": 0.07616569995880126, | |
| "memory(GiB)": 40.4, | |
| "step": 500, | |
| "token_acc": 0.9746031746031746, | |
| "train_speed(iter/s)": 0.077368 | |
| }, | |
| { | |
| "epoch": 0.9192961165048543, | |
| "grad_norm": 0.5440679788589478, | |
| "learning_rate": 2.9502198820651903e-06, | |
| "loss": 0.07991842031478882, | |
| "memory(GiB)": 40.4, | |
| "step": 505, | |
| "token_acc": 0.9642857142857143, | |
| "train_speed(iter/s)": 0.077195 | |
| }, | |
| { | |
| "epoch": 0.9283980582524272, | |
| "grad_norm": 0.6545736789703369, | |
| "learning_rate": 2.9139700929636134e-06, | |
| "loss": 0.07855194211006164, | |
| "memory(GiB)": 40.4, | |
| "step": 510, | |
| "token_acc": 0.9587301587301588, | |
| "train_speed(iter/s)": 0.077178 | |
| }, | |
| { | |
| "epoch": 0.9375, | |
| "grad_norm": 0.5470529794692993, | |
| "learning_rate": 2.8776305874207305e-06, | |
| "loss": 0.07507063150405884, | |
| "memory(GiB)": 40.4, | |
| "step": 515, | |
| "token_acc": 0.9675376088677752, | |
| "train_speed(iter/s)": 0.077176 | |
| }, | |
| { | |
| "epoch": 0.9466019417475728, | |
| "grad_norm": 0.5262081623077393, | |
| "learning_rate": 2.8412092410081645e-06, | |
| "loss": 0.08568469285964966, | |
| "memory(GiB)": 40.4, | |
| "step": 520, | |
| "token_acc": 0.9659270998415214, | |
| "train_speed(iter/s)": 0.077164 | |
| }, | |
| { | |
| "epoch": 0.9557038834951457, | |
| "grad_norm": 0.48101773858070374, | |
| "learning_rate": 2.804713947034254e-06, | |
| "loss": 0.07408897280693054, | |
| "memory(GiB)": 40.4, | |
| "step": 525, | |
| "token_acc": 0.9715189873417721, | |
| "train_speed(iter/s)": 0.077248 | |
| }, | |
| { | |
| "epoch": 0.9648058252427184, | |
| "grad_norm": 0.7088754773139954, | |
| "learning_rate": 2.7681526148334074e-06, | |
| "loss": 0.07859846353530883, | |
| "memory(GiB)": 40.4, | |
| "step": 530, | |
| "token_acc": 0.9651070578905631, | |
| "train_speed(iter/s)": 0.077348 | |
| }, | |
| { | |
| "epoch": 0.9739077669902912, | |
| "grad_norm": 0.5357980728149414, | |
| "learning_rate": 2.73153316805197e-06, | |
| "loss": 0.07618768811225891, | |
| "memory(GiB)": 40.4, | |
| "step": 535, | |
| "token_acc": 0.9683042789223455, | |
| "train_speed(iter/s)": 0.077388 | |
| }, | |
| { | |
| "epoch": 0.9830097087378641, | |
| "grad_norm": 0.4719216823577881, | |
| "learning_rate": 2.6948635429309984e-06, | |
| "loss": 0.08283294439315796, | |
| "memory(GiB)": 40.4, | |
| "step": 540, | |
| "token_acc": 0.9666666666666667, | |
| "train_speed(iter/s)": 0.077404 | |
| }, | |
| { | |
| "epoch": 0.9921116504854369, | |
| "grad_norm": 0.4105032980442047, | |
| "learning_rate": 2.6581516865863006e-06, | |
| "loss": 0.07635112404823304, | |
| "memory(GiB)": 40.4, | |
| "step": 545, | |
| "token_acc": 0.9666666666666667, | |
| "train_speed(iter/s)": 0.077461 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.4639950096607208, | |
| "learning_rate": 2.6214055552861213e-06, | |
| "loss": 0.07352917194366455, | |
| "memory(GiB)": 40.4, | |
| "step": 550, | |
| "token_acc": 0.9652014652014652, | |
| "train_speed(iter/s)": 0.077567 | |
| }, | |
| { | |
| "epoch": 1.0091019417475728, | |
| "grad_norm": 0.5708960294723511, | |
| "learning_rate": 2.5846331127268432e-06, | |
| "loss": 0.06939817667007446, | |
| "memory(GiB)": 40.4, | |
| "step": 555, | |
| "token_acc": 0.9746634996041171, | |
| "train_speed(iter/s)": 0.077516 | |
| }, | |
| { | |
| "epoch": 1.0182038834951457, | |
| "grad_norm": 0.5500112771987915, | |
| "learning_rate": 2.5478423283070797e-06, | |
| "loss": 0.08004761338233948, | |
| "memory(GiB)": 40.4, | |
| "step": 560, | |
| "token_acc": 0.9666666666666667, | |
| "train_speed(iter/s)": 0.077461 | |
| }, | |
| { | |
| "epoch": 1.0273058252427185, | |
| "grad_norm": 0.6031087040901184, | |
| "learning_rate": 2.5110411754005277e-06, | |
| "loss": 0.07369757890701294, | |
| "memory(GiB)": 40.4, | |
| "step": 565, | |
| "token_acc": 0.9675118858954042, | |
| "train_speed(iter/s)": 0.077479 | |
| }, | |
| { | |
| "epoch": 1.0364077669902914, | |
| "grad_norm": 0.6123142242431641, | |
| "learning_rate": 2.4742376296279656e-06, | |
| "loss": 0.07673358917236328, | |
| "memory(GiB)": 40.4, | |
| "step": 570, | |
| "token_acc": 0.96513470681458, | |
| "train_speed(iter/s)": 0.077492 | |
| }, | |
| { | |
| "epoch": 1.045509708737864, | |
| "grad_norm": 0.4750412404537201, | |
| "learning_rate": 2.437439667128757e-06, | |
| "loss": 0.07482797503471375, | |
| "memory(GiB)": 40.4, | |
| "step": 575, | |
| "token_acc": 0.9722222222222222, | |
| "train_speed(iter/s)": 0.077462 | |
| }, | |
| { | |
| "epoch": 1.0546116504854368, | |
| "grad_norm": 0.6936323642730713, | |
| "learning_rate": 2.4006552628322495e-06, | |
| "loss": 0.07669172286987305, | |
| "memory(GiB)": 40.4, | |
| "step": 580, | |
| "token_acc": 0.9698890649762282, | |
| "train_speed(iter/s)": 0.077497 | |
| }, | |
| { | |
| "epoch": 1.0637135922330097, | |
| "grad_norm": 0.5415986180305481, | |
| "learning_rate": 2.3638923887294252e-06, | |
| "loss": 0.07764337062835694, | |
| "memory(GiB)": 40.4, | |
| "step": 585, | |
| "token_acc": 0.9722662440570523, | |
| "train_speed(iter/s)": 0.077534 | |
| }, | |
| { | |
| "epoch": 1.0728155339805825, | |
| "grad_norm": 0.5562268495559692, | |
| "learning_rate": 2.3271590121452034e-06, | |
| "loss": 0.07850711941719055, | |
| "memory(GiB)": 40.4, | |
| "step": 590, | |
| "token_acc": 0.9627575277337559, | |
| "train_speed(iter/s)": 0.077312 | |
| }, | |
| { | |
| "epoch": 1.0819174757281553, | |
| "grad_norm": 0.5438592433929443, | |
| "learning_rate": 2.2904630940117383e-06, | |
| "loss": 0.07206880450248718, | |
| "memory(GiB)": 40.4, | |
| "step": 595, | |
| "token_acc": 0.9706582077716098, | |
| "train_speed(iter/s)": 0.077329 | |
| }, | |
| { | |
| "epoch": 1.0910194174757282, | |
| "grad_norm": 0.7570096254348755, | |
| "learning_rate": 2.253812587143113e-06, | |
| "loss": 0.07922015190124512, | |
| "memory(GiB)": 40.4, | |
| "step": 600, | |
| "token_acc": 0.9675632911392406, | |
| "train_speed(iter/s)": 0.077373 | |
| }, | |
| { | |
| "epoch": 1.100121359223301, | |
| "grad_norm": 0.44248196482658386, | |
| "learning_rate": 2.2172154345117896e-06, | |
| "loss": 0.07421438097953796, | |
| "memory(GiB)": 40.4, | |
| "step": 605, | |
| "token_acc": 0.969047619047619, | |
| "train_speed(iter/s)": 0.077227 | |
| }, | |
| { | |
| "epoch": 1.1092233009708738, | |
| "grad_norm": 0.8693225383758545, | |
| "learning_rate": 2.18067956752719e-06, | |
| "loss": 0.07179425954818726, | |
| "memory(GiB)": 40.4, | |
| "step": 610, | |
| "token_acc": 0.9738302934179223, | |
| "train_speed(iter/s)": 0.077227 | |
| }, | |
| { | |
| "epoch": 1.1183252427184467, | |
| "grad_norm": 0.6093197464942932, | |
| "learning_rate": 2.1442129043167877e-06, | |
| "loss": 0.07261105179786682, | |
| "memory(GiB)": 40.4, | |
| "step": 615, | |
| "token_acc": 0.972244250594766, | |
| "train_speed(iter/s)": 0.077265 | |
| }, | |
| { | |
| "epoch": 1.1274271844660193, | |
| "grad_norm": 0.47732552886009216, | |
| "learning_rate": 2.1078233480100708e-06, | |
| "loss": 0.07763968706130982, | |
| "memory(GiB)": 40.4, | |
| "step": 620, | |
| "token_acc": 0.9746233148295004, | |
| "train_speed(iter/s)": 0.077083 | |
| }, | |
| { | |
| "epoch": 1.1365291262135921, | |
| "grad_norm": 0.6436070799827576, | |
| "learning_rate": 2.0715187850257645e-06, | |
| "loss": 0.07869491577148438, | |
| "memory(GiB)": 40.4, | |
| "step": 625, | |
| "token_acc": 0.9675632911392406, | |
| "train_speed(iter/s)": 0.077031 | |
| }, | |
| { | |
| "epoch": 1.145631067961165, | |
| "grad_norm": 0.6669154167175293, | |
| "learning_rate": 2.0353070833626684e-06, | |
| "loss": 0.07925596237182617, | |
| "memory(GiB)": 40.4, | |
| "step": 630, | |
| "token_acc": 0.964314036478985, | |
| "train_speed(iter/s)": 0.077048 | |
| }, | |
| { | |
| "epoch": 1.1547330097087378, | |
| "grad_norm": 0.6365996599197388, | |
| "learning_rate": 1.999196090894485e-06, | |
| "loss": 0.06456078886985779, | |
| "memory(GiB)": 40.4, | |
| "step": 635, | |
| "token_acc": 0.9667194928684627, | |
| "train_speed(iter/s)": 0.077101 | |
| }, | |
| { | |
| "epoch": 1.1638349514563107, | |
| "grad_norm": 0.5614244341850281, | |
| "learning_rate": 1.963193633669018e-06, | |
| "loss": 0.07243520021438599, | |
| "memory(GiB)": 40.4, | |
| "step": 640, | |
| "token_acc": 0.9666931007137193, | |
| "train_speed(iter/s)": 0.077155 | |
| }, | |
| { | |
| "epoch": 1.1729368932038835, | |
| "grad_norm": 0.8191459774971008, | |
| "learning_rate": 1.927307514212089e-06, | |
| "loss": 0.0762752890586853, | |
| "memory(GiB)": 40.4, | |
| "step": 645, | |
| "token_acc": 0.9698412698412698, | |
| "train_speed(iter/s)": 0.077153 | |
| }, | |
| { | |
| "epoch": 1.1820388349514563, | |
| "grad_norm": 0.523980438709259, | |
| "learning_rate": 1.8915455098365651e-06, | |
| "loss": 0.0773351550102234, | |
| "memory(GiB)": 40.4, | |
| "step": 650, | |
| "token_acc": 0.9675118858954042, | |
| "train_speed(iter/s)": 0.077211 | |
| }, | |
| { | |
| "epoch": 1.1911407766990292, | |
| "grad_norm": 0.5650423169136047, | |
| "learning_rate": 1.8559153709568393e-06, | |
| "loss": 0.07858687043190002, | |
| "memory(GiB)": 40.4, | |
| "step": 655, | |
| "token_acc": 0.9635499207606973, | |
| "train_speed(iter/s)": 0.077253 | |
| }, | |
| { | |
| "epoch": 1.200242718446602, | |
| "grad_norm": 0.3905327022075653, | |
| "learning_rate": 1.8204248194091429e-06, | |
| "loss": 0.07570682168006897, | |
| "memory(GiB)": 40.4, | |
| "step": 660, | |
| "token_acc": 0.9674861221252974, | |
| "train_speed(iter/s)": 0.077222 | |
| }, | |
| { | |
| "epoch": 1.2093446601941746, | |
| "grad_norm": 0.6456849575042725, | |
| "learning_rate": 1.7850815467780616e-06, | |
| "loss": 0.06978952884674072, | |
| "memory(GiB)": 40.4, | |
| "step": 665, | |
| "token_acc": 0.976984126984127, | |
| "train_speed(iter/s)": 0.077238 | |
| }, | |
| { | |
| "epoch": 1.2184466019417475, | |
| "grad_norm": 0.49169182777404785, | |
| "learning_rate": 1.7498932127295892e-06, | |
| "loss": 0.06932756900787354, | |
| "memory(GiB)": 40.4, | |
| "step": 670, | |
| "token_acc": 0.9674603174603175, | |
| "train_speed(iter/s)": 0.077305 | |
| }, | |
| { | |
| "epoch": 1.2275485436893203, | |
| "grad_norm": 0.8174545764923096, | |
| "learning_rate": 1.7148674433511176e-06, | |
| "loss": 0.07247714400291443, | |
| "memory(GiB)": 40.4, | |
| "step": 675, | |
| "token_acc": 0.9785714285714285, | |
| "train_speed(iter/s)": 0.077358 | |
| }, | |
| { | |
| "epoch": 1.2366504854368932, | |
| "grad_norm": 0.5874563455581665, | |
| "learning_rate": 1.6800118294986936e-06, | |
| "loss": 0.08156619668006897, | |
| "memory(GiB)": 40.4, | |
| "step": 680, | |
| "token_acc": 0.9619952494061758, | |
| "train_speed(iter/s)": 0.077379 | |
| }, | |
| { | |
| "epoch": 1.245752427184466, | |
| "grad_norm": 0.7023929357528687, | |
| "learning_rate": 1.645333925151908e-06, | |
| "loss": 0.0740778088569641, | |
| "memory(GiB)": 40.4, | |
| "step": 685, | |
| "token_acc": 0.9643423137876387, | |
| "train_speed(iter/s)": 0.077282 | |
| }, | |
| { | |
| "epoch": 1.2548543689320388, | |
| "grad_norm": 0.6284681558609009, | |
| "learning_rate": 1.610841245776789e-06, | |
| "loss": 0.07937963008880615, | |
| "memory(GiB)": 40.4, | |
| "step": 690, | |
| "token_acc": 0.9682791435368755, | |
| "train_speed(iter/s)": 0.077267 | |
| }, | |
| { | |
| "epoch": 1.2639563106796117, | |
| "grad_norm": 0.4900761544704437, | |
| "learning_rate": 1.5765412666970302e-06, | |
| "loss": 0.07481481432914734, | |
| "memory(GiB)": 40.4, | |
| "step": 695, | |
| "token_acc": 0.9714512291831879, | |
| "train_speed(iter/s)": 0.077241 | |
| }, | |
| { | |
| "epoch": 1.2730582524271845, | |
| "grad_norm": 0.7159978747367859, | |
| "learning_rate": 1.5424414214739258e-06, | |
| "loss": 0.07213735580444336, | |
| "memory(GiB)": 40.4, | |
| "step": 700, | |
| "token_acc": 0.9738302934179223, | |
| "train_speed(iter/s)": 0.077237 | |
| }, | |
| { | |
| "epoch": 1.2821601941747574, | |
| "grad_norm": 0.6261754631996155, | |
| "learning_rate": 1.5085491002953535e-06, | |
| "loss": 0.07179176211357116, | |
| "memory(GiB)": 40.4, | |
| "step": 705, | |
| "token_acc": 0.969047619047619, | |
| "train_speed(iter/s)": 0.077083 | |
| }, | |
| { | |
| "epoch": 1.29126213592233, | |
| "grad_norm": 0.9063695073127747, | |
| "learning_rate": 1.4748716483741562e-06, | |
| "loss": 0.07754602432250976, | |
| "memory(GiB)": 40.4, | |
| "step": 710, | |
| "token_acc": 0.96513470681458, | |
| "train_speed(iter/s)": 0.077061 | |
| }, | |
| { | |
| "epoch": 1.300364077669903, | |
| "grad_norm": 0.6574028134346008, | |
| "learning_rate": 1.4414163643562755e-06, | |
| "loss": 0.07884335517883301, | |
| "memory(GiB)": 40.4, | |
| "step": 715, | |
| "token_acc": 0.9675376088677752, | |
| "train_speed(iter/s)": 0.077069 | |
| }, | |
| { | |
| "epoch": 1.3094660194174756, | |
| "grad_norm": 0.5524230599403381, | |
| "learning_rate": 1.4081904987389701e-06, | |
| "loss": 0.07660083174705505, | |
| "memory(GiB)": 40.4, | |
| "step": 720, | |
| "token_acc": 0.9635210150674068, | |
| "train_speed(iter/s)": 0.077072 | |
| }, | |
| { | |
| "epoch": 1.3185679611650485, | |
| "grad_norm": 0.5381263494491577, | |
| "learning_rate": 1.375201252299479e-06, | |
| "loss": 0.07187164425849915, | |
| "memory(GiB)": 40.4, | |
| "step": 725, | |
| "token_acc": 0.9690966719492868, | |
| "train_speed(iter/s)": 0.077084 | |
| }, | |
| { | |
| "epoch": 1.3276699029126213, | |
| "grad_norm": 0.6094266176223755, | |
| "learning_rate": 1.3424557745344508e-06, | |
| "loss": 0.07152368426322937, | |
| "memory(GiB)": 40.4, | |
| "step": 730, | |
| "token_acc": 0.9690966719492868, | |
| "train_speed(iter/s)": 0.07712 | |
| }, | |
| { | |
| "epoch": 1.3367718446601942, | |
| "grad_norm": 0.37662273645401, | |
| "learning_rate": 1.3099611621104875e-06, | |
| "loss": 0.07852091193199158, | |
| "memory(GiB)": 40.4, | |
| "step": 735, | |
| "token_acc": 0.9698412698412698, | |
| "train_speed(iter/s)": 0.077111 | |
| }, | |
| { | |
| "epoch": 1.345873786407767, | |
| "grad_norm": 0.8660151958465576, | |
| "learning_rate": 1.2777244573261479e-06, | |
| "loss": 0.0761515736579895, | |
| "memory(GiB)": 40.4, | |
| "step": 740, | |
| "token_acc": 0.9650793650793651, | |
| "train_speed(iter/s)": 0.077083 | |
| }, | |
| { | |
| "epoch": 1.3549757281553398, | |
| "grad_norm": 0.8635317087173462, | |
| "learning_rate": 1.245752646585719e-06, | |
| "loss": 0.07429265975952148, | |
| "memory(GiB)": 40.4, | |
| "step": 745, | |
| "token_acc": 0.9706582077716098, | |
| "train_speed(iter/s)": 0.077017 | |
| }, | |
| { | |
| "epoch": 1.3640776699029127, | |
| "grad_norm": 0.6921953558921814, | |
| "learning_rate": 1.214052658885113e-06, | |
| "loss": 0.08055119514465332, | |
| "memory(GiB)": 40.4, | |
| "step": 750, | |
| "token_acc": 0.9659000793021412, | |
| "train_speed(iter/s)": 0.07705 | |
| }, | |
| { | |
| "epoch": 1.3731796116504853, | |
| "grad_norm": 0.512025773525238, | |
| "learning_rate": 1.182631364310199e-06, | |
| "loss": 0.07414981126785278, | |
| "memory(GiB)": 40.4, | |
| "step": 755, | |
| "token_acc": 0.9738095238095238, | |
| "train_speed(iter/s)": 0.077125 | |
| }, | |
| { | |
| "epoch": 1.3822815533980584, | |
| "grad_norm": 0.47374847531318665, | |
| "learning_rate": 1.1514955725479057e-06, | |
| "loss": 0.07829545140266418, | |
| "memory(GiB)": 40.4, | |
| "step": 760, | |
| "token_acc": 0.9675118858954042, | |
| "train_speed(iter/s)": 0.077061 | |
| }, | |
| { | |
| "epoch": 1.391383495145631, | |
| "grad_norm": 0.5193628072738647, | |
| "learning_rate": 1.1206520314104083e-06, | |
| "loss": 0.06979748606681824, | |
| "memory(GiB)": 40.4, | |
| "step": 765, | |
| "token_acc": 0.9730799683293745, | |
| "train_speed(iter/s)": 0.077097 | |
| }, | |
| { | |
| "epoch": 1.4004854368932038, | |
| "grad_norm": 0.5398116707801819, | |
| "learning_rate": 1.0901074253727338e-06, | |
| "loss": 0.07316485643386841, | |
| "memory(GiB)": 40.4, | |
| "step": 770, | |
| "token_acc": 0.9674861221252974, | |
| "train_speed(iter/s)": 0.077134 | |
| }, | |
| { | |
| "epoch": 1.4095873786407767, | |
| "grad_norm": 0.9198482036590576, | |
| "learning_rate": 1.0598683741240861e-06, | |
| "loss": 0.0778656005859375, | |
| "memory(GiB)": 40.4, | |
| "step": 775, | |
| "token_acc": 0.9714512291831879, | |
| "train_speed(iter/s)": 0.077187 | |
| }, | |
| { | |
| "epoch": 1.4186893203883495, | |
| "grad_norm": 0.5479600429534912, | |
| "learning_rate": 1.0299414311332107e-06, | |
| "loss": 0.0758398413658142, | |
| "memory(GiB)": 40.4, | |
| "step": 780, | |
| "token_acc": 0.9706582077716098, | |
| "train_speed(iter/s)": 0.077204 | |
| }, | |
| { | |
| "epoch": 1.4277912621359223, | |
| "grad_norm": 0.562239944934845, | |
| "learning_rate": 1.0003330822281188e-06, | |
| "loss": 0.08118345737457275, | |
| "memory(GiB)": 40.4, | |
| "step": 785, | |
| "token_acc": 0.9658730158730159, | |
| "train_speed(iter/s)": 0.077197 | |
| }, | |
| { | |
| "epoch": 1.4368932038834952, | |
| "grad_norm": 0.608139157295227, | |
| "learning_rate": 9.710497441904614e-07, | |
| "loss": 0.07277892231941223, | |
| "memory(GiB)": 40.4, | |
| "step": 790, | |
| "token_acc": 0.9739130434782609, | |
| "train_speed(iter/s)": 0.077169 | |
| }, | |
| { | |
| "epoch": 1.445995145631068, | |
| "grad_norm": 0.6108372807502747, | |
| "learning_rate": 9.420977633648739e-07, | |
| "loss": 0.0743071436882019, | |
| "memory(GiB)": 40.4, | |
| "step": 795, | |
| "token_acc": 0.9651070578905631, | |
| "train_speed(iter/s)": 0.077195 | |
| }, | |
| { | |
| "epoch": 1.4550970873786409, | |
| "grad_norm": 0.5900782346725464, | |
| "learning_rate": 9.134834142835794e-07, | |
| "loss": 0.07513993978500366, | |
| "memory(GiB)": 40.4, | |
| "step": 800, | |
| "token_acc": 0.9738302934179223, | |
| "train_speed(iter/s)": 0.07724 | |
| }, | |
| { | |
| "epoch": 1.4641990291262137, | |
| "grad_norm": 0.5346866846084595, | |
| "learning_rate": 8.852128983065653e-07, | |
| "loss": 0.07092651724815369, | |
| "memory(GiB)": 40.4, | |
| "step": 805, | |
| "token_acc": 0.9722662440570523, | |
| "train_speed(iter/s)": 0.077133 | |
| }, | |
| { | |
| "epoch": 1.4733009708737863, | |
| "grad_norm": 0.504199743270874, | |
| "learning_rate": 8.572923422776055e-07, | |
| "loss": 0.07900516986846924, | |
| "memory(GiB)": 40.4, | |
| "step": 810, | |
| "token_acc": 0.9524564183835182, | |
| "train_speed(iter/s)": 0.077129 | |
| }, | |
| { | |
| "epoch": 1.4824029126213591, | |
| "grad_norm": 0.5348660349845886, | |
| "learning_rate": 8.297277971964443e-07, | |
| "loss": 0.07192928791046142, | |
| "memory(GiB)": 40.4, | |
| "step": 815, | |
| "token_acc": 0.9706349206349206, | |
| "train_speed(iter/s)": 0.077153 | |
| }, | |
| { | |
| "epoch": 1.491504854368932, | |
| "grad_norm": 0.7142664194107056, | |
| "learning_rate": 8.025252369074077e-07, | |
| "loss": 0.07966341972351074, | |
| "memory(GiB)": 40.4, | |
| "step": 820, | |
| "token_acc": 0.9714285714285714, | |
| "train_speed(iter/s)": 0.077158 | |
| }, | |
| { | |
| "epoch": 1.5006067961165048, | |
| "grad_norm": 0.670011579990387, | |
| "learning_rate": 7.756905568047393e-07, | |
| "loss": 0.07460339069366455, | |
| "memory(GiB)": 40.4, | |
| "step": 825, | |
| "token_acc": 0.9698412698412698, | |
| "train_speed(iter/s)": 0.077072 | |
| }, | |
| { | |
| "epoch": 1.5097087378640777, | |
| "grad_norm": 0.9091220498085022, | |
| "learning_rate": 7.492295725549423e-07, | |
| "loss": 0.07916736602783203, | |
| "memory(GiB)": 40.4, | |
| "step": 830, | |
| "token_acc": 0.9714512291831879, | |
| "train_speed(iter/s)": 0.077125 | |
| }, | |
| { | |
| "epoch": 1.5188106796116505, | |
| "grad_norm": 0.5154448747634888, | |
| "learning_rate": 7.231480188363906e-07, | |
| "loss": 0.07609822750091552, | |
| "memory(GiB)": 40.4, | |
| "step": 835, | |
| "token_acc": 0.9619047619047619, | |
| "train_speed(iter/s)": 0.077151 | |
| }, | |
| { | |
| "epoch": 1.5279126213592233, | |
| "grad_norm": 0.5767259001731873, | |
| "learning_rate": 6.974515480965038e-07, | |
| "loss": 0.07642306089401245, | |
| "memory(GiB)": 40.4, | |
| "step": 840, | |
| "token_acc": 0.9635499207606973, | |
| "train_speed(iter/s)": 0.077169 | |
| }, | |
| { | |
| "epoch": 1.537014563106796, | |
| "grad_norm": 0.559921145439148, | |
| "learning_rate": 6.721457293267344e-07, | |
| "loss": 0.07739580273628235, | |
| "memory(GiB)": 40.4, | |
| "step": 845, | |
| "token_acc": 0.9659540775930324, | |
| "train_speed(iter/s)": 0.077202 | |
| }, | |
| { | |
| "epoch": 1.546116504854369, | |
| "grad_norm": 0.5525022745132446, | |
| "learning_rate": 6.472360468556419e-07, | |
| "loss": 0.07661284804344178, | |
| "memory(GiB)": 40.4, | |
| "step": 850, | |
| "token_acc": 0.9690966719492868, | |
| "train_speed(iter/s)": 0.077223 | |
| }, | |
| { | |
| "epoch": 1.5552184466019416, | |
| "grad_norm": 0.7156991958618164, | |
| "learning_rate": 6.227278991603239e-07, | |
| "loss": 0.07607601881027222, | |
| "memory(GiB)": 40.4, | |
| "step": 855, | |
| "token_acc": 0.9738924050632911, | |
| "train_speed(iter/s)": 0.077263 | |
| }, | |
| { | |
| "epoch": 1.5643203883495147, | |
| "grad_norm": 0.578790009021759, | |
| "learning_rate": 5.986265976964412e-07, | |
| "loss": 0.07703717947006225, | |
| "memory(GiB)": 40.4, | |
| "step": 860, | |
| "token_acc": 0.9627575277337559, | |
| "train_speed(iter/s)": 0.077321 | |
| }, | |
| { | |
| "epoch": 1.5734223300970873, | |
| "grad_norm": 0.41067153215408325, | |
| "learning_rate": 5.749373657471127e-07, | |
| "loss": 0.07262166738510131, | |
| "memory(GiB)": 40.4, | |
| "step": 865, | |
| "token_acc": 0.9666931007137193, | |
| "train_speed(iter/s)": 0.077313 | |
| }, | |
| { | |
| "epoch": 1.5825242718446602, | |
| "grad_norm": 0.6594594120979309, | |
| "learning_rate": 5.516653372909142e-07, | |
| "loss": 0.07546203732490539, | |
| "memory(GiB)": 40.4, | |
| "step": 870, | |
| "token_acc": 0.9730799683293745, | |
| "train_speed(iter/s)": 0.077321 | |
| }, | |
| { | |
| "epoch": 1.591626213592233, | |
| "grad_norm": 0.6693688035011292, | |
| "learning_rate": 5.28815555889228e-07, | |
| "loss": 0.07242462635040284, | |
| "memory(GiB)": 40.4, | |
| "step": 875, | |
| "token_acc": 0.9714964370546318, | |
| "train_speed(iter/s)": 0.077315 | |
| }, | |
| { | |
| "epoch": 1.6007281553398058, | |
| "grad_norm": 0.5314414501190186, | |
| "learning_rate": 5.063929735931985e-07, | |
| "loss": 0.07621661424636841, | |
| "memory(GiB)": 40.4, | |
| "step": 880, | |
| "token_acc": 0.9746634996041171, | |
| "train_speed(iter/s)": 0.077305 | |
| }, | |
| { | |
| "epoch": 1.6098300970873787, | |
| "grad_norm": 0.39022502303123474, | |
| "learning_rate": 4.844024498705072e-07, | |
| "loss": 0.07379111647605896, | |
| "memory(GiB)": 40.4, | |
| "step": 885, | |
| "token_acc": 0.9770023790642347, | |
| "train_speed(iter/s)": 0.077319 | |
| }, | |
| { | |
| "epoch": 1.6189320388349513, | |
| "grad_norm": 0.5611955523490906, | |
| "learning_rate": 4.6284875055222415e-07, | |
| "loss": 0.07641223073005676, | |
| "memory(GiB)": 40.4, | |
| "step": 890, | |
| "token_acc": 0.969047619047619, | |
| "train_speed(iter/s)": 0.07736 | |
| }, | |
| { | |
| "epoch": 1.6280339805825244, | |
| "grad_norm": 0.5914463996887207, | |
| "learning_rate": 4.4173654679994543e-07, | |
| "loss": 0.07118785977363587, | |
| "memory(GiB)": 40.4, | |
| "step": 895, | |
| "token_acc": 0.9666931007137193, | |
| "train_speed(iter/s)": 0.077387 | |
| }, | |
| { | |
| "epoch": 1.637135922330097, | |
| "grad_norm": 0.6131768226623535, | |
| "learning_rate": 4.2107041409344686e-07, | |
| "loss": 0.06656063199043274, | |
| "memory(GiB)": 40.4, | |
| "step": 900, | |
| "token_acc": 0.9730586370839936, | |
| "train_speed(iter/s)": 0.077393 | |
| }, | |
| { | |
| "epoch": 1.64623786407767, | |
| "grad_norm": 0.6083477139472961, | |
| "learning_rate": 4.00854831239082e-07, | |
| "loss": 0.07548041343688965, | |
| "memory(GiB)": 40.4, | |
| "step": 905, | |
| "token_acc": 0.9706814580031695, | |
| "train_speed(iter/s)": 0.07732 | |
| }, | |
| { | |
| "epoch": 1.6553398058252426, | |
| "grad_norm": 0.5123993158340454, | |
| "learning_rate": 3.8109417939912044e-07, | |
| "loss": 0.07632001638412475, | |
| "memory(GiB)": 40.4, | |
| "step": 910, | |
| "token_acc": 0.9651070578905631, | |
| "train_speed(iter/s)": 0.07734 | |
| }, | |
| { | |
| "epoch": 1.6644417475728155, | |
| "grad_norm": 0.6305170655250549, | |
| "learning_rate": 3.617927411422584e-07, | |
| "loss": 0.07312512397766113, | |
| "memory(GiB)": 40.4, | |
| "step": 915, | |
| "token_acc": 0.9675376088677752, | |
| "train_speed(iter/s)": 0.077345 | |
| }, | |
| { | |
| "epoch": 1.6735436893203883, | |
| "grad_norm": 0.5339434742927551, | |
| "learning_rate": 3.4295469951548894e-07, | |
| "loss": 0.06849889755249024, | |
| "memory(GiB)": 40.4, | |
| "step": 920, | |
| "token_acc": 0.9674861221252974, | |
| "train_speed(iter/s)": 0.077349 | |
| }, | |
| { | |
| "epoch": 1.6826456310679612, | |
| "grad_norm": 0.532629132270813, | |
| "learning_rate": 3.24584137137543e-07, | |
| "loss": 0.07681695818901062, | |
| "memory(GiB)": 40.4, | |
| "step": 925, | |
| "token_acc": 0.9722222222222222, | |
| "train_speed(iter/s)": 0.077356 | |
| }, | |
| { | |
| "epoch": 1.691747572815534, | |
| "grad_norm": 0.4466962516307831, | |
| "learning_rate": 3.0668503531409876e-07, | |
| "loss": 0.06994915008544922, | |
| "memory(GiB)": 40.4, | |
| "step": 930, | |
| "token_acc": 0.9714964370546318, | |
| "train_speed(iter/s)": 0.077371 | |
| }, | |
| { | |
| "epoch": 1.7008495145631068, | |
| "grad_norm": 0.586765706539154, | |
| "learning_rate": 2.892612731749414e-07, | |
| "loss": 0.07494070529937744, | |
| "memory(GiB)": 40.4, | |
| "step": 935, | |
| "token_acc": 0.969047619047619, | |
| "train_speed(iter/s)": 0.077342 | |
| }, | |
| { | |
| "epoch": 1.7099514563106797, | |
| "grad_norm": 0.5412377715110779, | |
| "learning_rate": 2.723166268332733e-07, | |
| "loss": 0.07770473957061767, | |
| "memory(GiB)": 40.4, | |
| "step": 940, | |
| "token_acc": 0.9676145339652449, | |
| "train_speed(iter/s)": 0.077329 | |
| }, | |
| { | |
| "epoch": 1.7190533980582523, | |
| "grad_norm": 0.911586582660675, | |
| "learning_rate": 2.55854768567346e-07, | |
| "loss": 0.07914371490478515, | |
| "memory(GiB)": 40.4, | |
| "step": 945, | |
| "token_acc": 0.9674861221252974, | |
| "train_speed(iter/s)": 0.077298 | |
| }, | |
| { | |
| "epoch": 1.7281553398058254, | |
| "grad_norm": 0.6137750148773193, | |
| "learning_rate": 2.3987926602459465e-07, | |
| "loss": 0.08327807188034057, | |
| "memory(GiB)": 40.4, | |
| "step": 950, | |
| "token_acc": 0.9706349206349206, | |
| "train_speed(iter/s)": 0.077305 | |
| }, | |
| { | |
| "epoch": 1.737257281553398, | |
| "grad_norm": 0.576627790927887, | |
| "learning_rate": 2.2439358144845464e-07, | |
| "loss": 0.08012324571609497, | |
| "memory(GiB)": 40.4, | |
| "step": 955, | |
| "token_acc": 0.9643423137876387, | |
| "train_speed(iter/s)": 0.077328 | |
| }, | |
| { | |
| "epoch": 1.7463592233009708, | |
| "grad_norm": 0.6456671953201294, | |
| "learning_rate": 2.09401070928012e-07, | |
| "loss": 0.06627861261367798, | |
| "memory(GiB)": 40.4, | |
| "step": 960, | |
| "token_acc": 0.9714285714285714, | |
| "train_speed(iter/s)": 0.077243 | |
| }, | |
| { | |
| "epoch": 1.7554611650485437, | |
| "grad_norm": 0.6002473831176758, | |
| "learning_rate": 1.9490498367066817e-07, | |
| "loss": 0.071403968334198, | |
| "memory(GiB)": 40.4, | |
| "step": 965, | |
| "token_acc": 0.9682791435368755, | |
| "train_speed(iter/s)": 0.077258 | |
| }, | |
| { | |
| "epoch": 1.7645631067961165, | |
| "grad_norm": 0.7518230080604553, | |
| "learning_rate": 1.8090846129796586e-07, | |
| "loss": 0.07573525905609131, | |
| "memory(GiB)": 40.4, | |
| "step": 970, | |
| "token_acc": 0.9722222222222222, | |
| "train_speed(iter/s)": 0.077252 | |
| }, | |
| { | |
| "epoch": 1.7736650485436893, | |
| "grad_norm": 0.41464531421661377, | |
| "learning_rate": 1.6741453716472677e-07, | |
| "loss": 0.07870721817016602, | |
| "memory(GiB)": 40.4, | |
| "step": 975, | |
| "token_acc": 0.9627870150435471, | |
| "train_speed(iter/s)": 0.077259 | |
| }, | |
| { | |
| "epoch": 1.7827669902912622, | |
| "grad_norm": 0.7254371643066406, | |
| "learning_rate": 1.5442613570165993e-07, | |
| "loss": 0.08646805882453919, | |
| "memory(GiB)": 40.4, | |
| "step": 980, | |
| "token_acc": 0.9611419508326725, | |
| "train_speed(iter/s)": 0.077274 | |
| }, | |
| { | |
| "epoch": 1.791868932038835, | |
| "grad_norm": 0.7164713740348816, | |
| "learning_rate": 1.4194607178157237e-07, | |
| "loss": 0.07055433988571166, | |
| "memory(GiB)": 40.4, | |
| "step": 985, | |
| "token_acc": 0.9706349206349206, | |
| "train_speed(iter/s)": 0.077341 | |
| }, | |
| { | |
| "epoch": 1.8009708737864076, | |
| "grad_norm": 0.5821430087089539, | |
| "learning_rate": 1.2997705010932394e-07, | |
| "loss": 0.07743188142776489, | |
| "memory(GiB)": 40.4, | |
| "step": 990, | |
| "token_acc": 0.9674861221252974, | |
| "train_speed(iter/s)": 0.077362 | |
| }, | |
| { | |
| "epoch": 1.8100728155339807, | |
| "grad_norm": 0.766345739364624, | |
| "learning_rate": 1.1852166463565767e-07, | |
| "loss": 0.07668507099151611, | |
| "memory(GiB)": 40.4, | |
| "step": 995, | |
| "token_acc": 0.9770023790642347, | |
| "train_speed(iter/s)": 0.077362 | |
| }, | |
| { | |
| "epoch": 1.8191747572815533, | |
| "grad_norm": 0.5379170179367065, | |
| "learning_rate": 1.0758239799503412e-07, | |
| "loss": 0.06778880357742309, | |
| "memory(GiB)": 40.4, | |
| "step": 1000, | |
| "token_acc": 0.9746233148295004, | |
| "train_speed(iter/s)": 0.077358 | |
| }, | |
| { | |
| "epoch": 1.8282766990291264, | |
| "grad_norm": 0.587326169013977, | |
| "learning_rate": 9.716162096759019e-08, | |
| "loss": 0.07784827947616577, | |
| "memory(GiB)": 40.4, | |
| "step": 1005, | |
| "token_acc": 0.9770206022187005, | |
| "train_speed(iter/s)": 0.077289 | |
| }, | |
| { | |
| "epoch": 1.837378640776699, | |
| "grad_norm": 0.5790999531745911, | |
| "learning_rate": 8.726159196533718e-08, | |
| "loss": 0.07364106178283691, | |
| "memory(GiB)": 40.4, | |
| "step": 1010, | |
| "token_acc": 0.9730372720063442, | |
| "train_speed(iter/s)": 0.077306 | |
| }, | |
| { | |
| "epoch": 1.8464805825242718, | |
| "grad_norm": 0.5765237808227539, | |
| "learning_rate": 7.788445654271532e-08, | |
| "loss": 0.07042239308357238, | |
| "memory(GiB)": 40.4, | |
| "step": 1015, | |
| "token_acc": 0.9682539682539683, | |
| "train_speed(iter/s)": 0.077338 | |
| }, | |
| { | |
| "epoch": 1.8555825242718447, | |
| "grad_norm": 0.4627252221107483, | |
| "learning_rate": 6.903224693160348e-08, | |
| "loss": 0.06837155222892762, | |
| "memory(GiB)": 40.4, | |
| "step": 1020, | |
| "token_acc": 0.9754358161648178, | |
| "train_speed(iter/s)": 0.077366 | |
| }, | |
| { | |
| "epoch": 1.8646844660194175, | |
| "grad_norm": 0.5963551998138428, | |
| "learning_rate": 6.070688160088961e-08, | |
| "loss": 0.0674078106880188, | |
| "memory(GiB)": 40.4, | |
| "step": 1025, | |
| "token_acc": 0.9659270998415214, | |
| "train_speed(iter/s)": 0.077385 | |
| }, | |
| { | |
| "epoch": 1.8737864077669903, | |
| "grad_norm": 0.6391610503196716, | |
| "learning_rate": 5.291016484069683e-08, | |
| "loss": 0.07277075052261353, | |
| "memory(GiB)": 40.4, | |
| "step": 1030, | |
| "token_acc": 0.9659540775930324, | |
| "train_speed(iter/s)": 0.077401 | |
| }, | |
| { | |
| "epoch": 1.882888349514563, | |
| "grad_norm": 0.5019727945327759, | |
| "learning_rate": 4.564378637135408e-08, | |
| "loss": 0.0752260446548462, | |
| "memory(GiB)": 40.4, | |
| "step": 1035, | |
| "token_acc": 0.9682791435368755, | |
| "train_speed(iter/s)": 0.077434 | |
| }, | |
| { | |
| "epoch": 1.891990291262136, | |
| "grad_norm": 0.4186345040798187, | |
| "learning_rate": 3.890932097719624e-08, | |
| "loss": 0.06725120544433594, | |
| "memory(GiB)": 40.4, | |
| "step": 1040, | |
| "token_acc": 0.9730799683293745, | |
| "train_speed(iter/s)": 0.077451 | |
| }, | |
| { | |
| "epoch": 1.9010922330097086, | |
| "grad_norm": 0.6359046697616577, | |
| "learning_rate": 3.270822816527325e-08, | |
| "loss": 0.07682465314865113, | |
| "memory(GiB)": 40.4, | |
| "step": 1045, | |
| "token_acc": 0.969047619047619, | |
| "train_speed(iter/s)": 0.077498 | |
| }, | |
| { | |
| "epoch": 1.9101941747572817, | |
| "grad_norm": 0.5813617706298828, | |
| "learning_rate": 2.7041851849043678e-08, | |
| "loss": 0.0773731827735901, | |
| "memory(GiB)": 40.4, | |
| "step": 1050, | |
| "token_acc": 0.9674861221252974, | |
| "train_speed(iter/s)": 0.077486 | |
| }, | |
| { | |
| "epoch": 1.9192961165048543, | |
| "grad_norm": 0.4645262062549591, | |
| "learning_rate": 2.1911420057117994e-08, | |
| "loss": 0.07277056574821472, | |
| "memory(GiB)": 40.4, | |
| "step": 1055, | |
| "token_acc": 0.9690721649484536, | |
| "train_speed(iter/s)": 0.077447 | |
| }, | |
| { | |
| "epoch": 1.9283980582524272, | |
| "grad_norm": 0.8828046917915344, | |
| "learning_rate": 1.7318044667119226e-08, | |
| "loss": 0.07312785387039185, | |
| "memory(GiB)": 40.4, | |
| "step": 1060, | |
| "token_acc": 0.9675118858954042, | |
| "train_speed(iter/s)": 0.077476 | |
| }, | |
| { | |
| "epoch": 1.9375, | |
| "grad_norm": 0.8438335657119751, | |
| "learning_rate": 1.3262721164712667e-08, | |
| "loss": 0.07410634756088257, | |
| "memory(GiB)": 40.4, | |
| "step": 1065, | |
| "token_acc": 0.9698651863600317, | |
| "train_speed(iter/s)": 0.077482 | |
| }, | |
| { | |
| "epoch": 1.9466019417475728, | |
| "grad_norm": 0.6822603344917297, | |
| "learning_rate": 9.746328427863993e-09, | |
| "loss": 0.0720213532447815, | |
| "memory(GiB)": 40.4, | |
| "step": 1070, | |
| "token_acc": 0.9666666666666667, | |
| "train_speed(iter/s)": 0.077488 | |
| }, | |
| { | |
| "epoch": 1.9557038834951457, | |
| "grad_norm": 0.5685479640960693, | |
| "learning_rate": 6.769628536364981e-09, | |
| "loss": 0.07333976030349731, | |
| "memory(GiB)": 40.4, | |
| "step": 1075, | |
| "token_acc": 0.973015873015873, | |
| "train_speed(iter/s)": 0.077502 | |
| }, | |
| { | |
| "epoch": 1.9648058252427183, | |
| "grad_norm": 0.5445531606674194, | |
| "learning_rate": 4.333266606676711e-09, | |
| "loss": 0.07253679037094116, | |
| "memory(GiB)": 40.4, | |
| "step": 1080, | |
| "token_acc": 0.9730586370839936, | |
| "train_speed(iter/s)": 0.077494 | |
| }, | |
| { | |
| "epoch": 1.9739077669902914, | |
| "grad_norm": 0.6113319993019104, | |
| "learning_rate": 2.4377706521164224e-09, | |
| "loss": 0.07309662699699401, | |
| "memory(GiB)": 40.4, | |
| "step": 1085, | |
| "token_acc": 0.9722222222222222, | |
| "train_speed(iter/s)": 0.077475 | |
| }, | |
| { | |
| "epoch": 1.983009708737864, | |
| "grad_norm": 0.5483999252319336, | |
| "learning_rate": 1.0835514684262583e-09, | |
| "loss": 0.07428893446922302, | |
| "memory(GiB)": 40.4, | |
| "step": 1090, | |
| "token_acc": 0.9690966719492868, | |
| "train_speed(iter/s)": 0.077464 | |
| }, | |
| { | |
| "epoch": 1.992111650485437, | |
| "grad_norm": 0.6084752082824707, | |
| "learning_rate": 2.7090254474421154e-10, | |
| "loss": 0.07023123502731324, | |
| "memory(GiB)": 40.4, | |
| "step": 1095, | |
| "token_acc": 0.9786223277909739, | |
| "train_speed(iter/s)": 0.077453 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.5853410363197327, | |
| "learning_rate": 0.0, | |
| "loss": 0.0724187433719635, | |
| "memory(GiB)": 40.4, | |
| "step": 1100, | |
| "token_acc": 0.9679780420860018, | |
| "train_speed(iter/s)": 0.077495 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1100, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.488531281539498e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |