{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.997163925127623,
  "eval_steps": 1000,
  "global_step": 4405,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.011344299489506523,
      "grad_norm": 2.3206300735473633,
      "learning_rate": 4.535147392290249e-06,
      "loss": 1.5929,
      "step": 10
    },
    {
      "epoch": 0.022688598979013045,
      "grad_norm": 1.2386493682861328,
      "learning_rate": 9.070294784580499e-06,
      "loss": 1.6159,
      "step": 20
    },
    {
      "epoch": 0.03403289846851957,
      "grad_norm": 1.1790252923965454,
      "learning_rate": 1.360544217687075e-05,
      "loss": 1.538,
      "step": 30
    },
    {
      "epoch": 0.04537719795802609,
      "grad_norm": 1.021796703338623,
      "learning_rate": 1.8140589569160997e-05,
      "loss": 1.4132,
      "step": 40
    },
    {
      "epoch": 0.05672149744753262,
      "grad_norm": 1.3392266035079956,
      "learning_rate": 2.267573696145125e-05,
      "loss": 1.2604,
      "step": 50
    },
    {
      "epoch": 0.06806579693703914,
      "grad_norm": 0.9446895122528076,
      "learning_rate": 2.72108843537415e-05,
      "loss": 1.1644,
      "step": 60
    },
    {
      "epoch": 0.07941009642654566,
      "grad_norm": 1.3290923833847046,
      "learning_rate": 3.1746031746031745e-05,
      "loss": 1.1082,
      "step": 70
    },
    {
      "epoch": 0.09075439591605218,
      "grad_norm": 1.5161434412002563,
      "learning_rate": 3.6281179138321995e-05,
      "loss": 1.0389,
      "step": 80
    },
    {
      "epoch": 0.1020986954055587,
      "grad_norm": 0.6483525633811951,
      "learning_rate": 4.0816326530612245e-05,
      "loss": 1.0542,
      "step": 90
    },
    {
      "epoch": 0.11344299489506524,
      "grad_norm": 0.8814989924430847,
      "learning_rate": 4.53514739229025e-05,
      "loss": 0.9847,
      "step": 100
    },
    {
      "epoch": 0.12478729438457176,
      "grad_norm": 0.7316718101501465,
      "learning_rate": 4.9886621315192745e-05,
      "loss": 1.0585,
      "step": 110
    },
    {
      "epoch": 0.13613159387407828,
      "grad_norm": 0.7645348310470581,
      "learning_rate": 5.4421768707483e-05,
      "loss": 0.9713,
      "step": 120
    },
    {
      "epoch": 0.1474758933635848,
      "grad_norm": 0.6830883622169495,
      "learning_rate": 5.895691609977324e-05,
      "loss": 0.9823,
      "step": 130
    },
    {
      "epoch": 0.15882019285309132,
      "grad_norm": 1.3199207782745361,
      "learning_rate": 6.349206349206349e-05,
      "loss": 0.9992,
      "step": 140
    },
    {
      "epoch": 0.17016449234259784,
      "grad_norm": 0.7770159840583801,
      "learning_rate": 6.802721088435374e-05,
      "loss": 1.0085,
      "step": 150
    },
    {
      "epoch": 0.18150879183210436,
      "grad_norm": 1.623410940170288,
      "learning_rate": 7.256235827664399e-05,
      "loss": 1.0491,
      "step": 160
    },
    {
      "epoch": 0.19285309132161088,
      "grad_norm": 2.8830106258392334,
      "learning_rate": 7.709750566893424e-05,
      "loss": 1.0686,
      "step": 170
    },
    {
      "epoch": 0.2041973908111174,
      "grad_norm": 1.3428577184677124,
      "learning_rate": 8.163265306122449e-05,
      "loss": 1.0359,
      "step": 180
    },
    {
      "epoch": 0.21554169030062392,
      "grad_norm": 0.8043076395988464,
      "learning_rate": 8.616780045351474e-05,
      "loss": 1.0496,
      "step": 190
    },
    {
      "epoch": 0.22688598979013047,
      "grad_norm": 1.8799352645874023,
      "learning_rate": 9.0702947845805e-05,
      "loss": 1.0284,
      "step": 200
    },
    {
      "epoch": 0.238230289279637,
      "grad_norm": 0.6667978167533875,
      "learning_rate": 9.523809523809524e-05,
      "loss": 1.0162,
      "step": 210
    },
    {
      "epoch": 0.2495745887691435,
      "grad_norm": 0.815127968788147,
      "learning_rate": 9.977324263038549e-05,
      "loss": 1.0009,
      "step": 220
    },
    {
      "epoch": 0.26091888825865,
      "grad_norm": 0.6558067798614502,
      "learning_rate": 0.00010430839002267574,
      "loss": 1.004,
      "step": 230
    },
    {
      "epoch": 0.27226318774815655,
      "grad_norm": 0.6002511382102966,
      "learning_rate": 0.000108843537414966,
      "loss": 0.9702,
      "step": 240
    },
    {
      "epoch": 0.28360748723766305,
      "grad_norm": 0.7007895708084106,
      "learning_rate": 0.00011337868480725624,
      "loss": 1.0266,
      "step": 250
    },
    {
      "epoch": 0.2949517867271696,
      "grad_norm": 0.7985921502113342,
      "learning_rate": 0.00011791383219954648,
      "loss": 0.9753,
      "step": 260
    },
    {
      "epoch": 0.30629608621667614,
      "grad_norm": 0.5343239903450012,
      "learning_rate": 0.00012244897959183676,
      "loss": 1.036,
      "step": 270
    },
    {
      "epoch": 0.31764038570618264,
      "grad_norm": 0.7095124125480652,
      "learning_rate": 0.00012698412698412698,
      "loss": 1.0061,
      "step": 280
    },
    {
      "epoch": 0.3289846851956892,
      "grad_norm": 0.8570685386657715,
      "learning_rate": 0.00013151927437641726,
      "loss": 0.9458,
      "step": 290
    },
    {
      "epoch": 0.3403289846851957,
      "grad_norm": 0.6379779577255249,
      "learning_rate": 0.00013605442176870748,
      "loss": 0.9965,
      "step": 300
    },
    {
      "epoch": 0.3516732841747022,
      "grad_norm": 0.9263567328453064,
      "learning_rate": 0.00014058956916099776,
      "loss": 0.9601,
      "step": 310
    },
    {
      "epoch": 0.3630175836642087,
      "grad_norm": 0.7343761920928955,
      "learning_rate": 0.00014512471655328798,
      "loss": 1.0182,
      "step": 320
    },
    {
      "epoch": 0.37436188315371527,
      "grad_norm": 0.588762640953064,
      "learning_rate": 0.00014965986394557826,
      "loss": 0.9762,
      "step": 330
    },
    {
      "epoch": 0.38570618264322176,
      "grad_norm": 0.6719630360603333,
      "learning_rate": 0.00015419501133786848,
      "loss": 0.989,
      "step": 340
    },
    {
      "epoch": 0.3970504821327283,
      "grad_norm": 1.641836166381836,
      "learning_rate": 0.00015873015873015873,
      "loss": 0.9611,
      "step": 350
    },
    {
      "epoch": 0.4083947816222348,
      "grad_norm": 0.9340532422065735,
      "learning_rate": 0.00016326530612244898,
      "loss": 0.9861,
      "step": 360
    },
    {
      "epoch": 0.41973908111174135,
      "grad_norm": 0.737554669380188,
      "learning_rate": 0.00016780045351473923,
      "loss": 1.0,
      "step": 370
    },
    {
      "epoch": 0.43108338060124785,
      "grad_norm": 1.1190237998962402,
      "learning_rate": 0.00017233560090702948,
      "loss": 1.016,
      "step": 380
    },
    {
      "epoch": 0.4424276800907544,
      "grad_norm": 0.7501509785652161,
      "learning_rate": 0.00017687074829931973,
      "loss": 0.9743,
      "step": 390
    },
    {
      "epoch": 0.45377197958026094,
      "grad_norm": 0.5105754733085632,
      "learning_rate": 0.00018140589569161,
      "loss": 1.0182,
      "step": 400
    },
    {
      "epoch": 0.46511627906976744,
      "grad_norm": 0.7148075699806213,
      "learning_rate": 0.00018594104308390023,
      "loss": 0.9673,
      "step": 410
    },
    {
      "epoch": 0.476460578559274,
      "grad_norm": 0.49944302439689636,
      "learning_rate": 0.00019047619047619048,
      "loss": 1.0083,
      "step": 420
    },
    {
      "epoch": 0.4878048780487805,
      "grad_norm": 0.5624661445617676,
      "learning_rate": 0.00019501133786848073,
      "loss": 1.0201,
      "step": 430
    },
    {
      "epoch": 0.499149177538287,
      "grad_norm": 0.5779452919960022,
      "learning_rate": 0.00019954648526077098,
      "loss": 1.0165,
      "step": 440
    },
    {
      "epoch": 0.5104934770277936,
      "grad_norm": 0.8505494594573975,
      "learning_rate": 0.0001999974561843451,
      "loss": 0.9527,
      "step": 450
    },
    {
      "epoch": 0.5218377765173,
      "grad_norm": 0.7141993641853333,
      "learning_rate": 0.00019998866291366877,
      "loss": 0.9927,
      "step": 460
    },
    {
      "epoch": 0.5331820760068066,
      "grad_norm": 0.5913094282150269,
      "learning_rate": 0.0001999735893350151,
      "loss": 1.0054,
      "step": 470
    },
    {
      "epoch": 0.5445263754963131,
      "grad_norm": 0.5813531279563904,
      "learning_rate": 0.00019995223639515864,
      "loss": 0.9511,
      "step": 480
    },
    {
      "epoch": 0.5558706749858197,
      "grad_norm": 0.9083317518234253,
      "learning_rate": 0.0001999246054352818,
      "loss": 0.9596,
      "step": 490
    },
    {
      "epoch": 0.5672149744753261,
      "grad_norm": 0.8444753885269165,
      "learning_rate": 0.00019989069819089067,
      "loss": 1.0163,
      "step": 500
    },
    {
      "epoch": 0.5785592739648326,
      "grad_norm": 0.6896610856056213,
      "learning_rate": 0.0001998505167917061,
      "loss": 0.9606,
      "step": 510
    },
    {
      "epoch": 0.5899035734543392,
      "grad_norm": 0.7446523308753967,
      "learning_rate": 0.00019980406376152984,
      "loss": 0.9748,
      "step": 520
    },
    {
      "epoch": 0.6012478729438457,
      "grad_norm": 0.5111407041549683,
      "learning_rate": 0.00019975134201808605,
      "loss": 0.9364,
      "step": 530
    },
    {
      "epoch": 0.6125921724333523,
      "grad_norm": 0.6797256469726562,
      "learning_rate": 0.000199692354872838,
      "loss": 0.9766,
      "step": 540
    },
    {
      "epoch": 0.6239364719228587,
      "grad_norm": 0.9774245619773865,
      "learning_rate": 0.00019962710603078007,
      "loss": 0.9669,
      "step": 550
    },
    {
      "epoch": 0.6352807714123653,
      "grad_norm": 0.7039481997489929,
      "learning_rate": 0.0001995555995902052,
      "loss": 0.9371,
      "step": 560
    },
    {
      "epoch": 0.6466250709018718,
      "grad_norm": 0.7363829016685486,
      "learning_rate": 0.0001994778400424472,
      "loss": 0.9809,
      "step": 570
    },
    {
      "epoch": 0.6579693703913784,
      "grad_norm": 0.7072857022285461,
      "learning_rate": 0.0001993938322715989,
      "loss": 0.9825,
      "step": 580
    },
    {
      "epoch": 0.6693136698808848,
      "grad_norm": 0.5628974437713623,
      "learning_rate": 0.00019930358155420525,
      "loss": 0.9101,
      "step": 590
    },
    {
      "epoch": 0.6806579693703914,
      "grad_norm": 0.6564317345619202,
      "learning_rate": 0.0001992070935589319,
      "loss": 1.0374,
      "step": 600
    },
    {
      "epoch": 0.6920022688598979,
      "grad_norm": 0.5805884599685669,
      "learning_rate": 0.0001991043743462092,
      "loss": 0.9695,
      "step": 610
    },
    {
      "epoch": 0.7033465683494045,
      "grad_norm": 0.5671830773353577,
      "learning_rate": 0.00019899543036785145,
      "loss": 0.9598,
      "step": 620
    },
    {
      "epoch": 0.7146908678389109,
      "grad_norm": 0.54367595911026,
      "learning_rate": 0.0001988802684666519,
      "loss": 0.962,
      "step": 630
    },
    {
      "epoch": 0.7260351673284174,
      "grad_norm": 0.6982467770576477,
      "learning_rate": 0.00019875889587595252,
      "loss": 0.9633,
      "step": 640
    },
    {
      "epoch": 0.737379466817924,
      "grad_norm": 0.6268488764762878,
      "learning_rate": 0.00019863132021919025,
      "loss": 0.9684,
      "step": 650
    },
    {
      "epoch": 0.7487237663074305,
      "grad_norm": 1.2111632823944092,
      "learning_rate": 0.00019849754950941758,
      "loss": 1.0044,
      "step": 660
    },
    {
      "epoch": 0.7600680657969371,
      "grad_norm": 0.6442829370498657,
      "learning_rate": 0.00019835759214879964,
      "loss": 0.9533,
      "step": 670
    },
    {
      "epoch": 0.7714123652864435,
      "grad_norm": 0.5263229608535767,
      "learning_rate": 0.00019821145692808633,
      "loss": 0.959,
      "step": 680
    },
    {
      "epoch": 0.7827566647759501,
      "grad_norm": 0.572928786277771,
      "learning_rate": 0.00019805915302606016,
      "loss": 0.9473,
      "step": 690
    },
    {
      "epoch": 0.7941009642654566,
      "grad_norm": 0.6176092624664307,
      "learning_rate": 0.00019790069000895987,
      "loss": 0.9164,
      "step": 700
    },
    {
      "epoch": 0.8054452637549632,
      "grad_norm": 0.5628384351730347,
      "learning_rate": 0.00019773607782987924,
      "loss": 0.9705,
      "step": 710
    },
    {
      "epoch": 0.8167895632444696,
      "grad_norm": 0.8331648111343384,
      "learning_rate": 0.00019756532682814232,
      "loss": 0.9497,
      "step": 720
    },
    {
      "epoch": 0.8281338627339762,
      "grad_norm": 0.5843848586082458,
      "learning_rate": 0.00019738844772865377,
      "loss": 0.9828,
      "step": 730
    },
    {
      "epoch": 0.8394781622234827,
      "grad_norm": 0.6603434681892395,
      "learning_rate": 0.0001972054516412253,
      "loss": 0.9717,
      "step": 740
    },
    {
      "epoch": 0.8508224617129893,
      "grad_norm": 0.5622076988220215,
      "learning_rate": 0.00019701635005987792,
      "loss": 0.9392,
      "step": 750
    },
    {
      "epoch": 0.8621667612024957,
      "grad_norm": 0.8947564959526062,
      "learning_rate": 0.00019682115486211984,
      "loss": 0.9917,
      "step": 760
    },
    {
      "epoch": 0.8735110606920022,
      "grad_norm": 0.5935038328170776,
      "learning_rate": 0.00019661987830820065,
      "loss": 0.9749,
      "step": 770
    },
    {
      "epoch": 0.8848553601815088,
      "grad_norm": 0.8751797676086426,
      "learning_rate": 0.000196412533040341,
      "loss": 0.9828,
      "step": 780
    },
    {
      "epoch": 0.8961996596710153,
      "grad_norm": 0.5279515981674194,
      "learning_rate": 0.00019619913208193882,
      "loss": 0.9685,
      "step": 790
    },
    {
      "epoch": 0.9075439591605219,
      "grad_norm": 0.643695056438446,
      "learning_rate": 0.00019597968883675116,
      "loss": 0.9547,
      "step": 800
    },
    {
      "epoch": 0.9188882586500283,
      "grad_norm": 0.7370747923851013,
      "learning_rate": 0.00019575421708805215,
      "loss": 0.9129,
      "step": 810
    },
    {
      "epoch": 0.9302325581395349,
      "grad_norm": 0.7514728307723999,
      "learning_rate": 0.0001955227309977677,
      "loss": 0.9929,
      "step": 820
    },
    {
      "epoch": 0.9415768576290414,
      "grad_norm": 0.6589088439941406,
      "learning_rate": 0.00019528524510558547,
      "loss": 0.9627,
      "step": 830
    },
    {
      "epoch": 0.952921157118548,
      "grad_norm": 0.548102617263794,
      "learning_rate": 0.00019504177432804203,
      "loss": 0.9307,
      "step": 840
    },
    {
      "epoch": 0.9642654566080544,
      "grad_norm": 0.458879679441452,
      "learning_rate": 0.00019479233395758576,
      "loss": 0.9838,
      "step": 850
    },
    {
      "epoch": 0.975609756097561,
      "grad_norm": 0.9955594539642334,
      "learning_rate": 0.0001945369396616164,
      "loss": 0.9246,
      "step": 860
    },
    {
      "epoch": 0.9869540555870675,
      "grad_norm": 0.5781052708625793,
      "learning_rate": 0.0001942756074815009,
      "loss": 1.0076,
      "step": 870
    },
    {
      "epoch": 0.998298355076574,
      "grad_norm": 0.7370733022689819,
      "learning_rate": 0.00019400835383156592,
      "loss": 0.9618,
      "step": 880
    },
    {
      "epoch": 1.0096426545660806,
      "grad_norm": 0.6173350214958191,
      "learning_rate": 0.00019373519549806682,
      "loss": 0.872,
      "step": 890
    },
    {
      "epoch": 1.0209869540555871,
      "grad_norm": 0.6110262274742126,
      "learning_rate": 0.00019345614963813334,
      "loss": 0.8953,
      "step": 900
    },
    {
      "epoch": 1.0323312535450937,
      "grad_norm": 0.8880902528762817,
      "learning_rate": 0.00019317123377869192,
      "loss": 0.8847,
      "step": 910
    },
    {
      "epoch": 1.0436755530346,
      "grad_norm": 0.6907595992088318,
      "learning_rate": 0.00019288046581536486,
      "loss": 0.8878,
      "step": 920
    },
    {
      "epoch": 1.0550198525241066,
      "grad_norm": 0.7469139695167542,
      "learning_rate": 0.00019258386401134624,
      "loss": 0.9018,
      "step": 930
    },
    {
      "epoch": 1.0663641520136131,
      "grad_norm": 0.8650104403495789,
      "learning_rate": 0.0001922814469962549,
      "loss": 0.8825,
      "step": 940
    },
    {
      "epoch": 1.0777084515031197,
      "grad_norm": 1.1437135934829712,
      "learning_rate": 0.00019197323376496427,
      "loss": 0.8977,
      "step": 950
    },
    {
      "epoch": 1.0890527509926262,
      "grad_norm": 0.6191611289978027,
      "learning_rate": 0.00019165924367640916,
      "loss": 0.9059,
      "step": 960
    },
    {
      "epoch": 1.1003970504821328,
      "grad_norm": 0.7402692437171936,
      "learning_rate": 0.00019133949645237005,
      "loss": 0.8778,
      "step": 970
    },
    {
      "epoch": 1.1117413499716393,
      "grad_norm": 0.7002813220024109,
      "learning_rate": 0.00019101401217623426,
      "loss": 0.9281,
      "step": 980
    },
    {
      "epoch": 1.1230856494611459,
      "grad_norm": 0.9000174403190613,
      "learning_rate": 0.00019068281129173444,
      "loss": 0.8795,
      "step": 990
    },
    {
      "epoch": 1.1344299489506522,
      "grad_norm": 0.6749204993247986,
      "learning_rate": 0.00019034591460166463,
      "loss": 0.9091,
      "step": 1000
    },
    {
      "epoch": 1.1344299489506522,
      "eval_loss": 0.8940885663032532,
      "eval_runtime": 15.7869,
      "eval_samples_per_second": 94.065,
      "eval_steps_per_second": 11.782,
      "step": 1000
    },
    {
      "epoch": 1.1457742484401587,
      "grad_norm": 0.7294667959213257,
      "learning_rate": 0.00019000334326657345,
      "loss": 0.879,
      "step": 1010
    },
    {
      "epoch": 1.1571185479296653,
      "grad_norm": 0.9591787457466125,
      "learning_rate": 0.00018965511880343527,
      "loss": 0.9264,
      "step": 1020
    },
    {
      "epoch": 1.1684628474191718,
      "grad_norm": 0.9575808644294739,
      "learning_rate": 0.00018930126308429844,
      "loss": 0.8825,
      "step": 1030
    },
    {
      "epoch": 1.1798071469086784,
      "grad_norm": 0.49267736077308655,
      "learning_rate": 0.00018894179833491164,
      "loss": 0.9321,
      "step": 1040
    },
    {
      "epoch": 1.191151446398185,
      "grad_norm": 0.848102867603302,
      "learning_rate": 0.00018857674713332795,
      "loss": 0.8543,
      "step": 1050
    },
    {
      "epoch": 1.2024957458876915,
      "grad_norm": 0.7710912227630615,
      "learning_rate": 0.00018820613240848655,
      "loss": 0.9468,
      "step": 1060
    },
    {
      "epoch": 1.213840045377198,
      "grad_norm": 0.6399308443069458,
      "learning_rate": 0.00018782997743877264,
      "loss": 0.9081,
      "step": 1070
    },
    {
      "epoch": 1.2251843448667046,
      "grad_norm": 0.9124737977981567,
      "learning_rate": 0.00018744830585055538,
      "loss": 0.9288,
      "step": 1080
    },
    {
      "epoch": 1.236528644356211,
      "grad_norm": 0.6313666105270386,
      "learning_rate": 0.00018706114161670377,
      "loss": 0.8197,
      "step": 1090
    },
    {
      "epoch": 1.2478729438457175,
      "grad_norm": 0.7220073938369751,
      "learning_rate": 0.000186668509055081,
      "loss": 0.8576,
      "step": 1100
    },
    {
      "epoch": 1.259217243335224,
      "grad_norm": 1.1808422803878784,
      "learning_rate": 0.00018627043282701703,
      "loss": 0.9044,
      "step": 1110
    },
    {
      "epoch": 1.2705615428247305,
      "grad_norm": 0.6578934788703918,
      "learning_rate": 0.00018586693793575966,
      "loss": 0.9015,
      "step": 1120
    },
    {
      "epoch": 1.281905842314237,
      "grad_norm": 0.9080325961112976,
      "learning_rate": 0.0001854580497249039,
      "loss": 0.8919,
      "step": 1130
    },
    {
      "epoch": 1.2932501418037436,
      "grad_norm": 0.6446923017501831,
      "learning_rate": 0.00018504379387680034,
      "loss": 0.9033,
      "step": 1140
    },
    {
      "epoch": 1.3045944412932502,
      "grad_norm": 0.6877492070198059,
      "learning_rate": 0.00018462419641094189,
      "loss": 0.8843,
      "step": 1150
    },
    {
      "epoch": 1.3159387407827567,
      "grad_norm": 0.6565636396408081,
      "learning_rate": 0.00018419928368232957,
      "loss": 0.8925,
      "step": 1160
    },
    {
      "epoch": 1.3272830402722633,
      "grad_norm": 0.8198230862617493,
      "learning_rate": 0.0001837690823798171,
      "loss": 0.8495,
      "step": 1170
    },
    {
      "epoch": 1.3386273397617696,
      "grad_norm": 0.7579399347305298,
      "learning_rate": 0.00018333361952443462,
      "loss": 0.9051,
      "step": 1180
    },
    {
      "epoch": 1.3499716392512762,
      "grad_norm": 0.8067922592163086,
      "learning_rate": 0.0001828929224676914,
      "loss": 0.8677,
      "step": 1190
    },
    {
      "epoch": 1.3613159387407827,
      "grad_norm": 0.7077610492706299,
      "learning_rate": 0.00018244701888985802,
      "loss": 0.942,
      "step": 1200
    },
    {
      "epoch": 1.3726602382302893,
      "grad_norm": 1.2009291648864746,
      "learning_rate": 0.00018199593679822765,
      "loss": 0.9034,
      "step": 1210
    },
    {
      "epoch": 1.3840045377197958,
      "grad_norm": 0.8162534832954407,
      "learning_rate": 0.00018153970452535698,
      "loss": 0.8904,
      "step": 1220
    },
    {
      "epoch": 1.3953488372093024,
      "grad_norm": 0.6332406401634216,
      "learning_rate": 0.00018107835072728656,
      "loss": 0.8637,
      "step": 1230
    },
    {
      "epoch": 1.406693136698809,
      "grad_norm": 0.6449089050292969,
      "learning_rate": 0.00018061190438174105,
      "loss": 0.9463,
      "step": 1240
    },
    {
      "epoch": 1.4180374361883152,
      "grad_norm": 0.6543394327163696,
      "learning_rate": 0.00018014039478630894,
      "loss": 0.8497,
      "step": 1250
    },
    {
      "epoch": 1.429381735677822,
      "grad_norm": 0.7993437647819519,
      "learning_rate": 0.0001796638515566025,
      "loss": 0.9415,
      "step": 1260
    },
    {
      "epoch": 1.4407260351673283,
      "grad_norm": 0.878514289855957,
      "learning_rate": 0.0001791823046243977,
      "loss": 0.9143,
      "step": 1270
    },
    {
      "epoch": 1.4520703346568349,
      "grad_norm": 0.6794580817222595,
      "learning_rate": 0.00017869578423575387,
      "loss": 0.9041,
      "step": 1280
    },
    {
      "epoch": 1.4634146341463414,
      "grad_norm": 0.9009565711021423,
      "learning_rate": 0.00017820432094911427,
      "loss": 0.8773,
      "step": 1290
    },
    {
      "epoch": 1.474758933635848,
      "grad_norm": 0.6419825553894043,
      "learning_rate": 0.00017770794563338647,
      "loss": 0.9027,
      "step": 1300
    },
    {
      "epoch": 1.4861032331253545,
      "grad_norm": 0.7277469635009766,
      "learning_rate": 0.0001772066894660037,
      "loss": 0.9123,
      "step": 1310
    },
    {
      "epoch": 1.497447532614861,
      "grad_norm": 0.7514845132827759,
      "learning_rate": 0.00017670058393096634,
      "loss": 0.9095,
      "step": 1320
    },
    {
      "epoch": 1.5087918321043676,
      "grad_norm": 0.5530194044113159,
      "learning_rate": 0.0001761896608168646,
      "loss": 0.855,
      "step": 1330
    },
    {
      "epoch": 1.520136131593874,
      "grad_norm": 0.6379088759422302,
      "learning_rate": 0.0001756739522148818,
      "loss": 0.9485,
      "step": 1340
    },
    {
      "epoch": 1.5314804310833807,
      "grad_norm": 0.5411556959152222,
      "learning_rate": 0.0001751534905167787,
      "loss": 0.951,
      "step": 1350
    },
    {
      "epoch": 1.542824730572887,
      "grad_norm": 0.9241764545440674,
      "learning_rate": 0.00017462830841285894,
      "loss": 0.8459,
      "step": 1360
    },
    {
      "epoch": 1.5541690300623936,
      "grad_norm": 0.9029989242553711,
      "learning_rate": 0.00017409843888991584,
      "loss": 0.9045,
      "step": 1370
    },
    {
      "epoch": 1.5655133295519001,
      "grad_norm": 0.9002951979637146,
      "learning_rate": 0.00017356391522916042,
      "loss": 0.8388,
      "step": 1380
    },
    {
      "epoch": 1.5768576290414067,
      "grad_norm": 0.6322818994522095,
      "learning_rate": 0.0001730247710041311,
      "loss": 0.8937,
      "step": 1390
    },
    {
      "epoch": 1.5882019285309132,
      "grad_norm": 0.9197801351547241,
      "learning_rate": 0.00017248104007858476,
      "loss": 0.8656,
      "step": 1400
    },
    {
      "epoch": 1.5995462280204198,
      "grad_norm": 0.7498595714569092,
      "learning_rate": 0.00017193275660436997,
      "loss": 0.8848,
      "step": 1410
    },
    {
      "epoch": 1.6108905275099263,
      "grad_norm": 1.0003221035003662,
      "learning_rate": 0.00017137995501928166,
      "loss": 0.8494,
      "step": 1420
    },
    {
      "epoch": 1.6222348269994327,
      "grad_norm": 0.6622512340545654,
      "learning_rate": 0.00017082267004489842,
      "loss": 0.9158,
      "step": 1430
    },
    {
      "epoch": 1.6335791264889394,
      "grad_norm": 1.2562657594680786,
      "learning_rate": 0.00017026093668440114,
      "loss": 0.8899,
      "step": 1440
    },
    {
      "epoch": 1.6449234259784458,
      "grad_norm": 0.5380372405052185,
      "learning_rate": 0.00016969479022037502,
      "loss": 0.9082,
      "step": 1450
    },
    {
      "epoch": 1.6562677254679523,
      "grad_norm": 0.7120011448860168,
      "learning_rate": 0.00016912426621259297,
      "loss": 0.8456,
      "step": 1460
    },
    {
      "epoch": 1.6676120249574589,
      "grad_norm": 0.580111026763916,
      "learning_rate": 0.0001685494004957824,
      "loss": 0.9272,
      "step": 1470
    },
    {
      "epoch": 1.6789563244469654,
      "grad_norm": 0.9516561627388,
      "learning_rate": 0.0001679702291773743,
      "loss": 0.906,
      "step": 1480
    },
    {
      "epoch": 1.690300623936472,
      "grad_norm": 0.5973901152610779,
      "learning_rate": 0.0001673867886352354,
      "loss": 0.931,
      "step": 1490
    },
    {
      "epoch": 1.7016449234259783,
      "grad_norm": 0.7292883992195129,
      "learning_rate": 0.00016679911551538317,
      "loss": 0.8848,
      "step": 1500
    },
    {
      "epoch": 1.712989222915485,
      "grad_norm": 0.6363751888275146,
      "learning_rate": 0.0001662072467296842,
      "loss": 0.9059,
      "step": 1510
    },
    {
      "epoch": 1.7243335224049914,
      "grad_norm": 0.9236806631088257,
      "learning_rate": 0.00016561121945353566,
      "loss": 0.8557,
      "step": 1520
    },
    {
      "epoch": 1.7356778218944982,
      "grad_norm": 0.6865366697311401,
      "learning_rate": 0.00016501107112353028,
      "loss": 0.9264,
      "step": 1530
    },
    {
      "epoch": 1.7470221213840045,
      "grad_norm": 0.6749486923217773,
      "learning_rate": 0.00016440683943510516,
      "loss": 0.9224,
      "step": 1540
    },
    {
      "epoch": 1.758366420873511,
      "grad_norm": 0.7539329528808594,
      "learning_rate": 0.00016379856234017382,
      "loss": 0.8594,
      "step": 1550
    },
    {
      "epoch": 1.7697107203630176,
      "grad_norm": 0.6702885031700134,
      "learning_rate": 0.0001631862780447426,
      "loss": 0.8896,
      "step": 1560
    },
    {
      "epoch": 1.7810550198525241,
      "grad_norm": 0.6152791976928711,
      "learning_rate": 0.00016257002500651098,
      "loss": 0.8738,
      "step": 1570
    },
    {
      "epoch": 1.7923993193420307,
      "grad_norm": 0.5736550688743591,
      "learning_rate": 0.00016194984193245587,
      "loss": 0.9018,
      "step": 1580
    },
    {
      "epoch": 1.803743618831537,
      "grad_norm": 0.751157820224762,
      "learning_rate": 0.00016132576777640067,
      "loss": 0.8605,
      "step": 1590
    },
    {
      "epoch": 1.8150879183210438,
      "grad_norm": 0.6626732349395752,
      "learning_rate": 0.0001606978417365682,
      "loss": 0.8857,
      "step": 1600
    },
    {
      "epoch": 1.82643221781055,
      "grad_norm": 0.584065318107605,
      "learning_rate": 0.00016006610325311908,
      "loss": 0.9104,
      "step": 1610
    },
    {
      "epoch": 1.8377765173000569,
      "grad_norm": 0.5933496356010437,
      "learning_rate": 0.0001594305920056742,
      "loss": 0.8167,
      "step": 1620
    },
    {
      "epoch": 1.8491208167895632,
      "grad_norm": 0.5618401765823364,
      "learning_rate": 0.00015879134791082247,
      "loss": 0.8907,
      "step": 1630
    },
    {
      "epoch": 1.8604651162790697,
      "grad_norm": 0.9804329872131348,
      "learning_rate": 0.00015814841111961374,
      "loss": 0.9494,
      "step": 1640
    },
    {
      "epoch": 1.8718094157685763,
      "grad_norm": 0.937347412109375,
      "learning_rate": 0.00015750182201503682,
      "loss": 0.9045,
      "step": 1650
    },
    {
      "epoch": 1.8831537152580828,
      "grad_norm": 0.8898664712905884,
      "learning_rate": 0.00015685162120948317,
      "loss": 0.9346,
      "step": 1660
    },
    {
      "epoch": 1.8944980147475894,
      "grad_norm": 0.8580901622772217,
      "learning_rate": 0.00015619784954219577,
      "loss": 0.9412,
      "step": 1670
    },
    {
      "epoch": 1.9058423142370957,
      "grad_norm": 0.6913225054740906,
      "learning_rate": 0.00015554054807670418,
      "loss": 0.9006,
      "step": 1680
    },
    {
      "epoch": 1.9171866137266025,
      "grad_norm": 0.7101637125015259,
      "learning_rate": 0.00015487975809824539,
      "loss": 0.8857,
      "step": 1690
    },
    {
      "epoch": 1.9285309132161088,
      "grad_norm": 0.8228437900543213,
      "learning_rate": 0.00015421552111117044,
      "loss": 0.8607,
      "step": 1700
    },
    {
      "epoch": 1.9398752127056156,
      "grad_norm": 0.5591906905174255,
      "learning_rate": 0.00015354787883633782,
      "loss": 0.8674,
      "step": 1710
    },
    {
      "epoch": 1.951219512195122,
      "grad_norm": 0.6841379404067993,
      "learning_rate": 0.00015287687320849271,
      "loss": 0.8387,
      "step": 1720
    },
    {
      "epoch": 1.9625638116846285,
      "grad_norm": 0.8344857096672058,
      "learning_rate": 0.00015220254637363318,
      "loss": 0.9227,
      "step": 1730
    },
    {
      "epoch": 1.973908111174135,
      "grad_norm": 0.8986241221427917,
      "learning_rate": 0.00015152494068636308,
      "loss": 0.8917,
      "step": 1740
    },
    {
      "epoch": 1.9852524106636416,
      "grad_norm": 0.5783970952033997,
      "learning_rate": 0.00015084409870723154,
      "loss": 0.872,
      "step": 1750
    },
    {
      "epoch": 1.996596710153148,
      "grad_norm": 0.6369901895523071,
      "learning_rate": 0.00015016006320005986,
      "loss": 0.9132,
      "step": 1760
    },
    {
      "epoch": 2.0079410096426544,
      "grad_norm": 0.5906355381011963,
      "learning_rate": 0.00014947287712925545,
      "loss": 0.8074,
      "step": 1770
    },
    {
      "epoch": 2.019285309132161,
      "grad_norm": 0.6774492263793945,
      "learning_rate": 0.00014878258365711334,
      "loss": 0.759,
      "step": 1780
    },
    {
      "epoch": 2.0306296086216675,
      "grad_norm": 0.8353272676467896,
      "learning_rate": 0.00014808922614110493,
      "loss": 0.8028,
      "step": 1790
    },
    {
      "epoch": 2.0419739081111743,
      "grad_norm": 0.8876771926879883,
      "learning_rate": 0.00014739284813115498,
      "loss": 0.7302,
      "step": 1800
    },
    {
      "epoch": 2.0533182076006806,
      "grad_norm": 0.6215524673461914,
      "learning_rate": 0.00014669349336690594,
      "loss": 0.7759,
      "step": 1810
    },
    {
      "epoch": 2.0646625070901874,
      "grad_norm": 0.5663015246391296,
      "learning_rate": 0.00014599120577497087,
      "loss": 0.7834,
      "step": 1820
    },
    {
      "epoch": 2.0760068065796937,
      "grad_norm": 0.6096060872077942,
      "learning_rate": 0.00014528602946617432,
      "loss": 0.8364,
      "step": 1830
    },
    {
      "epoch": 2.0873511060692,
      "grad_norm": 0.7625316977500916,
      "learning_rate": 0.00014457800873278172,
      "loss": 0.7558,
      "step": 1840
    },
    {
      "epoch": 2.098695405558707,
      "grad_norm": 0.6301640272140503,
      "learning_rate": 0.0001438671880457174,
      "loss": 0.8297,
      "step": 1850
    },
    {
      "epoch": 2.110039705048213,
      "grad_norm": 0.6493074297904968,
      "learning_rate": 0.00014315361205177127,
      "loss": 0.7764,
      "step": 1860
    },
    {
      "epoch": 2.12138400453772,
      "grad_norm": 0.8326807618141174,
      "learning_rate": 0.0001424373255707947,
      "loss": 0.7895,
      "step": 1870
    },
    {
      "epoch": 2.1327283040272262,
      "grad_norm": 1.0578484535217285,
      "learning_rate": 0.00014171837359288524,
      "loss": 0.7889,
      "step": 1880
    },
    {
      "epoch": 2.144072603516733,
      "grad_norm": 0.6812543272972107,
      "learning_rate": 0.0001409968012755609,
      "loss": 0.7643,
      "step": 1890
    },
    {
      "epoch": 2.1554169030062393,
      "grad_norm": 0.8412303924560547,
      "learning_rate": 0.00014027265394092364,
      "loss": 0.7402,
      "step": 1900
    },
    {
      "epoch": 2.1667612024957457,
      "grad_norm": 0.947846531867981,
      "learning_rate": 0.00013954597707281288,
      "loss": 0.7763,
      "step": 1910
    },
    {
      "epoch": 2.1781055019852524,
      "grad_norm": 0.7577157616615295,
      "learning_rate": 0.00013881681631394842,
      "loss": 0.8334,
      "step": 1920
    },
    {
      "epoch": 2.1894498014747588,
      "grad_norm": 0.6362768411636353,
      "learning_rate": 0.0001380852174630639,
      "loss": 0.7484,
      "step": 1930
    },
    {
      "epoch": 2.2007941009642655,
      "grad_norm": 0.7967275381088257,
      "learning_rate": 0.00013735122647202984,
      "loss": 0.7302,
      "step": 1940
    },
    {
      "epoch": 2.212138400453772,
      "grad_norm": 0.7726805210113525,
      "learning_rate": 0.0001366148894429677,
      "loss": 0.7836,
      "step": 1950
    },
    {
      "epoch": 2.2234826999432786,
      "grad_norm": 0.7741623520851135,
      "learning_rate": 0.00013587625262535396,
      "loss": 0.7925,
      "step": 1960
    },
    {
      "epoch": 2.234826999432785,
      "grad_norm": 0.7582458257675171,
      "learning_rate": 0.0001351353624131153,
      "loss": 0.7765,
      "step": 1970
    },
    {
      "epoch": 2.2461712989222917,
      "grad_norm": 0.8276723027229309,
      "learning_rate": 0.00013439226534171463,
      "loss": 0.81,
      "step": 1980
    },
    {
      "epoch": 2.257515598411798,
      "grad_norm": 0.8419069051742554,
      "learning_rate": 0.00013364700808522807,
      "loss": 0.7464,
      "step": 1990
    },
    {
      "epoch": 2.2688598979013044,
      "grad_norm": 0.7446946501731873,
      "learning_rate": 0.00013289963745341345,
      "loss": 0.7524,
      "step": 2000
    },
    {
      "epoch": 2.2688598979013044,
      "eval_loss": 0.9066722989082336,
      "eval_runtime": 15.6396,
      "eval_samples_per_second": 94.951,
      "eval_steps_per_second": 11.893,
      "step": 2000
    },
    {
      "epoch": 2.280204197390811,
      "grad_norm": 0.7091513872146606,
      "learning_rate": 0.00013215020038877002,
      "loss": 0.7806,
      "step": 2010
    },
    {
      "epoch": 2.2915484968803175,
      "grad_norm": 0.5853792428970337,
      "learning_rate": 0.0001313987439635902,
      "loss": 0.7625,
      "step": 2020
    },
    {
      "epoch": 2.3028927963698242,
      "grad_norm": 0.7464004158973694,
      "learning_rate": 0.00013064531537700284,
      "loss": 0.7313,
      "step": 2030
    },
    {
      "epoch": 2.3142370958593306,
      "grad_norm": 0.6370956301689148,
      "learning_rate": 0.00012988996195200858,
      "loss": 0.7903,
      "step": 2040
    },
    {
      "epoch": 2.3255813953488373,
      "grad_norm": 0.8973234295845032,
      "learning_rate": 0.0001291327311325076,
      "loss": 0.7537,
      "step": 2050
    },
    {
      "epoch": 2.3369256948383437,
      "grad_norm": 1.206678032875061,
      "learning_rate": 0.00012837367048031955,
      "loss": 0.8081,
      "step": 2060
    },
    {
      "epoch": 2.3482699943278504,
      "grad_norm": 0.9258993864059448,
      "learning_rate": 0.0001276128276721963,
      "loss": 0.7754,
      "step": 2070
    },
    {
      "epoch": 2.3596142938173568,
      "grad_norm": 0.8008835315704346,
      "learning_rate": 0.00012685025049682732,
      "loss": 0.8119,
      "step": 2080
    },
    {
      "epoch": 2.370958593306863,
      "grad_norm": 0.8094901442527771,
      "learning_rate": 0.0001260859868518379,
      "loss": 0.7889,
      "step": 2090
    },
    {
      "epoch": 2.38230289279637,
      "grad_norm": 0.7824433445930481,
      "learning_rate": 0.00012532008474078093,
      "loss": 0.8443,
      "step": 2100
    },
    {
      "epoch": 2.393647192285876,
      "grad_norm": 0.8314623236656189,
      "learning_rate": 0.00012455259227012172,
      "loss": 0.8009,
      "step": 2110
    },
    {
      "epoch": 2.404991491775383,
      "grad_norm": 0.993483304977417,
      "learning_rate": 0.0001237835576462163,
      "loss": 0.803,
      "step": 2120
    },
    {
      "epoch": 2.4163357912648893,
      "grad_norm": 0.7922090291976929,
      "learning_rate": 0.00012301302917228364,
      "loss": 0.7785,
      "step": 2130
    },
    {
      "epoch": 2.427680090754396,
      "grad_norm": 0.8681336045265198,
      "learning_rate": 0.00012224105524537176,
      "loss": 0.7427,
      "step": 2140
    },
    {
      "epoch": 2.4390243902439024,
      "grad_norm": 0.868011474609375,
      "learning_rate": 0.00012146768435331797,
      "loss": 0.7841,
      "step": 2150
    },
    {
      "epoch": 2.450368689733409,
      "grad_norm": 0.8300703763961792,
      "learning_rate": 0.00012069296507170307,
      "loss": 0.7113,
      "step": 2160
    },
    {
      "epoch": 2.4617129892229155,
      "grad_norm": 1.0211178064346313,
      "learning_rate": 0.00011991694606080062,
      "loss": 0.7927,
      "step": 2170
    },
    {
      "epoch": 2.473057288712422,
      "grad_norm": 1.1126124858856201,
      "learning_rate": 0.00011913967606252035,
      "loss": 0.798,
      "step": 2180
    },
    {
      "epoch": 2.4844015882019286,
      "grad_norm": 1.331468939781189,
      "learning_rate": 0.00011836120389734677,
      "loss": 0.7868,
      "step": 2190
    },
    {
      "epoch": 2.495745887691435,
      "grad_norm": 0.7289639115333557,
      "learning_rate": 0.00011758157846127278,
      "loss": 0.7501,
      "step": 2200
    },
    {
      "epoch": 2.5070901871809417,
      "grad_norm": 0.6862948536872864,
      "learning_rate": 0.00011680084872272843,
      "loss": 0.8113,
      "step": 2210
    },
    {
      "epoch": 2.518434486670448,
      "grad_norm": 0.6838523745536804,
      "learning_rate": 0.00011601906371950523,
      "loss": 0.7794,
      "step": 2220
    },
    {
      "epoch": 2.5297787861599548,
      "grad_norm": 0.8923412561416626,
      "learning_rate": 0.00011523627255567606,
      "loss": 0.7532,
      "step": 2230
    },
    {
      "epoch": 2.541123085649461,
      "grad_norm": 0.7864569425582886,
      "learning_rate": 0.00011445252439851092,
      "loss": 0.8044,
      "step": 2240
    },
    {
      "epoch": 2.552467385138968,
      "grad_norm": 0.9186776280403137,
      "learning_rate": 0.0001136678684753889,
      "loss": 0.7861,
      "step": 2250
    },
    {
      "epoch": 2.563811684628474,
      "grad_norm": 0.9502933025360107,
      "learning_rate": 0.00011288235407070588,
      "loss": 0.7441,
      "step": 2260
    },
    {
      "epoch": 2.5751559841179805,
      "grad_norm": 0.9764688014984131,
      "learning_rate": 0.00011209603052277924,
      "loss": 0.7519,
      "step": 2270
    },
    {
      "epoch": 2.5865002836074873,
      "grad_norm": 0.8480959534645081,
      "learning_rate": 0.00011130894722074874,
      "loss": 0.7743,
      "step": 2280
    },
    {
      "epoch": 2.5978445830969936,
      "grad_norm": 0.8660979866981506,
      "learning_rate": 0.00011052115360147448,
      "loss": 0.7989,
      "step": 2290
    },
    {
      "epoch": 2.6091888825865004,
      "grad_norm": 0.6586043238639832,
      "learning_rate": 0.0001097326991464318,
      "loss": 0.7676,
      "step": 2300
    },
    {
      "epoch": 2.6205331820760067,
      "grad_norm": 0.7315343618392944,
      "learning_rate": 0.00010894363337860314,
      "loss": 0.7699,
      "step": 2310
    },
    {
      "epoch": 2.6318774815655135,
      "grad_norm": 0.7257770895957947,
      "learning_rate": 0.0001081540058593677,
      "loss": 0.7773,
      "step": 2320
    },
    {
      "epoch": 2.64322178105502,
      "grad_norm": 0.6760928630828857,
      "learning_rate": 0.00010736386618538838,
      "loss": 0.7902,
      "step": 2330
    },
    {
      "epoch": 2.6545660805445266,
      "grad_norm": 0.6824659705162048,
      "learning_rate": 0.00010657326398549661,
      "loss": 0.7759,
      "step": 2340
    },
    {
      "epoch": 2.665910380034033,
      "grad_norm": 0.972321629524231,
      "learning_rate": 0.0001057822489175752,
      "loss": 0.7926,
      "step": 2350
    },
    {
      "epoch": 2.6772546795235392,
      "grad_norm": 0.9526649713516235,
      "learning_rate": 0.00010499087066543922,
      "loss": 0.7648,
      "step": 2360
    },
    {
      "epoch": 2.688598979013046,
      "grad_norm": 0.7266947031021118,
      "learning_rate": 0.0001041991789357155,
      "loss": 0.776,
      "step": 2370
    },
    {
      "epoch": 2.6999432785025523,
      "grad_norm": 0.808121383190155,
      "learning_rate": 0.00010340722345472037,
      "loss": 0.7852,
      "step": 2380
    },
    {
      "epoch": 2.711287577992059,
      "grad_norm": 1.1124972105026245,
      "learning_rate": 0.00010261505396533648,
      "loss": 0.717,
      "step": 2390
    },
    {
      "epoch": 2.7226318774815654,
      "grad_norm": 0.7241740226745605,
      "learning_rate": 0.00010182272022388841,
      "loss": 0.8335,
      "step": 2400
    },
    {
      "epoch": 2.733976176971072,
      "grad_norm": 1.0944820642471313,
      "learning_rate": 0.0001010302719970174,
      "loss": 0.7874,
      "step": 2410
    },
    {
      "epoch": 2.7453204764605785,
      "grad_norm": 0.735615611076355,
      "learning_rate": 0.00010023775905855559,
      "loss": 0.7198,
      "step": 2420
    },
    {
      "epoch": 2.7566647759500853,
      "grad_norm": 0.8080368041992188,
      "learning_rate": 9.944523118639958e-05,
      "loss": 0.8275,
      "step": 2430
    },
    {
      "epoch": 2.7680090754395916,
      "grad_norm": 1.0709086656570435,
      "learning_rate": 9.865273815938403e-05,
      "loss": 0.841,
      "step": 2440
    },
    {
      "epoch": 2.779353374929098,
      "grad_norm": 0.8561082482337952,
      "learning_rate": 9.786032975415503e-05,
      "loss": 0.7393,
      "step": 2450
    },
    {
      "epoch": 2.7906976744186047,
      "grad_norm": 0.6831649541854858,
      "learning_rate": 9.706805574204341e-05,
      "loss": 0.7904,
      "step": 2460
    },
    {
      "epoch": 2.802041973908111,
      "grad_norm": 0.9404779672622681,
      "learning_rate": 9.627596588593884e-05,
      "loss": 0.7651,
      "step": 2470
    },
    {
      "epoch": 2.813386273397618,
      "grad_norm": 1.1059134006500244,
      "learning_rate": 9.54841099371641e-05,
      "loss": 0.7792,
      "step": 2480
    },
    {
      "epoch": 2.824730572887124,
      "grad_norm": 0.8339388966560364,
      "learning_rate": 9.469253763235015e-05,
      "loss": 0.8037,
      "step": 2490
    },
    {
      "epoch": 2.8360748723766305,
      "grad_norm": 0.691879153251648,
      "learning_rate": 9.390129869031232e-05,
      "loss": 0.7882,
      "step": 2500
    },
    {
      "epoch": 2.8474191718661372,
      "grad_norm": 0.8173119425773621,
      "learning_rate": 9.311044280892728e-05,
      "loss": 0.7723,
      "step": 2510
    },
    {
      "epoch": 2.858763471355644,
      "grad_norm": 1.2163662910461426,
      "learning_rate": 9.232001966201159e-05,
      "loss": 0.8332,
      "step": 2520
    },
    {
      "epoch": 2.8701077708451503,
      "grad_norm": 0.7762579917907715,
      "learning_rate": 9.153007889620169e-05,
      "loss": 0.8017,
      "step": 2530
    },
    {
      "epoch": 2.8814520703346567,
      "grad_norm": 0.7560020089149475,
      "learning_rate": 9.074067012783551e-05,
      "loss": 0.7645,
      "step": 2540
    },
    {
      "epoch": 2.8927963698241634,
      "grad_norm": 0.7039526104927063,
      "learning_rate": 8.995184293983627e-05,
      "loss": 0.7496,
      "step": 2550
    },
    {
      "epoch": 2.9041406693136698,
      "grad_norm": 0.8188515305519104,
      "learning_rate": 8.916364687859782e-05,
      "loss": 0.7941,
      "step": 2560
    },
    {
      "epoch": 2.9154849688031765,
      "grad_norm": 0.8847174048423767,
      "learning_rate": 8.837613145087289e-05,
      "loss": 0.7462,
      "step": 2570
    },
    {
      "epoch": 2.926829268292683,
      "grad_norm": 1.4302834272384644,
      "learning_rate": 8.758934612066353e-05,
      "loss": 0.7659,
      "step": 2580
    },
    {
      "epoch": 2.938173567782189,
      "grad_norm": 0.8293200135231018,
      "learning_rate": 8.680334030611414e-05,
      "loss": 0.7464,
      "step": 2590
    },
    {
      "epoch": 2.949517867271696,
      "grad_norm": 0.9347418546676636,
      "learning_rate": 8.601816337640767e-05,
      "loss": 0.7907,
      "step": 2600
    },
    {
      "epoch": 2.9608621667612027,
      "grad_norm": 0.8685625195503235,
      "learning_rate": 8.523386464866452e-05,
      "loss": 0.7881,
      "step": 2610
    },
    {
      "epoch": 2.972206466250709,
      "grad_norm": 1.0375618934631348,
      "learning_rate": 8.44504933848452e-05,
      "loss": 0.7415,
      "step": 2620
    },
    {
      "epoch": 2.9835507657402154,
      "grad_norm": 1.1286613941192627,
      "learning_rate": 8.366809878865594e-05,
      "loss": 0.759,
      "step": 2630
    },
    {
      "epoch": 2.994895065229722,
      "grad_norm": 0.9496249556541443,
      "learning_rate": 8.28867300024582e-05,
      "loss": 0.8122,
      "step": 2640
    },
    {
      "epoch": 3.0062393647192285,
      "grad_norm": 0.6161667108535767,
      "learning_rate": 8.210643610418232e-05,
      "loss": 0.7363,
      "step": 2650
    },
    {
      "epoch": 3.0175836642087353,
      "grad_norm": 1.1362223625183105,
      "learning_rate": 8.132726610424453e-05,
      "loss": 0.6957,
      "step": 2660
    },
    {
      "epoch": 3.0289279636982416,
      "grad_norm": 0.9549693465232849,
      "learning_rate": 8.054926894246887e-05,
      "loss": 0.6598,
      "step": 2670
    },
    {
      "epoch": 3.0402722631877483,
      "grad_norm": 0.7844473719596863,
      "learning_rate": 7.977249348501314e-05,
      "loss": 0.7104,
      "step": 2680
    },
    {
      "epoch": 3.0516165626772547,
      "grad_norm": 0.9754497408866882,
      "learning_rate": 7.899698852129962e-05,
      "loss": 0.7109,
      "step": 2690
    },
    {
      "epoch": 3.062960862166761,
      "grad_norm": 0.8465747237205505,
      "learning_rate": 7.822280276095073e-05,
      "loss": 0.6208,
      "step": 2700
    },
    {
      "epoch": 3.0743051616562678,
      "grad_norm": 0.7896714806556702,
      "learning_rate": 7.744998483072936e-05,
      "loss": 0.6417,
      "step": 2710
    },
    {
      "epoch": 3.085649461145774,
      "grad_norm": 0.8668105006217957,
      "learning_rate": 7.667858327148475e-05,
      "loss": 0.6525,
      "step": 2720
    },
    {
      "epoch": 3.096993760635281,
      "grad_norm": 1.0019567012786865,
      "learning_rate": 7.590864653510359e-05,
      "loss": 0.6604,
      "step": 2730
    },
    {
      "epoch": 3.108338060124787,
      "grad_norm": 0.7561362981796265,
      "learning_rate": 7.514022298146679e-05,
      "loss": 0.6912,
      "step": 2740
    },
    {
      "epoch": 3.119682359614294,
      "grad_norm": 0.9435575604438782,
      "learning_rate": 7.437336087541187e-05,
      "loss": 0.6993,
      "step": 2750
    },
    {
      "epoch": 3.1310266591038003,
      "grad_norm": 1.041034460067749,
      "learning_rate": 7.360810838370161e-05,
      "loss": 0.6562,
      "step": 2760
    },
    {
      "epoch": 3.142370958593307,
      "grad_norm": 0.8745769262313843,
      "learning_rate": 7.284451357199851e-05,
      "loss": 0.6035,
      "step": 2770
    },
    {
      "epoch": 3.1537152580828134,
      "grad_norm": 0.9436658620834351,
      "learning_rate": 7.208262440184584e-05,
      "loss": 0.6591,
      "step": 2780
    },
    {
      "epoch": 3.1650595575723197,
      "grad_norm": 0.9558268785476685,
      "learning_rate": 7.13224887276553e-05,
      "loss": 0.7548,
      "step": 2790
    },
    {
      "epoch": 3.1764038570618265,
      "grad_norm": 1.3072495460510254,
      "learning_rate": 7.056415429370106e-05,
      "loss": 0.648,
      "step": 2800
    },
    {
      "epoch": 3.187748156551333,
      "grad_norm": 1.0742169618606567,
      "learning_rate": 6.980766873112106e-05,
      "loss": 0.6646,
      "step": 2810
    },
    {
      "epoch": 3.1990924560408396,
      "grad_norm": 0.8391577005386353,
      "learning_rate": 6.905307955492523e-05,
      "loss": 0.6844,
      "step": 2820
    },
    {
      "epoch": 3.210436755530346,
      "grad_norm": 0.9172285795211792,
      "learning_rate": 6.83004341610111e-05,
      "loss": 0.6671,
      "step": 2830
    },
    {
      "epoch": 3.2217810550198527,
      "grad_norm": 1.0791727304458618,
      "learning_rate": 6.754977982318693e-05,
      "loss": 0.6619,
      "step": 2840
    },
    {
      "epoch": 3.233125354509359,
      "grad_norm": 0.8881738781929016,
      "learning_rate": 6.68011636902022e-05,
      "loss": 0.678,
      "step": 2850
    },
    {
      "epoch": 3.2444696539988658,
      "grad_norm": 0.8353477120399475,
      "learning_rate": 6.605463278278646e-05,
      "loss": 0.7061,
      "step": 2860
    },
    {
      "epoch": 3.255813953488372,
      "grad_norm": 0.9251864552497864,
      "learning_rate": 6.531023399069574e-05,
      "loss": 0.6658,
      "step": 2870
    },
    {
      "epoch": 3.2671582529778784,
      "grad_norm": 0.7780378460884094,
      "learning_rate": 6.45680140697675e-05,
      "loss": 0.6327,
      "step": 2880
    },
    {
      "epoch": 3.278502552467385,
      "grad_norm": 1.3496202230453491,
      "learning_rate": 6.38280196389839e-05,
      "loss": 0.6658,
      "step": 2890
    },
    {
      "epoch": 3.2898468519568915,
      "grad_norm": 1.0429950952529907,
      "learning_rate": 6.309029717754362e-05,
      "loss": 0.7013,
      "step": 2900
    },
    {
      "epoch": 3.3011911514463983,
      "grad_norm": 0.7141017317771912,
      "learning_rate": 6.235489302194247e-05,
      "loss": 0.6969,
      "step": 2910
    },
    {
      "epoch": 3.3125354509359046,
      "grad_norm": 1.2669309377670288,
      "learning_rate": 6.162185336306294e-05,
      "loss": 0.6468,
      "step": 2920
    },
    {
      "epoch": 3.3238797504254114,
      "grad_norm": 0.8476207852363586,
      "learning_rate": 6.089122424327307e-05,
      "loss": 0.6501,
      "step": 2930
    },
    {
      "epoch": 3.3352240499149177,
      "grad_norm": 0.9521162509918213,
      "learning_rate": 6.01630515535345e-05,
      "loss": 0.6546,
      "step": 2940
    },
    {
      "epoch": 3.346568349404424,
      "grad_norm": 0.7817677855491638,
      "learning_rate": 5.943738103051997e-05,
      "loss": 0.6919,
      "step": 2950
    },
    {
      "epoch": 3.357912648893931,
      "grad_norm": 0.776945948600769,
      "learning_rate": 5.8714258253740564e-05,
      "loss": 0.6897,
      "step": 2960
    },
    {
      "epoch": 3.369256948383437,
      "grad_norm": 0.9761963486671448,
      "learning_rate": 5.7993728642683e-05,
      "loss": 0.6299,
      "step": 2970
    },
    {
      "epoch": 3.380601247872944,
      "grad_norm": 0.7887254953384399,
      "learning_rate": 5.7275837453956614e-05,
      "loss": 0.6773,
      "step": 2980
    },
    {
      "epoch": 3.3919455473624502,
      "grad_norm": 0.860835611820221,
      "learning_rate": 5.656062977845116e-05,
      "loss": 0.6239,
      "step": 2990
    },
    {
      "epoch": 3.403289846851957,
      "grad_norm": 0.9700385928153992,
      "learning_rate": 5.584815053850407e-05,
      "loss": 0.7148,
      "step": 3000
    },
    {
      "epoch": 3.403289846851957,
      "eval_loss": 0.9692808389663696,
      "eval_runtime": 15.7325,
      "eval_samples_per_second": 94.39,
      "eval_steps_per_second": 11.823,
      "step": 3000
    },
    {
      "epoch": 3.4146341463414633,
      "grad_norm": 1.335462212562561,
      "learning_rate": 5.51384444850794e-05,
      "loss": 0.6387,
      "step": 3010
    },
    {
      "epoch": 3.42597844583097,
      "grad_norm": 0.8788994550704956,
      "learning_rate": 5.443155619495679e-05,
      "loss": 0.6809,
      "step": 3020
    },
    {
      "epoch": 3.4373227453204764,
      "grad_norm": 0.9188012480735779,
      "learning_rate": 5.372753006793143e-05,
      "loss": 0.6724,
      "step": 3030
    },
    {
      "epoch": 3.4486670448099828,
      "grad_norm": 0.9619457125663757,
      "learning_rate": 5.302641032402578e-05,
      "loss": 0.6789,
      "step": 3040
    },
    {
      "epoch": 3.4600113442994895,
      "grad_norm": 0.9403857588768005,
      "learning_rate": 5.2328241000711464e-05,
      "loss": 0.6274,
      "step": 3050
    },
    {
      "epoch": 3.471355643788996,
      "grad_norm": 0.9259539246559143,
      "learning_rate": 5.16330659501438e-05,
      "loss": 0.6551,
      "step": 3060
    },
    {
      "epoch": 3.4826999432785026,
      "grad_norm": 1.07770574092865,
      "learning_rate": 5.094092883640718e-05,
      "loss": 0.6593,
      "step": 3070
    },
    {
      "epoch": 3.494044242768009,
      "grad_norm": 0.7347473502159119,
      "learning_rate": 5.0251873132772576e-05,
      "loss": 0.6847,
      "step": 3080
    },
    {
      "epoch": 3.5053885422575157,
      "grad_norm": 0.9838495254516602,
      "learning_rate": 4.956594211896701e-05,
      "loss": 0.6667,
      "step": 3090
    },
    {
      "epoch": 3.516732841747022,
      "grad_norm": 1.1671929359436035,
      "learning_rate": 4.8883178878454996e-05,
      "loss": 0.683,
      "step": 3100
    },
    {
      "epoch": 3.528077141236529,
      "grad_norm": 0.6510323882102966,
      "learning_rate": 4.8203626295732675e-05,
      "loss": 0.6946,
      "step": 3110
    },
    {
      "epoch": 3.539421440726035,
      "grad_norm": 0.7871556282043457,
      "learning_rate": 4.7527327053634094e-05,
      "loss": 0.6652,
      "step": 3120
    },
    {
      "epoch": 3.5507657402155415,
      "grad_norm": 0.8053673505783081,
      "learning_rate": 4.685432363065036e-05,
      "loss": 0.6431,
      "step": 3130
    },
    {
      "epoch": 3.5621100397050482,
      "grad_norm": 0.8162011504173279,
      "learning_rate": 4.618465829826145e-05,
      "loss": 0.6089,
      "step": 3140
    },
    {
      "epoch": 3.5734543391945546,
      "grad_norm": 1.0298821926116943,
      "learning_rate": 4.551837311828131e-05,
      "loss": 0.6645,
      "step": 3150
    },
    {
      "epoch": 3.5847986386840613,
      "grad_norm": 1.0996955633163452,
      "learning_rate": 4.485550994021567e-05,
      "loss": 0.6872,
      "step": 3160
    },
    {
      "epoch": 3.5961429381735677,
      "grad_norm": 0.9979953765869141,
      "learning_rate": 4.419611039863377e-05,
      "loss": 0.628,
      "step": 3170
    },
    {
      "epoch": 3.6074872376630744,
      "grad_norm": 1.0593342781066895,
      "learning_rate": 4.354021591055311e-05,
      "loss": 0.6864,
      "step": 3180
    },
    {
      "epoch": 3.6188315371525808,
      "grad_norm": 1.6677913665771484,
      "learning_rate": 4.2887867672838056e-05,
      "loss": 0.6232,
      "step": 3190
    },
    {
      "epoch": 3.6301758366420875,
      "grad_norm": 0.8164204359054565,
      "learning_rate": 4.223910665961235e-05,
      "loss": 0.6786,
      "step": 3200
    },
    {
      "epoch": 3.641520136131594,
      "grad_norm": 0.8163765072822571,
      "learning_rate": 4.15939736196853e-05,
      "loss": 0.6763,
      "step": 3210
    },
    {
      "epoch": 3.6528644356211,
      "grad_norm": 0.9765521883964539,
      "learning_rate": 4.095250907399262e-05,
      "loss": 0.6719,
      "step": 3220
    },
    {
      "epoch": 3.664208735110607,
      "grad_norm": 0.9238688349723816,
      "learning_rate": 4.03147533130511e-05,
      "loss": 0.68,
      "step": 3230
    },
    {
      "epoch": 3.6755530346001133,
      "grad_norm": 0.9760640859603882,
      "learning_rate": 3.968074639442805e-05,
      "loss": 0.6542,
      "step": 3240
    },
    {
      "epoch": 3.68689733408962,
      "grad_norm": 0.9406284689903259,
      "learning_rate": 3.905052814022523e-05,
      "loss": 0.653,
      "step": 3250
    },
    {
      "epoch": 3.6982416335791264,
      "grad_norm": 0.9423522353172302,
      "learning_rate": 3.842413813457758e-05,
      "loss": 0.706,
      "step": 3260
    },
    {
      "epoch": 3.709585933068633,
      "grad_norm": 0.8088165521621704,
      "learning_rate": 3.780161572116704e-05,
      "loss": 0.7161,
      "step": 3270
    },
    {
      "epoch": 3.7209302325581395,
      "grad_norm": 0.9071544408798218,
      "learning_rate": 3.718300000075129e-05,
      "loss": 0.7193,
      "step": 3280
    },
    {
      "epoch": 3.7322745320476463,
      "grad_norm": 0.8792480230331421,
      "learning_rate": 3.6568329828707836e-05,
      "loss": 0.6381,
      "step": 3290
    },
    {
      "epoch": 3.7436188315371526,
      "grad_norm": 1.0307759046554565,
      "learning_rate": 3.5957643812593543e-05,
      "loss": 0.6668,
      "step": 3300
    },
    {
      "epoch": 3.754963131026659,
      "grad_norm": 1.0883175134658813,
      "learning_rate": 3.5350980309719514e-05,
      "loss": 0.6978,
      "step": 3310
    },
    {
      "epoch": 3.7663074305161657,
      "grad_norm": 1.0448516607284546,
      "learning_rate": 3.4748377424742115e-05,
      "loss": 0.6756,
      "step": 3320
    },
    {
      "epoch": 3.777651730005672,
      "grad_norm": 0.8772532939910889,
      "learning_rate": 3.414987300726945e-05,
      "loss": 0.6714,
      "step": 3330
    },
    {
      "epoch": 3.7889960294951788,
      "grad_norm": 1.0115753412246704,
      "learning_rate": 3.3555504649484046e-05,
      "loss": 0.6773,
      "step": 3340
    },
    {
      "epoch": 3.800340328984685,
      "grad_norm": 1.1093175411224365,
      "learning_rate": 3.296530968378173e-05,
      "loss": 0.6916,
      "step": 3350
| }, | |
| { | |
| "epoch": 3.811684628474192, | |
| "grad_norm": 0.8998281359672546, | |
| "learning_rate": 3.237932518042664e-05, | |
| "loss": 0.6801, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 3.823028927963698, | |
| "grad_norm": 1.0179048776626587, | |
| "learning_rate": 3.1797587945223026e-05, | |
| "loss": 0.6702, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 3.834373227453205, | |
| "grad_norm": 0.9240026473999023, | |
| "learning_rate": 3.1220134517203335e-05, | |
| "loss": 0.671, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 3.8457175269427113, | |
| "grad_norm": 0.7641962766647339, | |
| "learning_rate": 3.0647001166333245e-05, | |
| "loss": 0.7147, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 3.8570618264322176, | |
| "grad_norm": 0.9078419804573059, | |
| "learning_rate": 3.0078223891233514e-05, | |
| "loss": 0.7155, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 3.8684061259217244, | |
| "grad_norm": 0.962393045425415, | |
| "learning_rate": 2.9513838416918815e-05, | |
| "loss": 0.6866, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 3.8797504254112307, | |
| "grad_norm": 1.5198420286178589, | |
| "learning_rate": 2.8953880192554105e-05, | |
| "loss": 0.6741, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 3.8910947249007375, | |
| "grad_norm": 1.1129947900772095, | |
| "learning_rate": 2.8398384389227816e-05, | |
| "loss": 0.6542, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 3.902439024390244, | |
| "grad_norm": 0.8633179664611816, | |
| "learning_rate": 2.7847385897742705e-05, | |
| "loss": 0.6768, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 3.9137833238797506, | |
| "grad_norm": 1.062277913093567, | |
| "learning_rate": 2.7300919326424658e-05, | |
| "loss": 0.6709, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 3.925127623369257, | |
| "grad_norm": 0.7949813604354858, | |
| "learning_rate": 2.675901899894854e-05, | |
| "loss": 0.6166, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 3.9364719228587637, | |
| "grad_norm": 0.9200356006622314, | |
| "learning_rate": 2.622171895218273e-05, | |
| "loss": 0.6718, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 3.94781622234827, | |
| "grad_norm": 0.9637920260429382, | |
| "learning_rate": 2.568905293405095e-05, | |
| "loss": 0.619, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 3.9591605218377763, | |
| "grad_norm": 1.157073974609375, | |
| "learning_rate": 2.516105440141262e-05, | |
| "loss": 0.6961, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 3.970504821327283, | |
| "grad_norm": 0.8323079347610474, | |
| "learning_rate": 2.4637756517961517e-05, | |
| "loss": 0.677, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 3.9818491208167894, | |
| "grad_norm": 0.9369989037513733, | |
| "learning_rate": 2.41191921521427e-05, | |
| "loss": 0.6619, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 3.993193420306296, | |
| "grad_norm": 0.8290889263153076, | |
| "learning_rate": 2.360539387508801e-05, | |
| "loss": 0.6534, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 4.0045377197958025, | |
| "grad_norm": 0.8619610071182251, | |
| "learning_rate": 2.309639395857033e-05, | |
| "loss": 0.6531, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 4.015882019285309, | |
| "grad_norm": 0.7406215071678162, | |
| "learning_rate": 2.259222437297649e-05, | |
| "loss": 0.5811, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 4.027226318774816, | |
| "grad_norm": 1.3408113718032837, | |
| "learning_rate": 2.2092916785299323e-05, | |
| "loss": 0.6163, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 4.038570618264322, | |
| "grad_norm": 0.9652060866355896, | |
| "learning_rate": 2.159850255714859e-05, | |
| "loss": 0.6345, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 4.049914917753829, | |
| "grad_norm": 1.2307026386260986, | |
| "learning_rate": 2.1109012742781142e-05, | |
| "loss": 0.5568, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 4.061259217243335, | |
| "grad_norm": 1.101637363433838, | |
| "learning_rate": 2.0624478087150456e-05, | |
| "loss": 0.608, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 4.072603516732841, | |
| "grad_norm": 2.5598561763763428, | |
| "learning_rate": 2.0144929023975413e-05, | |
| "loss": 0.5294, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 4.083947816222349, | |
| "grad_norm": 0.9463273286819458, | |
| "learning_rate": 1.967039567382888e-05, | |
| "loss": 0.5482, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 4.095292115711855, | |
| "grad_norm": 0.9838125109672546, | |
| "learning_rate": 1.920090784224581e-05, | |
| "loss": 0.6254, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 4.106636415201361, | |
| "grad_norm": 0.85828697681427, | |
| "learning_rate": 1.8736495017851062e-05, | |
| "loss": 0.5443, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 4.117980714690868, | |
| "grad_norm": 0.8922297954559326, | |
| "learning_rate": 1.827718637050736e-05, | |
| "loss": 0.6068, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 4.129325014180375, | |
| "grad_norm": 0.7973962426185608, | |
| "learning_rate": 1.7823010749482927e-05, | |
| "loss": 0.6179, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 4.140669313669881, | |
| "grad_norm": 0.8686882257461548, | |
| "learning_rate": 1.737399668163966e-05, | |
| "loss": 0.6186, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 4.152013613159387, | |
| "grad_norm": 1.4338245391845703, | |
| "learning_rate": 1.693017236964125e-05, | |
| "loss": 0.5784, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 4.163357912648894, | |
| "grad_norm": 0.9958694577217102, | |
| "learning_rate": 1.6491565690181765e-05, | |
| "loss": 0.6388, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 4.1747022121384, | |
| "grad_norm": 0.9962863922119141, | |
| "learning_rate": 1.605820419223476e-05, | |
| "loss": 0.6541, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 4.186046511627907, | |
| "grad_norm": 1.1754194498062134, | |
| "learning_rate": 1.5630115095322827e-05, | |
| "loss": 0.6037, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 4.197390811117414, | |
| "grad_norm": 1.1034218072891235, | |
| "learning_rate": 1.5207325287808027e-05, | |
| "loss": 0.5844, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 4.20873511060692, | |
| "grad_norm": 1.0171332359313965, | |
| "learning_rate": 1.4789861325203013e-05, | |
| "loss": 0.6724, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 4.220079410096426, | |
| "grad_norm": 0.9791539907455444, | |
| "learning_rate": 1.4377749428503006e-05, | |
| "loss": 0.5989, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 4.231423709585933, | |
| "grad_norm": 0.9501050710678101, | |
| "learning_rate": 1.3971015482538963e-05, | |
| "loss": 0.5911, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 4.24276800907544, | |
| "grad_norm": 1.2614890336990356, | |
| "learning_rate": 1.3569685034351554e-05, | |
| "loss": 0.5849, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 4.254112308564946, | |
| "grad_norm": 1.0194411277770996, | |
| "learning_rate": 1.3173783291586772e-05, | |
| "loss": 0.5976, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 4.2654566080544525, | |
| "grad_norm": 1.0711522102355957, | |
| "learning_rate": 1.2783335120912565e-05, | |
| "loss": 0.5931, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 4.276800907543959, | |
| "grad_norm": 0.8650385141372681, | |
| "learning_rate": 1.2398365046456783e-05, | |
| "loss": 0.6078, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 4.288145207033466, | |
| "grad_norm": 0.823208749294281, | |
| "learning_rate": 1.2018897248267103e-05, | |
| "loss": 0.5961, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 4.299489506522972, | |
| "grad_norm": 0.9447870850563049, | |
| "learning_rate": 1.1644955560791993e-05, | |
| "loss": 0.6468, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 4.310833806012479, | |
| "grad_norm": 1.102318525314331, | |
| "learning_rate": 1.1276563471383883e-05, | |
| "loss": 0.588, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 4.322178105501985, | |
| "grad_norm": 0.9916651248931885, | |
| "learning_rate": 1.0913744118823866e-05, | |
| "loss": 0.6188, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 4.333522404991491, | |
| "grad_norm": 1.1987171173095703, | |
| "learning_rate": 1.05565202918682e-05, | |
| "loss": 0.5841, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 4.3448667044809985, | |
| "grad_norm": 0.9708378911018372, | |
| "learning_rate": 1.0204914427817158e-05, | |
| "loss": 0.6023, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 4.356211003970505, | |
| "grad_norm": 1.0048896074295044, | |
| "learning_rate": 9.8589486111056e-06, | |
| "loss": 0.5705, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 4.367555303460011, | |
| "grad_norm": 0.8364105820655823, | |
| "learning_rate": 9.518644571915847e-06, | |
| "loss": 0.5872, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 4.3788996029495175, | |
| "grad_norm": 1.5254448652267456, | |
| "learning_rate": 9.184023684812926e-06, | |
| "loss": 0.6063, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 4.390243902439025, | |
| "grad_norm": 0.993635356426239, | |
| "learning_rate": 8.855106967401839e-06, | |
| "loss": 0.5311, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 4.401588201928531, | |
| "grad_norm": 0.8678284883499146, | |
| "learning_rate": 8.531915079007625e-06, | |
| "loss": 0.5894, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 4.412932501418037, | |
| "grad_norm": 1.081127643585205, | |
| "learning_rate": 8.214468319377633e-06, | |
| "loss": 0.5906, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 4.424276800907544, | |
| "grad_norm": 0.9130728840827942, | |
| "learning_rate": 7.902786627406477e-06, | |
| "loss": 0.5764, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 4.43562110039705, | |
| "grad_norm": 0.9263814091682434, | |
| "learning_rate": 7.596889579883826e-06, | |
| "loss": 0.5812, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 4.446965399886557, | |
| "grad_norm": 1.095747947692871, | |
| "learning_rate": 7.296796390264549e-06, | |
| "loss": 0.5721, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 4.458309699376064, | |
| "grad_norm": 0.8003553152084351, | |
| "learning_rate": 7.002525907462121e-06, | |
| "loss": 0.5882, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 4.46965399886557, | |
| "grad_norm": 0.8841357231140137, | |
| "learning_rate": 6.7140966146646e-06, | |
| "loss": 0.5543, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 4.480998298355076, | |
| "grad_norm": 0.8580918312072754, | |
| "learning_rate": 6.431526628173701e-06, | |
| "loss": 0.6549, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 4.4923425978445835, | |
| "grad_norm": 0.9447335004806519, | |
| "learning_rate": 6.154833696267015e-06, | |
| "loss": 0.6516, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 4.50368689733409, | |
| "grad_norm": 1.0485211610794067, | |
| "learning_rate": 5.884035198083071e-06, | |
| "loss": 0.579, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 4.515031196823596, | |
| "grad_norm": 0.9394044876098633, | |
| "learning_rate": 5.619148142529873e-06, | |
| "loss": 0.6396, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 4.526375496313102, | |
| "grad_norm": 0.93062824010849, | |
| "learning_rate": 5.360189167216545e-06, | |
| "loss": 0.6005, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 4.537719795802609, | |
| "grad_norm": 0.9513915777206421, | |
| "learning_rate": 5.107174537408233e-06, | |
| "loss": 0.5743, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 4.537719795802609, | |
| "eval_loss": 1.0443100929260254, | |
| "eval_runtime": 15.6805, | |
| "eval_samples_per_second": 94.704, | |
| "eval_steps_per_second": 11.862, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 4.549064095292116, | |
| "grad_norm": 0.9627020359039307, | |
| "learning_rate": 4.8601201450046316e-06, | |
| "loss": 0.6077, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 4.560408394781622, | |
| "grad_norm": 0.8539467453956604, | |
| "learning_rate": 4.619041507541688e-06, | |
| "loss": 0.5812, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 4.571752694271129, | |
| "grad_norm": 0.9446848630905151, | |
| "learning_rate": 4.383953767216964e-06, | |
| "loss": 0.624, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 4.583096993760635, | |
| "grad_norm": 1.188366174697876, | |
| "learning_rate": 4.154871689938633e-06, | |
| "loss": 0.6437, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 4.594441293250142, | |
| "grad_norm": 1.0908474922180176, | |
| "learning_rate": 3.931809664397867e-06, | |
| "loss": 0.6323, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 4.6057855927396485, | |
| "grad_norm": 0.9742168188095093, | |
| "learning_rate": 3.714781701165304e-06, | |
| "loss": 0.6132, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 4.617129892229155, | |
| "grad_norm": 0.8761405348777771, | |
| "learning_rate": 3.503801431810816e-06, | |
| "loss": 0.624, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 4.628474191718661, | |
| "grad_norm": 0.996088445186615, | |
| "learning_rate": 3.298882108047463e-06, | |
| "loss": 0.6009, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 4.6398184912081675, | |
| "grad_norm": 0.9667827486991882, | |
| "learning_rate": 3.10003660089907e-06, | |
| "loss": 0.5988, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 4.651162790697675, | |
| "grad_norm": 0.9298661351203918, | |
| "learning_rate": 2.9072773998918503e-06, | |
| "loss": 0.6453, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 4.662507090187181, | |
| "grad_norm": 0.9182038307189941, | |
| "learning_rate": 2.7206166122698774e-06, | |
| "loss": 0.5915, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 4.673851389676687, | |
| "grad_norm": 0.835645318031311, | |
| "learning_rate": 2.540065962234683e-06, | |
| "loss": 0.6515, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 4.685195689166194, | |
| "grad_norm": 0.8575255274772644, | |
| "learning_rate": 2.3656367902088026e-06, | |
| "loss": 0.6169, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 4.696539988655701, | |
| "grad_norm": 0.9075832962989807, | |
| "learning_rate": 2.19734005212352e-06, | |
| "loss": 0.6166, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 4.707884288145207, | |
| "grad_norm": 2.0740888118743896, | |
| "learning_rate": 2.035186318730742e-06, | |
| "loss": 0.5779, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 4.7192285876347135, | |
| "grad_norm": 1.0293558835983276, | |
| "learning_rate": 1.8791857749389741e-06, | |
| "loss": 0.6414, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 4.73057288712422, | |
| "grad_norm": 0.9525774121284485, | |
| "learning_rate": 1.7293482191736877e-06, | |
| "loss": 0.5802, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 4.741917186613726, | |
| "grad_norm": 0.9085150957107544, | |
| "learning_rate": 1.5856830627618001e-06, | |
| "loss": 0.6331, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 4.753261486103233, | |
| "grad_norm": 0.9908912777900696, | |
| "learning_rate": 1.4481993293406048e-06, | |
| "loss": 0.5844, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 4.76460578559274, | |
| "grad_norm": 0.7421241998672485, | |
| "learning_rate": 1.316905654291012e-06, | |
| "loss": 0.6653, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 4.775950085082246, | |
| "grad_norm": 0.857502281665802, | |
| "learning_rate": 1.1918102841950607e-06, | |
| "loss": 0.5693, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 4.787294384571752, | |
| "grad_norm": 0.9300210475921631, | |
| "learning_rate": 1.0729210763180564e-06, | |
| "loss": 0.5755, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 4.79863868406126, | |
| "grad_norm": 1.2351378202438354, | |
| "learning_rate": 9.602454981149977e-07, | |
| "loss": 0.618, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 4.809982983550766, | |
| "grad_norm": 1.24778151512146, | |
| "learning_rate": 8.537906267615415e-07, | |
| "loss": 0.5896, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 4.821327283040272, | |
| "grad_norm": 1.3560271263122559, | |
| "learning_rate": 7.535631487095352e-07, | |
| "loss": 0.5879, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 4.832671582529779, | |
| "grad_norm": 1.8108911514282227, | |
| "learning_rate": 6.59569359266976e-07, | |
| "loss": 0.5943, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 4.844015882019285, | |
| "grad_norm": 0.9743121862411499, | |
| "learning_rate": 5.718151622026379e-07, | |
| "loss": 0.6104, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 4.855360181508792, | |
| "grad_norm": 1.2035831212997437, | |
| "learning_rate": 4.903060693752348e-07, | |
| "loss": 0.608, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 4.866704480998298, | |
| "grad_norm": 0.9681785106658936, | |
| "learning_rate": 4.1504720038724187e-07, | |
| "loss": 0.5773, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 4.878048780487805, | |
| "grad_norm": 1.0151753425598145, | |
| "learning_rate": 3.4604328226333083e-07, | |
| "loss": 0.5609, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 4.889393079977311, | |
| "grad_norm": 1.0577515363693237, | |
| "learning_rate": 2.832986491534295e-07, | |
| "loss": 0.6435, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 4.900737379466818, | |
| "grad_norm": 0.8938112854957581, | |
| "learning_rate": 2.2681724206052857e-07, | |
| "loss": 0.6398, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 4.912081678956325, | |
| "grad_norm": 0.997191846370697, | |
| "learning_rate": 1.7660260859315713e-07, | |
| "loss": 0.628, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 4.923425978445831, | |
| "grad_norm": 0.8382704257965088, | |
| "learning_rate": 1.3265790274249456e-07, | |
| "loss": 0.6105, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 4.934770277935337, | |
| "grad_norm": 0.8330470323562622, | |
| "learning_rate": 9.498588468433989e-08, | |
| "loss": 0.5982, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 4.946114577424844, | |
| "grad_norm": 1.2183622121810913, | |
| "learning_rate": 6.35889206057172e-08, | |
| "loss": 0.5876, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 4.957458876914351, | |
| "grad_norm": 1.131373405456543, | |
| "learning_rate": 3.846898255622788e-08, | |
| "loss": 0.6113, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 4.968803176403857, | |
| "grad_norm": 1.1781286001205444, | |
| "learning_rate": 1.9627648324227476e-08, | |
| "loss": 0.5522, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 4.9801474758933635, | |
| "grad_norm": 1.2726503610610962, | |
| "learning_rate": 7.066101337682707e-09, | |
| "loss": 0.6312, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 4.99149177538287, | |
| "grad_norm": 1.1971274614334106, | |
| "learning_rate": 7.85130589897598e-10, | |
| "loss": 0.6052, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 4.997163925127623, | |
| "step": 4405, | |
| "total_flos": 9.40234358432727e+17, | |
| "train_loss": 0.7921485962039632, | |
| "train_runtime": 4193.8899, | |
| "train_samples_per_second": 33.618, | |
| "train_steps_per_second": 1.05 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 4405, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 9.40234358432727e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
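
The trace above is the tail of a `trainer_state.json` written by the Hugging Face `transformers.Trainer`: 4,405 optimizer steps over 5 epochs, training loss logged every 10 steps (`logging_steps: 10`), with a checkpoint every 1,000 steps (`save_steps: 1000`) and evaluations on the same cadence, as the `eval_loss` entries at steps 3000 and 4000 show. One detail worth noting: `eval_loss` climbs from 0.9693 at step 3000 to 1.0443 at step 4000 while the running training loss keeps falling, the usual signature of overfitting in the final epochs. Below is a minimal sketch of how such a state file can be loaded and summarized; the path `trainer_state.json` is an assumption (the `Trainer` saves this file inside each checkpoint directory), the field names match the log entries shown above, and the perplexity line assumes the logged loss is a per-token cross-entropy:

```python
import json
import math

# trainer_state.json is dumped by transformers.Trainer into every
# checkpoint directory; the path here is assumed for illustration.
with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss";
# the final summary entry carries "train_loss" instead of "loss".
train_points = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_points = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(f"train points: {len(train_points)}, eval points: {len(eval_points)}")
print(f"mean train loss over the run: {state['log_history'][-1].get('train_loss')}")

for step, loss in eval_points:
    # exp(loss) is the perplexity, assuming per-token cross-entropy.
    print(f"eval @ step {step}: loss={loss:.4f}, ppl={math.exp(loss):.2f}")
```

Run against this file, the loop prints one line per evaluation, which makes the late-training rise in `eval_loss` easy to spot without plotting anything.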