{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 107,
  "global_step": 426,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "grad_norm": 0.8741932511329651,
      "learning_rate": 1e-05,
      "loss": 1.2567,
      "step": 1
    },
    {
      "epoch": 0.0,
      "eval_loss": 1.3469510078430176,
      "eval_runtime": 5.3003,
      "eval_samples_per_second": 18.867,
      "eval_steps_per_second": 18.867,
      "step": 1
    },
    {
      "epoch": 0.0,
      "grad_norm": 0.7848138809204102,
      "learning_rate": 2e-05,
      "loss": 1.3328,
      "step": 2
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.0692198276519775,
      "learning_rate": 3e-05,
      "loss": 1.6569,
      "step": 3
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.4229260683059692,
      "learning_rate": 4e-05,
      "loss": 1.5493,
      "step": 4
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.6837282180786133,
      "learning_rate": 5e-05,
      "loss": 1.4342,
      "step": 5
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.1194649934768677,
      "learning_rate": 6e-05,
      "loss": 1.2668,
      "step": 6
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.954695463180542,
      "learning_rate": 7e-05,
      "loss": 1.4721,
      "step": 7
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.8204368352890015,
      "learning_rate": 8e-05,
      "loss": 1.4054,
      "step": 8
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.8432589769363403,
      "learning_rate": 9e-05,
      "loss": 1.2849,
      "step": 9
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.7272113561630249,
      "learning_rate": 0.0001,
      "loss": 1.1438,
      "step": 10
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.8135901093482971,
      "learning_rate": 0.00011000000000000002,
      "loss": 1.4665,
      "step": 11
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.8113470673561096,
      "learning_rate": 0.00012,
      "loss": 0.9749,
      "step": 12
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.3105953931808472,
      "learning_rate": 0.00013000000000000002,
      "loss": 1.5133,
      "step": 13
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.0281662940979004,
      "learning_rate": 0.00014,
      "loss": 1.1509,
      "step": 14
    },
    {
      "epoch": 0.04,
      "grad_norm": 2.1005992889404297,
      "learning_rate": 0.00015000000000000001,
      "loss": 1.2748,
      "step": 15
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.453407645225525,
      "learning_rate": 0.00016,
      "loss": 1.3313,
      "step": 16
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.0521585941314697,
      "learning_rate": 0.00017,
      "loss": 1.2977,
      "step": 17
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.0728052854537964,
      "learning_rate": 0.00018,
      "loss": 1.322,
      "step": 18
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.8539354205131531,
      "learning_rate": 0.00019,
      "loss": 1.2053,
      "step": 19
    },
    {
      "epoch": 0.05,
      "grad_norm": 1.6902265548706055,
      "learning_rate": 0.0002,
      "loss": 1.1924,
      "step": 20
    },
    {
      "epoch": 0.05,
      "grad_norm": 1.3770757913589478,
      "learning_rate": 0.00019999928710990412,
      "loss": 1.2286,
      "step": 21
    },
    {
      "epoch": 0.05,
      "grad_norm": 1.358694314956665,
      "learning_rate": 0.00019999714844978078,
      "loss": 1.3196,
      "step": 22
    },
    {
      "epoch": 0.05,
      "grad_norm": 1.4592664241790771,
      "learning_rate": 0.0001999935840501225,
      "loss": 1.5923,
      "step": 23
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.122499942779541,
      "learning_rate": 0.0001999885939617498,
      "loss": 1.3107,
      "step": 24
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.0040143728256226,
      "learning_rate": 0.0001999821782558104,
      "loss": 1.316,
      "step": 25
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.0741703510284424,
      "learning_rate": 0.00019997433702377817,
      "loss": 1.2527,
      "step": 26
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.9978523850440979,
      "learning_rate": 0.00019996507037745183,
      "loss": 1.1989,
      "step": 27
    },
    {
      "epoch": 0.07,
      "grad_norm": 1.002176284790039,
      "learning_rate": 0.00019995437844895334,
      "loss": 1.0503,
      "step": 28
    },
    {
      "epoch": 0.07,
      "grad_norm": 1.0028811693191528,
      "learning_rate": 0.0001999422613907262,
      "loss": 1.356,
      "step": 29
    },
    {
      "epoch": 0.07,
      "grad_norm": 1.3226864337921143,
      "learning_rate": 0.0001999287193755329,
      "loss": 1.2571,
      "step": 30
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.905536949634552,
      "learning_rate": 0.00019991375259645293,
      "loss": 1.1576,
      "step": 31
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.2652262449264526,
      "learning_rate": 0.00019989736126687963,
      "loss": 1.1792,
      "step": 32
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.0161558389663696,
      "learning_rate": 0.00019987954562051725,
      "loss": 1.0074,
      "step": 33
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.9903879761695862,
      "learning_rate": 0.00019986030591137783,
      "loss": 1.196,
      "step": 34
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.3282291889190674,
      "learning_rate": 0.0001998396424137773,
      "loss": 1.5831,
      "step": 35
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.0442458391189575,
      "learning_rate": 0.00019981755542233177,
      "loss": 0.9708,
      "step": 36
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.9892172813415527,
      "learning_rate": 0.0001997940452519531,
      "loss": 1.1873,
      "step": 37
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.6204959154129028,
      "learning_rate": 0.0001997691122378447,
      "loss": 1.725,
      "step": 38
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.9563732147216797,
      "learning_rate": 0.00019974275673549654,
      "loss": 1.513,
      "step": 39
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.225378155708313,
      "learning_rate": 0.00019971497912068013,
      "loss": 1.1298,
      "step": 40
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.8867512941360474,
      "learning_rate": 0.00019968577978944323,
      "loss": 1.2413,
      "step": 41
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.9609911441802979,
      "learning_rate": 0.0001996551591581041,
      "loss": 1.2217,
      "step": 42
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.8636968731880188,
      "learning_rate": 0.0001996231176632456,
      "loss": 1.5268,
      "step": 43
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.9208036065101624,
      "learning_rate": 0.00019958965576170908,
      "loss": 1.1154,
      "step": 44
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.8316241502761841,
      "learning_rate": 0.00019955477393058773,
      "loss": 1.0891,
      "step": 45
    },
    {
      "epoch": 0.11,
      "grad_norm": 1.0851703882217407,
      "learning_rate": 0.0001995184726672197,
      "loss": 1.237,
      "step": 46
    },
    {
      "epoch": 0.11,
      "grad_norm": 1.2032215595245361,
      "learning_rate": 0.00019948075248918124,
      "loss": 1.3444,
      "step": 47
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.8556870818138123,
      "learning_rate": 0.00019944161393427922,
      "loss": 0.9632,
      "step": 48
    },
    {
      "epoch": 0.12,
      "grad_norm": 1.038156270980835,
      "learning_rate": 0.00019940105756054337,
      "loss": 1.3897,
      "step": 49
    },
    {
      "epoch": 0.12,
      "grad_norm": 1.1046299934387207,
      "learning_rate": 0.00019935908394621844,
      "loss": 1.0211,
      "step": 50
    },
    {
      "epoch": 0.12,
      "grad_norm": 1.1632884740829468,
      "learning_rate": 0.00019931569368975588,
      "loss": 1.3098,
      "step": 51
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.9915144443511963,
      "learning_rate": 0.0001992708874098054,
      "loss": 1.1708,
      "step": 52
    },
    {
      "epoch": 0.12,
      "grad_norm": 1.1974948644638062,
      "learning_rate": 0.00019922466574520608,
      "loss": 1.248,
      "step": 53
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.123138189315796,
      "learning_rate": 0.00019917702935497725,
      "loss": 0.8789,
      "step": 54
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.7720706462860107,
      "learning_rate": 0.00019912797891830908,
      "loss": 1.2278,
      "step": 55
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.171324610710144,
      "learning_rate": 0.00019907751513455302,
      "loss": 1.419,
      "step": 56
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.1821049451828003,
      "learning_rate": 0.00019902563872321172,
      "loss": 1.3281,
      "step": 57
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.8871707320213318,
      "learning_rate": 0.00019897235042392873,
      "loss": 1.1431,
      "step": 58
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.9106453657150269,
      "learning_rate": 0.0001989176509964781,
      "loss": 1.0687,
      "step": 59
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.8794414401054382,
      "learning_rate": 0.00019886154122075343,
      "loss": 1.3627,
      "step": 60
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.9249592423439026,
      "learning_rate": 0.00019880402189675678,
      "loss": 0.5962,
      "step": 61
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.8355573415756226,
      "learning_rate": 0.00019874509384458725,
      "loss": 1.2576,
      "step": 62
    },
    {
      "epoch": 0.15,
      "grad_norm": 1.3829615116119385,
      "learning_rate": 0.0001986847579044294,
      "loss": 1.3844,
      "step": 63
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.7890944480895996,
      "learning_rate": 0.00019862301493654108,
      "loss": 0.9504,
      "step": 64
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.9988391995429993,
      "learning_rate": 0.00019855986582124126,
      "loss": 1.1127,
      "step": 65
    },
    {
      "epoch": 0.15,
      "grad_norm": 1.024064302444458,
      "learning_rate": 0.00019849531145889758,
      "loss": 1.0418,
      "step": 66
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.1652387380599976,
      "learning_rate": 0.0001984293527699133,
      "loss": 1.0524,
      "step": 67
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.8396044969558716,
      "learning_rate": 0.00019836199069471437,
      "loss": 1.1829,
      "step": 68
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.9767330884933472,
      "learning_rate": 0.00019829322619373588,
      "loss": 1.1652,
      "step": 69
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.7999225854873657,
      "learning_rate": 0.00019822306024740852,
      "loss": 1.1075,
      "step": 70
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.9757010340690613,
      "learning_rate": 0.00019815149385614444,
      "loss": 1.3074,
      "step": 71
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.9027532339096069,
      "learning_rate": 0.00019807852804032305,
      "loss": 1.308,
      "step": 72
    },
    {
      "epoch": 0.17,
      "grad_norm": 1.5158909559249878,
      "learning_rate": 0.0001980041638402765,
      "loss": 1.1594,
      "step": 73
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.8618407249450684,
      "learning_rate": 0.00019792840231627482,
      "loss": 0.7077,
      "step": 74
    },
    {
      "epoch": 0.18,
      "grad_norm": 1.0307233333587646,
      "learning_rate": 0.00019785124454851084,
      "loss": 1.0812,
      "step": 75
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.8295727968215942,
      "learning_rate": 0.00019777269163708468,
      "loss": 1.0412,
      "step": 76
    },
    {
      "epoch": 0.18,
      "grad_norm": 1.0772101879119873,
      "learning_rate": 0.00019769274470198827,
      "loss": 1.1484,
      "step": 77
    },
    {
      "epoch": 0.18,
      "grad_norm": 1.0130198001861572,
      "learning_rate": 0.0001976114048830891,
      "loss": 1.0704,
      "step": 78
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.6028400659561157,
      "learning_rate": 0.00019752867334011423,
      "loss": 0.6347,
      "step": 79
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.8858622312545776,
      "learning_rate": 0.0001974445512526336,
      "loss": 1.4117,
      "step": 80
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.9046239852905273,
      "learning_rate": 0.00019735903982004324,
      "loss": 1.0355,
      "step": 81
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.987149715423584,
      "learning_rate": 0.00019727214026154827,
      "loss": 1.117,
      "step": 82
    },
    {
      "epoch": 0.19,
      "grad_norm": 1.066720962524414,
      "learning_rate": 0.0001971838538161454,
      "loss": 1.4394,
      "step": 83
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.7880568504333496,
      "learning_rate": 0.0001970941817426052,
      "loss": 1.0591,
      "step": 84
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.8709908723831177,
      "learning_rate": 0.00019700312531945442,
      "loss": 0.8653,
      "step": 85
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.8909516930580139,
      "learning_rate": 0.00019691068584495742,
      "loss": 1.0539,
      "step": 86
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.726899266242981,
      "learning_rate": 0.000196816864637098,
      "loss": 1.0426,
      "step": 87
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.9136834144592285,
      "learning_rate": 0.00019672166303356028,
      "loss": 1.0121,
      "step": 88
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.8233316540718079,
      "learning_rate": 0.0001966250823917099,
      "loss": 1.1974,
      "step": 89
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.8916105628013611,
      "learning_rate": 0.0001965271240885745,
      "loss": 1.2969,
      "step": 90
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.7402650713920593,
      "learning_rate": 0.00019642778952082426,
      "loss": 1.0409,
      "step": 91
    },
    {
      "epoch": 0.22,
      "grad_norm": 1.036402702331543,
      "learning_rate": 0.00019632708010475165,
      "loss": 1.1482,
      "step": 92
    },
    {
      "epoch": 0.22,
      "grad_norm": 1.50504469871521,
      "learning_rate": 0.00019622499727625162,
      "loss": 1.3788,
      "step": 93
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.7590145468711853,
      "learning_rate": 0.0001961215424908009,
      "loss": 1.0139,
      "step": 94
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.8763208389282227,
      "learning_rate": 0.00019601671722343738,
      "loss": 1.066,
      "step": 95
    },
    {
      "epoch": 0.23,
      "grad_norm": 1.0625308752059937,
      "learning_rate": 0.00019591052296873888,
      "loss": 1.1032,
      "step": 96
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.7704355716705322,
      "learning_rate": 0.00019580296124080212,
      "loss": 0.8728,
      "step": 97
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.8164972066879272,
      "learning_rate": 0.0001956940335732209,
      "loss": 0.8879,
      "step": 98
    },
    {
      "epoch": 0.23,
      "grad_norm": 1.114343285560608,
      "learning_rate": 0.0001955837415190643,
      "loss": 1.3132,
      "step": 99
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.8190547227859497,
      "learning_rate": 0.00019547208665085457,
      "loss": 1.324,
      "step": 100
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.0214807987213135,
      "learning_rate": 0.00019535907056054475,
      "loss": 1.0961,
      "step": 101
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.0679683685302734,
      "learning_rate": 0.00019524469485949583,
      "loss": 1.5574,
      "step": 102
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.7177500128746033,
      "learning_rate": 0.00019512896117845392,
      "loss": 1.1553,
      "step": 103
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.2570964097976685,
      "learning_rate": 0.00019501187116752693,
      "loss": 1.2929,
      "step": 104
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.8858779668807983,
      "learning_rate": 0.000194893426496161,
      "loss": 1.2218,
      "step": 105
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.9762778282165527,
      "learning_rate": 0.00019477362885311682,
      "loss": 0.9456,
      "step": 106
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.8851660490036011,
      "learning_rate": 0.00019465247994644545,
      "loss": 1.0032,
      "step": 107
    },
    {
      "epoch": 0.25,
      "eval_loss": 1.1362618207931519,
      "eval_runtime": 5.3254,
      "eval_samples_per_second": 18.778,
      "eval_steps_per_second": 18.778,
      "step": 107
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.8778185844421387,
      "learning_rate": 0.00019452998150346401,
      "loss": 1.1764,
      "step": 108
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.8865760564804077,
      "learning_rate": 0.00019440613527073105,
      "loss": 1.0644,
      "step": 109
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.8576034903526306,
      "learning_rate": 0.00019428094301402162,
      "loss": 1.0943,
      "step": 110
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.810203492641449,
      "learning_rate": 0.00019415440651830208,
      "loss": 1.2353,
      "step": 111
    },
    {
      "epoch": 0.26,
      "grad_norm": 1.1911653280258179,
      "learning_rate": 0.00019402652758770475,
      "loss": 1.215,
      "step": 112
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.5166463851928711,
      "learning_rate": 0.00019389730804550211,
      "loss": 0.4519,
      "step": 113
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.983464241027832,
      "learning_rate": 0.00019376674973408075,
      "loss": 1.3189,
      "step": 114
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.7697935700416565,
      "learning_rate": 0.00019363485451491524,
      "loss": 1.0372,
      "step": 115
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.8599738478660583,
      "learning_rate": 0.0001935016242685415,
      "loss": 1.1381,
      "step": 116
    },
    {
      "epoch": 0.27,
      "grad_norm": 1.0430530309677124,
      "learning_rate": 0.00019336706089452996,
      "loss": 1.2332,
      "step": 117
    },
    {
      "epoch": 0.28,
      "grad_norm": 1.163251280784607,
      "learning_rate": 0.0001932311663114586,
      "loss": 1.1588,
      "step": 118
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.8196917176246643,
      "learning_rate": 0.0001930939424568854,
      "loss": 1.1114,
      "step": 119
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.8848841786384583,
      "learning_rate": 0.00019295539128732093,
      "loss": 1.0458,
      "step": 120
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.9288797974586487,
      "learning_rate": 0.00019281551477820036,
      "loss": 0.7388,
      "step": 121
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.8119339942932129,
      "learning_rate": 0.00019267431492385521,
      "loss": 1.1095,
      "step": 122
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.9211128354072571,
      "learning_rate": 0.00019253179373748504,
      "loss": 1.3183,
      "step": 123
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.9682132601737976,
      "learning_rate": 0.0001923879532511287,
      "loss": 1.3786,
      "step": 124
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.9330196976661682,
      "learning_rate": 0.00019224279551563532,
      "loss": 1.4051,
      "step": 125
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.7206214070320129,
      "learning_rate": 0.0001920963226006352,
      "loss": 0.8213,
      "step": 126
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.6096452474594116,
      "learning_rate": 0.0001919485365945101,
      "loss": 0.5982,
      "step": 127
    },
    {
      "epoch": 0.3,
      "grad_norm": 1.0045711994171143,
      "learning_rate": 0.00019179943960436358,
      "loss": 1.1399,
      "step": 128
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.8744232058525085,
      "learning_rate": 0.00019164903375599112,
      "loss": 1.0176,
      "step": 129
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.8323287963867188,
      "learning_rate": 0.00019149732119384943,
      "loss": 1.2937,
      "step": 130
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.9033424854278564,
      "learning_rate": 0.00019134430408102615,
      "loss": 1.3108,
      "step": 131
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.7392603158950806,
      "learning_rate": 0.00019118998459920902,
      "loss": 1.0297,
      "step": 132
    },
    {
      "epoch": 0.31,
      "grad_norm": 1.136338710784912,
      "learning_rate": 0.0001910343649486546,
      "loss": 1.3037,
      "step": 133
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.8202427625656128,
      "learning_rate": 0.00019087744734815708,
      "loss": 0.9239,
      "step": 134
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.039093255996704,
      "learning_rate": 0.0001907192340350165,
      "loss": 1.2387,
      "step": 135
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.1009947061538696,
      "learning_rate": 0.00019055972726500695,
      "loss": 1.0234,
      "step": 136
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.057837724685669,
      "learning_rate": 0.00019039892931234435,
      "loss": 1.4308,
      "step": 137
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.8715041279792786,
      "learning_rate": 0.00019023684246965406,
      "loss": 1.0642,
      "step": 138
    },
    {
      "epoch": 0.33,
      "grad_norm": 1.3314340114593506,
      "learning_rate": 0.00019007346904793818,
      "loss": 1.1097,
      "step": 139
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.6808525919914246,
      "learning_rate": 0.00018990881137654258,
      "loss": 0.9254,
      "step": 140
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.8727401494979858,
      "learning_rate": 0.00018974287180312377,
      "loss": 1.1062,
      "step": 141
    },
    {
      "epoch": 0.33,
      "grad_norm": 1.0826424360275269,
      "learning_rate": 0.00018957565269361531,
      "loss": 1.1528,
      "step": 142
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.8825279474258423,
      "learning_rate": 0.00018940715643219407,
      "loss": 1.2208,
      "step": 143
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.8955380320549011,
      "learning_rate": 0.00018923738542124644,
      "loss": 1.0918,
      "step": 144
    },
    {
      "epoch": 0.34,
      "grad_norm": 1.0863996744155884,
      "learning_rate": 0.00018906634208133385,
      "loss": 1.2153,
      "step": 145
    },
    {
      "epoch": 0.34,
      "grad_norm": 1.0089964866638184,
      "learning_rate": 0.00018889402885115833,
      "loss": 1.0796,
      "step": 146
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.9210363626480103,
      "learning_rate": 0.0001887204481875278,
      "loss": 0.8502,
      "step": 147
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.9592724442481995,
      "learning_rate": 0.000188545602565321,
      "loss": 1.4313,
      "step": 148
    },
    {
      "epoch": 0.35,
      "grad_norm": 1.2299224138259888,
      "learning_rate": 0.00018836949447745215,
      "loss": 1.1074,
      "step": 149
    },
    {
      "epoch": 0.35,
      "grad_norm": 1.1486583948135376,
      "learning_rate": 0.0001881921264348355,
      "loss": 1.3576,
      "step": 150
    },
    {
      "epoch": 0.35,
      "grad_norm": 1.1912083625793457,
      "learning_rate": 0.00018801350096634946,
      "loss": 1.343,
      "step": 151
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.8830162882804871,
      "learning_rate": 0.00018783362061880062,
      "loss": 1.6139,
      "step": 152
    },
    {
      "epoch": 0.36,
      "grad_norm": 1.0919363498687744,
      "learning_rate": 0.00018765248795688726,
      "loss": 1.4051,
      "step": 153
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.8009780049324036,
      "learning_rate": 0.00018747010556316305,
      "loss": 1.4095,
      "step": 154
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.964438796043396,
      "learning_rate": 0.00018728647603800003,
      "loss": 1.1634,
      "step": 155
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.9883137941360474,
      "learning_rate": 0.00018710160199955156,
      "loss": 1.1904,
      "step": 156
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.8936368227005005,
      "learning_rate": 0.0001869154860837151,
      "loss": 1.2264,
      "step": 157
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.6435540914535522,
      "learning_rate": 0.0001867281309440945,
      "loss": 0.8426,
      "step": 158
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.7036202549934387,
      "learning_rate": 0.00018653953925196225,
      "loss": 0.8162,
      "step": 159
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.7669593095779419,
      "learning_rate": 0.0001863497136962213,
      "loss": 0.9143,
      "step": 160
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.8047689199447632,
      "learning_rate": 0.00018615865698336684,
      "loss": 1.1911,
      "step": 161
    },
    {
      "epoch": 0.38,
      "grad_norm": 1.1324609518051147,
      "learning_rate": 0.00018596637183744763,
      "loss": 1.064,
      "step": 162
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.8058610558509827,
      "learning_rate": 0.00018577286100002723,
      "loss": 0.7428,
      "step": 163
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.8588782548904419,
      "learning_rate": 0.00018557812723014476,
      "loss": 0.8801,
      "step": 164
    },
    {
      "epoch": 0.39,
      "grad_norm": 1.0751081705093384,
      "learning_rate": 0.00018538217330427582,
      "loss": 1.3674,
      "step": 165
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.791789174079895,
      "learning_rate": 0.00018518500201629258,
      "loss": 1.0103,
      "step": 166
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.8154894709587097,
      "learning_rate": 0.00018498661617742426,
      "loss": 1.1219,
      "step": 167
    },
    {
      "epoch": 0.39,
      "grad_norm": 1.3946795463562012,
      "learning_rate": 0.00018478701861621686,
      "loss": 1.0725,
      "step": 168
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.9187031388282776,
      "learning_rate": 0.00018458621217849286,
      "loss": 1.2674,
      "step": 169
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.9884739518165588,
      "learning_rate": 0.00018438419972731067,
      "loss": 1.3507,
      "step": 170
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.4417808055877686,
      "learning_rate": 0.0001841809841429238,
      "loss": 1.129,
      "step": 171
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.0408543348312378,
      "learning_rate": 0.0001839765683227398,
      "loss": 1.2038,
      "step": 172
    },
    {
      "epoch": 0.41,
      "grad_norm": 1.2746024131774902,
      "learning_rate": 0.00018377095518127897,
      "loss": 1.2916,
      "step": 173
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.996474027633667,
      "learning_rate": 0.00018356414765013267,
      "loss": 1.3041,
      "step": 174
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.9435645341873169,
      "learning_rate": 0.00018335614867792183,
      "loss": 1.3457,
      "step": 175
    },
    {
      "epoch": 0.41,
      "grad_norm": 1.0813288688659668,
      "learning_rate": 0.00018314696123025454,
      "loss": 1.2323,
      "step": 176
    },
    {
      "epoch": 0.42,
      "grad_norm": 1.1004307270050049,
      "learning_rate": 0.00018293658828968397,
      "loss": 1.3084,
      "step": 177
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.7818973660469055,
      "learning_rate": 0.00018272503285566587,
      "loss": 0.88,
      "step": 178
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.8897396326065063,
      "learning_rate": 0.00018251229794451567,
      "loss": 1.0124,
      "step": 179
    },
    {
      "epoch": 0.42,
      "grad_norm": 1.0572400093078613,
      "learning_rate": 0.00018229838658936564,
      "loss": 1.4603,
      "step": 180
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.8974701166152954,
      "learning_rate": 0.0001820833018401215,
      "loss": 1.1961,
      "step": 181
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.8860751390457153,
      "learning_rate": 0.00018186704676341898,
      "loss": 0.9779,
      "step": 182
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.7719995975494385,
      "learning_rate": 0.00018164962444258014,
      "loss": 1.1156,
      "step": 183
    },
    {
      "epoch": 0.43,
      "grad_norm": 1.0823787450790405,
      "learning_rate": 0.0001814310379775694,
      "loss": 0.959,
      "step": 184
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.9932149052619934,
      "learning_rate": 0.00018121129048494922,
      "loss": 1.358,
      "step": 185
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.7363601326942444,
      "learning_rate": 0.00018099038509783582,
      "loss": 1.2639,
      "step": 186
    },
    {
      "epoch": 0.44,
      "grad_norm": 1.0355418920516968,
      "learning_rate": 0.0001807683249658545,
      "loss": 1.2566,
      "step": 187
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.8746327757835388,
      "learning_rate": 0.0001805451132550946,
      "loss": 1.2159,
      "step": 188
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.9230121374130249,
      "learning_rate": 0.00018032075314806448,
      "loss": 1.1717,
      "step": 189
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.8293251991271973,
      "learning_rate": 0.00018009524784364615,
      "loss": 1.1053,
      "step": 190
    },
    {
      "epoch": 0.45,
      "grad_norm": 1.8819063901901245,
      "learning_rate": 0.00017986860055704953,
      "loss": 1.4713,
      "step": 191
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.843971312046051,
      "learning_rate": 0.00017964081451976672,
      "loss": 1.175,
      "step": 192
    },
    {
      "epoch": 0.45,
      "grad_norm": 1.1449857950210571,
      "learning_rate": 0.00017941189297952597,
      "loss": 1.0097,
      "step": 193
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.8204749226570129,
      "learning_rate": 0.0001791818392002452,
      "loss": 1.0383,
      "step": 194
    },
    {
      "epoch": 0.46,
      "grad_norm": 1.0437519550323486,
      "learning_rate": 0.00017895065646198567,
      "loss": 1.4455,
      "step": 195
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.803774356842041,
      "learning_rate": 0.00017871834806090501,
      "loss": 0.9266,
      "step": 196
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.991162121295929,
      "learning_rate": 0.00017848491730921046,
      "loss": 0.9433,
      "step": 197
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.9293928146362305,
      "learning_rate": 0.00017825036753511144,
      "loss": 1.2432,
      "step": 198
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.6806849241256714,
      "learning_rate": 0.0001780147020827721,
      "loss": 0.696,
      "step": 199
    },
    {
      "epoch": 0.47,
      "grad_norm": 1.2757548093795776,
      "learning_rate": 0.00017777792431226383,
      "loss": 1.3426,
      "step": 200
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.8645375370979309,
      "learning_rate": 0.00017754003759951715,
      "loss": 1.0345,
      "step": 201
    },
    {
      "epoch": 0.47,
      "grad_norm": 1.0262008905410767,
      "learning_rate": 0.0001773010453362737,
      "loss": 0.935,
      "step": 202
    },
    {
      "epoch": 0.48,
      "grad_norm": 1.225273609161377,
      "learning_rate": 0.00017706095093003785,
      "loss": 1.1271,
      "step": 203
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.9024125337600708,
      "learning_rate": 0.00017681975780402807,
      "loss": 1.1307,
      "step": 204
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.9035881161689758,
      "learning_rate": 0.00017657746939712815,
      "loss": 1.3217,
      "step": 205
    },
    {
      "epoch": 0.48,
      "grad_norm": 2.1178483963012695,
      "learning_rate": 0.00017633408916383826,
      "loss": 1.4955,
      "step": 206
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.826454222202301,
      "learning_rate": 0.00017608962057422549,
      "loss": 1.187,
      "step": 207
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.8255906701087952,
      "learning_rate": 0.00017584406711387463,
      "loss": 1.0733,
      "step": 208
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.9498797059059143,
      "learning_rate": 0.0001755974322838382,
      "loss": 1.3054,
      "step": 209
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.7860575914382935,
      "learning_rate": 0.00017534971960058685,
      "loss": 1.0231,
      "step": 210
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.905441403388977,
      "learning_rate": 0.00017510093259595885,
      "loss": 1.2928,
      "step": 211
    },
    {
      "epoch": 0.5,
      "grad_norm": 1.0108212232589722,
      "learning_rate": 0.00017485107481711012,
      "loss": 0.8964,
      "step": 212
    },
    {
      "epoch": 0.5,
      "grad_norm": 1.036566972732544,
      "learning_rate": 0.00017460014982646334,
      "loss": 1.4823,
      "step": 213
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.8670862913131714,
      "learning_rate": 0.00017434816120165728,
      "loss": 1.0994,
      "step": 214
    },
    {
      "epoch": 0.5,
      "eval_loss": 1.1271697282791138,
      "eval_runtime": 4.9762,
      "eval_samples_per_second": 20.096,
      "eval_steps_per_second": 20.096,
      "step": 214
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.9359840750694275,
      "learning_rate": 0.00017409511253549593,
      "loss": 1.1552,
      "step": 215
    },
    {
      "epoch": 0.51,
      "grad_norm": 1.0469551086425781,
      "learning_rate": 0.00017384100743589697,
      "loss": 1.1175,
      "step": 216
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.8635923266410828,
      "learning_rate": 0.0001735858495258406,
      "loss": 0.9823,
      "step": 217
    },
    {
      "epoch": 0.51,
      "grad_norm": 1.1676996946334839,
      "learning_rate": 0.00017332964244331776,
      "loss": 1.2903,
      "step": 218
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.9774354696273804,
      "learning_rate": 0.00017307238984127832,
      "loss": 1.1928,
      "step": 219
    },
    {
      "epoch": 0.52,
      "grad_norm": 1.0486751794815063,
      "learning_rate": 0.00017281409538757883,
      "loss": 1.039,
      "step": 220
    },
    {
      "epoch": 0.52,
      "grad_norm": 1.0949512720108032,
      "learning_rate": 0.00017255476276493056,
      "loss": 1.148,
      "step": 221
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.8077650666236877,
      "learning_rate": 0.0001722943956708466,
      "loss": 1.1376,
      "step": 222
    },
    {
      "epoch": 0.52,
      "grad_norm": 1.0595574378967285,
      "learning_rate": 0.00017203299781758943,
      "loss": 1.0757,
      "step": 223
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.813774049282074,
      "learning_rate": 0.00017177057293211784,
      "loss": 1.2645,
      "step": 224
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.8764140009880066,
      "learning_rate": 0.0001715071247560339,
      "loss": 0.9552,
      "step": 225
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.9136344194412231,
      "learning_rate": 0.0001712426570455295,
      "loss": 1.1841,
      "step": 226
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.8742187023162842,
      "learning_rate": 0.00017097717357133284,
      "loss": 1.0314,
      "step": 227
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.8309169411659241,
      "learning_rate": 0.00017071067811865476,
      "loss": 0.9842,
      "step": 228
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.8644968867301941,
      "learning_rate": 0.00017044317448713461,
      "loss": 1.3819,
      "step": 229
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.8510339260101318,
      "learning_rate": 0.0001701746664907862,
      "loss": 1.2385,
      "step": 230
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.8174643516540527,
      "learning_rate": 0.00016990515795794334,
      "loss": 0.9789,
      "step": 231
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.9340826272964478,
      "learning_rate": 0.0001696346527312053,
      "loss": 1.2472,
      "step": 232
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.9614835977554321,
      "learning_rate": 0.00016936315466738205,
      "loss": 1.1588,
      "step": 233
    },
    {
      "epoch": 0.55,
      "grad_norm": 1.1464625597000122,
      "learning_rate": 0.00016909066763743912,
      "loss": 0.8365,
      "step": 234
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.8775334358215332,
      "learning_rate": 0.00016881719552644273,
      "loss": 1.1143,
      "step": 235
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.9431893825531006,
      "learning_rate": 0.00016854274223350397,
      "loss": 1.362,
      "step": 236
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.7953469157218933,
      "learning_rate": 0.0001682673116717236,
      "loss": 1.1568,
      "step": 237
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.7517049908638,
      "learning_rate": 0.00016799090776813597,
      "loss": 0.9274,
      "step": 238
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.796934187412262,
      "learning_rate": 0.00016771353446365318,
      "loss": 0.8641,
      "step": 239
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.7946231961250305,
      "learning_rate": 0.00016743519571300888,
      "loss": 1.0518,
      "step": 240
    },
    {
      "epoch": 0.57,
      "grad_norm": 1.0859878063201904,
      "learning_rate": 0.00016715589548470185,
      "loss": 1.1815,
      "step": 241
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.7418296933174133,
      "learning_rate": 0.00016687563776093941,
      "loss": 1.0321,
      "step": 242
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.8161245584487915,
      "learning_rate": 0.00016659442653758064,
      "loss": 1.0931,
      "step": 243
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.7787612080574036,
      "learning_rate": 0.00016631226582407952,
      "loss": 1.2239,
      "step": 244
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.7161651849746704,
      "learning_rate": 0.00016602915964342757,
      "loss": 1.1104,
      "step": 245
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.786612331867218,
      "learning_rate": 0.00016574511203209667,
      "loss": 1.2486,
      "step": 246
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.8251045942306519,
      "learning_rate": 0.00016546012703998138,
      "loss": 1.2358,
      "step": 247
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.8045319318771362,
      "learning_rate": 0.00016517420873034123,
      "loss": 0.8145,
      "step": 248
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.8730091452598572,
      "learning_rate": 0.0001648873611797429,
      "loss": 0.8832,
      "step": 249
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.9003087878227234,
      "learning_rate": 0.00016459958847800187,
      "loss": 1.1149,
      "step": 250
    },
    {
      "epoch": 0.59,
      "grad_norm": 1.0912461280822754,
      "learning_rate": 0.00016431089472812444,
      "loss": 1.0439,
      "step": 251
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.7999249696731567,
      "learning_rate": 0.00016402128404624882,
      "loss": 0.9821,
      "step": 252
    },
    {
      "epoch": 0.59,
      "grad_norm": 1.0122705698013306,
      "learning_rate": 0.00016373076056158675,
      "loss": 1.2302,
      "step": 253
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.6447519659996033,
      "learning_rate": 0.00016343932841636456,
      "loss": 0.6079,
      "step": 254
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.7757092118263245,
      "learning_rate": 0.00016314699176576402,
      "loss": 1.0092,
      "step": 255
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.7445678114891052,
      "learning_rate": 0.00016285375477786322,
      "loss": 0.684,
      "step": 256
    },
    {
      "epoch": 0.6,
      "grad_norm": 1.241065263748169,
      "learning_rate": 0.000162559621633577,
      "loss": 1.0321,
      "step": 257
    },
    {
      "epoch": 0.61,
      "grad_norm": 1.1429563760757446,
      "learning_rate": 0.00016226459652659753,
      "loss": 1.1635,
      "step": 258
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.7441573739051819,
      "learning_rate": 0.0001619686836633343,
      "loss": 0.9685,
      "step": 259
    },
    {
      "epoch": 0.61,
      "grad_norm": 1.3199349641799927,
      "learning_rate": 0.00016167188726285434,
      "loss": 1.3159,
      "step": 260
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.8082245588302612,
      "learning_rate": 0.00016137421155682183,
      "loss": 1.317,
      "step": 261
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.9036356210708618,
      "learning_rate": 0.0001610756607894382,
      "loss": 0.8672,
      "step": 262
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.9773459434509277,
      "learning_rate": 0.00016077623921738102,
      "loss": 1.1405,
      "step": 263
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.9840144515037537,
      "learning_rate": 0.00016047595110974376,
      "loss": 1.4167,
      "step": 264
    },
    {
      "epoch": 0.62,
      "grad_norm": 1.0358545780181885,
      "learning_rate": 0.0001601748007479748,
      "loss": 1.196,
      "step": 265
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.7097404599189758,
      "learning_rate": 0.0001598727924258164,
      "loss": 0.791,
      "step": 266
    },
    {
      "epoch": 0.63,
      "grad_norm": 1.7330995798110962,
      "learning_rate": 0.00015956993044924334,
      "loss": 1.4283,
      "step": 267
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.7444025278091431,
      "learning_rate": 0.0001592662191364017,
      "loss": 0.7525,
      "step": 268
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.9818642139434814,
      "learning_rate": 0.0001589616628175472,
      "loss": 1.2417,
      "step": 269
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.9218468070030212,
      "learning_rate": 0.00015865626583498355,
      "loss": 1.1316,
      "step": 270
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.9644055366516113,
      "learning_rate": 0.00015835003254300039,
      "loss": 1.2594,
      "step": 271
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.8228254914283752,
      "learning_rate": 0.00015804296730781135,
      "loss": 1.2481,
      "step": 272
    },
    {
      "epoch": 0.64,
      "grad_norm": 1.0708152055740356,
      "learning_rate": 0.00015773507450749172,
      "loss": 1.107,
      "step": 273
    },
    {
      "epoch": 0.64,
      "grad_norm": 1.1122934818267822,
      "learning_rate": 0.00015742635853191608,
      "loss": 0.8714,
      "step": 274
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.8141905665397644,
      "learning_rate": 0.00015711682378269565,
      "loss": 0.9943,
      "step": 275
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.8955541253089905,
      "learning_rate": 0.00015680647467311557,
      "loss": 1.3176,
      "step": 276
    },
    {
      "epoch": 0.65,
      "grad_norm": 1.1133729219436646,
      "learning_rate": 0.000156495315628072,
      "loss": 1.2602,
      "step": 277
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.8733439445495605,
      "learning_rate": 0.00015618335108400893,
      "loss": 1.3639,
      "step": 278
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.8614795804023743,
      "learning_rate": 0.00015587058548885505,
      "loss": 1.1905,
      "step": 279
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.8306368589401245,
      "learning_rate": 0.00015555702330196023,
      "loss": 1.1978,
      "step": 280
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.8460854887962341,
      "learning_rate": 0.00015524266899403206,
      "loss": 0.9872,
      "step": 281
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.8452059626579285,
      "learning_rate": 0.000154927527047072,
      "loss": 0.979,
      "step": 282
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.8805731534957886,
      "learning_rate": 0.00015461160195431148,
      "loss": 1.2885,
      "step": 283
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.9095639586448669,
      "learning_rate": 0.0001542948982201479,
      "loss": 1.1156,
      "step": 284
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.9862900376319885,
      "learning_rate": 0.00015397742036008034,
      "loss": 1.1571,
      "step": 285
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.9344744086265564,
      "learning_rate": 0.0001536591729006453,
      "loss": 1.2204,
      "step": 286
    },
    {
      "epoch": 0.67,
      "grad_norm": 1.0605379343032837,
      "learning_rate": 0.00015334016037935196,
      "loss": 1.3048,
      "step": 287
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.9844763278961182,
      "learning_rate": 0.0001530203873446177,
      "loss": 1.0035,
      "step": 288
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.767954409122467,
      "learning_rate": 0.0001526998583557031,
      "loss": 0.9023,
      "step": 289
    },
    {
      "epoch": 0.68,
      "grad_norm": 1.0622146129608154,
      "learning_rate": 0.000152378577982647,
      "loss": 1.4837,
      "step": 290
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.9536969065666199,
      "learning_rate": 0.0001520565508062013,
      "loss": 1.0948,
      "step": 291
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.9654991030693054,
      "learning_rate": 0.00015173378141776568,
      "loss": 1.1913,
      "step": 292
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.8208710551261902,
      "learning_rate": 0.00015141027441932216,
      "loss": 1.1435,
      "step": 293
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.9273961186408997,
      "learning_rate": 0.0001510860344233695,
      "loss": 1.0845,
      "step": 294
    },
    {
      "epoch": 0.69,
      "grad_norm": 1.0316227674484253,
      "learning_rate": 0.00015076106605285724,
      "loss": 1.4532,
      "step": 295
    },
    {
      "epoch": 0.69,
      "grad_norm": 1.0121437311172485,
      "learning_rate": 0.00015043537394112007,
      "loss": 0.8687,
      "step": 296
    },
    {
      "epoch": 0.7,
      "grad_norm": 1.0713882446289062,
      "learning_rate": 0.00015010896273181165,
      "loss": 1.1097,
      "step": 297
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.8149722814559937,
      "learning_rate": 0.00014978183707883827,
      "loss": 0.8682,
      "step": 298
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.7118079662322998,
      "learning_rate": 0.00014945400164629278,
      "loss": 0.9225,
      "step": 299
    },
    {
      "epoch": 0.7,
      "grad_norm": 1.1042624711990356,
      "learning_rate": 0.00014912546110838775,
      "loss": 1.4279,
      "step": 300
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.947619616985321,
      "learning_rate": 0.00014879622014938915,
      "loss": 1.0544,
      "step": 301
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.9065904021263123,
      "learning_rate": 0.00014846628346354933,
      "loss": 1.1642,
      "step": 302
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.9430320262908936,
      "learning_rate": 0.00014813565575504022,
      "loss": 1.2182,
      "step": 303
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.8739117980003357,
      "learning_rate": 0.00014780434173788617,
      "loss": 1.0176,
      "step": 304
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.853125274181366,
      "learning_rate": 0.00014747234613589685,
      "loss": 1.1827,
      "step": 305
    },
    {
      "epoch": 0.72,
      "grad_norm": 1.6718727350234985,
      "learning_rate": 0.0001471396736825998,
      "loss": 1.2665,
      "step": 306
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.8566248416900635,
      "learning_rate": 0.00014680632912117286,
      "loss": 1.2231,
      "step": 307
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.6841180324554443,
      "learning_rate": 0.00014647231720437686,
      "loss": 0.9366,
      "step": 308
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.9140876531600952,
      "learning_rate": 0.00014613764269448751,
      "loss": 1.0711,
      "step": 309
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.9394497275352478,
      "learning_rate": 0.00014580231036322768,
      "loss": 1.1159,
      "step": 310
    },
    {
      "epoch": 0.73,
      "grad_norm": 1.1066112518310547,
      "learning_rate": 0.00014546632499169937,
      "loss": 1.3487,
      "step": 311
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.9925751090049744,
      "learning_rate": 0.00014512969137031538,
      "loss": 1.1207,
      "step": 312
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.9642359018325806,
      "learning_rate": 0.0001447924142987312,
      "loss": 1.3772,
      "step": 313
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.6977396607398987,
      "learning_rate": 0.0001444544985857766,
      "loss": 0.8517,
      "step": 314
    },
    {
      "epoch": 0.74,
      "grad_norm": 1.033882737159729,
      "learning_rate": 0.00014411594904938682,
      "loss": 1.0644,
      "step": 315
    },
    {
      "epoch": 0.74,
      "grad_norm": 1.020871877670288,
      "learning_rate": 0.00014377677051653404,
      "loss": 1.2026,
      "step": 316
    },
    {
      "epoch": 0.74,
      "grad_norm": 1.059812068939209,
      "learning_rate": 0.0001434369678231587,
      "loss": 1.4181,
      "step": 317
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.8130291104316711,
      "learning_rate": 0.00014309654581410024,
      "loss": 1.0691,
      "step": 318
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.8362820148468018,
      "learning_rate": 0.00014275550934302823,
      "loss": 1.0053,
      "step": 319
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.9266586899757385,
      "learning_rate": 0.0001424138632723731,
      "loss": 1.1313,
      "step": 320
    },
    {
      "epoch": 0.75,
      "grad_norm": 1.0162605047225952,
      "learning_rate": 0.00014207161247325691,
      "loss": 1.3518,
      "step": 321
    },
    {
      "epoch": 0.75,
      "eval_loss": 1.114696741104126,
      "eval_runtime": 5.1062,
      "eval_samples_per_second": 19.584,
      "eval_steps_per_second": 19.584,
      "step": 321
    },
    {
      "epoch": 0.76,
      "grad_norm": 1.0078846216201782,
      "learning_rate": 0.00014172876182542372,
      "loss": 1.0446,
      "step": 322
    },
    {
      "epoch": 0.76,
      "grad_norm": 1.2844680547714233,
      "learning_rate": 0.00014138531621717018,
      "loss": 1.4105,
      "step": 323
    },
    {
      "epoch": 0.76,
      "grad_norm": 1.0380208492279053,
      "learning_rate": 0.0001410412805452757,
      "loss": 1.4212,
      "step": 324
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.8037036061286926,
      "learning_rate": 0.00014069665971493274,
      "loss": 0.8392,
      "step": 325
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.9248948693275452,
      "learning_rate": 0.00014035145863967692,
      "loss": 1.3121,
      "step": 326
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.8579298853874207,
      "learning_rate": 0.0001400056822413167,
      "loss": 1.1128,
      "step": 327
    },
    {
      "epoch": 0.77,
      "grad_norm": 1.0605120658874512,
      "learning_rate": 0.0001396593354498635,
      "loss": 1.543,
      "step": 328
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.9975443482398987,
      "learning_rate": 0.0001393124232034613,
      "loss": 1.1178,
      "step": 329
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.8115065693855286,
      "learning_rate": 0.0001389649504483162,
      "loss": 1.1937,
      "step": 330
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.7796252369880676,
      "learning_rate": 0.00013861692213862584,
      "loss": 1.1886,
      "step": 331
    },
    {
      "epoch": 0.78,
      "grad_norm": 1.0133821964263916,
      "learning_rate": 0.000138268343236509,
      "loss": 1.4973,
      "step": 332
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.9557147026062012,
      "learning_rate": 0.00013791921871193457,
      "loss": 1.4592,
      "step": 333
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.9763726592063904,
      "learning_rate": 0.00013756955354265085,
      "loss": 0.8502,
      "step": 334
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.8208116888999939,
      "learning_rate": 0.00013721935271411464,
      "loss": 1.1601,
      "step": 335
    },
    {
      "epoch": 0.79,
      "grad_norm": 1.3176727294921875,
      "learning_rate": 0.0001368686212194199,
      "loss": 1.1715,
      "step": 336
    },
    {
      "epoch": 0.79,
      "grad_norm": 1.2329626083374023,
      "learning_rate": 0.00013651736405922686,
      "loss": 1.3426,
      "step": 337
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.9947068691253662,
      "learning_rate": 0.0001361655862416905,
      "loss": 1.0623,
      "step": 338
    },
    {
      "epoch": 0.8,
      "grad_norm": 1.176267147064209,
      "learning_rate": 0.00013581329278238927,
      "loss": 1.1281,
      "step": 339
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.909443736076355,
      "learning_rate": 0.00013546048870425356,
      "loss": 1.2623,
      "step": 340
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.8919989466667175,
      "learning_rate": 0.000135107179037494,
      "loss": 1.2652,
      "step": 341
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.7781542539596558,
      "learning_rate": 0.00013475336881952986,
      "loss": 0.9857,
      "step": 342
    },
    {
      "epoch": 0.81,
      "grad_norm": 0.9232913851737976,
      "learning_rate": 0.00013439906309491712,
      "loss": 1.0923,
      "step": 343
    },
    {
      "epoch": 0.81,
      "grad_norm": 1.1160950660705566,
      "learning_rate": 0.0001340442669152766,
      "loss": 1.3445,
      "step": 344
    },
    {
      "epoch": 0.81,
      "grad_norm": 1.084597110748291,
      "learning_rate": 0.000133688985339222,
      "loss": 1.7647,
      "step": 345
    },
    {
      "epoch": 0.81,
      "grad_norm": 0.8420549631118774,
      "learning_rate": 0.0001333332234322876,
      "loss": 1.1342,
      "step": 346
    },
    {
      "epoch": 0.81,
      "grad_norm": 1.0362187623977661,
      "learning_rate": 0.0001329769862668563,
      "loss": 1.0779,
      "step": 347
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.902492344379425,
      "learning_rate": 0.00013262027892208694,
      "loss": 1.1121,
      "step": 348
    },
    {
      "epoch": 0.82,
      "grad_norm": 1.4322317838668823,
      "learning_rate": 0.0001322631064838422,
      "loss": 1.5474,
      "step": 349
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.8751888275146484,
      "learning_rate": 0.00013190547404461598,
      "loss": 1.2055,
      "step": 350
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.9157432913780212,
      "learning_rate": 0.0001315473867034608,
      "loss": 1.3176,
      "step": 351
    },
    {
      "epoch": 0.83,
      "grad_norm": 0.7300966382026672,
      "learning_rate": 0.0001311888495659149,
      "loss": 0.9548,
      "step": 352
    },
    {
      "epoch": 0.83,
      "grad_norm": 1.0954256057739258,
      "learning_rate": 0.0001308298677439299,
      "loss": 1.1649,
      "step": 353
    },
    {
      "epoch": 0.83,
      "grad_norm": 1.0646469593048096,
      "learning_rate": 0.00013047044635579747,
      "loss": 1.3597,
      "step": 354
    },
    {
      "epoch": 0.83,
      "grad_norm": 0.7668378949165344,
      "learning_rate": 0.00013011059052607656,
      "loss": 1.1246,
      "step": 355
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.9135538339614868,
      "learning_rate": 0.00012975030538552032,
      "loss": 1.0189,
      "step": 356
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.7841051816940308,
      "learning_rate": 0.00012938959607100288,
      "loss": 1.1396,
      "step": 357
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.9529784321784973,
      "learning_rate": 0.00012902846772544624,
      "loss": 1.4681,
      "step": 358
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.8711650967597961,
      "learning_rate": 0.00012866692549774682,
      "loss": 0.9842,
      "step": 359
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.9562662839889526,
      "learning_rate": 0.00012830497454270205,
      "loss": 1.3051,
      "step": 360
    },
    {
      "epoch": 0.85,
      "grad_norm": 1.0756105184555054,
      "learning_rate": 0.00012794262002093697,
      "loss": 1.3275,
      "step": 361
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.7915710806846619,
      "learning_rate": 0.0001275798670988306,
      "loss": 1.0035,
      "step": 362
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.9524595737457275,
      "learning_rate": 0.0001272167209484422,
      "loss": 1.2083,
      "step": 363
    },
    {
      "epoch": 0.85,
      "grad_norm": 1.4926435947418213,
      "learning_rate": 0.0001268531867474377,
      "loss": 1.3218,
      "step": 364
    },
    {
      "epoch": 0.86,
      "grad_norm": 1.2689683437347412,
      "learning_rate": 0.00012648926967901567,
      "loss": 2.7813,
      "step": 365
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.8361314535140991,
      "learning_rate": 0.00012612497493183364,
| "loss": 1.124, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 1.2996618747711182, | |
| "learning_rate": 0.00012576030769993393, | |
| "loss": 1.3745, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.8248890042304993, | |
| "learning_rate": 0.0001253952731826697, | |
| "loss": 1.1971, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 0.8044300079345703, | |
| "learning_rate": 0.00012502987658463075, | |
| "loss": 1.1508, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 1.196742057800293, | |
| "learning_rate": 0.00012466412311556952, | |
| "loss": 0.9868, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 0.9415065050125122, | |
| "learning_rate": 0.0001242980179903264, | |
| "loss": 1.046, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 1.049695611000061, | |
| "learning_rate": 0.0001239315664287558, | |
| "loss": 0.9927, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.8266507387161255, | |
| "learning_rate": 0.00012356477365565148, | |
| "loss": 0.8879, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.9163070321083069, | |
| "learning_rate": 0.0001231976449006721, | |
| "loss": 1.1214, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 1.01756751537323, | |
| "learning_rate": 0.00012283018539826685, | |
| "loss": 1.1644, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.901319682598114, | |
| "learning_rate": 0.00012246240038760043, | |
| "loss": 1.1985, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.9721381664276123, | |
| "learning_rate": 0.00012209429511247864, | |
| "loss": 1.1199, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.8883329033851624, | |
| "learning_rate": 0.0001217258748212737, | |
| "loss": 1.3431, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 1.0698317289352417, | |
| "learning_rate": 0.00012135714476684903, | |
| "loss": 1.3173, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.8664084076881409, | |
| "learning_rate": 0.00012098811020648475, | |
| "loss": 1.0441, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.9194340109825134, | |
| "learning_rate": 0.00012061877640180255, | |
| "loss": 1.152, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.9599464535713196, | |
| "learning_rate": 0.00012024914861869063, | |
| "loss": 1.1115, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.9990159273147583, | |
| "learning_rate": 0.00011987923212722872, | |
| "loss": 1.2534, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.8435646891593933, | |
| "learning_rate": 0.00011950903220161285, | |
| "loss": 1.1752, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 1.0376098155975342, | |
| "learning_rate": 0.00011913855412008023, | |
| "loss": 1.4716, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 1.3249186277389526, | |
| "learning_rate": 0.00011876780316483401, | |
| "loss": 1.211, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 1.378393292427063, | |
| "learning_rate": 0.00011839678462196784, | |
| "loss": 1.0357, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 0.7574142217636108, | |
| "learning_rate": 0.0001180255037813906, | |
| "loss": 0.4137, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 0.7813417911529541, | |
| "learning_rate": 0.00011765396593675097, | |
| "loss": 1.1776, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.8787057995796204, | |
| "learning_rate": 0.00011728217638536197, | |
| "loss": 1.1352, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.9643175005912781, | |
| "learning_rate": 0.00011691014042812536, | |
| "loss": 1.3089, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.9101107716560364, | |
| "learning_rate": 0.00011653786336945614, | |
| "loss": 1.0639, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 1.018091082572937, | |
| "learning_rate": 0.00011616535051720685, | |
| "loss": 0.9938, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.9708930253982544, | |
| "learning_rate": 0.00011579260718259197, | |
| "loss": 0.8004, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.8909386396408081, | |
| "learning_rate": 0.00011541963868011212, | |
| "loss": 1.2997, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 1.0622750520706177, | |
| "learning_rate": 0.00011504645032747832, | |
| "loss": 1.0235, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.8857365250587463, | |
| "learning_rate": 0.00011467304744553618, | |
| "loss": 0.8382, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.8980242013931274, | |
| "learning_rate": 0.00011429943535819005, | |
| "loss": 1.0877, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 1.1426031589508057, | |
| "learning_rate": 0.00011392561939232706, | |
| "loss": 1.3496, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 1.0347543954849243, | |
| "learning_rate": 0.0001135516048777412, | |
| "loss": 1.6309, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 1.0121687650680542, | |
| "learning_rate": 0.00011317739714705731, | |
| "loss": 1.2256, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.8863442540168762, | |
| "learning_rate": 0.0001128030015356551, | |
| "loss": 0.8687, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.7622981667518616, | |
| "learning_rate": 0.00011242842338159309, | |
| "loss": 0.7564, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.9527961015701294, | |
| "learning_rate": 0.0001120536680255323, | |
| "loss": 1.0593, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 1.3481955528259277, | |
| "learning_rate": 0.00011167874081066045, | |
| "loss": 1.2279, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.8665672540664673, | |
| "learning_rate": 0.00011130364708261552, | |
| "loss": 1.1677, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 1.217490553855896, | |
| "learning_rate": 0.0001109283921894095, | |
| "loss": 1.2617, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.8935596942901611, | |
| "learning_rate": 0.00011055298148135236, | |
| "loss": 1.1184, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.8513955473899841, | |
| "learning_rate": 0.00011017742031097563, | |
| "loss": 1.2705, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 1.0295133590698242, | |
| "learning_rate": 0.0001098017140329561, | |
| "loss": 1.1966, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 1.1029167175292969, | |
| "learning_rate": 0.0001094258680040394, | |
| "loss": 1.4887, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.9045723080635071, | |
| "learning_rate": 0.0001090498875829638, | |
| "loss": 1.1461, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.8317312002182007, | |
| "learning_rate": 0.00010867377813038366, | |
| "loss": 1.136, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 1.0023647546768188, | |
| "learning_rate": 0.00010829754500879308, | |
| "loss": 1.1123, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.9197617769241333, | |
| "learning_rate": 0.00010792119358244939, | |
| "loss": 1.2792, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 0.9892452955245972, | |
| "learning_rate": 0.00010754472921729661, | |
| "loss": 1.634, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 0.8005648255348206, | |
| "learning_rate": 0.00010716815728088912, | |
| "loss": 0.7168, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 1.1989247798919678, | |
| "learning_rate": 0.00010679148314231504, | |
| "loss": 1.2882, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 0.7820172905921936, | |
| "learning_rate": 0.00010641471217211958, | |
| "loss": 1.1125, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 1.352563500404358, | |
| "learning_rate": 0.00010603784974222861, | |
| "loss": 0.9641, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 0.8966504335403442, | |
| "learning_rate": 0.000105660901225872, | |
| "loss": 1.1155, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 0.8722444176673889, | |
| "learning_rate": 0.00010528387199750707, | |
| "loss": 1.3011, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 0.8678218722343445, | |
| "learning_rate": 0.00010490676743274181, | |
| "loss": 1.2912, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.8596826791763306, | |
| "learning_rate": 0.00010452959290825846, | |
| "loss": 1.3792, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.777655303478241, | |
| "learning_rate": 0.00010415235380173662, | |
| "loss": 0.9992, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.7913762331008911, | |
| "learning_rate": 0.00010377505549177682, | |
| "loss": 0.8813, | |
| "step": 426 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 852, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 426, | |
| "total_flos": 7754585188270080.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |