diff --git "a/checkpoint-852/trainer_state.json" "b/checkpoint-852/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-852/trainer_state.json" @@ -0,0 +1,6049 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9694835680751175, + "eval_steps": 107, + "global_step": 852, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.8741932511329651, + "learning_rate": 1e-05, + "loss": 1.2567, + "step": 1 + }, + { + "epoch": 0.0, + "eval_loss": 1.3469510078430176, + "eval_runtime": 5.3003, + "eval_samples_per_second": 18.867, + "eval_steps_per_second": 18.867, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 0.7848138809204102, + "learning_rate": 2e-05, + "loss": 1.3328, + "step": 2 + }, + { + "epoch": 0.01, + "grad_norm": 1.0692198276519775, + "learning_rate": 3e-05, + "loss": 1.6569, + "step": 3 + }, + { + "epoch": 0.01, + "grad_norm": 1.4229260683059692, + "learning_rate": 4e-05, + "loss": 1.5493, + "step": 4 + }, + { + "epoch": 0.01, + "grad_norm": 0.6837282180786133, + "learning_rate": 5e-05, + "loss": 1.4342, + "step": 5 + }, + { + "epoch": 0.01, + "grad_norm": 1.1194649934768677, + "learning_rate": 6e-05, + "loss": 1.2668, + "step": 6 + }, + { + "epoch": 0.02, + "grad_norm": 0.954695463180542, + "learning_rate": 7e-05, + "loss": 1.4721, + "step": 7 + }, + { + "epoch": 0.02, + "grad_norm": 0.8204368352890015, + "learning_rate": 8e-05, + "loss": 1.4054, + "step": 8 + }, + { + "epoch": 0.02, + "grad_norm": 0.8432589769363403, + "learning_rate": 9e-05, + "loss": 1.2849, + "step": 9 + }, + { + "epoch": 0.02, + "grad_norm": 0.7272113561630249, + "learning_rate": 0.0001, + "loss": 1.1438, + "step": 10 + }, + { + "epoch": 0.03, + "grad_norm": 0.8135901093482971, + "learning_rate": 0.00011000000000000002, + "loss": 1.4665, + "step": 11 + }, + { + "epoch": 0.03, + "grad_norm": 0.8113470673561096, + "learning_rate": 0.00012, + "loss": 0.9749, + "step": 12 + }, + { + "epoch": 0.03, + "grad_norm": 1.3105953931808472, + "learning_rate": 0.00013000000000000002, + "loss": 1.5133, + "step": 13 + }, + { + "epoch": 0.03, + "grad_norm": 1.0281662940979004, + "learning_rate": 0.00014, + "loss": 1.1509, + "step": 14 + }, + { + "epoch": 0.04, + "grad_norm": 2.1005992889404297, + "learning_rate": 0.00015000000000000001, + "loss": 1.2748, + "step": 15 + }, + { + "epoch": 0.04, + "grad_norm": 1.453407645225525, + "learning_rate": 0.00016, + "loss": 1.3313, + "step": 16 + }, + { + "epoch": 0.04, + "grad_norm": 1.0521585941314697, + "learning_rate": 0.00017, + "loss": 1.2977, + "step": 17 + }, + { + "epoch": 0.04, + "grad_norm": 1.0728052854537964, + "learning_rate": 0.00018, + "loss": 1.322, + "step": 18 + }, + { + "epoch": 0.04, + "grad_norm": 0.8539354205131531, + "learning_rate": 0.00019, + "loss": 1.2053, + "step": 19 + }, + { + "epoch": 0.05, + "grad_norm": 1.6902265548706055, + "learning_rate": 0.0002, + "loss": 1.1924, + "step": 20 + }, + { + "epoch": 0.05, + "grad_norm": 1.3770757913589478, + "learning_rate": 0.00019999928710990412, + "loss": 1.2286, + "step": 21 + }, + { + "epoch": 0.05, + "grad_norm": 1.358694314956665, + "learning_rate": 0.00019999714844978078, + "loss": 1.3196, + "step": 22 + }, + { + "epoch": 0.05, + "grad_norm": 1.4592664241790771, + "learning_rate": 0.0001999935840501225, + "loss": 1.5923, + "step": 23 + }, + { + "epoch": 0.06, + "grad_norm": 1.122499942779541, + "learning_rate": 0.0001999885939617498, + "loss": 1.3107, + "step": 24 + }, + { + "epoch": 0.06, + "grad_norm": 1.0040143728256226, + "learning_rate": 0.0001999821782558104, + "loss": 1.316, + "step": 25 + }, + { + "epoch": 0.06, + "grad_norm": 1.0741703510284424, + "learning_rate": 0.00019997433702377817, + "loss": 1.2527, + "step": 26 + }, + { + "epoch": 0.06, + "grad_norm": 0.9978523850440979, + "learning_rate": 0.00019996507037745183, + "loss": 1.1989, + "step": 27 + }, + { + "epoch": 0.07, + "grad_norm": 1.002176284790039, + "learning_rate": 0.00019995437844895334, + "loss": 1.0503, + "step": 28 + }, + { + "epoch": 0.07, + "grad_norm": 1.0028811693191528, + "learning_rate": 0.0001999422613907262, + "loss": 1.356, + "step": 29 + }, + { + "epoch": 0.07, + "grad_norm": 1.3226864337921143, + "learning_rate": 0.0001999287193755329, + "loss": 1.2571, + "step": 30 + }, + { + "epoch": 0.07, + "grad_norm": 0.905536949634552, + "learning_rate": 0.00019991375259645293, + "loss": 1.1576, + "step": 31 + }, + { + "epoch": 0.08, + "grad_norm": 1.2652262449264526, + "learning_rate": 0.00019989736126687963, + "loss": 1.1792, + "step": 32 + }, + { + "epoch": 0.08, + "grad_norm": 1.0161558389663696, + "learning_rate": 0.00019987954562051725, + "loss": 1.0074, + "step": 33 + }, + { + "epoch": 0.08, + "grad_norm": 0.9903879761695862, + "learning_rate": 0.00019986030591137783, + "loss": 1.196, + "step": 34 + }, + { + "epoch": 0.08, + "grad_norm": 1.3282291889190674, + "learning_rate": 0.0001998396424137773, + "loss": 1.5831, + "step": 35 + }, + { + "epoch": 0.08, + "grad_norm": 1.0442458391189575, + "learning_rate": 0.00019981755542233177, + "loss": 0.9708, + "step": 36 + }, + { + "epoch": 0.09, + "grad_norm": 0.9892172813415527, + "learning_rate": 0.0001997940452519531, + "loss": 1.1873, + "step": 37 + }, + { + "epoch": 0.09, + "grad_norm": 1.6204959154129028, + "learning_rate": 0.0001997691122378447, + "loss": 1.725, + "step": 38 + }, + { + "epoch": 0.09, + "grad_norm": 0.9563732147216797, + "learning_rate": 0.00019974275673549654, + "loss": 1.513, + "step": 39 + }, + { + "epoch": 0.09, + "grad_norm": 1.225378155708313, + "learning_rate": 0.00019971497912068013, + "loss": 1.1298, + "step": 40 + }, + { + "epoch": 0.1, + "grad_norm": 0.8867512941360474, + "learning_rate": 0.00019968577978944323, + "loss": 1.2413, + "step": 41 + }, + { + "epoch": 0.1, + "grad_norm": 0.9609911441802979, + "learning_rate": 0.0001996551591581041, + "loss": 1.2217, + "step": 42 + }, + { + "epoch": 0.1, + "grad_norm": 0.8636968731880188, + "learning_rate": 0.0001996231176632456, + "loss": 1.5268, + "step": 43 + }, + { + "epoch": 0.1, + "grad_norm": 0.9208036065101624, + "learning_rate": 0.00019958965576170908, + "loss": 1.1154, + "step": 44 + }, + { + "epoch": 0.11, + "grad_norm": 0.8316241502761841, + "learning_rate": 0.00019955477393058773, + "loss": 1.0891, + "step": 45 + }, + { + "epoch": 0.11, + "grad_norm": 1.0851703882217407, + "learning_rate": 0.0001995184726672197, + "loss": 1.237, + "step": 46 + }, + { + "epoch": 0.11, + "grad_norm": 1.2032215595245361, + "learning_rate": 0.00019948075248918124, + "loss": 1.3444, + "step": 47 + }, + { + "epoch": 0.11, + "grad_norm": 0.8556870818138123, + "learning_rate": 0.00019944161393427922, + "loss": 0.9632, + "step": 48 + }, + { + "epoch": 0.12, + "grad_norm": 1.038156270980835, + "learning_rate": 0.00019940105756054337, + "loss": 1.3897, + "step": 49 + }, + { + "epoch": 0.12, + "grad_norm": 1.1046299934387207, + "learning_rate": 0.00019935908394621844, + "loss": 1.0211, + "step": 50 + }, + { + "epoch": 0.12, + "grad_norm": 1.1632884740829468, + "learning_rate": 0.00019931569368975588, + "loss": 1.3098, + "step": 51 + }, + { + "epoch": 0.12, + "grad_norm": 0.9915144443511963, + "learning_rate": 0.0001992708874098054, + "loss": 1.1708, + "step": 52 + }, + { + "epoch": 0.12, + "grad_norm": 1.1974948644638062, + "learning_rate": 0.00019922466574520608, + "loss": 1.248, + "step": 53 + }, + { + "epoch": 0.13, + "grad_norm": 1.123138189315796, + "learning_rate": 0.00019917702935497725, + "loss": 0.8789, + "step": 54 + }, + { + "epoch": 0.13, + "grad_norm": 0.7720706462860107, + "learning_rate": 0.00019912797891830908, + "loss": 1.2278, + "step": 55 + }, + { + "epoch": 0.13, + "grad_norm": 1.171324610710144, + "learning_rate": 0.00019907751513455302, + "loss": 1.419, + "step": 56 + }, + { + "epoch": 0.13, + "grad_norm": 1.1821049451828003, + "learning_rate": 0.00019902563872321172, + "loss": 1.3281, + "step": 57 + }, + { + "epoch": 0.14, + "grad_norm": 0.8871707320213318, + "learning_rate": 0.00019897235042392873, + "loss": 1.1431, + "step": 58 + }, + { + "epoch": 0.14, + "grad_norm": 0.9106453657150269, + "learning_rate": 0.0001989176509964781, + "loss": 1.0687, + "step": 59 + }, + { + "epoch": 0.14, + "grad_norm": 0.8794414401054382, + "learning_rate": 0.00019886154122075343, + "loss": 1.3627, + "step": 60 + }, + { + "epoch": 0.14, + "grad_norm": 0.9249592423439026, + "learning_rate": 0.00019880402189675678, + "loss": 0.5962, + "step": 61 + }, + { + "epoch": 0.15, + "grad_norm": 0.8355573415756226, + "learning_rate": 0.00019874509384458725, + "loss": 1.2576, + "step": 62 + }, + { + "epoch": 0.15, + "grad_norm": 1.3829615116119385, + "learning_rate": 0.0001986847579044294, + "loss": 1.3844, + "step": 63 + }, + { + "epoch": 0.15, + "grad_norm": 0.7890944480895996, + "learning_rate": 0.00019862301493654108, + "loss": 0.9504, + "step": 64 + }, + { + "epoch": 0.15, + "grad_norm": 0.9988391995429993, + "learning_rate": 0.00019855986582124126, + "loss": 1.1127, + "step": 65 + }, + { + "epoch": 0.15, + "grad_norm": 1.024064302444458, + "learning_rate": 0.00019849531145889758, + "loss": 1.0418, + "step": 66 + }, + { + "epoch": 0.16, + "grad_norm": 1.1652387380599976, + "learning_rate": 0.0001984293527699133, + "loss": 1.0524, + "step": 67 + }, + { + "epoch": 0.16, + "grad_norm": 0.8396044969558716, + "learning_rate": 0.00019836199069471437, + "loss": 1.1829, + "step": 68 + }, + { + "epoch": 0.16, + "grad_norm": 0.9767330884933472, + "learning_rate": 0.00019829322619373588, + "loss": 1.1652, + "step": 69 + }, + { + "epoch": 0.16, + "grad_norm": 0.7999225854873657, + "learning_rate": 0.00019822306024740852, + "loss": 1.1075, + "step": 70 + }, + { + "epoch": 0.17, + "grad_norm": 0.9757010340690613, + "learning_rate": 0.00019815149385614444, + "loss": 1.3074, + "step": 71 + }, + { + "epoch": 0.17, + "grad_norm": 0.9027532339096069, + "learning_rate": 0.00019807852804032305, + "loss": 1.308, + "step": 72 + }, + { + "epoch": 0.17, + "grad_norm": 1.5158909559249878, + "learning_rate": 0.0001980041638402765, + "loss": 1.1594, + "step": 73 + }, + { + "epoch": 0.17, + "grad_norm": 0.8618407249450684, + "learning_rate": 0.00019792840231627482, + "loss": 0.7077, + "step": 74 + }, + { + "epoch": 0.18, + "grad_norm": 1.0307233333587646, + "learning_rate": 0.00019785124454851084, + "loss": 1.0812, + "step": 75 + }, + { + "epoch": 0.18, + "grad_norm": 0.8295727968215942, + "learning_rate": 0.00019777269163708468, + "loss": 1.0412, + "step": 76 + }, + { + "epoch": 0.18, + "grad_norm": 1.0772101879119873, + "learning_rate": 0.00019769274470198827, + "loss": 1.1484, + "step": 77 + }, + { + "epoch": 0.18, + "grad_norm": 1.0130198001861572, + "learning_rate": 0.0001976114048830891, + "loss": 1.0704, + "step": 78 + }, + { + "epoch": 0.19, + "grad_norm": 0.6028400659561157, + "learning_rate": 0.00019752867334011423, + "loss": 0.6347, + "step": 79 + }, + { + "epoch": 0.19, + "grad_norm": 0.8858622312545776, + "learning_rate": 0.0001974445512526336, + "loss": 1.4117, + "step": 80 + }, + { + "epoch": 0.19, + "grad_norm": 0.9046239852905273, + "learning_rate": 0.00019735903982004324, + "loss": 1.0355, + "step": 81 + }, + { + "epoch": 0.19, + "grad_norm": 0.987149715423584, + "learning_rate": 0.00019727214026154827, + "loss": 1.117, + "step": 82 + }, + { + "epoch": 0.19, + "grad_norm": 1.066720962524414, + "learning_rate": 0.0001971838538161454, + "loss": 1.4394, + "step": 83 + }, + { + "epoch": 0.2, + "grad_norm": 0.7880568504333496, + "learning_rate": 0.0001970941817426052, + "loss": 1.0591, + "step": 84 + }, + { + "epoch": 0.2, + "grad_norm": 0.8709908723831177, + "learning_rate": 0.00019700312531945442, + "loss": 0.8653, + "step": 85 + }, + { + "epoch": 0.2, + "grad_norm": 0.8909516930580139, + "learning_rate": 0.00019691068584495742, + "loss": 1.0539, + "step": 86 + }, + { + "epoch": 0.2, + "grad_norm": 0.726899266242981, + "learning_rate": 0.000196816864637098, + "loss": 1.0426, + "step": 87 + }, + { + "epoch": 0.21, + "grad_norm": 0.9136834144592285, + "learning_rate": 0.00019672166303356028, + "loss": 1.0121, + "step": 88 + }, + { + "epoch": 0.21, + "grad_norm": 0.8233316540718079, + "learning_rate": 0.0001966250823917099, + "loss": 1.1974, + "step": 89 + }, + { + "epoch": 0.21, + "grad_norm": 0.8916105628013611, + "learning_rate": 0.0001965271240885745, + "loss": 1.2969, + "step": 90 + }, + { + "epoch": 0.21, + "grad_norm": 0.7402650713920593, + "learning_rate": 0.00019642778952082426, + "loss": 1.0409, + "step": 91 + }, + { + "epoch": 0.22, + "grad_norm": 1.036402702331543, + "learning_rate": 0.00019632708010475165, + "loss": 1.1482, + "step": 92 + }, + { + "epoch": 0.22, + "grad_norm": 1.50504469871521, + "learning_rate": 0.00019622499727625162, + "loss": 1.3788, + "step": 93 + }, + { + "epoch": 0.22, + "grad_norm": 0.7590145468711853, + "learning_rate": 0.0001961215424908009, + "loss": 1.0139, + "step": 94 + }, + { + "epoch": 0.22, + "grad_norm": 0.8763208389282227, + "learning_rate": 0.00019601671722343738, + "loss": 1.066, + "step": 95 + }, + { + "epoch": 0.23, + "grad_norm": 1.0625308752059937, + "learning_rate": 0.00019591052296873888, + "loss": 1.1032, + "step": 96 + }, + { + "epoch": 0.23, + "grad_norm": 0.7704355716705322, + "learning_rate": 0.00019580296124080212, + "loss": 0.8728, + "step": 97 + }, + { + "epoch": 0.23, + "grad_norm": 0.8164972066879272, + "learning_rate": 0.0001956940335732209, + "loss": 0.8879, + "step": 98 + }, + { + "epoch": 0.23, + "grad_norm": 1.114343285560608, + "learning_rate": 0.0001955837415190643, + "loss": 1.3132, + "step": 99 + }, + { + "epoch": 0.23, + "grad_norm": 0.8190547227859497, + "learning_rate": 0.00019547208665085457, + "loss": 1.324, + "step": 100 + }, + { + "epoch": 0.24, + "grad_norm": 1.0214807987213135, + "learning_rate": 0.00019535907056054475, + "loss": 1.0961, + "step": 101 + }, + { + "epoch": 0.24, + "grad_norm": 1.0679683685302734, + "learning_rate": 0.00019524469485949583, + "loss": 1.5574, + "step": 102 + }, + { + "epoch": 0.24, + "grad_norm": 0.7177500128746033, + "learning_rate": 0.00019512896117845392, + "loss": 1.1553, + "step": 103 + }, + { + "epoch": 0.24, + "grad_norm": 1.2570964097976685, + "learning_rate": 0.00019501187116752693, + "loss": 1.2929, + "step": 104 + }, + { + "epoch": 0.25, + "grad_norm": 0.8858779668807983, + "learning_rate": 0.000194893426496161, + "loss": 1.2218, + "step": 105 + }, + { + "epoch": 0.25, + "grad_norm": 0.9762778282165527, + "learning_rate": 0.00019477362885311682, + "loss": 0.9456, + "step": 106 + }, + { + "epoch": 0.25, + "grad_norm": 0.8851660490036011, + "learning_rate": 0.00019465247994644545, + "loss": 1.0032, + "step": 107 + }, + { + "epoch": 0.25, + "eval_loss": 1.1362618207931519, + "eval_runtime": 5.3254, + "eval_samples_per_second": 18.778, + "eval_steps_per_second": 18.778, + "step": 107 + }, + { + "epoch": 0.25, + "grad_norm": 0.8778185844421387, + "learning_rate": 0.00019452998150346401, + "loss": 1.1764, + "step": 108 + }, + { + "epoch": 0.26, + "grad_norm": 0.8865760564804077, + "learning_rate": 0.00019440613527073105, + "loss": 1.0644, + "step": 109 + }, + { + "epoch": 0.26, + "grad_norm": 0.8576034903526306, + "learning_rate": 0.00019428094301402162, + "loss": 1.0943, + "step": 110 + }, + { + "epoch": 0.26, + "grad_norm": 0.810203492641449, + "learning_rate": 0.00019415440651830208, + "loss": 1.2353, + "step": 111 + }, + { + "epoch": 0.26, + "grad_norm": 1.1911653280258179, + "learning_rate": 0.00019402652758770475, + "loss": 1.215, + "step": 112 + }, + { + "epoch": 0.27, + "grad_norm": 0.5166463851928711, + "learning_rate": 0.00019389730804550211, + "loss": 0.4519, + "step": 113 + }, + { + "epoch": 0.27, + "grad_norm": 0.983464241027832, + "learning_rate": 0.00019376674973408075, + "loss": 1.3189, + "step": 114 + }, + { + "epoch": 0.27, + "grad_norm": 0.7697935700416565, + "learning_rate": 0.00019363485451491524, + "loss": 1.0372, + "step": 115 + }, + { + "epoch": 0.27, + "grad_norm": 0.8599738478660583, + "learning_rate": 0.0001935016242685415, + "loss": 1.1381, + "step": 116 + }, + { + "epoch": 0.27, + "grad_norm": 1.0430530309677124, + "learning_rate": 0.00019336706089452996, + "loss": 1.2332, + "step": 117 + }, + { + "epoch": 0.28, + "grad_norm": 1.163251280784607, + "learning_rate": 0.0001932311663114586, + "loss": 1.1588, + "step": 118 + }, + { + "epoch": 0.28, + "grad_norm": 0.8196917176246643, + "learning_rate": 0.0001930939424568854, + "loss": 1.1114, + "step": 119 + }, + { + "epoch": 0.28, + "grad_norm": 0.8848841786384583, + "learning_rate": 0.00019295539128732093, + "loss": 1.0458, + "step": 120 + }, + { + "epoch": 0.28, + "grad_norm": 0.9288797974586487, + "learning_rate": 0.00019281551477820036, + "loss": 0.7388, + "step": 121 + }, + { + "epoch": 0.29, + "grad_norm": 0.8119339942932129, + "learning_rate": 0.00019267431492385521, + "loss": 1.1095, + "step": 122 + }, + { + "epoch": 0.29, + "grad_norm": 0.9211128354072571, + "learning_rate": 0.00019253179373748504, + "loss": 1.3183, + "step": 123 + }, + { + "epoch": 0.29, + "grad_norm": 0.9682132601737976, + "learning_rate": 0.0001923879532511287, + "loss": 1.3786, + "step": 124 + }, + { + "epoch": 0.29, + "grad_norm": 0.9330196976661682, + "learning_rate": 0.00019224279551563532, + "loss": 1.4051, + "step": 125 + }, + { + "epoch": 0.3, + "grad_norm": 0.7206214070320129, + "learning_rate": 0.0001920963226006352, + "loss": 0.8213, + "step": 126 + }, + { + "epoch": 0.3, + "grad_norm": 0.6096452474594116, + "learning_rate": 0.0001919485365945101, + "loss": 0.5982, + "step": 127 + }, + { + "epoch": 0.3, + "grad_norm": 1.0045711994171143, + "learning_rate": 0.00019179943960436358, + "loss": 1.1399, + "step": 128 + }, + { + "epoch": 0.3, + "grad_norm": 0.8744232058525085, + "learning_rate": 0.00019164903375599112, + "loss": 1.0176, + "step": 129 + }, + { + "epoch": 0.31, + "grad_norm": 0.8323287963867188, + "learning_rate": 0.00019149732119384943, + "loss": 1.2937, + "step": 130 + }, + { + "epoch": 0.31, + "grad_norm": 0.9033424854278564, + "learning_rate": 0.00019134430408102615, + "loss": 1.3108, + "step": 131 + }, + { + "epoch": 0.31, + "grad_norm": 0.7392603158950806, + "learning_rate": 0.00019118998459920902, + "loss": 1.0297, + "step": 132 + }, + { + "epoch": 0.31, + "grad_norm": 1.136338710784912, + "learning_rate": 0.0001910343649486546, + "loss": 1.3037, + "step": 133 + }, + { + "epoch": 0.31, + "grad_norm": 0.8202427625656128, + "learning_rate": 0.00019087744734815708, + "loss": 0.9239, + "step": 134 + }, + { + "epoch": 0.32, + "grad_norm": 1.039093255996704, + "learning_rate": 0.0001907192340350165, + "loss": 1.2387, + "step": 135 + }, + { + "epoch": 0.32, + "grad_norm": 1.1009947061538696, + "learning_rate": 0.00019055972726500695, + "loss": 1.0234, + "step": 136 + }, + { + "epoch": 0.32, + "grad_norm": 1.057837724685669, + "learning_rate": 0.00019039892931234435, + "loss": 1.4308, + "step": 137 + }, + { + "epoch": 0.32, + "grad_norm": 0.8715041279792786, + "learning_rate": 0.00019023684246965406, + "loss": 1.0642, + "step": 138 + }, + { + "epoch": 0.33, + "grad_norm": 1.3314340114593506, + "learning_rate": 0.00019007346904793818, + "loss": 1.1097, + "step": 139 + }, + { + "epoch": 0.33, + "grad_norm": 0.6808525919914246, + "learning_rate": 0.00018990881137654258, + "loss": 0.9254, + "step": 140 + }, + { + "epoch": 0.33, + "grad_norm": 0.8727401494979858, + "learning_rate": 0.00018974287180312377, + "loss": 1.1062, + "step": 141 + }, + { + "epoch": 0.33, + "grad_norm": 1.0826424360275269, + "learning_rate": 0.00018957565269361531, + "loss": 1.1528, + "step": 142 + }, + { + "epoch": 0.34, + "grad_norm": 0.8825279474258423, + "learning_rate": 0.00018940715643219407, + "loss": 1.2208, + "step": 143 + }, + { + "epoch": 0.34, + "grad_norm": 0.8955380320549011, + "learning_rate": 0.00018923738542124644, + "loss": 1.0918, + "step": 144 + }, + { + "epoch": 0.34, + "grad_norm": 1.0863996744155884, + "learning_rate": 0.00018906634208133385, + "loss": 1.2153, + "step": 145 + }, + { + "epoch": 0.34, + "grad_norm": 1.0089964866638184, + "learning_rate": 0.00018889402885115833, + "loss": 1.0796, + "step": 146 + }, + { + "epoch": 0.35, + "grad_norm": 0.9210363626480103, + "learning_rate": 0.0001887204481875278, + "loss": 0.8502, + "step": 147 + }, + { + "epoch": 0.35, + "grad_norm": 0.9592724442481995, + "learning_rate": 0.000188545602565321, + "loss": 1.4313, + "step": 148 + }, + { + "epoch": 0.35, + "grad_norm": 1.2299224138259888, + "learning_rate": 0.00018836949447745215, + "loss": 1.1074, + "step": 149 + }, + { + "epoch": 0.35, + "grad_norm": 1.1486583948135376, + "learning_rate": 0.0001881921264348355, + "loss": 1.3576, + "step": 150 + }, + { + "epoch": 0.35, + "grad_norm": 1.1912083625793457, + "learning_rate": 0.00018801350096634946, + "loss": 1.343, + "step": 151 + }, + { + "epoch": 0.36, + "grad_norm": 0.8830162882804871, + "learning_rate": 0.00018783362061880062, + "loss": 1.6139, + "step": 152 + }, + { + "epoch": 0.36, + "grad_norm": 1.0919363498687744, + "learning_rate": 0.00018765248795688726, + "loss": 1.4051, + "step": 153 + }, + { + "epoch": 0.36, + "grad_norm": 0.8009780049324036, + "learning_rate": 0.00018747010556316305, + "loss": 1.4095, + "step": 154 + }, + { + "epoch": 0.36, + "grad_norm": 0.964438796043396, + "learning_rate": 0.00018728647603800003, + "loss": 1.1634, + "step": 155 + }, + { + "epoch": 0.37, + "grad_norm": 0.9883137941360474, + "learning_rate": 0.00018710160199955156, + "loss": 1.1904, + "step": 156 + }, + { + "epoch": 0.37, + "grad_norm": 0.8936368227005005, + "learning_rate": 0.0001869154860837151, + "loss": 1.2264, + "step": 157 + }, + { + "epoch": 0.37, + "grad_norm": 0.6435540914535522, + "learning_rate": 0.0001867281309440945, + "loss": 0.8426, + "step": 158 + }, + { + "epoch": 0.37, + "grad_norm": 0.7036202549934387, + "learning_rate": 0.00018653953925196225, + "loss": 0.8162, + "step": 159 + }, + { + "epoch": 0.38, + "grad_norm": 0.7669593095779419, + "learning_rate": 0.0001863497136962213, + "loss": 0.9143, + "step": 160 + }, + { + "epoch": 0.38, + "grad_norm": 0.8047689199447632, + "learning_rate": 0.00018615865698336684, + "loss": 1.1911, + "step": 161 + }, + { + "epoch": 0.38, + "grad_norm": 1.1324609518051147, + "learning_rate": 0.00018596637183744763, + "loss": 1.064, + "step": 162 + }, + { + "epoch": 0.38, + "grad_norm": 0.8058610558509827, + "learning_rate": 0.00018577286100002723, + "loss": 0.7428, + "step": 163 + }, + { + "epoch": 0.38, + "grad_norm": 0.8588782548904419, + "learning_rate": 0.00018557812723014476, + "loss": 0.8801, + "step": 164 + }, + { + "epoch": 0.39, + "grad_norm": 1.0751081705093384, + "learning_rate": 0.00018538217330427582, + "loss": 1.3674, + "step": 165 + }, + { + "epoch": 0.39, + "grad_norm": 0.791789174079895, + "learning_rate": 0.00018518500201629258, + "loss": 1.0103, + "step": 166 + }, + { + "epoch": 0.39, + "grad_norm": 0.8154894709587097, + "learning_rate": 0.00018498661617742426, + "loss": 1.1219, + "step": 167 + }, + { + "epoch": 0.39, + "grad_norm": 1.3946795463562012, + "learning_rate": 0.00018478701861621686, + "loss": 1.0725, + "step": 168 + }, + { + "epoch": 0.4, + "grad_norm": 0.9187031388282776, + "learning_rate": 0.00018458621217849286, + "loss": 1.2674, + "step": 169 + }, + { + "epoch": 0.4, + "grad_norm": 0.9884739518165588, + "learning_rate": 0.00018438419972731067, + "loss": 1.3507, + "step": 170 + }, + { + "epoch": 0.4, + "grad_norm": 1.4417808055877686, + "learning_rate": 0.0001841809841429238, + "loss": 1.129, + "step": 171 + }, + { + "epoch": 0.4, + "grad_norm": 1.0408543348312378, + "learning_rate": 0.0001839765683227398, + "loss": 1.2038, + "step": 172 + }, + { + "epoch": 0.41, + "grad_norm": 1.2746024131774902, + "learning_rate": 0.00018377095518127897, + "loss": 1.2916, + "step": 173 + }, + { + "epoch": 0.41, + "grad_norm": 0.996474027633667, + "learning_rate": 0.00018356414765013267, + "loss": 1.3041, + "step": 174 + }, + { + "epoch": 0.41, + "grad_norm": 0.9435645341873169, + "learning_rate": 0.00018335614867792183, + "loss": 1.3457, + "step": 175 + }, + { + "epoch": 0.41, + "grad_norm": 1.0813288688659668, + "learning_rate": 0.00018314696123025454, + "loss": 1.2323, + "step": 176 + }, + { + "epoch": 0.42, + "grad_norm": 1.1004307270050049, + "learning_rate": 0.00018293658828968397, + "loss": 1.3084, + "step": 177 + }, + { + "epoch": 0.42, + "grad_norm": 0.7818973660469055, + "learning_rate": 0.00018272503285566587, + "loss": 0.88, + "step": 178 + }, + { + "epoch": 0.42, + "grad_norm": 0.8897396326065063, + "learning_rate": 0.00018251229794451567, + "loss": 1.0124, + "step": 179 + }, + { + "epoch": 0.42, + "grad_norm": 1.0572400093078613, + "learning_rate": 0.00018229838658936564, + "loss": 1.4603, + "step": 180 + }, + { + "epoch": 0.42, + "grad_norm": 0.8974701166152954, + "learning_rate": 0.0001820833018401215, + "loss": 1.1961, + "step": 181 + }, + { + "epoch": 0.43, + "grad_norm": 0.8860751390457153, + "learning_rate": 0.00018186704676341898, + "loss": 0.9779, + "step": 182 + }, + { + "epoch": 0.43, + "grad_norm": 0.7719995975494385, + "learning_rate": 0.00018164962444258014, + "loss": 1.1156, + "step": 183 + }, + { + "epoch": 0.43, + "grad_norm": 1.0823787450790405, + "learning_rate": 0.0001814310379775694, + "loss": 0.959, + "step": 184 + }, + { + "epoch": 0.43, + "grad_norm": 0.9932149052619934, + "learning_rate": 0.00018121129048494922, + "loss": 1.358, + "step": 185 + }, + { + "epoch": 0.44, + "grad_norm": 0.7363601326942444, + "learning_rate": 0.00018099038509783582, + "loss": 1.2639, + "step": 186 + }, + { + "epoch": 0.44, + "grad_norm": 1.0355418920516968, + "learning_rate": 0.0001807683249658545, + "loss": 1.2566, + "step": 187 + }, + { + "epoch": 0.44, + "grad_norm": 0.8746327757835388, + "learning_rate": 0.0001805451132550946, + "loss": 1.2159, + "step": 188 + }, + { + "epoch": 0.44, + "grad_norm": 0.9230121374130249, + "learning_rate": 0.00018032075314806448, + "loss": 1.1717, + "step": 189 + }, + { + "epoch": 0.45, + "grad_norm": 0.8293251991271973, + "learning_rate": 0.00018009524784364615, + "loss": 1.1053, + "step": 190 + }, + { + "epoch": 0.45, + "grad_norm": 1.8819063901901245, + "learning_rate": 0.00017986860055704953, + "loss": 1.4713, + "step": 191 + }, + { + "epoch": 0.45, + "grad_norm": 0.843971312046051, + "learning_rate": 0.00017964081451976672, + "loss": 1.175, + "step": 192 + }, + { + "epoch": 0.45, + "grad_norm": 1.1449857950210571, + "learning_rate": 0.00017941189297952597, + "loss": 1.0097, + "step": 193 + }, + { + "epoch": 0.46, + "grad_norm": 0.8204749226570129, + "learning_rate": 0.0001791818392002452, + "loss": 1.0383, + "step": 194 + }, + { + "epoch": 0.46, + "grad_norm": 1.0437519550323486, + "learning_rate": 0.00017895065646198567, + "loss": 1.4455, + "step": 195 + }, + { + "epoch": 0.46, + "grad_norm": 0.803774356842041, + "learning_rate": 0.00017871834806090501, + "loss": 0.9266, + "step": 196 + }, + { + "epoch": 0.46, + "grad_norm": 0.991162121295929, + "learning_rate": 0.00017848491730921046, + "loss": 0.9433, + "step": 197 + }, + { + "epoch": 0.46, + "grad_norm": 0.9293928146362305, + "learning_rate": 0.00017825036753511144, + "loss": 1.2432, + "step": 198 + }, + { + "epoch": 0.47, + "grad_norm": 0.6806849241256714, + "learning_rate": 0.0001780147020827721, + "loss": 0.696, + "step": 199 + }, + { + "epoch": 0.47, + "grad_norm": 1.2757548093795776, + "learning_rate": 0.00017777792431226383, + "loss": 1.3426, + "step": 200 + }, + { + "epoch": 0.47, + "grad_norm": 0.8645375370979309, + "learning_rate": 0.00017754003759951715, + "loss": 1.0345, + "step": 201 + }, + { + "epoch": 0.47, + "grad_norm": 1.0262008905410767, + "learning_rate": 0.0001773010453362737, + "loss": 0.935, + "step": 202 + }, + { + "epoch": 0.48, + "grad_norm": 1.225273609161377, + "learning_rate": 0.00017706095093003785, + "loss": 1.1271, + "step": 203 + }, + { + "epoch": 0.48, + "grad_norm": 0.9024125337600708, + "learning_rate": 0.00017681975780402807, + "loss": 1.1307, + "step": 204 + }, + { + "epoch": 0.48, + "grad_norm": 0.9035881161689758, + "learning_rate": 0.00017657746939712815, + "loss": 1.3217, + "step": 205 + }, + { + "epoch": 0.48, + "grad_norm": 2.1178483963012695, + "learning_rate": 0.00017633408916383826, + "loss": 1.4955, + "step": 206 + }, + { + "epoch": 0.49, + "grad_norm": 0.826454222202301, + "learning_rate": 0.00017608962057422549, + "loss": 1.187, + "step": 207 + }, + { + "epoch": 0.49, + "grad_norm": 0.8255906701087952, + "learning_rate": 0.00017584406711387463, + "loss": 1.0733, + "step": 208 + }, + { + "epoch": 0.49, + "grad_norm": 0.9498797059059143, + "learning_rate": 0.0001755974322838382, + "loss": 1.3054, + "step": 209 + }, + { + "epoch": 0.49, + "grad_norm": 0.7860575914382935, + "learning_rate": 0.00017534971960058685, + "loss": 1.0231, + "step": 210 + }, + { + "epoch": 0.5, + "grad_norm": 0.905441403388977, + "learning_rate": 0.00017510093259595885, + "loss": 1.2928, + "step": 211 + }, + { + "epoch": 0.5, + "grad_norm": 1.0108212232589722, + "learning_rate": 0.00017485107481711012, + "loss": 0.8964, + "step": 212 + }, + { + "epoch": 0.5, + "grad_norm": 1.036566972732544, + "learning_rate": 0.00017460014982646334, + "loss": 1.4823, + "step": 213 + }, + { + "epoch": 0.5, + "grad_norm": 0.8670862913131714, + "learning_rate": 0.00017434816120165728, + "loss": 1.0994, + "step": 214 + }, + { + "epoch": 0.5, + "eval_loss": 1.1271697282791138, + "eval_runtime": 4.9762, + "eval_samples_per_second": 20.096, + "eval_steps_per_second": 20.096, + "step": 214 + }, + { + "epoch": 0.5, + "grad_norm": 0.9359840750694275, + "learning_rate": 0.00017409511253549593, + "loss": 1.1552, + "step": 215 + }, + { + "epoch": 0.51, + "grad_norm": 1.0469551086425781, + "learning_rate": 0.00017384100743589697, + "loss": 1.1175, + "step": 216 + }, + { + "epoch": 0.51, + "grad_norm": 0.8635923266410828, + "learning_rate": 0.0001735858495258406, + "loss": 0.9823, + "step": 217 + }, + { + "epoch": 0.51, + "grad_norm": 1.1676996946334839, + "learning_rate": 0.00017332964244331776, + "loss": 1.2903, + "step": 218 + }, + { + "epoch": 0.51, + "grad_norm": 0.9774354696273804, + "learning_rate": 0.00017307238984127832, + "loss": 1.1928, + "step": 219 + }, + { + "epoch": 0.52, + "grad_norm": 1.0486751794815063, + "learning_rate": 0.00017281409538757883, + "loss": 1.039, + "step": 220 + }, + { + "epoch": 0.52, + "grad_norm": 1.0949512720108032, + "learning_rate": 0.00017255476276493056, + "loss": 1.148, + "step": 221 + }, + { + "epoch": 0.52, + "grad_norm": 0.8077650666236877, + "learning_rate": 0.0001722943956708466, + "loss": 1.1376, + "step": 222 + }, + { + "epoch": 0.52, + "grad_norm": 1.0595574378967285, + "learning_rate": 0.00017203299781758943, + "loss": 1.0757, + "step": 223 + }, + { + "epoch": 0.53, + "grad_norm": 0.813774049282074, + "learning_rate": 0.00017177057293211784, + "loss": 1.2645, + "step": 224 + }, + { + "epoch": 0.53, + "grad_norm": 0.8764140009880066, + "learning_rate": 0.0001715071247560339, + "loss": 0.9552, + "step": 225 + }, + { + "epoch": 0.53, + "grad_norm": 0.9136344194412231, + "learning_rate": 0.0001712426570455295, + "loss": 1.1841, + "step": 226 + }, + { + "epoch": 0.53, + "grad_norm": 0.8742187023162842, + "learning_rate": 0.00017097717357133284, + "loss": 1.0314, + "step": 227 + }, + { + "epoch": 0.54, + "grad_norm": 0.8309169411659241, + "learning_rate": 0.00017071067811865476, + "loss": 0.9842, + "step": 228 + }, + { + "epoch": 0.54, + "grad_norm": 0.8644968867301941, + "learning_rate": 0.00017044317448713461, + "loss": 1.3819, + "step": 229 + }, + { + "epoch": 0.54, + "grad_norm": 0.8510339260101318, + "learning_rate": 0.0001701746664907862, + "loss": 1.2385, + "step": 230 + }, + { + "epoch": 0.54, + "grad_norm": 0.8174643516540527, + "learning_rate": 0.00016990515795794334, + "loss": 0.9789, + "step": 231 + }, + { + "epoch": 0.54, + "grad_norm": 0.9340826272964478, + "learning_rate": 0.0001696346527312053, + "loss": 1.2472, + "step": 232 + }, + { + "epoch": 0.55, + "grad_norm": 0.9614835977554321, + "learning_rate": 0.00016936315466738205, + "loss": 1.1588, + "step": 233 + }, + { + "epoch": 0.55, + "grad_norm": 1.1464625597000122, + "learning_rate": 0.00016909066763743912, + "loss": 0.8365, + "step": 234 + }, + { + "epoch": 0.55, + "grad_norm": 0.8775334358215332, + "learning_rate": 0.00016881719552644273, + "loss": 1.1143, + "step": 235 + }, + { + "epoch": 0.55, + "grad_norm": 0.9431893825531006, + "learning_rate": 0.00016854274223350397, + "loss": 1.362, + "step": 236 + }, + { + "epoch": 0.56, + "grad_norm": 0.7953469157218933, + "learning_rate": 0.0001682673116717236, + "loss": 1.1568, + "step": 237 + }, + { + "epoch": 0.56, + "grad_norm": 0.7517049908638, + "learning_rate": 0.00016799090776813597, + "loss": 0.9274, + "step": 238 + }, + { + "epoch": 0.56, + "grad_norm": 0.796934187412262, + "learning_rate": 0.00016771353446365318, + "loss": 0.8641, + "step": 239 + }, + { + "epoch": 0.56, + "grad_norm": 0.7946231961250305, + "learning_rate": 0.00016743519571300888, + "loss": 1.0518, + "step": 240 + }, + { + "epoch": 0.57, + "grad_norm": 1.0859878063201904, + "learning_rate": 0.00016715589548470185, + "loss": 1.1815, + "step": 241 + }, + { + "epoch": 0.57, + "grad_norm": 0.7418296933174133, + "learning_rate": 0.00016687563776093941, + "loss": 1.0321, + "step": 242 + }, + { + "epoch": 0.57, + "grad_norm": 0.8161245584487915, + "learning_rate": 0.00016659442653758064, + "loss": 1.0931, + "step": 243 + }, + { + "epoch": 0.57, + "grad_norm": 0.7787612080574036, + "learning_rate": 0.00016631226582407952, + "loss": 1.2239, + "step": 244 + }, + { + "epoch": 0.58, + "grad_norm": 0.7161651849746704, + "learning_rate": 0.00016602915964342757, + "loss": 1.1104, + "step": 245 + }, + { + "epoch": 0.58, + "grad_norm": 0.786612331867218, + "learning_rate": 0.00016574511203209667, + "loss": 1.2486, + "step": 246 + }, + { + "epoch": 0.58, + "grad_norm": 0.8251045942306519, + "learning_rate": 0.00016546012703998138, + "loss": 1.2358, + "step": 247 + }, + { + "epoch": 0.58, + "grad_norm": 0.8045319318771362, + "learning_rate": 0.00016517420873034123, + "loss": 0.8145, + "step": 248 + }, + { + "epoch": 0.58, + "grad_norm": 0.8730091452598572, + "learning_rate": 0.0001648873611797429, + "loss": 0.8832, + "step": 249 + }, + { + "epoch": 0.59, + "grad_norm": 0.9003087878227234, + "learning_rate": 0.00016459958847800187, + "loss": 1.1149, + "step": 250 + }, + { + "epoch": 0.59, + "grad_norm": 1.0912461280822754, + "learning_rate": 0.00016431089472812444, + "loss": 1.0439, + "step": 251 + }, + { + "epoch": 0.59, + "grad_norm": 0.7999249696731567, + "learning_rate": 0.00016402128404624882, + "loss": 0.9821, + "step": 252 + }, + { + "epoch": 0.59, + "grad_norm": 1.0122705698013306, + "learning_rate": 0.00016373076056158675, + "loss": 1.2302, + "step": 253 + }, + { + "epoch": 0.6, + "grad_norm": 0.6447519659996033, + "learning_rate": 0.00016343932841636456, + "loss": 0.6079, + "step": 254 + }, + { + "epoch": 0.6, + "grad_norm": 0.7757092118263245, + "learning_rate": 0.00016314699176576402, + "loss": 1.0092, + "step": 255 + }, + { + "epoch": 0.6, + "grad_norm": 0.7445678114891052, + "learning_rate": 0.00016285375477786322, + "loss": 0.684, + "step": 256 + }, + { + "epoch": 0.6, + "grad_norm": 1.241065263748169, + "learning_rate": 0.000162559621633577, + "loss": 1.0321, + "step": 257 + }, + { + "epoch": 0.61, + "grad_norm": 1.1429563760757446, + "learning_rate": 0.00016226459652659753, + "loss": 1.1635, + "step": 258 + }, + { + "epoch": 0.61, + "grad_norm": 0.7441573739051819, + "learning_rate": 0.0001619686836633343, + "loss": 0.9685, + "step": 259 + }, + { + "epoch": 0.61, + "grad_norm": 1.3199349641799927, + "learning_rate": 0.00016167188726285434, + "loss": 1.3159, + "step": 260 + }, + { + "epoch": 0.61, + "grad_norm": 0.8082245588302612, + "learning_rate": 0.00016137421155682183, + "loss": 1.317, + "step": 261 + }, + { + "epoch": 0.62, + "grad_norm": 0.9036356210708618, + "learning_rate": 0.0001610756607894382, + "loss": 0.8672, + "step": 262 + }, + { + "epoch": 0.62, + "grad_norm": 0.9773459434509277, + "learning_rate": 0.00016077623921738102, + "loss": 1.1405, + "step": 263 + }, + { + "epoch": 0.62, + "grad_norm": 0.9840144515037537, + "learning_rate": 0.00016047595110974376, + "loss": 1.4167, + "step": 264 + }, + { + "epoch": 0.62, + "grad_norm": 1.0358545780181885, + "learning_rate": 0.0001601748007479748, + "loss": 1.196, + "step": 265 + }, + { + "epoch": 0.62, + "grad_norm": 0.7097404599189758, + "learning_rate": 0.0001598727924258164, + "loss": 0.791, + "step": 266 + }, + { + "epoch": 0.63, + "grad_norm": 1.7330995798110962, + "learning_rate": 0.00015956993044924334, + "loss": 1.4283, + "step": 267 + }, + { + "epoch": 0.63, + "grad_norm": 0.7444025278091431, + "learning_rate": 0.0001592662191364017, + "loss": 0.7525, + "step": 268 + }, + { + "epoch": 0.63, + "grad_norm": 0.9818642139434814, + "learning_rate": 0.0001589616628175472, + "loss": 1.2417, + "step": 269 + }, + { + "epoch": 0.63, + "grad_norm": 0.9218468070030212, + "learning_rate": 0.00015865626583498355, + "loss": 1.1316, + "step": 270 + }, + { + "epoch": 0.64, + "grad_norm": 0.9644055366516113, + "learning_rate": 0.00015835003254300039, + "loss": 1.2594, + "step": 271 + }, + { + "epoch": 0.64, + "grad_norm": 0.8228254914283752, + "learning_rate": 0.00015804296730781135, + "loss": 1.2481, + "step": 272 + }, + { + "epoch": 0.64, + "grad_norm": 1.0708152055740356, + "learning_rate": 0.00015773507450749172, + "loss": 1.107, + "step": 273 + }, + { + "epoch": 0.64, + "grad_norm": 1.1122934818267822, + "learning_rate": 0.00015742635853191608, + "loss": 0.8714, + "step": 274 + }, + { + "epoch": 0.65, + "grad_norm": 0.8141905665397644, + "learning_rate": 0.00015711682378269565, + "loss": 0.9943, + "step": 275 + }, + { + "epoch": 0.65, + "grad_norm": 0.8955541253089905, + "learning_rate": 0.00015680647467311557, + "loss": 1.3176, + "step": 276 + }, + { + "epoch": 0.65, + "grad_norm": 1.1133729219436646, + "learning_rate": 0.000156495315628072, + "loss": 1.2602, + "step": 277 + }, + { + "epoch": 0.65, + "grad_norm": 0.8733439445495605, + "learning_rate": 0.00015618335108400893, + "loss": 1.3639, + "step": 278 + }, + { + "epoch": 0.65, + "grad_norm": 0.8614795804023743, + "learning_rate": 0.00015587058548885505, + "loss": 1.1905, + "step": 279 + }, + { + "epoch": 0.66, + "grad_norm": 0.8306368589401245, + "learning_rate": 0.00015555702330196023, + "loss": 1.1978, + "step": 280 + }, + { + "epoch": 0.66, + "grad_norm": 0.8460854887962341, + "learning_rate": 0.00015524266899403206, + "loss": 0.9872, + "step": 281 + }, + { + "epoch": 0.66, + "grad_norm": 0.8452059626579285, + "learning_rate": 0.000154927527047072, + "loss": 0.979, + "step": 282 + }, + { + "epoch": 0.66, + "grad_norm": 0.8805731534957886, + "learning_rate": 0.00015461160195431148, + "loss": 1.2885, + "step": 283 + }, + { + "epoch": 0.67, + "grad_norm": 0.9095639586448669, + "learning_rate": 0.0001542948982201479, + "loss": 1.1156, + "step": 284 + }, + { + "epoch": 0.67, + "grad_norm": 0.9862900376319885, + "learning_rate": 0.00015397742036008034, + "loss": 1.1571, + "step": 285 + }, + { + "epoch": 0.67, + "grad_norm": 0.9344744086265564, + "learning_rate": 0.0001536591729006453, + "loss": 1.2204, + "step": 286 + }, + { + "epoch": 0.67, + "grad_norm": 1.0605379343032837, + "learning_rate": 0.00015334016037935196, + "loss": 1.3048, + "step": 287 + }, + { + "epoch": 0.68, + "grad_norm": 0.9844763278961182, + "learning_rate": 0.0001530203873446177, + "loss": 1.0035, + "step": 288 + }, + { + "epoch": 0.68, + "grad_norm": 0.767954409122467, + "learning_rate": 0.0001526998583557031, + "loss": 0.9023, + "step": 289 + }, + { + "epoch": 0.68, + "grad_norm": 1.0622146129608154, + "learning_rate": 0.000152378577982647, + "loss": 1.4837, + "step": 290 + }, + { + "epoch": 0.68, + "grad_norm": 0.9536969065666199, + "learning_rate": 0.0001520565508062013, + "loss": 1.0948, + "step": 291 + }, + { + "epoch": 0.69, + "grad_norm": 0.9654991030693054, + "learning_rate": 0.00015173378141776568, + "loss": 1.1913, + "step": 292 + }, + { + "epoch": 0.69, + "grad_norm": 0.8208710551261902, + "learning_rate": 0.00015141027441932216, + "loss": 1.1435, + "step": 293 + }, + { + "epoch": 0.69, + "grad_norm": 0.9273961186408997, + "learning_rate": 0.0001510860344233695, + "loss": 1.0845, + "step": 294 + }, + { + "epoch": 0.69, + "grad_norm": 1.0316227674484253, + "learning_rate": 0.00015076106605285724, + "loss": 1.4532, + "step": 295 + }, + { + "epoch": 0.69, + "grad_norm": 1.0121437311172485, + "learning_rate": 0.00015043537394112007, + "loss": 0.8687, + "step": 296 + }, + { + "epoch": 0.7, + "grad_norm": 1.0713882446289062, + "learning_rate": 0.00015010896273181165, + "loss": 1.1097, + "step": 297 + }, + { + "epoch": 0.7, + "grad_norm": 0.8149722814559937, + "learning_rate": 0.00014978183707883827, + "loss": 0.8682, + "step": 298 + }, + { + "epoch": 0.7, + "grad_norm": 0.7118079662322998, + "learning_rate": 0.00014945400164629278, + "loss": 0.9225, + "step": 299 + }, + { + "epoch": 0.7, + "grad_norm": 1.1042624711990356, + "learning_rate": 0.00014912546110838775, + "loss": 1.4279, + "step": 300 + }, + { + "epoch": 0.71, + "grad_norm": 0.947619616985321, + "learning_rate": 0.00014879622014938915, + "loss": 1.0544, + "step": 301 + }, + { + "epoch": 0.71, + "grad_norm": 0.9065904021263123, + "learning_rate": 0.00014846628346354933, + "loss": 1.1642, + "step": 302 + }, + { + "epoch": 0.71, + "grad_norm": 0.9430320262908936, + "learning_rate": 0.00014813565575504022, + "loss": 1.2182, + "step": 303 + }, + { + "epoch": 0.71, + "grad_norm": 0.8739117980003357, + "learning_rate": 0.00014780434173788617, + "loss": 1.0176, + "step": 304 + }, + { + "epoch": 0.72, + "grad_norm": 0.853125274181366, + "learning_rate": 0.00014747234613589685, + "loss": 1.1827, + "step": 305 + }, + { + "epoch": 0.72, + "grad_norm": 1.6718727350234985, + "learning_rate": 0.0001471396736825998, + "loss": 1.2665, + "step": 306 + }, + { + "epoch": 0.72, + "grad_norm": 0.8566248416900635, + "learning_rate": 0.00014680632912117286, + "loss": 1.2231, + "step": 307 + }, + { + "epoch": 0.72, + "grad_norm": 0.6841180324554443, + "learning_rate": 0.00014647231720437686, + "loss": 0.9366, + "step": 308 + }, + { + "epoch": 0.73, + "grad_norm": 0.9140876531600952, + "learning_rate": 0.00014613764269448751, + "loss": 1.0711, + "step": 309 + }, + { + "epoch": 0.73, + "grad_norm": 0.9394497275352478, + "learning_rate": 0.00014580231036322768, + "loss": 1.1159, + "step": 310 + }, + { + "epoch": 0.73, + "grad_norm": 1.1066112518310547, + "learning_rate": 0.00014546632499169937, + "loss": 1.3487, + "step": 311 + }, + { + "epoch": 0.73, + "grad_norm": 0.9925751090049744, + "learning_rate": 0.00014512969137031538, + "loss": 1.1207, + "step": 312 + }, + { + "epoch": 0.73, + "grad_norm": 0.9642359018325806, + "learning_rate": 0.0001447924142987312, + "loss": 1.3772, + "step": 313 + }, + { + "epoch": 0.74, + "grad_norm": 0.6977396607398987, + "learning_rate": 0.0001444544985857766, + "loss": 0.8517, + "step": 314 + }, + { + "epoch": 0.74, + "grad_norm": 1.033882737159729, + "learning_rate": 0.00014411594904938682, + "loss": 1.0644, + "step": 315 + }, + { + "epoch": 0.74, + "grad_norm": 1.020871877670288, + "learning_rate": 0.00014377677051653404, + "loss": 1.2026, + "step": 316 + }, + { + "epoch": 0.74, + "grad_norm": 1.059812068939209, + "learning_rate": 0.0001434369678231587, + "loss": 1.4181, + "step": 317 + }, + { + "epoch": 0.75, + "grad_norm": 0.8130291104316711, + "learning_rate": 0.00014309654581410024, + "loss": 1.0691, + "step": 318 + }, + { + "epoch": 0.75, + "grad_norm": 0.8362820148468018, + "learning_rate": 0.00014275550934302823, + "loss": 1.0053, + "step": 319 + }, + { + "epoch": 0.75, + "grad_norm": 0.9266586899757385, + "learning_rate": 0.0001424138632723731, + "loss": 1.1313, + "step": 320 + }, + { + "epoch": 0.75, + "grad_norm": 1.0162605047225952, + "learning_rate": 0.00014207161247325691, + "loss": 1.3518, + "step": 321 + }, + { + "epoch": 0.75, + "eval_loss": 1.114696741104126, + "eval_runtime": 5.1062, + "eval_samples_per_second": 19.584, + "eval_steps_per_second": 19.584, + "step": 321 + }, + { + "epoch": 0.76, + "grad_norm": 1.0078846216201782, + "learning_rate": 0.00014172876182542372, + "loss": 1.0446, + "step": 322 + }, + { + "epoch": 0.76, + "grad_norm": 1.2844680547714233, + "learning_rate": 0.00014138531621717018, + "loss": 1.4105, + "step": 323 + }, + { + "epoch": 0.76, + "grad_norm": 1.0380208492279053, + "learning_rate": 0.0001410412805452757, + "loss": 1.4212, + "step": 324 + }, + { + "epoch": 0.76, + "grad_norm": 0.8037036061286926, + "learning_rate": 0.00014069665971493274, + "loss": 0.8392, + "step": 325 + }, + { + "epoch": 0.77, + "grad_norm": 0.9248948693275452, + "learning_rate": 0.00014035145863967692, + "loss": 1.3121, + "step": 326 + }, + { + "epoch": 0.77, + "grad_norm": 0.8579298853874207, + "learning_rate": 0.0001400056822413167, + "loss": 1.1128, + "step": 327 + }, + { + "epoch": 0.77, + "grad_norm": 1.0605120658874512, + "learning_rate": 0.0001396593354498635, + "loss": 1.543, + "step": 328 + }, + { + "epoch": 0.77, + "grad_norm": 0.9975443482398987, + "learning_rate": 0.0001393124232034613, + "loss": 1.1178, + "step": 329 + }, + { + "epoch": 0.77, + "grad_norm": 0.8115065693855286, + "learning_rate": 0.0001389649504483162, + "loss": 1.1937, + "step": 330 + }, + { + "epoch": 0.78, + "grad_norm": 0.7796252369880676, + "learning_rate": 0.00013861692213862584, + "loss": 1.1886, + "step": 331 + }, + { + "epoch": 0.78, + "grad_norm": 1.0133821964263916, + "learning_rate": 0.000138268343236509, + "loss": 1.4973, + "step": 332 + }, + { + "epoch": 0.78, + "grad_norm": 0.9557147026062012, + "learning_rate": 0.00013791921871193457, + "loss": 1.4592, + "step": 333 + }, + { + "epoch": 0.78, + "grad_norm": 0.9763726592063904, + "learning_rate": 0.00013756955354265085, + "loss": 0.8502, + "step": 334 + }, + { + "epoch": 0.79, + "grad_norm": 0.8208116888999939, + "learning_rate": 0.00013721935271411464, + "loss": 1.1601, + "step": 335 + }, + { + "epoch": 0.79, + "grad_norm": 1.3176727294921875, + "learning_rate": 0.0001368686212194199, + "loss": 1.1715, + "step": 336 + }, + { + "epoch": 0.79, + "grad_norm": 1.2329626083374023, + "learning_rate": 0.00013651736405922686, + "loss": 1.3426, + "step": 337 + }, + { + "epoch": 0.79, + "grad_norm": 0.9947068691253662, + "learning_rate": 0.0001361655862416905, + "loss": 1.0623, + "step": 338 + }, + { + "epoch": 0.8, + "grad_norm": 1.176267147064209, + "learning_rate": 0.00013581329278238927, + "loss": 1.1281, + "step": 339 + }, + { + "epoch": 0.8, + "grad_norm": 0.909443736076355, + "learning_rate": 0.00013546048870425356, + "loss": 1.2623, + "step": 340 + }, + { + "epoch": 0.8, + "grad_norm": 0.8919989466667175, + "learning_rate": 0.000135107179037494, + "loss": 1.2652, + "step": 341 + }, + { + "epoch": 0.8, + "grad_norm": 0.7781542539596558, + "learning_rate": 0.00013475336881952986, + "loss": 0.9857, + "step": 342 + }, + { + "epoch": 0.81, + "grad_norm": 0.9232913851737976, + "learning_rate": 0.00013439906309491712, + "loss": 1.0923, + "step": 343 + }, + { + "epoch": 0.81, + "grad_norm": 1.1160950660705566, + "learning_rate": 0.0001340442669152766, + "loss": 1.3445, + "step": 344 + }, + { + "epoch": 0.81, + "grad_norm": 1.084597110748291, + "learning_rate": 0.000133688985339222, + "loss": 1.7647, + "step": 345 + }, + { + "epoch": 0.81, + "grad_norm": 0.8420549631118774, + "learning_rate": 0.0001333332234322876, + "loss": 1.1342, + "step": 346 + }, + { + "epoch": 0.81, + "grad_norm": 1.0362187623977661, + "learning_rate": 0.0001329769862668563, + "loss": 1.0779, + "step": 347 + }, + { + "epoch": 0.82, + "grad_norm": 0.902492344379425, + "learning_rate": 0.00013262027892208694, + "loss": 1.1121, + "step": 348 + }, + { + "epoch": 0.82, + "grad_norm": 1.4322317838668823, + "learning_rate": 0.0001322631064838422, + "loss": 1.5474, + "step": 349 + }, + { + "epoch": 0.82, + "grad_norm": 0.8751888275146484, + "learning_rate": 0.00013190547404461598, + "loss": 1.2055, + "step": 350 + }, + { + "epoch": 0.82, + "grad_norm": 0.9157432913780212, + "learning_rate": 0.0001315473867034608, + "loss": 1.3176, + "step": 351 + }, + { + "epoch": 0.83, + "grad_norm": 0.7300966382026672, + "learning_rate": 0.0001311888495659149, + "loss": 0.9548, + "step": 352 + }, + { + "epoch": 0.83, + "grad_norm": 1.0954256057739258, + "learning_rate": 0.0001308298677439299, + "loss": 1.1649, + "step": 353 + }, + { + "epoch": 0.83, + "grad_norm": 1.0646469593048096, + "learning_rate": 0.00013047044635579747, + "loss": 1.3597, + "step": 354 + }, + { + "epoch": 0.83, + "grad_norm": 0.7668378949165344, + "learning_rate": 0.00013011059052607656, + "loss": 1.1246, + "step": 355 + }, + { + "epoch": 0.84, + "grad_norm": 0.9135538339614868, + "learning_rate": 0.00012975030538552032, + "loss": 1.0189, + "step": 356 + }, + { + "epoch": 0.84, + "grad_norm": 0.7841051816940308, + "learning_rate": 0.00012938959607100288, + "loss": 1.1396, + "step": 357 + }, + { + "epoch": 0.84, + "grad_norm": 0.9529784321784973, + "learning_rate": 0.00012902846772544624, + "loss": 1.4681, + "step": 358 + }, + { + "epoch": 0.84, + "grad_norm": 0.8711650967597961, + "learning_rate": 0.00012866692549774682, + "loss": 0.9842, + "step": 359 + }, + { + "epoch": 0.85, + "grad_norm": 0.9562662839889526, + "learning_rate": 0.00012830497454270205, + "loss": 1.3051, + "step": 360 + }, + { + "epoch": 0.85, + "grad_norm": 1.0756105184555054, + "learning_rate": 0.00012794262002093697, + "loss": 1.3275, + "step": 361 + }, + { + "epoch": 0.85, + "grad_norm": 0.7915710806846619, + "learning_rate": 0.0001275798670988306, + "loss": 1.0035, + "step": 362 + }, + { + "epoch": 0.85, + "grad_norm": 0.9524595737457275, + "learning_rate": 0.0001272167209484422, + "loss": 1.2083, + "step": 363 + }, + { + "epoch": 0.85, + "grad_norm": 1.4926435947418213, + "learning_rate": 0.0001268531867474377, + "loss": 1.3218, + "step": 364 + }, + { + "epoch": 0.86, + "grad_norm": 1.2689683437347412, + "learning_rate": 0.00012648926967901567, + "loss": 2.7813, + "step": 365 + }, + { + "epoch": 0.86, + "grad_norm": 0.8361314535140991, + "learning_rate": 0.00012612497493183364, + "loss": 1.124, + "step": 366 + }, + { + "epoch": 0.86, + "grad_norm": 1.2996618747711182, + "learning_rate": 0.00012576030769993393, + "loss": 1.3745, + "step": 367 + }, + { + "epoch": 0.86, + "grad_norm": 0.8248890042304993, + "learning_rate": 0.0001253952731826697, + "loss": 1.1971, + "step": 368 + }, + { + "epoch": 0.87, + "grad_norm": 0.8044300079345703, + "learning_rate": 0.00012502987658463075, + "loss": 1.1508, + "step": 369 + }, + { + "epoch": 0.87, + "grad_norm": 1.196742057800293, + "learning_rate": 0.00012466412311556952, + "loss": 0.9868, + "step": 370 + }, + { + "epoch": 0.87, + "grad_norm": 0.9415065050125122, + "learning_rate": 0.0001242980179903264, + "loss": 1.046, + "step": 371 + }, + { + "epoch": 0.87, + "grad_norm": 1.049695611000061, + "learning_rate": 0.0001239315664287558, + "loss": 0.9927, + "step": 372 + }, + { + "epoch": 0.88, + "grad_norm": 0.8266507387161255, + "learning_rate": 0.00012356477365565148, + "loss": 0.8879, + "step": 373 + }, + { + "epoch": 0.88, + "grad_norm": 0.9163070321083069, + "learning_rate": 0.0001231976449006721, + "loss": 1.1214, + "step": 374 + }, + { + "epoch": 0.88, + "grad_norm": 1.01756751537323, + "learning_rate": 0.00012283018539826685, + "loss": 1.1644, + "step": 375 + }, + { + "epoch": 0.88, + "grad_norm": 0.901319682598114, + "learning_rate": 0.00012246240038760043, + "loss": 1.1985, + "step": 376 + }, + { + "epoch": 0.88, + "grad_norm": 0.9721381664276123, + "learning_rate": 0.00012209429511247864, + "loss": 1.1199, + "step": 377 + }, + { + "epoch": 0.89, + "grad_norm": 0.8883329033851624, + "learning_rate": 0.0001217258748212737, + "loss": 1.3431, + "step": 378 + }, + { + "epoch": 0.89, + "grad_norm": 1.0698317289352417, + "learning_rate": 0.00012135714476684903, + "loss": 1.3173, + "step": 379 + }, + { + "epoch": 0.89, + "grad_norm": 0.8664084076881409, + "learning_rate": 0.00012098811020648475, + "loss": 1.0441, + "step": 380 + }, + { + "epoch": 0.89, + "grad_norm": 0.9194340109825134, + "learning_rate": 0.00012061877640180255, + "loss": 1.152, + "step": 381 + }, + { + "epoch": 0.9, + "grad_norm": 0.9599464535713196, + "learning_rate": 0.00012024914861869063, + "loss": 1.1115, + "step": 382 + }, + { + "epoch": 0.9, + "grad_norm": 0.9990159273147583, + "learning_rate": 0.00011987923212722872, + "loss": 1.2534, + "step": 383 + }, + { + "epoch": 0.9, + "grad_norm": 0.8435646891593933, + "learning_rate": 0.00011950903220161285, + "loss": 1.1752, + "step": 384 + }, + { + "epoch": 0.9, + "grad_norm": 1.0376098155975342, + "learning_rate": 0.00011913855412008023, + "loss": 1.4716, + "step": 385 + }, + { + "epoch": 0.91, + "grad_norm": 1.3249186277389526, + "learning_rate": 0.00011876780316483401, + "loss": 1.211, + "step": 386 + }, + { + "epoch": 0.91, + "grad_norm": 1.378393292427063, + "learning_rate": 0.00011839678462196784, + "loss": 1.0357, + "step": 387 + }, + { + "epoch": 0.91, + "grad_norm": 0.7574142217636108, + "learning_rate": 0.0001180255037813906, + "loss": 0.4137, + "step": 388 + }, + { + "epoch": 0.91, + "grad_norm": 0.7813417911529541, + "learning_rate": 0.00011765396593675097, + "loss": 1.1776, + "step": 389 + }, + { + "epoch": 0.92, + "grad_norm": 0.8787057995796204, + "learning_rate": 0.00011728217638536197, + "loss": 1.1352, + "step": 390 + }, + { + "epoch": 0.92, + "grad_norm": 0.9643175005912781, + "learning_rate": 0.00011691014042812536, + "loss": 1.3089, + "step": 391 + }, + { + "epoch": 0.92, + "grad_norm": 0.9101107716560364, + "learning_rate": 0.00011653786336945614, + "loss": 1.0639, + "step": 392 + }, + { + "epoch": 0.92, + "grad_norm": 1.018091082572937, + "learning_rate": 0.00011616535051720685, + "loss": 0.9938, + "step": 393 + }, + { + "epoch": 0.92, + "grad_norm": 0.9708930253982544, + "learning_rate": 0.00011579260718259197, + "loss": 0.8004, + "step": 394 + }, + { + "epoch": 0.93, + "grad_norm": 0.8909386396408081, + "learning_rate": 0.00011541963868011212, + "loss": 1.2997, + "step": 395 + }, + { + "epoch": 0.93, + "grad_norm": 1.0622750520706177, + "learning_rate": 0.00011504645032747832, + "loss": 1.0235, + "step": 396 + }, + { + "epoch": 0.93, + "grad_norm": 0.8857365250587463, + "learning_rate": 0.00011467304744553618, + "loss": 0.8382, + "step": 397 + }, + { + "epoch": 0.93, + "grad_norm": 0.8980242013931274, + "learning_rate": 0.00011429943535819005, + "loss": 1.0877, + "step": 398 + }, + { + "epoch": 0.94, + "grad_norm": 1.1426031589508057, + "learning_rate": 0.00011392561939232706, + "loss": 1.3496, + "step": 399 + }, + { + "epoch": 0.94, + "grad_norm": 1.0347543954849243, + "learning_rate": 0.0001135516048777412, + "loss": 1.6309, + "step": 400 + }, + { + "epoch": 0.94, + "grad_norm": 1.0121687650680542, + "learning_rate": 0.00011317739714705731, + "loss": 1.2256, + "step": 401 + }, + { + "epoch": 0.94, + "grad_norm": 0.8863442540168762, + "learning_rate": 0.0001128030015356551, + "loss": 0.8687, + "step": 402 + }, + { + "epoch": 0.95, + "grad_norm": 0.7622981667518616, + "learning_rate": 0.00011242842338159309, + "loss": 0.7564, + "step": 403 + }, + { + "epoch": 0.95, + "grad_norm": 0.9527961015701294, + "learning_rate": 0.0001120536680255323, + "loss": 1.0593, + "step": 404 + }, + { + "epoch": 0.95, + "grad_norm": 1.3481955528259277, + "learning_rate": 0.00011167874081066045, + "loss": 1.2279, + "step": 405 + }, + { + "epoch": 0.95, + "grad_norm": 0.8665672540664673, + "learning_rate": 0.00011130364708261552, + "loss": 1.1677, + "step": 406 + }, + { + "epoch": 0.96, + "grad_norm": 1.217490553855896, + "learning_rate": 0.0001109283921894095, + "loss": 1.2617, + "step": 407 + }, + { + "epoch": 0.96, + "grad_norm": 0.8935596942901611, + "learning_rate": 0.00011055298148135236, + "loss": 1.1184, + "step": 408 + }, + { + "epoch": 0.96, + "grad_norm": 0.8513955473899841, + "learning_rate": 0.00011017742031097563, + "loss": 1.2705, + "step": 409 + }, + { + "epoch": 0.96, + "grad_norm": 1.0295133590698242, + "learning_rate": 0.0001098017140329561, + "loss": 1.1966, + "step": 410 + }, + { + "epoch": 0.96, + "grad_norm": 1.1029167175292969, + "learning_rate": 0.0001094258680040394, + "loss": 1.4887, + "step": 411 + }, + { + "epoch": 0.97, + "grad_norm": 0.9045723080635071, + "learning_rate": 0.0001090498875829638, + "loss": 1.1461, + "step": 412 + }, + { + "epoch": 0.97, + "grad_norm": 0.8317312002182007, + "learning_rate": 0.00010867377813038366, + "loss": 1.136, + "step": 413 + }, + { + "epoch": 0.97, + "grad_norm": 1.0023647546768188, + "learning_rate": 0.00010829754500879308, + "loss": 1.1123, + "step": 414 + }, + { + "epoch": 0.97, + "grad_norm": 0.9197617769241333, + "learning_rate": 0.00010792119358244939, + "loss": 1.2792, + "step": 415 + }, + { + "epoch": 0.98, + "grad_norm": 0.9892452955245972, + "learning_rate": 0.00010754472921729661, + "loss": 1.634, + "step": 416 + }, + { + "epoch": 0.98, + "grad_norm": 0.8005648255348206, + "learning_rate": 0.00010716815728088912, + "loss": 0.7168, + "step": 417 + }, + { + "epoch": 0.98, + "grad_norm": 1.1989247798919678, + "learning_rate": 0.00010679148314231504, + "loss": 1.2882, + "step": 418 + }, + { + "epoch": 0.98, + "grad_norm": 0.7820172905921936, + "learning_rate": 0.00010641471217211958, + "loss": 1.1125, + "step": 419 + }, + { + "epoch": 0.99, + "grad_norm": 1.352563500404358, + "learning_rate": 0.00010603784974222861, + "loss": 0.9641, + "step": 420 + }, + { + "epoch": 0.99, + "grad_norm": 0.8966504335403442, + "learning_rate": 0.000105660901225872, + "loss": 1.1155, + "step": 421 + }, + { + "epoch": 0.99, + "grad_norm": 0.8722444176673889, + "learning_rate": 0.00010528387199750707, + "loss": 1.3011, + "step": 422 + }, + { + "epoch": 0.99, + "grad_norm": 0.8678218722343445, + "learning_rate": 0.00010490676743274181, + "loss": 1.2912, + "step": 423 + }, + { + "epoch": 1.0, + "grad_norm": 0.8596826791763306, + "learning_rate": 0.00010452959290825846, + "loss": 1.3792, + "step": 424 + }, + { + "epoch": 1.0, + "grad_norm": 0.777655303478241, + "learning_rate": 0.00010415235380173662, + "loss": 0.9992, + "step": 425 + }, + { + "epoch": 1.0, + "grad_norm": 0.7913762331008911, + "learning_rate": 0.00010377505549177682, + "loss": 0.8813, + "step": 426 + }, + { + "epoch": 1.0, + "grad_norm": 0.8607832789421082, + "learning_rate": 0.00010339770335782359, + "loss": 0.9927, + "step": 427 + }, + { + "epoch": 1.0, + "grad_norm": 0.8529496192932129, + "learning_rate": 0.0001030203027800889, + "loss": 1.1638, + "step": 428 + }, + { + "epoch": 1.0, + "eval_loss": 1.109578251838684, + "eval_runtime": 5.1972, + "eval_samples_per_second": 19.241, + "eval_steps_per_second": 19.241, + "step": 428 + }, + { + "epoch": 1.01, + "grad_norm": 1.0168473720550537, + "learning_rate": 0.00010264285913947545, + "loss": 1.3069, + "step": 429 + }, + { + "epoch": 1.01, + "grad_norm": 1.2173975706100464, + "learning_rate": 0.00010226537781749987, + "loss": 0.9985, + "step": 430 + }, + { + "epoch": 1.01, + "grad_norm": 0.8822383284568787, + "learning_rate": 0.00010188786419621612, + "loss": 1.1801, + "step": 431 + }, + { + "epoch": 1.01, + "grad_norm": 1.145887851715088, + "learning_rate": 0.00010151032365813859, + "loss": 1.2034, + "step": 432 + }, + { + "epoch": 1.02, + "grad_norm": 0.7805179953575134, + "learning_rate": 0.00010113276158616553, + "loss": 1.2896, + "step": 433 + }, + { + "epoch": 1.02, + "grad_norm": 0.8651528358459473, + "learning_rate": 0.00010075518336350218, + "loss": 1.2091, + "step": 434 + }, + { + "epoch": 1.02, + "grad_norm": 1.0866674184799194, + "learning_rate": 0.00010037759437358398, + "loss": 1.0422, + "step": 435 + }, + { + "epoch": 1.02, + "grad_norm": 1.0740783214569092, + "learning_rate": 0.0001, + "loss": 1.3082, + "step": 436 + }, + { + "epoch": 1.03, + "grad_norm": 0.7619555592536926, + "learning_rate": 9.962240562641602e-05, + "loss": 0.9208, + "step": 437 + }, + { + "epoch": 1.03, + "grad_norm": 1.1175497770309448, + "learning_rate": 9.924481663649785e-05, + "loss": 1.1327, + "step": 438 + }, + { + "epoch": 1.03, + "grad_norm": 0.8330385088920593, + "learning_rate": 9.886723841383448e-05, + "loss": 0.9008, + "step": 439 + }, + { + "epoch": 1.0, + "grad_norm": 0.7999448776245117, + "learning_rate": 9.848967634186142e-05, + "loss": 0.8613, + "step": 440 + }, + { + "epoch": 1.0, + "grad_norm": 0.866118311882019, + "learning_rate": 9.81121358037839e-05, + "loss": 1.2199, + "step": 441 + }, + { + "epoch": 1.01, + "grad_norm": 0.882002055644989, + "learning_rate": 9.773462218250015e-05, + "loss": 0.8658, + "step": 442 + }, + { + "epoch": 1.01, + "grad_norm": 0.8205627799034119, + "learning_rate": 9.735714086052458e-05, + "loss": 1.189, + "step": 443 + }, + { + "epoch": 1.01, + "grad_norm": 0.9855633974075317, + "learning_rate": 9.697969721991114e-05, + "loss": 0.8399, + "step": 444 + }, + { + "epoch": 1.01, + "grad_norm": 0.891368567943573, + "learning_rate": 9.660229664217642e-05, + "loss": 1.0189, + "step": 445 + }, + { + "epoch": 1.02, + "grad_norm": 0.906017541885376, + "learning_rate": 9.62249445082232e-05, + "loss": 1.0986, + "step": 446 + }, + { + "epoch": 1.02, + "grad_norm": 1.08309805393219, + "learning_rate": 9.584764619826339e-05, + "loss": 1.1887, + "step": 447 + }, + { + "epoch": 1.02, + "grad_norm": 0.8647370934486389, + "learning_rate": 9.547040709174159e-05, + "loss": 0.8142, + "step": 448 + }, + { + "epoch": 1.02, + "grad_norm": 1.3828812837600708, + "learning_rate": 9.509323256725821e-05, + "loss": 0.882, + "step": 449 + }, + { + "epoch": 1.03, + "grad_norm": 0.8859632611274719, + "learning_rate": 9.471612800249296e-05, + "loss": 0.9624, + "step": 450 + }, + { + "epoch": 1.03, + "grad_norm": 0.7756340503692627, + "learning_rate": 9.433909877412802e-05, + "loss": 0.8844, + "step": 451 + }, + { + "epoch": 1.03, + "grad_norm": 1.2693322896957397, + "learning_rate": 9.396215025777139e-05, + "loss": 1.0696, + "step": 452 + }, + { + "epoch": 1.03, + "grad_norm": 0.8535933494567871, + "learning_rate": 9.358528782788045e-05, + "loss": 0.8664, + "step": 453 + }, + { + "epoch": 1.04, + "grad_norm": 0.8840806484222412, + "learning_rate": 9.320851685768497e-05, + "loss": 0.744, + "step": 454 + }, + { + "epoch": 1.04, + "grad_norm": 1.1074801683425903, + "learning_rate": 9.283184271911089e-05, + "loss": 1.0923, + "step": 455 + }, + { + "epoch": 1.04, + "grad_norm": 1.0910581350326538, + "learning_rate": 9.245527078270341e-05, + "loss": 0.8355, + "step": 456 + }, + { + "epoch": 1.04, + "grad_norm": 1.0176016092300415, + "learning_rate": 9.207880641755065e-05, + "loss": 0.8598, + "step": 457 + }, + { + "epoch": 1.04, + "grad_norm": 0.9024606347084045, + "learning_rate": 9.170245499120693e-05, + "loss": 0.8282, + "step": 458 + }, + { + "epoch": 1.05, + "grad_norm": 0.70665442943573, + "learning_rate": 9.132622186961637e-05, + "loss": 0.428, + "step": 459 + }, + { + "epoch": 1.05, + "grad_norm": 0.8624319434165955, + "learning_rate": 9.095011241703623e-05, + "loss": 0.9361, + "step": 460 + }, + { + "epoch": 1.05, + "grad_norm": 1.5043039321899414, + "learning_rate": 9.057413199596065e-05, + "loss": 0.806, + "step": 461 + }, + { + "epoch": 1.05, + "grad_norm": 1.1076829433441162, + "learning_rate": 9.019828596704394e-05, + "loss": 0.8207, + "step": 462 + }, + { + "epoch": 1.06, + "grad_norm": NaN, + "learning_rate": 9.019828596704394e-05, + "loss": 0.8544, + "step": 463 + }, + { + "epoch": 1.06, + "grad_norm": 0.9130039215087891, + "learning_rate": 8.982257968902438e-05, + "loss": 0.8329, + "step": 464 + }, + { + "epoch": 1.06, + "grad_norm": 1.1653575897216797, + "learning_rate": 8.944701851864767e-05, + "loss": 0.8211, + "step": 465 + }, + { + "epoch": 1.06, + "grad_norm": 1.4047077894210815, + "learning_rate": 8.907160781059052e-05, + "loss": 1.2467, + "step": 466 + }, + { + "epoch": 1.07, + "grad_norm": 1.0421421527862549, + "learning_rate": 8.869635291738452e-05, + "loss": 1.0891, + "step": 467 + }, + { + "epoch": 1.07, + "grad_norm": 0.8077785968780518, + "learning_rate": 8.832125918933954e-05, + "loss": 0.6497, + "step": 468 + }, + { + "epoch": 1.07, + "grad_norm": 1.1150001287460327, + "learning_rate": 8.79463319744677e-05, + "loss": 0.8212, + "step": 469 + }, + { + "epoch": 1.07, + "grad_norm": 0.704976499080658, + "learning_rate": 8.757157661840693e-05, + "loss": 0.6064, + "step": 470 + }, + { + "epoch": 1.08, + "grad_norm": 0.9938413500785828, + "learning_rate": 8.719699846434492e-05, + "loss": 0.8411, + "step": 471 + }, + { + "epoch": 1.08, + "grad_norm": 0.8569329380989075, + "learning_rate": 8.682260285294271e-05, + "loss": 0.7593, + "step": 472 + }, + { + "epoch": 1.08, + "grad_norm": 1.0257930755615234, + "learning_rate": 8.644839512225886e-05, + "loss": 0.9576, + "step": 473 + }, + { + "epoch": 1.08, + "grad_norm": 1.0176326036453247, + "learning_rate": 8.607438060767296e-05, + "loss": 1.0099, + "step": 474 + }, + { + "epoch": 1.08, + "grad_norm": 1.1403366327285767, + "learning_rate": 8.570056464180998e-05, + "loss": 0.6884, + "step": 475 + }, + { + "epoch": 1.09, + "grad_norm": 0.9332993626594543, + "learning_rate": 8.532695255446383e-05, + "loss": 0.8534, + "step": 476 + }, + { + "epoch": 1.09, + "grad_norm": 1.0836379528045654, + "learning_rate": 8.495354967252169e-05, + "loss": 0.9814, + "step": 477 + }, + { + "epoch": 1.09, + "grad_norm": 1.038662075996399, + "learning_rate": 8.458036131988792e-05, + "loss": 0.8299, + "step": 478 + }, + { + "epoch": 1.09, + "grad_norm": 0.9515346884727478, + "learning_rate": 8.420739281740805e-05, + "loss": 0.6605, + "step": 479 + }, + { + "epoch": 1.1, + "grad_norm": 1.1238003969192505, + "learning_rate": 8.383464948279319e-05, + "loss": 0.9265, + "step": 480 + }, + { + "epoch": 1.1, + "grad_norm": 1.245451807975769, + "learning_rate": 8.346213663054387e-05, + "loss": 1.0276, + "step": 481 + }, + { + "epoch": 1.1, + "grad_norm": 1.0740351676940918, + "learning_rate": 8.308985957187466e-05, + "loss": 0.894, + "step": 482 + }, + { + "epoch": 1.1, + "grad_norm": 1.1485562324523926, + "learning_rate": 8.271782361463805e-05, + "loss": 1.029, + "step": 483 + }, + { + "epoch": 1.11, + "grad_norm": 0.9134268760681152, + "learning_rate": 8.234603406324908e-05, + "loss": 0.6061, + "step": 484 + }, + { + "epoch": 1.11, + "grad_norm": 1.0883495807647705, + "learning_rate": 8.197449621860943e-05, + "loss": 0.3585, + "step": 485 + }, + { + "epoch": 1.11, + "grad_norm": 1.11465322971344, + "learning_rate": 8.16032153780322e-05, + "loss": 1.0824, + "step": 486 + }, + { + "epoch": 1.11, + "grad_norm": 1.1381958723068237, + "learning_rate": 8.123219683516603e-05, + "loss": 1.2363, + "step": 487 + }, + { + "epoch": 1.12, + "grad_norm": 0.9504216313362122, + "learning_rate": 8.08614458799198e-05, + "loss": 0.9756, + "step": 488 + }, + { + "epoch": 1.12, + "grad_norm": 0.6939163208007812, + "learning_rate": 8.049096779838719e-05, + "loss": 0.3759, + "step": 489 + }, + { + "epoch": 1.12, + "grad_norm": 1.0341072082519531, + "learning_rate": 8.01207678727713e-05, + "loss": 0.8908, + "step": 490 + }, + { + "epoch": 1.12, + "grad_norm": 1.0956295728683472, + "learning_rate": 7.975085138130938e-05, + "loss": 0.7801, + "step": 491 + }, + { + "epoch": 1.12, + "grad_norm": 1.07564115524292, + "learning_rate": 7.938122359819746e-05, + "loss": 0.7834, + "step": 492 + }, + { + "epoch": 1.13, + "grad_norm": 1.0565071105957031, + "learning_rate": 7.901188979351526e-05, + "loss": 0.8172, + "step": 493 + }, + { + "epoch": 1.13, + "grad_norm": 1.0685640573501587, + "learning_rate": 7.864285523315096e-05, + "loss": 0.7835, + "step": 494 + }, + { + "epoch": 1.13, + "grad_norm": 0.9400045275688171, + "learning_rate": 7.827412517872634e-05, + "loss": 0.6592, + "step": 495 + }, + { + "epoch": 1.13, + "grad_norm": 1.1758517026901245, + "learning_rate": 7.790570488752135e-05, + "loss": 0.8823, + "step": 496 + }, + { + "epoch": 1.14, + "grad_norm": 1.1540248394012451, + "learning_rate": 7.753759961239964e-05, + "loss": 0.849, + "step": 497 + }, + { + "epoch": 1.14, + "grad_norm": 1.1850864887237549, + "learning_rate": 7.716981460173319e-05, + "loss": 0.7586, + "step": 498 + }, + { + "epoch": 1.14, + "grad_norm": 1.033463954925537, + "learning_rate": 7.68023550993279e-05, + "loss": 0.9891, + "step": 499 + }, + { + "epoch": 1.14, + "grad_norm": 1.2690588235855103, + "learning_rate": 7.643522634434856e-05, + "loss": 1.1184, + "step": 500 + }, + { + "epoch": 1.15, + "grad_norm": 1.6630959510803223, + "learning_rate": 7.606843357124426e-05, + "loss": 1.0677, + "step": 501 + }, + { + "epoch": 1.15, + "grad_norm": 1.0995733737945557, + "learning_rate": 7.570198200967362e-05, + "loss": 0.9593, + "step": 502 + }, + { + "epoch": 1.15, + "grad_norm": 1.1102938652038574, + "learning_rate": 7.533587688443049e-05, + "loss": 0.7075, + "step": 503 + }, + { + "epoch": 1.15, + "grad_norm": 1.3560442924499512, + "learning_rate": 7.497012341536924e-05, + "loss": 1.0318, + "step": 504 + }, + { + "epoch": 1.15, + "grad_norm": 1.0489193201065063, + "learning_rate": 7.460472681733031e-05, + "loss": 0.7033, + "step": 505 + }, + { + "epoch": 1.16, + "grad_norm": 1.1091972589492798, + "learning_rate": 7.423969230006609e-05, + "loss": 0.907, + "step": 506 + }, + { + "epoch": 1.16, + "grad_norm": 1.096968173980713, + "learning_rate": 7.387502506816638e-05, + "loss": 0.9167, + "step": 507 + }, + { + "epoch": 1.16, + "grad_norm": 1.8477667570114136, + "learning_rate": 7.351073032098437e-05, + "loss": 0.9794, + "step": 508 + }, + { + "epoch": 1.16, + "grad_norm": 0.8208603858947754, + "learning_rate": 7.314681325256232e-05, + "loss": 0.7489, + "step": 509 + }, + { + "epoch": 1.17, + "grad_norm": 1.3617076873779297, + "learning_rate": 7.278327905155783e-05, + "loss": 1.0549, + "step": 510 + }, + { + "epoch": 1.17, + "grad_norm": 1.2204340696334839, + "learning_rate": 7.242013290116944e-05, + "loss": 1.0512, + "step": 511 + }, + { + "epoch": 1.17, + "grad_norm": 1.2859915494918823, + "learning_rate": 7.205737997906307e-05, + "loss": 0.8753, + "step": 512 + }, + { + "epoch": 1.17, + "grad_norm": 1.184820532798767, + "learning_rate": 7.169502545729797e-05, + "loss": 0.7513, + "step": 513 + }, + { + "epoch": 1.18, + "grad_norm": 1.4803907871246338, + "learning_rate": 7.133307450225322e-05, + "loss": 0.9857, + "step": 514 + }, + { + "epoch": 1.18, + "grad_norm": 0.8424803614616394, + "learning_rate": 7.097153227455379e-05, + "loss": 0.586, + "step": 515 + }, + { + "epoch": 1.18, + "grad_norm": 1.2504682540893555, + "learning_rate": 7.061040392899712e-05, + "loss": 0.7598, + "step": 516 + }, + { + "epoch": 1.18, + "grad_norm": 1.072726845741272, + "learning_rate": 7.024969461447972e-05, + "loss": 0.725, + "step": 517 + }, + { + "epoch": 1.19, + "grad_norm": 1.133083701133728, + "learning_rate": 6.988940947392344e-05, + "loss": 0.943, + "step": 518 + }, + { + "epoch": 1.19, + "grad_norm": 1.823098063468933, + "learning_rate": 6.952955364420255e-05, + "loss": 0.8723, + "step": 519 + }, + { + "epoch": 1.19, + "grad_norm": 1.2997591495513916, + "learning_rate": 6.91701322560701e-05, + "loss": 1.0831, + "step": 520 + }, + { + "epoch": 1.19, + "grad_norm": 1.3265862464904785, + "learning_rate": 6.881115043408511e-05, + "loss": 0.9224, + "step": 521 + }, + { + "epoch": 1.19, + "grad_norm": 1.3687394857406616, + "learning_rate": 6.845261329653922e-05, + "loss": 1.2177, + "step": 522 + }, + { + "epoch": 1.2, + "grad_norm": 1.3694826364517212, + "learning_rate": 6.809452595538402e-05, + "loss": 0.9281, + "step": 523 + }, + { + "epoch": 1.2, + "grad_norm": 0.8991251587867737, + "learning_rate": 6.77368935161578e-05, + "loss": 0.7164, + "step": 524 + }, + { + "epoch": 1.2, + "grad_norm": 1.059921145439148, + "learning_rate": 6.73797210779131e-05, + "loss": 0.7976, + "step": 525 + }, + { + "epoch": 1.2, + "grad_norm": 1.2316731214523315, + "learning_rate": 6.70230137331437e-05, + "loss": 0.9325, + "step": 526 + }, + { + "epoch": 1.21, + "grad_norm": 1.3245116472244263, + "learning_rate": 6.666677656771239e-05, + "loss": 0.7446, + "step": 527 + }, + { + "epoch": 1.21, + "grad_norm": 1.056368112564087, + "learning_rate": 6.6311014660778e-05, + "loss": 0.6862, + "step": 528 + }, + { + "epoch": 1.21, + "grad_norm": 1.4571599960327148, + "learning_rate": 6.595573308472338e-05, + "loss": 1.0019, + "step": 529 + }, + { + "epoch": 1.21, + "grad_norm": 1.2216399908065796, + "learning_rate": 6.56009369050829e-05, + "loss": 0.8784, + "step": 530 + }, + { + "epoch": 1.22, + "grad_norm": 1.440184473991394, + "learning_rate": 6.524663118047016e-05, + "loss": 1.7494, + "step": 531 + }, + { + "epoch": 1.22, + "grad_norm": 0.9794695973396301, + "learning_rate": 6.489282096250601e-05, + "loss": 0.7664, + "step": 532 + }, + { + "epoch": 1.22, + "grad_norm": 1.0644843578338623, + "learning_rate": 6.453951129574644e-05, + "loss": 1.1805, + "step": 533 + }, + { + "epoch": 1.22, + "grad_norm": 1.155930995941162, + "learning_rate": 6.418670721761073e-05, + "loss": 0.8715, + "step": 534 + }, + { + "epoch": 1.23, + "grad_norm": 1.2466217279434204, + "learning_rate": 6.383441375830951e-05, + "loss": 1.1003, + "step": 535 + }, + { + "epoch": 1.23, + "eval_loss": 1.1363521814346313, + "eval_runtime": 5.0483, + "eval_samples_per_second": 19.808, + "eval_steps_per_second": 19.808, + "step": 535 + }, + { + "epoch": 1.23, + "grad_norm": 1.0918858051300049, + "learning_rate": 6.34826359407732e-05, + "loss": 1.0192, + "step": 536 + }, + { + "epoch": 1.23, + "grad_norm": 1.0371659994125366, + "learning_rate": 6.313137878058013e-05, + "loss": 0.8525, + "step": 537 + }, + { + "epoch": 1.23, + "grad_norm": 1.3210889101028442, + "learning_rate": 6.278064728588542e-05, + "loss": 0.9921, + "step": 538 + }, + { + "epoch": 1.23, + "grad_norm": 1.1951533555984497, + "learning_rate": 6.243044645734917e-05, + "loss": 0.8915, + "step": 539 + }, + { + "epoch": 1.24, + "grad_norm": 1.4079426527023315, + "learning_rate": 6.20807812880655e-05, + "loss": 1.0806, + "step": 540 + }, + { + "epoch": 1.24, + "grad_norm": 1.255631685256958, + "learning_rate": 6.173165676349103e-05, + "loss": 0.7914, + "step": 541 + }, + { + "epoch": 1.24, + "grad_norm": 1.08389413356781, + "learning_rate": 6.138307786137415e-05, + "loss": 0.8829, + "step": 542 + }, + { + "epoch": 1.24, + "grad_norm": 1.3811546564102173, + "learning_rate": 6.103504955168382e-05, + "loss": 1.0245, + "step": 543 + }, + { + "epoch": 1.25, + "grad_norm": 1.1965594291687012, + "learning_rate": 6.068757679653868e-05, + "loss": 1.0113, + "step": 544 + }, + { + "epoch": 1.25, + "grad_norm": 1.3283885717391968, + "learning_rate": 6.034066455013649e-05, + "loss": 0.8461, + "step": 545 + }, + { + "epoch": 1.25, + "grad_norm": 1.3063422441482544, + "learning_rate": 5.999431775868329e-05, + "loss": 0.7606, + "step": 546 + }, + { + "epoch": 1.25, + "grad_norm": 1.1690608263015747, + "learning_rate": 5.9648541360323095e-05, + "loss": 0.9931, + "step": 547 + }, + { + "epoch": 1.26, + "grad_norm": 0.7004899978637695, + "learning_rate": 5.930334028506725e-05, + "loss": 0.2953, + "step": 548 + }, + { + "epoch": 1.26, + "grad_norm": 1.0900754928588867, + "learning_rate": 5.8958719454724346e-05, + "loss": 0.8003, + "step": 549 + }, + { + "epoch": 1.26, + "grad_norm": 1.1421937942504883, + "learning_rate": 5.8614683782829835e-05, + "loss": 0.8811, + "step": 550 + }, + { + "epoch": 1.26, + "grad_norm": 1.823502540588379, + "learning_rate": 5.8271238174576305e-05, + "loss": 1.1777, + "step": 551 + }, + { + "epoch": 1.27, + "grad_norm": 1.483927607536316, + "learning_rate": 5.792838752674309e-05, + "loss": 0.9416, + "step": 552 + }, + { + "epoch": 1.27, + "grad_norm": 1.1196835041046143, + "learning_rate": 5.75861367276269e-05, + "loss": 0.7103, + "step": 553 + }, + { + "epoch": 1.27, + "grad_norm": 1.209674596786499, + "learning_rate": 5.7244490656971815e-05, + "loss": 1.1059, + "step": 554 + }, + { + "epoch": 1.27, + "grad_norm": 1.0897173881530762, + "learning_rate": 5.6903454185899774e-05, + "loss": 0.8877, + "step": 555 + }, + { + "epoch": 1.27, + "grad_norm": 1.4691230058670044, + "learning_rate": 5.6563032176841324e-05, + "loss": 0.8161, + "step": 556 + }, + { + "epoch": 1.28, + "grad_norm": 1.5138698816299438, + "learning_rate": 5.622322948346594e-05, + "loss": 1.0267, + "step": 557 + }, + { + "epoch": 1.28, + "grad_norm": 1.6125860214233398, + "learning_rate": 5.588405095061322e-05, + "loss": 1.1, + "step": 558 + }, + { + "epoch": 1.28, + "grad_norm": 1.2148243188858032, + "learning_rate": 5.55455014142234e-05, + "loss": 0.9657, + "step": 559 + }, + { + "epoch": 1.28, + "grad_norm": 1.327254295349121, + "learning_rate": 5.5207585701268805e-05, + "loss": 1.0631, + "step": 560 + }, + { + "epoch": 1.29, + "grad_norm": 1.1820276975631714, + "learning_rate": 5.4870308629684677e-05, + "loss": 0.6627, + "step": 561 + }, + { + "epoch": 1.29, + "grad_norm": 1.6765048503875732, + "learning_rate": 5.453367500830069e-05, + "loss": 1.0243, + "step": 562 + }, + { + "epoch": 1.29, + "grad_norm": 1.2270108461380005, + "learning_rate": 5.4197689636772334e-05, + "loss": 0.7945, + "step": 563 + }, + { + "epoch": 1.29, + "grad_norm": 0.985974133014679, + "learning_rate": 5.386235730551252e-05, + "loss": 0.7504, + "step": 564 + }, + { + "epoch": 1.3, + "grad_norm": 1.375940203666687, + "learning_rate": 5.3527682795623146e-05, + "loss": 0.9097, + "step": 565 + }, + { + "epoch": 1.3, + "grad_norm": 1.0430001020431519, + "learning_rate": 5.319367087882716e-05, + "loss": 0.9843, + "step": 566 + }, + { + "epoch": 1.3, + "grad_norm": 1.0604748725891113, + "learning_rate": 5.286032631740023e-05, + "loss": 0.6792, + "step": 567 + }, + { + "epoch": 1.3, + "grad_norm": 1.05086088180542, + "learning_rate": 5.252765386410312e-05, + "loss": 0.632, + "step": 568 + }, + { + "epoch": 1.31, + "grad_norm": 1.214124083518982, + "learning_rate": 5.2195658262113814e-05, + "loss": 0.7741, + "step": 569 + }, + { + "epoch": 1.31, + "grad_norm": 1.1835461854934692, + "learning_rate": 5.186434424495979e-05, + "loss": 0.9085, + "step": 570 + }, + { + "epoch": 1.31, + "grad_norm": 1.428858757019043, + "learning_rate": 5.1533716536450693e-05, + "loss": 1.4804, + "step": 571 + }, + { + "epoch": 1.31, + "grad_norm": 1.3231133222579956, + "learning_rate": 5.1203779850610864e-05, + "loss": 0.7928, + "step": 572 + }, + { + "epoch": 1.31, + "grad_norm": 1.2435096502304077, + "learning_rate": 5.087453889161229e-05, + "loss": 0.8173, + "step": 573 + }, + { + "epoch": 1.32, + "grad_norm": 1.05045485496521, + "learning_rate": 5.054599835370724e-05, + "loss": 0.9197, + "step": 574 + }, + { + "epoch": 1.32, + "grad_norm": 1.5020091533660889, + "learning_rate": 5.021816292116175e-05, + "loss": 1.0374, + "step": 575 + }, + { + "epoch": 1.32, + "grad_norm": 1.2051324844360352, + "learning_rate": 4.989103726818836e-05, + "loss": 0.5828, + "step": 576 + }, + { + "epoch": 1.32, + "grad_norm": 1.2995752096176147, + "learning_rate": 4.956462605887994e-05, + "loss": 0.9762, + "step": 577 + }, + { + "epoch": 1.33, + "grad_norm": 1.293330430984497, + "learning_rate": 4.923893394714279e-05, + "loss": 0.851, + "step": 578 + }, + { + "epoch": 1.33, + "grad_norm": 1.4019485712051392, + "learning_rate": 4.891396557663056e-05, + "loss": 0.8867, + "step": 579 + }, + { + "epoch": 1.33, + "grad_norm": 1.1992961168289185, + "learning_rate": 4.8589725580677835e-05, + "loss": 0.8609, + "step": 580 + }, + { + "epoch": 1.33, + "grad_norm": 1.3640302419662476, + "learning_rate": 4.826621858223431e-05, + "loss": 0.8686, + "step": 581 + }, + { + "epoch": 1.34, + "grad_norm": 1.2899017333984375, + "learning_rate": 4.794344919379872e-05, + "loss": 0.7867, + "step": 582 + }, + { + "epoch": 1.34, + "grad_norm": 1.468520164489746, + "learning_rate": 4.762142201735299e-05, + "loss": 0.8161, + "step": 583 + }, + { + "epoch": 1.34, + "grad_norm": 1.1621512174606323, + "learning_rate": 4.730014164429689e-05, + "loss": 0.7486, + "step": 584 + }, + { + "epoch": 1.34, + "grad_norm": 1.599510669708252, + "learning_rate": 4.697961265538231e-05, + "loss": 0.7786, + "step": 585 + }, + { + "epoch": 1.35, + "grad_norm": 1.5685900449752808, + "learning_rate": 4.6659839620648074e-05, + "loss": 0.9098, + "step": 586 + }, + { + "epoch": 1.35, + "grad_norm": 1.2052661180496216, + "learning_rate": 4.634082709935473e-05, + "loss": 0.845, + "step": 587 + }, + { + "epoch": 1.35, + "grad_norm": 1.2109522819519043, + "learning_rate": 4.6022579639919695e-05, + "loss": 0.7274, + "step": 588 + }, + { + "epoch": 1.35, + "grad_norm": 1.354245662689209, + "learning_rate": 4.5705101779852135e-05, + "loss": 0.9229, + "step": 589 + }, + { + "epoch": 1.35, + "grad_norm": 1.398025631904602, + "learning_rate": 4.5388398045688566e-05, + "loss": 0.7834, + "step": 590 + }, + { + "epoch": 1.36, + "grad_norm": 1.30653977394104, + "learning_rate": 4.507247295292801e-05, + "loss": 0.9327, + "step": 591 + }, + { + "epoch": 1.36, + "grad_norm": 0.9323519468307495, + "learning_rate": 4.475733100596795e-05, + "loss": 0.5555, + "step": 592 + }, + { + "epoch": 1.36, + "grad_norm": 1.2603791952133179, + "learning_rate": 4.444297669803981e-05, + "loss": 0.8119, + "step": 593 + }, + { + "epoch": 1.36, + "grad_norm": 1.2332367897033691, + "learning_rate": 4.412941451114498e-05, + "loss": 0.8452, + "step": 594 + }, + { + "epoch": 1.37, + "grad_norm": 1.3796523809432983, + "learning_rate": 4.381664891599111e-05, + "loss": 1.0362, + "step": 595 + }, + { + "epoch": 1.37, + "grad_norm": 1.1836915016174316, + "learning_rate": 4.3504684371928006e-05, + "loss": 0.7377, + "step": 596 + }, + { + "epoch": 1.37, + "grad_norm": 1.4385707378387451, + "learning_rate": 4.3193525326884435e-05, + "loss": 0.9733, + "step": 597 + }, + { + "epoch": 1.37, + "grad_norm": 1.4326701164245605, + "learning_rate": 4.288317621730434e-05, + "loss": 0.8767, + "step": 598 + }, + { + "epoch": 1.38, + "grad_norm": 1.1379212141036987, + "learning_rate": 4.257364146808393e-05, + "loss": 0.8179, + "step": 599 + }, + { + "epoch": 1.38, + "grad_norm": 1.343801498413086, + "learning_rate": 4.226492549250829e-05, + "loss": 0.7713, + "step": 600 + }, + { + "epoch": 1.38, + "grad_norm": 1.1334872245788574, + "learning_rate": 4.195703269218868e-05, + "loss": 0.9144, + "step": 601 + }, + { + "epoch": 1.38, + "grad_norm": 1.1683942079544067, + "learning_rate": 4.164996745699966e-05, + "loss": 0.8409, + "step": 602 + }, + { + "epoch": 1.38, + "grad_norm": 1.074876308441162, + "learning_rate": 4.1343734165016514e-05, + "loss": 0.5677, + "step": 603 + }, + { + "epoch": 1.39, + "grad_norm": 1.4278324842453003, + "learning_rate": 4.1038337182452826e-05, + "loss": 0.8643, + "step": 604 + }, + { + "epoch": 1.39, + "grad_norm": 1.0636829137802124, + "learning_rate": 4.0733780863598335e-05, + "loss": 0.7189, + "step": 605 + }, + { + "epoch": 1.39, + "grad_norm": 1.238641381263733, + "learning_rate": 4.0430069550756665e-05, + "loss": 0.8048, + "step": 606 + }, + { + "epoch": 1.39, + "grad_norm": 1.2493727207183838, + "learning_rate": 4.012720757418358e-05, + "loss": 0.6849, + "step": 607 + }, + { + "epoch": 1.4, + "grad_norm": 1.04925537109375, + "learning_rate": 3.9825199252025184e-05, + "loss": 0.6783, + "step": 608 + }, + { + "epoch": 1.4, + "grad_norm": 1.0140671730041504, + "learning_rate": 3.952404889025626e-05, + "loss": 0.6275, + "step": 609 + }, + { + "epoch": 1.4, + "grad_norm": 1.292979121208191, + "learning_rate": 3.9223760782619045e-05, + "loss": 0.6685, + "step": 610 + }, + { + "epoch": 1.4, + "grad_norm": 1.355290174484253, + "learning_rate": 3.8924339210561836e-05, + "loss": 0.7877, + "step": 611 + }, + { + "epoch": 1.41, + "grad_norm": 1.2331466674804688, + "learning_rate": 3.862578844317817e-05, + "loss": 0.8935, + "step": 612 + }, + { + "epoch": 1.41, + "grad_norm": 1.4775677919387817, + "learning_rate": 3.832811273714569e-05, + "loss": 0.8254, + "step": 613 + }, + { + "epoch": 1.41, + "grad_norm": 1.1989951133728027, + "learning_rate": 3.803131633666572e-05, + "loss": 0.7965, + "step": 614 + }, + { + "epoch": 1.41, + "grad_norm": 1.2056313753128052, + "learning_rate": 3.773540347340248e-05, + "loss": 1.2453, + "step": 615 + }, + { + "epoch": 1.42, + "grad_norm": 0.9701207876205444, + "learning_rate": 3.7440378366423e-05, + "loss": 0.5302, + "step": 616 + }, + { + "epoch": 1.42, + "grad_norm": 1.1523936986923218, + "learning_rate": 3.714624522213681e-05, + "loss": 0.6319, + "step": 617 + }, + { + "epoch": 1.42, + "grad_norm": 1.7994964122772217, + "learning_rate": 3.685300823423602e-05, + "loss": 0.8799, + "step": 618 + }, + { + "epoch": 1.42, + "grad_norm": 1.305521011352539, + "learning_rate": 3.6560671583635467e-05, + "loss": 0.9071, + "step": 619 + }, + { + "epoch": 1.42, + "grad_norm": 1.4707729816436768, + "learning_rate": 3.626923943841325e-05, + "loss": 1.0193, + "step": 620 + }, + { + "epoch": 1.43, + "grad_norm": 1.5845611095428467, + "learning_rate": 3.597871595375121e-05, + "loss": 0.9118, + "step": 621 + }, + { + "epoch": 1.43, + "grad_norm": 1.0622698068618774, + "learning_rate": 3.5689105271875564e-05, + "loss": 0.8397, + "step": 622 + }, + { + "epoch": 1.43, + "grad_norm": 1.2688456773757935, + "learning_rate": 3.5400411521998126e-05, + "loss": 0.9132, + "step": 623 + }, + { + "epoch": 1.43, + "grad_norm": 1.2782020568847656, + "learning_rate": 3.5112638820257115e-05, + "loss": 0.5659, + "step": 624 + }, + { + "epoch": 1.44, + "grad_norm": 1.4183655977249146, + "learning_rate": 3.482579126965878e-05, + "loss": 0.5982, + "step": 625 + }, + { + "epoch": 1.44, + "grad_norm": 1.279589056968689, + "learning_rate": 3.453987296001866e-05, + "loss": 0.6599, + "step": 626 + }, + { + "epoch": 1.44, + "grad_norm": 1.5479413270950317, + "learning_rate": 3.425488796790337e-05, + "loss": 0.738, + "step": 627 + }, + { + "epoch": 1.44, + "grad_norm": 1.507043719291687, + "learning_rate": 3.397084035657243e-05, + "loss": 0.7308, + "step": 628 + }, + { + "epoch": 1.45, + "grad_norm": 1.5540934801101685, + "learning_rate": 3.36877341759205e-05, + "loss": 0.9896, + "step": 629 + }, + { + "epoch": 1.45, + "grad_norm": 1.198010802268982, + "learning_rate": 3.340557346241936e-05, + "loss": 0.9595, + "step": 630 + }, + { + "epoch": 1.45, + "grad_norm": 1.4387493133544922, + "learning_rate": 3.312436223906062e-05, + "loss": 0.9932, + "step": 631 + }, + { + "epoch": 1.45, + "grad_norm": 1.4757272005081177, + "learning_rate": 3.2844104515298155e-05, + "loss": 0.796, + "step": 632 + }, + { + "epoch": 1.46, + "grad_norm": 1.2666937112808228, + "learning_rate": 3.2564804286991135e-05, + "loss": 0.6709, + "step": 633 + }, + { + "epoch": 1.46, + "grad_norm": 1.187326192855835, + "learning_rate": 3.2286465536346854e-05, + "loss": 0.8126, + "step": 634 + }, + { + "epoch": 1.46, + "grad_norm": 1.434696912765503, + "learning_rate": 3.2009092231864044e-05, + "loss": 0.7374, + "step": 635 + }, + { + "epoch": 1.46, + "grad_norm": 1.5462465286254883, + "learning_rate": 3.173268832827643e-05, + "loss": 0.8611, + "step": 636 + }, + { + "epoch": 1.46, + "grad_norm": 1.5220853090286255, + "learning_rate": 3.1457257766496015e-05, + "loss": 1.0394, + "step": 637 + }, + { + "epoch": 1.47, + "grad_norm": 1.2815146446228027, + "learning_rate": 3.118280447355729e-05, + "loss": 1.0586, + "step": 638 + }, + { + "epoch": 1.47, + "grad_norm": 1.4531899690628052, + "learning_rate": 3.090933236256087e-05, + "loss": 0.8858, + "step": 639 + }, + { + "epoch": 1.47, + "grad_norm": 0.7821406126022339, + "learning_rate": 3.0636845332617994e-05, + "loss": 0.4845, + "step": 640 + }, + { + "epoch": 1.47, + "grad_norm": 1.171593189239502, + "learning_rate": 3.036534726879473e-05, + "loss": 0.6783, + "step": 641 + }, + { + "epoch": 1.48, + "grad_norm": 0.9787347316741943, + "learning_rate": 3.0094842042056704e-05, + "loss": 0.7622, + "step": 642 + }, + { + "epoch": 1.48, + "eval_loss": 1.1426820755004883, + "eval_runtime": 4.8034, + "eval_samples_per_second": 20.819, + "eval_steps_per_second": 20.819, + "step": 642 + }, + { + "epoch": 1.48, + "grad_norm": 2.1937968730926514, + "learning_rate": 2.9825333509213827e-05, + "loss": 0.8779, + "step": 643 + }, + { + "epoch": 1.48, + "grad_norm": 0.9966670274734497, + "learning_rate": 2.9556825512865415e-05, + "loss": 0.9429, + "step": 644 + }, + { + "epoch": 1.48, + "grad_norm": 1.0608552694320679, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.4471, + "step": 645 + }, + { + "epoch": 1.49, + "grad_norm": 1.5099259614944458, + "learning_rate": 2.902282642866716e-05, + "loss": 0.9748, + "step": 646 + }, + { + "epoch": 1.49, + "grad_norm": 1.24240243434906, + "learning_rate": 2.8757342954470533e-05, + "loss": 0.7204, + "step": 647 + }, + { + "epoch": 1.49, + "grad_norm": 1.631426215171814, + "learning_rate": 2.849287524396611e-05, + "loss": 1.0639, + "step": 648 + }, + { + "epoch": 1.49, + "grad_norm": 0.9880529642105103, + "learning_rate": 2.8229427067882164e-05, + "loss": 0.7009, + "step": 649 + }, + { + "epoch": 1.5, + "grad_norm": 1.366929531097412, + "learning_rate": 2.7967002182410596e-05, + "loss": 0.7605, + "step": 650 + }, + { + "epoch": 1.5, + "grad_norm": 1.3599152565002441, + "learning_rate": 2.7705604329153434e-05, + "loss": 0.9786, + "step": 651 + }, + { + "epoch": 1.5, + "grad_norm": 1.316638708114624, + "learning_rate": 2.7445237235069455e-05, + "loss": 1.0621, + "step": 652 + }, + { + "epoch": 1.5, + "grad_norm": 2.131920099258423, + "learning_rate": 2.7185904612421176e-05, + "loss": 0.4658, + "step": 653 + }, + { + "epoch": 1.5, + "grad_norm": 1.4175996780395508, + "learning_rate": 2.6927610158721706e-05, + "loss": 1.1467, + "step": 654 + }, + { + "epoch": 1.51, + "grad_norm": 1.041159987449646, + "learning_rate": 2.6670357556682247e-05, + "loss": 0.7331, + "step": 655 + }, + { + "epoch": 1.51, + "grad_norm": 1.5495882034301758, + "learning_rate": 2.6414150474159403e-05, + "loss": 1.0374, + "step": 656 + }, + { + "epoch": 1.51, + "grad_norm": 1.3431997299194336, + "learning_rate": 2.6158992564103058e-05, + "loss": 0.8734, + "step": 657 + }, + { + "epoch": 1.51, + "grad_norm": 1.2934917211532593, + "learning_rate": 2.5904887464504114e-05, + "loss": 0.7493, + "step": 658 + }, + { + "epoch": 1.52, + "grad_norm": 1.2499974966049194, + "learning_rate": 2.565183879834272e-05, + "loss": 0.6417, + "step": 659 + }, + { + "epoch": 1.52, + "grad_norm": 1.0983362197875977, + "learning_rate": 2.53998501735367e-05, + "loss": 0.618, + "step": 660 + }, + { + "epoch": 1.52, + "grad_norm": 1.2114965915679932, + "learning_rate": 2.514892518288988e-05, + "loss": 0.7808, + "step": 661 + }, + { + "epoch": 1.52, + "grad_norm": 1.2864103317260742, + "learning_rate": 2.4899067404041153e-05, + "loss": 0.8773, + "step": 662 + }, + { + "epoch": 1.53, + "grad_norm": 1.3944170475006104, + "learning_rate": 2.465028039941316e-05, + "loss": 0.9462, + "step": 663 + }, + { + "epoch": 1.53, + "grad_norm": 1.3994008302688599, + "learning_rate": 2.4402567716161805e-05, + "loss": 0.9313, + "step": 664 + }, + { + "epoch": 1.53, + "grad_norm": 1.3160896301269531, + "learning_rate": 2.415593288612541e-05, + "loss": 0.724, + "step": 665 + }, + { + "epoch": 1.53, + "grad_norm": 1.348981261253357, + "learning_rate": 2.391037942577454e-05, + "loss": 0.9646, + "step": 666 + }, + { + "epoch": 1.54, + "grad_norm": 1.1432273387908936, + "learning_rate": 2.3665910836161775e-05, + "loss": 0.6094, + "step": 667 + }, + { + "epoch": 1.54, + "grad_norm": 1.5914933681488037, + "learning_rate": 2.3422530602871872e-05, + "loss": 0.8079, + "step": 668 + }, + { + "epoch": 1.54, + "grad_norm": 1.3685251474380493, + "learning_rate": 2.318024219597196e-05, + "loss": 0.6591, + "step": 669 + }, + { + "epoch": 1.54, + "grad_norm": 1.1842666864395142, + "learning_rate": 2.2939049069962183e-05, + "loss": 0.8079, + "step": 670 + }, + { + "epoch": 1.54, + "grad_norm": 1.3472460508346558, + "learning_rate": 2.26989546637263e-05, + "loss": 0.7606, + "step": 671 + }, + { + "epoch": 1.55, + "grad_norm": 1.358242392539978, + "learning_rate": 2.2459962400482847e-05, + "loss": 0.7715, + "step": 672 + }, + { + "epoch": 1.55, + "grad_norm": 1.2322027683258057, + "learning_rate": 2.2222075687736187e-05, + "loss": 0.8517, + "step": 673 + }, + { + "epoch": 1.55, + "grad_norm": 1.5333589315414429, + "learning_rate": 2.198529791722792e-05, + "loss": 0.6613, + "step": 674 + }, + { + "epoch": 1.55, + "grad_norm": 1.2892917394638062, + "learning_rate": 2.1749632464888592e-05, + "loss": 0.8702, + "step": 675 + }, + { + "epoch": 1.56, + "grad_norm": 1.505099892616272, + "learning_rate": 2.1515082690789535e-05, + "loss": 0.8878, + "step": 676 + }, + { + "epoch": 1.56, + "grad_norm": 1.266660213470459, + "learning_rate": 2.1281651939094992e-05, + "loss": 1.0645, + "step": 677 + }, + { + "epoch": 1.56, + "grad_norm": 1.201398491859436, + "learning_rate": 2.1049343538014355e-05, + "loss": 0.9351, + "step": 678 + }, + { + "epoch": 1.56, + "grad_norm": 0.9400334358215332, + "learning_rate": 2.0818160799754828e-05, + "loss": 0.4052, + "step": 679 + }, + { + "epoch": 1.57, + "grad_norm": 1.2123039960861206, + "learning_rate": 2.0588107020474056e-05, + "loss": 0.6165, + "step": 680 + }, + { + "epoch": 1.57, + "grad_norm": 1.3787363767623901, + "learning_rate": 2.03591854802333e-05, + "loss": 0.8574, + "step": 681 + }, + { + "epoch": 1.57, + "grad_norm": 1.328092098236084, + "learning_rate": 2.0131399442950505e-05, + "loss": 0.7751, + "step": 682 + }, + { + "epoch": 1.57, + "grad_norm": 2.2625534534454346, + "learning_rate": 1.9904752156353878e-05, + "loss": 1.1871, + "step": 683 + }, + { + "epoch": 1.58, + "grad_norm": 1.0272362232208252, + "learning_rate": 1.967924685193552e-05, + "loss": 0.5135, + "step": 684 + }, + { + "epoch": 1.58, + "grad_norm": 1.3563427925109863, + "learning_rate": 1.94548867449054e-05, + "loss": 0.9451, + "step": 685 + }, + { + "epoch": 1.58, + "grad_norm": 1.4345487356185913, + "learning_rate": 1.9231675034145513e-05, + "loss": 0.8185, + "step": 686 + }, + { + "epoch": 1.58, + "grad_norm": 1.4716142416000366, + "learning_rate": 1.9009614902164174e-05, + "loss": 0.8666, + "step": 687 + }, + { + "epoch": 1.58, + "grad_norm": 2.2574665546417236, + "learning_rate": 1.8788709515050808e-05, + "loss": 1.1538, + "step": 688 + }, + { + "epoch": 1.59, + "grad_norm": 1.2094467878341675, + "learning_rate": 1.8568962022430636e-05, + "loss": 0.9113, + "step": 689 + }, + { + "epoch": 1.59, + "grad_norm": 1.2264611721038818, + "learning_rate": 1.8350375557419875e-05, + "loss": 0.693, + "step": 690 + }, + { + "epoch": 1.59, + "grad_norm": 1.440259337425232, + "learning_rate": 1.813295323658103e-05, + "loss": 0.702, + "step": 691 + }, + { + "epoch": 1.59, + "grad_norm": 1.0737701654434204, + "learning_rate": 1.791669815987852e-05, + "loss": 0.581, + "step": 692 + }, + { + "epoch": 1.6, + "grad_norm": 1.3297679424285889, + "learning_rate": 1.7701613410634365e-05, + "loss": 0.7501, + "step": 693 + }, + { + "epoch": 1.6, + "grad_norm": 1.1620917320251465, + "learning_rate": 1.7487702055484345e-05, + "loss": 0.8193, + "step": 694 + }, + { + "epoch": 1.6, + "grad_norm": 2.063955068588257, + "learning_rate": 1.7274967144334153e-05, + "loss": 1.0735, + "step": 695 + }, + { + "epoch": 1.6, + "grad_norm": 1.1501412391662598, + "learning_rate": 1.7063411710316046e-05, + "loss": 0.5951, + "step": 696 + }, + { + "epoch": 1.61, + "grad_norm": 1.2334132194519043, + "learning_rate": 1.6853038769745467e-05, + "loss": 0.7601, + "step": 697 + }, + { + "epoch": 1.61, + "grad_norm": 1.8308238983154297, + "learning_rate": 1.6643851322078174e-05, + "loss": 1.0234, + "step": 698 + }, + { + "epoch": 1.61, + "grad_norm": 1.6979902982711792, + "learning_rate": 1.643585234986733e-05, + "loss": 1.0344, + "step": 699 + }, + { + "epoch": 1.61, + "grad_norm": 1.1924549341201782, + "learning_rate": 1.622904481872106e-05, + "loss": 0.7712, + "step": 700 + }, + { + "epoch": 1.62, + "grad_norm": 1.3908418416976929, + "learning_rate": 1.6023431677260214e-05, + "loss": 0.7998, + "step": 701 + }, + { + "epoch": 1.62, + "grad_norm": 1.5069186687469482, + "learning_rate": 1.5819015857076213e-05, + "loss": 1.0222, + "step": 702 + }, + { + "epoch": 1.62, + "grad_norm": 1.5676149129867554, + "learning_rate": 1.5615800272689352e-05, + "loss": 1.0573, + "step": 703 + }, + { + "epoch": 1.62, + "grad_norm": 1.4918407201766968, + "learning_rate": 1.541378782150714e-05, + "loss": 0.7947, + "step": 704 + }, + { + "epoch": 1.62, + "grad_norm": 1.4949313402175903, + "learning_rate": 1.5212981383783154e-05, + "loss": 0.9044, + "step": 705 + }, + { + "epoch": 1.63, + "grad_norm": 1.4046580791473389, + "learning_rate": 1.5013383822575766e-05, + "loss": 0.976, + "step": 706 + }, + { + "epoch": 1.63, + "grad_norm": 1.247767448425293, + "learning_rate": 1.4814997983707458e-05, + "loss": 0.8356, + "step": 707 + }, + { + "epoch": 1.63, + "grad_norm": 1.4620435237884521, + "learning_rate": 1.4617826695724223e-05, + "loss": 0.9709, + "step": 708 + }, + { + "epoch": 1.63, + "grad_norm": 1.5974092483520508, + "learning_rate": 1.442187276985526e-05, + "loss": 0.7993, + "step": 709 + }, + { + "epoch": 1.64, + "grad_norm": 1.4362558126449585, + "learning_rate": 1.42271389999728e-05, + "loss": 0.8003, + "step": 710 + }, + { + "epoch": 1.64, + "grad_norm": 1.4817911386489868, + "learning_rate": 1.4033628162552359e-05, + "loss": 0.7285, + "step": 711 + }, + { + "epoch": 1.64, + "grad_norm": 1.3569756746292114, + "learning_rate": 1.3841343016633167e-05, + "loss": 0.6743, + "step": 712 + }, + { + "epoch": 1.64, + "grad_norm": 1.1255030632019043, + "learning_rate": 1.3650286303778714e-05, + "loss": 0.9471, + "step": 713 + }, + { + "epoch": 1.65, + "grad_norm": 1.5939558744430542, + "learning_rate": 1.3460460748037774e-05, + "loss": 0.8036, + "step": 714 + }, + { + "epoch": 1.65, + "grad_norm": 1.3757383823394775, + "learning_rate": 1.3271869055905495e-05, + "loss": 0.8478, + "step": 715 + }, + { + "epoch": 1.65, + "grad_norm": 1.6052110195159912, + "learning_rate": 1.3084513916284913e-05, + "loss": 0.902, + "step": 716 + }, + { + "epoch": 1.65, + "grad_norm": 2.1484215259552, + "learning_rate": 1.2898398000448443e-05, + "loss": 1.0986, + "step": 717 + }, + { + "epoch": 1.65, + "grad_norm": 0.9963454008102417, + "learning_rate": 1.2713523961999996e-05, + "loss": 0.5391, + "step": 718 + }, + { + "epoch": 1.66, + "grad_norm": 1.68670654296875, + "learning_rate": 1.2529894436836965e-05, + "loss": 0.9895, + "step": 719 + }, + { + "epoch": 1.66, + "grad_norm": 1.4046229124069214, + "learning_rate": 1.2347512043112752e-05, + "loss": 0.9313, + "step": 720 + }, + { + "epoch": 1.66, + "grad_norm": 2.2879252433776855, + "learning_rate": 1.2166379381199423e-05, + "loss": 0.84, + "step": 721 + }, + { + "epoch": 1.66, + "grad_norm": 1.3971514701843262, + "learning_rate": 1.1986499033650556e-05, + "loss": 0.8177, + "step": 722 + }, + { + "epoch": 1.67, + "grad_norm": 1.1414539813995361, + "learning_rate": 1.1807873565164506e-05, + "loss": 0.6581, + "step": 723 + }, + { + "epoch": 1.67, + "grad_norm": 1.2839149236679077, + "learning_rate": 1.1630505522547853e-05, + "loss": 1.0131, + "step": 724 + }, + { + "epoch": 1.67, + "grad_norm": 1.4500195980072021, + "learning_rate": 1.1454397434679021e-05, + "loss": 1.0473, + "step": 725 + }, + { + "epoch": 1.67, + "grad_norm": 1.769698977470398, + "learning_rate": 1.12795518124722e-05, + "loss": 1.1283, + "step": 726 + }, + { + "epoch": 1.68, + "grad_norm": 1.6734747886657715, + "learning_rate": 1.11059711488417e-05, + "loss": 1.0084, + "step": 727 + }, + { + "epoch": 1.68, + "grad_norm": 1.5987813472747803, + "learning_rate": 1.0933657918666174e-05, + "loss": 1.0561, + "step": 728 + }, + { + "epoch": 1.68, + "grad_norm": 1.2631326913833618, + "learning_rate": 1.0762614578753572e-05, + "loss": 0.9926, + "step": 729 + }, + { + "epoch": 1.68, + "grad_norm": 1.380119800567627, + "learning_rate": 1.0592843567805943e-05, + "loss": 0.9681, + "step": 730 + }, + { + "epoch": 1.69, + "grad_norm": 1.2174406051635742, + "learning_rate": 1.0424347306384729e-05, + "loss": 0.6559, + "step": 731 + }, + { + "epoch": 1.69, + "grad_norm": 1.7756376266479492, + "learning_rate": 1.025712819687623e-05, + "loss": 1.1937, + "step": 732 + }, + { + "epoch": 1.69, + "grad_norm": 1.4800599813461304, + "learning_rate": 1.0091188623457415e-05, + "loss": 0.7639, + "step": 733 + }, + { + "epoch": 1.69, + "grad_norm": 1.4891732931137085, + "learning_rate": 9.92653095206183e-06, + "loss": 0.7647, + "step": 734 + }, + { + "epoch": 1.69, + "grad_norm": 1.2692780494689941, + "learning_rate": 9.763157530345957e-06, + "loss": 0.8445, + "step": 735 + }, + { + "epoch": 1.7, + "grad_norm": 1.459630012512207, + "learning_rate": 9.601070687655667e-06, + "loss": 1.0512, + "step": 736 + }, + { + "epoch": 1.7, + "grad_norm": 1.3034725189208984, + "learning_rate": 9.440272734993072e-06, + "loss": 0.8654, + "step": 737 + }, + { + "epoch": 1.7, + "grad_norm": 1.2299538850784302, + "learning_rate": 9.280765964983529e-06, + "loss": 0.6091, + "step": 738 + }, + { + "epoch": 1.7, + "grad_norm": 1.5857917070388794, + "learning_rate": 9.12255265184293e-06, + "loss": 0.8166, + "step": 739 + }, + { + "epoch": 1.71, + "grad_norm": 1.3110122680664062, + "learning_rate": 8.965635051345411e-06, + "loss": 0.8791, + "step": 740 + }, + { + "epoch": 1.71, + "grad_norm": 1.4379709959030151, + "learning_rate": 8.810015400790994e-06, + "loss": 0.9836, + "step": 741 + }, + { + "epoch": 1.71, + "grad_norm": 1.272002100944519, + "learning_rate": 8.655695918973862e-06, + "loss": 1.0156, + "step": 742 + }, + { + "epoch": 1.71, + "grad_norm": 1.7516756057739258, + "learning_rate": 8.502678806150588e-06, + "loss": 0.9863, + "step": 743 + }, + { + "epoch": 1.72, + "grad_norm": 1.4102840423583984, + "learning_rate": 8.350966244008895e-06, + "loss": 0.9151, + "step": 744 + }, + { + "epoch": 1.72, + "grad_norm": 1.1626827716827393, + "learning_rate": 8.200560395636414e-06, + "loss": 0.8854, + "step": 745 + }, + { + "epoch": 1.72, + "grad_norm": 1.5321221351623535, + "learning_rate": 8.051463405489957e-06, + "loss": 0.7943, + "step": 746 + }, + { + "epoch": 1.72, + "grad_norm": 1.5110939741134644, + "learning_rate": 7.90367739936484e-06, + "loss": 0.8067, + "step": 747 + }, + { + "epoch": 1.73, + "grad_norm": 1.273176908493042, + "learning_rate": 7.7572044843647e-06, + "loss": 1.1937, + "step": 748 + }, + { + "epoch": 1.73, + "grad_norm": 1.2125933170318604, + "learning_rate": 7.612046748871327e-06, + "loss": 1.8904, + "step": 749 + }, + { + "epoch": 1.73, + "eval_loss": 1.1425327062606812, + "eval_runtime": 5.0157, + "eval_samples_per_second": 19.937, + "eval_steps_per_second": 19.937, + "step": 749 + }, + { + "epoch": 1.73, + "grad_norm": 1.541787028312683, + "learning_rate": 7.4682062625149655e-06, + "loss": 0.9905, + "step": 750 + }, + { + "epoch": 1.73, + "grad_norm": 1.5736668109893799, + "learning_rate": 7.325685076144795e-06, + "loss": 1.0864, + "step": 751 + }, + { + "epoch": 1.73, + "grad_norm": 1.21610426902771, + "learning_rate": 7.1844852217996305e-06, + "loss": 0.7444, + "step": 752 + }, + { + "epoch": 1.74, + "grad_norm": 2.0105512142181396, + "learning_rate": 7.0446087126790575e-06, + "loss": 0.8589, + "step": 753 + }, + { + "epoch": 1.74, + "grad_norm": 1.427541732788086, + "learning_rate": 6.906057543114619e-06, + "loss": 1.0143, + "step": 754 + }, + { + "epoch": 1.74, + "grad_norm": 1.9788390398025513, + "learning_rate": 6.768833688541443e-06, + "loss": 1.2519, + "step": 755 + }, + { + "epoch": 1.74, + "grad_norm": 1.646355152130127, + "learning_rate": 6.632939105470049e-06, + "loss": 1.1151, + "step": 756 + }, + { + "epoch": 1.75, + "grad_norm": 1.3599865436553955, + "learning_rate": 6.498375731458528e-06, + "loss": 0.9262, + "step": 757 + }, + { + "epoch": 1.75, + "grad_norm": 1.618510365486145, + "learning_rate": 6.365145485084767e-06, + "loss": 0.7766, + "step": 758 + }, + { + "epoch": 1.75, + "grad_norm": 1.6023226976394653, + "learning_rate": 6.233250265919266e-06, + "loss": 0.9066, + "step": 759 + }, + { + "epoch": 1.75, + "grad_norm": 1.3686883449554443, + "learning_rate": 6.102691954497907e-06, + "loss": 1.0172, + "step": 760 + }, + { + "epoch": 1.76, + "grad_norm": 1.1869922876358032, + "learning_rate": 5.973472412295255e-06, + "loss": 0.6924, + "step": 761 + }, + { + "epoch": 1.76, + "grad_norm": 1.10429847240448, + "learning_rate": 5.8455934816979305e-06, + "loss": 0.768, + "step": 762 + }, + { + "epoch": 1.76, + "grad_norm": 1.2119660377502441, + "learning_rate": 5.719056985978388e-06, + "loss": 0.6629, + "step": 763 + }, + { + "epoch": 1.76, + "grad_norm": 1.4681169986724854, + "learning_rate": 5.593864729268949e-06, + "loss": 0.6561, + "step": 764 + }, + { + "epoch": 1.77, + "grad_norm": 1.4000868797302246, + "learning_rate": 5.470018496535967e-06, + "loss": 0.8486, + "step": 765 + }, + { + "epoch": 1.77, + "grad_norm": 1.7486320734024048, + "learning_rate": 5.347520053554545e-06, + "loss": 1.0676, + "step": 766 + }, + { + "epoch": 1.77, + "grad_norm": 1.2215529680252075, + "learning_rate": 5.22637114688318e-06, + "loss": 0.8262, + "step": 767 + }, + { + "epoch": 1.77, + "grad_norm": 1.3720446825027466, + "learning_rate": 5.106573503839018e-06, + "loss": 0.8284, + "step": 768 + }, + { + "epoch": 1.77, + "grad_norm": 1.42399001121521, + "learning_rate": 4.9881288324731045e-06, + "loss": 0.9733, + "step": 769 + }, + { + "epoch": 1.78, + "grad_norm": 1.806998610496521, + "learning_rate": 4.871038821546103e-06, + "loss": 0.87, + "step": 770 + }, + { + "epoch": 1.78, + "grad_norm": 0.7265794277191162, + "learning_rate": 4.755305140504185e-06, + "loss": 0.3255, + "step": 771 + }, + { + "epoch": 1.78, + "grad_norm": 1.4420627355575562, + "learning_rate": 4.640929439455277e-06, + "loss": 1.0153, + "step": 772 + }, + { + "epoch": 1.78, + "grad_norm": 1.6430668830871582, + "learning_rate": 4.527913349145441e-06, + "loss": 0.9099, + "step": 773 + }, + { + "epoch": 1.79, + "grad_norm": 1.2274441719055176, + "learning_rate": 4.416258480935731e-06, + "loss": 0.8594, + "step": 774 + }, + { + "epoch": 1.79, + "grad_norm": 1.4887783527374268, + "learning_rate": 4.305966426779118e-06, + "loss": 0.808, + "step": 775 + }, + { + "epoch": 1.79, + "grad_norm": 1.1768397092819214, + "learning_rate": 4.197038759197869e-06, + "loss": 0.561, + "step": 776 + }, + { + "epoch": 1.79, + "grad_norm": 1.266448974609375, + "learning_rate": 4.089477031261113e-06, + "loss": 0.9325, + "step": 777 + }, + { + "epoch": 1.8, + "grad_norm": 1.5394303798675537, + "learning_rate": 3.9832827765626465e-06, + "loss": 0.7159, + "step": 778 + }, + { + "epoch": 1.8, + "grad_norm": 1.482283115386963, + "learning_rate": 3.878457509199107e-06, + "loss": 1.103, + "step": 779 + }, + { + "epoch": 1.8, + "grad_norm": 1.1145195960998535, + "learning_rate": 3.7750027237484e-06, + "loss": 1.0283, + "step": 780 + }, + { + "epoch": 1.8, + "grad_norm": 1.5757018327713013, + "learning_rate": 3.6729198952483724e-06, + "loss": 1.162, + "step": 781 + }, + { + "epoch": 1.81, + "grad_norm": 1.555261492729187, + "learning_rate": 3.572210479175753e-06, + "loss": 0.9346, + "step": 782 + }, + { + "epoch": 1.81, + "grad_norm": 1.5401426553726196, + "learning_rate": 3.472875911425477e-06, + "loss": 1.0614, + "step": 783 + }, + { + "epoch": 1.81, + "grad_norm": 1.4009335041046143, + "learning_rate": 3.3749176082901067e-06, + "loss": 0.9636, + "step": 784 + }, + { + "epoch": 1.81, + "grad_norm": 1.5251446962356567, + "learning_rate": 3.2783369664397436e-06, + "loss": 0.7538, + "step": 785 + }, + { + "epoch": 1.81, + "grad_norm": 1.2820510864257812, + "learning_rate": 3.1831353629020344e-06, + "loss": 0.8404, + "step": 786 + }, + { + "epoch": 1.82, + "grad_norm": 1.2701919078826904, + "learning_rate": 3.0893141550425884e-06, + "loss": 0.7659, + "step": 787 + }, + { + "epoch": 1.82, + "grad_norm": 1.619142770767212, + "learning_rate": 2.996874680545603e-06, + "loss": 0.7558, + "step": 788 + }, + { + "epoch": 1.82, + "grad_norm": 1.825161099433899, + "learning_rate": 2.905818257394799e-06, + "loss": 1.0628, + "step": 789 + }, + { + "epoch": 1.82, + "grad_norm": 1.2984554767608643, + "learning_rate": 2.8161461838546176e-06, + "loss": 0.8484, + "step": 790 + }, + { + "epoch": 1.83, + "grad_norm": 1.8327722549438477, + "learning_rate": 2.7278597384517214e-06, + "loss": 0.8366, + "step": 791 + }, + { + "epoch": 1.83, + "grad_norm": 1.1649681329727173, + "learning_rate": 2.6409601799567642e-06, + "loss": 0.7479, + "step": 792 + }, + { + "epoch": 1.83, + "grad_norm": 1.212082862854004, + "learning_rate": 2.55544874736644e-06, + "loss": 1.2608, + "step": 793 + }, + { + "epoch": 1.83, + "grad_norm": 1.675178050994873, + "learning_rate": 2.4713266598858086e-06, + "loss": 0.7571, + "step": 794 + }, + { + "epoch": 1.84, + "grad_norm": 1.349767804145813, + "learning_rate": 2.3885951169109187e-06, + "loss": 0.6354, + "step": 795 + }, + { + "epoch": 1.84, + "grad_norm": 1.0622371435165405, + "learning_rate": 2.3072552980117566e-06, + "loss": 0.5172, + "step": 796 + }, + { + "epoch": 1.84, + "grad_norm": 1.2005047798156738, + "learning_rate": 2.2273083629153147e-06, + "loss": 1.1696, + "step": 797 + }, + { + "epoch": 1.84, + "grad_norm": 1.3214634656906128, + "learning_rate": 2.1487554514891704e-06, + "loss": 0.9274, + "step": 798 + }, + { + "epoch": 1.85, + "grad_norm": 1.359706163406372, + "learning_rate": 2.071597683725179e-06, + "loss": 0.7433, + "step": 799 + }, + { + "epoch": 1.85, + "grad_norm": 1.3095366954803467, + "learning_rate": 1.9958361597235076e-06, + "loss": 0.9496, + "step": 800 + }, + { + "epoch": 1.85, + "grad_norm": 1.5138649940490723, + "learning_rate": 1.921471959676957e-06, + "loss": 1.0328, + "step": 801 + }, + { + "epoch": 1.85, + "grad_norm": 1.6273144483566284, + "learning_rate": 1.848506143855555e-06, + "loss": 0.7867, + "step": 802 + }, + { + "epoch": 1.85, + "grad_norm": 0.9000691175460815, + "learning_rate": 1.7769397525914667e-06, + "loss": 0.7318, + "step": 803 + }, + { + "epoch": 1.86, + "grad_norm": 1.6215777397155762, + "learning_rate": 1.706773806264106e-06, + "loss": 1.1416, + "step": 804 + }, + { + "epoch": 1.86, + "grad_norm": 1.5041816234588623, + "learning_rate": 1.6380093052856483e-06, + "loss": 0.9327, + "step": 805 + }, + { + "epoch": 1.86, + "grad_norm": 1.6266945600509644, + "learning_rate": 1.570647230086708e-06, + "loss": 1.1518, + "step": 806 + }, + { + "epoch": 1.86, + "grad_norm": 1.4679832458496094, + "learning_rate": 1.5046885411024391e-06, + "loss": 0.8719, + "step": 807 + }, + { + "epoch": 1.87, + "grad_norm": 1.5392640829086304, + "learning_rate": 1.4401341787587453e-06, + "loss": 1.0913, + "step": 808 + }, + { + "epoch": 1.87, + "grad_norm": 1.4636483192443848, + "learning_rate": 1.3769850634589354e-06, + "loss": 0.9312, + "step": 809 + }, + { + "epoch": 1.87, + "grad_norm": 1.214938998222351, + "learning_rate": 1.3152420955706012e-06, + "loss": 0.8939, + "step": 810 + }, + { + "epoch": 1.87, + "grad_norm": 1.2058130502700806, + "learning_rate": 1.2549061554127494e-06, + "loss": 0.8283, + "step": 811 + }, + { + "epoch": 1.88, + "grad_norm": 1.2419188022613525, + "learning_rate": 1.1959781032432337e-06, + "loss": 0.9094, + "step": 812 + }, + { + "epoch": 1.88, + "grad_norm": 1.7467167377471924, + "learning_rate": 1.1384587792465872e-06, + "loss": 0.7264, + "step": 813 + }, + { + "epoch": 1.88, + "grad_norm": 1.167214035987854, + "learning_rate": 1.0823490035218987e-06, + "loss": 0.6232, + "step": 814 + }, + { + "epoch": 1.88, + "grad_norm": 1.3175835609436035, + "learning_rate": 1.0276495760712767e-06, + "loss": 1.0396, + "step": 815 + }, + { + "epoch": 1.88, + "grad_norm": 1.6061522960662842, + "learning_rate": 9.743612767882936e-07, + "loss": 0.6561, + "step": 816 + }, + { + "epoch": 1.89, + "grad_norm": 1.4116915464401245, + "learning_rate": 9.224848654469931e-07, + "loss": 1.2357, + "step": 817 + }, + { + "epoch": 1.89, + "grad_norm": 1.2765032052993774, + "learning_rate": 8.720210816909435e-07, + "loss": 0.9708, + "step": 818 + }, + { + "epoch": 1.89, + "grad_norm": 1.392382025718689, + "learning_rate": 8.229706450227803e-07, + "loss": 0.9548, + "step": 819 + }, + { + "epoch": 1.89, + "grad_norm": 1.3821630477905273, + "learning_rate": 7.753342547939357e-07, + "loss": 0.7787, + "step": 820 + }, + { + "epoch": 1.9, + "grad_norm": 1.5395421981811523, + "learning_rate": 7.291125901946027e-07, + "loss": 0.7223, + "step": 821 + }, + { + "epoch": 1.9, + "grad_norm": 1.5560616254806519, + "learning_rate": 6.843063102441316e-07, + "loss": 0.9542, + "step": 822 + }, + { + "epoch": 1.9, + "grad_norm": 1.2338303327560425, + "learning_rate": 6.409160537815817e-07, + "loss": 0.9685, + "step": 823 + }, + { + "epoch": 1.9, + "grad_norm": 1.3884745836257935, + "learning_rate": 5.989424394566401e-07, + "loss": 1.1287, + "step": 824 + }, + { + "epoch": 1.91, + "grad_norm": 1.1085461378097534, + "learning_rate": 5.58386065720784e-07, + "loss": 0.7562, + "step": 825 + }, + { + "epoch": 1.91, + "grad_norm": 0.8512900471687317, + "learning_rate": 5.192475108187544e-07, + "loss": 0.4627, + "step": 826 + }, + { + "epoch": 1.91, + "grad_norm": 1.3366022109985352, + "learning_rate": 4.815273327803182e-07, + "loss": 0.8489, + "step": 827 + }, + { + "epoch": 1.91, + "grad_norm": 1.066333293914795, + "learning_rate": 4.452260694122856e-07, + "loss": 0.6867, + "step": 828 + }, + { + "epoch": 1.92, + "grad_norm": 1.484168291091919, + "learning_rate": 4.103442382909051e-07, + "loss": 1.1556, + "step": 829 + }, + { + "epoch": 1.92, + "grad_norm": 1.4741454124450684, + "learning_rate": 3.7688233675439166e-07, + "loss": 0.9345, + "step": 830 + }, + { + "epoch": 1.92, + "grad_norm": 1.0482972860336304, + "learning_rate": 3.4484084189593257e-07, + "loss": 0.7689, + "step": 831 + }, + { + "epoch": 1.92, + "grad_norm": 1.4800513982772827, + "learning_rate": 3.1422021055679265e-07, + "loss": 0.8486, + "step": 832 + }, + { + "epoch": 1.92, + "grad_norm": 1.3522624969482422, + "learning_rate": 2.850208793198861e-07, + "loss": 0.9064, + "step": 833 + }, + { + "epoch": 1.93, + "grad_norm": 1.6680634021759033, + "learning_rate": 2.572432645034817e-07, + "loss": 1.0127, + "step": 834 + }, + { + "epoch": 1.93, + "grad_norm": 1.5550647974014282, + "learning_rate": 2.3088776215531848e-07, + "loss": 0.8741, + "step": 835 + }, + { + "epoch": 1.93, + "grad_norm": 1.4581865072250366, + "learning_rate": 2.0595474804691038e-07, + "loss": 0.9799, + "step": 836 + }, + { + "epoch": 1.93, + "grad_norm": 1.3442659378051758, + "learning_rate": 1.824445776682504e-07, + "loss": 0.8629, + "step": 837 + }, + { + "epoch": 1.94, + "grad_norm": 1.5211567878723145, + "learning_rate": 1.6035758622269247e-07, + "loss": 0.6652, + "step": 838 + }, + { + "epoch": 1.94, + "grad_norm": 1.16912841796875, + "learning_rate": 1.3969408862217758e-07, + "loss": 0.8838, + "step": 839 + }, + { + "epoch": 1.94, + "grad_norm": 1.5780668258666992, + "learning_rate": 1.204543794827595e-07, + "loss": 0.9765, + "step": 840 + }, + { + "epoch": 1.94, + "grad_norm": 1.4517415761947632, + "learning_rate": 1.0263873312040818e-07, + "loss": 0.6895, + "step": 841 + }, + { + "epoch": 1.95, + "grad_norm": 1.7025426626205444, + "learning_rate": 8.624740354707949e-08, + "loss": 0.8764, + "step": 842 + }, + { + "epoch": 1.95, + "grad_norm": 1.3014923334121704, + "learning_rate": 7.128062446709604e-08, + "loss": 1.0822, + "step": 843 + }, + { + "epoch": 1.95, + "grad_norm": 1.501705288887024, + "learning_rate": 5.773860927383856e-08, + "loss": 0.8872, + "step": 844 + }, + { + "epoch": 1.95, + "grad_norm": 1.6005631685256958, + "learning_rate": 4.562155104665955e-08, + "loss": 1.1913, + "step": 845 + }, + { + "epoch": 1.96, + "grad_norm": 1.1631520986557007, + "learning_rate": 3.492962254819654e-08, + "loss": 0.955, + "step": 846 + }, + { + "epoch": 1.96, + "grad_norm": 1.285071611404419, + "learning_rate": 2.5662976221840773e-08, + "loss": 0.8429, + "step": 847 + }, + { + "epoch": 1.96, + "grad_norm": 1.0921001434326172, + "learning_rate": 1.7821744189605582e-08, + "loss": 0.7829, + "step": 848 + }, + { + "epoch": 1.96, + "grad_norm": 1.4597008228302002, + "learning_rate": 1.1406038250205698e-08, + "loss": 1.1112, + "step": 849 + }, + { + "epoch": 1.96, + "grad_norm": 2.1776418685913086, + "learning_rate": 6.41594987752514e-09, + "loss": 1.2897, + "step": 850 + }, + { + "epoch": 1.97, + "grad_norm": 1.2497656345367432, + "learning_rate": 2.851550219240551e-09, + "loss": 0.9627, + "step": 851 + }, + { + "epoch": 1.97, + "grad_norm": 1.291884422302246, + "learning_rate": 7.128900958774942e-10, + "loss": 0.8828, + "step": 852 + } + ], + "logging_steps": 1, + "max_steps": 852, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 426, + "total_flos": 1.547329324744704e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}