{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9694835680751175, "eval_steps": 107, "global_step": 852, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.8741932511329651, "learning_rate": 1e-05, "loss": 1.2567, "step": 1 }, { "epoch": 0.0, "eval_loss": 1.3469510078430176, "eval_runtime": 5.3003, "eval_samples_per_second": 18.867, "eval_steps_per_second": 18.867, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.7848138809204102, "learning_rate": 2e-05, "loss": 1.3328, "step": 2 }, { "epoch": 0.01, "grad_norm": 1.0692198276519775, "learning_rate": 3e-05, "loss": 1.6569, "step": 3 }, { "epoch": 0.01, "grad_norm": 1.4229260683059692, "learning_rate": 4e-05, "loss": 1.5493, "step": 4 }, { "epoch": 0.01, "grad_norm": 0.6837282180786133, "learning_rate": 5e-05, "loss": 1.4342, "step": 5 }, { "epoch": 0.01, "grad_norm": 1.1194649934768677, "learning_rate": 6e-05, "loss": 1.2668, "step": 6 }, { "epoch": 0.02, "grad_norm": 0.954695463180542, "learning_rate": 7e-05, "loss": 1.4721, "step": 7 }, { "epoch": 0.02, "grad_norm": 0.8204368352890015, "learning_rate": 8e-05, "loss": 1.4054, "step": 8 }, { "epoch": 0.02, "grad_norm": 0.8432589769363403, "learning_rate": 9e-05, "loss": 1.2849, "step": 9 }, { "epoch": 0.02, "grad_norm": 0.7272113561630249, "learning_rate": 0.0001, "loss": 1.1438, "step": 10 }, { "epoch": 0.03, "grad_norm": 0.8135901093482971, "learning_rate": 0.00011000000000000002, "loss": 1.4665, "step": 11 }, { "epoch": 0.03, "grad_norm": 0.8113470673561096, "learning_rate": 0.00012, "loss": 0.9749, "step": 12 }, { "epoch": 0.03, "grad_norm": 1.3105953931808472, "learning_rate": 0.00013000000000000002, "loss": 1.5133, "step": 13 }, { "epoch": 0.03, "grad_norm": 1.0281662940979004, "learning_rate": 0.00014, "loss": 1.1509, "step": 14 }, { "epoch": 0.04, "grad_norm": 2.1005992889404297, "learning_rate": 0.00015000000000000001, "loss": 1.2748, "step": 15 }, { "epoch": 0.04, "grad_norm": 1.453407645225525, "learning_rate": 0.00016, "loss": 1.3313, "step": 16 }, { "epoch": 0.04, "grad_norm": 1.0521585941314697, "learning_rate": 0.00017, "loss": 1.2977, "step": 17 }, { "epoch": 0.04, "grad_norm": 1.0728052854537964, "learning_rate": 0.00018, "loss": 1.322, "step": 18 }, { "epoch": 0.04, "grad_norm": 0.8539354205131531, "learning_rate": 0.00019, "loss": 1.2053, "step": 19 }, { "epoch": 0.05, "grad_norm": 1.6902265548706055, "learning_rate": 0.0002, "loss": 1.1924, "step": 20 }, { "epoch": 0.05, "grad_norm": 1.3770757913589478, "learning_rate": 0.00019999928710990412, "loss": 1.2286, "step": 21 }, { "epoch": 0.05, "grad_norm": 1.358694314956665, "learning_rate": 0.00019999714844978078, "loss": 1.3196, "step": 22 }, { "epoch": 0.05, "grad_norm": 1.4592664241790771, "learning_rate": 0.0001999935840501225, "loss": 1.5923, "step": 23 }, { "epoch": 0.06, "grad_norm": 1.122499942779541, "learning_rate": 0.0001999885939617498, "loss": 1.3107, "step": 24 }, { "epoch": 0.06, "grad_norm": 1.0040143728256226, "learning_rate": 0.0001999821782558104, "loss": 1.316, "step": 25 }, { "epoch": 0.06, "grad_norm": 1.0741703510284424, "learning_rate": 0.00019997433702377817, "loss": 1.2527, "step": 26 }, { "epoch": 0.06, "grad_norm": 0.9978523850440979, "learning_rate": 0.00019996507037745183, "loss": 1.1989, "step": 27 }, { "epoch": 0.07, "grad_norm": 1.002176284790039, "learning_rate": 0.00019995437844895334, "loss": 1.0503, "step": 28 }, { "epoch": 0.07, "grad_norm": 1.0028811693191528, "learning_rate": 0.0001999422613907262, "loss": 1.356, "step": 29 }, { "epoch": 0.07, "grad_norm": 1.3226864337921143, "learning_rate": 0.0001999287193755329, "loss": 1.2571, "step": 30 }, { "epoch": 0.07, "grad_norm": 0.905536949634552, "learning_rate": 0.00019991375259645293, "loss": 1.1576, "step": 31 }, { "epoch": 0.08, "grad_norm": 1.2652262449264526, "learning_rate": 0.00019989736126687963, "loss": 1.1792, "step": 32 }, { "epoch": 0.08, "grad_norm": 1.0161558389663696, "learning_rate": 0.00019987954562051725, "loss": 1.0074, "step": 33 }, { "epoch": 0.08, "grad_norm": 0.9903879761695862, "learning_rate": 0.00019986030591137783, "loss": 1.196, "step": 34 }, { "epoch": 0.08, "grad_norm": 1.3282291889190674, "learning_rate": 0.0001998396424137773, "loss": 1.5831, "step": 35 }, { "epoch": 0.08, "grad_norm": 1.0442458391189575, "learning_rate": 0.00019981755542233177, "loss": 0.9708, "step": 36 }, { "epoch": 0.09, "grad_norm": 0.9892172813415527, "learning_rate": 0.0001997940452519531, "loss": 1.1873, "step": 37 }, { "epoch": 0.09, "grad_norm": 1.6204959154129028, "learning_rate": 0.0001997691122378447, "loss": 1.725, "step": 38 }, { "epoch": 0.09, "grad_norm": 0.9563732147216797, "learning_rate": 0.00019974275673549654, "loss": 1.513, "step": 39 }, { "epoch": 0.09, "grad_norm": 1.225378155708313, "learning_rate": 0.00019971497912068013, "loss": 1.1298, "step": 40 }, { "epoch": 0.1, "grad_norm": 0.8867512941360474, "learning_rate": 0.00019968577978944323, "loss": 1.2413, "step": 41 }, { "epoch": 0.1, "grad_norm": 0.9609911441802979, "learning_rate": 0.0001996551591581041, "loss": 1.2217, "step": 42 }, { "epoch": 0.1, "grad_norm": 0.8636968731880188, "learning_rate": 0.0001996231176632456, "loss": 1.5268, "step": 43 }, { "epoch": 0.1, "grad_norm": 0.9208036065101624, "learning_rate": 0.00019958965576170908, "loss": 1.1154, "step": 44 }, { "epoch": 0.11, "grad_norm": 0.8316241502761841, "learning_rate": 0.00019955477393058773, "loss": 1.0891, "step": 45 }, { "epoch": 0.11, "grad_norm": 1.0851703882217407, "learning_rate": 0.0001995184726672197, "loss": 1.237, "step": 46 }, { "epoch": 0.11, "grad_norm": 1.2032215595245361, "learning_rate": 0.00019948075248918124, "loss": 1.3444, "step": 47 }, { "epoch": 0.11, "grad_norm": 0.8556870818138123, "learning_rate": 0.00019944161393427922, "loss": 0.9632, "step": 48 }, { "epoch": 0.12, "grad_norm": 1.038156270980835, "learning_rate": 0.00019940105756054337, "loss": 1.3897, "step": 49 }, { "epoch": 0.12, "grad_norm": 1.1046299934387207, "learning_rate": 0.00019935908394621844, "loss": 1.0211, "step": 50 }, { "epoch": 0.12, "grad_norm": 1.1632884740829468, "learning_rate": 0.00019931569368975588, "loss": 1.3098, "step": 51 }, { "epoch": 0.12, "grad_norm": 0.9915144443511963, "learning_rate": 0.0001992708874098054, "loss": 1.1708, "step": 52 }, { "epoch": 0.12, "grad_norm": 1.1974948644638062, "learning_rate": 0.00019922466574520608, "loss": 1.248, "step": 53 }, { "epoch": 0.13, "grad_norm": 1.123138189315796, "learning_rate": 0.00019917702935497725, "loss": 0.8789, "step": 54 }, { "epoch": 0.13, "grad_norm": 0.7720706462860107, "learning_rate": 0.00019912797891830908, "loss": 1.2278, "step": 55 }, { "epoch": 0.13, "grad_norm": 1.171324610710144, "learning_rate": 0.00019907751513455302, "loss": 1.419, "step": 56 }, { "epoch": 0.13, "grad_norm": 1.1821049451828003, "learning_rate": 0.00019902563872321172, "loss": 1.3281, "step": 57 }, { "epoch": 0.14, "grad_norm": 0.8871707320213318, "learning_rate": 0.00019897235042392873, "loss": 1.1431, "step": 58 }, { "epoch": 0.14, "grad_norm": 0.9106453657150269, "learning_rate": 0.0001989176509964781, "loss": 1.0687, "step": 59 }, { "epoch": 0.14, "grad_norm": 0.8794414401054382, "learning_rate": 0.00019886154122075343, "loss": 1.3627, "step": 60 }, { "epoch": 0.14, "grad_norm": 0.9249592423439026, "learning_rate": 0.00019880402189675678, "loss": 0.5962, "step": 61 }, { "epoch": 0.15, "grad_norm": 0.8355573415756226, "learning_rate": 0.00019874509384458725, "loss": 1.2576, "step": 62 }, { "epoch": 0.15, "grad_norm": 1.3829615116119385, "learning_rate": 0.0001986847579044294, "loss": 1.3844, "step": 63 }, { "epoch": 0.15, "grad_norm": 0.7890944480895996, "learning_rate": 0.00019862301493654108, "loss": 0.9504, "step": 64 }, { "epoch": 0.15, "grad_norm": 0.9988391995429993, "learning_rate": 0.00019855986582124126, "loss": 1.1127, "step": 65 }, { "epoch": 0.15, "grad_norm": 1.024064302444458, "learning_rate": 0.00019849531145889758, "loss": 1.0418, "step": 66 }, { "epoch": 0.16, "grad_norm": 1.1652387380599976, "learning_rate": 0.0001984293527699133, "loss": 1.0524, "step": 67 }, { "epoch": 0.16, "grad_norm": 0.8396044969558716, "learning_rate": 0.00019836199069471437, "loss": 1.1829, "step": 68 }, { "epoch": 0.16, "grad_norm": 0.9767330884933472, "learning_rate": 0.00019829322619373588, "loss": 1.1652, "step": 69 }, { "epoch": 0.16, "grad_norm": 0.7999225854873657, "learning_rate": 0.00019822306024740852, "loss": 1.1075, "step": 70 }, { "epoch": 0.17, "grad_norm": 0.9757010340690613, "learning_rate": 0.00019815149385614444, "loss": 1.3074, "step": 71 }, { "epoch": 0.17, "grad_norm": 0.9027532339096069, "learning_rate": 0.00019807852804032305, "loss": 1.308, "step": 72 }, { "epoch": 0.17, "grad_norm": 1.5158909559249878, "learning_rate": 0.0001980041638402765, "loss": 1.1594, "step": 73 }, { "epoch": 0.17, "grad_norm": 0.8618407249450684, "learning_rate": 0.00019792840231627482, "loss": 0.7077, "step": 74 }, { "epoch": 0.18, "grad_norm": 1.0307233333587646, "learning_rate": 0.00019785124454851084, "loss": 1.0812, "step": 75 }, { "epoch": 0.18, "grad_norm": 0.8295727968215942, "learning_rate": 0.00019777269163708468, "loss": 1.0412, "step": 76 }, { "epoch": 0.18, "grad_norm": 1.0772101879119873, "learning_rate": 0.00019769274470198827, "loss": 1.1484, "step": 77 }, { "epoch": 0.18, "grad_norm": 1.0130198001861572, "learning_rate": 0.0001976114048830891, "loss": 1.0704, "step": 78 }, { "epoch": 0.19, "grad_norm": 0.6028400659561157, "learning_rate": 0.00019752867334011423, "loss": 0.6347, "step": 79 }, { "epoch": 0.19, "grad_norm": 0.8858622312545776, "learning_rate": 0.0001974445512526336, "loss": 1.4117, "step": 80 }, { "epoch": 0.19, "grad_norm": 0.9046239852905273, "learning_rate": 0.00019735903982004324, "loss": 1.0355, "step": 81 }, { "epoch": 0.19, "grad_norm": 0.987149715423584, "learning_rate": 0.00019727214026154827, "loss": 1.117, "step": 82 }, { "epoch": 0.19, "grad_norm": 1.066720962524414, "learning_rate": 0.0001971838538161454, "loss": 1.4394, "step": 83 }, { "epoch": 0.2, "grad_norm": 0.7880568504333496, "learning_rate": 0.0001970941817426052, "loss": 1.0591, "step": 84 }, { "epoch": 0.2, "grad_norm": 0.8709908723831177, "learning_rate": 0.00019700312531945442, "loss": 0.8653, "step": 85 }, { "epoch": 0.2, "grad_norm": 0.8909516930580139, "learning_rate": 0.00019691068584495742, "loss": 1.0539, "step": 86 }, { "epoch": 0.2, "grad_norm": 0.726899266242981, "learning_rate": 0.000196816864637098, "loss": 1.0426, "step": 87 }, { "epoch": 0.21, "grad_norm": 0.9136834144592285, "learning_rate": 0.00019672166303356028, "loss": 1.0121, "step": 88 }, { "epoch": 0.21, "grad_norm": 0.8233316540718079, "learning_rate": 0.0001966250823917099, "loss": 1.1974, "step": 89 }, { "epoch": 0.21, "grad_norm": 0.8916105628013611, "learning_rate": 0.0001965271240885745, "loss": 1.2969, "step": 90 }, { "epoch": 0.21, "grad_norm": 0.7402650713920593, "learning_rate": 0.00019642778952082426, "loss": 1.0409, "step": 91 }, { "epoch": 0.22, "grad_norm": 1.036402702331543, "learning_rate": 0.00019632708010475165, "loss": 1.1482, "step": 92 }, { "epoch": 0.22, "grad_norm": 1.50504469871521, "learning_rate": 0.00019622499727625162, "loss": 1.3788, "step": 93 }, { "epoch": 0.22, "grad_norm": 0.7590145468711853, "learning_rate": 0.0001961215424908009, "loss": 1.0139, "step": 94 }, { "epoch": 0.22, "grad_norm": 0.8763208389282227, "learning_rate": 0.00019601671722343738, "loss": 1.066, "step": 95 }, { "epoch": 0.23, "grad_norm": 1.0625308752059937, "learning_rate": 0.00019591052296873888, "loss": 1.1032, "step": 96 }, { "epoch": 0.23, "grad_norm": 0.7704355716705322, "learning_rate": 0.00019580296124080212, "loss": 0.8728, "step": 97 }, { "epoch": 0.23, "grad_norm": 0.8164972066879272, "learning_rate": 0.0001956940335732209, "loss": 0.8879, "step": 98 }, { "epoch": 0.23, "grad_norm": 1.114343285560608, "learning_rate": 0.0001955837415190643, "loss": 1.3132, "step": 99 }, { "epoch": 0.23, "grad_norm": 0.8190547227859497, "learning_rate": 0.00019547208665085457, "loss": 1.324, "step": 100 }, { "epoch": 0.24, "grad_norm": 1.0214807987213135, "learning_rate": 0.00019535907056054475, "loss": 1.0961, "step": 101 }, { "epoch": 0.24, "grad_norm": 1.0679683685302734, "learning_rate": 0.00019524469485949583, "loss": 1.5574, "step": 102 }, { "epoch": 0.24, "grad_norm": 0.7177500128746033, "learning_rate": 0.00019512896117845392, "loss": 1.1553, "step": 103 }, { "epoch": 0.24, "grad_norm": 1.2570964097976685, "learning_rate": 0.00019501187116752693, "loss": 1.2929, "step": 104 }, { "epoch": 0.25, "grad_norm": 0.8858779668807983, "learning_rate": 0.000194893426496161, "loss": 1.2218, "step": 105 }, { "epoch": 0.25, "grad_norm": 0.9762778282165527, "learning_rate": 0.00019477362885311682, "loss": 0.9456, "step": 106 }, { "epoch": 0.25, "grad_norm": 0.8851660490036011, "learning_rate": 0.00019465247994644545, "loss": 1.0032, "step": 107 }, { "epoch": 0.25, "eval_loss": 1.1362618207931519, "eval_runtime": 5.3254, "eval_samples_per_second": 18.778, "eval_steps_per_second": 18.778, "step": 107 }, { "epoch": 0.25, "grad_norm": 0.8778185844421387, "learning_rate": 0.00019452998150346401, "loss": 1.1764, "step": 108 }, { "epoch": 0.26, "grad_norm": 0.8865760564804077, "learning_rate": 0.00019440613527073105, "loss": 1.0644, "step": 109 }, { "epoch": 0.26, "grad_norm": 0.8576034903526306, "learning_rate": 0.00019428094301402162, "loss": 1.0943, "step": 110 }, { "epoch": 0.26, "grad_norm": 0.810203492641449, "learning_rate": 0.00019415440651830208, "loss": 1.2353, "step": 111 }, { "epoch": 0.26, "grad_norm": 1.1911653280258179, "learning_rate": 0.00019402652758770475, "loss": 1.215, "step": 112 }, { "epoch": 0.27, "grad_norm": 0.5166463851928711, "learning_rate": 0.00019389730804550211, "loss": 0.4519, "step": 113 }, { "epoch": 0.27, "grad_norm": 0.983464241027832, "learning_rate": 0.00019376674973408075, "loss": 1.3189, "step": 114 }, { "epoch": 0.27, "grad_norm": 0.7697935700416565, "learning_rate": 0.00019363485451491524, "loss": 1.0372, "step": 115 }, { "epoch": 0.27, "grad_norm": 0.8599738478660583, "learning_rate": 0.0001935016242685415, "loss": 1.1381, "step": 116 }, { "epoch": 0.27, "grad_norm": 1.0430530309677124, "learning_rate": 0.00019336706089452996, "loss": 1.2332, "step": 117 }, { "epoch": 0.28, "grad_norm": 1.163251280784607, "learning_rate": 0.0001932311663114586, "loss": 1.1588, "step": 118 }, { "epoch": 0.28, "grad_norm": 0.8196917176246643, "learning_rate": 0.0001930939424568854, "loss": 1.1114, "step": 119 }, { "epoch": 0.28, "grad_norm": 0.8848841786384583, "learning_rate": 0.00019295539128732093, "loss": 1.0458, "step": 120 }, { "epoch": 0.28, "grad_norm": 0.9288797974586487, "learning_rate": 0.00019281551477820036, "loss": 0.7388, "step": 121 }, { "epoch": 0.29, "grad_norm": 0.8119339942932129, "learning_rate": 0.00019267431492385521, "loss": 1.1095, "step": 122 }, { "epoch": 0.29, "grad_norm": 0.9211128354072571, "learning_rate": 0.00019253179373748504, "loss": 1.3183, "step": 123 }, { "epoch": 0.29, "grad_norm": 0.9682132601737976, "learning_rate": 0.0001923879532511287, "loss": 1.3786, "step": 124 }, { "epoch": 0.29, "grad_norm": 0.9330196976661682, "learning_rate": 0.00019224279551563532, "loss": 1.4051, "step": 125 }, { "epoch": 0.3, "grad_norm": 0.7206214070320129, "learning_rate": 0.0001920963226006352, "loss": 0.8213, "step": 126 }, { "epoch": 0.3, "grad_norm": 0.6096452474594116, "learning_rate": 0.0001919485365945101, "loss": 0.5982, "step": 127 }, { "epoch": 0.3, "grad_norm": 1.0045711994171143, "learning_rate": 0.00019179943960436358, "loss": 1.1399, "step": 128 }, { "epoch": 0.3, "grad_norm": 0.8744232058525085, "learning_rate": 0.00019164903375599112, "loss": 1.0176, "step": 129 }, { "epoch": 0.31, "grad_norm": 0.8323287963867188, "learning_rate": 0.00019149732119384943, "loss": 1.2937, "step": 130 }, { "epoch": 0.31, "grad_norm": 0.9033424854278564, "learning_rate": 0.00019134430408102615, "loss": 1.3108, "step": 131 }, { "epoch": 0.31, "grad_norm": 0.7392603158950806, "learning_rate": 0.00019118998459920902, "loss": 1.0297, "step": 132 }, { "epoch": 0.31, "grad_norm": 1.136338710784912, "learning_rate": 0.0001910343649486546, "loss": 1.3037, "step": 133 }, { "epoch": 0.31, "grad_norm": 0.8202427625656128, "learning_rate": 0.00019087744734815708, "loss": 0.9239, "step": 134 }, { "epoch": 0.32, "grad_norm": 1.039093255996704, "learning_rate": 0.0001907192340350165, "loss": 1.2387, "step": 135 }, { "epoch": 0.32, "grad_norm": 1.1009947061538696, "learning_rate": 0.00019055972726500695, "loss": 1.0234, "step": 136 }, { "epoch": 0.32, "grad_norm": 1.057837724685669, "learning_rate": 0.00019039892931234435, "loss": 1.4308, "step": 137 }, { "epoch": 0.32, "grad_norm": 0.8715041279792786, "learning_rate": 0.00019023684246965406, "loss": 1.0642, "step": 138 }, { "epoch": 0.33, "grad_norm": 1.3314340114593506, "learning_rate": 0.00019007346904793818, "loss": 1.1097, "step": 139 }, { "epoch": 0.33, "grad_norm": 0.6808525919914246, "learning_rate": 0.00018990881137654258, "loss": 0.9254, "step": 140 }, { "epoch": 0.33, "grad_norm": 0.8727401494979858, "learning_rate": 0.00018974287180312377, "loss": 1.1062, "step": 141 }, { "epoch": 0.33, "grad_norm": 1.0826424360275269, "learning_rate": 0.00018957565269361531, "loss": 1.1528, "step": 142 }, { "epoch": 0.34, "grad_norm": 0.8825279474258423, "learning_rate": 0.00018940715643219407, "loss": 1.2208, "step": 143 }, { "epoch": 0.34, "grad_norm": 0.8955380320549011, "learning_rate": 0.00018923738542124644, "loss": 1.0918, "step": 144 }, { "epoch": 0.34, "grad_norm": 1.0863996744155884, "learning_rate": 0.00018906634208133385, "loss": 1.2153, "step": 145 }, { "epoch": 0.34, "grad_norm": 1.0089964866638184, "learning_rate": 0.00018889402885115833, "loss": 1.0796, "step": 146 }, { "epoch": 0.35, "grad_norm": 0.9210363626480103, "learning_rate": 0.0001887204481875278, "loss": 0.8502, "step": 147 }, { "epoch": 0.35, "grad_norm": 0.9592724442481995, "learning_rate": 0.000188545602565321, "loss": 1.4313, "step": 148 }, { "epoch": 0.35, "grad_norm": 1.2299224138259888, "learning_rate": 0.00018836949447745215, "loss": 1.1074, "step": 149 }, { "epoch": 0.35, "grad_norm": 1.1486583948135376, "learning_rate": 0.0001881921264348355, "loss": 1.3576, "step": 150 }, { "epoch": 0.35, "grad_norm": 1.1912083625793457, "learning_rate": 0.00018801350096634946, "loss": 1.343, "step": 151 }, { "epoch": 0.36, "grad_norm": 0.8830162882804871, "learning_rate": 0.00018783362061880062, "loss": 1.6139, "step": 152 }, { "epoch": 0.36, "grad_norm": 1.0919363498687744, "learning_rate": 0.00018765248795688726, "loss": 1.4051, "step": 153 }, { "epoch": 0.36, "grad_norm": 0.8009780049324036, "learning_rate": 0.00018747010556316305, "loss": 1.4095, "step": 154 }, { "epoch": 0.36, "grad_norm": 0.964438796043396, "learning_rate": 0.00018728647603800003, "loss": 1.1634, "step": 155 }, { "epoch": 0.37, "grad_norm": 0.9883137941360474, "learning_rate": 0.00018710160199955156, "loss": 1.1904, "step": 156 }, { "epoch": 0.37, "grad_norm": 0.8936368227005005, "learning_rate": 0.0001869154860837151, "loss": 1.2264, "step": 157 }, { "epoch": 0.37, "grad_norm": 0.6435540914535522, "learning_rate": 0.0001867281309440945, "loss": 0.8426, "step": 158 }, { "epoch": 0.37, "grad_norm": 0.7036202549934387, "learning_rate": 0.00018653953925196225, "loss": 0.8162, "step": 159 }, { "epoch": 0.38, "grad_norm": 0.7669593095779419, "learning_rate": 0.0001863497136962213, "loss": 0.9143, "step": 160 }, { "epoch": 0.38, "grad_norm": 0.8047689199447632, "learning_rate": 0.00018615865698336684, "loss": 1.1911, "step": 161 }, { "epoch": 0.38, "grad_norm": 1.1324609518051147, "learning_rate": 0.00018596637183744763, "loss": 1.064, "step": 162 }, { "epoch": 0.38, "grad_norm": 0.8058610558509827, "learning_rate": 0.00018577286100002723, "loss": 0.7428, "step": 163 }, { "epoch": 0.38, "grad_norm": 0.8588782548904419, "learning_rate": 0.00018557812723014476, "loss": 0.8801, "step": 164 }, { "epoch": 0.39, "grad_norm": 1.0751081705093384, "learning_rate": 0.00018538217330427582, "loss": 1.3674, "step": 165 }, { "epoch": 0.39, "grad_norm": 0.791789174079895, "learning_rate": 0.00018518500201629258, "loss": 1.0103, "step": 166 }, { "epoch": 0.39, "grad_norm": 0.8154894709587097, "learning_rate": 0.00018498661617742426, "loss": 1.1219, "step": 167 }, { "epoch": 0.39, "grad_norm": 1.3946795463562012, "learning_rate": 0.00018478701861621686, "loss": 1.0725, "step": 168 }, { "epoch": 0.4, "grad_norm": 0.9187031388282776, "learning_rate": 0.00018458621217849286, "loss": 1.2674, "step": 169 }, { "epoch": 0.4, "grad_norm": 0.9884739518165588, "learning_rate": 0.00018438419972731067, "loss": 1.3507, "step": 170 }, { "epoch": 0.4, "grad_norm": 1.4417808055877686, "learning_rate": 0.0001841809841429238, "loss": 1.129, "step": 171 }, { "epoch": 0.4, "grad_norm": 1.0408543348312378, "learning_rate": 0.0001839765683227398, "loss": 1.2038, "step": 172 }, { "epoch": 0.41, "grad_norm": 1.2746024131774902, "learning_rate": 0.00018377095518127897, "loss": 1.2916, "step": 173 }, { "epoch": 0.41, "grad_norm": 0.996474027633667, "learning_rate": 0.00018356414765013267, "loss": 1.3041, "step": 174 }, { "epoch": 0.41, "grad_norm": 0.9435645341873169, "learning_rate": 0.00018335614867792183, "loss": 1.3457, "step": 175 }, { "epoch": 0.41, "grad_norm": 1.0813288688659668, "learning_rate": 0.00018314696123025454, "loss": 1.2323, "step": 176 }, { "epoch": 0.42, "grad_norm": 1.1004307270050049, "learning_rate": 0.00018293658828968397, "loss": 1.3084, "step": 177 }, { "epoch": 0.42, "grad_norm": 0.7818973660469055, "learning_rate": 0.00018272503285566587, "loss": 0.88, "step": 178 }, { "epoch": 0.42, "grad_norm": 0.8897396326065063, "learning_rate": 0.00018251229794451567, "loss": 1.0124, "step": 179 }, { "epoch": 0.42, "grad_norm": 1.0572400093078613, "learning_rate": 0.00018229838658936564, "loss": 1.4603, "step": 180 }, { "epoch": 0.42, "grad_norm": 0.8974701166152954, "learning_rate": 0.0001820833018401215, "loss": 1.1961, "step": 181 }, { "epoch": 0.43, "grad_norm": 0.8860751390457153, "learning_rate": 0.00018186704676341898, "loss": 0.9779, "step": 182 }, { "epoch": 0.43, "grad_norm": 0.7719995975494385, "learning_rate": 0.00018164962444258014, "loss": 1.1156, "step": 183 }, { "epoch": 0.43, "grad_norm": 1.0823787450790405, "learning_rate": 0.0001814310379775694, "loss": 0.959, "step": 184 }, { "epoch": 0.43, "grad_norm": 0.9932149052619934, "learning_rate": 0.00018121129048494922, "loss": 1.358, "step": 185 }, { "epoch": 0.44, "grad_norm": 0.7363601326942444, "learning_rate": 0.00018099038509783582, "loss": 1.2639, "step": 186 }, { "epoch": 0.44, "grad_norm": 1.0355418920516968, "learning_rate": 0.0001807683249658545, "loss": 1.2566, "step": 187 }, { "epoch": 0.44, "grad_norm": 0.8746327757835388, "learning_rate": 0.0001805451132550946, "loss": 1.2159, "step": 188 }, { "epoch": 0.44, "grad_norm": 0.9230121374130249, "learning_rate": 0.00018032075314806448, "loss": 1.1717, "step": 189 }, { "epoch": 0.45, "grad_norm": 0.8293251991271973, "learning_rate": 0.00018009524784364615, "loss": 1.1053, "step": 190 }, { "epoch": 0.45, "grad_norm": 1.8819063901901245, "learning_rate": 0.00017986860055704953, "loss": 1.4713, "step": 191 }, { "epoch": 0.45, "grad_norm": 0.843971312046051, "learning_rate": 0.00017964081451976672, "loss": 1.175, "step": 192 }, { "epoch": 0.45, "grad_norm": 1.1449857950210571, "learning_rate": 0.00017941189297952597, "loss": 1.0097, "step": 193 }, { "epoch": 0.46, "grad_norm": 0.8204749226570129, "learning_rate": 0.0001791818392002452, "loss": 1.0383, "step": 194 }, { "epoch": 0.46, "grad_norm": 1.0437519550323486, "learning_rate": 0.00017895065646198567, "loss": 1.4455, "step": 195 }, { "epoch": 0.46, "grad_norm": 0.803774356842041, "learning_rate": 0.00017871834806090501, "loss": 0.9266, "step": 196 }, { "epoch": 0.46, "grad_norm": 0.991162121295929, "learning_rate": 0.00017848491730921046, "loss": 0.9433, "step": 197 }, { "epoch": 0.46, "grad_norm": 0.9293928146362305, "learning_rate": 0.00017825036753511144, "loss": 1.2432, "step": 198 }, { "epoch": 0.47, "grad_norm": 0.6806849241256714, "learning_rate": 0.0001780147020827721, "loss": 0.696, "step": 199 }, { "epoch": 0.47, "grad_norm": 1.2757548093795776, "learning_rate": 0.00017777792431226383, "loss": 1.3426, "step": 200 }, { "epoch": 0.47, "grad_norm": 0.8645375370979309, "learning_rate": 0.00017754003759951715, "loss": 1.0345, "step": 201 }, { "epoch": 0.47, "grad_norm": 1.0262008905410767, "learning_rate": 0.0001773010453362737, "loss": 0.935, "step": 202 }, { "epoch": 0.48, "grad_norm": 1.225273609161377, "learning_rate": 0.00017706095093003785, "loss": 1.1271, "step": 203 }, { "epoch": 0.48, "grad_norm": 0.9024125337600708, "learning_rate": 0.00017681975780402807, "loss": 1.1307, "step": 204 }, { "epoch": 0.48, "grad_norm": 0.9035881161689758, "learning_rate": 0.00017657746939712815, "loss": 1.3217, "step": 205 }, { "epoch": 0.48, "grad_norm": 2.1178483963012695, "learning_rate": 0.00017633408916383826, "loss": 1.4955, "step": 206 }, { "epoch": 0.49, "grad_norm": 0.826454222202301, "learning_rate": 0.00017608962057422549, "loss": 1.187, "step": 207 }, { "epoch": 0.49, "grad_norm": 0.8255906701087952, "learning_rate": 0.00017584406711387463, "loss": 1.0733, "step": 208 }, { "epoch": 0.49, "grad_norm": 0.9498797059059143, "learning_rate": 0.0001755974322838382, "loss": 1.3054, "step": 209 }, { "epoch": 0.49, "grad_norm": 0.7860575914382935, "learning_rate": 0.00017534971960058685, "loss": 1.0231, "step": 210 }, { "epoch": 0.5, "grad_norm": 0.905441403388977, "learning_rate": 0.00017510093259595885, "loss": 1.2928, "step": 211 }, { "epoch": 0.5, "grad_norm": 1.0108212232589722, "learning_rate": 0.00017485107481711012, "loss": 0.8964, "step": 212 }, { "epoch": 0.5, "grad_norm": 1.036566972732544, "learning_rate": 0.00017460014982646334, "loss": 1.4823, "step": 213 }, { "epoch": 0.5, "grad_norm": 0.8670862913131714, "learning_rate": 0.00017434816120165728, "loss": 1.0994, "step": 214 }, { "epoch": 0.5, "eval_loss": 1.1271697282791138, "eval_runtime": 4.9762, "eval_samples_per_second": 20.096, "eval_steps_per_second": 20.096, "step": 214 }, { "epoch": 0.5, "grad_norm": 0.9359840750694275, "learning_rate": 0.00017409511253549593, "loss": 1.1552, "step": 215 }, { "epoch": 0.51, "grad_norm": 1.0469551086425781, "learning_rate": 0.00017384100743589697, "loss": 1.1175, "step": 216 }, { "epoch": 0.51, "grad_norm": 0.8635923266410828, "learning_rate": 0.0001735858495258406, "loss": 0.9823, "step": 217 }, { "epoch": 0.51, "grad_norm": 1.1676996946334839, "learning_rate": 0.00017332964244331776, "loss": 1.2903, "step": 218 }, { "epoch": 0.51, "grad_norm": 0.9774354696273804, "learning_rate": 0.00017307238984127832, "loss": 1.1928, "step": 219 }, { "epoch": 0.52, "grad_norm": 1.0486751794815063, "learning_rate": 0.00017281409538757883, "loss": 1.039, "step": 220 }, { "epoch": 0.52, "grad_norm": 1.0949512720108032, "learning_rate": 0.00017255476276493056, "loss": 1.148, "step": 221 }, { "epoch": 0.52, "grad_norm": 0.8077650666236877, "learning_rate": 0.0001722943956708466, "loss": 1.1376, "step": 222 }, { "epoch": 0.52, "grad_norm": 1.0595574378967285, "learning_rate": 0.00017203299781758943, "loss": 1.0757, "step": 223 }, { "epoch": 0.53, "grad_norm": 0.813774049282074, "learning_rate": 0.00017177057293211784, "loss": 1.2645, "step": 224 }, { "epoch": 0.53, "grad_norm": 0.8764140009880066, "learning_rate": 0.0001715071247560339, "loss": 0.9552, "step": 225 }, { "epoch": 0.53, "grad_norm": 0.9136344194412231, "learning_rate": 0.0001712426570455295, "loss": 1.1841, "step": 226 }, { "epoch": 0.53, "grad_norm": 0.8742187023162842, "learning_rate": 0.00017097717357133284, "loss": 1.0314, "step": 227 }, { "epoch": 0.54, "grad_norm": 0.8309169411659241, "learning_rate": 0.00017071067811865476, "loss": 0.9842, "step": 228 }, { "epoch": 0.54, "grad_norm": 0.8644968867301941, "learning_rate": 0.00017044317448713461, "loss": 1.3819, "step": 229 }, { "epoch": 0.54, "grad_norm": 0.8510339260101318, "learning_rate": 0.0001701746664907862, "loss": 1.2385, "step": 230 }, { "epoch": 0.54, "grad_norm": 0.8174643516540527, "learning_rate": 0.00016990515795794334, "loss": 0.9789, "step": 231 }, { "epoch": 0.54, "grad_norm": 0.9340826272964478, "learning_rate": 0.0001696346527312053, "loss": 1.2472, "step": 232 }, { "epoch": 0.55, "grad_norm": 0.9614835977554321, "learning_rate": 0.00016936315466738205, "loss": 1.1588, "step": 233 }, { "epoch": 0.55, "grad_norm": 1.1464625597000122, "learning_rate": 0.00016909066763743912, "loss": 0.8365, "step": 234 }, { "epoch": 0.55, "grad_norm": 0.8775334358215332, "learning_rate": 0.00016881719552644273, "loss": 1.1143, "step": 235 }, { "epoch": 0.55, "grad_norm": 0.9431893825531006, "learning_rate": 0.00016854274223350397, "loss": 1.362, "step": 236 }, { "epoch": 0.56, "grad_norm": 0.7953469157218933, "learning_rate": 0.0001682673116717236, "loss": 1.1568, "step": 237 }, { "epoch": 0.56, "grad_norm": 0.7517049908638, "learning_rate": 0.00016799090776813597, "loss": 0.9274, "step": 238 }, { "epoch": 0.56, "grad_norm": 0.796934187412262, "learning_rate": 0.00016771353446365318, "loss": 0.8641, "step": 239 }, { "epoch": 0.56, "grad_norm": 0.7946231961250305, "learning_rate": 0.00016743519571300888, "loss": 1.0518, "step": 240 }, { "epoch": 0.57, "grad_norm": 1.0859878063201904, "learning_rate": 0.00016715589548470185, "loss": 1.1815, "step": 241 }, { "epoch": 0.57, "grad_norm": 0.7418296933174133, "learning_rate": 0.00016687563776093941, "loss": 1.0321, "step": 242 }, { "epoch": 0.57, "grad_norm": 0.8161245584487915, "learning_rate": 0.00016659442653758064, "loss": 1.0931, "step": 243 }, { "epoch": 0.57, "grad_norm": 0.7787612080574036, "learning_rate": 0.00016631226582407952, "loss": 1.2239, "step": 244 }, { "epoch": 0.58, "grad_norm": 0.7161651849746704, "learning_rate": 0.00016602915964342757, "loss": 1.1104, "step": 245 }, { "epoch": 0.58, "grad_norm": 0.786612331867218, "learning_rate": 0.00016574511203209667, "loss": 1.2486, "step": 246 }, { "epoch": 0.58, "grad_norm": 0.8251045942306519, "learning_rate": 0.00016546012703998138, "loss": 1.2358, "step": 247 }, { "epoch": 0.58, "grad_norm": 0.8045319318771362, "learning_rate": 0.00016517420873034123, "loss": 0.8145, "step": 248 }, { "epoch": 0.58, "grad_norm": 0.8730091452598572, "learning_rate": 0.0001648873611797429, "loss": 0.8832, "step": 249 }, { "epoch": 0.59, "grad_norm": 0.9003087878227234, "learning_rate": 0.00016459958847800187, "loss": 1.1149, "step": 250 }, { "epoch": 0.59, "grad_norm": 1.0912461280822754, "learning_rate": 0.00016431089472812444, "loss": 1.0439, "step": 251 }, { "epoch": 0.59, "grad_norm": 0.7999249696731567, "learning_rate": 0.00016402128404624882, "loss": 0.9821, "step": 252 }, { "epoch": 0.59, "grad_norm": 1.0122705698013306, "learning_rate": 0.00016373076056158675, "loss": 1.2302, "step": 253 }, { "epoch": 0.6, "grad_norm": 0.6447519659996033, "learning_rate": 0.00016343932841636456, "loss": 0.6079, "step": 254 }, { "epoch": 0.6, "grad_norm": 0.7757092118263245, "learning_rate": 0.00016314699176576402, "loss": 1.0092, "step": 255 }, { "epoch": 0.6, "grad_norm": 0.7445678114891052, "learning_rate": 0.00016285375477786322, "loss": 0.684, "step": 256 }, { "epoch": 0.6, "grad_norm": 1.241065263748169, "learning_rate": 0.000162559621633577, "loss": 1.0321, "step": 257 }, { "epoch": 0.61, "grad_norm": 1.1429563760757446, "learning_rate": 0.00016226459652659753, "loss": 1.1635, "step": 258 }, { "epoch": 0.61, "grad_norm": 0.7441573739051819, "learning_rate": 0.0001619686836633343, "loss": 0.9685, "step": 259 }, { "epoch": 0.61, "grad_norm": 1.3199349641799927, "learning_rate": 0.00016167188726285434, "loss": 1.3159, "step": 260 }, { "epoch": 0.61, "grad_norm": 0.8082245588302612, "learning_rate": 0.00016137421155682183, "loss": 1.317, "step": 261 }, { "epoch": 0.62, "grad_norm": 0.9036356210708618, "learning_rate": 0.0001610756607894382, "loss": 0.8672, "step": 262 }, { "epoch": 0.62, "grad_norm": 0.9773459434509277, "learning_rate": 0.00016077623921738102, "loss": 1.1405, "step": 263 }, { "epoch": 0.62, "grad_norm": 0.9840144515037537, "learning_rate": 0.00016047595110974376, "loss": 1.4167, "step": 264 }, { "epoch": 0.62, "grad_norm": 1.0358545780181885, "learning_rate": 0.0001601748007479748, "loss": 1.196, "step": 265 }, { "epoch": 0.62, "grad_norm": 0.7097404599189758, "learning_rate": 0.0001598727924258164, "loss": 0.791, "step": 266 }, { "epoch": 0.63, "grad_norm": 1.7330995798110962, "learning_rate": 0.00015956993044924334, "loss": 1.4283, "step": 267 }, { "epoch": 0.63, "grad_norm": 0.7444025278091431, "learning_rate": 0.0001592662191364017, "loss": 0.7525, "step": 268 }, { "epoch": 0.63, "grad_norm": 0.9818642139434814, "learning_rate": 0.0001589616628175472, "loss": 1.2417, "step": 269 }, { "epoch": 0.63, "grad_norm": 0.9218468070030212, "learning_rate": 0.00015865626583498355, "loss": 1.1316, "step": 270 }, { "epoch": 0.64, "grad_norm": 0.9644055366516113, "learning_rate": 0.00015835003254300039, "loss": 1.2594, "step": 271 }, { "epoch": 0.64, "grad_norm": 0.8228254914283752, "learning_rate": 0.00015804296730781135, "loss": 1.2481, "step": 272 }, { "epoch": 0.64, "grad_norm": 1.0708152055740356, "learning_rate": 0.00015773507450749172, "loss": 1.107, "step": 273 }, { "epoch": 0.64, "grad_norm": 1.1122934818267822, "learning_rate": 0.00015742635853191608, "loss": 0.8714, "step": 274 }, { "epoch": 0.65, "grad_norm": 0.8141905665397644, "learning_rate": 0.00015711682378269565, "loss": 0.9943, "step": 275 }, { "epoch": 0.65, "grad_norm": 0.8955541253089905, "learning_rate": 0.00015680647467311557, "loss": 1.3176, "step": 276 }, { "epoch": 0.65, "grad_norm": 1.1133729219436646, "learning_rate": 0.000156495315628072, "loss": 1.2602, "step": 277 }, { "epoch": 0.65, "grad_norm": 0.8733439445495605, "learning_rate": 0.00015618335108400893, "loss": 1.3639, "step": 278 }, { "epoch": 0.65, "grad_norm": 0.8614795804023743, "learning_rate": 0.00015587058548885505, "loss": 1.1905, "step": 279 }, { "epoch": 0.66, "grad_norm": 0.8306368589401245, "learning_rate": 0.00015555702330196023, "loss": 1.1978, "step": 280 }, { "epoch": 0.66, "grad_norm": 0.8460854887962341, "learning_rate": 0.00015524266899403206, "loss": 0.9872, "step": 281 }, { "epoch": 0.66, "grad_norm": 0.8452059626579285, "learning_rate": 0.000154927527047072, "loss": 0.979, "step": 282 }, { "epoch": 0.66, "grad_norm": 0.8805731534957886, "learning_rate": 0.00015461160195431148, "loss": 1.2885, "step": 283 }, { "epoch": 0.67, "grad_norm": 0.9095639586448669, "learning_rate": 0.0001542948982201479, "loss": 1.1156, "step": 284 }, { "epoch": 0.67, "grad_norm": 0.9862900376319885, "learning_rate": 0.00015397742036008034, "loss": 1.1571, "step": 285 }, { "epoch": 0.67, "grad_norm": 0.9344744086265564, "learning_rate": 0.0001536591729006453, "loss": 1.2204, "step": 286 }, { "epoch": 0.67, "grad_norm": 1.0605379343032837, "learning_rate": 0.00015334016037935196, "loss": 1.3048, "step": 287 }, { "epoch": 0.68, "grad_norm": 0.9844763278961182, "learning_rate": 0.0001530203873446177, "loss": 1.0035, "step": 288 }, { "epoch": 0.68, "grad_norm": 0.767954409122467, "learning_rate": 0.0001526998583557031, "loss": 0.9023, "step": 289 }, { "epoch": 0.68, "grad_norm": 1.0622146129608154, "learning_rate": 0.000152378577982647, "loss": 1.4837, "step": 290 }, { "epoch": 0.68, "grad_norm": 0.9536969065666199, "learning_rate": 0.0001520565508062013, "loss": 1.0948, "step": 291 }, { "epoch": 0.69, "grad_norm": 0.9654991030693054, "learning_rate": 0.00015173378141776568, "loss": 1.1913, "step": 292 }, { "epoch": 0.69, "grad_norm": 0.8208710551261902, "learning_rate": 0.00015141027441932216, "loss": 1.1435, "step": 293 }, { "epoch": 0.69, "grad_norm": 0.9273961186408997, "learning_rate": 0.0001510860344233695, "loss": 1.0845, "step": 294 }, { "epoch": 0.69, "grad_norm": 1.0316227674484253, "learning_rate": 0.00015076106605285724, "loss": 1.4532, "step": 295 }, { "epoch": 0.69, "grad_norm": 1.0121437311172485, "learning_rate": 0.00015043537394112007, "loss": 0.8687, "step": 296 }, { "epoch": 0.7, "grad_norm": 1.0713882446289062, "learning_rate": 0.00015010896273181165, "loss": 1.1097, "step": 297 }, { "epoch": 0.7, "grad_norm": 0.8149722814559937, "learning_rate": 0.00014978183707883827, "loss": 0.8682, "step": 298 }, { "epoch": 0.7, "grad_norm": 0.7118079662322998, "learning_rate": 0.00014945400164629278, "loss": 0.9225, "step": 299 }, { "epoch": 0.7, "grad_norm": 1.1042624711990356, "learning_rate": 0.00014912546110838775, "loss": 1.4279, "step": 300 }, { "epoch": 0.71, "grad_norm": 0.947619616985321, "learning_rate": 0.00014879622014938915, "loss": 1.0544, "step": 301 }, { "epoch": 0.71, "grad_norm": 0.9065904021263123, "learning_rate": 0.00014846628346354933, "loss": 1.1642, "step": 302 }, { "epoch": 0.71, "grad_norm": 0.9430320262908936, "learning_rate": 0.00014813565575504022, "loss": 1.2182, "step": 303 }, { "epoch": 0.71, "grad_norm": 0.8739117980003357, "learning_rate": 0.00014780434173788617, "loss": 1.0176, "step": 304 }, { "epoch": 0.72, "grad_norm": 0.853125274181366, "learning_rate": 0.00014747234613589685, "loss": 1.1827, "step": 305 }, { "epoch": 0.72, "grad_norm": 1.6718727350234985, "learning_rate": 0.0001471396736825998, "loss": 1.2665, "step": 306 }, { "epoch": 0.72, "grad_norm": 0.8566248416900635, "learning_rate": 0.00014680632912117286, "loss": 1.2231, "step": 307 }, { "epoch": 0.72, "grad_norm": 0.6841180324554443, "learning_rate": 0.00014647231720437686, "loss": 0.9366, "step": 308 }, { "epoch": 0.73, "grad_norm": 0.9140876531600952, "learning_rate": 0.00014613764269448751, "loss": 1.0711, "step": 309 }, { "epoch": 0.73, "grad_norm": 0.9394497275352478, "learning_rate": 0.00014580231036322768, "loss": 1.1159, "step": 310 }, { "epoch": 0.73, "grad_norm": 1.1066112518310547, "learning_rate": 0.00014546632499169937, "loss": 1.3487, "step": 311 }, { "epoch": 0.73, "grad_norm": 0.9925751090049744, "learning_rate": 0.00014512969137031538, "loss": 1.1207, "step": 312 }, { "epoch": 0.73, "grad_norm": 0.9642359018325806, "learning_rate": 0.0001447924142987312, "loss": 1.3772, "step": 313 }, { "epoch": 0.74, "grad_norm": 0.6977396607398987, "learning_rate": 0.0001444544985857766, "loss": 0.8517, "step": 314 }, { "epoch": 0.74, "grad_norm": 1.033882737159729, "learning_rate": 0.00014411594904938682, "loss": 1.0644, "step": 315 }, { "epoch": 0.74, "grad_norm": 1.020871877670288, "learning_rate": 0.00014377677051653404, "loss": 1.2026, "step": 316 }, { "epoch": 0.74, "grad_norm": 1.059812068939209, "learning_rate": 0.0001434369678231587, "loss": 1.4181, "step": 317 }, { "epoch": 0.75, "grad_norm": 0.8130291104316711, "learning_rate": 0.00014309654581410024, "loss": 1.0691, "step": 318 }, { "epoch": 0.75, "grad_norm": 0.8362820148468018, "learning_rate": 0.00014275550934302823, "loss": 1.0053, "step": 319 }, { "epoch": 0.75, "grad_norm": 0.9266586899757385, "learning_rate": 0.0001424138632723731, "loss": 1.1313, "step": 320 }, { "epoch": 0.75, "grad_norm": 1.0162605047225952, "learning_rate": 0.00014207161247325691, "loss": 1.3518, "step": 321 }, { "epoch": 0.75, "eval_loss": 1.114696741104126, "eval_runtime": 5.1062, "eval_samples_per_second": 19.584, "eval_steps_per_second": 19.584, "step": 321 }, { "epoch": 0.76, "grad_norm": 1.0078846216201782, "learning_rate": 0.00014172876182542372, "loss": 1.0446, "step": 322 }, { "epoch": 0.76, "grad_norm": 1.2844680547714233, "learning_rate": 0.00014138531621717018, "loss": 1.4105, "step": 323 }, { "epoch": 0.76, "grad_norm": 1.0380208492279053, "learning_rate": 0.0001410412805452757, "loss": 1.4212, "step": 324 }, { "epoch": 0.76, "grad_norm": 0.8037036061286926, "learning_rate": 0.00014069665971493274, "loss": 0.8392, "step": 325 }, { "epoch": 0.77, "grad_norm": 0.9248948693275452, "learning_rate": 0.00014035145863967692, "loss": 1.3121, "step": 326 }, { "epoch": 0.77, "grad_norm": 0.8579298853874207, "learning_rate": 0.0001400056822413167, "loss": 1.1128, "step": 327 }, { "epoch": 0.77, "grad_norm": 1.0605120658874512, "learning_rate": 0.0001396593354498635, "loss": 1.543, "step": 328 }, { "epoch": 0.77, "grad_norm": 0.9975443482398987, "learning_rate": 0.0001393124232034613, "loss": 1.1178, "step": 329 }, { "epoch": 0.77, "grad_norm": 0.8115065693855286, "learning_rate": 0.0001389649504483162, "loss": 1.1937, "step": 330 }, { "epoch": 0.78, "grad_norm": 0.7796252369880676, "learning_rate": 0.00013861692213862584, "loss": 1.1886, "step": 331 }, { "epoch": 0.78, "grad_norm": 1.0133821964263916, "learning_rate": 0.000138268343236509, "loss": 1.4973, "step": 332 }, { "epoch": 0.78, "grad_norm": 0.9557147026062012, "learning_rate": 0.00013791921871193457, "loss": 1.4592, "step": 333 }, { "epoch": 0.78, "grad_norm": 0.9763726592063904, "learning_rate": 0.00013756955354265085, "loss": 0.8502, "step": 334 }, { "epoch": 0.79, "grad_norm": 0.8208116888999939, "learning_rate": 0.00013721935271411464, "loss": 1.1601, "step": 335 }, { "epoch": 0.79, "grad_norm": 1.3176727294921875, "learning_rate": 0.0001368686212194199, "loss": 1.1715, "step": 336 }, { "epoch": 0.79, "grad_norm": 1.2329626083374023, "learning_rate": 0.00013651736405922686, "loss": 1.3426, "step": 337 }, { "epoch": 0.79, "grad_norm": 0.9947068691253662, "learning_rate": 0.0001361655862416905, "loss": 1.0623, "step": 338 }, { "epoch": 0.8, "grad_norm": 1.176267147064209, "learning_rate": 0.00013581329278238927, "loss": 1.1281, "step": 339 }, { "epoch": 0.8, "grad_norm": 0.909443736076355, "learning_rate": 0.00013546048870425356, "loss": 1.2623, "step": 340 }, { "epoch": 0.8, "grad_norm": 0.8919989466667175, "learning_rate": 0.000135107179037494, "loss": 1.2652, "step": 341 }, { "epoch": 0.8, "grad_norm": 0.7781542539596558, "learning_rate": 0.00013475336881952986, "loss": 0.9857, "step": 342 }, { "epoch": 0.81, "grad_norm": 0.9232913851737976, "learning_rate": 0.00013439906309491712, "loss": 1.0923, "step": 343 }, { "epoch": 0.81, "grad_norm": 1.1160950660705566, "learning_rate": 0.0001340442669152766, "loss": 1.3445, "step": 344 }, { "epoch": 0.81, "grad_norm": 1.084597110748291, "learning_rate": 0.000133688985339222, "loss": 1.7647, "step": 345 }, { "epoch": 0.81, "grad_norm": 0.8420549631118774, "learning_rate": 0.0001333332234322876, "loss": 1.1342, "step": 346 }, { "epoch": 0.81, "grad_norm": 1.0362187623977661, "learning_rate": 0.0001329769862668563, "loss": 1.0779, "step": 347 }, { "epoch": 0.82, "grad_norm": 0.902492344379425, "learning_rate": 0.00013262027892208694, "loss": 1.1121, "step": 348 }, { "epoch": 0.82, "grad_norm": 1.4322317838668823, "learning_rate": 0.0001322631064838422, "loss": 1.5474, "step": 349 }, { "epoch": 0.82, "grad_norm": 0.8751888275146484, "learning_rate": 0.00013190547404461598, "loss": 1.2055, "step": 350 }, { "epoch": 0.82, "grad_norm": 0.9157432913780212, "learning_rate": 0.0001315473867034608, "loss": 1.3176, "step": 351 }, { "epoch": 0.83, "grad_norm": 0.7300966382026672, "learning_rate": 0.0001311888495659149, "loss": 0.9548, "step": 352 }, { "epoch": 0.83, "grad_norm": 1.0954256057739258, "learning_rate": 0.0001308298677439299, "loss": 1.1649, "step": 353 }, { "epoch": 0.83, "grad_norm": 1.0646469593048096, "learning_rate": 0.00013047044635579747, "loss": 1.3597, "step": 354 }, { "epoch": 0.83, "grad_norm": 0.7668378949165344, "learning_rate": 0.00013011059052607656, "loss": 1.1246, "step": 355 }, { "epoch": 0.84, "grad_norm": 0.9135538339614868, "learning_rate": 0.00012975030538552032, "loss": 1.0189, "step": 356 }, { "epoch": 0.84, "grad_norm": 0.7841051816940308, "learning_rate": 0.00012938959607100288, "loss": 1.1396, "step": 357 }, { "epoch": 0.84, "grad_norm": 0.9529784321784973, "learning_rate": 0.00012902846772544624, "loss": 1.4681, "step": 358 }, { "epoch": 0.84, "grad_norm": 0.8711650967597961, "learning_rate": 0.00012866692549774682, "loss": 0.9842, "step": 359 }, { "epoch": 0.85, "grad_norm": 0.9562662839889526, "learning_rate": 0.00012830497454270205, "loss": 1.3051, "step": 360 }, { "epoch": 0.85, "grad_norm": 1.0756105184555054, "learning_rate": 0.00012794262002093697, "loss": 1.3275, "step": 361 }, { "epoch": 0.85, "grad_norm": 0.7915710806846619, "learning_rate": 0.0001275798670988306, "loss": 1.0035, "step": 362 }, { "epoch": 0.85, "grad_norm": 0.9524595737457275, "learning_rate": 0.0001272167209484422, "loss": 1.2083, "step": 363 }, { "epoch": 0.85, "grad_norm": 1.4926435947418213, "learning_rate": 0.0001268531867474377, "loss": 1.3218, "step": 364 }, { "epoch": 0.86, "grad_norm": 1.2689683437347412, "learning_rate": 0.00012648926967901567, "loss": 2.7813, "step": 365 }, { "epoch": 0.86, "grad_norm": 0.8361314535140991, "learning_rate": 0.00012612497493183364, "loss": 1.124, "step": 366 }, { "epoch": 0.86, "grad_norm": 1.2996618747711182, "learning_rate": 0.00012576030769993393, "loss": 1.3745, "step": 367 }, { "epoch": 0.86, "grad_norm": 0.8248890042304993, "learning_rate": 0.0001253952731826697, "loss": 1.1971, "step": 368 }, { "epoch": 0.87, "grad_norm": 0.8044300079345703, "learning_rate": 0.00012502987658463075, "loss": 1.1508, "step": 369 }, { "epoch": 0.87, "grad_norm": 1.196742057800293, "learning_rate": 0.00012466412311556952, "loss": 0.9868, "step": 370 }, { "epoch": 0.87, "grad_norm": 0.9415065050125122, "learning_rate": 0.0001242980179903264, "loss": 1.046, "step": 371 }, { "epoch": 0.87, "grad_norm": 1.049695611000061, "learning_rate": 0.0001239315664287558, "loss": 0.9927, "step": 372 }, { "epoch": 0.88, "grad_norm": 0.8266507387161255, "learning_rate": 0.00012356477365565148, "loss": 0.8879, "step": 373 }, { "epoch": 0.88, "grad_norm": 0.9163070321083069, "learning_rate": 0.0001231976449006721, "loss": 1.1214, "step": 374 }, { "epoch": 0.88, "grad_norm": 1.01756751537323, "learning_rate": 0.00012283018539826685, "loss": 1.1644, "step": 375 }, { "epoch": 0.88, "grad_norm": 0.901319682598114, "learning_rate": 0.00012246240038760043, "loss": 1.1985, "step": 376 }, { "epoch": 0.88, "grad_norm": 0.9721381664276123, "learning_rate": 0.00012209429511247864, "loss": 1.1199, "step": 377 }, { "epoch": 0.89, "grad_norm": 0.8883329033851624, "learning_rate": 0.0001217258748212737, "loss": 1.3431, "step": 378 }, { "epoch": 0.89, "grad_norm": 1.0698317289352417, "learning_rate": 0.00012135714476684903, "loss": 1.3173, "step": 379 }, { "epoch": 0.89, "grad_norm": 0.8664084076881409, "learning_rate": 0.00012098811020648475, "loss": 1.0441, "step": 380 }, { "epoch": 0.89, "grad_norm": 0.9194340109825134, "learning_rate": 0.00012061877640180255, "loss": 1.152, "step": 381 }, { "epoch": 0.9, "grad_norm": 0.9599464535713196, "learning_rate": 0.00012024914861869063, "loss": 1.1115, "step": 382 }, { "epoch": 0.9, "grad_norm": 0.9990159273147583, "learning_rate": 0.00011987923212722872, "loss": 1.2534, "step": 383 }, { "epoch": 0.9, "grad_norm": 0.8435646891593933, "learning_rate": 0.00011950903220161285, "loss": 1.1752, "step": 384 }, { "epoch": 0.9, "grad_norm": 1.0376098155975342, "learning_rate": 0.00011913855412008023, "loss": 1.4716, "step": 385 }, { "epoch": 0.91, "grad_norm": 1.3249186277389526, "learning_rate": 0.00011876780316483401, "loss": 1.211, "step": 386 }, { "epoch": 0.91, "grad_norm": 1.378393292427063, "learning_rate": 0.00011839678462196784, "loss": 1.0357, "step": 387 }, { "epoch": 0.91, "grad_norm": 0.7574142217636108, "learning_rate": 0.0001180255037813906, "loss": 0.4137, "step": 388 }, { "epoch": 0.91, "grad_norm": 0.7813417911529541, "learning_rate": 0.00011765396593675097, "loss": 1.1776, "step": 389 }, { "epoch": 0.92, "grad_norm": 0.8787057995796204, "learning_rate": 0.00011728217638536197, "loss": 1.1352, "step": 390 }, { "epoch": 0.92, "grad_norm": 0.9643175005912781, "learning_rate": 0.00011691014042812536, "loss": 1.3089, "step": 391 }, { "epoch": 0.92, "grad_norm": 0.9101107716560364, "learning_rate": 0.00011653786336945614, "loss": 1.0639, "step": 392 }, { "epoch": 0.92, "grad_norm": 1.018091082572937, "learning_rate": 0.00011616535051720685, "loss": 0.9938, "step": 393 }, { "epoch": 0.92, "grad_norm": 0.9708930253982544, "learning_rate": 0.00011579260718259197, "loss": 0.8004, "step": 394 }, { "epoch": 0.93, "grad_norm": 0.8909386396408081, "learning_rate": 0.00011541963868011212, "loss": 1.2997, "step": 395 }, { "epoch": 0.93, "grad_norm": 1.0622750520706177, "learning_rate": 0.00011504645032747832, "loss": 1.0235, "step": 396 }, { "epoch": 0.93, "grad_norm": 0.8857365250587463, "learning_rate": 0.00011467304744553618, "loss": 0.8382, "step": 397 }, { "epoch": 0.93, "grad_norm": 0.8980242013931274, "learning_rate": 0.00011429943535819005, "loss": 1.0877, "step": 398 }, { "epoch": 0.94, "grad_norm": 1.1426031589508057, "learning_rate": 0.00011392561939232706, "loss": 1.3496, "step": 399 }, { "epoch": 0.94, "grad_norm": 1.0347543954849243, "learning_rate": 0.0001135516048777412, "loss": 1.6309, "step": 400 }, { "epoch": 0.94, "grad_norm": 1.0121687650680542, "learning_rate": 0.00011317739714705731, "loss": 1.2256, "step": 401 }, { "epoch": 0.94, "grad_norm": 0.8863442540168762, "learning_rate": 0.0001128030015356551, "loss": 0.8687, "step": 402 }, { "epoch": 0.95, "grad_norm": 0.7622981667518616, "learning_rate": 0.00011242842338159309, "loss": 0.7564, "step": 403 }, { "epoch": 0.95, "grad_norm": 0.9527961015701294, "learning_rate": 0.0001120536680255323, "loss": 1.0593, "step": 404 }, { "epoch": 0.95, "grad_norm": 1.3481955528259277, "learning_rate": 0.00011167874081066045, "loss": 1.2279, "step": 405 }, { "epoch": 0.95, "grad_norm": 0.8665672540664673, "learning_rate": 0.00011130364708261552, "loss": 1.1677, "step": 406 }, { "epoch": 0.96, "grad_norm": 1.217490553855896, "learning_rate": 0.0001109283921894095, "loss": 1.2617, "step": 407 }, { "epoch": 0.96, "grad_norm": 0.8935596942901611, "learning_rate": 0.00011055298148135236, "loss": 1.1184, "step": 408 }, { "epoch": 0.96, "grad_norm": 0.8513955473899841, "learning_rate": 0.00011017742031097563, "loss": 1.2705, "step": 409 }, { "epoch": 0.96, "grad_norm": 1.0295133590698242, "learning_rate": 0.0001098017140329561, "loss": 1.1966, "step": 410 }, { "epoch": 0.96, "grad_norm": 1.1029167175292969, "learning_rate": 0.0001094258680040394, "loss": 1.4887, "step": 411 }, { "epoch": 0.97, "grad_norm": 0.9045723080635071, "learning_rate": 0.0001090498875829638, "loss": 1.1461, "step": 412 }, { "epoch": 0.97, "grad_norm": 0.8317312002182007, "learning_rate": 0.00010867377813038366, "loss": 1.136, "step": 413 }, { "epoch": 0.97, "grad_norm": 1.0023647546768188, "learning_rate": 0.00010829754500879308, "loss": 1.1123, "step": 414 }, { "epoch": 0.97, "grad_norm": 0.9197617769241333, "learning_rate": 0.00010792119358244939, "loss": 1.2792, "step": 415 }, { "epoch": 0.98, "grad_norm": 0.9892452955245972, "learning_rate": 0.00010754472921729661, "loss": 1.634, "step": 416 }, { "epoch": 0.98, "grad_norm": 0.8005648255348206, "learning_rate": 0.00010716815728088912, "loss": 0.7168, "step": 417 }, { "epoch": 0.98, "grad_norm": 1.1989247798919678, "learning_rate": 0.00010679148314231504, "loss": 1.2882, "step": 418 }, { "epoch": 0.98, "grad_norm": 0.7820172905921936, "learning_rate": 0.00010641471217211958, "loss": 1.1125, "step": 419 }, { "epoch": 0.99, "grad_norm": 1.352563500404358, "learning_rate": 0.00010603784974222861, "loss": 0.9641, "step": 420 }, { "epoch": 0.99, "grad_norm": 0.8966504335403442, "learning_rate": 0.000105660901225872, "loss": 1.1155, "step": 421 }, { "epoch": 0.99, "grad_norm": 0.8722444176673889, "learning_rate": 0.00010528387199750707, "loss": 1.3011, "step": 422 }, { "epoch": 0.99, "grad_norm": 0.8678218722343445, "learning_rate": 0.00010490676743274181, "loss": 1.2912, "step": 423 }, { "epoch": 1.0, "grad_norm": 0.8596826791763306, "learning_rate": 0.00010452959290825846, "loss": 1.3792, "step": 424 }, { "epoch": 1.0, "grad_norm": 0.777655303478241, "learning_rate": 0.00010415235380173662, "loss": 0.9992, "step": 425 }, { "epoch": 1.0, "grad_norm": 0.7913762331008911, "learning_rate": 0.00010377505549177682, "loss": 0.8813, "step": 426 }, { "epoch": 1.0, "grad_norm": 0.8607832789421082, "learning_rate": 0.00010339770335782359, "loss": 0.9927, "step": 427 }, { "epoch": 1.0, "grad_norm": 0.8529496192932129, "learning_rate": 0.0001030203027800889, "loss": 1.1638, "step": 428 }, { "epoch": 1.0, "eval_loss": 1.109578251838684, "eval_runtime": 5.1972, "eval_samples_per_second": 19.241, "eval_steps_per_second": 19.241, "step": 428 }, { "epoch": 1.01, "grad_norm": 1.0168473720550537, "learning_rate": 0.00010264285913947545, "loss": 1.3069, "step": 429 }, { "epoch": 1.01, "grad_norm": 1.2173975706100464, "learning_rate": 0.00010226537781749987, "loss": 0.9985, "step": 430 }, { "epoch": 1.01, "grad_norm": 0.8822383284568787, "learning_rate": 0.00010188786419621612, "loss": 1.1801, "step": 431 }, { "epoch": 1.01, "grad_norm": 1.145887851715088, "learning_rate": 0.00010151032365813859, "loss": 1.2034, "step": 432 }, { "epoch": 1.02, "grad_norm": 0.7805179953575134, "learning_rate": 0.00010113276158616553, "loss": 1.2896, "step": 433 }, { "epoch": 1.02, "grad_norm": 0.8651528358459473, "learning_rate": 0.00010075518336350218, "loss": 1.2091, "step": 434 }, { "epoch": 1.02, "grad_norm": 1.0866674184799194, "learning_rate": 0.00010037759437358398, "loss": 1.0422, "step": 435 }, { "epoch": 1.02, "grad_norm": 1.0740783214569092, "learning_rate": 0.0001, "loss": 1.3082, "step": 436 }, { "epoch": 1.03, "grad_norm": 0.7619555592536926, "learning_rate": 9.962240562641602e-05, "loss": 0.9208, "step": 437 }, { "epoch": 1.03, "grad_norm": 1.1175497770309448, "learning_rate": 9.924481663649785e-05, "loss": 1.1327, "step": 438 }, { "epoch": 1.03, "grad_norm": 0.8330385088920593, "learning_rate": 9.886723841383448e-05, "loss": 0.9008, "step": 439 }, { "epoch": 1.0, "grad_norm": 0.7999448776245117, "learning_rate": 9.848967634186142e-05, "loss": 0.8613, "step": 440 }, { "epoch": 1.0, "grad_norm": 0.866118311882019, "learning_rate": 9.81121358037839e-05, "loss": 1.2199, "step": 441 }, { "epoch": 1.01, "grad_norm": 0.882002055644989, "learning_rate": 9.773462218250015e-05, "loss": 0.8658, "step": 442 }, { "epoch": 1.01, "grad_norm": 0.8205627799034119, "learning_rate": 9.735714086052458e-05, "loss": 1.189, "step": 443 }, { "epoch": 1.01, "grad_norm": 0.9855633974075317, "learning_rate": 9.697969721991114e-05, "loss": 0.8399, "step": 444 }, { "epoch": 1.01, "grad_norm": 0.891368567943573, "learning_rate": 9.660229664217642e-05, "loss": 1.0189, "step": 445 }, { "epoch": 1.02, "grad_norm": 0.906017541885376, "learning_rate": 9.62249445082232e-05, "loss": 1.0986, "step": 446 }, { "epoch": 1.02, "grad_norm": 1.08309805393219, "learning_rate": 9.584764619826339e-05, "loss": 1.1887, "step": 447 }, { "epoch": 1.02, "grad_norm": 0.8647370934486389, "learning_rate": 9.547040709174159e-05, "loss": 0.8142, "step": 448 }, { "epoch": 1.02, "grad_norm": 1.3828812837600708, "learning_rate": 9.509323256725821e-05, "loss": 0.882, "step": 449 }, { "epoch": 1.03, "grad_norm": 0.8859632611274719, "learning_rate": 9.471612800249296e-05, "loss": 0.9624, "step": 450 }, { "epoch": 1.03, "grad_norm": 0.7756340503692627, "learning_rate": 9.433909877412802e-05, "loss": 0.8844, "step": 451 }, { "epoch": 1.03, "grad_norm": 1.2693322896957397, "learning_rate": 9.396215025777139e-05, "loss": 1.0696, "step": 452 }, { "epoch": 1.03, "grad_norm": 0.8535933494567871, "learning_rate": 9.358528782788045e-05, "loss": 0.8664, "step": 453 }, { "epoch": 1.04, "grad_norm": 0.8840806484222412, "learning_rate": 9.320851685768497e-05, "loss": 0.744, "step": 454 }, { "epoch": 1.04, "grad_norm": 1.1074801683425903, "learning_rate": 9.283184271911089e-05, "loss": 1.0923, "step": 455 }, { "epoch": 1.04, "grad_norm": 1.0910581350326538, "learning_rate": 9.245527078270341e-05, "loss": 0.8355, "step": 456 }, { "epoch": 1.04, "grad_norm": 1.0176016092300415, "learning_rate": 9.207880641755065e-05, "loss": 0.8598, "step": 457 }, { "epoch": 1.04, "grad_norm": 0.9024606347084045, "learning_rate": 9.170245499120693e-05, "loss": 0.8282, "step": 458 }, { "epoch": 1.05, "grad_norm": 0.70665442943573, "learning_rate": 9.132622186961637e-05, "loss": 0.428, "step": 459 }, { "epoch": 1.05, "grad_norm": 0.8624319434165955, "learning_rate": 9.095011241703623e-05, "loss": 0.9361, "step": 460 }, { "epoch": 1.05, "grad_norm": 1.5043039321899414, "learning_rate": 9.057413199596065e-05, "loss": 0.806, "step": 461 }, { "epoch": 1.05, "grad_norm": 1.1076829433441162, "learning_rate": 9.019828596704394e-05, "loss": 0.8207, "step": 462 }, { "epoch": 1.06, "grad_norm": NaN, "learning_rate": 9.019828596704394e-05, "loss": 0.8544, "step": 463 }, { "epoch": 1.06, "grad_norm": 0.9130039215087891, "learning_rate": 8.982257968902438e-05, "loss": 0.8329, "step": 464 }, { "epoch": 1.06, "grad_norm": 1.1653575897216797, "learning_rate": 8.944701851864767e-05, "loss": 0.8211, "step": 465 }, { "epoch": 1.06, "grad_norm": 1.4047077894210815, "learning_rate": 8.907160781059052e-05, "loss": 1.2467, "step": 466 }, { "epoch": 1.07, "grad_norm": 1.0421421527862549, "learning_rate": 8.869635291738452e-05, "loss": 1.0891, "step": 467 }, { "epoch": 1.07, "grad_norm": 0.8077785968780518, "learning_rate": 8.832125918933954e-05, "loss": 0.6497, "step": 468 }, { "epoch": 1.07, "grad_norm": 1.1150001287460327, "learning_rate": 8.79463319744677e-05, "loss": 0.8212, "step": 469 }, { "epoch": 1.07, "grad_norm": 0.704976499080658, "learning_rate": 8.757157661840693e-05, "loss": 0.6064, "step": 470 }, { "epoch": 1.08, "grad_norm": 0.9938413500785828, "learning_rate": 8.719699846434492e-05, "loss": 0.8411, "step": 471 }, { "epoch": 1.08, "grad_norm": 0.8569329380989075, "learning_rate": 8.682260285294271e-05, "loss": 0.7593, "step": 472 }, { "epoch": 1.08, "grad_norm": 1.0257930755615234, "learning_rate": 8.644839512225886e-05, "loss": 0.9576, "step": 473 }, { "epoch": 1.08, "grad_norm": 1.0176326036453247, "learning_rate": 8.607438060767296e-05, "loss": 1.0099, "step": 474 }, { "epoch": 1.08, "grad_norm": 1.1403366327285767, "learning_rate": 8.570056464180998e-05, "loss": 0.6884, "step": 475 }, { "epoch": 1.09, "grad_norm": 0.9332993626594543, "learning_rate": 8.532695255446383e-05, "loss": 0.8534, "step": 476 }, { "epoch": 1.09, "grad_norm": 1.0836379528045654, "learning_rate": 8.495354967252169e-05, "loss": 0.9814, "step": 477 }, { "epoch": 1.09, "grad_norm": 1.038662075996399, "learning_rate": 8.458036131988792e-05, "loss": 0.8299, "step": 478 }, { "epoch": 1.09, "grad_norm": 0.9515346884727478, "learning_rate": 8.420739281740805e-05, "loss": 0.6605, "step": 479 }, { "epoch": 1.1, "grad_norm": 1.1238003969192505, "learning_rate": 8.383464948279319e-05, "loss": 0.9265, "step": 480 }, { "epoch": 1.1, "grad_norm": 1.245451807975769, "learning_rate": 8.346213663054387e-05, "loss": 1.0276, "step": 481 }, { "epoch": 1.1, "grad_norm": 1.0740351676940918, "learning_rate": 8.308985957187466e-05, "loss": 0.894, "step": 482 }, { "epoch": 1.1, "grad_norm": 1.1485562324523926, "learning_rate": 8.271782361463805e-05, "loss": 1.029, "step": 483 }, { "epoch": 1.11, "grad_norm": 0.9134268760681152, "learning_rate": 8.234603406324908e-05, "loss": 0.6061, "step": 484 }, { "epoch": 1.11, "grad_norm": 1.0883495807647705, "learning_rate": 8.197449621860943e-05, "loss": 0.3585, "step": 485 }, { "epoch": 1.11, "grad_norm": 1.11465322971344, "learning_rate": 8.16032153780322e-05, "loss": 1.0824, "step": 486 }, { "epoch": 1.11, "grad_norm": 1.1381958723068237, "learning_rate": 8.123219683516603e-05, "loss": 1.2363, "step": 487 }, { "epoch": 1.12, "grad_norm": 0.9504216313362122, "learning_rate": 8.08614458799198e-05, "loss": 0.9756, "step": 488 }, { "epoch": 1.12, "grad_norm": 0.6939163208007812, "learning_rate": 8.049096779838719e-05, "loss": 0.3759, "step": 489 }, { "epoch": 1.12, "grad_norm": 1.0341072082519531, "learning_rate": 8.01207678727713e-05, "loss": 0.8908, "step": 490 }, { "epoch": 1.12, "grad_norm": 1.0956295728683472, "learning_rate": 7.975085138130938e-05, "loss": 0.7801, "step": 491 }, { "epoch": 1.12, "grad_norm": 1.07564115524292, "learning_rate": 7.938122359819746e-05, "loss": 0.7834, "step": 492 }, { "epoch": 1.13, "grad_norm": 1.0565071105957031, "learning_rate": 7.901188979351526e-05, "loss": 0.8172, "step": 493 }, { "epoch": 1.13, "grad_norm": 1.0685640573501587, "learning_rate": 7.864285523315096e-05, "loss": 0.7835, "step": 494 }, { "epoch": 1.13, "grad_norm": 0.9400045275688171, "learning_rate": 7.827412517872634e-05, "loss": 0.6592, "step": 495 }, { "epoch": 1.13, "grad_norm": 1.1758517026901245, "learning_rate": 7.790570488752135e-05, "loss": 0.8823, "step": 496 }, { "epoch": 1.14, "grad_norm": 1.1540248394012451, "learning_rate": 7.753759961239964e-05, "loss": 0.849, "step": 497 }, { "epoch": 1.14, "grad_norm": 1.1850864887237549, "learning_rate": 7.716981460173319e-05, "loss": 0.7586, "step": 498 }, { "epoch": 1.14, "grad_norm": 1.033463954925537, "learning_rate": 7.68023550993279e-05, "loss": 0.9891, "step": 499 }, { "epoch": 1.14, "grad_norm": 1.2690588235855103, "learning_rate": 7.643522634434856e-05, "loss": 1.1184, "step": 500 }, { "epoch": 1.15, "grad_norm": 1.6630959510803223, "learning_rate": 7.606843357124426e-05, "loss": 1.0677, "step": 501 }, { "epoch": 1.15, "grad_norm": 1.0995733737945557, "learning_rate": 7.570198200967362e-05, "loss": 0.9593, "step": 502 }, { "epoch": 1.15, "grad_norm": 1.1102938652038574, "learning_rate": 7.533587688443049e-05, "loss": 0.7075, "step": 503 }, { "epoch": 1.15, "grad_norm": 1.3560442924499512, "learning_rate": 7.497012341536924e-05, "loss": 1.0318, "step": 504 }, { "epoch": 1.15, "grad_norm": 1.0489193201065063, "learning_rate": 7.460472681733031e-05, "loss": 0.7033, "step": 505 }, { "epoch": 1.16, "grad_norm": 1.1091972589492798, "learning_rate": 7.423969230006609e-05, "loss": 0.907, "step": 506 }, { "epoch": 1.16, "grad_norm": 1.096968173980713, "learning_rate": 7.387502506816638e-05, "loss": 0.9167, "step": 507 }, { "epoch": 1.16, "grad_norm": 1.8477667570114136, "learning_rate": 7.351073032098437e-05, "loss": 0.9794, "step": 508 }, { "epoch": 1.16, "grad_norm": 0.8208603858947754, "learning_rate": 7.314681325256232e-05, "loss": 0.7489, "step": 509 }, { "epoch": 1.17, "grad_norm": 1.3617076873779297, "learning_rate": 7.278327905155783e-05, "loss": 1.0549, "step": 510 }, { "epoch": 1.17, "grad_norm": 1.2204340696334839, "learning_rate": 7.242013290116944e-05, "loss": 1.0512, "step": 511 }, { "epoch": 1.17, "grad_norm": 1.2859915494918823, "learning_rate": 7.205737997906307e-05, "loss": 0.8753, "step": 512 }, { "epoch": 1.17, "grad_norm": 1.184820532798767, "learning_rate": 7.169502545729797e-05, "loss": 0.7513, "step": 513 }, { "epoch": 1.18, "grad_norm": 1.4803907871246338, "learning_rate": 7.133307450225322e-05, "loss": 0.9857, "step": 514 }, { "epoch": 1.18, "grad_norm": 0.8424803614616394, "learning_rate": 7.097153227455379e-05, "loss": 0.586, "step": 515 }, { "epoch": 1.18, "grad_norm": 1.2504682540893555, "learning_rate": 7.061040392899712e-05, "loss": 0.7598, "step": 516 }, { "epoch": 1.18, "grad_norm": 1.072726845741272, "learning_rate": 7.024969461447972e-05, "loss": 0.725, "step": 517 }, { "epoch": 1.19, "grad_norm": 1.133083701133728, "learning_rate": 6.988940947392344e-05, "loss": 0.943, "step": 518 }, { "epoch": 1.19, "grad_norm": 1.823098063468933, "learning_rate": 6.952955364420255e-05, "loss": 0.8723, "step": 519 }, { "epoch": 1.19, "grad_norm": 1.2997591495513916, "learning_rate": 6.91701322560701e-05, "loss": 1.0831, "step": 520 }, { "epoch": 1.19, "grad_norm": 1.3265862464904785, "learning_rate": 6.881115043408511e-05, "loss": 0.9224, "step": 521 }, { "epoch": 1.19, "grad_norm": 1.3687394857406616, "learning_rate": 6.845261329653922e-05, "loss": 1.2177, "step": 522 }, { "epoch": 1.2, "grad_norm": 1.3694826364517212, "learning_rate": 6.809452595538402e-05, "loss": 0.9281, "step": 523 }, { "epoch": 1.2, "grad_norm": 0.8991251587867737, "learning_rate": 6.77368935161578e-05, "loss": 0.7164, "step": 524 }, { "epoch": 1.2, "grad_norm": 1.059921145439148, "learning_rate": 6.73797210779131e-05, "loss": 0.7976, "step": 525 }, { "epoch": 1.2, "grad_norm": 1.2316731214523315, "learning_rate": 6.70230137331437e-05, "loss": 0.9325, "step": 526 }, { "epoch": 1.21, "grad_norm": 1.3245116472244263, "learning_rate": 6.666677656771239e-05, "loss": 0.7446, "step": 527 }, { "epoch": 1.21, "grad_norm": 1.056368112564087, "learning_rate": 6.6311014660778e-05, "loss": 0.6862, "step": 528 }, { "epoch": 1.21, "grad_norm": 1.4571599960327148, "learning_rate": 6.595573308472338e-05, "loss": 1.0019, "step": 529 }, { "epoch": 1.21, "grad_norm": 1.2216399908065796, "learning_rate": 6.56009369050829e-05, "loss": 0.8784, "step": 530 }, { "epoch": 1.22, "grad_norm": 1.440184473991394, "learning_rate": 6.524663118047016e-05, "loss": 1.7494, "step": 531 }, { "epoch": 1.22, "grad_norm": 0.9794695973396301, "learning_rate": 6.489282096250601e-05, "loss": 0.7664, "step": 532 }, { "epoch": 1.22, "grad_norm": 1.0644843578338623, "learning_rate": 6.453951129574644e-05, "loss": 1.1805, "step": 533 }, { "epoch": 1.22, "grad_norm": 1.155930995941162, "learning_rate": 6.418670721761073e-05, "loss": 0.8715, "step": 534 }, { "epoch": 1.23, "grad_norm": 1.2466217279434204, "learning_rate": 6.383441375830951e-05, "loss": 1.1003, "step": 535 }, { "epoch": 1.23, "eval_loss": 1.1363521814346313, "eval_runtime": 5.0483, "eval_samples_per_second": 19.808, "eval_steps_per_second": 19.808, "step": 535 }, { "epoch": 1.23, "grad_norm": 1.0918858051300049, "learning_rate": 6.34826359407732e-05, "loss": 1.0192, "step": 536 }, { "epoch": 1.23, "grad_norm": 1.0371659994125366, "learning_rate": 6.313137878058013e-05, "loss": 0.8525, "step": 537 }, { "epoch": 1.23, "grad_norm": 1.3210889101028442, "learning_rate": 6.278064728588542e-05, "loss": 0.9921, "step": 538 }, { "epoch": 1.23, "grad_norm": 1.1951533555984497, "learning_rate": 6.243044645734917e-05, "loss": 0.8915, "step": 539 }, { "epoch": 1.24, "grad_norm": 1.4079426527023315, "learning_rate": 6.20807812880655e-05, "loss": 1.0806, "step": 540 }, { "epoch": 1.24, "grad_norm": 1.255631685256958, "learning_rate": 6.173165676349103e-05, "loss": 0.7914, "step": 541 }, { "epoch": 1.24, "grad_norm": 1.08389413356781, "learning_rate": 6.138307786137415e-05, "loss": 0.8829, "step": 542 }, { "epoch": 1.24, "grad_norm": 1.3811546564102173, "learning_rate": 6.103504955168382e-05, "loss": 1.0245, "step": 543 }, { "epoch": 1.25, "grad_norm": 1.1965594291687012, "learning_rate": 6.068757679653868e-05, "loss": 1.0113, "step": 544 }, { "epoch": 1.25, "grad_norm": 1.3283885717391968, "learning_rate": 6.034066455013649e-05, "loss": 0.8461, "step": 545 }, { "epoch": 1.25, "grad_norm": 1.3063422441482544, "learning_rate": 5.999431775868329e-05, "loss": 0.7606, "step": 546 }, { "epoch": 1.25, "grad_norm": 1.1690608263015747, "learning_rate": 5.9648541360323095e-05, "loss": 0.9931, "step": 547 }, { "epoch": 1.26, "grad_norm": 0.7004899978637695, "learning_rate": 5.930334028506725e-05, "loss": 0.2953, "step": 548 }, { "epoch": 1.26, "grad_norm": 1.0900754928588867, "learning_rate": 5.8958719454724346e-05, "loss": 0.8003, "step": 549 }, { "epoch": 1.26, "grad_norm": 1.1421937942504883, "learning_rate": 5.8614683782829835e-05, "loss": 0.8811, "step": 550 }, { "epoch": 1.26, "grad_norm": 1.823502540588379, "learning_rate": 5.8271238174576305e-05, "loss": 1.1777, "step": 551 }, { "epoch": 1.27, "grad_norm": 1.483927607536316, "learning_rate": 5.792838752674309e-05, "loss": 0.9416, "step": 552 }, { "epoch": 1.27, "grad_norm": 1.1196835041046143, "learning_rate": 5.75861367276269e-05, "loss": 0.7103, "step": 553 }, { "epoch": 1.27, "grad_norm": 1.209674596786499, "learning_rate": 5.7244490656971815e-05, "loss": 1.1059, "step": 554 }, { "epoch": 1.27, "grad_norm": 1.0897173881530762, "learning_rate": 5.6903454185899774e-05, "loss": 0.8877, "step": 555 }, { "epoch": 1.27, "grad_norm": 1.4691230058670044, "learning_rate": 5.6563032176841324e-05, "loss": 0.8161, "step": 556 }, { "epoch": 1.28, "grad_norm": 1.5138698816299438, "learning_rate": 5.622322948346594e-05, "loss": 1.0267, "step": 557 }, { "epoch": 1.28, "grad_norm": 1.6125860214233398, "learning_rate": 5.588405095061322e-05, "loss": 1.1, "step": 558 }, { "epoch": 1.28, "grad_norm": 1.2148243188858032, "learning_rate": 5.55455014142234e-05, "loss": 0.9657, "step": 559 }, { "epoch": 1.28, "grad_norm": 1.327254295349121, "learning_rate": 5.5207585701268805e-05, "loss": 1.0631, "step": 560 }, { "epoch": 1.29, "grad_norm": 1.1820276975631714, "learning_rate": 5.4870308629684677e-05, "loss": 0.6627, "step": 561 }, { "epoch": 1.29, "grad_norm": 1.6765048503875732, "learning_rate": 5.453367500830069e-05, "loss": 1.0243, "step": 562 }, { "epoch": 1.29, "grad_norm": 1.2270108461380005, "learning_rate": 5.4197689636772334e-05, "loss": 0.7945, "step": 563 }, { "epoch": 1.29, "grad_norm": 0.985974133014679, "learning_rate": 5.386235730551252e-05, "loss": 0.7504, "step": 564 }, { "epoch": 1.3, "grad_norm": 1.375940203666687, "learning_rate": 5.3527682795623146e-05, "loss": 0.9097, "step": 565 }, { "epoch": 1.3, "grad_norm": 1.0430001020431519, "learning_rate": 5.319367087882716e-05, "loss": 0.9843, "step": 566 }, { "epoch": 1.3, "grad_norm": 1.0604748725891113, "learning_rate": 5.286032631740023e-05, "loss": 0.6792, "step": 567 }, { "epoch": 1.3, "grad_norm": 1.05086088180542, "learning_rate": 5.252765386410312e-05, "loss": 0.632, "step": 568 }, { "epoch": 1.31, "grad_norm": 1.214124083518982, "learning_rate": 5.2195658262113814e-05, "loss": 0.7741, "step": 569 }, { "epoch": 1.31, "grad_norm": 1.1835461854934692, "learning_rate": 5.186434424495979e-05, "loss": 0.9085, "step": 570 }, { "epoch": 1.31, "grad_norm": 1.428858757019043, "learning_rate": 5.1533716536450693e-05, "loss": 1.4804, "step": 571 }, { "epoch": 1.31, "grad_norm": 1.3231133222579956, "learning_rate": 5.1203779850610864e-05, "loss": 0.7928, "step": 572 }, { "epoch": 1.31, "grad_norm": 1.2435096502304077, "learning_rate": 5.087453889161229e-05, "loss": 0.8173, "step": 573 }, { "epoch": 1.32, "grad_norm": 1.05045485496521, "learning_rate": 5.054599835370724e-05, "loss": 0.9197, "step": 574 }, { "epoch": 1.32, "grad_norm": 1.5020091533660889, "learning_rate": 5.021816292116175e-05, "loss": 1.0374, "step": 575 }, { "epoch": 1.32, "grad_norm": 1.2051324844360352, "learning_rate": 4.989103726818836e-05, "loss": 0.5828, "step": 576 }, { "epoch": 1.32, "grad_norm": 1.2995752096176147, "learning_rate": 4.956462605887994e-05, "loss": 0.9762, "step": 577 }, { "epoch": 1.33, "grad_norm": 1.293330430984497, "learning_rate": 4.923893394714279e-05, "loss": 0.851, "step": 578 }, { "epoch": 1.33, "grad_norm": 1.4019485712051392, "learning_rate": 4.891396557663056e-05, "loss": 0.8867, "step": 579 }, { "epoch": 1.33, "grad_norm": 1.1992961168289185, "learning_rate": 4.8589725580677835e-05, "loss": 0.8609, "step": 580 }, { "epoch": 1.33, "grad_norm": 1.3640302419662476, "learning_rate": 4.826621858223431e-05, "loss": 0.8686, "step": 581 }, { "epoch": 1.34, "grad_norm": 1.2899017333984375, "learning_rate": 4.794344919379872e-05, "loss": 0.7867, "step": 582 }, { "epoch": 1.34, "grad_norm": 1.468520164489746, "learning_rate": 4.762142201735299e-05, "loss": 0.8161, "step": 583 }, { "epoch": 1.34, "grad_norm": 1.1621512174606323, "learning_rate": 4.730014164429689e-05, "loss": 0.7486, "step": 584 }, { "epoch": 1.34, "grad_norm": 1.599510669708252, "learning_rate": 4.697961265538231e-05, "loss": 0.7786, "step": 585 }, { "epoch": 1.35, "grad_norm": 1.5685900449752808, "learning_rate": 4.6659839620648074e-05, "loss": 0.9098, "step": 586 }, { "epoch": 1.35, "grad_norm": 1.2052661180496216, "learning_rate": 4.634082709935473e-05, "loss": 0.845, "step": 587 }, { "epoch": 1.35, "grad_norm": 1.2109522819519043, "learning_rate": 4.6022579639919695e-05, "loss": 0.7274, "step": 588 }, { "epoch": 1.35, "grad_norm": 1.354245662689209, "learning_rate": 4.5705101779852135e-05, "loss": 0.9229, "step": 589 }, { "epoch": 1.35, "grad_norm": 1.398025631904602, "learning_rate": 4.5388398045688566e-05, "loss": 0.7834, "step": 590 }, { "epoch": 1.36, "grad_norm": 1.30653977394104, "learning_rate": 4.507247295292801e-05, "loss": 0.9327, "step": 591 }, { "epoch": 1.36, "grad_norm": 0.9323519468307495, "learning_rate": 4.475733100596795e-05, "loss": 0.5555, "step": 592 }, { "epoch": 1.36, "grad_norm": 1.2603791952133179, "learning_rate": 4.444297669803981e-05, "loss": 0.8119, "step": 593 }, { "epoch": 1.36, "grad_norm": 1.2332367897033691, "learning_rate": 4.412941451114498e-05, "loss": 0.8452, "step": 594 }, { "epoch": 1.37, "grad_norm": 1.3796523809432983, "learning_rate": 4.381664891599111e-05, "loss": 1.0362, "step": 595 }, { "epoch": 1.37, "grad_norm": 1.1836915016174316, "learning_rate": 4.3504684371928006e-05, "loss": 0.7377, "step": 596 }, { "epoch": 1.37, "grad_norm": 1.4385707378387451, "learning_rate": 4.3193525326884435e-05, "loss": 0.9733, "step": 597 }, { "epoch": 1.37, "grad_norm": 1.4326701164245605, "learning_rate": 4.288317621730434e-05, "loss": 0.8767, "step": 598 }, { "epoch": 1.38, "grad_norm": 1.1379212141036987, "learning_rate": 4.257364146808393e-05, "loss": 0.8179, "step": 599 }, { "epoch": 1.38, "grad_norm": 1.343801498413086, "learning_rate": 4.226492549250829e-05, "loss": 0.7713, "step": 600 }, { "epoch": 1.38, "grad_norm": 1.1334872245788574, "learning_rate": 4.195703269218868e-05, "loss": 0.9144, "step": 601 }, { "epoch": 1.38, "grad_norm": 1.1683942079544067, "learning_rate": 4.164996745699966e-05, "loss": 0.8409, "step": 602 }, { "epoch": 1.38, "grad_norm": 1.074876308441162, "learning_rate": 4.1343734165016514e-05, "loss": 0.5677, "step": 603 }, { "epoch": 1.39, "grad_norm": 1.4278324842453003, "learning_rate": 4.1038337182452826e-05, "loss": 0.8643, "step": 604 }, { "epoch": 1.39, "grad_norm": 1.0636829137802124, "learning_rate": 4.0733780863598335e-05, "loss": 0.7189, "step": 605 }, { "epoch": 1.39, "grad_norm": 1.238641381263733, "learning_rate": 4.0430069550756665e-05, "loss": 0.8048, "step": 606 }, { "epoch": 1.39, "grad_norm": 1.2493727207183838, "learning_rate": 4.012720757418358e-05, "loss": 0.6849, "step": 607 }, { "epoch": 1.4, "grad_norm": 1.04925537109375, "learning_rate": 3.9825199252025184e-05, "loss": 0.6783, "step": 608 }, { "epoch": 1.4, "grad_norm": 1.0140671730041504, "learning_rate": 3.952404889025626e-05, "loss": 0.6275, "step": 609 }, { "epoch": 1.4, "grad_norm": 1.292979121208191, "learning_rate": 3.9223760782619045e-05, "loss": 0.6685, "step": 610 }, { "epoch": 1.4, "grad_norm": 1.355290174484253, "learning_rate": 3.8924339210561836e-05, "loss": 0.7877, "step": 611 }, { "epoch": 1.41, "grad_norm": 1.2331466674804688, "learning_rate": 3.862578844317817e-05, "loss": 0.8935, "step": 612 }, { "epoch": 1.41, "grad_norm": 1.4775677919387817, "learning_rate": 3.832811273714569e-05, "loss": 0.8254, "step": 613 }, { "epoch": 1.41, "grad_norm": 1.1989951133728027, "learning_rate": 3.803131633666572e-05, "loss": 0.7965, "step": 614 }, { "epoch": 1.41, "grad_norm": 1.2056313753128052, "learning_rate": 3.773540347340248e-05, "loss": 1.2453, "step": 615 }, { "epoch": 1.42, "grad_norm": 0.9701207876205444, "learning_rate": 3.7440378366423e-05, "loss": 0.5302, "step": 616 }, { "epoch": 1.42, "grad_norm": 1.1523936986923218, "learning_rate": 3.714624522213681e-05, "loss": 0.6319, "step": 617 }, { "epoch": 1.42, "grad_norm": 1.7994964122772217, "learning_rate": 3.685300823423602e-05, "loss": 0.8799, "step": 618 }, { "epoch": 1.42, "grad_norm": 1.305521011352539, "learning_rate": 3.6560671583635467e-05, "loss": 0.9071, "step": 619 }, { "epoch": 1.42, "grad_norm": 1.4707729816436768, "learning_rate": 3.626923943841325e-05, "loss": 1.0193, "step": 620 }, { "epoch": 1.43, "grad_norm": 1.5845611095428467, "learning_rate": 3.597871595375121e-05, "loss": 0.9118, "step": 621 }, { "epoch": 1.43, "grad_norm": 1.0622698068618774, "learning_rate": 3.5689105271875564e-05, "loss": 0.8397, "step": 622 }, { "epoch": 1.43, "grad_norm": 1.2688456773757935, "learning_rate": 3.5400411521998126e-05, "loss": 0.9132, "step": 623 }, { "epoch": 1.43, "grad_norm": 1.2782020568847656, "learning_rate": 3.5112638820257115e-05, "loss": 0.5659, "step": 624 }, { "epoch": 1.44, "grad_norm": 1.4183655977249146, "learning_rate": 3.482579126965878e-05, "loss": 0.5982, "step": 625 }, { "epoch": 1.44, "grad_norm": 1.279589056968689, "learning_rate": 3.453987296001866e-05, "loss": 0.6599, "step": 626 }, { "epoch": 1.44, "grad_norm": 1.5479413270950317, "learning_rate": 3.425488796790337e-05, "loss": 0.738, "step": 627 }, { "epoch": 1.44, "grad_norm": 1.507043719291687, "learning_rate": 3.397084035657243e-05, "loss": 0.7308, "step": 628 }, { "epoch": 1.45, "grad_norm": 1.5540934801101685, "learning_rate": 3.36877341759205e-05, "loss": 0.9896, "step": 629 }, { "epoch": 1.45, "grad_norm": 1.198010802268982, "learning_rate": 3.340557346241936e-05, "loss": 0.9595, "step": 630 }, { "epoch": 1.45, "grad_norm": 1.4387493133544922, "learning_rate": 3.312436223906062e-05, "loss": 0.9932, "step": 631 }, { "epoch": 1.45, "grad_norm": 1.4757272005081177, "learning_rate": 3.2844104515298155e-05, "loss": 0.796, "step": 632 }, { "epoch": 1.46, "grad_norm": 1.2666937112808228, "learning_rate": 3.2564804286991135e-05, "loss": 0.6709, "step": 633 }, { "epoch": 1.46, "grad_norm": 1.187326192855835, "learning_rate": 3.2286465536346854e-05, "loss": 0.8126, "step": 634 }, { "epoch": 1.46, "grad_norm": 1.434696912765503, "learning_rate": 3.2009092231864044e-05, "loss": 0.7374, "step": 635 }, { "epoch": 1.46, "grad_norm": 1.5462465286254883, "learning_rate": 3.173268832827643e-05, "loss": 0.8611, "step": 636 }, { "epoch": 1.46, "grad_norm": 1.5220853090286255, "learning_rate": 3.1457257766496015e-05, "loss": 1.0394, "step": 637 }, { "epoch": 1.47, "grad_norm": 1.2815146446228027, "learning_rate": 3.118280447355729e-05, "loss": 1.0586, "step": 638 }, { "epoch": 1.47, "grad_norm": 1.4531899690628052, "learning_rate": 3.090933236256087e-05, "loss": 0.8858, "step": 639 }, { "epoch": 1.47, "grad_norm": 0.7821406126022339, "learning_rate": 3.0636845332617994e-05, "loss": 0.4845, "step": 640 }, { "epoch": 1.47, "grad_norm": 1.171593189239502, "learning_rate": 3.036534726879473e-05, "loss": 0.6783, "step": 641 }, { "epoch": 1.48, "grad_norm": 0.9787347316741943, "learning_rate": 3.0094842042056704e-05, "loss": 0.7622, "step": 642 }, { "epoch": 1.48, "eval_loss": 1.1426820755004883, "eval_runtime": 4.8034, "eval_samples_per_second": 20.819, "eval_steps_per_second": 20.819, "step": 642 }, { "epoch": 1.48, "grad_norm": 2.1937968730926514, "learning_rate": 2.9825333509213827e-05, "loss": 0.8779, "step": 643 }, { "epoch": 1.48, "grad_norm": 0.9966670274734497, "learning_rate": 2.9556825512865415e-05, "loss": 0.9429, "step": 644 }, { "epoch": 1.48, "grad_norm": 1.0608552694320679, "learning_rate": 2.9289321881345254e-05, "loss": 0.4471, "step": 645 }, { "epoch": 1.49, "grad_norm": 1.5099259614944458, "learning_rate": 2.902282642866716e-05, "loss": 0.9748, "step": 646 }, { "epoch": 1.49, "grad_norm": 1.24240243434906, "learning_rate": 2.8757342954470533e-05, "loss": 0.7204, "step": 647 }, { "epoch": 1.49, "grad_norm": 1.631426215171814, "learning_rate": 2.849287524396611e-05, "loss": 1.0639, "step": 648 }, { "epoch": 1.49, "grad_norm": 0.9880529642105103, "learning_rate": 2.8229427067882164e-05, "loss": 0.7009, "step": 649 }, { "epoch": 1.5, "grad_norm": 1.366929531097412, "learning_rate": 2.7967002182410596e-05, "loss": 0.7605, "step": 650 }, { "epoch": 1.5, "grad_norm": 1.3599152565002441, "learning_rate": 2.7705604329153434e-05, "loss": 0.9786, "step": 651 }, { "epoch": 1.5, "grad_norm": 1.316638708114624, "learning_rate": 2.7445237235069455e-05, "loss": 1.0621, "step": 652 }, { "epoch": 1.5, "grad_norm": 2.131920099258423, "learning_rate": 2.7185904612421176e-05, "loss": 0.4658, "step": 653 }, { "epoch": 1.5, "grad_norm": 1.4175996780395508, "learning_rate": 2.6927610158721706e-05, "loss": 1.1467, "step": 654 }, { "epoch": 1.51, "grad_norm": 1.041159987449646, "learning_rate": 2.6670357556682247e-05, "loss": 0.7331, "step": 655 }, { "epoch": 1.51, "grad_norm": 1.5495882034301758, "learning_rate": 2.6414150474159403e-05, "loss": 1.0374, "step": 656 }, { "epoch": 1.51, "grad_norm": 1.3431997299194336, "learning_rate": 2.6158992564103058e-05, "loss": 0.8734, "step": 657 }, { "epoch": 1.51, "grad_norm": 1.2934917211532593, "learning_rate": 2.5904887464504114e-05, "loss": 0.7493, "step": 658 }, { "epoch": 1.52, "grad_norm": 1.2499974966049194, "learning_rate": 2.565183879834272e-05, "loss": 0.6417, "step": 659 }, { "epoch": 1.52, "grad_norm": 1.0983362197875977, "learning_rate": 2.53998501735367e-05, "loss": 0.618, "step": 660 }, { "epoch": 1.52, "grad_norm": 1.2114965915679932, "learning_rate": 2.514892518288988e-05, "loss": 0.7808, "step": 661 }, { "epoch": 1.52, "grad_norm": 1.2864103317260742, "learning_rate": 2.4899067404041153e-05, "loss": 0.8773, "step": 662 }, { "epoch": 1.53, "grad_norm": 1.3944170475006104, "learning_rate": 2.465028039941316e-05, "loss": 0.9462, "step": 663 }, { "epoch": 1.53, "grad_norm": 1.3994008302688599, "learning_rate": 2.4402567716161805e-05, "loss": 0.9313, "step": 664 }, { "epoch": 1.53, "grad_norm": 1.3160896301269531, "learning_rate": 2.415593288612541e-05, "loss": 0.724, "step": 665 }, { "epoch": 1.53, "grad_norm": 1.348981261253357, "learning_rate": 2.391037942577454e-05, "loss": 0.9646, "step": 666 }, { "epoch": 1.54, "grad_norm": 1.1432273387908936, "learning_rate": 2.3665910836161775e-05, "loss": 0.6094, "step": 667 }, { "epoch": 1.54, "grad_norm": 1.5914933681488037, "learning_rate": 2.3422530602871872e-05, "loss": 0.8079, "step": 668 }, { "epoch": 1.54, "grad_norm": 1.3685251474380493, "learning_rate": 2.318024219597196e-05, "loss": 0.6591, "step": 669 }, { "epoch": 1.54, "grad_norm": 1.1842666864395142, "learning_rate": 2.2939049069962183e-05, "loss": 0.8079, "step": 670 }, { "epoch": 1.54, "grad_norm": 1.3472460508346558, "learning_rate": 2.26989546637263e-05, "loss": 0.7606, "step": 671 }, { "epoch": 1.55, "grad_norm": 1.358242392539978, "learning_rate": 2.2459962400482847e-05, "loss": 0.7715, "step": 672 }, { "epoch": 1.55, "grad_norm": 1.2322027683258057, "learning_rate": 2.2222075687736187e-05, "loss": 0.8517, "step": 673 }, { "epoch": 1.55, "grad_norm": 1.5333589315414429, "learning_rate": 2.198529791722792e-05, "loss": 0.6613, "step": 674 }, { "epoch": 1.55, "grad_norm": 1.2892917394638062, "learning_rate": 2.1749632464888592e-05, "loss": 0.8702, "step": 675 }, { "epoch": 1.56, "grad_norm": 1.505099892616272, "learning_rate": 2.1515082690789535e-05, "loss": 0.8878, "step": 676 }, { "epoch": 1.56, "grad_norm": 1.266660213470459, "learning_rate": 2.1281651939094992e-05, "loss": 1.0645, "step": 677 }, { "epoch": 1.56, "grad_norm": 1.201398491859436, "learning_rate": 2.1049343538014355e-05, "loss": 0.9351, "step": 678 }, { "epoch": 1.56, "grad_norm": 0.9400334358215332, "learning_rate": 2.0818160799754828e-05, "loss": 0.4052, "step": 679 }, { "epoch": 1.57, "grad_norm": 1.2123039960861206, "learning_rate": 2.0588107020474056e-05, "loss": 0.6165, "step": 680 }, { "epoch": 1.57, "grad_norm": 1.3787363767623901, "learning_rate": 2.03591854802333e-05, "loss": 0.8574, "step": 681 }, { "epoch": 1.57, "grad_norm": 1.328092098236084, "learning_rate": 2.0131399442950505e-05, "loss": 0.7751, "step": 682 }, { "epoch": 1.57, "grad_norm": 2.2625534534454346, "learning_rate": 1.9904752156353878e-05, "loss": 1.1871, "step": 683 }, { "epoch": 1.58, "grad_norm": 1.0272362232208252, "learning_rate": 1.967924685193552e-05, "loss": 0.5135, "step": 684 }, { "epoch": 1.58, "grad_norm": 1.3563427925109863, "learning_rate": 1.94548867449054e-05, "loss": 0.9451, "step": 685 }, { "epoch": 1.58, "grad_norm": 1.4345487356185913, "learning_rate": 1.9231675034145513e-05, "loss": 0.8185, "step": 686 }, { "epoch": 1.58, "grad_norm": 1.4716142416000366, "learning_rate": 1.9009614902164174e-05, "loss": 0.8666, "step": 687 }, { "epoch": 1.58, "grad_norm": 2.2574665546417236, "learning_rate": 1.8788709515050808e-05, "loss": 1.1538, "step": 688 }, { "epoch": 1.59, "grad_norm": 1.2094467878341675, "learning_rate": 1.8568962022430636e-05, "loss": 0.9113, "step": 689 }, { "epoch": 1.59, "grad_norm": 1.2264611721038818, "learning_rate": 1.8350375557419875e-05, "loss": 0.693, "step": 690 }, { "epoch": 1.59, "grad_norm": 1.440259337425232, "learning_rate": 1.813295323658103e-05, "loss": 0.702, "step": 691 }, { "epoch": 1.59, "grad_norm": 1.0737701654434204, "learning_rate": 1.791669815987852e-05, "loss": 0.581, "step": 692 }, { "epoch": 1.6, "grad_norm": 1.3297679424285889, "learning_rate": 1.7701613410634365e-05, "loss": 0.7501, "step": 693 }, { "epoch": 1.6, "grad_norm": 1.1620917320251465, "learning_rate": 1.7487702055484345e-05, "loss": 0.8193, "step": 694 }, { "epoch": 1.6, "grad_norm": 2.063955068588257, "learning_rate": 1.7274967144334153e-05, "loss": 1.0735, "step": 695 }, { "epoch": 1.6, "grad_norm": 1.1501412391662598, "learning_rate": 1.7063411710316046e-05, "loss": 0.5951, "step": 696 }, { "epoch": 1.61, "grad_norm": 1.2334132194519043, "learning_rate": 1.6853038769745467e-05, "loss": 0.7601, "step": 697 }, { "epoch": 1.61, "grad_norm": 1.8308238983154297, "learning_rate": 1.6643851322078174e-05, "loss": 1.0234, "step": 698 }, { "epoch": 1.61, "grad_norm": 1.6979902982711792, "learning_rate": 1.643585234986733e-05, "loss": 1.0344, "step": 699 }, { "epoch": 1.61, "grad_norm": 1.1924549341201782, "learning_rate": 1.622904481872106e-05, "loss": 0.7712, "step": 700 }, { "epoch": 1.62, "grad_norm": 1.3908418416976929, "learning_rate": 1.6023431677260214e-05, "loss": 0.7998, "step": 701 }, { "epoch": 1.62, "grad_norm": 1.5069186687469482, "learning_rate": 1.5819015857076213e-05, "loss": 1.0222, "step": 702 }, { "epoch": 1.62, "grad_norm": 1.5676149129867554, "learning_rate": 1.5615800272689352e-05, "loss": 1.0573, "step": 703 }, { "epoch": 1.62, "grad_norm": 1.4918407201766968, "learning_rate": 1.541378782150714e-05, "loss": 0.7947, "step": 704 }, { "epoch": 1.62, "grad_norm": 1.4949313402175903, "learning_rate": 1.5212981383783154e-05, "loss": 0.9044, "step": 705 }, { "epoch": 1.63, "grad_norm": 1.4046580791473389, "learning_rate": 1.5013383822575766e-05, "loss": 0.976, "step": 706 }, { "epoch": 1.63, "grad_norm": 1.247767448425293, "learning_rate": 1.4814997983707458e-05, "loss": 0.8356, "step": 707 }, { "epoch": 1.63, "grad_norm": 1.4620435237884521, "learning_rate": 1.4617826695724223e-05, "loss": 0.9709, "step": 708 }, { "epoch": 1.63, "grad_norm": 1.5974092483520508, "learning_rate": 1.442187276985526e-05, "loss": 0.7993, "step": 709 }, { "epoch": 1.64, "grad_norm": 1.4362558126449585, "learning_rate": 1.42271389999728e-05, "loss": 0.8003, "step": 710 }, { "epoch": 1.64, "grad_norm": 1.4817911386489868, "learning_rate": 1.4033628162552359e-05, "loss": 0.7285, "step": 711 }, { "epoch": 1.64, "grad_norm": 1.3569756746292114, "learning_rate": 1.3841343016633167e-05, "loss": 0.6743, "step": 712 }, { "epoch": 1.64, "grad_norm": 1.1255030632019043, "learning_rate": 1.3650286303778714e-05, "loss": 0.9471, "step": 713 }, { "epoch": 1.65, "grad_norm": 1.5939558744430542, "learning_rate": 1.3460460748037774e-05, "loss": 0.8036, "step": 714 }, { "epoch": 1.65, "grad_norm": 1.3757383823394775, "learning_rate": 1.3271869055905495e-05, "loss": 0.8478, "step": 715 }, { "epoch": 1.65, "grad_norm": 1.6052110195159912, "learning_rate": 1.3084513916284913e-05, "loss": 0.902, "step": 716 }, { "epoch": 1.65, "grad_norm": 2.1484215259552, "learning_rate": 1.2898398000448443e-05, "loss": 1.0986, "step": 717 }, { "epoch": 1.65, "grad_norm": 0.9963454008102417, "learning_rate": 1.2713523961999996e-05, "loss": 0.5391, "step": 718 }, { "epoch": 1.66, "grad_norm": 1.68670654296875, "learning_rate": 1.2529894436836965e-05, "loss": 0.9895, "step": 719 }, { "epoch": 1.66, "grad_norm": 1.4046229124069214, "learning_rate": 1.2347512043112752e-05, "loss": 0.9313, "step": 720 }, { "epoch": 1.66, "grad_norm": 2.2879252433776855, "learning_rate": 1.2166379381199423e-05, "loss": 0.84, "step": 721 }, { "epoch": 1.66, "grad_norm": 1.3971514701843262, "learning_rate": 1.1986499033650556e-05, "loss": 0.8177, "step": 722 }, { "epoch": 1.67, "grad_norm": 1.1414539813995361, "learning_rate": 1.1807873565164506e-05, "loss": 0.6581, "step": 723 }, { "epoch": 1.67, "grad_norm": 1.2839149236679077, "learning_rate": 1.1630505522547853e-05, "loss": 1.0131, "step": 724 }, { "epoch": 1.67, "grad_norm": 1.4500195980072021, "learning_rate": 1.1454397434679021e-05, "loss": 1.0473, "step": 725 }, { "epoch": 1.67, "grad_norm": 1.769698977470398, "learning_rate": 1.12795518124722e-05, "loss": 1.1283, "step": 726 }, { "epoch": 1.68, "grad_norm": 1.6734747886657715, "learning_rate": 1.11059711488417e-05, "loss": 1.0084, "step": 727 }, { "epoch": 1.68, "grad_norm": 1.5987813472747803, "learning_rate": 1.0933657918666174e-05, "loss": 1.0561, "step": 728 }, { "epoch": 1.68, "grad_norm": 1.2631326913833618, "learning_rate": 1.0762614578753572e-05, "loss": 0.9926, "step": 729 }, { "epoch": 1.68, "grad_norm": 1.380119800567627, "learning_rate": 1.0592843567805943e-05, "loss": 0.9681, "step": 730 }, { "epoch": 1.69, "grad_norm": 1.2174406051635742, "learning_rate": 1.0424347306384729e-05, "loss": 0.6559, "step": 731 }, { "epoch": 1.69, "grad_norm": 1.7756376266479492, "learning_rate": 1.025712819687623e-05, "loss": 1.1937, "step": 732 }, { "epoch": 1.69, "grad_norm": 1.4800599813461304, "learning_rate": 1.0091188623457415e-05, "loss": 0.7639, "step": 733 }, { "epoch": 1.69, "grad_norm": 1.4891732931137085, "learning_rate": 9.92653095206183e-06, "loss": 0.7647, "step": 734 }, { "epoch": 1.69, "grad_norm": 1.2692780494689941, "learning_rate": 9.763157530345957e-06, "loss": 0.8445, "step": 735 }, { "epoch": 1.7, "grad_norm": 1.459630012512207, "learning_rate": 9.601070687655667e-06, "loss": 1.0512, "step": 736 }, { "epoch": 1.7, "grad_norm": 1.3034725189208984, "learning_rate": 9.440272734993072e-06, "loss": 0.8654, "step": 737 }, { "epoch": 1.7, "grad_norm": 1.2299538850784302, "learning_rate": 9.280765964983529e-06, "loss": 0.6091, "step": 738 }, { "epoch": 1.7, "grad_norm": 1.5857917070388794, "learning_rate": 9.12255265184293e-06, "loss": 0.8166, "step": 739 }, { "epoch": 1.71, "grad_norm": 1.3110122680664062, "learning_rate": 8.965635051345411e-06, "loss": 0.8791, "step": 740 }, { "epoch": 1.71, "grad_norm": 1.4379709959030151, "learning_rate": 8.810015400790994e-06, "loss": 0.9836, "step": 741 }, { "epoch": 1.71, "grad_norm": 1.272002100944519, "learning_rate": 8.655695918973862e-06, "loss": 1.0156, "step": 742 }, { "epoch": 1.71, "grad_norm": 1.7516756057739258, "learning_rate": 8.502678806150588e-06, "loss": 0.9863, "step": 743 }, { "epoch": 1.72, "grad_norm": 1.4102840423583984, "learning_rate": 8.350966244008895e-06, "loss": 0.9151, "step": 744 }, { "epoch": 1.72, "grad_norm": 1.1626827716827393, "learning_rate": 8.200560395636414e-06, "loss": 0.8854, "step": 745 }, { "epoch": 1.72, "grad_norm": 1.5321221351623535, "learning_rate": 8.051463405489957e-06, "loss": 0.7943, "step": 746 }, { "epoch": 1.72, "grad_norm": 1.5110939741134644, "learning_rate": 7.90367739936484e-06, "loss": 0.8067, "step": 747 }, { "epoch": 1.73, "grad_norm": 1.273176908493042, "learning_rate": 7.7572044843647e-06, "loss": 1.1937, "step": 748 }, { "epoch": 1.73, "grad_norm": 1.2125933170318604, "learning_rate": 7.612046748871327e-06, "loss": 1.8904, "step": 749 }, { "epoch": 1.73, "eval_loss": 1.1425327062606812, "eval_runtime": 5.0157, "eval_samples_per_second": 19.937, "eval_steps_per_second": 19.937, "step": 749 }, { "epoch": 1.73, "grad_norm": 1.541787028312683, "learning_rate": 7.4682062625149655e-06, "loss": 0.9905, "step": 750 }, { "epoch": 1.73, "grad_norm": 1.5736668109893799, "learning_rate": 7.325685076144795e-06, "loss": 1.0864, "step": 751 }, { "epoch": 1.73, "grad_norm": 1.21610426902771, "learning_rate": 7.1844852217996305e-06, "loss": 0.7444, "step": 752 }, { "epoch": 1.74, "grad_norm": 2.0105512142181396, "learning_rate": 7.0446087126790575e-06, "loss": 0.8589, "step": 753 }, { "epoch": 1.74, "grad_norm": 1.427541732788086, "learning_rate": 6.906057543114619e-06, "loss": 1.0143, "step": 754 }, { "epoch": 1.74, "grad_norm": 1.9788390398025513, "learning_rate": 6.768833688541443e-06, "loss": 1.2519, "step": 755 }, { "epoch": 1.74, "grad_norm": 1.646355152130127, "learning_rate": 6.632939105470049e-06, "loss": 1.1151, "step": 756 }, { "epoch": 1.75, "grad_norm": 1.3599865436553955, "learning_rate": 6.498375731458528e-06, "loss": 0.9262, "step": 757 }, { "epoch": 1.75, "grad_norm": 1.618510365486145, "learning_rate": 6.365145485084767e-06, "loss": 0.7766, "step": 758 }, { "epoch": 1.75, "grad_norm": 1.6023226976394653, "learning_rate": 6.233250265919266e-06, "loss": 0.9066, "step": 759 }, { "epoch": 1.75, "grad_norm": 1.3686883449554443, "learning_rate": 6.102691954497907e-06, "loss": 1.0172, "step": 760 }, { "epoch": 1.76, "grad_norm": 1.1869922876358032, "learning_rate": 5.973472412295255e-06, "loss": 0.6924, "step": 761 }, { "epoch": 1.76, "grad_norm": 1.10429847240448, "learning_rate": 5.8455934816979305e-06, "loss": 0.768, "step": 762 }, { "epoch": 1.76, "grad_norm": 1.2119660377502441, "learning_rate": 5.719056985978388e-06, "loss": 0.6629, "step": 763 }, { "epoch": 1.76, "grad_norm": 1.4681169986724854, "learning_rate": 5.593864729268949e-06, "loss": 0.6561, "step": 764 }, { "epoch": 1.77, "grad_norm": 1.4000868797302246, "learning_rate": 5.470018496535967e-06, "loss": 0.8486, "step": 765 }, { "epoch": 1.77, "grad_norm": 1.7486320734024048, "learning_rate": 5.347520053554545e-06, "loss": 1.0676, "step": 766 }, { "epoch": 1.77, "grad_norm": 1.2215529680252075, "learning_rate": 5.22637114688318e-06, "loss": 0.8262, "step": 767 }, { "epoch": 1.77, "grad_norm": 1.3720446825027466, "learning_rate": 5.106573503839018e-06, "loss": 0.8284, "step": 768 }, { "epoch": 1.77, "grad_norm": 1.42399001121521, "learning_rate": 4.9881288324731045e-06, "loss": 0.9733, "step": 769 }, { "epoch": 1.78, "grad_norm": 1.806998610496521, "learning_rate": 4.871038821546103e-06, "loss": 0.87, "step": 770 }, { "epoch": 1.78, "grad_norm": 0.7265794277191162, "learning_rate": 4.755305140504185e-06, "loss": 0.3255, "step": 771 }, { "epoch": 1.78, "grad_norm": 1.4420627355575562, "learning_rate": 4.640929439455277e-06, "loss": 1.0153, "step": 772 }, { "epoch": 1.78, "grad_norm": 1.6430668830871582, "learning_rate": 4.527913349145441e-06, "loss": 0.9099, "step": 773 }, { "epoch": 1.79, "grad_norm": 1.2274441719055176, "learning_rate": 4.416258480935731e-06, "loss": 0.8594, "step": 774 }, { "epoch": 1.79, "grad_norm": 1.4887783527374268, "learning_rate": 4.305966426779118e-06, "loss": 0.808, "step": 775 }, { "epoch": 1.79, "grad_norm": 1.1768397092819214, "learning_rate": 4.197038759197869e-06, "loss": 0.561, "step": 776 }, { "epoch": 1.79, "grad_norm": 1.266448974609375, "learning_rate": 4.089477031261113e-06, "loss": 0.9325, "step": 777 }, { "epoch": 1.8, "grad_norm": 1.5394303798675537, "learning_rate": 3.9832827765626465e-06, "loss": 0.7159, "step": 778 }, { "epoch": 1.8, "grad_norm": 1.482283115386963, "learning_rate": 3.878457509199107e-06, "loss": 1.103, "step": 779 }, { "epoch": 1.8, "grad_norm": 1.1145195960998535, "learning_rate": 3.7750027237484e-06, "loss": 1.0283, "step": 780 }, { "epoch": 1.8, "grad_norm": 1.5757018327713013, "learning_rate": 3.6729198952483724e-06, "loss": 1.162, "step": 781 }, { "epoch": 1.81, "grad_norm": 1.555261492729187, "learning_rate": 3.572210479175753e-06, "loss": 0.9346, "step": 782 }, { "epoch": 1.81, "grad_norm": 1.5401426553726196, "learning_rate": 3.472875911425477e-06, "loss": 1.0614, "step": 783 }, { "epoch": 1.81, "grad_norm": 1.4009335041046143, "learning_rate": 3.3749176082901067e-06, "loss": 0.9636, "step": 784 }, { "epoch": 1.81, "grad_norm": 1.5251446962356567, "learning_rate": 3.2783369664397436e-06, "loss": 0.7538, "step": 785 }, { "epoch": 1.81, "grad_norm": 1.2820510864257812, "learning_rate": 3.1831353629020344e-06, "loss": 0.8404, "step": 786 }, { "epoch": 1.82, "grad_norm": 1.2701919078826904, "learning_rate": 3.0893141550425884e-06, "loss": 0.7659, "step": 787 }, { "epoch": 1.82, "grad_norm": 1.619142770767212, "learning_rate": 2.996874680545603e-06, "loss": 0.7558, "step": 788 }, { "epoch": 1.82, "grad_norm": 1.825161099433899, "learning_rate": 2.905818257394799e-06, "loss": 1.0628, "step": 789 }, { "epoch": 1.82, "grad_norm": 1.2984554767608643, "learning_rate": 2.8161461838546176e-06, "loss": 0.8484, "step": 790 }, { "epoch": 1.83, "grad_norm": 1.8327722549438477, "learning_rate": 2.7278597384517214e-06, "loss": 0.8366, "step": 791 }, { "epoch": 1.83, "grad_norm": 1.1649681329727173, "learning_rate": 2.6409601799567642e-06, "loss": 0.7479, "step": 792 }, { "epoch": 1.83, "grad_norm": 1.212082862854004, "learning_rate": 2.55544874736644e-06, "loss": 1.2608, "step": 793 }, { "epoch": 1.83, "grad_norm": 1.675178050994873, "learning_rate": 2.4713266598858086e-06, "loss": 0.7571, "step": 794 }, { "epoch": 1.84, "grad_norm": 1.349767804145813, "learning_rate": 2.3885951169109187e-06, "loss": 0.6354, "step": 795 }, { "epoch": 1.84, "grad_norm": 1.0622371435165405, "learning_rate": 2.3072552980117566e-06, "loss": 0.5172, "step": 796 }, { "epoch": 1.84, "grad_norm": 1.2005047798156738, "learning_rate": 2.2273083629153147e-06, "loss": 1.1696, "step": 797 }, { "epoch": 1.84, "grad_norm": 1.3214634656906128, "learning_rate": 2.1487554514891704e-06, "loss": 0.9274, "step": 798 }, { "epoch": 1.85, "grad_norm": 1.359706163406372, "learning_rate": 2.071597683725179e-06, "loss": 0.7433, "step": 799 }, { "epoch": 1.85, "grad_norm": 1.3095366954803467, "learning_rate": 1.9958361597235076e-06, "loss": 0.9496, "step": 800 }, { "epoch": 1.85, "grad_norm": 1.5138649940490723, "learning_rate": 1.921471959676957e-06, "loss": 1.0328, "step": 801 }, { "epoch": 1.85, "grad_norm": 1.6273144483566284, "learning_rate": 1.848506143855555e-06, "loss": 0.7867, "step": 802 }, { "epoch": 1.85, "grad_norm": 0.9000691175460815, "learning_rate": 1.7769397525914667e-06, "loss": 0.7318, "step": 803 }, { "epoch": 1.86, "grad_norm": 1.6215777397155762, "learning_rate": 1.706773806264106e-06, "loss": 1.1416, "step": 804 }, { "epoch": 1.86, "grad_norm": 1.5041816234588623, "learning_rate": 1.6380093052856483e-06, "loss": 0.9327, "step": 805 }, { "epoch": 1.86, "grad_norm": 1.6266945600509644, "learning_rate": 1.570647230086708e-06, "loss": 1.1518, "step": 806 }, { "epoch": 1.86, "grad_norm": 1.4679832458496094, "learning_rate": 1.5046885411024391e-06, "loss": 0.8719, "step": 807 }, { "epoch": 1.87, "grad_norm": 1.5392640829086304, "learning_rate": 1.4401341787587453e-06, "loss": 1.0913, "step": 808 }, { "epoch": 1.87, "grad_norm": 1.4636483192443848, "learning_rate": 1.3769850634589354e-06, "loss": 0.9312, "step": 809 }, { "epoch": 1.87, "grad_norm": 1.214938998222351, "learning_rate": 1.3152420955706012e-06, "loss": 0.8939, "step": 810 }, { "epoch": 1.87, "grad_norm": 1.2058130502700806, "learning_rate": 1.2549061554127494e-06, "loss": 0.8283, "step": 811 }, { "epoch": 1.88, "grad_norm": 1.2419188022613525, "learning_rate": 1.1959781032432337e-06, "loss": 0.9094, "step": 812 }, { "epoch": 1.88, "grad_norm": 1.7467167377471924, "learning_rate": 1.1384587792465872e-06, "loss": 0.7264, "step": 813 }, { "epoch": 1.88, "grad_norm": 1.167214035987854, "learning_rate": 1.0823490035218987e-06, "loss": 0.6232, "step": 814 }, { "epoch": 1.88, "grad_norm": 1.3175835609436035, "learning_rate": 1.0276495760712767e-06, "loss": 1.0396, "step": 815 }, { "epoch": 1.88, "grad_norm": 1.6061522960662842, "learning_rate": 9.743612767882936e-07, "loss": 0.6561, "step": 816 }, { "epoch": 1.89, "grad_norm": 1.4116915464401245, "learning_rate": 9.224848654469931e-07, "loss": 1.2357, "step": 817 }, { "epoch": 1.89, "grad_norm": 1.2765032052993774, "learning_rate": 8.720210816909435e-07, "loss": 0.9708, "step": 818 }, { "epoch": 1.89, "grad_norm": 1.392382025718689, "learning_rate": 8.229706450227803e-07, "loss": 0.9548, "step": 819 }, { "epoch": 1.89, "grad_norm": 1.3821630477905273, "learning_rate": 7.753342547939357e-07, "loss": 0.7787, "step": 820 }, { "epoch": 1.9, "grad_norm": 1.5395421981811523, "learning_rate": 7.291125901946027e-07, "loss": 0.7223, "step": 821 }, { "epoch": 1.9, "grad_norm": 1.5560616254806519, "learning_rate": 6.843063102441316e-07, "loss": 0.9542, "step": 822 }, { "epoch": 1.9, "grad_norm": 1.2338303327560425, "learning_rate": 6.409160537815817e-07, "loss": 0.9685, "step": 823 }, { "epoch": 1.9, "grad_norm": 1.3884745836257935, "learning_rate": 5.989424394566401e-07, "loss": 1.1287, "step": 824 }, { "epoch": 1.91, "grad_norm": 1.1085461378097534, "learning_rate": 5.58386065720784e-07, "loss": 0.7562, "step": 825 }, { "epoch": 1.91, "grad_norm": 0.8512900471687317, "learning_rate": 5.192475108187544e-07, "loss": 0.4627, "step": 826 }, { "epoch": 1.91, "grad_norm": 1.3366022109985352, "learning_rate": 4.815273327803182e-07, "loss": 0.8489, "step": 827 }, { "epoch": 1.91, "grad_norm": 1.066333293914795, "learning_rate": 4.452260694122856e-07, "loss": 0.6867, "step": 828 }, { "epoch": 1.92, "grad_norm": 1.484168291091919, "learning_rate": 4.103442382909051e-07, "loss": 1.1556, "step": 829 }, { "epoch": 1.92, "grad_norm": 1.4741454124450684, "learning_rate": 3.7688233675439166e-07, "loss": 0.9345, "step": 830 }, { "epoch": 1.92, "grad_norm": 1.0482972860336304, "learning_rate": 3.4484084189593257e-07, "loss": 0.7689, "step": 831 }, { "epoch": 1.92, "grad_norm": 1.4800513982772827, "learning_rate": 3.1422021055679265e-07, "loss": 0.8486, "step": 832 }, { "epoch": 1.92, "grad_norm": 1.3522624969482422, "learning_rate": 2.850208793198861e-07, "loss": 0.9064, "step": 833 }, { "epoch": 1.93, "grad_norm": 1.6680634021759033, "learning_rate": 2.572432645034817e-07, "loss": 1.0127, "step": 834 }, { "epoch": 1.93, "grad_norm": 1.5550647974014282, "learning_rate": 2.3088776215531848e-07, "loss": 0.8741, "step": 835 }, { "epoch": 1.93, "grad_norm": 1.4581865072250366, "learning_rate": 2.0595474804691038e-07, "loss": 0.9799, "step": 836 }, { "epoch": 1.93, "grad_norm": 1.3442659378051758, "learning_rate": 1.824445776682504e-07, "loss": 0.8629, "step": 837 }, { "epoch": 1.94, "grad_norm": 1.5211567878723145, "learning_rate": 1.6035758622269247e-07, "loss": 0.6652, "step": 838 }, { "epoch": 1.94, "grad_norm": 1.16912841796875, "learning_rate": 1.3969408862217758e-07, "loss": 0.8838, "step": 839 }, { "epoch": 1.94, "grad_norm": 1.5780668258666992, "learning_rate": 1.204543794827595e-07, "loss": 0.9765, "step": 840 }, { "epoch": 1.94, "grad_norm": 1.4517415761947632, "learning_rate": 1.0263873312040818e-07, "loss": 0.6895, "step": 841 }, { "epoch": 1.95, "grad_norm": 1.7025426626205444, "learning_rate": 8.624740354707949e-08, "loss": 0.8764, "step": 842 }, { "epoch": 1.95, "grad_norm": 1.3014923334121704, "learning_rate": 7.128062446709604e-08, "loss": 1.0822, "step": 843 }, { "epoch": 1.95, "grad_norm": 1.501705288887024, "learning_rate": 5.773860927383856e-08, "loss": 0.8872, "step": 844 }, { "epoch": 1.95, "grad_norm": 1.6005631685256958, "learning_rate": 4.562155104665955e-08, "loss": 1.1913, "step": 845 }, { "epoch": 1.96, "grad_norm": 1.1631520986557007, "learning_rate": 3.492962254819654e-08, "loss": 0.955, "step": 846 }, { "epoch": 1.96, "grad_norm": 1.285071611404419, "learning_rate": 2.5662976221840773e-08, "loss": 0.8429, "step": 847 }, { "epoch": 1.96, "grad_norm": 1.0921001434326172, "learning_rate": 1.7821744189605582e-08, "loss": 0.7829, "step": 848 }, { "epoch": 1.96, "grad_norm": 1.4597008228302002, "learning_rate": 1.1406038250205698e-08, "loss": 1.1112, "step": 849 }, { "epoch": 1.96, "grad_norm": 2.1776418685913086, "learning_rate": 6.41594987752514e-09, "loss": 1.2897, "step": 850 }, { "epoch": 1.97, "grad_norm": 1.2497656345367432, "learning_rate": 2.851550219240551e-09, "loss": 0.9627, "step": 851 }, { "epoch": 1.97, "grad_norm": 1.291884422302246, "learning_rate": 7.128900958774942e-10, "loss": 0.8828, "step": 852 } ], "logging_steps": 1, "max_steps": 852, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 426, "total_flos": 1.547329324744704e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }