{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9983597594313833, "eval_steps": 500, "global_step": 1371, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002186987424822307, "grad_norm": 0.11989043653011322, "learning_rate": 2.1739130434782607e-06, "loss": 0.7588, "step": 1 }, { "epoch": 0.004373974849644614, "grad_norm": 0.08302941918373108, "learning_rate": 4.347826086956521e-06, "loss": 0.8145, "step": 2 }, { "epoch": 0.0065609622744669215, "grad_norm": 0.15307161211967468, "learning_rate": 6.521739130434782e-06, "loss": 0.8127, "step": 3 }, { "epoch": 0.008747949699289229, "grad_norm": 0.13161885738372803, "learning_rate": 8.695652173913043e-06, "loss": 0.6707, "step": 4 }, { "epoch": 0.010934937124111536, "grad_norm": 0.09451252222061157, "learning_rate": 1.0869565217391303e-05, "loss": 0.7497, "step": 5 }, { "epoch": 0.013121924548933843, "grad_norm": 0.0813838317990303, "learning_rate": 1.3043478260869564e-05, "loss": 1.0007, "step": 6 }, { "epoch": 0.01530891197375615, "grad_norm": 0.12192627787590027, "learning_rate": 1.5217391304347826e-05, "loss": 0.6703, "step": 7 }, { "epoch": 0.017495899398578457, "grad_norm": 0.14730937778949738, "learning_rate": 1.7391304347826085e-05, "loss": 0.9552, "step": 8 }, { "epoch": 0.019682886823400764, "grad_norm": 0.13510680198669434, "learning_rate": 1.9565217391304346e-05, "loss": 0.9591, "step": 9 }, { "epoch": 0.02186987424822307, "grad_norm": 0.11157332360744476, "learning_rate": 2.1739130434782607e-05, "loss": 0.9358, "step": 10 }, { "epoch": 0.02405686167304538, "grad_norm": 0.11157120019197464, "learning_rate": 2.3913043478260864e-05, "loss": 0.8377, "step": 11 }, { "epoch": 0.026243849097867686, "grad_norm": 0.13191162049770355, "learning_rate": 2.6086956521739128e-05, "loss": 0.8974, "step": 12 }, { "epoch": 0.028430836522689993, "grad_norm": 0.14399488270282745, "learning_rate": 2.826086956521739e-05, "loss": 0.778, "step": 13 }, { "epoch": 0.0306178239475123, "grad_norm": 0.11593582481145859, "learning_rate": 3.0434782608695653e-05, "loss": 0.9507, "step": 14 }, { "epoch": 0.03280481137233461, "grad_norm": 0.16411006450653076, "learning_rate": 3.260869565217391e-05, "loss": 0.6949, "step": 15 }, { "epoch": 0.034991798797156914, "grad_norm": 0.13450156152248383, "learning_rate": 3.478260869565217e-05, "loss": 0.8162, "step": 16 }, { "epoch": 0.037178786221979225, "grad_norm": 0.12586522102355957, "learning_rate": 3.695652173913043e-05, "loss": 0.8776, "step": 17 }, { "epoch": 0.03936577364680153, "grad_norm": 0.10510208457708359, "learning_rate": 3.913043478260869e-05, "loss": 0.7852, "step": 18 }, { "epoch": 0.04155276107162384, "grad_norm": 0.12737107276916504, "learning_rate": 4.130434782608695e-05, "loss": 0.9647, "step": 19 }, { "epoch": 0.04373974849644614, "grad_norm": 0.1500634402036667, "learning_rate": 4.3478260869565214e-05, "loss": 0.7532, "step": 20 }, { "epoch": 0.045926735921268454, "grad_norm": 0.16161426901817322, "learning_rate": 4.5652173913043474e-05, "loss": 0.811, "step": 21 }, { "epoch": 0.04811372334609076, "grad_norm": 0.1249527782201767, "learning_rate": 4.782608695652173e-05, "loss": 0.795, "step": 22 }, { "epoch": 0.05030071077091307, "grad_norm": 0.1505545973777771, "learning_rate": 4.9999999999999996e-05, "loss": 0.9194, "step": 23 }, { "epoch": 0.05248769819573537, "grad_norm": 0.13624198734760284, "learning_rate": 5.2173913043478256e-05, "loss": 0.97, "step": 24 }, { "epoch": 0.05467468562055768, "grad_norm": 0.15684515237808228, "learning_rate": 5.434782608695652e-05, "loss": 0.6862, "step": 25 }, { "epoch": 0.056861673045379986, "grad_norm": 0.14302442967891693, "learning_rate": 5.652173913043478e-05, "loss": 0.8062, "step": 26 }, { "epoch": 0.0590486604702023, "grad_norm": 0.23029306530952454, "learning_rate": 5.869565217391304e-05, "loss": 0.9101, "step": 27 }, { "epoch": 0.0612356478950246, "grad_norm": 0.24247854948043823, "learning_rate": 6.0869565217391306e-05, "loss": 0.8779, "step": 28 }, { "epoch": 0.0634226353198469, "grad_norm": 0.1507425308227539, "learning_rate": 6.304347826086956e-05, "loss": 0.7181, "step": 29 }, { "epoch": 0.06560962274466922, "grad_norm": 0.18965087831020355, "learning_rate": 6.521739130434782e-05, "loss": 0.8163, "step": 30 }, { "epoch": 0.06779661016949153, "grad_norm": 0.2104681432247162, "learning_rate": 6.739130434782608e-05, "loss": 0.9495, "step": 31 }, { "epoch": 0.06998359759431383, "grad_norm": 0.21606619656085968, "learning_rate": 6.956521739130434e-05, "loss": 0.9565, "step": 32 }, { "epoch": 0.07217058501913615, "grad_norm": 0.2107428014278412, "learning_rate": 7.17391304347826e-05, "loss": 0.7743, "step": 33 }, { "epoch": 0.07435757244395845, "grad_norm": 0.3160182535648346, "learning_rate": 7.391304347826086e-05, "loss": 1.0056, "step": 34 }, { "epoch": 0.07654455986878075, "grad_norm": 0.2970617115497589, "learning_rate": 7.608695652173912e-05, "loss": 0.8122, "step": 35 }, { "epoch": 0.07873154729360306, "grad_norm": 0.17866499722003937, "learning_rate": 7.826086956521738e-05, "loss": 0.7953, "step": 36 }, { "epoch": 0.08091853471842538, "grad_norm": 0.32111942768096924, "learning_rate": 8.043478260869566e-05, "loss": 0.9121, "step": 37 }, { "epoch": 0.08310552214324768, "grad_norm": 0.20938844978809357, "learning_rate": 8.26086956521739e-05, "loss": 0.887, "step": 38 }, { "epoch": 0.08529250956806998, "grad_norm": 0.27339646220207214, "learning_rate": 8.478260869565217e-05, "loss": 0.7808, "step": 39 }, { "epoch": 0.08747949699289229, "grad_norm": 0.19005413353443146, "learning_rate": 8.695652173913043e-05, "loss": 0.6723, "step": 40 }, { "epoch": 0.0896664844177146, "grad_norm": 0.19314634799957275, "learning_rate": 8.913043478260869e-05, "loss": 0.8384, "step": 41 }, { "epoch": 0.09185347184253691, "grad_norm": 0.21565446257591248, "learning_rate": 9.130434782608695e-05, "loss": 0.7402, "step": 42 }, { "epoch": 0.09404045926735921, "grad_norm": 0.3733920753002167, "learning_rate": 9.347826086956521e-05, "loss": 0.9476, "step": 43 }, { "epoch": 0.09622744669218151, "grad_norm": 0.3119434714317322, "learning_rate": 9.565217391304346e-05, "loss": 0.7324, "step": 44 }, { "epoch": 0.09841443411700383, "grad_norm": 0.20734310150146484, "learning_rate": 9.782608695652173e-05, "loss": 0.6521, "step": 45 }, { "epoch": 0.10060142154182614, "grad_norm": 0.2809116840362549, "learning_rate": 9.999999999999999e-05, "loss": 0.7374, "step": 46 }, { "epoch": 0.10278840896664844, "grad_norm": 0.2248832732439041, "learning_rate": 0.00010217391304347825, "loss": 0.7822, "step": 47 }, { "epoch": 0.10497539639147074, "grad_norm": 0.26310572028160095, "learning_rate": 0.00010434782608695651, "loss": 0.844, "step": 48 }, { "epoch": 0.10716238381629306, "grad_norm": 0.20629820227622986, "learning_rate": 0.00010652173913043477, "loss": 0.9024, "step": 49 }, { "epoch": 0.10934937124111536, "grad_norm": 0.40926942229270935, "learning_rate": 0.00010869565217391303, "loss": 0.8497, "step": 50 }, { "epoch": 0.11153635866593767, "grad_norm": 0.34393706917762756, "learning_rate": 0.00011086956521739128, "loss": 0.8326, "step": 51 }, { "epoch": 0.11372334609075997, "grad_norm": 0.25371822714805603, "learning_rate": 0.00011304347826086956, "loss": 1.0089, "step": 52 }, { "epoch": 0.11591033351558229, "grad_norm": 0.3484710454940796, "learning_rate": 0.00011521739130434782, "loss": 0.7667, "step": 53 }, { "epoch": 0.1180973209404046, "grad_norm": 0.5894125699996948, "learning_rate": 0.00011739130434782608, "loss": 0.7977, "step": 54 }, { "epoch": 0.1202843083652269, "grad_norm": 0.29829731583595276, "learning_rate": 0.00011956521739130434, "loss": 0.7545, "step": 55 }, { "epoch": 0.1224712957900492, "grad_norm": 0.4180648922920227, "learning_rate": 0.00012173913043478261, "loss": 0.9833, "step": 56 }, { "epoch": 0.12465828321487152, "grad_norm": 0.24174439907073975, "learning_rate": 0.00012391304347826086, "loss": 0.5948, "step": 57 }, { "epoch": 0.1268452706396938, "grad_norm": 0.253364235162735, "learning_rate": 0.00012608695652173912, "loss": 0.7528, "step": 58 }, { "epoch": 0.12903225806451613, "grad_norm": 0.31262415647506714, "learning_rate": 0.00012826086956521738, "loss": 0.7635, "step": 59 }, { "epoch": 0.13121924548933844, "grad_norm": 0.2893831729888916, "learning_rate": 0.00013043478260869564, "loss": 0.7426, "step": 60 }, { "epoch": 0.13340623291416073, "grad_norm": 0.26717469096183777, "learning_rate": 0.0001326086956521739, "loss": 0.7747, "step": 61 }, { "epoch": 0.13559322033898305, "grad_norm": 0.3445766270160675, "learning_rate": 0.00013478260869565216, "loss": 0.802, "step": 62 }, { "epoch": 0.13778020776380537, "grad_norm": 0.3893512487411499, "learning_rate": 0.00013695652173913042, "loss": 1.0112, "step": 63 }, { "epoch": 0.13996719518862766, "grad_norm": 0.2807013988494873, "learning_rate": 0.00013913043478260868, "loss": 0.832, "step": 64 }, { "epoch": 0.14215418261344998, "grad_norm": 0.3300040662288666, "learning_rate": 0.00014130434782608694, "loss": 0.8425, "step": 65 }, { "epoch": 0.1443411700382723, "grad_norm": 0.3051323890686035, "learning_rate": 0.0001434782608695652, "loss": 0.7218, "step": 66 }, { "epoch": 0.14652815746309458, "grad_norm": 0.25623396039009094, "learning_rate": 0.00014565217391304347, "loss": 0.7398, "step": 67 }, { "epoch": 0.1487151448879169, "grad_norm": 0.3793390989303589, "learning_rate": 0.00014782608695652173, "loss": 0.7293, "step": 68 }, { "epoch": 0.1509021323127392, "grad_norm": 0.3046607971191406, "learning_rate": 0.00015, "loss": 0.7507, "step": 69 }, { "epoch": 0.1530891197375615, "grad_norm": 0.23061273992061615, "learning_rate": 0.00015217391304347825, "loss": 0.6682, "step": 70 }, { "epoch": 0.15527610716238383, "grad_norm": 0.3328089714050293, "learning_rate": 0.00015434782608695648, "loss": 0.6736, "step": 71 }, { "epoch": 0.15746309458720611, "grad_norm": 0.4419778287410736, "learning_rate": 0.00015652173913043477, "loss": 0.8789, "step": 72 }, { "epoch": 0.15965008201202843, "grad_norm": 0.3310529291629791, "learning_rate": 0.00015869565217391303, "loss": 0.8108, "step": 73 }, { "epoch": 0.16183706943685075, "grad_norm": 0.4529496729373932, "learning_rate": 0.00016086956521739132, "loss": 1.0239, "step": 74 }, { "epoch": 0.16402405686167304, "grad_norm": 0.3741857707500458, "learning_rate": 0.00016304347826086955, "loss": 0.7601, "step": 75 }, { "epoch": 0.16621104428649536, "grad_norm": 0.2660742700099945, "learning_rate": 0.0001652173913043478, "loss": 0.7989, "step": 76 }, { "epoch": 0.16839803171131765, "grad_norm": 0.28130316734313965, "learning_rate": 0.00016739130434782607, "loss": 0.8459, "step": 77 }, { "epoch": 0.17058501913613996, "grad_norm": 0.3322678804397583, "learning_rate": 0.00016956521739130433, "loss": 0.7567, "step": 78 }, { "epoch": 0.17277200656096228, "grad_norm": 0.30039381980895996, "learning_rate": 0.0001717391304347826, "loss": 0.7353, "step": 79 }, { "epoch": 0.17495899398578457, "grad_norm": 0.30451035499572754, "learning_rate": 0.00017391304347826085, "loss": 0.7913, "step": 80 }, { "epoch": 0.1771459814106069, "grad_norm": 0.30815356969833374, "learning_rate": 0.00017608695652173914, "loss": 0.7766, "step": 81 }, { "epoch": 0.1793329688354292, "grad_norm": 0.5257038474082947, "learning_rate": 0.00017826086956521738, "loss": 0.7486, "step": 82 }, { "epoch": 0.1815199562602515, "grad_norm": 0.22373591363430023, "learning_rate": 0.00018043478260869564, "loss": 0.79, "step": 83 }, { "epoch": 0.18370694368507381, "grad_norm": 0.21466179192066193, "learning_rate": 0.0001826086956521739, "loss": 0.6091, "step": 84 }, { "epoch": 0.1858939311098961, "grad_norm": 0.3204774558544159, "learning_rate": 0.00018478260869565216, "loss": 1.015, "step": 85 }, { "epoch": 0.18808091853471842, "grad_norm": 0.272977739572525, "learning_rate": 0.00018695652173913042, "loss": 0.7317, "step": 86 }, { "epoch": 0.19026790595954074, "grad_norm": 0.32803332805633545, "learning_rate": 0.00018913043478260868, "loss": 0.7552, "step": 87 }, { "epoch": 0.19245489338436303, "grad_norm": 0.308023065328598, "learning_rate": 0.0001913043478260869, "loss": 0.7058, "step": 88 }, { "epoch": 0.19464188080918535, "grad_norm": 0.2604801654815674, "learning_rate": 0.0001934782608695652, "loss": 0.6967, "step": 89 }, { "epoch": 0.19682886823400766, "grad_norm": 0.3489021062850952, "learning_rate": 0.00019565217391304346, "loss": 0.7518, "step": 90 }, { "epoch": 0.19901585565882995, "grad_norm": 0.6137279272079468, "learning_rate": 0.00019782608695652172, "loss": 0.635, "step": 91 }, { "epoch": 0.20120284308365227, "grad_norm": 0.41480115056037903, "learning_rate": 0.00019999999999999998, "loss": 0.8928, "step": 92 }, { "epoch": 0.2033898305084746, "grad_norm": 0.22284042835235596, "learning_rate": 0.00020217391304347824, "loss": 0.5862, "step": 93 }, { "epoch": 0.20557681793329688, "grad_norm": 0.233658567070961, "learning_rate": 0.0002043478260869565, "loss": 0.8148, "step": 94 }, { "epoch": 0.2077638053581192, "grad_norm": 0.21716511249542236, "learning_rate": 0.00020652173913043474, "loss": 0.6474, "step": 95 }, { "epoch": 0.2099507927829415, "grad_norm": 0.506393551826477, "learning_rate": 0.00020869565217391303, "loss": 0.7149, "step": 96 }, { "epoch": 0.2121377802077638, "grad_norm": 0.3504016697406769, "learning_rate": 0.00021086956521739129, "loss": 0.647, "step": 97 }, { "epoch": 0.21432476763258612, "grad_norm": 0.28688108921051025, "learning_rate": 0.00021304347826086955, "loss": 0.6584, "step": 98 }, { "epoch": 0.2165117550574084, "grad_norm": 0.35572630167007446, "learning_rate": 0.0002152173913043478, "loss": 0.8177, "step": 99 }, { "epoch": 0.21869874248223073, "grad_norm": 0.30645623803138733, "learning_rate": 0.00021739130434782607, "loss": 0.7421, "step": 100 }, { "epoch": 0.22088572990705305, "grad_norm": 0.480013370513916, "learning_rate": 0.00021956521739130433, "loss": 0.7542, "step": 101 }, { "epoch": 0.22307271733187534, "grad_norm": 0.23101027309894562, "learning_rate": 0.00022173913043478256, "loss": 0.81, "step": 102 }, { "epoch": 0.22525970475669765, "grad_norm": 0.37322309613227844, "learning_rate": 0.00022391304347826085, "loss": 0.8879, "step": 103 }, { "epoch": 0.22744669218151994, "grad_norm": 1.5672107934951782, "learning_rate": 0.0002260869565217391, "loss": 0.7838, "step": 104 }, { "epoch": 0.22963367960634226, "grad_norm": 0.5281320810317993, "learning_rate": 0.0002282608695652174, "loss": 0.7246, "step": 105 }, { "epoch": 0.23182066703116458, "grad_norm": 0.597309947013855, "learning_rate": 0.00023043478260869563, "loss": 0.6229, "step": 106 }, { "epoch": 0.23400765445598687, "grad_norm": 0.29928773641586304, "learning_rate": 0.0002326086956521739, "loss": 0.779, "step": 107 }, { "epoch": 0.2361946418808092, "grad_norm": 0.3042626678943634, "learning_rate": 0.00023478260869565215, "loss": 0.6647, "step": 108 }, { "epoch": 0.2383816293056315, "grad_norm": 0.3099993169307709, "learning_rate": 0.00023695652173913041, "loss": 0.8173, "step": 109 }, { "epoch": 0.2405686167304538, "grad_norm": 0.21835339069366455, "learning_rate": 0.00023913043478260867, "loss": 0.7145, "step": 110 }, { "epoch": 0.2427556041552761, "grad_norm": 0.2737351357936859, "learning_rate": 0.00024130434782608694, "loss": 0.754, "step": 111 }, { "epoch": 0.2449425915800984, "grad_norm": 0.2737314999103546, "learning_rate": 0.00024347826086956522, "loss": 0.6692, "step": 112 }, { "epoch": 0.24712957900492072, "grad_norm": 0.369526743888855, "learning_rate": 0.00024565217391304343, "loss": 0.7039, "step": 113 }, { "epoch": 0.24931656642974304, "grad_norm": 0.2262083888053894, "learning_rate": 0.0002478260869565217, "loss": 0.6004, "step": 114 }, { "epoch": 0.25150355385456535, "grad_norm": 0.42596694827079773, "learning_rate": 0.00025, "loss": 0.8972, "step": 115 }, { "epoch": 0.2536905412793876, "grad_norm": 0.4870564043521881, "learning_rate": 0.00025217391304347824, "loss": 0.7305, "step": 116 }, { "epoch": 0.25587752870420993, "grad_norm": 0.3326433598995209, "learning_rate": 0.00025434782608695647, "loss": 0.7079, "step": 117 }, { "epoch": 0.25806451612903225, "grad_norm": 0.3588925004005432, "learning_rate": 0.00025652173913043476, "loss": 0.7682, "step": 118 }, { "epoch": 0.26025150355385457, "grad_norm": 0.2966621518135071, "learning_rate": 0.00025869565217391305, "loss": 0.8244, "step": 119 }, { "epoch": 0.2624384909786769, "grad_norm": 0.2213324010372162, "learning_rate": 0.0002608695652173913, "loss": 0.841, "step": 120 }, { "epoch": 0.2646254784034992, "grad_norm": 0.28340932726860046, "learning_rate": 0.00026304347826086957, "loss": 0.7646, "step": 121 }, { "epoch": 0.26681246582832147, "grad_norm": 0.3026011884212494, "learning_rate": 0.0002652173913043478, "loss": 0.8269, "step": 122 }, { "epoch": 0.2689994532531438, "grad_norm": 0.3213091194629669, "learning_rate": 0.00026739130434782604, "loss": 0.7456, "step": 123 }, { "epoch": 0.2711864406779661, "grad_norm": 0.24254000186920166, "learning_rate": 0.0002695652173913043, "loss": 0.786, "step": 124 }, { "epoch": 0.2733734281027884, "grad_norm": 0.22490260004997253, "learning_rate": 0.0002717391304347826, "loss": 0.8288, "step": 125 }, { "epoch": 0.27556041552761074, "grad_norm": 0.2039777934551239, "learning_rate": 0.00027391304347826085, "loss": 0.7204, "step": 126 }, { "epoch": 0.277747402952433, "grad_norm": 0.2281191200017929, "learning_rate": 0.0002760869565217391, "loss": 0.5744, "step": 127 }, { "epoch": 0.2799343903772553, "grad_norm": 0.33240583539009094, "learning_rate": 0.00027826086956521737, "loss": 0.6398, "step": 128 }, { "epoch": 0.28212137780207763, "grad_norm": 0.38755086064338684, "learning_rate": 0.00028043478260869565, "loss": 0.6739, "step": 129 }, { "epoch": 0.28430836522689995, "grad_norm": 0.5284032821655273, "learning_rate": 0.0002826086956521739, "loss": 1.0215, "step": 130 }, { "epoch": 0.28649535265172227, "grad_norm": 0.8248558044433594, "learning_rate": 0.0002847826086956521, "loss": 0.6937, "step": 131 }, { "epoch": 0.2886823400765446, "grad_norm": 0.264347106218338, "learning_rate": 0.0002869565217391304, "loss": 0.6745, "step": 132 }, { "epoch": 0.29086932750136685, "grad_norm": 0.24335810542106628, "learning_rate": 0.00028913043478260864, "loss": 0.8085, "step": 133 }, { "epoch": 0.29305631492618917, "grad_norm": 0.2641212046146393, "learning_rate": 0.00029130434782608693, "loss": 0.6991, "step": 134 }, { "epoch": 0.2952433023510115, "grad_norm": 0.2698618769645691, "learning_rate": 0.0002934782608695652, "loss": 0.7643, "step": 135 }, { "epoch": 0.2974302897758338, "grad_norm": 0.24988499283790588, "learning_rate": 0.00029565217391304345, "loss": 0.8905, "step": 136 }, { "epoch": 0.2996172772006561, "grad_norm": 0.2180056869983673, "learning_rate": 0.0002978260869565217, "loss": 0.7743, "step": 137 }, { "epoch": 0.3018042646254784, "grad_norm": 0.23834429681301117, "learning_rate": 0.0003, "loss": 0.6164, "step": 138 }, { "epoch": 0.3039912520503007, "grad_norm": 0.33471837639808655, "learning_rate": 0.00029975669099756687, "loss": 0.9367, "step": 139 }, { "epoch": 0.306178239475123, "grad_norm": 0.22311441600322723, "learning_rate": 0.0002995133819951338, "loss": 0.8235, "step": 140 }, { "epoch": 0.30836522689994533, "grad_norm": 0.16766682267189026, "learning_rate": 0.0002992700729927007, "loss": 0.6212, "step": 141 }, { "epoch": 0.31055221432476765, "grad_norm": 0.21076077222824097, "learning_rate": 0.0002990267639902676, "loss": 0.7472, "step": 142 }, { "epoch": 0.3127392017495899, "grad_norm": 0.33612027764320374, "learning_rate": 0.0002987834549878345, "loss": 0.7475, "step": 143 }, { "epoch": 0.31492618917441223, "grad_norm": 0.2724473476409912, "learning_rate": 0.0002985401459854014, "loss": 0.7422, "step": 144 }, { "epoch": 0.31711317659923455, "grad_norm": 0.23170293867588043, "learning_rate": 0.0002982968369829683, "loss": 0.7233, "step": 145 }, { "epoch": 0.31930016402405687, "grad_norm": 0.2461654394865036, "learning_rate": 0.00029805352798053527, "loss": 0.6717, "step": 146 }, { "epoch": 0.3214871514488792, "grad_norm": 0.2988247573375702, "learning_rate": 0.00029781021897810217, "loss": 0.8926, "step": 147 }, { "epoch": 0.3236741388737015, "grad_norm": 0.18185736238956451, "learning_rate": 0.00029756690997566907, "loss": 0.6663, "step": 148 }, { "epoch": 0.32586112629852376, "grad_norm": 0.276687890291214, "learning_rate": 0.000297323600973236, "loss": 0.6903, "step": 149 }, { "epoch": 0.3280481137233461, "grad_norm": 0.3481093645095825, "learning_rate": 0.0002970802919708029, "loss": 0.7468, "step": 150 }, { "epoch": 0.3302351011481684, "grad_norm": 0.21930567920207977, "learning_rate": 0.0002968369829683698, "loss": 0.6268, "step": 151 }, { "epoch": 0.3324220885729907, "grad_norm": 0.18267425894737244, "learning_rate": 0.0002965936739659367, "loss": 0.7194, "step": 152 }, { "epoch": 0.33460907599781303, "grad_norm": 0.7277535200119019, "learning_rate": 0.0002963503649635036, "loss": 0.7393, "step": 153 }, { "epoch": 0.3367960634226353, "grad_norm": 0.3378921151161194, "learning_rate": 0.0002961070559610705, "loss": 0.7413, "step": 154 }, { "epoch": 0.3389830508474576, "grad_norm": 0.20400595664978027, "learning_rate": 0.00029586374695863746, "loss": 0.7604, "step": 155 }, { "epoch": 0.34117003827227993, "grad_norm": 0.3428679406642914, "learning_rate": 0.00029562043795620436, "loss": 0.6905, "step": 156 }, { "epoch": 0.34335702569710225, "grad_norm": 0.25741925835609436, "learning_rate": 0.00029537712895377126, "loss": 0.8333, "step": 157 }, { "epoch": 0.34554401312192456, "grad_norm": 0.2198708951473236, "learning_rate": 0.00029513381995133816, "loss": 0.7183, "step": 158 }, { "epoch": 0.3477310005467469, "grad_norm": 0.2663215696811676, "learning_rate": 0.0002948905109489051, "loss": 0.6736, "step": 159 }, { "epoch": 0.34991798797156914, "grad_norm": 0.26539289951324463, "learning_rate": 0.000294647201946472, "loss": 0.7691, "step": 160 }, { "epoch": 0.35210497539639146, "grad_norm": 0.21398472785949707, "learning_rate": 0.0002944038929440389, "loss": 0.7259, "step": 161 }, { "epoch": 0.3542919628212138, "grad_norm": 0.27584224939346313, "learning_rate": 0.0002941605839416058, "loss": 0.7451, "step": 162 }, { "epoch": 0.3564789502460361, "grad_norm": 0.27322661876678467, "learning_rate": 0.0002939172749391727, "loss": 0.7429, "step": 163 }, { "epoch": 0.3586659376708584, "grad_norm": 0.3097633421421051, "learning_rate": 0.0002936739659367396, "loss": 0.7925, "step": 164 }, { "epoch": 0.3608529250956807, "grad_norm": 0.235543355345726, "learning_rate": 0.00029343065693430656, "loss": 0.6892, "step": 165 }, { "epoch": 0.363039912520503, "grad_norm": 0.34558114409446716, "learning_rate": 0.00029318734793187345, "loss": 0.8239, "step": 166 }, { "epoch": 0.3652268999453253, "grad_norm": 0.5169651508331299, "learning_rate": 0.00029294403892944035, "loss": 0.5348, "step": 167 }, { "epoch": 0.36741388737014763, "grad_norm": 0.4853683412075043, "learning_rate": 0.0002927007299270073, "loss": 0.7482, "step": 168 }, { "epoch": 0.36960087479496995, "grad_norm": 0.3244207203388214, "learning_rate": 0.0002924574209245742, "loss": 0.6755, "step": 169 }, { "epoch": 0.3717878622197922, "grad_norm": 0.3096265494823456, "learning_rate": 0.0002922141119221411, "loss": 0.8395, "step": 170 }, { "epoch": 0.3739748496446145, "grad_norm": 0.21022038161754608, "learning_rate": 0.000291970802919708, "loss": 0.7376, "step": 171 }, { "epoch": 0.37616183706943684, "grad_norm": 0.23877666890621185, "learning_rate": 0.0002917274939172749, "loss": 0.7051, "step": 172 }, { "epoch": 0.37834882449425916, "grad_norm": 0.4041813015937805, "learning_rate": 0.0002914841849148418, "loss": 0.6341, "step": 173 }, { "epoch": 0.3805358119190815, "grad_norm": 0.45476263761520386, "learning_rate": 0.00029124087591240875, "loss": 0.6939, "step": 174 }, { "epoch": 0.3827227993439038, "grad_norm": 0.3100184202194214, "learning_rate": 0.00029099756690997565, "loss": 0.6321, "step": 175 }, { "epoch": 0.38490978676872606, "grad_norm": 0.31327834725379944, "learning_rate": 0.00029075425790754255, "loss": 0.623, "step": 176 }, { "epoch": 0.3870967741935484, "grad_norm": 0.23366397619247437, "learning_rate": 0.0002905109489051095, "loss": 0.6799, "step": 177 }, { "epoch": 0.3892837616183707, "grad_norm": 0.312284380197525, "learning_rate": 0.0002902676399026764, "loss": 0.6979, "step": 178 }, { "epoch": 0.391470749043193, "grad_norm": 0.39591529965400696, "learning_rate": 0.0002900243309002433, "loss": 0.8571, "step": 179 }, { "epoch": 0.39365773646801533, "grad_norm": 0.22407367825508118, "learning_rate": 0.0002897810218978102, "loss": 0.7724, "step": 180 }, { "epoch": 0.3958447238928376, "grad_norm": 0.41758400201797485, "learning_rate": 0.0002895377128953771, "loss": 0.5597, "step": 181 }, { "epoch": 0.3980317113176599, "grad_norm": 0.22731241583824158, "learning_rate": 0.000289294403892944, "loss": 0.7618, "step": 182 }, { "epoch": 0.4002186987424822, "grad_norm": 0.24491345882415771, "learning_rate": 0.00028905109489051094, "loss": 0.6777, "step": 183 }, { "epoch": 0.40240568616730454, "grad_norm": 0.2861243188381195, "learning_rate": 0.00028880778588807784, "loss": 0.8928, "step": 184 }, { "epoch": 0.40459267359212686, "grad_norm": 0.30325135588645935, "learning_rate": 0.00028856447688564474, "loss": 0.6794, "step": 185 }, { "epoch": 0.4067796610169492, "grad_norm": 0.22165870666503906, "learning_rate": 0.0002883211678832117, "loss": 0.7288, "step": 186 }, { "epoch": 0.40896664844177144, "grad_norm": 0.265067994594574, "learning_rate": 0.0002880778588807786, "loss": 0.6641, "step": 187 }, { "epoch": 0.41115363586659376, "grad_norm": 0.3085087835788727, "learning_rate": 0.0002878345498783455, "loss": 0.7916, "step": 188 }, { "epoch": 0.4133406232914161, "grad_norm": 0.30947744846343994, "learning_rate": 0.0002875912408759124, "loss": 0.834, "step": 189 }, { "epoch": 0.4155276107162384, "grad_norm": 0.2581535875797272, "learning_rate": 0.0002873479318734793, "loss": 0.6255, "step": 190 }, { "epoch": 0.4177145981410607, "grad_norm": 0.24718667566776276, "learning_rate": 0.0002871046228710462, "loss": 0.7883, "step": 191 }, { "epoch": 0.419901585565883, "grad_norm": 0.2618321180343628, "learning_rate": 0.00028686131386861314, "loss": 0.6922, "step": 192 }, { "epoch": 0.4220885729907053, "grad_norm": 0.24760881066322327, "learning_rate": 0.00028661800486618004, "loss": 0.7304, "step": 193 }, { "epoch": 0.4242755604155276, "grad_norm": 0.27126792073249817, "learning_rate": 0.00028637469586374693, "loss": 0.5676, "step": 194 }, { "epoch": 0.4264625478403499, "grad_norm": 0.1799423098564148, "learning_rate": 0.00028613138686131383, "loss": 0.7223, "step": 195 }, { "epoch": 0.42864953526517224, "grad_norm": 0.2653333246707916, "learning_rate": 0.0002858880778588808, "loss": 0.7486, "step": 196 }, { "epoch": 0.4308365226899945, "grad_norm": 0.17445164918899536, "learning_rate": 0.0002856447688564477, "loss": 0.6661, "step": 197 }, { "epoch": 0.4330235101148168, "grad_norm": 0.20842154324054718, "learning_rate": 0.0002854014598540146, "loss": 0.5784, "step": 198 }, { "epoch": 0.43521049753963914, "grad_norm": 0.2216557264328003, "learning_rate": 0.0002851581508515815, "loss": 0.8205, "step": 199 }, { "epoch": 0.43739748496446146, "grad_norm": 0.3524712920188904, "learning_rate": 0.0002849148418491484, "loss": 0.8784, "step": 200 }, { "epoch": 0.4395844723892838, "grad_norm": 0.22435776889324188, "learning_rate": 0.0002846715328467153, "loss": 0.7975, "step": 201 }, { "epoch": 0.4417714598141061, "grad_norm": 0.33707621693611145, "learning_rate": 0.00028442822384428223, "loss": 0.8767, "step": 202 }, { "epoch": 0.44395844723892836, "grad_norm": 0.20236724615097046, "learning_rate": 0.00028418491484184913, "loss": 0.6695, "step": 203 }, { "epoch": 0.4461454346637507, "grad_norm": 0.26543137431144714, "learning_rate": 0.000283941605839416, "loss": 0.7137, "step": 204 }, { "epoch": 0.448332422088573, "grad_norm": 0.21210715174674988, "learning_rate": 0.000283698296836983, "loss": 0.8809, "step": 205 }, { "epoch": 0.4505194095133953, "grad_norm": 0.21614502370357513, "learning_rate": 0.0002834549878345499, "loss": 0.6771, "step": 206 }, { "epoch": 0.4527063969382176, "grad_norm": 0.30795833468437195, "learning_rate": 0.0002832116788321168, "loss": 0.6966, "step": 207 }, { "epoch": 0.4548933843630399, "grad_norm": 0.4060954749584198, "learning_rate": 0.0002829683698296837, "loss": 0.7059, "step": 208 }, { "epoch": 0.4570803717878622, "grad_norm": 0.24772609770298004, "learning_rate": 0.00028272506082725057, "loss": 0.6992, "step": 209 }, { "epoch": 0.4592673592126845, "grad_norm": 0.2909943461418152, "learning_rate": 0.00028248175182481747, "loss": 0.8624, "step": 210 }, { "epoch": 0.46145434663750684, "grad_norm": 0.2036535143852234, "learning_rate": 0.0002822384428223844, "loss": 0.7753, "step": 211 }, { "epoch": 0.46364133406232916, "grad_norm": 0.1994384229183197, "learning_rate": 0.0002819951338199513, "loss": 0.7294, "step": 212 }, { "epoch": 0.4658283214871515, "grad_norm": 0.2482912242412567, "learning_rate": 0.0002817518248175182, "loss": 0.6213, "step": 213 }, { "epoch": 0.46801530891197374, "grad_norm": 0.42890939116477966, "learning_rate": 0.0002815085158150851, "loss": 0.8935, "step": 214 }, { "epoch": 0.47020229633679606, "grad_norm": 0.24268397688865662, "learning_rate": 0.000281265206812652, "loss": 0.6253, "step": 215 }, { "epoch": 0.4723892837616184, "grad_norm": 0.3331579267978668, "learning_rate": 0.00028102189781021897, "loss": 0.7022, "step": 216 }, { "epoch": 0.4745762711864407, "grad_norm": 0.34377002716064453, "learning_rate": 0.00028077858880778587, "loss": 0.8386, "step": 217 }, { "epoch": 0.476763258611263, "grad_norm": 0.2543483078479767, "learning_rate": 0.00028053527980535277, "loss": 0.6084, "step": 218 }, { "epoch": 0.47895024603608527, "grad_norm": 0.30651986598968506, "learning_rate": 0.00028029197080291966, "loss": 0.7624, "step": 219 }, { "epoch": 0.4811372334609076, "grad_norm": 0.3476787209510803, "learning_rate": 0.0002800486618004866, "loss": 0.822, "step": 220 }, { "epoch": 0.4833242208857299, "grad_norm": 0.3727283477783203, "learning_rate": 0.0002798053527980535, "loss": 0.7416, "step": 221 }, { "epoch": 0.4855112083105522, "grad_norm": 0.3289774954319, "learning_rate": 0.0002795620437956204, "loss": 0.8264, "step": 222 }, { "epoch": 0.48769819573537454, "grad_norm": 0.26083284616470337, "learning_rate": 0.0002793187347931873, "loss": 0.6279, "step": 223 }, { "epoch": 0.4898851831601968, "grad_norm": 0.2844780683517456, "learning_rate": 0.0002790754257907542, "loss": 0.6315, "step": 224 }, { "epoch": 0.4920721705850191, "grad_norm": 0.3443123996257782, "learning_rate": 0.0002788321167883211, "loss": 0.6538, "step": 225 }, { "epoch": 0.49425915800984144, "grad_norm": 0.23209474980831146, "learning_rate": 0.00027858880778588806, "loss": 0.7205, "step": 226 }, { "epoch": 0.49644614543466375, "grad_norm": 0.26261788606643677, "learning_rate": 0.00027834549878345496, "loss": 0.7253, "step": 227 }, { "epoch": 0.4986331328594861, "grad_norm": 0.28650718927383423, "learning_rate": 0.00027810218978102186, "loss": 0.889, "step": 228 }, { "epoch": 0.5008201202843083, "grad_norm": 0.2478565275669098, "learning_rate": 0.0002778588807785888, "loss": 0.7619, "step": 229 }, { "epoch": 0.5030071077091307, "grad_norm": 0.17673347890377045, "learning_rate": 0.0002776155717761557, "loss": 0.8684, "step": 230 }, { "epoch": 0.505194095133953, "grad_norm": 0.28806573152542114, "learning_rate": 0.0002773722627737226, "loss": 0.7499, "step": 231 }, { "epoch": 0.5073810825587752, "grad_norm": 0.2507832646369934, "learning_rate": 0.0002771289537712895, "loss": 0.9297, "step": 232 }, { "epoch": 0.5095680699835976, "grad_norm": 0.29228198528289795, "learning_rate": 0.0002768856447688564, "loss": 0.8578, "step": 233 }, { "epoch": 0.5117550574084199, "grad_norm": 0.5378915667533875, "learning_rate": 0.0002766423357664233, "loss": 0.8647, "step": 234 }, { "epoch": 0.5139420448332422, "grad_norm": 0.6002528071403503, "learning_rate": 0.0002763990267639902, "loss": 0.8368, "step": 235 }, { "epoch": 0.5161290322580645, "grad_norm": 0.19659245014190674, "learning_rate": 0.00027615571776155715, "loss": 0.6983, "step": 236 }, { "epoch": 0.5183160196828869, "grad_norm": 0.2815648913383484, "learning_rate": 0.00027591240875912405, "loss": 0.7741, "step": 237 }, { "epoch": 0.5205030071077091, "grad_norm": 0.2534239888191223, "learning_rate": 0.00027566909975669095, "loss": 0.9392, "step": 238 }, { "epoch": 0.5226899945325314, "grad_norm": 0.30477020144462585, "learning_rate": 0.0002754257907542579, "loss": 0.7839, "step": 239 }, { "epoch": 0.5248769819573538, "grad_norm": 0.321443647146225, "learning_rate": 0.0002751824817518248, "loss": 0.8445, "step": 240 }, { "epoch": 0.527063969382176, "grad_norm": 0.3917739689350128, "learning_rate": 0.0002749391727493917, "loss": 0.6641, "step": 241 }, { "epoch": 0.5292509568069984, "grad_norm": 0.2380986511707306, "learning_rate": 0.0002746958637469586, "loss": 0.8242, "step": 242 }, { "epoch": 0.5314379442318207, "grad_norm": 0.1695939153432846, "learning_rate": 0.0002744525547445255, "loss": 0.7013, "step": 243 }, { "epoch": 0.5336249316566429, "grad_norm": 0.24696393311023712, "learning_rate": 0.0002742092457420924, "loss": 0.8488, "step": 244 }, { "epoch": 0.5358119190814653, "grad_norm": 0.2278507500886917, "learning_rate": 0.00027396593673965935, "loss": 0.7894, "step": 245 }, { "epoch": 0.5379989065062876, "grad_norm": 0.41331958770751953, "learning_rate": 0.00027372262773722625, "loss": 0.8343, "step": 246 }, { "epoch": 0.5401858939311099, "grad_norm": 0.29076704382896423, "learning_rate": 0.00027347931873479315, "loss": 0.995, "step": 247 }, { "epoch": 0.5423728813559322, "grad_norm": 0.23243111371994019, "learning_rate": 0.0002732360097323601, "loss": 0.7456, "step": 248 }, { "epoch": 0.5445598687807545, "grad_norm": 0.21154357492923737, "learning_rate": 0.000272992700729927, "loss": 0.6853, "step": 249 }, { "epoch": 0.5467468562055768, "grad_norm": 0.24274934828281403, "learning_rate": 0.0002727493917274939, "loss": 0.6452, "step": 250 }, { "epoch": 0.5489338436303991, "grad_norm": 0.37139129638671875, "learning_rate": 0.0002725060827250608, "loss": 0.7449, "step": 251 }, { "epoch": 0.5511208310552215, "grad_norm": 0.17621925473213196, "learning_rate": 0.0002722627737226277, "loss": 0.6824, "step": 252 }, { "epoch": 0.5533078184800437, "grad_norm": 0.19210177659988403, "learning_rate": 0.0002720194647201946, "loss": 0.6186, "step": 253 }, { "epoch": 0.555494805904866, "grad_norm": 0.21780337393283844, "learning_rate": 0.00027177615571776154, "loss": 0.663, "step": 254 }, { "epoch": 0.5576817933296884, "grad_norm": 0.21192163228988647, "learning_rate": 0.00027153284671532844, "loss": 0.8801, "step": 255 }, { "epoch": 0.5598687807545106, "grad_norm": 0.27523308992385864, "learning_rate": 0.00027128953771289534, "loss": 0.6769, "step": 256 }, { "epoch": 0.562055768179333, "grad_norm": 0.24207553267478943, "learning_rate": 0.0002710462287104623, "loss": 0.4965, "step": 257 }, { "epoch": 0.5642427556041553, "grad_norm": 0.33707237243652344, "learning_rate": 0.0002708029197080292, "loss": 0.7787, "step": 258 }, { "epoch": 0.5664297430289775, "grad_norm": 0.2669321596622467, "learning_rate": 0.0002705596107055961, "loss": 1.0172, "step": 259 }, { "epoch": 0.5686167304537999, "grad_norm": 0.26386845111846924, "learning_rate": 0.000270316301703163, "loss": 0.6477, "step": 260 }, { "epoch": 0.5708037178786222, "grad_norm": 0.304721474647522, "learning_rate": 0.0002700729927007299, "loss": 0.8301, "step": 261 }, { "epoch": 0.5729907053034445, "grad_norm": 0.20255905389785767, "learning_rate": 0.0002698296836982968, "loss": 0.5643, "step": 262 }, { "epoch": 0.5751776927282668, "grad_norm": 0.2723388671875, "learning_rate": 0.00026958637469586374, "loss": 0.6883, "step": 263 }, { "epoch": 0.5773646801530892, "grad_norm": 0.27381351590156555, "learning_rate": 0.00026934306569343063, "loss": 0.808, "step": 264 }, { "epoch": 0.5795516675779114, "grad_norm": 0.25915855169296265, "learning_rate": 0.00026909975669099753, "loss": 0.722, "step": 265 }, { "epoch": 0.5817386550027337, "grad_norm": 0.22392873466014862, "learning_rate": 0.0002688564476885645, "loss": 0.6744, "step": 266 }, { "epoch": 0.5839256424275561, "grad_norm": 0.2078748643398285, "learning_rate": 0.0002686131386861314, "loss": 0.8127, "step": 267 }, { "epoch": 0.5861126298523783, "grad_norm": 0.18671007454395294, "learning_rate": 0.0002683698296836983, "loss": 0.6276, "step": 268 }, { "epoch": 0.5882996172772007, "grad_norm": 0.3014012575149536, "learning_rate": 0.0002681265206812652, "loss": 0.7543, "step": 269 }, { "epoch": 0.590486604702023, "grad_norm": 0.23588421940803528, "learning_rate": 0.0002678832116788321, "loss": 0.8301, "step": 270 }, { "epoch": 0.5926735921268452, "grad_norm": 0.37635311484336853, "learning_rate": 0.000267639902676399, "loss": 0.8239, "step": 271 }, { "epoch": 0.5948605795516676, "grad_norm": 0.23310554027557373, "learning_rate": 0.0002673965936739659, "loss": 0.8723, "step": 272 }, { "epoch": 0.5970475669764899, "grad_norm": 0.47537633776664734, "learning_rate": 0.00026715328467153283, "loss": 0.7915, "step": 273 }, { "epoch": 0.5992345544013122, "grad_norm": 0.2815110981464386, "learning_rate": 0.0002669099756690997, "loss": 0.8004, "step": 274 }, { "epoch": 0.6014215418261345, "grad_norm": 0.19834642112255096, "learning_rate": 0.0002666666666666666, "loss": 0.7457, "step": 275 }, { "epoch": 0.6036085292509568, "grad_norm": 0.5626861453056335, "learning_rate": 0.0002664233576642336, "loss": 0.6196, "step": 276 }, { "epoch": 0.6057955166757791, "grad_norm": 0.2784450054168701, "learning_rate": 0.0002661800486618005, "loss": 0.6365, "step": 277 }, { "epoch": 0.6079825041006014, "grad_norm": 0.23809124529361725, "learning_rate": 0.0002659367396593674, "loss": 0.7889, "step": 278 }, { "epoch": 0.6101694915254238, "grad_norm": 0.25168001651763916, "learning_rate": 0.0002656934306569343, "loss": 0.6327, "step": 279 }, { "epoch": 0.612356478950246, "grad_norm": 0.2970046401023865, "learning_rate": 0.00026545012165450117, "loss": 0.6913, "step": 280 }, { "epoch": 0.6145434663750683, "grad_norm": 0.3090710937976837, "learning_rate": 0.00026520681265206807, "loss": 0.7131, "step": 281 }, { "epoch": 0.6167304537998907, "grad_norm": 0.2775273621082306, "learning_rate": 0.000264963503649635, "loss": 0.8556, "step": 282 }, { "epoch": 0.6189174412247129, "grad_norm": 0.3191220164299011, "learning_rate": 0.0002647201946472019, "loss": 0.8762, "step": 283 }, { "epoch": 0.6211044286495353, "grad_norm": 0.2520481050014496, "learning_rate": 0.0002644768856447688, "loss": 0.6358, "step": 284 }, { "epoch": 0.6232914160743576, "grad_norm": 0.31783685088157654, "learning_rate": 0.00026423357664233577, "loss": 0.773, "step": 285 }, { "epoch": 0.6254784034991798, "grad_norm": 0.33624374866485596, "learning_rate": 0.00026399026763990267, "loss": 0.963, "step": 286 }, { "epoch": 0.6276653909240022, "grad_norm": 0.3576049208641052, "learning_rate": 0.00026374695863746957, "loss": 0.6658, "step": 287 }, { "epoch": 0.6298523783488245, "grad_norm": 0.2659110426902771, "learning_rate": 0.00026350364963503647, "loss": 0.6662, "step": 288 }, { "epoch": 0.6320393657736468, "grad_norm": 0.3657420575618744, "learning_rate": 0.00026326034063260337, "loss": 0.9873, "step": 289 }, { "epoch": 0.6342263531984691, "grad_norm": 0.24509188532829285, "learning_rate": 0.00026301703163017026, "loss": 0.7795, "step": 290 }, { "epoch": 0.6364133406232915, "grad_norm": 0.24286092817783356, "learning_rate": 0.0002627737226277372, "loss": 0.7611, "step": 291 }, { "epoch": 0.6386003280481137, "grad_norm": 0.2804836332798004, "learning_rate": 0.0002625304136253041, "loss": 0.759, "step": 292 }, { "epoch": 0.640787315472936, "grad_norm": 0.3322978615760803, "learning_rate": 0.000262287104622871, "loss": 0.6943, "step": 293 }, { "epoch": 0.6429743028977584, "grad_norm": 0.2114831805229187, "learning_rate": 0.00026204379562043797, "loss": 0.6729, "step": 294 }, { "epoch": 0.6451612903225806, "grad_norm": 0.2177094966173172, "learning_rate": 0.00026180048661800486, "loss": 0.7916, "step": 295 }, { "epoch": 0.647348277747403, "grad_norm": 0.2582005560398102, "learning_rate": 0.00026155717761557176, "loss": 0.7655, "step": 296 }, { "epoch": 0.6495352651722253, "grad_norm": 0.2613639831542969, "learning_rate": 0.00026131386861313866, "loss": 0.6482, "step": 297 }, { "epoch": 0.6517222525970475, "grad_norm": 0.2764948606491089, "learning_rate": 0.00026107055961070556, "loss": 0.7022, "step": 298 }, { "epoch": 0.6539092400218699, "grad_norm": 0.20186789333820343, "learning_rate": 0.00026082725060827246, "loss": 0.7853, "step": 299 }, { "epoch": 0.6560962274466922, "grad_norm": 0.3178173303604126, "learning_rate": 0.0002605839416058394, "loss": 0.8393, "step": 300 }, { "epoch": 0.6582832148715145, "grad_norm": 0.35939186811447144, "learning_rate": 0.0002603406326034063, "loss": 0.7078, "step": 301 }, { "epoch": 0.6604702022963368, "grad_norm": 0.3983876407146454, "learning_rate": 0.0002600973236009732, "loss": 0.8271, "step": 302 }, { "epoch": 0.6626571897211591, "grad_norm": 0.19504043459892273, "learning_rate": 0.00025985401459854016, "loss": 0.7748, "step": 303 }, { "epoch": 0.6648441771459814, "grad_norm": 0.21278342604637146, "learning_rate": 0.00025961070559610706, "loss": 0.8016, "step": 304 }, { "epoch": 0.6670311645708037, "grad_norm": 0.29927191138267517, "learning_rate": 0.00025936739659367396, "loss": 0.844, "step": 305 }, { "epoch": 0.6692181519956261, "grad_norm": 0.22748655080795288, "learning_rate": 0.00025912408759124085, "loss": 0.6786, "step": 306 }, { "epoch": 0.6714051394204483, "grad_norm": 0.21796458959579468, "learning_rate": 0.00025888077858880775, "loss": 0.8343, "step": 307 }, { "epoch": 0.6735921268452706, "grad_norm": 0.26962918043136597, "learning_rate": 0.00025863746958637465, "loss": 0.8058, "step": 308 }, { "epoch": 0.675779114270093, "grad_norm": 0.2169698178768158, "learning_rate": 0.00025839416058394155, "loss": 0.8341, "step": 309 }, { "epoch": 0.6779661016949152, "grad_norm": 0.5226082801818848, "learning_rate": 0.0002581508515815085, "loss": 0.8038, "step": 310 }, { "epoch": 0.6801530891197376, "grad_norm": 0.2540872395038605, "learning_rate": 0.0002579075425790754, "loss": 0.6485, "step": 311 }, { "epoch": 0.6823400765445599, "grad_norm": 0.2758027911186218, "learning_rate": 0.0002576642335766423, "loss": 0.7258, "step": 312 }, { "epoch": 0.6845270639693821, "grad_norm": 0.3712478280067444, "learning_rate": 0.00025742092457420925, "loss": 1.0087, "step": 313 }, { "epoch": 0.6867140513942045, "grad_norm": 0.29959022998809814, "learning_rate": 0.00025717761557177615, "loss": 0.7344, "step": 314 }, { "epoch": 0.6889010388190268, "grad_norm": 0.29603782296180725, "learning_rate": 0.00025693430656934305, "loss": 0.7633, "step": 315 }, { "epoch": 0.6910880262438491, "grad_norm": 0.26212218403816223, "learning_rate": 0.00025669099756690995, "loss": 0.7762, "step": 316 }, { "epoch": 0.6932750136686714, "grad_norm": 0.2501971423625946, "learning_rate": 0.00025644768856447685, "loss": 0.6449, "step": 317 }, { "epoch": 0.6954620010934938, "grad_norm": 0.20236985385417938, "learning_rate": 0.00025620437956204374, "loss": 0.6661, "step": 318 }, { "epoch": 0.697648988518316, "grad_norm": 0.28867748379707336, "learning_rate": 0.0002559610705596107, "loss": 0.7168, "step": 319 }, { "epoch": 0.6998359759431383, "grad_norm": 0.25392022728919983, "learning_rate": 0.0002557177615571776, "loss": 0.8255, "step": 320 }, { "epoch": 0.7020229633679607, "grad_norm": 0.2739144563674927, "learning_rate": 0.0002554744525547445, "loss": 0.8782, "step": 321 }, { "epoch": 0.7042099507927829, "grad_norm": 0.3195747137069702, "learning_rate": 0.00025523114355231145, "loss": 0.7681, "step": 322 }, { "epoch": 0.7063969382176053, "grad_norm": 0.6262739300727844, "learning_rate": 0.00025498783454987834, "loss": 0.6497, "step": 323 }, { "epoch": 0.7085839256424276, "grad_norm": 0.18836063146591187, "learning_rate": 0.00025474452554744524, "loss": 0.6773, "step": 324 }, { "epoch": 0.7107709130672498, "grad_norm": 0.428913950920105, "learning_rate": 0.00025450121654501214, "loss": 0.6359, "step": 325 }, { "epoch": 0.7129579004920722, "grad_norm": 0.2561635375022888, "learning_rate": 0.00025425790754257904, "loss": 0.6768, "step": 326 }, { "epoch": 0.7151448879168945, "grad_norm": 0.2519037425518036, "learning_rate": 0.00025401459854014594, "loss": 0.941, "step": 327 }, { "epoch": 0.7173318753417168, "grad_norm": 0.22086481750011444, "learning_rate": 0.0002537712895377129, "loss": 0.6448, "step": 328 }, { "epoch": 0.7195188627665391, "grad_norm": 0.3844771385192871, "learning_rate": 0.0002535279805352798, "loss": 0.6043, "step": 329 }, { "epoch": 0.7217058501913614, "grad_norm": 0.2547963857650757, "learning_rate": 0.0002532846715328467, "loss": 0.9912, "step": 330 }, { "epoch": 0.7238928376161837, "grad_norm": 0.40474840998649597, "learning_rate": 0.00025304136253041364, "loss": 0.5905, "step": 331 }, { "epoch": 0.726079825041006, "grad_norm": 0.20748649537563324, "learning_rate": 0.00025279805352798054, "loss": 0.6245, "step": 332 }, { "epoch": 0.7282668124658284, "grad_norm": 0.29902809858322144, "learning_rate": 0.00025255474452554744, "loss": 0.7478, "step": 333 }, { "epoch": 0.7304537998906506, "grad_norm": 0.21671514213085175, "learning_rate": 0.00025231143552311433, "loss": 0.5296, "step": 334 }, { "epoch": 0.7326407873154729, "grad_norm": 0.1979508250951767, "learning_rate": 0.00025206812652068123, "loss": 0.5523, "step": 335 }, { "epoch": 0.7348277747402953, "grad_norm": 0.25213825702667236, "learning_rate": 0.00025182481751824813, "loss": 0.9787, "step": 336 }, { "epoch": 0.7370147621651175, "grad_norm": 0.32967931032180786, "learning_rate": 0.0002515815085158151, "loss": 0.7161, "step": 337 }, { "epoch": 0.7392017495899399, "grad_norm": 0.30640098452568054, "learning_rate": 0.000251338199513382, "loss": 0.9517, "step": 338 }, { "epoch": 0.7413887370147622, "grad_norm": 0.1820855438709259, "learning_rate": 0.0002510948905109489, "loss": 0.6219, "step": 339 }, { "epoch": 0.7435757244395844, "grad_norm": 0.29584068059921265, "learning_rate": 0.00025085158150851583, "loss": 0.7692, "step": 340 }, { "epoch": 0.7457627118644068, "grad_norm": 0.3015952408313751, "learning_rate": 0.00025060827250608273, "loss": 0.812, "step": 341 }, { "epoch": 0.747949699289229, "grad_norm": 0.364886611700058, "learning_rate": 0.00025036496350364963, "loss": 0.7881, "step": 342 }, { "epoch": 0.7501366867140514, "grad_norm": 0.2170587182044983, "learning_rate": 0.00025012165450121653, "loss": 0.6989, "step": 343 }, { "epoch": 0.7523236741388737, "grad_norm": 0.23260867595672607, "learning_rate": 0.00024987834549878343, "loss": 0.6581, "step": 344 }, { "epoch": 0.7545106615636961, "grad_norm": 0.36740902066230774, "learning_rate": 0.0002496350364963503, "loss": 0.9984, "step": 345 }, { "epoch": 0.7566976489885183, "grad_norm": 0.6248576641082764, "learning_rate": 0.0002493917274939172, "loss": 0.9879, "step": 346 }, { "epoch": 0.7588846364133406, "grad_norm": 0.44404783844947815, "learning_rate": 0.0002491484184914842, "loss": 0.616, "step": 347 }, { "epoch": 0.761071623838163, "grad_norm": 0.2840265929698944, "learning_rate": 0.0002489051094890511, "loss": 0.9053, "step": 348 }, { "epoch": 0.7632586112629852, "grad_norm": 0.34335142374038696, "learning_rate": 0.000248661800486618, "loss": 0.7877, "step": 349 }, { "epoch": 0.7654455986878076, "grad_norm": 0.28032955527305603, "learning_rate": 0.0002484184914841849, "loss": 0.5934, "step": 350 }, { "epoch": 0.7676325861126299, "grad_norm": 0.35794079303741455, "learning_rate": 0.0002481751824817518, "loss": 0.736, "step": 351 }, { "epoch": 0.7698195735374521, "grad_norm": 0.1937468945980072, "learning_rate": 0.0002479318734793187, "loss": 0.7268, "step": 352 }, { "epoch": 0.7720065609622745, "grad_norm": 0.2442459911108017, "learning_rate": 0.0002476885644768856, "loss": 0.9092, "step": 353 }, { "epoch": 0.7741935483870968, "grad_norm": 0.2178357094526291, "learning_rate": 0.0002474452554744525, "loss": 0.832, "step": 354 }, { "epoch": 0.7763805358119191, "grad_norm": 0.2904297113418579, "learning_rate": 0.0002472019464720194, "loss": 0.6973, "step": 355 }, { "epoch": 0.7785675232367414, "grad_norm": 0.2849595248699188, "learning_rate": 0.00024695863746958637, "loss": 0.8439, "step": 356 }, { "epoch": 0.7807545106615636, "grad_norm": 0.30786654353141785, "learning_rate": 0.00024671532846715327, "loss": 0.8282, "step": 357 }, { "epoch": 0.782941498086386, "grad_norm": 0.2731088697910309, "learning_rate": 0.00024647201946472017, "loss": 0.7614, "step": 358 }, { "epoch": 0.7851284855112083, "grad_norm": 0.2967981696128845, "learning_rate": 0.0002462287104622871, "loss": 0.7059, "step": 359 }, { "epoch": 0.7873154729360307, "grad_norm": 0.2427809238433838, "learning_rate": 0.000245985401459854, "loss": 0.5235, "step": 360 }, { "epoch": 0.7895024603608529, "grad_norm": 0.3543761074542999, "learning_rate": 0.0002457420924574209, "loss": 0.6882, "step": 361 }, { "epoch": 0.7916894477856752, "grad_norm": 0.2084377259016037, "learning_rate": 0.0002454987834549878, "loss": 0.6333, "step": 362 }, { "epoch": 0.7938764352104976, "grad_norm": 0.3653489649295807, "learning_rate": 0.0002452554744525547, "loss": 0.8776, "step": 363 }, { "epoch": 0.7960634226353198, "grad_norm": 0.2806954085826874, "learning_rate": 0.0002450121654501216, "loss": 0.7464, "step": 364 }, { "epoch": 0.7982504100601422, "grad_norm": 0.3652292788028717, "learning_rate": 0.00024476885644768856, "loss": 0.93, "step": 365 }, { "epoch": 0.8004373974849645, "grad_norm": 0.24262574315071106, "learning_rate": 0.00024452554744525546, "loss": 0.8502, "step": 366 }, { "epoch": 0.8026243849097867, "grad_norm": 0.273867666721344, "learning_rate": 0.00024428223844282236, "loss": 0.9274, "step": 367 }, { "epoch": 0.8048113723346091, "grad_norm": 0.21722102165222168, "learning_rate": 0.0002440389294403893, "loss": 0.8045, "step": 368 }, { "epoch": 0.8069983597594313, "grad_norm": 0.19634899497032166, "learning_rate": 0.00024379562043795619, "loss": 0.7424, "step": 369 }, { "epoch": 0.8091853471842537, "grad_norm": 0.27201011776924133, "learning_rate": 0.00024355231143552308, "loss": 0.797, "step": 370 }, { "epoch": 0.811372334609076, "grad_norm": 0.254142165184021, "learning_rate": 0.00024330900243309, "loss": 0.6142, "step": 371 }, { "epoch": 0.8135593220338984, "grad_norm": 0.7009087204933167, "learning_rate": 0.0002430656934306569, "loss": 0.6703, "step": 372 }, { "epoch": 0.8157463094587206, "grad_norm": 0.2147742360830307, "learning_rate": 0.0002428223844282238, "loss": 0.8446, "step": 373 }, { "epoch": 0.8179332968835429, "grad_norm": 0.18214701116085052, "learning_rate": 0.00024257907542579076, "loss": 0.6536, "step": 374 }, { "epoch": 0.8201202843083653, "grad_norm": 0.22022093832492828, "learning_rate": 0.00024233576642335766, "loss": 0.7452, "step": 375 }, { "epoch": 0.8223072717331875, "grad_norm": 0.19220127165317535, "learning_rate": 0.00024209245742092456, "loss": 0.699, "step": 376 }, { "epoch": 0.8244942591580099, "grad_norm": 0.26980119943618774, "learning_rate": 0.00024184914841849148, "loss": 0.8433, "step": 377 }, { "epoch": 0.8266812465828322, "grad_norm": 0.1975000947713852, "learning_rate": 0.00024160583941605838, "loss": 0.5667, "step": 378 }, { "epoch": 0.8288682340076544, "grad_norm": 0.28691354393959045, "learning_rate": 0.00024136253041362528, "loss": 0.764, "step": 379 }, { "epoch": 0.8310552214324768, "grad_norm": 0.23176266252994537, "learning_rate": 0.0002411192214111922, "loss": 0.5348, "step": 380 }, { "epoch": 0.833242208857299, "grad_norm": 0.2583778202533722, "learning_rate": 0.0002408759124087591, "loss": 0.8583, "step": 381 }, { "epoch": 0.8354291962821214, "grad_norm": 0.1877242922782898, "learning_rate": 0.000240632603406326, "loss": 0.6818, "step": 382 }, { "epoch": 0.8376161837069437, "grad_norm": 0.3764333724975586, "learning_rate": 0.0002403892944038929, "loss": 0.8631, "step": 383 }, { "epoch": 0.839803171131766, "grad_norm": 0.30223846435546875, "learning_rate": 0.00024014598540145985, "loss": 0.7702, "step": 384 }, { "epoch": 0.8419901585565883, "grad_norm": 0.43627509474754333, "learning_rate": 0.00023990267639902675, "loss": 0.8994, "step": 385 }, { "epoch": 0.8441771459814106, "grad_norm": 0.2544715404510498, "learning_rate": 0.00023965936739659365, "loss": 0.6475, "step": 386 }, { "epoch": 0.846364133406233, "grad_norm": 0.23747164011001587, "learning_rate": 0.00023941605839416057, "loss": 0.7199, "step": 387 }, { "epoch": 0.8485511208310552, "grad_norm": 0.3392624855041504, "learning_rate": 0.00023917274939172747, "loss": 0.763, "step": 388 }, { "epoch": 0.8507381082558775, "grad_norm": 0.25245627760887146, "learning_rate": 0.00023892944038929437, "loss": 0.7532, "step": 389 }, { "epoch": 0.8529250956806999, "grad_norm": 0.2674003839492798, "learning_rate": 0.0002386861313868613, "loss": 0.599, "step": 390 }, { "epoch": 0.8551120831055221, "grad_norm": 0.27161166071891785, "learning_rate": 0.0002384428223844282, "loss": 0.9355, "step": 391 }, { "epoch": 0.8572990705303445, "grad_norm": 0.18150918185710907, "learning_rate": 0.0002381995133819951, "loss": 0.6056, "step": 392 }, { "epoch": 0.8594860579551667, "grad_norm": 0.22968190908432007, "learning_rate": 0.00023795620437956204, "loss": 0.767, "step": 393 }, { "epoch": 0.861673045379989, "grad_norm": 0.21685199439525604, "learning_rate": 0.00023771289537712894, "loss": 0.7246, "step": 394 }, { "epoch": 0.8638600328048114, "grad_norm": 0.26542550325393677, "learning_rate": 0.00023746958637469584, "loss": 0.7106, "step": 395 }, { "epoch": 0.8660470202296336, "grad_norm": 0.23525013029575348, "learning_rate": 0.00023722627737226277, "loss": 0.6958, "step": 396 }, { "epoch": 0.868234007654456, "grad_norm": 0.20633290708065033, "learning_rate": 0.00023698296836982967, "loss": 0.643, "step": 397 }, { "epoch": 0.8704209950792783, "grad_norm": 0.21550309658050537, "learning_rate": 0.00023673965936739656, "loss": 0.7449, "step": 398 }, { "epoch": 0.8726079825041007, "grad_norm": 0.2124805748462677, "learning_rate": 0.0002364963503649635, "loss": 0.7398, "step": 399 }, { "epoch": 0.8747949699289229, "grad_norm": 0.21294209361076355, "learning_rate": 0.0002362530413625304, "loss": 0.7934, "step": 400 }, { "epoch": 0.8769819573537452, "grad_norm": 0.36196568608283997, "learning_rate": 0.00023600973236009729, "loss": 0.7848, "step": 401 }, { "epoch": 0.8791689447785676, "grad_norm": 0.27596211433410645, "learning_rate": 0.0002357664233576642, "loss": 0.7286, "step": 402 }, { "epoch": 0.8813559322033898, "grad_norm": 0.27594348788261414, "learning_rate": 0.00023552311435523114, "loss": 0.8247, "step": 403 }, { "epoch": 0.8835429196282122, "grad_norm": 0.2970782518386841, "learning_rate": 0.00023527980535279804, "loss": 0.7548, "step": 404 }, { "epoch": 0.8857299070530344, "grad_norm": 0.39152461290359497, "learning_rate": 0.00023503649635036496, "loss": 0.8263, "step": 405 }, { "epoch": 0.8879168944778567, "grad_norm": 0.42587387561798096, "learning_rate": 0.00023479318734793186, "loss": 0.9905, "step": 406 }, { "epoch": 0.8901038819026791, "grad_norm": 0.314147412776947, "learning_rate": 0.00023454987834549876, "loss": 0.6665, "step": 407 }, { "epoch": 0.8922908693275013, "grad_norm": 0.34058940410614014, "learning_rate": 0.00023430656934306568, "loss": 0.7359, "step": 408 }, { "epoch": 0.8944778567523237, "grad_norm": 0.2528778612613678, "learning_rate": 0.00023406326034063258, "loss": 0.693, "step": 409 }, { "epoch": 0.896664844177146, "grad_norm": 0.17990703880786896, "learning_rate": 0.00023381995133819948, "loss": 0.7565, "step": 410 }, { "epoch": 0.8988518316019682, "grad_norm": 0.17062903940677643, "learning_rate": 0.0002335766423357664, "loss": 0.7891, "step": 411 }, { "epoch": 0.9010388190267906, "grad_norm": 0.3442295789718628, "learning_rate": 0.0002333333333333333, "loss": 0.6173, "step": 412 }, { "epoch": 0.9032258064516129, "grad_norm": 0.45662209391593933, "learning_rate": 0.0002330900243309002, "loss": 0.796, "step": 413 }, { "epoch": 0.9054127938764353, "grad_norm": 0.17335475981235504, "learning_rate": 0.00023284671532846715, "loss": 0.6825, "step": 414 }, { "epoch": 0.9075997813012575, "grad_norm": 0.22652967274188995, "learning_rate": 0.00023260340632603405, "loss": 0.7512, "step": 415 }, { "epoch": 0.9097867687260798, "grad_norm": 0.349649041891098, "learning_rate": 0.00023236009732360095, "loss": 0.8205, "step": 416 }, { "epoch": 0.9119737561509021, "grad_norm": 0.18699604272842407, "learning_rate": 0.00023211678832116788, "loss": 0.6451, "step": 417 }, { "epoch": 0.9141607435757244, "grad_norm": 0.2398325353860855, "learning_rate": 0.00023187347931873478, "loss": 0.6891, "step": 418 }, { "epoch": 0.9163477310005468, "grad_norm": 0.22116120159626007, "learning_rate": 0.00023163017031630167, "loss": 0.6765, "step": 419 }, { "epoch": 0.918534718425369, "grad_norm": 0.24642986059188843, "learning_rate": 0.00023138686131386857, "loss": 0.6119, "step": 420 }, { "epoch": 0.9207217058501913, "grad_norm": 0.2329958975315094, "learning_rate": 0.0002311435523114355, "loss": 0.7286, "step": 421 }, { "epoch": 0.9229086932750137, "grad_norm": 0.5355735421180725, "learning_rate": 0.0002309002433090024, "loss": 0.79, "step": 422 }, { "epoch": 0.9250956806998359, "grad_norm": 0.4554167091846466, "learning_rate": 0.0002306569343065693, "loss": 0.6942, "step": 423 }, { "epoch": 0.9272826681246583, "grad_norm": 0.2831968367099762, "learning_rate": 0.00023041362530413625, "loss": 0.7531, "step": 424 }, { "epoch": 0.9294696555494806, "grad_norm": 0.2321235090494156, "learning_rate": 0.00023017031630170315, "loss": 0.6902, "step": 425 }, { "epoch": 0.931656642974303, "grad_norm": 0.4006916880607605, "learning_rate": 0.00022992700729927004, "loss": 0.6725, "step": 426 }, { "epoch": 0.9338436303991252, "grad_norm": 0.3189490735530853, "learning_rate": 0.00022968369829683697, "loss": 0.769, "step": 427 }, { "epoch": 0.9360306178239475, "grad_norm": 0.4294585585594177, "learning_rate": 0.00022944038929440387, "loss": 0.8656, "step": 428 }, { "epoch": 0.9382176052487698, "grad_norm": 0.34347137808799744, "learning_rate": 0.00022919708029197077, "loss": 0.5948, "step": 429 }, { "epoch": 0.9404045926735921, "grad_norm": 0.21789056062698364, "learning_rate": 0.0002289537712895377, "loss": 0.8035, "step": 430 }, { "epoch": 0.9425915800984145, "grad_norm": 0.1835460364818573, "learning_rate": 0.0002287104622871046, "loss": 0.6128, "step": 431 }, { "epoch": 0.9447785675232367, "grad_norm": 0.3390374183654785, "learning_rate": 0.0002284671532846715, "loss": 0.7788, "step": 432 }, { "epoch": 0.946965554948059, "grad_norm": 0.23330353200435638, "learning_rate": 0.00022822384428223844, "loss": 0.7653, "step": 433 }, { "epoch": 0.9491525423728814, "grad_norm": 0.2357734590768814, "learning_rate": 0.00022798053527980534, "loss": 0.765, "step": 434 }, { "epoch": 0.9513395297977036, "grad_norm": 0.2517554759979248, "learning_rate": 0.00022773722627737224, "loss": 0.7815, "step": 435 }, { "epoch": 0.953526517222526, "grad_norm": 0.23417727649211884, "learning_rate": 0.00022749391727493916, "loss": 0.9801, "step": 436 }, { "epoch": 0.9557135046473483, "grad_norm": 0.256149023771286, "learning_rate": 0.00022725060827250606, "loss": 0.734, "step": 437 }, { "epoch": 0.9579004920721705, "grad_norm": 0.31608134508132935, "learning_rate": 0.00022700729927007296, "loss": 0.707, "step": 438 }, { "epoch": 0.9600874794969929, "grad_norm": 0.23100577294826508, "learning_rate": 0.00022676399026763989, "loss": 0.6734, "step": 439 }, { "epoch": 0.9622744669218152, "grad_norm": 0.27026960253715515, "learning_rate": 0.00022652068126520678, "loss": 0.7884, "step": 440 }, { "epoch": 0.9644614543466375, "grad_norm": 0.24245603382587433, "learning_rate": 0.00022627737226277368, "loss": 0.5405, "step": 441 }, { "epoch": 0.9666484417714598, "grad_norm": 0.25354650616645813, "learning_rate": 0.00022603406326034064, "loss": 0.629, "step": 442 }, { "epoch": 0.9688354291962821, "grad_norm": 0.35559025406837463, "learning_rate": 0.00022579075425790753, "loss": 0.5673, "step": 443 }, { "epoch": 0.9710224166211044, "grad_norm": 0.18353384733200073, "learning_rate": 0.00022554744525547443, "loss": 0.7391, "step": 444 }, { "epoch": 0.9732094040459267, "grad_norm": 0.20255619287490845, "learning_rate": 0.00022530413625304136, "loss": 0.605, "step": 445 }, { "epoch": 0.9753963914707491, "grad_norm": 0.24910545349121094, "learning_rate": 0.00022506082725060826, "loss": 0.7387, "step": 446 }, { "epoch": 0.9775833788955713, "grad_norm": 0.30054211616516113, "learning_rate": 0.00022481751824817515, "loss": 0.7649, "step": 447 }, { "epoch": 0.9797703663203936, "grad_norm": 0.2318667322397232, "learning_rate": 0.00022457420924574208, "loss": 0.6788, "step": 448 }, { "epoch": 0.981957353745216, "grad_norm": 0.27025488018989563, "learning_rate": 0.00022433090024330898, "loss": 0.8761, "step": 449 }, { "epoch": 0.9841443411700382, "grad_norm": 0.324431836605072, "learning_rate": 0.00022408759124087588, "loss": 0.5286, "step": 450 }, { "epoch": 0.9863313285948606, "grad_norm": 0.22321289777755737, "learning_rate": 0.00022384428223844283, "loss": 0.9685, "step": 451 }, { "epoch": 0.9885183160196829, "grad_norm": 0.348459929227829, "learning_rate": 0.00022360097323600973, "loss": 0.9153, "step": 452 }, { "epoch": 0.9907053034445052, "grad_norm": 0.24513466656208038, "learning_rate": 0.00022335766423357663, "loss": 0.7944, "step": 453 }, { "epoch": 0.9928922908693275, "grad_norm": 0.296447217464447, "learning_rate": 0.00022311435523114355, "loss": 0.7568, "step": 454 }, { "epoch": 0.9950792782941498, "grad_norm": 0.27960076928138733, "learning_rate": 0.00022287104622871045, "loss": 0.6744, "step": 455 }, { "epoch": 0.9972662657189721, "grad_norm": 0.2234726995229721, "learning_rate": 0.00022262773722627735, "loss": 0.8226, "step": 456 }, { "epoch": 0.9994532531437944, "grad_norm": 0.20796756446361542, "learning_rate": 0.00022238442822384425, "loss": 0.6815, "step": 457 }, { "epoch": 1.0016402405686167, "grad_norm": 0.4041379392147064, "learning_rate": 0.00022214111922141117, "loss": 0.814, "step": 458 }, { "epoch": 1.003827227993439, "grad_norm": 0.2340199053287506, "learning_rate": 0.00022189781021897807, "loss": 0.9068, "step": 459 }, { "epoch": 1.0060142154182614, "grad_norm": 0.24355943500995636, "learning_rate": 0.00022165450121654497, "loss": 0.8377, "step": 460 }, { "epoch": 1.0082012028430836, "grad_norm": 0.27959203720092773, "learning_rate": 0.00022141119221411192, "loss": 0.6917, "step": 461 }, { "epoch": 1.010388190267906, "grad_norm": 0.28080224990844727, "learning_rate": 0.00022116788321167882, "loss": 0.6356, "step": 462 }, { "epoch": 1.0125751776927283, "grad_norm": 0.48801225423812866, "learning_rate": 0.00022092457420924572, "loss": 0.5904, "step": 463 }, { "epoch": 1.0147621651175505, "grad_norm": 0.22513045370578766, "learning_rate": 0.00022068126520681264, "loss": 1.0814, "step": 464 }, { "epoch": 1.0169491525423728, "grad_norm": 0.24892054498195648, "learning_rate": 0.00022043795620437954, "loss": 0.682, "step": 465 }, { "epoch": 1.0191361399671952, "grad_norm": 0.27827882766723633, "learning_rate": 0.00022019464720194644, "loss": 0.5133, "step": 466 }, { "epoch": 1.0213231273920176, "grad_norm": 0.22580872476100922, "learning_rate": 0.00021995133819951337, "loss": 0.6408, "step": 467 }, { "epoch": 1.0235101148168397, "grad_norm": 0.27323248982429504, "learning_rate": 0.00021970802919708026, "loss": 0.6774, "step": 468 }, { "epoch": 1.025697102241662, "grad_norm": 0.2104388028383255, "learning_rate": 0.00021946472019464716, "loss": 0.7655, "step": 469 }, { "epoch": 1.0278840896664845, "grad_norm": 0.26010340452194214, "learning_rate": 0.00021922141119221412, "loss": 0.6855, "step": 470 }, { "epoch": 1.0300710770913066, "grad_norm": 0.22332607209682465, "learning_rate": 0.00021897810218978101, "loss": 0.8742, "step": 471 }, { "epoch": 1.032258064516129, "grad_norm": 0.22284770011901855, "learning_rate": 0.0002187347931873479, "loss": 0.7075, "step": 472 }, { "epoch": 1.0344450519409514, "grad_norm": 0.32503169775009155, "learning_rate": 0.00021849148418491484, "loss": 0.8198, "step": 473 }, { "epoch": 1.0366320393657737, "grad_norm": 0.2516832947731018, "learning_rate": 0.00021824817518248174, "loss": 0.6606, "step": 474 }, { "epoch": 1.038819026790596, "grad_norm": 0.20064838230609894, "learning_rate": 0.00021800486618004863, "loss": 0.6696, "step": 475 }, { "epoch": 1.0410060142154183, "grad_norm": 0.24873629212379456, "learning_rate": 0.00021776155717761556, "loss": 0.8343, "step": 476 }, { "epoch": 1.0431930016402406, "grad_norm": 0.23766379058361053, "learning_rate": 0.00021751824817518246, "loss": 0.6831, "step": 477 }, { "epoch": 1.0453799890650628, "grad_norm": 0.24385926127433777, "learning_rate": 0.00021727493917274936, "loss": 0.6712, "step": 478 }, { "epoch": 1.0475669764898852, "grad_norm": 0.3146672546863556, "learning_rate": 0.00021703163017031628, "loss": 0.6183, "step": 479 }, { "epoch": 1.0497539639147075, "grad_norm": 0.25711727142333984, "learning_rate": 0.0002167883211678832, "loss": 0.6252, "step": 480 }, { "epoch": 1.0519409513395297, "grad_norm": 0.2440115511417389, "learning_rate": 0.0002165450121654501, "loss": 0.7278, "step": 481 }, { "epoch": 1.054127938764352, "grad_norm": 0.2689894735813141, "learning_rate": 0.00021630170316301703, "loss": 0.8418, "step": 482 }, { "epoch": 1.0563149261891744, "grad_norm": 0.2136611044406891, "learning_rate": 0.00021605839416058393, "loss": 0.6313, "step": 483 }, { "epoch": 1.0585019136139968, "grad_norm": 0.2452273964881897, "learning_rate": 0.00021581508515815083, "loss": 0.8624, "step": 484 }, { "epoch": 1.060688901038819, "grad_norm": 0.24893832206726074, "learning_rate": 0.00021557177615571775, "loss": 0.7416, "step": 485 }, { "epoch": 1.0628758884636413, "grad_norm": 0.25064295530319214, "learning_rate": 0.00021532846715328465, "loss": 0.7699, "step": 486 }, { "epoch": 1.0650628758884637, "grad_norm": 0.20812906324863434, "learning_rate": 0.00021508515815085155, "loss": 0.6415, "step": 487 }, { "epoch": 1.0672498633132859, "grad_norm": 0.1655895859003067, "learning_rate": 0.00021484184914841848, "loss": 0.5422, "step": 488 }, { "epoch": 1.0694368507381082, "grad_norm": 0.32013434171676636, "learning_rate": 0.00021459854014598537, "loss": 0.7758, "step": 489 }, { "epoch": 1.0716238381629306, "grad_norm": 0.3376011252403259, "learning_rate": 0.00021435523114355227, "loss": 0.829, "step": 490 }, { "epoch": 1.0738108255877528, "grad_norm": 0.3153345584869385, "learning_rate": 0.00021411192214111923, "loss": 0.7714, "step": 491 }, { "epoch": 1.0759978130125751, "grad_norm": 0.3034818470478058, "learning_rate": 0.00021386861313868612, "loss": 0.6347, "step": 492 }, { "epoch": 1.0781848004373975, "grad_norm": 0.2922978699207306, "learning_rate": 0.00021362530413625302, "loss": 0.7736, "step": 493 }, { "epoch": 1.0803717878622199, "grad_norm": 0.2873200476169586, "learning_rate": 0.00021338199513381992, "loss": 0.7169, "step": 494 }, { "epoch": 1.082558775287042, "grad_norm": 0.19887448847293854, "learning_rate": 0.00021313868613138685, "loss": 0.591, "step": 495 }, { "epoch": 1.0847457627118644, "grad_norm": 0.2438717931509018, "learning_rate": 0.00021289537712895374, "loss": 0.7372, "step": 496 }, { "epoch": 1.0869327501366868, "grad_norm": 0.2844999432563782, "learning_rate": 0.00021265206812652064, "loss": 0.9492, "step": 497 }, { "epoch": 1.089119737561509, "grad_norm": 0.23038767278194427, "learning_rate": 0.00021240875912408757, "loss": 0.6491, "step": 498 }, { "epoch": 1.0913067249863313, "grad_norm": 0.25681063532829285, "learning_rate": 0.00021216545012165447, "loss": 0.7385, "step": 499 }, { "epoch": 1.0934937124111537, "grad_norm": 0.26198524236679077, "learning_rate": 0.00021192214111922137, "loss": 0.6631, "step": 500 }, { "epoch": 1.095680699835976, "grad_norm": 0.2462042272090912, "learning_rate": 0.00021167883211678832, "loss": 0.6845, "step": 501 }, { "epoch": 1.0978676872607982, "grad_norm": 0.4053664803504944, "learning_rate": 0.00021143552311435522, "loss": 0.8192, "step": 502 }, { "epoch": 1.1000546746856206, "grad_norm": 0.1960192620754242, "learning_rate": 0.00021119221411192211, "loss": 0.654, "step": 503 }, { "epoch": 1.102241662110443, "grad_norm": 0.288463294506073, "learning_rate": 0.00021094890510948904, "loss": 0.845, "step": 504 }, { "epoch": 1.104428649535265, "grad_norm": 0.2577453553676605, "learning_rate": 0.00021070559610705594, "loss": 0.7532, "step": 505 }, { "epoch": 1.1066156369600875, "grad_norm": 0.2428467571735382, "learning_rate": 0.00021046228710462284, "loss": 0.633, "step": 506 }, { "epoch": 1.1088026243849098, "grad_norm": 0.2504101097583771, "learning_rate": 0.00021021897810218976, "loss": 0.7633, "step": 507 }, { "epoch": 1.110989611809732, "grad_norm": 0.30137497186660767, "learning_rate": 0.00020997566909975666, "loss": 0.7516, "step": 508 }, { "epoch": 1.1131765992345544, "grad_norm": 0.26197975873947144, "learning_rate": 0.00020973236009732356, "loss": 0.772, "step": 509 }, { "epoch": 1.1153635866593767, "grad_norm": 0.21030549705028534, "learning_rate": 0.0002094890510948905, "loss": 0.656, "step": 510 }, { "epoch": 1.117550574084199, "grad_norm": 0.32491016387939453, "learning_rate": 0.0002092457420924574, "loss": 0.6437, "step": 511 }, { "epoch": 1.1197375615090213, "grad_norm": 0.35852229595184326, "learning_rate": 0.0002090024330900243, "loss": 0.6878, "step": 512 }, { "epoch": 1.1219245489338436, "grad_norm": 0.2437012642621994, "learning_rate": 0.00020875912408759123, "loss": 0.7602, "step": 513 }, { "epoch": 1.124111536358666, "grad_norm": 0.30889564752578735, "learning_rate": 0.00020851581508515813, "loss": 0.8807, "step": 514 }, { "epoch": 1.1262985237834882, "grad_norm": 0.24090994894504547, "learning_rate": 0.00020827250608272503, "loss": 0.6094, "step": 515 }, { "epoch": 1.1284855112083105, "grad_norm": 0.22549685835838318, "learning_rate": 0.00020802919708029196, "loss": 0.6548, "step": 516 }, { "epoch": 1.130672498633133, "grad_norm": 0.21927274763584137, "learning_rate": 0.00020778588807785885, "loss": 0.5024, "step": 517 }, { "epoch": 1.132859486057955, "grad_norm": 0.2773030996322632, "learning_rate": 0.00020754257907542575, "loss": 0.7162, "step": 518 }, { "epoch": 1.1350464734827774, "grad_norm": 0.23646964132785797, "learning_rate": 0.0002072992700729927, "loss": 0.495, "step": 519 }, { "epoch": 1.1372334609075998, "grad_norm": 0.18650543689727783, "learning_rate": 0.0002070559610705596, "loss": 0.6832, "step": 520 }, { "epoch": 1.1394204483324222, "grad_norm": 0.2712174952030182, "learning_rate": 0.0002068126520681265, "loss": 0.6178, "step": 521 }, { "epoch": 1.1416074357572443, "grad_norm": 0.5166855454444885, "learning_rate": 0.00020656934306569343, "loss": 0.7423, "step": 522 }, { "epoch": 1.1437944231820667, "grad_norm": 0.23658710718154907, "learning_rate": 0.00020632603406326033, "loss": 0.823, "step": 523 }, { "epoch": 1.145981410606889, "grad_norm": 0.2502736747264862, "learning_rate": 0.00020608272506082722, "loss": 0.7652, "step": 524 }, { "epoch": 1.1481683980317112, "grad_norm": 0.3579782545566559, "learning_rate": 0.00020583941605839415, "loss": 0.6607, "step": 525 }, { "epoch": 1.1503553854565336, "grad_norm": 0.23584862053394318, "learning_rate": 0.00020559610705596105, "loss": 0.5478, "step": 526 }, { "epoch": 1.152542372881356, "grad_norm": 0.20075763761997223, "learning_rate": 0.00020535279805352795, "loss": 0.4904, "step": 527 }, { "epoch": 1.1547293603061783, "grad_norm": 0.28536489605903625, "learning_rate": 0.0002051094890510949, "loss": 0.725, "step": 528 }, { "epoch": 1.1569163477310005, "grad_norm": 0.2919155955314636, "learning_rate": 0.0002048661800486618, "loss": 0.7854, "step": 529 }, { "epoch": 1.1591033351558229, "grad_norm": 0.2859315574169159, "learning_rate": 0.0002046228710462287, "loss": 0.7588, "step": 530 }, { "epoch": 1.1612903225806452, "grad_norm": 0.2310762107372284, "learning_rate": 0.0002043795620437956, "loss": 0.7313, "step": 531 }, { "epoch": 1.1634773100054674, "grad_norm": 0.37531688809394836, "learning_rate": 0.00020413625304136252, "loss": 0.7386, "step": 532 }, { "epoch": 1.1656642974302898, "grad_norm": 0.2388879358768463, "learning_rate": 0.00020389294403892942, "loss": 0.6976, "step": 533 }, { "epoch": 1.1678512848551121, "grad_norm": 0.35468119382858276, "learning_rate": 0.00020364963503649632, "loss": 0.7769, "step": 534 }, { "epoch": 1.1700382722799345, "grad_norm": 0.35036739706993103, "learning_rate": 0.00020340632603406324, "loss": 0.7023, "step": 535 }, { "epoch": 1.1722252597047567, "grad_norm": 0.22455590963363647, "learning_rate": 0.00020316301703163014, "loss": 0.6198, "step": 536 }, { "epoch": 1.174412247129579, "grad_norm": 0.2568056881427765, "learning_rate": 0.00020291970802919704, "loss": 0.8131, "step": 537 }, { "epoch": 1.1765992345544014, "grad_norm": 0.2159530222415924, "learning_rate": 0.000202676399026764, "loss": 0.608, "step": 538 }, { "epoch": 1.1787862219792236, "grad_norm": 0.3671428859233856, "learning_rate": 0.0002024330900243309, "loss": 0.7317, "step": 539 }, { "epoch": 1.180973209404046, "grad_norm": 0.40387099981307983, "learning_rate": 0.0002021897810218978, "loss": 0.7829, "step": 540 }, { "epoch": 1.1831601968288683, "grad_norm": 0.23750804364681244, "learning_rate": 0.00020194647201946471, "loss": 0.7261, "step": 541 }, { "epoch": 1.1853471842536905, "grad_norm": 0.29545098543167114, "learning_rate": 0.0002017031630170316, "loss": 0.641, "step": 542 }, { "epoch": 1.1875341716785128, "grad_norm": 0.28032809495925903, "learning_rate": 0.0002014598540145985, "loss": 0.5683, "step": 543 }, { "epoch": 1.1897211591033352, "grad_norm": 0.42475053668022156, "learning_rate": 0.00020121654501216544, "loss": 0.7681, "step": 544 }, { "epoch": 1.1919081465281574, "grad_norm": 0.3492116928100586, "learning_rate": 0.00020097323600973233, "loss": 0.6798, "step": 545 }, { "epoch": 1.1940951339529797, "grad_norm": 0.358916699886322, "learning_rate": 0.00020072992700729923, "loss": 0.7502, "step": 546 }, { "epoch": 1.196282121377802, "grad_norm": 0.27878785133361816, "learning_rate": 0.00020048661800486619, "loss": 0.7625, "step": 547 }, { "epoch": 1.1984691088026245, "grad_norm": 0.29086047410964966, "learning_rate": 0.00020024330900243308, "loss": 0.6944, "step": 548 }, { "epoch": 1.2006560962274466, "grad_norm": 0.2969072759151459, "learning_rate": 0.00019999999999999998, "loss": 0.7105, "step": 549 }, { "epoch": 1.202843083652269, "grad_norm": 0.38667795062065125, "learning_rate": 0.0001997566909975669, "loss": 0.7046, "step": 550 }, { "epoch": 1.2050300710770914, "grad_norm": 0.26905378699302673, "learning_rate": 0.0001995133819951338, "loss": 0.8177, "step": 551 }, { "epoch": 1.2072170585019135, "grad_norm": 0.25222644209861755, "learning_rate": 0.0001992700729927007, "loss": 0.7232, "step": 552 }, { "epoch": 1.209404045926736, "grad_norm": 0.23291464149951935, "learning_rate": 0.00019902676399026763, "loss": 0.6135, "step": 553 }, { "epoch": 1.2115910333515583, "grad_norm": 0.24224941432476044, "learning_rate": 0.00019878345498783453, "loss": 0.6832, "step": 554 }, { "epoch": 1.2137780207763806, "grad_norm": 0.2552938759326935, "learning_rate": 0.00019854014598540143, "loss": 0.7707, "step": 555 }, { "epoch": 1.2159650082012028, "grad_norm": 0.3016825318336487, "learning_rate": 0.00019829683698296835, "loss": 0.6199, "step": 556 }, { "epoch": 1.2181519956260252, "grad_norm": 0.2980547547340393, "learning_rate": 0.00019805352798053528, "loss": 0.7232, "step": 557 }, { "epoch": 1.2203389830508475, "grad_norm": 0.3470471203327179, "learning_rate": 0.00019781021897810218, "loss": 0.6665, "step": 558 }, { "epoch": 1.2225259704756697, "grad_norm": 0.2844526171684265, "learning_rate": 0.0001975669099756691, "loss": 0.5931, "step": 559 }, { "epoch": 1.224712957900492, "grad_norm": 0.2751246988773346, "learning_rate": 0.000197323600973236, "loss": 0.6265, "step": 560 }, { "epoch": 1.2268999453253144, "grad_norm": 0.2560863792896271, "learning_rate": 0.0001970802919708029, "loss": 0.6442, "step": 561 }, { "epoch": 1.2290869327501368, "grad_norm": 0.28800928592681885, "learning_rate": 0.00019683698296836982, "loss": 0.7135, "step": 562 }, { "epoch": 1.231273920174959, "grad_norm": 0.44916409254074097, "learning_rate": 0.00019659367396593672, "loss": 0.654, "step": 563 }, { "epoch": 1.2334609075997813, "grad_norm": 0.28822582960128784, "learning_rate": 0.00019635036496350362, "loss": 0.7907, "step": 564 }, { "epoch": 1.2356478950246037, "grad_norm": 0.3168655037879944, "learning_rate": 0.00019610705596107055, "loss": 0.6821, "step": 565 }, { "epoch": 1.2378348824494259, "grad_norm": 0.24087372422218323, "learning_rate": 0.00019586374695863744, "loss": 0.5753, "step": 566 }, { "epoch": 1.2400218698742482, "grad_norm": 0.28054556250572205, "learning_rate": 0.00019562043795620434, "loss": 0.7782, "step": 567 }, { "epoch": 1.2422088572990706, "grad_norm": 0.2647920250892639, "learning_rate": 0.00019537712895377127, "loss": 0.672, "step": 568 }, { "epoch": 1.2443958447238928, "grad_norm": 0.2773146331310272, "learning_rate": 0.0001951338199513382, "loss": 0.6951, "step": 569 }, { "epoch": 1.2465828321487151, "grad_norm": 0.22990505397319794, "learning_rate": 0.0001948905109489051, "loss": 0.8364, "step": 570 }, { "epoch": 1.2487698195735375, "grad_norm": 0.27569764852523804, "learning_rate": 0.000194647201946472, "loss": 0.7833, "step": 571 }, { "epoch": 1.2509568069983596, "grad_norm": 0.2720679044723511, "learning_rate": 0.00019440389294403892, "loss": 0.6844, "step": 572 }, { "epoch": 1.253143794423182, "grad_norm": 0.31944793462753296, "learning_rate": 0.00019416058394160581, "loss": 0.7761, "step": 573 }, { "epoch": 1.2553307818480044, "grad_norm": 0.3249347507953644, "learning_rate": 0.0001939172749391727, "loss": 0.6429, "step": 574 }, { "epoch": 1.2575177692728268, "grad_norm": 0.3601590692996979, "learning_rate": 0.00019367396593673964, "loss": 0.7387, "step": 575 }, { "epoch": 1.259704756697649, "grad_norm": 0.30120986700057983, "learning_rate": 0.00019343065693430654, "loss": 0.7797, "step": 576 }, { "epoch": 1.2618917441224713, "grad_norm": 0.2647385895252228, "learning_rate": 0.00019318734793187344, "loss": 0.6112, "step": 577 }, { "epoch": 1.2640787315472937, "grad_norm": 0.2170192301273346, "learning_rate": 0.0001929440389294404, "loss": 0.6963, "step": 578 }, { "epoch": 1.2662657189721158, "grad_norm": 0.23418468236923218, "learning_rate": 0.0001927007299270073, "loss": 0.7496, "step": 579 }, { "epoch": 1.2684527063969382, "grad_norm": 0.29596206545829773, "learning_rate": 0.00019245742092457418, "loss": 0.8172, "step": 580 }, { "epoch": 1.2706396938217606, "grad_norm": 0.2754702568054199, "learning_rate": 0.0001922141119221411, "loss": 0.6895, "step": 581 }, { "epoch": 1.272826681246583, "grad_norm": 0.2041543573141098, "learning_rate": 0.000191970802919708, "loss": 0.7623, "step": 582 }, { "epoch": 1.275013668671405, "grad_norm": 0.3801957964897156, "learning_rate": 0.0001917274939172749, "loss": 0.634, "step": 583 }, { "epoch": 1.2772006560962275, "grad_norm": 0.39465653896331787, "learning_rate": 0.00019148418491484183, "loss": 0.6114, "step": 584 }, { "epoch": 1.2793876435210498, "grad_norm": 0.36799028515815735, "learning_rate": 0.00019124087591240873, "loss": 0.757, "step": 585 }, { "epoch": 1.281574630945872, "grad_norm": 0.2876284718513489, "learning_rate": 0.00019099756690997563, "loss": 0.6992, "step": 586 }, { "epoch": 1.2837616183706944, "grad_norm": 0.4593120813369751, "learning_rate": 0.00019075425790754258, "loss": 0.6095, "step": 587 }, { "epoch": 1.2859486057955167, "grad_norm": 0.24458545446395874, "learning_rate": 0.00019051094890510948, "loss": 0.5724, "step": 588 }, { "epoch": 1.288135593220339, "grad_norm": 0.22930872440338135, "learning_rate": 0.00019026763990267638, "loss": 0.5479, "step": 589 }, { "epoch": 1.2903225806451613, "grad_norm": 0.32167893648147583, "learning_rate": 0.0001900243309002433, "loss": 0.7158, "step": 590 }, { "epoch": 1.2925095680699836, "grad_norm": 0.2847557067871094, "learning_rate": 0.0001897810218978102, "loss": 0.6545, "step": 591 }, { "epoch": 1.2946965554948058, "grad_norm": 0.24358853697776794, "learning_rate": 0.0001895377128953771, "loss": 0.7497, "step": 592 }, { "epoch": 1.2968835429196282, "grad_norm": 0.26657119393348694, "learning_rate": 0.00018929440389294403, "loss": 0.6816, "step": 593 }, { "epoch": 1.2990705303444505, "grad_norm": 0.3368627727031708, "learning_rate": 0.00018905109489051093, "loss": 0.613, "step": 594 }, { "epoch": 1.301257517769273, "grad_norm": 0.28971466422080994, "learning_rate": 0.00018880778588807782, "loss": 0.814, "step": 595 }, { "epoch": 1.3034445051940953, "grad_norm": 0.3216496706008911, "learning_rate": 0.00018856447688564478, "loss": 0.7116, "step": 596 }, { "epoch": 1.3056314926189174, "grad_norm": 0.25016555190086365, "learning_rate": 0.00018832116788321167, "loss": 0.7034, "step": 597 }, { "epoch": 1.3078184800437398, "grad_norm": 0.2602551579475403, "learning_rate": 0.00018807785888077857, "loss": 0.6624, "step": 598 }, { "epoch": 1.310005467468562, "grad_norm": 0.1847269982099533, "learning_rate": 0.0001878345498783455, "loss": 0.6645, "step": 599 }, { "epoch": 1.3121924548933843, "grad_norm": 0.20593389868736267, "learning_rate": 0.0001875912408759124, "loss": 0.6471, "step": 600 }, { "epoch": 1.3143794423182067, "grad_norm": 0.2651140093803406, "learning_rate": 0.0001873479318734793, "loss": 0.6743, "step": 601 }, { "epoch": 1.316566429743029, "grad_norm": 0.3243972659111023, "learning_rate": 0.00018710462287104622, "loss": 0.662, "step": 602 }, { "epoch": 1.3187534171678512, "grad_norm": 0.24702341854572296, "learning_rate": 0.00018686131386861312, "loss": 0.746, "step": 603 }, { "epoch": 1.3209404045926736, "grad_norm": 0.25382477045059204, "learning_rate": 0.00018661800486618002, "loss": 0.7115, "step": 604 }, { "epoch": 1.323127392017496, "grad_norm": 0.26453620195388794, "learning_rate": 0.00018637469586374697, "loss": 0.5843, "step": 605 }, { "epoch": 1.3253143794423181, "grad_norm": 0.25161460041999817, "learning_rate": 0.00018613138686131387, "loss": 0.7831, "step": 606 }, { "epoch": 1.3275013668671405, "grad_norm": 0.2947143316268921, "learning_rate": 0.00018588807785888077, "loss": 0.6277, "step": 607 }, { "epoch": 1.3296883542919629, "grad_norm": 0.25893881916999817, "learning_rate": 0.00018564476885644767, "loss": 0.6816, "step": 608 }, { "epoch": 1.3318753417167852, "grad_norm": 0.3958803713321686, "learning_rate": 0.0001854014598540146, "loss": 0.8033, "step": 609 }, { "epoch": 1.3340623291416074, "grad_norm": 0.28083765506744385, "learning_rate": 0.0001851581508515815, "loss": 0.6587, "step": 610 }, { "epoch": 1.3362493165664298, "grad_norm": 0.26417723298072815, "learning_rate": 0.0001849148418491484, "loss": 0.6867, "step": 611 }, { "epoch": 1.3384363039912521, "grad_norm": 0.2628178000450134, "learning_rate": 0.0001846715328467153, "loss": 0.6275, "step": 612 }, { "epoch": 1.3406232914160743, "grad_norm": 0.20500022172927856, "learning_rate": 0.0001844282238442822, "loss": 0.6152, "step": 613 }, { "epoch": 1.3428102788408967, "grad_norm": 0.22486689686775208, "learning_rate": 0.0001841849148418491, "loss": 0.5407, "step": 614 }, { "epoch": 1.344997266265719, "grad_norm": 0.3170478641986847, "learning_rate": 0.00018394160583941606, "loss": 0.7176, "step": 615 }, { "epoch": 1.3471842536905414, "grad_norm": 0.34868374466896057, "learning_rate": 0.00018369829683698296, "loss": 0.5815, "step": 616 }, { "epoch": 1.3493712411153636, "grad_norm": 0.2484477013349533, "learning_rate": 0.00018345498783454986, "loss": 0.6613, "step": 617 }, { "epoch": 1.351558228540186, "grad_norm": 0.2799300253391266, "learning_rate": 0.00018321167883211678, "loss": 0.6685, "step": 618 }, { "epoch": 1.353745215965008, "grad_norm": 0.28434398770332336, "learning_rate": 0.00018296836982968368, "loss": 0.7881, "step": 619 }, { "epoch": 1.3559322033898304, "grad_norm": 0.25863373279571533, "learning_rate": 0.00018272506082725058, "loss": 0.7325, "step": 620 }, { "epoch": 1.3581191908146528, "grad_norm": 0.3039908707141876, "learning_rate": 0.0001824817518248175, "loss": 0.8676, "step": 621 }, { "epoch": 1.3603061782394752, "grad_norm": 0.29525163769721985, "learning_rate": 0.0001822384428223844, "loss": 0.8909, "step": 622 }, { "epoch": 1.3624931656642976, "grad_norm": 0.475063294172287, "learning_rate": 0.0001819951338199513, "loss": 0.6882, "step": 623 }, { "epoch": 1.3646801530891197, "grad_norm": 0.22500012814998627, "learning_rate": 0.00018175182481751826, "loss": 0.6354, "step": 624 }, { "epoch": 1.366867140513942, "grad_norm": 0.24890188872814178, "learning_rate": 0.00018150851581508515, "loss": 0.5322, "step": 625 }, { "epoch": 1.3690541279387642, "grad_norm": 0.24399027228355408, "learning_rate": 0.00018126520681265205, "loss": 0.7255, "step": 626 }, { "epoch": 1.3712411153635866, "grad_norm": 0.32299381494522095, "learning_rate": 0.00018102189781021898, "loss": 0.5199, "step": 627 }, { "epoch": 1.373428102788409, "grad_norm": 0.4946720600128174, "learning_rate": 0.00018077858880778588, "loss": 0.7099, "step": 628 }, { "epoch": 1.3756150902132314, "grad_norm": 0.47641122341156006, "learning_rate": 0.00018053527980535278, "loss": 0.752, "step": 629 }, { "epoch": 1.3778020776380535, "grad_norm": 0.3367193937301636, "learning_rate": 0.0001802919708029197, "loss": 0.7196, "step": 630 }, { "epoch": 1.3799890650628759, "grad_norm": 0.27993133664131165, "learning_rate": 0.0001800486618004866, "loss": 0.7357, "step": 631 }, { "epoch": 1.3821760524876983, "grad_norm": 0.27575206756591797, "learning_rate": 0.0001798053527980535, "loss": 0.6148, "step": 632 }, { "epoch": 1.3843630399125204, "grad_norm": 0.33214282989501953, "learning_rate": 0.00017956204379562042, "loss": 0.771, "step": 633 }, { "epoch": 1.3865500273373428, "grad_norm": 0.2970830798149109, "learning_rate": 0.00017931873479318735, "loss": 0.6882, "step": 634 }, { "epoch": 1.3887370147621652, "grad_norm": 0.3435869812965393, "learning_rate": 0.00017907542579075425, "loss": 0.6992, "step": 635 }, { "epoch": 1.3909240021869875, "grad_norm": 0.3328729569911957, "learning_rate": 0.00017883211678832117, "loss": 0.6594, "step": 636 }, { "epoch": 1.3931109896118097, "grad_norm": 0.3031856119632721, "learning_rate": 0.00017858880778588807, "loss": 0.642, "step": 637 }, { "epoch": 1.395297977036632, "grad_norm": 0.2761346399784088, "learning_rate": 0.00017834549878345497, "loss": 1.0442, "step": 638 }, { "epoch": 1.3974849644614544, "grad_norm": 0.34098902344703674, "learning_rate": 0.0001781021897810219, "loss": 0.9509, "step": 639 }, { "epoch": 1.3996719518862766, "grad_norm": 0.4181225299835205, "learning_rate": 0.0001778588807785888, "loss": 0.6521, "step": 640 }, { "epoch": 1.401858939311099, "grad_norm": 0.2533126473426819, "learning_rate": 0.0001776155717761557, "loss": 0.6221, "step": 641 }, { "epoch": 1.4040459267359213, "grad_norm": 0.25691646337509155, "learning_rate": 0.00017737226277372262, "loss": 0.5691, "step": 642 }, { "epoch": 1.4062329141607437, "grad_norm": 0.2649155557155609, "learning_rate": 0.00017712895377128952, "loss": 0.614, "step": 643 }, { "epoch": 1.4084199015855658, "grad_norm": 0.32973209023475647, "learning_rate": 0.00017688564476885641, "loss": 0.878, "step": 644 }, { "epoch": 1.4106068890103882, "grad_norm": 0.3559141755104065, "learning_rate": 0.00017664233576642334, "loss": 0.7954, "step": 645 }, { "epoch": 1.4127938764352104, "grad_norm": 0.2913306653499603, "learning_rate": 0.00017639902676399026, "loss": 0.735, "step": 646 }, { "epoch": 1.4149808638600327, "grad_norm": 0.24183817207813263, "learning_rate": 0.00017615571776155716, "loss": 0.5965, "step": 647 }, { "epoch": 1.4171678512848551, "grad_norm": 0.2638205885887146, "learning_rate": 0.00017591240875912406, "loss": 0.6843, "step": 648 }, { "epoch": 1.4193548387096775, "grad_norm": 0.23057186603546143, "learning_rate": 0.000175669099756691, "loss": 0.7453, "step": 649 }, { "epoch": 1.4215418261344999, "grad_norm": 0.22737360000610352, "learning_rate": 0.00017542579075425789, "loss": 0.5423, "step": 650 }, { "epoch": 1.423728813559322, "grad_norm": 0.25872430205345154, "learning_rate": 0.00017518248175182478, "loss": 0.7591, "step": 651 }, { "epoch": 1.4259158009841444, "grad_norm": 0.2998059391975403, "learning_rate": 0.0001749391727493917, "loss": 0.6222, "step": 652 }, { "epoch": 1.4281027884089665, "grad_norm": 0.21351587772369385, "learning_rate": 0.0001746958637469586, "loss": 0.7082, "step": 653 }, { "epoch": 1.430289775833789, "grad_norm": 0.34969425201416016, "learning_rate": 0.0001744525547445255, "loss": 0.6319, "step": 654 }, { "epoch": 1.4324767632586113, "grad_norm": 0.2845169007778168, "learning_rate": 0.00017420924574209246, "loss": 0.6965, "step": 655 }, { "epoch": 1.4346637506834337, "grad_norm": 0.2735065221786499, "learning_rate": 0.00017396593673965936, "loss": 0.6866, "step": 656 }, { "epoch": 1.4368507381082558, "grad_norm": 0.2701031267642975, "learning_rate": 0.00017372262773722626, "loss": 0.8098, "step": 657 }, { "epoch": 1.4390377255330782, "grad_norm": 0.319159597158432, "learning_rate": 0.00017347931873479318, "loss": 0.6627, "step": 658 }, { "epoch": 1.4412247129579006, "grad_norm": 0.24762673676013947, "learning_rate": 0.00017323600973236008, "loss": 0.8179, "step": 659 }, { "epoch": 1.4434117003827227, "grad_norm": 0.26977255940437317, "learning_rate": 0.00017299270072992698, "loss": 0.5487, "step": 660 }, { "epoch": 1.445598687807545, "grad_norm": 0.25042101740837097, "learning_rate": 0.0001727493917274939, "loss": 0.9502, "step": 661 }, { "epoch": 1.4477856752323675, "grad_norm": 0.28913062810897827, "learning_rate": 0.0001725060827250608, "loss": 0.7216, "step": 662 }, { "epoch": 1.4499726626571898, "grad_norm": 0.3237348198890686, "learning_rate": 0.0001722627737226277, "loss": 0.7644, "step": 663 }, { "epoch": 1.452159650082012, "grad_norm": 0.34338346123695374, "learning_rate": 0.00017201946472019465, "loss": 0.9851, "step": 664 }, { "epoch": 1.4543466375068343, "grad_norm": 0.1985798180103302, "learning_rate": 0.00017177615571776155, "loss": 0.649, "step": 665 }, { "epoch": 1.4565336249316567, "grad_norm": 0.2959745526313782, "learning_rate": 0.00017153284671532845, "loss": 0.8134, "step": 666 }, { "epoch": 1.4587206123564789, "grad_norm": 0.28383585810661316, "learning_rate": 0.00017128953771289537, "loss": 0.6864, "step": 667 }, { "epoch": 1.4609075997813012, "grad_norm": 0.35177820920944214, "learning_rate": 0.00017104622871046227, "loss": 0.779, "step": 668 }, { "epoch": 1.4630945872061236, "grad_norm": 0.27833032608032227, "learning_rate": 0.00017080291970802917, "loss": 0.7377, "step": 669 }, { "epoch": 1.465281574630946, "grad_norm": 0.26814982295036316, "learning_rate": 0.0001705596107055961, "loss": 0.6367, "step": 670 }, { "epoch": 1.4674685620557681, "grad_norm": 0.29226943850517273, "learning_rate": 0.000170316301703163, "loss": 0.6674, "step": 671 }, { "epoch": 1.4696555494805905, "grad_norm": 0.23404401540756226, "learning_rate": 0.0001700729927007299, "loss": 0.6187, "step": 672 }, { "epoch": 1.4718425369054127, "grad_norm": 0.1943274289369583, "learning_rate": 0.00016982968369829685, "loss": 0.7886, "step": 673 }, { "epoch": 1.474029524330235, "grad_norm": 0.2543155550956726, "learning_rate": 0.00016958637469586374, "loss": 0.8211, "step": 674 }, { "epoch": 1.4762165117550574, "grad_norm": 0.34419891238212585, "learning_rate": 0.00016934306569343064, "loss": 0.7097, "step": 675 }, { "epoch": 1.4784034991798798, "grad_norm": 0.3277907371520996, "learning_rate": 0.00016909975669099757, "loss": 0.6725, "step": 676 }, { "epoch": 1.4805904866047022, "grad_norm": 0.21943743526935577, "learning_rate": 0.00016885644768856447, "loss": 0.6246, "step": 677 }, { "epoch": 1.4827774740295243, "grad_norm": 0.6248902678489685, "learning_rate": 0.00016861313868613137, "loss": 0.8422, "step": 678 }, { "epoch": 1.4849644614543467, "grad_norm": 0.3430839478969574, "learning_rate": 0.0001683698296836983, "loss": 0.7539, "step": 679 }, { "epoch": 1.4871514488791688, "grad_norm": 0.25437131524086, "learning_rate": 0.0001681265206812652, "loss": 0.8793, "step": 680 }, { "epoch": 1.4893384363039912, "grad_norm": 0.44833317399024963, "learning_rate": 0.0001678832116788321, "loss": 0.7591, "step": 681 }, { "epoch": 1.4915254237288136, "grad_norm": 0.359467089176178, "learning_rate": 0.00016763990267639899, "loss": 0.6912, "step": 682 }, { "epoch": 1.493712411153636, "grad_norm": 0.3209226429462433, "learning_rate": 0.00016739659367396594, "loss": 0.6292, "step": 683 }, { "epoch": 1.495899398578458, "grad_norm": 0.30807530879974365, "learning_rate": 0.00016715328467153284, "loss": 0.7619, "step": 684 }, { "epoch": 1.4980863860032805, "grad_norm": 0.38420820236206055, "learning_rate": 0.00016690997566909974, "loss": 0.7212, "step": 685 }, { "epoch": 1.5002733734281026, "grad_norm": 0.27499136328697205, "learning_rate": 0.00016666666666666666, "loss": 0.7246, "step": 686 }, { "epoch": 1.502460360852925, "grad_norm": 0.3359529376029968, "learning_rate": 0.00016642335766423356, "loss": 0.7988, "step": 687 }, { "epoch": 1.5046473482777474, "grad_norm": 0.2965240180492401, "learning_rate": 0.00016618004866180046, "loss": 0.5721, "step": 688 }, { "epoch": 1.5068343357025697, "grad_norm": 0.35766786336898804, "learning_rate": 0.00016593673965936738, "loss": 0.8168, "step": 689 }, { "epoch": 1.5090213231273921, "grad_norm": 0.2500085234642029, "learning_rate": 0.00016569343065693428, "loss": 0.7125, "step": 690 }, { "epoch": 1.5112083105522143, "grad_norm": 0.4028027355670929, "learning_rate": 0.00016545012165450118, "loss": 0.8912, "step": 691 }, { "epoch": 1.5133952979770366, "grad_norm": 0.365488737821579, "learning_rate": 0.00016520681265206813, "loss": 0.8114, "step": 692 }, { "epoch": 1.5155822854018588, "grad_norm": 0.2998720109462738, "learning_rate": 0.00016496350364963503, "loss": 0.7185, "step": 693 }, { "epoch": 1.5177692728266812, "grad_norm": 0.31432968378067017, "learning_rate": 0.00016472019464720193, "loss": 0.6455, "step": 694 }, { "epoch": 1.5199562602515035, "grad_norm": 0.23023012280464172, "learning_rate": 0.00016447688564476886, "loss": 0.5255, "step": 695 }, { "epoch": 1.522143247676326, "grad_norm": 0.3279372453689575, "learning_rate": 0.00016423357664233575, "loss": 0.696, "step": 696 }, { "epoch": 1.5243302351011483, "grad_norm": 0.3116084635257721, "learning_rate": 0.00016399026763990265, "loss": 0.6297, "step": 697 }, { "epoch": 1.5265172225259704, "grad_norm": 0.2646781802177429, "learning_rate": 0.00016374695863746958, "loss": 0.7854, "step": 698 }, { "epoch": 1.5287042099507928, "grad_norm": 0.29048752784729004, "learning_rate": 0.00016350364963503648, "loss": 0.6409, "step": 699 }, { "epoch": 1.530891197375615, "grad_norm": 0.2570263743400574, "learning_rate": 0.00016326034063260337, "loss": 0.6613, "step": 700 }, { "epoch": 1.5330781848004373, "grad_norm": 0.3784395456314087, "learning_rate": 0.00016301703163017033, "loss": 0.5857, "step": 701 }, { "epoch": 1.5352651722252597, "grad_norm": 0.3324502110481262, "learning_rate": 0.00016277372262773723, "loss": 0.7317, "step": 702 }, { "epoch": 1.537452159650082, "grad_norm": 0.2623542249202728, "learning_rate": 0.00016253041362530412, "loss": 0.648, "step": 703 }, { "epoch": 1.5396391470749045, "grad_norm": 0.31035107374191284, "learning_rate": 0.00016228710462287105, "loss": 0.8125, "step": 704 }, { "epoch": 1.5418261344997266, "grad_norm": 0.35497644543647766, "learning_rate": 0.00016204379562043795, "loss": 0.7798, "step": 705 }, { "epoch": 1.544013121924549, "grad_norm": 0.4693346321582794, "learning_rate": 0.00016180048661800485, "loss": 0.7838, "step": 706 }, { "epoch": 1.5462001093493711, "grad_norm": 0.2803730368614197, "learning_rate": 0.00016155717761557177, "loss": 0.9113, "step": 707 }, { "epoch": 1.5483870967741935, "grad_norm": 0.3578079342842102, "learning_rate": 0.00016131386861313867, "loss": 0.6923, "step": 708 }, { "epoch": 1.5505740841990159, "grad_norm": 0.29390111565589905, "learning_rate": 0.00016107055961070557, "loss": 0.8407, "step": 709 }, { "epoch": 1.5527610716238383, "grad_norm": 0.32291004061698914, "learning_rate": 0.0001608272506082725, "loss": 0.8082, "step": 710 }, { "epoch": 1.5549480590486606, "grad_norm": 0.2640690803527832, "learning_rate": 0.00016058394160583942, "loss": 0.6813, "step": 711 }, { "epoch": 1.5571350464734828, "grad_norm": 0.32076698541641235, "learning_rate": 0.00016034063260340632, "loss": 0.8319, "step": 712 }, { "epoch": 1.559322033898305, "grad_norm": 0.29734277725219727, "learning_rate": 0.00016009732360097324, "loss": 0.9649, "step": 713 }, { "epoch": 1.5615090213231273, "grad_norm": 0.3353315591812134, "learning_rate": 0.00015985401459854014, "loss": 0.6102, "step": 714 }, { "epoch": 1.5636960087479497, "grad_norm": 0.24924345314502716, "learning_rate": 0.00015961070559610704, "loss": 0.6868, "step": 715 }, { "epoch": 1.565882996172772, "grad_norm": 0.21561355888843536, "learning_rate": 0.00015936739659367397, "loss": 0.6087, "step": 716 }, { "epoch": 1.5680699835975944, "grad_norm": 0.28856387734413147, "learning_rate": 0.00015912408759124086, "loss": 0.7849, "step": 717 }, { "epoch": 1.5702569710224166, "grad_norm": 0.2342023402452469, "learning_rate": 0.00015888077858880776, "loss": 0.8097, "step": 718 }, { "epoch": 1.572443958447239, "grad_norm": 0.27620434761047363, "learning_rate": 0.00015863746958637466, "loss": 0.6495, "step": 719 }, { "epoch": 1.574630945872061, "grad_norm": 0.3575909733772278, "learning_rate": 0.00015839416058394159, "loss": 0.5667, "step": 720 }, { "epoch": 1.5768179332968835, "grad_norm": 0.29075026512145996, "learning_rate": 0.00015815085158150848, "loss": 0.734, "step": 721 }, { "epoch": 1.5790049207217058, "grad_norm": 0.317648321390152, "learning_rate": 0.0001579075425790754, "loss": 0.6881, "step": 722 }, { "epoch": 1.5811919081465282, "grad_norm": 0.2477569282054901, "learning_rate": 0.00015766423357664234, "loss": 0.7097, "step": 723 }, { "epoch": 1.5833788955713506, "grad_norm": 0.2733086347579956, "learning_rate": 0.00015742092457420923, "loss": 0.4836, "step": 724 }, { "epoch": 1.5855658829961727, "grad_norm": 0.32278919219970703, "learning_rate": 0.00015717761557177613, "loss": 0.6931, "step": 725 }, { "epoch": 1.587752870420995, "grad_norm": 0.2804641127586365, "learning_rate": 0.00015693430656934306, "loss": 0.6908, "step": 726 }, { "epoch": 1.5899398578458173, "grad_norm": 0.28953608870506287, "learning_rate": 0.00015669099756690996, "loss": 0.7086, "step": 727 }, { "epoch": 1.5921268452706396, "grad_norm": 0.21297629177570343, "learning_rate": 0.00015644768856447685, "loss": 0.6663, "step": 728 }, { "epoch": 1.594313832695462, "grad_norm": 0.23495450615882874, "learning_rate": 0.00015620437956204378, "loss": 0.7177, "step": 729 }, { "epoch": 1.5965008201202844, "grad_norm": 0.4271846413612366, "learning_rate": 0.00015596107055961068, "loss": 0.9376, "step": 730 }, { "epoch": 1.5986878075451068, "grad_norm": 0.3190995156764984, "learning_rate": 0.00015571776155717758, "loss": 0.5957, "step": 731 }, { "epoch": 1.600874794969929, "grad_norm": 0.3533025085926056, "learning_rate": 0.00015547445255474453, "loss": 0.8295, "step": 732 }, { "epoch": 1.6030617823947513, "grad_norm": 0.48731425404548645, "learning_rate": 0.00015523114355231143, "loss": 0.7024, "step": 733 }, { "epoch": 1.6052487698195734, "grad_norm": 0.2876966595649719, "learning_rate": 0.00015498783454987833, "loss": 0.6858, "step": 734 }, { "epoch": 1.6074357572443958, "grad_norm": 0.2668203115463257, "learning_rate": 0.00015474452554744525, "loss": 0.7548, "step": 735 }, { "epoch": 1.6096227446692182, "grad_norm": 0.3176876902580261, "learning_rate": 0.00015450121654501215, "loss": 0.7124, "step": 736 }, { "epoch": 1.6118097320940405, "grad_norm": 0.3083260655403137, "learning_rate": 0.00015425790754257905, "loss": 0.682, "step": 737 }, { "epoch": 1.613996719518863, "grad_norm": 0.38110706210136414, "learning_rate": 0.00015401459854014597, "loss": 0.9364, "step": 738 }, { "epoch": 1.616183706943685, "grad_norm": 0.2112010270357132, "learning_rate": 0.00015377128953771287, "loss": 0.6111, "step": 739 }, { "epoch": 1.6183706943685072, "grad_norm": 0.320754736661911, "learning_rate": 0.00015352798053527977, "loss": 0.8463, "step": 740 }, { "epoch": 1.6205576817933296, "grad_norm": 0.2661709785461426, "learning_rate": 0.00015328467153284672, "loss": 0.6922, "step": 741 }, { "epoch": 1.622744669218152, "grad_norm": 0.28991788625717163, "learning_rate": 0.00015304136253041362, "loss": 0.683, "step": 742 }, { "epoch": 1.6249316566429743, "grad_norm": 0.23085246980190277, "learning_rate": 0.00015279805352798052, "loss": 0.6098, "step": 743 }, { "epoch": 1.6271186440677967, "grad_norm": 0.3355705440044403, "learning_rate": 0.00015255474452554745, "loss": 0.7358, "step": 744 }, { "epoch": 1.6293056314926189, "grad_norm": 0.2608512341976166, "learning_rate": 0.00015231143552311434, "loss": 0.6872, "step": 745 }, { "epoch": 1.6314926189174412, "grad_norm": 0.28092092275619507, "learning_rate": 0.00015206812652068124, "loss": 0.7605, "step": 746 }, { "epoch": 1.6336796063422634, "grad_norm": 0.3571244776248932, "learning_rate": 0.00015182481751824817, "loss": 0.5481, "step": 747 }, { "epoch": 1.6358665937670858, "grad_norm": 0.30611398816108704, "learning_rate": 0.00015158150851581507, "loss": 0.6696, "step": 748 }, { "epoch": 1.6380535811919081, "grad_norm": 0.32783061265945435, "learning_rate": 0.00015133819951338196, "loss": 0.8286, "step": 749 }, { "epoch": 1.6402405686167305, "grad_norm": 0.2778065502643585, "learning_rate": 0.00015109489051094892, "loss": 0.6223, "step": 750 }, { "epoch": 1.6424275560415529, "grad_norm": 0.2809867262840271, "learning_rate": 0.00015085158150851582, "loss": 0.4979, "step": 751 }, { "epoch": 1.644614543466375, "grad_norm": 0.3469402492046356, "learning_rate": 0.00015060827250608271, "loss": 0.7277, "step": 752 }, { "epoch": 1.6468015308911974, "grad_norm": 0.33360373973846436, "learning_rate": 0.00015036496350364964, "loss": 0.7133, "step": 753 }, { "epoch": 1.6489885183160196, "grad_norm": 0.24966338276863098, "learning_rate": 0.00015012165450121654, "loss": 0.8344, "step": 754 }, { "epoch": 1.651175505740842, "grad_norm": 0.35595226287841797, "learning_rate": 0.00014987834549878344, "loss": 0.5492, "step": 755 }, { "epoch": 1.6533624931656643, "grad_norm": 0.36205926537513733, "learning_rate": 0.00014963503649635036, "loss": 0.6962, "step": 756 }, { "epoch": 1.6555494805904867, "grad_norm": 0.3373574912548065, "learning_rate": 0.00014939172749391726, "loss": 0.9455, "step": 757 }, { "epoch": 1.657736468015309, "grad_norm": 0.2560804486274719, "learning_rate": 0.00014914841849148416, "loss": 0.6532, "step": 758 }, { "epoch": 1.6599234554401312, "grad_norm": 0.3424091339111328, "learning_rate": 0.00014890510948905108, "loss": 0.7255, "step": 759 }, { "epoch": 1.6621104428649536, "grad_norm": 0.3578891456127167, "learning_rate": 0.000148661800486618, "loss": 0.689, "step": 760 }, { "epoch": 1.6642974302897757, "grad_norm": 0.2998923659324646, "learning_rate": 0.0001484184914841849, "loss": 0.8305, "step": 761 }, { "epoch": 1.666484417714598, "grad_norm": 0.29691943526268005, "learning_rate": 0.0001481751824817518, "loss": 0.5745, "step": 762 }, { "epoch": 1.6686714051394205, "grad_norm": 0.26453182101249695, "learning_rate": 0.00014793187347931873, "loss": 0.6202, "step": 763 }, { "epoch": 1.6708583925642428, "grad_norm": 0.24131835997104645, "learning_rate": 0.00014768856447688563, "loss": 0.8149, "step": 764 }, { "epoch": 1.6730453799890652, "grad_norm": 0.5507832169532776, "learning_rate": 0.00014744525547445256, "loss": 0.7544, "step": 765 }, { "epoch": 1.6752323674138874, "grad_norm": 0.3100571930408478, "learning_rate": 0.00014720194647201945, "loss": 0.6096, "step": 766 }, { "epoch": 1.6774193548387095, "grad_norm": 0.40742942690849304, "learning_rate": 0.00014695863746958635, "loss": 0.8001, "step": 767 }, { "epoch": 1.679606342263532, "grad_norm": 0.26272064447402954, "learning_rate": 0.00014671532846715328, "loss": 0.6614, "step": 768 }, { "epoch": 1.6817933296883543, "grad_norm": 0.3485982418060303, "learning_rate": 0.00014647201946472018, "loss": 0.7596, "step": 769 }, { "epoch": 1.6839803171131766, "grad_norm": 0.3311547636985779, "learning_rate": 0.0001462287104622871, "loss": 0.808, "step": 770 }, { "epoch": 1.686167304537999, "grad_norm": 0.28489449620246887, "learning_rate": 0.000145985401459854, "loss": 0.683, "step": 771 }, { "epoch": 1.6883542919628212, "grad_norm": 0.23958906531333923, "learning_rate": 0.0001457420924574209, "loss": 0.619, "step": 772 }, { "epoch": 1.6905412793876435, "grad_norm": 0.2665773034095764, "learning_rate": 0.00014549878345498782, "loss": 0.7169, "step": 773 }, { "epoch": 1.6927282668124657, "grad_norm": 0.33576110005378723, "learning_rate": 0.00014525547445255475, "loss": 0.7457, "step": 774 }, { "epoch": 1.694915254237288, "grad_norm": 0.3103754222393036, "learning_rate": 0.00014501216545012165, "loss": 0.7083, "step": 775 }, { "epoch": 1.6971022416621104, "grad_norm": 0.27746620774269104, "learning_rate": 0.00014476885644768855, "loss": 0.7648, "step": 776 }, { "epoch": 1.6992892290869328, "grad_norm": 0.3597886264324188, "learning_rate": 0.00014452554744525547, "loss": 0.8173, "step": 777 }, { "epoch": 1.7014762165117552, "grad_norm": 0.2408217489719391, "learning_rate": 0.00014428223844282237, "loss": 0.5872, "step": 778 }, { "epoch": 1.7036632039365773, "grad_norm": 0.24239328503608704, "learning_rate": 0.0001440389294403893, "loss": 0.6311, "step": 779 }, { "epoch": 1.7058501913613997, "grad_norm": 0.4606420695781708, "learning_rate": 0.0001437956204379562, "loss": 0.6742, "step": 780 }, { "epoch": 1.7080371787862219, "grad_norm": 0.2773914933204651, "learning_rate": 0.0001435523114355231, "loss": 0.4933, "step": 781 }, { "epoch": 1.7102241662110442, "grad_norm": 0.33102571964263916, "learning_rate": 0.00014330900243309002, "loss": 0.7694, "step": 782 }, { "epoch": 1.7124111536358666, "grad_norm": 0.3455331027507782, "learning_rate": 0.00014306569343065692, "loss": 0.5662, "step": 783 }, { "epoch": 1.714598141060689, "grad_norm": 0.28522560000419617, "learning_rate": 0.00014282238442822384, "loss": 0.799, "step": 784 }, { "epoch": 1.7167851284855113, "grad_norm": 0.3302403688430786, "learning_rate": 0.00014257907542579074, "loss": 0.8366, "step": 785 }, { "epoch": 1.7189721159103335, "grad_norm": 0.2695009410381317, "learning_rate": 0.00014233576642335764, "loss": 0.5889, "step": 786 }, { "epoch": 1.7211591033351559, "grad_norm": 0.2292398363351822, "learning_rate": 0.00014209245742092456, "loss": 0.519, "step": 787 }, { "epoch": 1.723346090759978, "grad_norm": 0.2863897383213043, "learning_rate": 0.0001418491484184915, "loss": 0.6394, "step": 788 }, { "epoch": 1.7255330781848004, "grad_norm": 1.8092900514602661, "learning_rate": 0.0001416058394160584, "loss": 0.6393, "step": 789 }, { "epoch": 1.7277200656096228, "grad_norm": 0.3296603262424469, "learning_rate": 0.00014136253041362529, "loss": 0.7414, "step": 790 }, { "epoch": 1.7299070530344451, "grad_norm": 0.36179548501968384, "learning_rate": 0.0001411192214111922, "loss": 0.7689, "step": 791 }, { "epoch": 1.7320940404592675, "grad_norm": 0.3196108937263489, "learning_rate": 0.0001408759124087591, "loss": 0.681, "step": 792 }, { "epoch": 1.7342810278840897, "grad_norm": 0.3329809010028839, "learning_rate": 0.000140632603406326, "loss": 0.7421, "step": 793 }, { "epoch": 1.7364680153089118, "grad_norm": 0.22216172516345978, "learning_rate": 0.00014038929440389293, "loss": 0.6421, "step": 794 }, { "epoch": 1.7386550027337342, "grad_norm": 0.33266568183898926, "learning_rate": 0.00014014598540145983, "loss": 0.5699, "step": 795 }, { "epoch": 1.7408419901585566, "grad_norm": 0.3858932852745056, "learning_rate": 0.00013990267639902676, "loss": 0.7368, "step": 796 }, { "epoch": 1.743028977583379, "grad_norm": 0.3091468811035156, "learning_rate": 0.00013965936739659366, "loss": 0.6334, "step": 797 }, { "epoch": 1.7452159650082013, "grad_norm": 0.3596084415912628, "learning_rate": 0.00013941605839416055, "loss": 0.6, "step": 798 }, { "epoch": 1.7474029524330235, "grad_norm": 0.2971950173377991, "learning_rate": 0.00013917274939172748, "loss": 0.6638, "step": 799 }, { "epoch": 1.7495899398578458, "grad_norm": 0.36204877495765686, "learning_rate": 0.0001389294403892944, "loss": 0.6704, "step": 800 }, { "epoch": 1.751776927282668, "grad_norm": 0.25178369879722595, "learning_rate": 0.0001386861313868613, "loss": 0.6057, "step": 801 }, { "epoch": 1.7539639147074904, "grad_norm": 0.2541144788265228, "learning_rate": 0.0001384428223844282, "loss": 0.6294, "step": 802 }, { "epoch": 1.7561509021323127, "grad_norm": 0.31337326765060425, "learning_rate": 0.0001381995133819951, "loss": 0.7991, "step": 803 }, { "epoch": 1.758337889557135, "grad_norm": 0.8276956081390381, "learning_rate": 0.00013795620437956203, "loss": 0.9111, "step": 804 }, { "epoch": 1.7605248769819575, "grad_norm": 0.2656904458999634, "learning_rate": 0.00013771289537712895, "loss": 0.7048, "step": 805 }, { "epoch": 1.7627118644067796, "grad_norm": 0.3123759627342224, "learning_rate": 0.00013746958637469585, "loss": 0.816, "step": 806 }, { "epoch": 1.764898851831602, "grad_norm": 0.28710535168647766, "learning_rate": 0.00013722627737226275, "loss": 0.7998, "step": 807 }, { "epoch": 1.7670858392564242, "grad_norm": 0.28171730041503906, "learning_rate": 0.00013698296836982967, "loss": 0.6835, "step": 808 }, { "epoch": 1.7692728266812465, "grad_norm": 0.42397668957710266, "learning_rate": 0.00013673965936739657, "loss": 0.6875, "step": 809 }, { "epoch": 1.771459814106069, "grad_norm": 0.309830904006958, "learning_rate": 0.0001364963503649635, "loss": 0.7446, "step": 810 }, { "epoch": 1.7736468015308913, "grad_norm": 0.3108932375907898, "learning_rate": 0.0001362530413625304, "loss": 0.6415, "step": 811 }, { "epoch": 1.7758337889557136, "grad_norm": 0.34336167573928833, "learning_rate": 0.0001360097323600973, "loss": 0.688, "step": 812 }, { "epoch": 1.7780207763805358, "grad_norm": 0.2871513366699219, "learning_rate": 0.00013576642335766422, "loss": 0.8814, "step": 813 }, { "epoch": 1.7802077638053582, "grad_norm": 0.24412307143211365, "learning_rate": 0.00013552311435523115, "loss": 0.6767, "step": 814 }, { "epoch": 1.7823947512301803, "grad_norm": 0.3574623167514801, "learning_rate": 0.00013527980535279804, "loss": 0.7016, "step": 815 }, { "epoch": 1.7845817386550027, "grad_norm": 0.4434225261211395, "learning_rate": 0.00013503649635036494, "loss": 0.6373, "step": 816 }, { "epoch": 1.786768726079825, "grad_norm": 0.5134851932525635, "learning_rate": 0.00013479318734793187, "loss": 0.6622, "step": 817 }, { "epoch": 1.7889557135046474, "grad_norm": 0.4768081307411194, "learning_rate": 0.00013454987834549877, "loss": 0.7665, "step": 818 }, { "epoch": 1.7911427009294698, "grad_norm": 0.2798459231853485, "learning_rate": 0.0001343065693430657, "loss": 0.6625, "step": 819 }, { "epoch": 1.793329688354292, "grad_norm": 0.27218303084373474, "learning_rate": 0.0001340632603406326, "loss": 0.6266, "step": 820 }, { "epoch": 1.7955166757791141, "grad_norm": 0.287860244512558, "learning_rate": 0.0001338199513381995, "loss": 0.9758, "step": 821 }, { "epoch": 1.7977036632039365, "grad_norm": 0.26204392313957214, "learning_rate": 0.00013357664233576641, "loss": 0.532, "step": 822 }, { "epoch": 1.7998906506287589, "grad_norm": 0.29923009872436523, "learning_rate": 0.0001333333333333333, "loss": 0.6961, "step": 823 }, { "epoch": 1.8020776380535812, "grad_norm": 0.34140443801879883, "learning_rate": 0.00013309002433090024, "loss": 0.8296, "step": 824 }, { "epoch": 1.8042646254784036, "grad_norm": 0.2605873644351959, "learning_rate": 0.00013284671532846714, "loss": 0.8329, "step": 825 }, { "epoch": 1.8064516129032258, "grad_norm": 0.36522653698921204, "learning_rate": 0.00013260340632603403, "loss": 0.8552, "step": 826 }, { "epoch": 1.8086386003280481, "grad_norm": 0.29043689370155334, "learning_rate": 0.00013236009732360096, "loss": 0.7261, "step": 827 }, { "epoch": 1.8108255877528703, "grad_norm": 0.2861742675304413, "learning_rate": 0.00013211678832116789, "loss": 0.596, "step": 828 }, { "epoch": 1.8130125751776927, "grad_norm": 0.34066513180732727, "learning_rate": 0.00013187347931873478, "loss": 0.8127, "step": 829 }, { "epoch": 1.815199562602515, "grad_norm": 0.3166887164115906, "learning_rate": 0.00013163017031630168, "loss": 0.7491, "step": 830 }, { "epoch": 1.8173865500273374, "grad_norm": 0.36282384395599365, "learning_rate": 0.0001313868613138686, "loss": 0.7511, "step": 831 }, { "epoch": 1.8195735374521598, "grad_norm": 0.36424878239631653, "learning_rate": 0.0001311435523114355, "loss": 0.938, "step": 832 }, { "epoch": 1.821760524876982, "grad_norm": 0.3587567210197449, "learning_rate": 0.00013090024330900243, "loss": 0.8294, "step": 833 }, { "epoch": 1.8239475123018043, "grad_norm": 0.3000282049179077, "learning_rate": 0.00013065693430656933, "loss": 0.7178, "step": 834 }, { "epoch": 1.8261344997266264, "grad_norm": 0.2934707999229431, "learning_rate": 0.00013041362530413623, "loss": 0.7185, "step": 835 }, { "epoch": 1.8283214871514488, "grad_norm": 0.26312437653541565, "learning_rate": 0.00013017031630170315, "loss": 0.6128, "step": 836 }, { "epoch": 1.8305084745762712, "grad_norm": 0.27557966113090515, "learning_rate": 0.00012992700729927008, "loss": 0.6751, "step": 837 }, { "epoch": 1.8326954620010936, "grad_norm": 0.296512633562088, "learning_rate": 0.00012968369829683698, "loss": 0.8259, "step": 838 }, { "epoch": 1.834882449425916, "grad_norm": 0.4524163007736206, "learning_rate": 0.00012944038929440388, "loss": 0.6811, "step": 839 }, { "epoch": 1.837069436850738, "grad_norm": 0.32787275314331055, "learning_rate": 0.00012919708029197077, "loss": 0.6882, "step": 840 }, { "epoch": 1.8392564242755605, "grad_norm": 0.26250511407852173, "learning_rate": 0.0001289537712895377, "loss": 0.6858, "step": 841 }, { "epoch": 1.8414434117003826, "grad_norm": 0.32813650369644165, "learning_rate": 0.00012871046228710463, "loss": 0.5929, "step": 842 }, { "epoch": 1.843630399125205, "grad_norm": 0.3023451864719391, "learning_rate": 0.00012846715328467152, "loss": 0.7795, "step": 843 }, { "epoch": 1.8458173865500274, "grad_norm": 0.3112645745277405, "learning_rate": 0.00012822384428223842, "loss": 0.517, "step": 844 }, { "epoch": 1.8480043739748497, "grad_norm": 0.6681469678878784, "learning_rate": 0.00012798053527980535, "loss": 0.7089, "step": 845 }, { "epoch": 1.850191361399672, "grad_norm": 0.2592954933643341, "learning_rate": 0.00012773722627737225, "loss": 0.7007, "step": 846 }, { "epoch": 1.8523783488244943, "grad_norm": 0.31619131565093994, "learning_rate": 0.00012749391727493917, "loss": 0.4884, "step": 847 }, { "epoch": 1.8545653362493164, "grad_norm": 0.3551687002182007, "learning_rate": 0.00012725060827250607, "loss": 0.5677, "step": 848 }, { "epoch": 1.8567523236741388, "grad_norm": 0.32219335436820984, "learning_rate": 0.00012700729927007297, "loss": 0.6744, "step": 849 }, { "epoch": 1.8589393110989612, "grad_norm": 0.28793492913246155, "learning_rate": 0.0001267639902676399, "loss": 0.6258, "step": 850 }, { "epoch": 1.8611262985237835, "grad_norm": 0.382720410823822, "learning_rate": 0.00012652068126520682, "loss": 0.7977, "step": 851 }, { "epoch": 1.863313285948606, "grad_norm": 0.33804479241371155, "learning_rate": 0.00012627737226277372, "loss": 0.7254, "step": 852 }, { "epoch": 1.865500273373428, "grad_norm": 0.3259097635746002, "learning_rate": 0.00012603406326034062, "loss": 0.8729, "step": 853 }, { "epoch": 1.8676872607982504, "grad_norm": 0.3584567606449127, "learning_rate": 0.00012579075425790754, "loss": 0.7337, "step": 854 }, { "epoch": 1.8698742482230726, "grad_norm": 0.336674302816391, "learning_rate": 0.00012554744525547444, "loss": 0.6829, "step": 855 }, { "epoch": 1.872061235647895, "grad_norm": 0.49990177154541016, "learning_rate": 0.00012530413625304137, "loss": 0.7793, "step": 856 }, { "epoch": 1.8742482230727173, "grad_norm": 0.31498992443084717, "learning_rate": 0.00012506082725060826, "loss": 0.7355, "step": 857 }, { "epoch": 1.8764352104975397, "grad_norm": 0.3050641119480133, "learning_rate": 0.00012481751824817516, "loss": 0.6473, "step": 858 }, { "epoch": 1.878622197922362, "grad_norm": 0.27067434787750244, "learning_rate": 0.0001245742092457421, "loss": 0.6639, "step": 859 }, { "epoch": 1.8808091853471842, "grad_norm": 0.29407691955566406, "learning_rate": 0.000124330900243309, "loss": 0.8002, "step": 860 }, { "epoch": 1.8829961727720066, "grad_norm": 0.3786459267139435, "learning_rate": 0.0001240875912408759, "loss": 0.8694, "step": 861 }, { "epoch": 1.8851831601968287, "grad_norm": 0.3678539991378784, "learning_rate": 0.0001238442822384428, "loss": 0.7188, "step": 862 }, { "epoch": 1.8873701476216511, "grad_norm": 0.3660300076007843, "learning_rate": 0.0001236009732360097, "loss": 0.7348, "step": 863 }, { "epoch": 1.8895571350464735, "grad_norm": 0.34265831112861633, "learning_rate": 0.00012335766423357663, "loss": 0.7046, "step": 864 }, { "epoch": 1.8917441224712959, "grad_norm": 0.3664507567882538, "learning_rate": 0.00012311435523114356, "loss": 0.777, "step": 865 }, { "epoch": 1.8939311098961182, "grad_norm": 0.36169371008872986, "learning_rate": 0.00012287104622871046, "loss": 0.6797, "step": 866 }, { "epoch": 1.8961180973209404, "grad_norm": 0.2904834449291229, "learning_rate": 0.00012262773722627736, "loss": 0.6406, "step": 867 }, { "epoch": 1.8983050847457628, "grad_norm": 0.3194887340068817, "learning_rate": 0.00012238442822384428, "loss": 0.7477, "step": 868 }, { "epoch": 1.900492072170585, "grad_norm": 0.24546030163764954, "learning_rate": 0.00012214111922141118, "loss": 0.6013, "step": 869 }, { "epoch": 1.9026790595954073, "grad_norm": 0.2817955017089844, "learning_rate": 0.00012189781021897809, "loss": 0.7813, "step": 870 }, { "epoch": 1.9048660470202297, "grad_norm": 0.28798621892929077, "learning_rate": 0.000121654501216545, "loss": 0.6312, "step": 871 }, { "epoch": 1.907053034445052, "grad_norm": 0.22041471302509308, "learning_rate": 0.0001214111922141119, "loss": 0.6671, "step": 872 }, { "epoch": 1.9092400218698744, "grad_norm": 0.45332956314086914, "learning_rate": 0.00012116788321167883, "loss": 0.7519, "step": 873 }, { "epoch": 1.9114270092946966, "grad_norm": 0.2907330393791199, "learning_rate": 0.00012092457420924574, "loss": 0.7048, "step": 874 }, { "epoch": 1.9136139967195187, "grad_norm": 0.3308665156364441, "learning_rate": 0.00012068126520681264, "loss": 0.6583, "step": 875 }, { "epoch": 1.915800984144341, "grad_norm": 0.314803808927536, "learning_rate": 0.00012043795620437955, "loss": 0.7902, "step": 876 }, { "epoch": 1.9179879715691635, "grad_norm": 0.47894173860549927, "learning_rate": 0.00012019464720194645, "loss": 0.7153, "step": 877 }, { "epoch": 1.9201749589939858, "grad_norm": 0.2984611392021179, "learning_rate": 0.00011995133819951337, "loss": 0.6093, "step": 878 }, { "epoch": 1.9223619464188082, "grad_norm": 0.5481080412864685, "learning_rate": 0.00011970802919708029, "loss": 0.7026, "step": 879 }, { "epoch": 1.9245489338436303, "grad_norm": 0.4306366443634033, "learning_rate": 0.00011946472019464718, "loss": 0.8093, "step": 880 }, { "epoch": 1.9267359212684527, "grad_norm": 0.4765607416629791, "learning_rate": 0.0001192214111922141, "loss": 0.8378, "step": 881 }, { "epoch": 1.9289229086932749, "grad_norm": 0.29230380058288574, "learning_rate": 0.00011897810218978102, "loss": 0.812, "step": 882 }, { "epoch": 1.9311098961180972, "grad_norm": 0.27519696950912476, "learning_rate": 0.00011873479318734792, "loss": 0.7204, "step": 883 }, { "epoch": 1.9332968835429196, "grad_norm": 0.43257808685302734, "learning_rate": 0.00011849148418491483, "loss": 0.7484, "step": 884 }, { "epoch": 1.935483870967742, "grad_norm": 0.34764620661735535, "learning_rate": 0.00011824817518248174, "loss": 0.7835, "step": 885 }, { "epoch": 1.9376708583925644, "grad_norm": 0.2872960567474365, "learning_rate": 0.00011800486618004864, "loss": 0.6871, "step": 886 }, { "epoch": 1.9398578458173865, "grad_norm": 0.3657885491847992, "learning_rate": 0.00011776155717761557, "loss": 0.7439, "step": 887 }, { "epoch": 1.942044833242209, "grad_norm": 0.3176083564758301, "learning_rate": 0.00011751824817518248, "loss": 0.6768, "step": 888 }, { "epoch": 1.944231820667031, "grad_norm": 0.2851628363132477, "learning_rate": 0.00011727493917274938, "loss": 0.6673, "step": 889 }, { "epoch": 1.9464188080918534, "grad_norm": 0.2601426839828491, "learning_rate": 0.00011703163017031629, "loss": 0.6025, "step": 890 }, { "epoch": 1.9486057955166758, "grad_norm": 0.282064288854599, "learning_rate": 0.0001167883211678832, "loss": 0.7084, "step": 891 }, { "epoch": 1.9507927829414982, "grad_norm": 0.2761860191822052, "learning_rate": 0.0001165450121654501, "loss": 0.7596, "step": 892 }, { "epoch": 1.9529797703663205, "grad_norm": 0.28319042921066284, "learning_rate": 0.00011630170316301703, "loss": 0.6179, "step": 893 }, { "epoch": 1.9551667577911427, "grad_norm": 0.3847699761390686, "learning_rate": 0.00011605839416058394, "loss": 0.7964, "step": 894 }, { "epoch": 1.957353745215965, "grad_norm": 0.5719382762908936, "learning_rate": 0.00011581508515815084, "loss": 0.7848, "step": 895 }, { "epoch": 1.9595407326407872, "grad_norm": 0.24546296894550323, "learning_rate": 0.00011557177615571775, "loss": 0.7404, "step": 896 }, { "epoch": 1.9617277200656096, "grad_norm": 0.2359631359577179, "learning_rate": 0.00011532846715328465, "loss": 0.6091, "step": 897 }, { "epoch": 1.963914707490432, "grad_norm": 0.23529179394245148, "learning_rate": 0.00011508515815085157, "loss": 0.7032, "step": 898 }, { "epoch": 1.9661016949152543, "grad_norm": 0.32363957166671753, "learning_rate": 0.00011484184914841848, "loss": 0.7238, "step": 899 }, { "epoch": 1.9682886823400767, "grad_norm": 0.24427059292793274, "learning_rate": 0.00011459854014598538, "loss": 0.6704, "step": 900 }, { "epoch": 1.9704756697648989, "grad_norm": 0.39608168601989746, "learning_rate": 0.0001143552311435523, "loss": 0.7251, "step": 901 }, { "epoch": 1.972662657189721, "grad_norm": 0.2778458297252655, "learning_rate": 0.00011411192214111922, "loss": 0.6907, "step": 902 }, { "epoch": 1.9748496446145434, "grad_norm": 0.38359907269477844, "learning_rate": 0.00011386861313868612, "loss": 0.792, "step": 903 }, { "epoch": 1.9770366320393657, "grad_norm": 0.2692561149597168, "learning_rate": 0.00011362530413625303, "loss": 0.505, "step": 904 }, { "epoch": 1.9792236194641881, "grad_norm": 0.35147660970687866, "learning_rate": 0.00011338199513381994, "loss": 0.6847, "step": 905 }, { "epoch": 1.9814106068890105, "grad_norm": 0.3441888689994812, "learning_rate": 0.00011313868613138684, "loss": 0.7633, "step": 906 }, { "epoch": 1.9835975943138326, "grad_norm": 0.22528661787509918, "learning_rate": 0.00011289537712895377, "loss": 0.6367, "step": 907 }, { "epoch": 1.985784581738655, "grad_norm": 0.34356188774108887, "learning_rate": 0.00011265206812652068, "loss": 0.8377, "step": 908 }, { "epoch": 1.9879715691634772, "grad_norm": 0.3173167109489441, "learning_rate": 0.00011240875912408758, "loss": 0.6651, "step": 909 }, { "epoch": 1.9901585565882995, "grad_norm": 0.2497638314962387, "learning_rate": 0.00011216545012165449, "loss": 0.7402, "step": 910 }, { "epoch": 1.992345544013122, "grad_norm": 0.28941065073013306, "learning_rate": 0.00011192214111922141, "loss": 0.7328, "step": 911 }, { "epoch": 1.9945325314379443, "grad_norm": 0.3209066092967987, "learning_rate": 0.00011167883211678831, "loss": 0.6639, "step": 912 }, { "epoch": 1.9967195188627667, "grad_norm": 0.2646278142929077, "learning_rate": 0.00011143552311435522, "loss": 0.6795, "step": 913 }, { "epoch": 1.9989065062875888, "grad_norm": 0.25543129444122314, "learning_rate": 0.00011119221411192212, "loss": 0.711, "step": 914 }, { "epoch": 2.001093493712411, "grad_norm": 0.37120577692985535, "learning_rate": 0.00011094890510948904, "loss": 0.909, "step": 915 }, { "epoch": 2.0032804811372333, "grad_norm": 0.20501375198364258, "learning_rate": 0.00011070559610705596, "loss": 0.5982, "step": 916 }, { "epoch": 2.0054674685620557, "grad_norm": 0.2816307544708252, "learning_rate": 0.00011046228710462286, "loss": 0.6477, "step": 917 }, { "epoch": 2.007654455986878, "grad_norm": 0.23481379449367523, "learning_rate": 0.00011021897810218977, "loss": 0.701, "step": 918 }, { "epoch": 2.0098414434117005, "grad_norm": 0.22269988059997559, "learning_rate": 0.00010997566909975668, "loss": 0.4909, "step": 919 }, { "epoch": 2.012028430836523, "grad_norm": 0.22761498391628265, "learning_rate": 0.00010973236009732358, "loss": 0.5446, "step": 920 }, { "epoch": 2.014215418261345, "grad_norm": 0.38109347224235535, "learning_rate": 0.00010948905109489051, "loss": 0.7502, "step": 921 }, { "epoch": 2.016402405686167, "grad_norm": 0.26273003220558167, "learning_rate": 0.00010924574209245742, "loss": 0.8272, "step": 922 }, { "epoch": 2.0185893931109895, "grad_norm": 0.2501181960105896, "learning_rate": 0.00010900243309002432, "loss": 0.6668, "step": 923 }, { "epoch": 2.020776380535812, "grad_norm": 0.2221994698047638, "learning_rate": 0.00010875912408759123, "loss": 0.5899, "step": 924 }, { "epoch": 2.0229633679606343, "grad_norm": 0.26471519470214844, "learning_rate": 0.00010851581508515814, "loss": 0.491, "step": 925 }, { "epoch": 2.0251503553854566, "grad_norm": 0.29527121782302856, "learning_rate": 0.00010827250608272505, "loss": 0.6478, "step": 926 }, { "epoch": 2.027337342810279, "grad_norm": 0.2646641135215759, "learning_rate": 0.00010802919708029196, "loss": 0.6052, "step": 927 }, { "epoch": 2.029524330235101, "grad_norm": 0.2731557786464691, "learning_rate": 0.00010778588807785888, "loss": 0.7211, "step": 928 }, { "epoch": 2.0317113176599233, "grad_norm": 0.32770606875419617, "learning_rate": 0.00010754257907542578, "loss": 0.777, "step": 929 }, { "epoch": 2.0338983050847457, "grad_norm": 0.2406987100839615, "learning_rate": 0.00010729927007299269, "loss": 0.6697, "step": 930 }, { "epoch": 2.036085292509568, "grad_norm": 0.2938626706600189, "learning_rate": 0.00010705596107055961, "loss": 0.7645, "step": 931 }, { "epoch": 2.0382722799343904, "grad_norm": 0.25775012373924255, "learning_rate": 0.00010681265206812651, "loss": 0.721, "step": 932 }, { "epoch": 2.040459267359213, "grad_norm": 0.3010717034339905, "learning_rate": 0.00010656934306569342, "loss": 0.565, "step": 933 }, { "epoch": 2.042646254784035, "grad_norm": 0.27577218413352966, "learning_rate": 0.00010632603406326032, "loss": 0.5764, "step": 934 }, { "epoch": 2.044833242208857, "grad_norm": 0.3049190938472748, "learning_rate": 0.00010608272506082723, "loss": 0.8492, "step": 935 }, { "epoch": 2.0470202296336795, "grad_norm": 0.3621160686016083, "learning_rate": 0.00010583941605839416, "loss": 0.668, "step": 936 }, { "epoch": 2.049207217058502, "grad_norm": 0.28885042667388916, "learning_rate": 0.00010559610705596106, "loss": 0.6898, "step": 937 }, { "epoch": 2.051394204483324, "grad_norm": 0.38116586208343506, "learning_rate": 0.00010535279805352797, "loss": 0.8778, "step": 938 }, { "epoch": 2.0535811919081466, "grad_norm": 0.3027772903442383, "learning_rate": 0.00010510948905109488, "loss": 0.6428, "step": 939 }, { "epoch": 2.055768179332969, "grad_norm": 0.20893897116184235, "learning_rate": 0.00010486618004866178, "loss": 0.6471, "step": 940 }, { "epoch": 2.0579551667577913, "grad_norm": 0.281434565782547, "learning_rate": 0.0001046228710462287, "loss": 0.6593, "step": 941 }, { "epoch": 2.0601421541826133, "grad_norm": 0.3276302218437195, "learning_rate": 0.00010437956204379562, "loss": 0.6077, "step": 942 }, { "epoch": 2.0623291416074356, "grad_norm": 0.35327035188674927, "learning_rate": 0.00010413625304136252, "loss": 0.5687, "step": 943 }, { "epoch": 2.064516129032258, "grad_norm": 0.3210618197917938, "learning_rate": 0.00010389294403892943, "loss": 0.6685, "step": 944 }, { "epoch": 2.0667031164570804, "grad_norm": 0.25362011790275574, "learning_rate": 0.00010364963503649635, "loss": 0.5067, "step": 945 }, { "epoch": 2.0688901038819028, "grad_norm": 0.2774200439453125, "learning_rate": 0.00010340632603406325, "loss": 0.7696, "step": 946 }, { "epoch": 2.071077091306725, "grad_norm": 0.39397120475769043, "learning_rate": 0.00010316301703163016, "loss": 0.7109, "step": 947 }, { "epoch": 2.0732640787315475, "grad_norm": 0.2712627947330475, "learning_rate": 0.00010291970802919708, "loss": 0.5855, "step": 948 }, { "epoch": 2.0754510661563694, "grad_norm": 0.20961184799671173, "learning_rate": 0.00010267639902676397, "loss": 0.6223, "step": 949 }, { "epoch": 2.077638053581192, "grad_norm": 0.35785865783691406, "learning_rate": 0.0001024330900243309, "loss": 0.6426, "step": 950 }, { "epoch": 2.079825041006014, "grad_norm": 0.30317097902297974, "learning_rate": 0.0001021897810218978, "loss": 0.5881, "step": 951 }, { "epoch": 2.0820120284308365, "grad_norm": 0.2647455632686615, "learning_rate": 0.00010194647201946471, "loss": 0.4753, "step": 952 }, { "epoch": 2.084199015855659, "grad_norm": 0.2377641350030899, "learning_rate": 0.00010170316301703162, "loss": 0.7245, "step": 953 }, { "epoch": 2.0863860032804813, "grad_norm": 0.4126327633857727, "learning_rate": 0.00010145985401459852, "loss": 0.7418, "step": 954 }, { "epoch": 2.0885729907053037, "grad_norm": 0.372079998254776, "learning_rate": 0.00010121654501216545, "loss": 0.5861, "step": 955 }, { "epoch": 2.0907599781301256, "grad_norm": 0.35693153738975525, "learning_rate": 0.00010097323600973236, "loss": 0.63, "step": 956 }, { "epoch": 2.092946965554948, "grad_norm": 0.3220914304256439, "learning_rate": 0.00010072992700729926, "loss": 0.6541, "step": 957 }, { "epoch": 2.0951339529797703, "grad_norm": 0.28749874234199524, "learning_rate": 0.00010048661800486617, "loss": 0.5944, "step": 958 }, { "epoch": 2.0973209404045927, "grad_norm": 0.27125856280326843, "learning_rate": 0.00010024330900243309, "loss": 0.546, "step": 959 }, { "epoch": 2.099507927829415, "grad_norm": 0.32414090633392334, "learning_rate": 9.999999999999999e-05, "loss": 0.5295, "step": 960 }, { "epoch": 2.1016949152542375, "grad_norm": 0.37579938769340515, "learning_rate": 9.97566909975669e-05, "loss": 0.6202, "step": 961 }, { "epoch": 2.1038819026790594, "grad_norm": 0.3326401710510254, "learning_rate": 9.951338199513382e-05, "loss": 0.5674, "step": 962 }, { "epoch": 2.1060688901038818, "grad_norm": 0.2777692377567291, "learning_rate": 9.927007299270071e-05, "loss": 0.5297, "step": 963 }, { "epoch": 2.108255877528704, "grad_norm": 0.3658103942871094, "learning_rate": 9.902676399026764e-05, "loss": 0.6001, "step": 964 }, { "epoch": 2.1104428649535265, "grad_norm": 0.30180448293685913, "learning_rate": 9.878345498783455e-05, "loss": 0.627, "step": 965 }, { "epoch": 2.112629852378349, "grad_norm": 0.3160865604877472, "learning_rate": 9.854014598540145e-05, "loss": 0.6583, "step": 966 }, { "epoch": 2.1148168398031713, "grad_norm": 0.38876181840896606, "learning_rate": 9.829683698296836e-05, "loss": 0.7201, "step": 967 }, { "epoch": 2.1170038272279936, "grad_norm": 0.32533615827560425, "learning_rate": 9.805352798053527e-05, "loss": 0.5814, "step": 968 }, { "epoch": 2.1191908146528156, "grad_norm": 0.2723495662212372, "learning_rate": 9.781021897810217e-05, "loss": 0.7299, "step": 969 }, { "epoch": 2.121377802077638, "grad_norm": 0.3380286693572998, "learning_rate": 9.75669099756691e-05, "loss": 0.8313, "step": 970 }, { "epoch": 2.1235647895024603, "grad_norm": 0.3675851821899414, "learning_rate": 9.7323600973236e-05, "loss": 0.5859, "step": 971 }, { "epoch": 2.1257517769272827, "grad_norm": 0.32205119729042053, "learning_rate": 9.708029197080291e-05, "loss": 0.78, "step": 972 }, { "epoch": 2.127938764352105, "grad_norm": 0.3244129419326782, "learning_rate": 9.683698296836982e-05, "loss": 0.6777, "step": 973 }, { "epoch": 2.1301257517769274, "grad_norm": 0.3449605405330658, "learning_rate": 9.659367396593672e-05, "loss": 0.654, "step": 974 }, { "epoch": 2.13231273920175, "grad_norm": 0.3051266670227051, "learning_rate": 9.635036496350364e-05, "loss": 0.6204, "step": 975 }, { "epoch": 2.1344997266265717, "grad_norm": 0.29881876707077026, "learning_rate": 9.610705596107056e-05, "loss": 0.4543, "step": 976 }, { "epoch": 2.136686714051394, "grad_norm": 0.2953018546104431, "learning_rate": 9.586374695863745e-05, "loss": 0.7972, "step": 977 }, { "epoch": 2.1388737014762165, "grad_norm": 0.3214372992515564, "learning_rate": 9.562043795620437e-05, "loss": 0.6216, "step": 978 }, { "epoch": 2.141060688901039, "grad_norm": 0.31700441241264343, "learning_rate": 9.537712895377129e-05, "loss": 0.5708, "step": 979 }, { "epoch": 2.143247676325861, "grad_norm": 0.3516302704811096, "learning_rate": 9.513381995133819e-05, "loss": 0.7428, "step": 980 }, { "epoch": 2.1454346637506836, "grad_norm": 0.278621643781662, "learning_rate": 9.48905109489051e-05, "loss": 0.5118, "step": 981 }, { "epoch": 2.1476216511755055, "grad_norm": 0.39558589458465576, "learning_rate": 9.464720194647201e-05, "loss": 0.6228, "step": 982 }, { "epoch": 2.149808638600328, "grad_norm": 0.2623763382434845, "learning_rate": 9.440389294403891e-05, "loss": 0.5621, "step": 983 }, { "epoch": 2.1519956260251503, "grad_norm": 0.3559738099575043, "learning_rate": 9.416058394160584e-05, "loss": 0.6367, "step": 984 }, { "epoch": 2.1541826134499726, "grad_norm": 0.34260550141334534, "learning_rate": 9.391727493917275e-05, "loss": 0.6587, "step": 985 }, { "epoch": 2.156369600874795, "grad_norm": 0.3602772057056427, "learning_rate": 9.367396593673965e-05, "loss": 0.6749, "step": 986 }, { "epoch": 2.1585565882996174, "grad_norm": 0.4492672383785248, "learning_rate": 9.343065693430656e-05, "loss": 0.6159, "step": 987 }, { "epoch": 2.1607435757244398, "grad_norm": 0.30676203966140747, "learning_rate": 9.318734793187348e-05, "loss": 0.7105, "step": 988 }, { "epoch": 2.1629305631492617, "grad_norm": 0.2810410261154175, "learning_rate": 9.294403892944038e-05, "loss": 0.7091, "step": 989 }, { "epoch": 2.165117550574084, "grad_norm": 0.3161092698574066, "learning_rate": 9.27007299270073e-05, "loss": 0.6866, "step": 990 }, { "epoch": 2.1673045379989064, "grad_norm": 0.30391326546669006, "learning_rate": 9.24574209245742e-05, "loss": 0.6473, "step": 991 }, { "epoch": 2.169491525423729, "grad_norm": 0.33336496353149414, "learning_rate": 9.22141119221411e-05, "loss": 0.7565, "step": 992 }, { "epoch": 2.171678512848551, "grad_norm": 0.27083349227905273, "learning_rate": 9.197080291970803e-05, "loss": 0.602, "step": 993 }, { "epoch": 2.1738655002733736, "grad_norm": 0.3847806751728058, "learning_rate": 9.172749391727493e-05, "loss": 0.6034, "step": 994 }, { "epoch": 2.176052487698196, "grad_norm": 0.334309846162796, "learning_rate": 9.148418491484184e-05, "loss": 0.7368, "step": 995 }, { "epoch": 2.178239475123018, "grad_norm": 0.4568588435649872, "learning_rate": 9.124087591240875e-05, "loss": 0.6723, "step": 996 }, { "epoch": 2.1804264625478402, "grad_norm": 0.23190492391586304, "learning_rate": 9.099756690997565e-05, "loss": 0.5024, "step": 997 }, { "epoch": 2.1826134499726626, "grad_norm": 0.4212368130683899, "learning_rate": 9.075425790754258e-05, "loss": 0.5137, "step": 998 }, { "epoch": 2.184800437397485, "grad_norm": 0.3017450273036957, "learning_rate": 9.051094890510949e-05, "loss": 0.659, "step": 999 }, { "epoch": 2.1869874248223073, "grad_norm": 0.32203611731529236, "learning_rate": 9.026763990267639e-05, "loss": 0.6198, "step": 1000 }, { "epoch": 2.1891744122471297, "grad_norm": 0.308056503534317, "learning_rate": 9.00243309002433e-05, "loss": 0.5798, "step": 1001 }, { "epoch": 2.191361399671952, "grad_norm": 0.32163482904434204, "learning_rate": 8.978102189781021e-05, "loss": 0.4909, "step": 1002 }, { "epoch": 2.193548387096774, "grad_norm": 0.28082406520843506, "learning_rate": 8.953771289537712e-05, "loss": 0.5911, "step": 1003 }, { "epoch": 2.1957353745215964, "grad_norm": 0.3853447139263153, "learning_rate": 8.929440389294404e-05, "loss": 0.601, "step": 1004 }, { "epoch": 2.1979223619464188, "grad_norm": 0.27736788988113403, "learning_rate": 8.905109489051095e-05, "loss": 0.5391, "step": 1005 }, { "epoch": 2.200109349371241, "grad_norm": 0.3074529767036438, "learning_rate": 8.880778588807785e-05, "loss": 0.5264, "step": 1006 }, { "epoch": 2.2022963367960635, "grad_norm": 0.34355053305625916, "learning_rate": 8.856447688564476e-05, "loss": 0.5479, "step": 1007 }, { "epoch": 2.204483324220886, "grad_norm": 0.25875043869018555, "learning_rate": 8.832116788321167e-05, "loss": 0.5133, "step": 1008 }, { "epoch": 2.2066703116457083, "grad_norm": 0.4600970447063446, "learning_rate": 8.807785888077858e-05, "loss": 0.7145, "step": 1009 }, { "epoch": 2.20885729907053, "grad_norm": 0.4292985796928406, "learning_rate": 8.78345498783455e-05, "loss": 0.8484, "step": 1010 }, { "epoch": 2.2110442864953526, "grad_norm": 0.38896313309669495, "learning_rate": 8.759124087591239e-05, "loss": 0.8592, "step": 1011 }, { "epoch": 2.213231273920175, "grad_norm": 0.32829031348228455, "learning_rate": 8.73479318734793e-05, "loss": 0.711, "step": 1012 }, { "epoch": 2.2154182613449973, "grad_norm": 0.32850679755210876, "learning_rate": 8.710462287104623e-05, "loss": 0.6644, "step": 1013 }, { "epoch": 2.2176052487698197, "grad_norm": 0.3872655928134918, "learning_rate": 8.686131386861313e-05, "loss": 0.7039, "step": 1014 }, { "epoch": 2.219792236194642, "grad_norm": 0.39074549078941345, "learning_rate": 8.661800486618004e-05, "loss": 0.6316, "step": 1015 }, { "epoch": 2.221979223619464, "grad_norm": 0.33514949679374695, "learning_rate": 8.637469586374695e-05, "loss": 0.7362, "step": 1016 }, { "epoch": 2.2241662110442864, "grad_norm": 0.37822842597961426, "learning_rate": 8.613138686131385e-05, "loss": 0.8549, "step": 1017 }, { "epoch": 2.2263531984691087, "grad_norm": 0.2988075911998749, "learning_rate": 8.588807785888078e-05, "loss": 0.6768, "step": 1018 }, { "epoch": 2.228540185893931, "grad_norm": 0.3298238515853882, "learning_rate": 8.564476885644769e-05, "loss": 0.661, "step": 1019 }, { "epoch": 2.2307271733187535, "grad_norm": 0.3168882429599762, "learning_rate": 8.540145985401459e-05, "loss": 0.5899, "step": 1020 }, { "epoch": 2.232914160743576, "grad_norm": 0.32149139046669006, "learning_rate": 8.51581508515815e-05, "loss": 0.6377, "step": 1021 }, { "epoch": 2.235101148168398, "grad_norm": 0.3840494453907013, "learning_rate": 8.491484184914842e-05, "loss": 0.5914, "step": 1022 }, { "epoch": 2.23728813559322, "grad_norm": 0.36953312158584595, "learning_rate": 8.467153284671532e-05, "loss": 0.6954, "step": 1023 }, { "epoch": 2.2394751230180425, "grad_norm": 0.3132734000682831, "learning_rate": 8.442822384428223e-05, "loss": 0.6778, "step": 1024 }, { "epoch": 2.241662110442865, "grad_norm": 0.3022383153438568, "learning_rate": 8.418491484184915e-05, "loss": 0.5681, "step": 1025 }, { "epoch": 2.2438490978676873, "grad_norm": 0.33297014236450195, "learning_rate": 8.394160583941604e-05, "loss": 1.0015, "step": 1026 }, { "epoch": 2.2460360852925096, "grad_norm": 0.2536577582359314, "learning_rate": 8.369829683698297e-05, "loss": 0.6535, "step": 1027 }, { "epoch": 2.248223072717332, "grad_norm": 0.3168553113937378, "learning_rate": 8.345498783454987e-05, "loss": 0.4617, "step": 1028 }, { "epoch": 2.250410060142154, "grad_norm": 0.41692110896110535, "learning_rate": 8.321167883211678e-05, "loss": 0.6289, "step": 1029 }, { "epoch": 2.2525970475669763, "grad_norm": 0.31276077032089233, "learning_rate": 8.296836982968369e-05, "loss": 0.6558, "step": 1030 }, { "epoch": 2.2547840349917987, "grad_norm": 0.382587730884552, "learning_rate": 8.272506082725059e-05, "loss": 0.7024, "step": 1031 }, { "epoch": 2.256971022416621, "grad_norm": 0.37239089608192444, "learning_rate": 8.248175182481752e-05, "loss": 0.6428, "step": 1032 }, { "epoch": 2.2591580098414434, "grad_norm": 0.3444945216178894, "learning_rate": 8.223844282238443e-05, "loss": 0.8301, "step": 1033 }, { "epoch": 2.261344997266266, "grad_norm": 0.32943612337112427, "learning_rate": 8.199513381995133e-05, "loss": 0.8259, "step": 1034 }, { "epoch": 2.263531984691088, "grad_norm": 0.3256615996360779, "learning_rate": 8.175182481751824e-05, "loss": 0.5633, "step": 1035 }, { "epoch": 2.26571897211591, "grad_norm": 0.38470467925071716, "learning_rate": 8.150851581508516e-05, "loss": 0.8342, "step": 1036 }, { "epoch": 2.2679059595407325, "grad_norm": 0.3568199872970581, "learning_rate": 8.126520681265206e-05, "loss": 0.6949, "step": 1037 }, { "epoch": 2.270092946965555, "grad_norm": 0.4587413966655731, "learning_rate": 8.102189781021897e-05, "loss": 0.855, "step": 1038 }, { "epoch": 2.2722799343903772, "grad_norm": 0.3806265890598297, "learning_rate": 8.077858880778589e-05, "loss": 0.7383, "step": 1039 }, { "epoch": 2.2744669218151996, "grad_norm": 0.34413963556289673, "learning_rate": 8.053527980535278e-05, "loss": 0.7618, "step": 1040 }, { "epoch": 2.276653909240022, "grad_norm": 0.41507622599601746, "learning_rate": 8.029197080291971e-05, "loss": 0.6976, "step": 1041 }, { "epoch": 2.2788408966648444, "grad_norm": 0.3527161777019501, "learning_rate": 8.004866180048662e-05, "loss": 0.6337, "step": 1042 }, { "epoch": 2.2810278840896663, "grad_norm": 0.405584454536438, "learning_rate": 7.980535279805352e-05, "loss": 0.8183, "step": 1043 }, { "epoch": 2.2832148715144887, "grad_norm": 0.41590583324432373, "learning_rate": 7.956204379562043e-05, "loss": 0.8062, "step": 1044 }, { "epoch": 2.285401858939311, "grad_norm": 0.41613471508026123, "learning_rate": 7.931873479318733e-05, "loss": 0.6246, "step": 1045 }, { "epoch": 2.2875888463641334, "grad_norm": 0.44034960865974426, "learning_rate": 7.907542579075424e-05, "loss": 0.8375, "step": 1046 }, { "epoch": 2.2897758337889558, "grad_norm": 0.3828635811805725, "learning_rate": 7.883211678832117e-05, "loss": 0.8442, "step": 1047 }, { "epoch": 2.291962821213778, "grad_norm": 0.3389468491077423, "learning_rate": 7.858880778588807e-05, "loss": 0.7997, "step": 1048 }, { "epoch": 2.2941498086386005, "grad_norm": 0.33413904905319214, "learning_rate": 7.834549878345498e-05, "loss": 0.6141, "step": 1049 }, { "epoch": 2.2963367960634224, "grad_norm": 0.32505419850349426, "learning_rate": 7.810218978102189e-05, "loss": 0.5001, "step": 1050 }, { "epoch": 2.298523783488245, "grad_norm": 0.3244943618774414, "learning_rate": 7.785888077858879e-05, "loss": 0.6723, "step": 1051 }, { "epoch": 2.300710770913067, "grad_norm": 0.3737221658229828, "learning_rate": 7.761557177615571e-05, "loss": 0.7168, "step": 1052 }, { "epoch": 2.3028977583378896, "grad_norm": 0.4390661120414734, "learning_rate": 7.737226277372263e-05, "loss": 0.5277, "step": 1053 }, { "epoch": 2.305084745762712, "grad_norm": 0.42460954189300537, "learning_rate": 7.712895377128952e-05, "loss": 0.7353, "step": 1054 }, { "epoch": 2.3072717331875343, "grad_norm": 0.3381803035736084, "learning_rate": 7.688564476885644e-05, "loss": 0.6313, "step": 1055 }, { "epoch": 2.3094587206123567, "grad_norm": 0.33968648314476013, "learning_rate": 7.664233576642336e-05, "loss": 0.5752, "step": 1056 }, { "epoch": 2.3116457080371786, "grad_norm": 0.34770649671554565, "learning_rate": 7.639902676399026e-05, "loss": 0.7087, "step": 1057 }, { "epoch": 2.313832695462001, "grad_norm": 0.27934038639068604, "learning_rate": 7.615571776155717e-05, "loss": 0.5717, "step": 1058 }, { "epoch": 2.3160196828868234, "grad_norm": 0.35276851058006287, "learning_rate": 7.591240875912408e-05, "loss": 0.5339, "step": 1059 }, { "epoch": 2.3182066703116457, "grad_norm": 0.31707894802093506, "learning_rate": 7.566909975669098e-05, "loss": 0.5097, "step": 1060 }, { "epoch": 2.320393657736468, "grad_norm": 0.47757935523986816, "learning_rate": 7.542579075425791e-05, "loss": 0.7004, "step": 1061 }, { "epoch": 2.3225806451612905, "grad_norm": 0.3273807764053345, "learning_rate": 7.518248175182482e-05, "loss": 0.6859, "step": 1062 }, { "epoch": 2.324767632586113, "grad_norm": 0.30111655592918396, "learning_rate": 7.493917274939172e-05, "loss": 0.4916, "step": 1063 }, { "epoch": 2.326954620010935, "grad_norm": 0.33053281903266907, "learning_rate": 7.469586374695863e-05, "loss": 0.6866, "step": 1064 }, { "epoch": 2.329141607435757, "grad_norm": 0.34993547201156616, "learning_rate": 7.445255474452554e-05, "loss": 0.6471, "step": 1065 }, { "epoch": 2.3313285948605795, "grad_norm": 0.2865176200866699, "learning_rate": 7.420924574209245e-05, "loss": 0.4927, "step": 1066 }, { "epoch": 2.333515582285402, "grad_norm": 0.43209540843963623, "learning_rate": 7.396593673965937e-05, "loss": 0.6368, "step": 1067 }, { "epoch": 2.3357025697102243, "grad_norm": 0.3290870189666748, "learning_rate": 7.372262773722628e-05, "loss": 0.739, "step": 1068 }, { "epoch": 2.3378895571350466, "grad_norm": 0.3443828225135803, "learning_rate": 7.347931873479318e-05, "loss": 0.8401, "step": 1069 }, { "epoch": 2.340076544559869, "grad_norm": 0.32021573185920715, "learning_rate": 7.323600973236009e-05, "loss": 0.7726, "step": 1070 }, { "epoch": 2.342263531984691, "grad_norm": 0.46182501316070557, "learning_rate": 7.2992700729927e-05, "loss": 0.9029, "step": 1071 }, { "epoch": 2.3444505194095133, "grad_norm": 0.35512760281562805, "learning_rate": 7.274939172749391e-05, "loss": 0.6847, "step": 1072 }, { "epoch": 2.3466375068343357, "grad_norm": 0.380140483379364, "learning_rate": 7.250608272506082e-05, "loss": 0.7038, "step": 1073 }, { "epoch": 2.348824494259158, "grad_norm": 0.32431280612945557, "learning_rate": 7.226277372262774e-05, "loss": 0.5294, "step": 1074 }, { "epoch": 2.3510114816839804, "grad_norm": 0.2768891453742981, "learning_rate": 7.201946472019465e-05, "loss": 0.5286, "step": 1075 }, { "epoch": 2.353198469108803, "grad_norm": 0.3334331214427948, "learning_rate": 7.177615571776155e-05, "loss": 0.6415, "step": 1076 }, { "epoch": 2.3553854565336247, "grad_norm": 0.41533592343330383, "learning_rate": 7.153284671532846e-05, "loss": 0.6295, "step": 1077 }, { "epoch": 2.357572443958447, "grad_norm": 0.42005178332328796, "learning_rate": 7.128953771289537e-05, "loss": 0.8451, "step": 1078 }, { "epoch": 2.3597594313832695, "grad_norm": 0.39049747586250305, "learning_rate": 7.104622871046228e-05, "loss": 0.8351, "step": 1079 }, { "epoch": 2.361946418808092, "grad_norm": 0.33119314908981323, "learning_rate": 7.08029197080292e-05, "loss": 0.5981, "step": 1080 }, { "epoch": 2.3641334062329142, "grad_norm": 0.4426044225692749, "learning_rate": 7.05596107055961e-05, "loss": 0.671, "step": 1081 }, { "epoch": 2.3663203936577366, "grad_norm": 0.3445340096950531, "learning_rate": 7.0316301703163e-05, "loss": 0.6182, "step": 1082 }, { "epoch": 2.3685073810825585, "grad_norm": 0.35596704483032227, "learning_rate": 7.007299270072992e-05, "loss": 0.7591, "step": 1083 }, { "epoch": 2.370694368507381, "grad_norm": 0.39532068371772766, "learning_rate": 6.982968369829683e-05, "loss": 0.5479, "step": 1084 }, { "epoch": 2.3728813559322033, "grad_norm": 0.3580004572868347, "learning_rate": 6.958637469586374e-05, "loss": 0.796, "step": 1085 }, { "epoch": 2.3750683433570257, "grad_norm": 0.5314396023750305, "learning_rate": 6.934306569343065e-05, "loss": 0.5986, "step": 1086 }, { "epoch": 2.377255330781848, "grad_norm": 0.5284639596939087, "learning_rate": 6.909975669099755e-05, "loss": 0.7934, "step": 1087 }, { "epoch": 2.3794423182066704, "grad_norm": 0.38761386275291443, "learning_rate": 6.885644768856448e-05, "loss": 0.6072, "step": 1088 }, { "epoch": 2.3816293056314928, "grad_norm": 0.3381224572658539, "learning_rate": 6.861313868613137e-05, "loss": 0.6392, "step": 1089 }, { "epoch": 2.3838162930563147, "grad_norm": 0.3654699921607971, "learning_rate": 6.836982968369829e-05, "loss": 0.6068, "step": 1090 }, { "epoch": 2.386003280481137, "grad_norm": 0.343288779258728, "learning_rate": 6.81265206812652e-05, "loss": 0.868, "step": 1091 }, { "epoch": 2.3881902679059595, "grad_norm": 0.3624615967273712, "learning_rate": 6.788321167883211e-05, "loss": 0.6408, "step": 1092 }, { "epoch": 2.390377255330782, "grad_norm": 0.3863930404186249, "learning_rate": 6.763990267639902e-05, "loss": 0.5778, "step": 1093 }, { "epoch": 2.392564242755604, "grad_norm": 0.34366974234580994, "learning_rate": 6.739659367396593e-05, "loss": 0.6983, "step": 1094 }, { "epoch": 2.3947512301804266, "grad_norm": 0.34117886424064636, "learning_rate": 6.715328467153285e-05, "loss": 0.6472, "step": 1095 }, { "epoch": 2.396938217605249, "grad_norm": 0.3547564148902893, "learning_rate": 6.690997566909974e-05, "loss": 0.5363, "step": 1096 }, { "epoch": 2.399125205030071, "grad_norm": 0.31432420015335083, "learning_rate": 6.666666666666666e-05, "loss": 0.5539, "step": 1097 }, { "epoch": 2.4013121924548932, "grad_norm": 0.45095062255859375, "learning_rate": 6.642335766423357e-05, "loss": 0.6494, "step": 1098 }, { "epoch": 2.4034991798797156, "grad_norm": 1.0102994441986084, "learning_rate": 6.618004866180048e-05, "loss": 0.988, "step": 1099 }, { "epoch": 2.405686167304538, "grad_norm": 0.5170231461524963, "learning_rate": 6.593673965936739e-05, "loss": 0.8045, "step": 1100 }, { "epoch": 2.4078731547293604, "grad_norm": 0.2993682622909546, "learning_rate": 6.56934306569343e-05, "loss": 0.5887, "step": 1101 }, { "epoch": 2.4100601421541827, "grad_norm": 0.29023849964141846, "learning_rate": 6.545012165450122e-05, "loss": 0.6123, "step": 1102 }, { "epoch": 2.412247129579005, "grad_norm": 0.4196130335330963, "learning_rate": 6.520681265206811e-05, "loss": 0.6444, "step": 1103 }, { "epoch": 2.414434117003827, "grad_norm": 0.43228599429130554, "learning_rate": 6.496350364963504e-05, "loss": 0.7432, "step": 1104 }, { "epoch": 2.4166211044286494, "grad_norm": 0.3056860566139221, "learning_rate": 6.472019464720194e-05, "loss": 0.6673, "step": 1105 }, { "epoch": 2.418808091853472, "grad_norm": 0.4213399887084961, "learning_rate": 6.447688564476885e-05, "loss": 0.798, "step": 1106 }, { "epoch": 2.420995079278294, "grad_norm": 0.4033665060997009, "learning_rate": 6.423357664233576e-05, "loss": 0.7835, "step": 1107 }, { "epoch": 2.4231820667031165, "grad_norm": 0.35071858763694763, "learning_rate": 6.399026763990267e-05, "loss": 0.7173, "step": 1108 }, { "epoch": 2.425369054127939, "grad_norm": 0.36336860060691833, "learning_rate": 6.374695863746959e-05, "loss": 0.6904, "step": 1109 }, { "epoch": 2.4275560415527613, "grad_norm": 0.4012874662876129, "learning_rate": 6.350364963503648e-05, "loss": 0.6062, "step": 1110 }, { "epoch": 2.429743028977583, "grad_norm": 0.3614816665649414, "learning_rate": 6.326034063260341e-05, "loss": 0.7757, "step": 1111 }, { "epoch": 2.4319300164024056, "grad_norm": 0.34320759773254395, "learning_rate": 6.301703163017031e-05, "loss": 0.6789, "step": 1112 }, { "epoch": 2.434117003827228, "grad_norm": 0.3566221594810486, "learning_rate": 6.277372262773722e-05, "loss": 0.7995, "step": 1113 }, { "epoch": 2.4363039912520503, "grad_norm": 0.35487961769104004, "learning_rate": 6.253041362530413e-05, "loss": 0.6536, "step": 1114 }, { "epoch": 2.4384909786768727, "grad_norm": 0.3311222195625305, "learning_rate": 6.228710462287104e-05, "loss": 0.589, "step": 1115 }, { "epoch": 2.440677966101695, "grad_norm": 0.36649906635284424, "learning_rate": 6.204379562043796e-05, "loss": 0.7062, "step": 1116 }, { "epoch": 2.4428649535265174, "grad_norm": 0.36625346541404724, "learning_rate": 6.180048661800485e-05, "loss": 0.6585, "step": 1117 }, { "epoch": 2.4450519409513394, "grad_norm": 0.47065046429634094, "learning_rate": 6.155717761557178e-05, "loss": 0.8547, "step": 1118 }, { "epoch": 2.4472389283761617, "grad_norm": 0.3721199333667755, "learning_rate": 6.131386861313868e-05, "loss": 0.7003, "step": 1119 }, { "epoch": 2.449425915800984, "grad_norm": 0.3814185559749603, "learning_rate": 6.107055961070559e-05, "loss": 0.6616, "step": 1120 }, { "epoch": 2.4516129032258065, "grad_norm": 0.34303221106529236, "learning_rate": 6.08272506082725e-05, "loss": 0.7311, "step": 1121 }, { "epoch": 2.453799890650629, "grad_norm": 0.31710198521614075, "learning_rate": 6.0583941605839414e-05, "loss": 0.6767, "step": 1122 }, { "epoch": 2.4559868780754512, "grad_norm": 0.378255158662796, "learning_rate": 6.034063260340632e-05, "loss": 0.5758, "step": 1123 }, { "epoch": 2.4581738655002736, "grad_norm": 0.3049505949020386, "learning_rate": 6.0097323600973225e-05, "loss": 0.7468, "step": 1124 }, { "epoch": 2.4603608529250955, "grad_norm": 0.31383493542671204, "learning_rate": 5.985401459854014e-05, "loss": 0.5064, "step": 1125 }, { "epoch": 2.462547840349918, "grad_norm": 0.4120381474494934, "learning_rate": 5.961070559610705e-05, "loss": 0.5933, "step": 1126 }, { "epoch": 2.4647348277747403, "grad_norm": 0.41584497690200806, "learning_rate": 5.936739659367396e-05, "loss": 0.6191, "step": 1127 }, { "epoch": 2.4669218151995627, "grad_norm": 0.4834405481815338, "learning_rate": 5.912408759124087e-05, "loss": 0.6092, "step": 1128 }, { "epoch": 2.469108802624385, "grad_norm": 0.30698856711387634, "learning_rate": 5.8880778588807784e-05, "loss": 0.6318, "step": 1129 }, { "epoch": 2.4712957900492074, "grad_norm": 0.42027831077575684, "learning_rate": 5.863746958637469e-05, "loss": 0.5981, "step": 1130 }, { "epoch": 2.4734827774740293, "grad_norm": 0.46082839369773865, "learning_rate": 5.83941605839416e-05, "loss": 0.7592, "step": 1131 }, { "epoch": 2.4756697648988517, "grad_norm": 0.3530132472515106, "learning_rate": 5.815085158150851e-05, "loss": 0.6589, "step": 1132 }, { "epoch": 2.477856752323674, "grad_norm": 0.40325507521629333, "learning_rate": 5.790754257907542e-05, "loss": 0.6136, "step": 1133 }, { "epoch": 2.4800437397484965, "grad_norm": 0.5407168865203857, "learning_rate": 5.7664233576642324e-05, "loss": 0.818, "step": 1134 }, { "epoch": 2.482230727173319, "grad_norm": 0.3995073139667511, "learning_rate": 5.742092457420924e-05, "loss": 0.7405, "step": 1135 }, { "epoch": 2.484417714598141, "grad_norm": 0.327036052942276, "learning_rate": 5.717761557177615e-05, "loss": 0.5611, "step": 1136 }, { "epoch": 2.486604702022963, "grad_norm": 0.4143662750720978, "learning_rate": 5.693430656934306e-05, "loss": 0.7194, "step": 1137 }, { "epoch": 2.4887916894477855, "grad_norm": 0.37465140223503113, "learning_rate": 5.669099756690997e-05, "loss": 0.8684, "step": 1138 }, { "epoch": 2.490978676872608, "grad_norm": 0.3546184301376343, "learning_rate": 5.644768856447688e-05, "loss": 0.5464, "step": 1139 }, { "epoch": 2.4931656642974303, "grad_norm": 0.5521944165229797, "learning_rate": 5.620437956204379e-05, "loss": 0.6143, "step": 1140 }, { "epoch": 2.4953526517222526, "grad_norm": 0.3398590385913849, "learning_rate": 5.596107055961071e-05, "loss": 0.7098, "step": 1141 }, { "epoch": 2.497539639147075, "grad_norm": 0.28899359703063965, "learning_rate": 5.571776155717761e-05, "loss": 0.6263, "step": 1142 }, { "epoch": 2.4997266265718974, "grad_norm": 0.3622675836086273, "learning_rate": 5.547445255474452e-05, "loss": 0.5183, "step": 1143 }, { "epoch": 2.5019136139967193, "grad_norm": 0.3359682261943817, "learning_rate": 5.523114355231143e-05, "loss": 0.7125, "step": 1144 }, { "epoch": 2.5041006014215417, "grad_norm": 0.42786240577697754, "learning_rate": 5.498783454987834e-05, "loss": 0.6445, "step": 1145 }, { "epoch": 2.506287588846364, "grad_norm": 0.340658575296402, "learning_rate": 5.4744525547445253e-05, "loss": 0.5709, "step": 1146 }, { "epoch": 2.5084745762711864, "grad_norm": 0.3030422031879425, "learning_rate": 5.450121654501216e-05, "loss": 0.5894, "step": 1147 }, { "epoch": 2.510661563696009, "grad_norm": 0.4911826550960541, "learning_rate": 5.425790754257907e-05, "loss": 0.6198, "step": 1148 }, { "epoch": 2.512848551120831, "grad_norm": 0.3828030824661255, "learning_rate": 5.401459854014598e-05, "loss": 0.7856, "step": 1149 }, { "epoch": 2.5150355385456535, "grad_norm": 0.354000449180603, "learning_rate": 5.377128953771289e-05, "loss": 0.5489, "step": 1150 }, { "epoch": 2.5172225259704755, "grad_norm": 0.2972152829170227, "learning_rate": 5.3527980535279806e-05, "loss": 0.773, "step": 1151 }, { "epoch": 2.519409513395298, "grad_norm": 0.3820708394050598, "learning_rate": 5.328467153284671e-05, "loss": 0.6889, "step": 1152 }, { "epoch": 2.52159650082012, "grad_norm": 0.3476285934448242, "learning_rate": 5.304136253041362e-05, "loss": 0.5365, "step": 1153 }, { "epoch": 2.5237834882449426, "grad_norm": 0.36393001675605774, "learning_rate": 5.279805352798053e-05, "loss": 0.6012, "step": 1154 }, { "epoch": 2.525970475669765, "grad_norm": 0.3589417338371277, "learning_rate": 5.255474452554744e-05, "loss": 0.6502, "step": 1155 }, { "epoch": 2.5281574630945873, "grad_norm": 0.34018373489379883, "learning_rate": 5.231143552311435e-05, "loss": 0.6489, "step": 1156 }, { "epoch": 2.5303444505194097, "grad_norm": 0.40649306774139404, "learning_rate": 5.206812652068126e-05, "loss": 0.6107, "step": 1157 }, { "epoch": 2.5325314379442316, "grad_norm": 0.3748558759689331, "learning_rate": 5.1824817518248176e-05, "loss": 0.5517, "step": 1158 }, { "epoch": 2.534718425369054, "grad_norm": 0.4162946939468384, "learning_rate": 5.158150851581508e-05, "loss": 0.5658, "step": 1159 }, { "epoch": 2.5369054127938764, "grad_norm": 0.40900272130966187, "learning_rate": 5.133819951338199e-05, "loss": 0.6965, "step": 1160 }, { "epoch": 2.5390924002186988, "grad_norm": 0.4511730372905731, "learning_rate": 5.10948905109489e-05, "loss": 0.7305, "step": 1161 }, { "epoch": 2.541279387643521, "grad_norm": 0.4122026860713959, "learning_rate": 5.085158150851581e-05, "loss": 0.6032, "step": 1162 }, { "epoch": 2.5434663750683435, "grad_norm": 0.33657750487327576, "learning_rate": 5.060827250608272e-05, "loss": 0.6772, "step": 1163 }, { "epoch": 2.545653362493166, "grad_norm": 0.3611637353897095, "learning_rate": 5.036496350364963e-05, "loss": 0.7829, "step": 1164 }, { "epoch": 2.547840349917988, "grad_norm": 0.3221738040447235, "learning_rate": 5.0121654501216546e-05, "loss": 0.656, "step": 1165 }, { "epoch": 2.55002733734281, "grad_norm": 0.30915001034736633, "learning_rate": 4.987834549878345e-05, "loss": 0.55, "step": 1166 }, { "epoch": 2.5522143247676325, "grad_norm": 0.3413131535053253, "learning_rate": 4.963503649635036e-05, "loss": 0.7515, "step": 1167 }, { "epoch": 2.554401312192455, "grad_norm": 0.4244505763053894, "learning_rate": 4.9391727493917275e-05, "loss": 0.7202, "step": 1168 }, { "epoch": 2.5565882996172773, "grad_norm": 0.2993778586387634, "learning_rate": 4.914841849148418e-05, "loss": 0.4497, "step": 1169 }, { "epoch": 2.5587752870420997, "grad_norm": 0.43434271216392517, "learning_rate": 4.8905109489051086e-05, "loss": 0.591, "step": 1170 }, { "epoch": 2.560962274466922, "grad_norm": 0.35246193408966064, "learning_rate": 4.8661800486618e-05, "loss": 0.537, "step": 1171 }, { "epoch": 2.563149261891744, "grad_norm": 0.37283191084861755, "learning_rate": 4.841849148418491e-05, "loss": 0.5856, "step": 1172 }, { "epoch": 2.5653362493165663, "grad_norm": 0.39839670062065125, "learning_rate": 4.817518248175182e-05, "loss": 0.4996, "step": 1173 }, { "epoch": 2.5675232367413887, "grad_norm": 0.4315820634365082, "learning_rate": 4.793187347931873e-05, "loss": 0.7119, "step": 1174 }, { "epoch": 2.569710224166211, "grad_norm": 0.4408882260322571, "learning_rate": 4.7688564476885646e-05, "loss": 0.7059, "step": 1175 }, { "epoch": 2.5718972115910335, "grad_norm": 0.4746418595314026, "learning_rate": 4.744525547445255e-05, "loss": 0.6944, "step": 1176 }, { "epoch": 2.5740841990158554, "grad_norm": 0.31449419260025024, "learning_rate": 4.7201946472019456e-05, "loss": 0.7469, "step": 1177 }, { "epoch": 2.576271186440678, "grad_norm": 0.4608743190765381, "learning_rate": 4.6958637469586375e-05, "loss": 0.4727, "step": 1178 }, { "epoch": 2.5784581738655, "grad_norm": 0.3578025996685028, "learning_rate": 4.671532846715328e-05, "loss": 0.8796, "step": 1179 }, { "epoch": 2.5806451612903225, "grad_norm": 0.3281157612800598, "learning_rate": 4.647201946472019e-05, "loss": 0.5228, "step": 1180 }, { "epoch": 2.582832148715145, "grad_norm": 0.34412261843681335, "learning_rate": 4.62287104622871e-05, "loss": 0.6171, "step": 1181 }, { "epoch": 2.5850191361399673, "grad_norm": 0.32819414138793945, "learning_rate": 4.5985401459854016e-05, "loss": 0.6381, "step": 1182 }, { "epoch": 2.5872061235647896, "grad_norm": 0.42394185066223145, "learning_rate": 4.574209245742092e-05, "loss": 0.6248, "step": 1183 }, { "epoch": 2.5893931109896116, "grad_norm": 0.3938983082771301, "learning_rate": 4.5498783454987826e-05, "loss": 0.688, "step": 1184 }, { "epoch": 2.5915800984144344, "grad_norm": 0.35975101590156555, "learning_rate": 4.5255474452554745e-05, "loss": 0.6196, "step": 1185 }, { "epoch": 2.5937670858392563, "grad_norm": 0.5351125597953796, "learning_rate": 4.501216545012165e-05, "loss": 0.6542, "step": 1186 }, { "epoch": 2.5959540732640787, "grad_norm": 0.31686198711395264, "learning_rate": 4.476885644768856e-05, "loss": 0.7063, "step": 1187 }, { "epoch": 2.598141060688901, "grad_norm": 0.2979380786418915, "learning_rate": 4.4525547445255474e-05, "loss": 0.5374, "step": 1188 }, { "epoch": 2.6003280481137234, "grad_norm": 0.3495193123817444, "learning_rate": 4.428223844282238e-05, "loss": 0.6217, "step": 1189 }, { "epoch": 2.602515035538546, "grad_norm": 0.3886531591415405, "learning_rate": 4.403892944038929e-05, "loss": 0.5628, "step": 1190 }, { "epoch": 2.6047020229633677, "grad_norm": 0.3585399091243744, "learning_rate": 4.3795620437956196e-05, "loss": 0.6921, "step": 1191 }, { "epoch": 2.6068890103881905, "grad_norm": 0.3813333809375763, "learning_rate": 4.3552311435523115e-05, "loss": 0.6603, "step": 1192 }, { "epoch": 2.6090759978130125, "grad_norm": 0.4587854743003845, "learning_rate": 4.330900243309002e-05, "loss": 0.7274, "step": 1193 }, { "epoch": 2.611262985237835, "grad_norm": 0.4350600242614746, "learning_rate": 4.3065693430656925e-05, "loss": 0.6628, "step": 1194 }, { "epoch": 2.613449972662657, "grad_norm": 0.3220929205417633, "learning_rate": 4.2822384428223844e-05, "loss": 0.6057, "step": 1195 }, { "epoch": 2.6156369600874796, "grad_norm": 0.54576575756073, "learning_rate": 4.257907542579075e-05, "loss": 0.693, "step": 1196 }, { "epoch": 2.617823947512302, "grad_norm": 0.393766850233078, "learning_rate": 4.233576642335766e-05, "loss": 0.6226, "step": 1197 }, { "epoch": 2.620010934937124, "grad_norm": 0.3243195116519928, "learning_rate": 4.209245742092457e-05, "loss": 0.7465, "step": 1198 }, { "epoch": 2.6221979223619463, "grad_norm": 0.3847908079624176, "learning_rate": 4.1849148418491485e-05, "loss": 0.4963, "step": 1199 }, { "epoch": 2.6243849097867686, "grad_norm": 0.40093564987182617, "learning_rate": 4.160583941605839e-05, "loss": 0.7138, "step": 1200 }, { "epoch": 2.626571897211591, "grad_norm": 0.4176326096057892, "learning_rate": 4.1362530413625295e-05, "loss": 0.4808, "step": 1201 }, { "epoch": 2.6287588846364134, "grad_norm": 0.3477429151535034, "learning_rate": 4.1119221411192214e-05, "loss": 0.6285, "step": 1202 }, { "epoch": 2.6309458720612358, "grad_norm": 0.4201376736164093, "learning_rate": 4.087591240875912e-05, "loss": 1.0551, "step": 1203 }, { "epoch": 2.633132859486058, "grad_norm": 0.4241773188114166, "learning_rate": 4.063260340632603e-05, "loss": 0.6991, "step": 1204 }, { "epoch": 2.63531984691088, "grad_norm": 0.5858724117279053, "learning_rate": 4.038929440389294e-05, "loss": 0.6912, "step": 1205 }, { "epoch": 2.6375068343357024, "grad_norm": 0.3396605849266052, "learning_rate": 4.0145985401459855e-05, "loss": 0.5062, "step": 1206 }, { "epoch": 2.639693821760525, "grad_norm": 0.3286657929420471, "learning_rate": 3.990267639902676e-05, "loss": 0.678, "step": 1207 }, { "epoch": 2.641880809185347, "grad_norm": 0.3253632187843323, "learning_rate": 3.9659367396593665e-05, "loss": 0.5769, "step": 1208 }, { "epoch": 2.6440677966101696, "grad_norm": 0.39935943484306335, "learning_rate": 3.9416058394160584e-05, "loss": 0.6078, "step": 1209 }, { "epoch": 2.646254784034992, "grad_norm": 0.38090863823890686, "learning_rate": 3.917274939172749e-05, "loss": 0.6195, "step": 1210 }, { "epoch": 2.6484417714598143, "grad_norm": 0.3816772401332855, "learning_rate": 3.8929440389294394e-05, "loss": 0.6636, "step": 1211 }, { "epoch": 2.6506287588846362, "grad_norm": 0.354041188955307, "learning_rate": 3.868613138686131e-05, "loss": 0.6017, "step": 1212 }, { "epoch": 2.6528157463094586, "grad_norm": 0.38338416814804077, "learning_rate": 3.844282238442822e-05, "loss": 0.5642, "step": 1213 }, { "epoch": 2.655002733734281, "grad_norm": 0.4089908003807068, "learning_rate": 3.819951338199513e-05, "loss": 0.7222, "step": 1214 }, { "epoch": 2.6571897211591033, "grad_norm": 0.44963401556015015, "learning_rate": 3.795620437956204e-05, "loss": 0.613, "step": 1215 }, { "epoch": 2.6593767085839257, "grad_norm": 0.2840285003185272, "learning_rate": 3.7712895377128954e-05, "loss": 0.6435, "step": 1216 }, { "epoch": 2.661563696008748, "grad_norm": 0.39185985922813416, "learning_rate": 3.746958637469586e-05, "loss": 0.7633, "step": 1217 }, { "epoch": 2.6637506834335705, "grad_norm": 0.3823552131652832, "learning_rate": 3.722627737226277e-05, "loss": 0.6632, "step": 1218 }, { "epoch": 2.6659376708583924, "grad_norm": 0.4937818646430969, "learning_rate": 3.698296836982968e-05, "loss": 0.8944, "step": 1219 }, { "epoch": 2.6681246582832148, "grad_norm": 0.38062620162963867, "learning_rate": 3.673965936739659e-05, "loss": 0.7507, "step": 1220 }, { "epoch": 2.670311645708037, "grad_norm": 0.34089863300323486, "learning_rate": 3.64963503649635e-05, "loss": 0.6276, "step": 1221 }, { "epoch": 2.6724986331328595, "grad_norm": 0.45665138959884644, "learning_rate": 3.625304136253041e-05, "loss": 0.6801, "step": 1222 }, { "epoch": 2.674685620557682, "grad_norm": 0.5102551579475403, "learning_rate": 3.6009732360097324e-05, "loss": 0.5385, "step": 1223 }, { "epoch": 2.6768726079825043, "grad_norm": 0.4079155921936035, "learning_rate": 3.576642335766423e-05, "loss": 0.7165, "step": 1224 }, { "epoch": 2.6790595954073266, "grad_norm": 0.3809445798397064, "learning_rate": 3.552311435523114e-05, "loss": 0.6695, "step": 1225 }, { "epoch": 2.6812465828321486, "grad_norm": 0.44514816999435425, "learning_rate": 3.527980535279805e-05, "loss": 0.732, "step": 1226 }, { "epoch": 2.683433570256971, "grad_norm": 0.40891462564468384, "learning_rate": 3.503649635036496e-05, "loss": 0.9004, "step": 1227 }, { "epoch": 2.6856205576817933, "grad_norm": 0.44487065076828003, "learning_rate": 3.479318734793187e-05, "loss": 0.4452, "step": 1228 }, { "epoch": 2.6878075451066157, "grad_norm": 0.27980828285217285, "learning_rate": 3.4549878345498775e-05, "loss": 0.6259, "step": 1229 }, { "epoch": 2.689994532531438, "grad_norm": 0.37272408604621887, "learning_rate": 3.430656934306569e-05, "loss": 0.7493, "step": 1230 }, { "epoch": 2.69218151995626, "grad_norm": 0.4146464169025421, "learning_rate": 3.40632603406326e-05, "loss": 0.5103, "step": 1231 }, { "epoch": 2.694368507381083, "grad_norm": 0.350233793258667, "learning_rate": 3.381995133819951e-05, "loss": 0.6766, "step": 1232 }, { "epoch": 2.6965554948059047, "grad_norm": 0.49093326926231384, "learning_rate": 3.357664233576642e-05, "loss": 0.6934, "step": 1233 }, { "epoch": 2.698742482230727, "grad_norm": 0.4598555266857147, "learning_rate": 3.333333333333333e-05, "loss": 0.6618, "step": 1234 }, { "epoch": 2.7009294696555495, "grad_norm": 0.4397393465042114, "learning_rate": 3.309002433090024e-05, "loss": 0.5864, "step": 1235 }, { "epoch": 2.703116457080372, "grad_norm": 0.43458834290504456, "learning_rate": 3.284671532846715e-05, "loss": 0.6955, "step": 1236 }, { "epoch": 2.705303444505194, "grad_norm": 0.3657298684120178, "learning_rate": 3.260340632603406e-05, "loss": 0.651, "step": 1237 }, { "epoch": 2.707490431930016, "grad_norm": 0.4210680425167084, "learning_rate": 3.236009732360097e-05, "loss": 0.5718, "step": 1238 }, { "epoch": 2.709677419354839, "grad_norm": 0.3858646750450134, "learning_rate": 3.211678832116788e-05, "loss": 0.6649, "step": 1239 }, { "epoch": 2.711864406779661, "grad_norm": 0.4130675494670868, "learning_rate": 3.187347931873479e-05, "loss": 0.6539, "step": 1240 }, { "epoch": 2.7140513942044833, "grad_norm": 0.246662899851799, "learning_rate": 3.1630170316301705e-05, "loss": 0.5551, "step": 1241 }, { "epoch": 2.7162383816293056, "grad_norm": 0.3459307551383972, "learning_rate": 3.138686131386861e-05, "loss": 0.4788, "step": 1242 }, { "epoch": 2.718425369054128, "grad_norm": 0.4324615001678467, "learning_rate": 3.114355231143552e-05, "loss": 0.7828, "step": 1243 }, { "epoch": 2.7206123564789504, "grad_norm": 0.5233476758003235, "learning_rate": 3.090024330900243e-05, "loss": 0.4262, "step": 1244 }, { "epoch": 2.7227993439037723, "grad_norm": 0.35397472977638245, "learning_rate": 3.065693430656934e-05, "loss": 0.688, "step": 1245 }, { "epoch": 2.724986331328595, "grad_norm": 0.37005069851875305, "learning_rate": 3.041362530413625e-05, "loss": 0.6592, "step": 1246 }, { "epoch": 2.727173318753417, "grad_norm": 0.4533984661102295, "learning_rate": 3.017031630170316e-05, "loss": 0.6367, "step": 1247 }, { "epoch": 2.7293603061782394, "grad_norm": 0.32724103331565857, "learning_rate": 2.992700729927007e-05, "loss": 0.5874, "step": 1248 }, { "epoch": 2.731547293603062, "grad_norm": 0.3568969666957855, "learning_rate": 2.968369829683698e-05, "loss": 0.8173, "step": 1249 }, { "epoch": 2.733734281027884, "grad_norm": 0.3268612325191498, "learning_rate": 2.9440389294403892e-05, "loss": 0.4827, "step": 1250 }, { "epoch": 2.7359212684527066, "grad_norm": 0.30471158027648926, "learning_rate": 2.91970802919708e-05, "loss": 0.7108, "step": 1251 }, { "epoch": 2.7381082558775285, "grad_norm": 0.3290720582008362, "learning_rate": 2.895377128953771e-05, "loss": 0.639, "step": 1252 }, { "epoch": 2.740295243302351, "grad_norm": 0.35110557079315186, "learning_rate": 2.871046228710462e-05, "loss": 0.5367, "step": 1253 }, { "epoch": 2.7424822307271732, "grad_norm": 0.26838091015815735, "learning_rate": 2.846715328467153e-05, "loss": 0.801, "step": 1254 }, { "epoch": 2.7446692181519956, "grad_norm": 0.3596297800540924, "learning_rate": 2.822384428223844e-05, "loss": 0.6018, "step": 1255 }, { "epoch": 2.746856205576818, "grad_norm": 0.4146590530872345, "learning_rate": 2.7980535279805354e-05, "loss": 0.7548, "step": 1256 }, { "epoch": 2.7490431930016404, "grad_norm": 0.5210931897163391, "learning_rate": 2.773722627737226e-05, "loss": 0.6514, "step": 1257 }, { "epoch": 2.7512301804264627, "grad_norm": 0.37990838289260864, "learning_rate": 2.749391727493917e-05, "loss": 0.6275, "step": 1258 }, { "epoch": 2.7534171678512847, "grad_norm": 0.41597574949264526, "learning_rate": 2.725060827250608e-05, "loss": 0.7675, "step": 1259 }, { "epoch": 2.755604155276107, "grad_norm": 0.4515291452407837, "learning_rate": 2.700729927007299e-05, "loss": 0.6756, "step": 1260 }, { "epoch": 2.7577911427009294, "grad_norm": 0.418295294046402, "learning_rate": 2.6763990267639903e-05, "loss": 0.6417, "step": 1261 }, { "epoch": 2.7599781301257518, "grad_norm": 0.34704264998435974, "learning_rate": 2.652068126520681e-05, "loss": 0.8996, "step": 1262 }, { "epoch": 2.762165117550574, "grad_norm": 0.3458947241306305, "learning_rate": 2.627737226277372e-05, "loss": 0.8436, "step": 1263 }, { "epoch": 2.7643521049753965, "grad_norm": 0.39911675453186035, "learning_rate": 2.603406326034063e-05, "loss": 0.5799, "step": 1264 }, { "epoch": 2.766539092400219, "grad_norm": 0.2880173623561859, "learning_rate": 2.579075425790754e-05, "loss": 0.5253, "step": 1265 }, { "epoch": 2.768726079825041, "grad_norm": 0.35598114132881165, "learning_rate": 2.554744525547445e-05, "loss": 0.6593, "step": 1266 }, { "epoch": 2.770913067249863, "grad_norm": 0.34010377526283264, "learning_rate": 2.530413625304136e-05, "loss": 0.6076, "step": 1267 }, { "epoch": 2.7731000546746856, "grad_norm": 0.37857237458229065, "learning_rate": 2.5060827250608273e-05, "loss": 0.7757, "step": 1268 }, { "epoch": 2.775287042099508, "grad_norm": 0.6945297718048096, "learning_rate": 2.481751824817518e-05, "loss": 0.7243, "step": 1269 }, { "epoch": 2.7774740295243303, "grad_norm": 0.3066571354866028, "learning_rate": 2.457420924574209e-05, "loss": 0.6558, "step": 1270 }, { "epoch": 2.7796610169491527, "grad_norm": 0.42167848348617554, "learning_rate": 2.4330900243309e-05, "loss": 0.6929, "step": 1271 }, { "epoch": 2.781848004373975, "grad_norm": 0.4334861934185028, "learning_rate": 2.408759124087591e-05, "loss": 0.6516, "step": 1272 }, { "epoch": 2.784034991798797, "grad_norm": 0.39597228169441223, "learning_rate": 2.3844282238442823e-05, "loss": 0.688, "step": 1273 }, { "epoch": 2.7862219792236194, "grad_norm": 0.36653244495391846, "learning_rate": 2.3600973236009728e-05, "loss": 0.7899, "step": 1274 }, { "epoch": 2.7884089666484417, "grad_norm": 0.4496842622756958, "learning_rate": 2.335766423357664e-05, "loss": 0.7682, "step": 1275 }, { "epoch": 2.790595954073264, "grad_norm": 0.5105994343757629, "learning_rate": 2.311435523114355e-05, "loss": 0.6332, "step": 1276 }, { "epoch": 2.7927829414980865, "grad_norm": 0.30159294605255127, "learning_rate": 2.287104622871046e-05, "loss": 0.6215, "step": 1277 }, { "epoch": 2.794969928922909, "grad_norm": 0.44565349817276, "learning_rate": 2.2627737226277372e-05, "loss": 0.8171, "step": 1278 }, { "epoch": 2.7971569163477312, "grad_norm": 0.48561230301856995, "learning_rate": 2.238442822384428e-05, "loss": 0.7251, "step": 1279 }, { "epoch": 2.799343903772553, "grad_norm": 0.4640182554721832, "learning_rate": 2.214111922141119e-05, "loss": 0.8137, "step": 1280 }, { "epoch": 2.8015308911973755, "grad_norm": 0.34384575486183167, "learning_rate": 2.1897810218978098e-05, "loss": 0.7161, "step": 1281 }, { "epoch": 2.803717878622198, "grad_norm": 0.3967885971069336, "learning_rate": 2.165450121654501e-05, "loss": 0.6331, "step": 1282 }, { "epoch": 2.8059048660470203, "grad_norm": 0.4139404892921448, "learning_rate": 2.1411192214111922e-05, "loss": 0.7716, "step": 1283 }, { "epoch": 2.8080918534718426, "grad_norm": 0.5906177163124084, "learning_rate": 2.116788321167883e-05, "loss": 0.8308, "step": 1284 }, { "epoch": 2.8102788408966646, "grad_norm": 0.3923112452030182, "learning_rate": 2.0924574209245742e-05, "loss": 0.5808, "step": 1285 }, { "epoch": 2.8124658283214874, "grad_norm": 0.376613050699234, "learning_rate": 2.0681265206812648e-05, "loss": 0.4945, "step": 1286 }, { "epoch": 2.8146528157463093, "grad_norm": 0.39711064100265503, "learning_rate": 2.043795620437956e-05, "loss": 0.9447, "step": 1287 }, { "epoch": 2.8168398031711317, "grad_norm": 0.49172040820121765, "learning_rate": 2.019464720194647e-05, "loss": 0.5981, "step": 1288 }, { "epoch": 2.819026790595954, "grad_norm": 0.3777097165584564, "learning_rate": 1.995133819951338e-05, "loss": 0.5527, "step": 1289 }, { "epoch": 2.8212137780207764, "grad_norm": 0.3420855700969696, "learning_rate": 1.9708029197080292e-05, "loss": 0.591, "step": 1290 }, { "epoch": 2.823400765445599, "grad_norm": 0.3033166825771332, "learning_rate": 1.9464720194647197e-05, "loss": 0.4902, "step": 1291 }, { "epoch": 2.8255877528704207, "grad_norm": 0.3743399679660797, "learning_rate": 1.922141119221411e-05, "loss": 0.72, "step": 1292 }, { "epoch": 2.8277747402952436, "grad_norm": 0.43312016129493713, "learning_rate": 1.897810218978102e-05, "loss": 0.5847, "step": 1293 }, { "epoch": 2.8299617277200655, "grad_norm": 0.4334290623664856, "learning_rate": 1.873479318734793e-05, "loss": 0.737, "step": 1294 }, { "epoch": 2.832148715144888, "grad_norm": 0.3262549340724945, "learning_rate": 1.849148418491484e-05, "loss": 0.6188, "step": 1295 }, { "epoch": 2.8343357025697102, "grad_norm": 0.3808232247829437, "learning_rate": 1.824817518248175e-05, "loss": 0.8153, "step": 1296 }, { "epoch": 2.8365226899945326, "grad_norm": 0.35475462675094604, "learning_rate": 1.8004866180048662e-05, "loss": 0.5671, "step": 1297 }, { "epoch": 2.838709677419355, "grad_norm": 0.38812217116355896, "learning_rate": 1.776155717761557e-05, "loss": 0.6323, "step": 1298 }, { "epoch": 2.840896664844177, "grad_norm": 0.3561973571777344, "learning_rate": 1.751824817518248e-05, "loss": 0.6919, "step": 1299 }, { "epoch": 2.8430836522689997, "grad_norm": 0.31703197956085205, "learning_rate": 1.7274939172749388e-05, "loss": 0.6856, "step": 1300 }, { "epoch": 2.8452706396938217, "grad_norm": 0.41529974341392517, "learning_rate": 1.70316301703163e-05, "loss": 0.7612, "step": 1301 }, { "epoch": 2.847457627118644, "grad_norm": 0.42857563495635986, "learning_rate": 1.678832116788321e-05, "loss": 0.8243, "step": 1302 }, { "epoch": 2.8496446145434664, "grad_norm": 0.4402436912059784, "learning_rate": 1.654501216545012e-05, "loss": 0.6149, "step": 1303 }, { "epoch": 2.8518316019682888, "grad_norm": 0.5396206378936768, "learning_rate": 1.630170316301703e-05, "loss": 0.623, "step": 1304 }, { "epoch": 2.854018589393111, "grad_norm": 0.3337330222129822, "learning_rate": 1.605839416058394e-05, "loss": 0.6207, "step": 1305 }, { "epoch": 2.856205576817933, "grad_norm": 0.47766539454460144, "learning_rate": 1.5815085158150852e-05, "loss": 0.7012, "step": 1306 }, { "epoch": 2.8583925642427555, "grad_norm": 0.3661979138851166, "learning_rate": 1.557177615571776e-05, "loss": 0.6951, "step": 1307 }, { "epoch": 2.860579551667578, "grad_norm": 0.32364702224731445, "learning_rate": 1.532846715328467e-05, "loss": 0.5451, "step": 1308 }, { "epoch": 2.8627665390924, "grad_norm": 0.4927031695842743, "learning_rate": 1.508515815085158e-05, "loss": 0.6483, "step": 1309 }, { "epoch": 2.8649535265172226, "grad_norm": 0.3563484847545624, "learning_rate": 1.484184914841849e-05, "loss": 0.6751, "step": 1310 }, { "epoch": 2.867140513942045, "grad_norm": 0.3271696865558624, "learning_rate": 1.45985401459854e-05, "loss": 0.5288, "step": 1311 }, { "epoch": 2.8693275013668673, "grad_norm": 0.3783499300479889, "learning_rate": 1.435523114355231e-05, "loss": 0.7292, "step": 1312 }, { "epoch": 2.8715144887916892, "grad_norm": 0.39892178773880005, "learning_rate": 1.411192214111922e-05, "loss": 0.7258, "step": 1313 }, { "epoch": 2.8737014762165116, "grad_norm": 0.27586114406585693, "learning_rate": 1.386861313868613e-05, "loss": 0.4122, "step": 1314 }, { "epoch": 2.875888463641334, "grad_norm": 0.4590570330619812, "learning_rate": 1.362530413625304e-05, "loss": 0.7205, "step": 1315 }, { "epoch": 2.8780754510661564, "grad_norm": 0.34512102603912354, "learning_rate": 1.3381995133819952e-05, "loss": 0.7402, "step": 1316 }, { "epoch": 2.8802624384909787, "grad_norm": 0.4092288613319397, "learning_rate": 1.313868613138686e-05, "loss": 0.7668, "step": 1317 }, { "epoch": 2.882449425915801, "grad_norm": 0.4686785638332367, "learning_rate": 1.289537712895377e-05, "loss": 0.5874, "step": 1318 }, { "epoch": 2.8846364133406235, "grad_norm": 0.341987669467926, "learning_rate": 1.265206812652068e-05, "loss": 0.7645, "step": 1319 }, { "epoch": 2.8868234007654454, "grad_norm": 0.6410381197929382, "learning_rate": 1.240875912408759e-05, "loss": 0.7446, "step": 1320 }, { "epoch": 2.889010388190268, "grad_norm": 0.4242047965526581, "learning_rate": 1.21654501216545e-05, "loss": 0.5989, "step": 1321 }, { "epoch": 2.89119737561509, "grad_norm": 0.3659310042858124, "learning_rate": 1.1922141119221411e-05, "loss": 0.6532, "step": 1322 }, { "epoch": 2.8933843630399125, "grad_norm": 0.40684065222740173, "learning_rate": 1.167883211678832e-05, "loss": 0.657, "step": 1323 }, { "epoch": 2.895571350464735, "grad_norm": 0.47506752610206604, "learning_rate": 1.143552311435523e-05, "loss": 0.4426, "step": 1324 }, { "epoch": 2.8977583378895573, "grad_norm": 0.3505801260471344, "learning_rate": 1.119221411192214e-05, "loss": 0.724, "step": 1325 }, { "epoch": 2.8999453253143797, "grad_norm": 0.4182322025299072, "learning_rate": 1.0948905109489049e-05, "loss": 0.6425, "step": 1326 }, { "epoch": 2.9021323127392016, "grad_norm": 0.5423049330711365, "learning_rate": 1.0705596107055961e-05, "loss": 0.6135, "step": 1327 }, { "epoch": 2.904319300164024, "grad_norm": 0.47435280680656433, "learning_rate": 1.0462287104622871e-05, "loss": 0.6161, "step": 1328 }, { "epoch": 2.9065062875888463, "grad_norm": 0.30286717414855957, "learning_rate": 1.021897810218978e-05, "loss": 0.5494, "step": 1329 }, { "epoch": 2.9086932750136687, "grad_norm": 0.34891781210899353, "learning_rate": 9.97566909975669e-06, "loss": 0.8073, "step": 1330 }, { "epoch": 2.910880262438491, "grad_norm": 0.3608086109161377, "learning_rate": 9.732360097323599e-06, "loss": 0.6207, "step": 1331 }, { "epoch": 2.9130672498633134, "grad_norm": 0.2914386987686157, "learning_rate": 9.48905109489051e-06, "loss": 0.6153, "step": 1332 }, { "epoch": 2.915254237288136, "grad_norm": 0.4532075822353363, "learning_rate": 9.24574209245742e-06, "loss": 0.8057, "step": 1333 }, { "epoch": 2.9174412247129577, "grad_norm": 0.47955191135406494, "learning_rate": 9.002433090024331e-06, "loss": 0.7378, "step": 1334 }, { "epoch": 2.91962821213778, "grad_norm": 0.3728046715259552, "learning_rate": 8.75912408759124e-06, "loss": 0.5957, "step": 1335 }, { "epoch": 2.9218151995626025, "grad_norm": 0.39728742837905884, "learning_rate": 8.51581508515815e-06, "loss": 0.7254, "step": 1336 }, { "epoch": 2.924002186987425, "grad_norm": 0.375864714384079, "learning_rate": 8.27250608272506e-06, "loss": 0.7013, "step": 1337 }, { "epoch": 2.9261891744122472, "grad_norm": 0.3625723719596863, "learning_rate": 8.02919708029197e-06, "loss": 0.866, "step": 1338 }, { "epoch": 2.928376161837069, "grad_norm": 0.46779105067253113, "learning_rate": 7.78588807785888e-06, "loss": 0.7114, "step": 1339 }, { "epoch": 2.930563149261892, "grad_norm": 0.3270869851112366, "learning_rate": 7.54257907542579e-06, "loss": 0.6085, "step": 1340 }, { "epoch": 2.932750136686714, "grad_norm": 0.3992483913898468, "learning_rate": 7.2992700729927e-06, "loss": 0.6498, "step": 1341 }, { "epoch": 2.9349371241115363, "grad_norm": 0.41171202063560486, "learning_rate": 7.05596107055961e-06, "loss": 0.7382, "step": 1342 }, { "epoch": 2.9371241115363587, "grad_norm": 0.7751166224479675, "learning_rate": 6.81265206812652e-06, "loss": 0.8629, "step": 1343 }, { "epoch": 2.939311098961181, "grad_norm": 0.558593213558197, "learning_rate": 6.56934306569343e-06, "loss": 0.9791, "step": 1344 }, { "epoch": 2.9414980863860034, "grad_norm": 0.40517720580101013, "learning_rate": 6.32603406326034e-06, "loss": 0.6608, "step": 1345 }, { "epoch": 2.9436850738108253, "grad_norm": 0.44248199462890625, "learning_rate": 6.08272506082725e-06, "loss": 0.5619, "step": 1346 }, { "epoch": 2.945872061235648, "grad_norm": 0.3731604814529419, "learning_rate": 5.83941605839416e-06, "loss": 0.6585, "step": 1347 }, { "epoch": 2.94805904866047, "grad_norm": 0.524138867855072, "learning_rate": 5.59610705596107e-06, "loss": 0.5278, "step": 1348 }, { "epoch": 2.9502460360852925, "grad_norm": 0.31725287437438965, "learning_rate": 5.3527980535279805e-06, "loss": 0.7118, "step": 1349 }, { "epoch": 2.952433023510115, "grad_norm": 0.3865452706813812, "learning_rate": 5.10948905109489e-06, "loss": 0.6209, "step": 1350 }, { "epoch": 2.954620010934937, "grad_norm": 0.36308881640434265, "learning_rate": 4.866180048661799e-06, "loss": 0.5582, "step": 1351 }, { "epoch": 2.9568069983597596, "grad_norm": 0.4439944922924042, "learning_rate": 4.62287104622871e-06, "loss": 0.587, "step": 1352 }, { "epoch": 2.9589939857845815, "grad_norm": 0.44962093234062195, "learning_rate": 4.37956204379562e-06, "loss": 0.7883, "step": 1353 }, { "epoch": 2.9611809732094043, "grad_norm": 0.6172670722007751, "learning_rate": 4.13625304136253e-06, "loss": 0.7554, "step": 1354 }, { "epoch": 2.9633679606342263, "grad_norm": 0.4022207260131836, "learning_rate": 3.89294403892944e-06, "loss": 0.7109, "step": 1355 }, { "epoch": 2.9655549480590486, "grad_norm": 0.4858662486076355, "learning_rate": 3.64963503649635e-06, "loss": 0.7308, "step": 1356 }, { "epoch": 2.967741935483871, "grad_norm": 0.4918728768825531, "learning_rate": 3.40632603406326e-06, "loss": 0.7418, "step": 1357 }, { "epoch": 2.9699289229086934, "grad_norm": 0.5118703842163086, "learning_rate": 3.16301703163017e-06, "loss": 0.6361, "step": 1358 }, { "epoch": 2.9721159103335157, "grad_norm": 0.4407196044921875, "learning_rate": 2.91970802919708e-06, "loss": 0.6971, "step": 1359 }, { "epoch": 2.9743028977583377, "grad_norm": 0.33856332302093506, "learning_rate": 2.6763990267639902e-06, "loss": 0.5766, "step": 1360 }, { "epoch": 2.97648988518316, "grad_norm": 0.45704513788223267, "learning_rate": 2.4330900243308996e-06, "loss": 0.6431, "step": 1361 }, { "epoch": 2.9786768726079824, "grad_norm": 0.3669881224632263, "learning_rate": 2.18978102189781e-06, "loss": 0.5637, "step": 1362 }, { "epoch": 2.980863860032805, "grad_norm": 0.33307334780693054, "learning_rate": 1.94647201946472e-06, "loss": 0.6372, "step": 1363 }, { "epoch": 2.983050847457627, "grad_norm": 0.3178769052028656, "learning_rate": 1.70316301703163e-06, "loss": 0.8674, "step": 1364 }, { "epoch": 2.9852378348824495, "grad_norm": 0.4288700222969055, "learning_rate": 1.45985401459854e-06, "loss": 0.7514, "step": 1365 }, { "epoch": 2.987424822307272, "grad_norm": 0.3283116817474365, "learning_rate": 1.2165450121654498e-06, "loss": 0.5816, "step": 1366 }, { "epoch": 2.989611809732094, "grad_norm": 0.3714343011379242, "learning_rate": 9.7323600973236e-07, "loss": 0.7904, "step": 1367 }, { "epoch": 2.991798797156916, "grad_norm": 0.7103442549705505, "learning_rate": 7.2992700729927e-07, "loss": 0.7292, "step": 1368 }, { "epoch": 2.9939857845817386, "grad_norm": 0.34076127409935, "learning_rate": 4.8661800486618e-07, "loss": 0.6302, "step": 1369 }, { "epoch": 2.996172772006561, "grad_norm": 0.424398809671402, "learning_rate": 2.4330900243309e-07, "loss": 0.781, "step": 1370 }, { "epoch": 2.9983597594313833, "grad_norm": 0.39384347200393677, "learning_rate": 0.0, "loss": 0.5505, "step": 1371 }, { "epoch": 2.9983597594313833, "step": 1371, "total_flos": 4.3228174920083046e+17, "train_loss": 0.7109334499926396, "train_runtime": 1998.4313, "train_samples_per_second": 10.983, "train_steps_per_second": 0.686 } ], "logging_steps": 1.0, "max_steps": 1371, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.3228174920083046e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }